Upgrade to ocamlformat 0.29.0; fix csvt/sexpt streaming; reformat

+1 -1

.ocamlformat

··· 1 - version = 0.28.1 1 + version = 0.29.0

+3 -3

bench/speed.ml

··· 639 639 let g = Crypto_rng.(v ~seed (module Fortuna)) in 640 640 Crypto_rng.set_default_generator g; 641 641 match Array.to_list Sys.argv with 642 - | _ :: (_ :: _ as args) -> begin 643 - try 642 + | _ :: (_ :: _ as args) -> 643 + begin try 644 644 let fs = 645 645 args 646 646 |> List.map @@ fun n -> ··· 648 648 in 649 649 runv fs 650 650 with Not_found -> help () 651 - end 651 + end 652 652 | _ -> help ()

+7 -7

rng/crypto_rng.mli

··· 307 307 308 308 Generating a random 13-byte string: 309 309 {[ 310 - let cs = Rng.generate 13 310 + let cs = Rng.generate 13 311 311 ]} 312 312 313 313 Generating a list of string, passing down an optional {{!g}generator}: 314 314 {[ 315 - let rec f1 ?g ~n i = 316 - if i < 1 then [] else Rng.generate ?g n :: f1 ?g ~n (i - 1) 315 + let rec f1 ?g ~n i = 316 + if i < 1 then [] else Rng.generate ?g n :: f1 ?g ~n (i - 1) 317 317 ]} 318 318 319 319 Generating a [Z.t] smaller than [10]: 320 320 {[ 321 - let f2 ?g () = Crypto_pk.Z_extra.gen ?g Z.(~$10) 321 + let f2 ?g () = Crypto_pk.Z_extra.gen ?g Z.(~$10) 322 322 ]} 323 323 324 324 Creating a local Fortuna instance and using it as a key-derivation function: 325 325 {[ 326 - let f3 secret = 327 - let g = Rng.(v ~seed:secret (module Generators.Fortuna)) in 328 - Rng.generate ~g 32 326 + let f3 secret = 327 + let g = Rng.(v ~seed:secret (module Generators.Fortuna)) in 328 + Rng.generate ~g 32 329 329 ]} *)

+21 -214

src/c/aes_aesni.c

··· 363 363 #endif /* __mc_ACCELERATE__ */ 364 364 365 365 /* ------------------------------------------------------------------ */ 366 - /* ARM64 Cryptography Extensions (FEAT_AES) */ 366 + /* Dispatch */ 367 367 /* */ 368 - /* Uses ARM NEON Crypto intrinsics (arm_neon.h): */ 369 - /* vaeseq_u8 - AES single round encrypt (AddRoundKey+SubBytes+ */ 370 - /* ShiftRows) */ 371 - /* vaesmcq_u8 - AES MixColumns */ 372 - /* vaesdq_u8 - AES single round decrypt */ 373 - /* vaesimcq_u8 - AES InverseMixColumns */ 374 - /* */ 375 - /* Reference: ARM Architecture Reference Manual, FEAT_AES. */ 376 - /* Same pattern as Linux kernel aes-ce-glue.c / OpenSSL aes-armv8.c. */ 377 - /* ------------------------------------------------------------------ */ 378 - 379 - #if defined (__mc_ARM64CE__) 380 - 381 - static int _mc_arm_aes_rk_size (uint8_t rounds) { 382 - return (rounds + 1) * 16; 383 - } 384 - 385 - /* 386 - * ARM vaeseq_u8 performs: AddRoundKey XOR, then SubBytes, then ShiftRows. 387 - * So the encryption sequence per round is: 388 - * block = vaesmcq_u8(vaeseq_u8(block, rk[i])) 389 - * Final round omits MixColumns: 390 - * block = vaeseq_u8(block, rk[Nr-1]) ^ rk[Nr] 391 - */ 392 - static inline void _mc_arm_aes_enc (const uint8_t src[16], uint8_t dst[16], 393 - const uint8_t *rk0, uint8_t rounds) { 394 - uint8x16_t block = vld1q_u8(src); 395 - 396 - for (uint8_t i = 0; i < rounds - 1; i++) { 397 - uint8x16_t rk = vld1q_u8(rk0 + i * 16); 398 - block = vaesmcq_u8(vaeseq_u8(block, rk)); 399 - } 400 - /* Final round: SubBytes + ShiftRows + AddRoundKey (no MixColumns) */ 401 - block = vaeseq_u8(block, vld1q_u8(rk0 + (rounds - 1) * 16)); 402 - block = veorq_u8(block, vld1q_u8(rk0 + rounds * 16)); 403 - vst1q_u8(dst, block); 404 - } 405 - 406 - static inline void _mc_arm_aes_dec (const uint8_t src[16], uint8_t dst[16], 407 - const uint8_t *rk0, uint8_t rounds) { 408 - uint8x16_t block = vld1q_u8(src); 409 - 410 - for (uint8_t i = 0; i < rounds - 1; i++) { 411 - uint8x16_t rk = vld1q_u8(rk0 + i * 16); 412 - block = vaesimcq_u8(vaesdq_u8(block, rk)); 413 - } 414 - block = vaesdq_u8(block, vld1q_u8(rk0 + (rounds - 1) * 16)); 415 - block = veorq_u8(block, vld1q_u8(rk0 + rounds * 16)); 416 - vst1q_u8(dst, block); 417 - } 418 - 419 - /* 4-block parallel encrypt for throughput */ 420 - static inline void _mc_arm_aes_enc4 (const uint8_t src[64], uint8_t dst[64], 421 - const uint8_t *rk0, uint8_t rounds) { 422 - uint8x16_t b0 = vld1q_u8(src); 423 - uint8x16_t b1 = vld1q_u8(src + 16); 424 - uint8x16_t b2 = vld1q_u8(src + 32); 425 - uint8x16_t b3 = vld1q_u8(src + 48); 426 - 427 - for (uint8_t i = 0; i < rounds - 1; i++) { 428 - uint8x16_t rk = vld1q_u8(rk0 + i * 16); 429 - b0 = vaesmcq_u8(vaeseq_u8(b0, rk)); 430 - b1 = vaesmcq_u8(vaeseq_u8(b1, rk)); 431 - b2 = vaesmcq_u8(vaeseq_u8(b2, rk)); 432 - b3 = vaesmcq_u8(vaeseq_u8(b3, rk)); 433 - } 434 - uint8x16_t rk2 = vld1q_u8(rk0 + (rounds - 1) * 16); 435 - uint8x16_t rkl = vld1q_u8(rk0 + rounds * 16); 436 - b0 = veorq_u8(vaeseq_u8(b0, rk2), rkl); 437 - b1 = veorq_u8(vaeseq_u8(b1, rk2), rkl); 438 - b2 = veorq_u8(vaeseq_u8(b2, rk2), rkl); 439 - b3 = veorq_u8(vaeseq_u8(b3, rk2), rkl); 440 - 441 - vst1q_u8(dst, b0); 442 - vst1q_u8(dst + 16, b1); 443 - vst1q_u8(dst + 32, b2); 444 - vst1q_u8(dst + 48, b3); 445 - } 446 - 447 - static inline void _mc_arm_aes_dec4 (const uint8_t src[64], uint8_t dst[64], 448 - const uint8_t *rk0, uint8_t rounds) { 449 - uint8x16_t b0 = vld1q_u8(src); 450 - uint8x16_t b1 = vld1q_u8(src + 16); 451 - uint8x16_t b2 = vld1q_u8(src + 32); 452 - uint8x16_t b3 = vld1q_u8(src + 48); 453 - 454 - for (uint8_t i = 0; i < rounds - 1; i++) { 455 - uint8x16_t rk = vld1q_u8(rk0 + i * 16); 456 - b0 = vaesimcq_u8(vaesdq_u8(b0, rk)); 457 - b1 = vaesimcq_u8(vaesdq_u8(b1, rk)); 458 - b2 = vaesimcq_u8(vaesdq_u8(b2, rk)); 459 - b3 = vaesimcq_u8(vaesdq_u8(b3, rk)); 460 - } 461 - uint8x16_t rk2 = vld1q_u8(rk0 + (rounds - 1) * 16); 462 - uint8x16_t rkl = vld1q_u8(rk0 + rounds * 16); 463 - b0 = veorq_u8(vaesdq_u8(b0, rk2), rkl); 464 - b1 = veorq_u8(vaesdq_u8(b1, rk2), rkl); 465 - b2 = veorq_u8(vaesdq_u8(b2, rk2), rkl); 466 - b3 = veorq_u8(vaesdq_u8(b3, rk2), rkl); 467 - 468 - vst1q_u8(dst, b0); 469 - vst1q_u8(dst + 16, b1); 470 - vst1q_u8(dst + 32, b2); 471 - vst1q_u8(dst + 48, b3); 472 - } 473 - 474 - static inline void _mc_arm_aes_enc_blocks (const uint8_t *src, uint8_t *dst, 475 - const uint8_t *rk, uint8_t rounds, 476 - size_t blocks) { 477 - while (blocks >= 4) { 478 - _mc_arm_aes_enc4(src, dst, rk, rounds); 479 - src += 64; dst += 64; blocks -= 4; 480 - } 481 - while (blocks--) { 482 - _mc_arm_aes_enc(src, dst, rk, rounds); 483 - src += 16; dst += 16; 484 - } 485 - } 486 - 487 - static inline void _mc_arm_aes_dec_blocks (const uint8_t *src, uint8_t *dst, 488 - const uint8_t *rk, uint8_t rounds, 489 - size_t blocks) { 490 - while (blocks >= 4) { 491 - _mc_arm_aes_dec4(src, dst, rk, rounds); 492 - src += 64; dst += 64; blocks -= 4; 493 - } 494 - while (blocks--) { 495 - _mc_arm_aes_dec(src, dst, rk, rounds); 496 - src += 16; dst += 16; 497 - } 498 - } 499 - 500 - /* 501 - * ARM64 key schedule helpers. 502 - * 503 - * The generic key expansion stores round keys as uint32_t in host byte 504 - * order (little-endian on ARM64). ARM AES instructions (vaeseq_u8 etc.) 505 - * operate on byte arrays where each 4-byte column is in big-endian order 506 - * (matching the AES standard's byte layout). We byte-swap each uint32_t 507 - * after the generic expansion. 508 - */ 509 - static void _mc_arm_fixup_rk (uint8_t *rk_bytes, int rounds) { 510 - uint32_t *rk = (uint32_t *)rk_bytes; 511 - int nwords = (rounds + 1) * 4; 512 - for (int i = 0; i < nwords; i++) 513 - rk[i] = __builtin_bswap32(rk[i]); 514 - } 515 - 516 - /* 517 - * ARM64 decrypt key inversion: apply InvMixColumns to middle round keys 518 - * using vaesimcq_u8 (equivalent of x86 _mm_aesimc_si128). 519 - */ 520 - static void _mc_arm_invert_e_key (const uint8_t *ek, uint8_t *dk, 521 - uint8_t rounds) { 522 - /* Copy last encryption round key → first decrypt round key */ 523 - vst1q_u8(dk, vld1q_u8(ek + rounds * 16)); 524 - /* InvMixColumns on middle round keys */ 525 - for (uint8_t i = 1; i < rounds; i++) 526 - vst1q_u8(dk + i * 16, vaesimcq_u8(vld1q_u8(ek + (rounds - i) * 16))); 527 - /* Copy first encryption round key → last decrypt round key */ 528 - vst1q_u8(dk + rounds * 16, vld1q_u8(ek)); 529 - } 530 - 531 - #endif /* __mc_ARM64CE__ */ 532 - 533 - /* ------------------------------------------------------------------ */ 534 - /* Dispatch (platform-aware) */ 368 + /* On x86: AES-NI accelerated path above. */ 369 + /* On ARM64 and other platforms: uses BearSSL aes_ct64 via */ 370 + /* mc_aes_*_generic (constant-time, no hardware intrinsics). */ 371 + /* ARM64 AES-CE acceleration: TODO (needs audited implementation). */ 535 372 /* ------------------------------------------------------------------ */ 536 373 537 374 CAMLprim value 538 375 mc_aes_rk_size (value rounds) { 539 376 value s; 540 - #if defined(__mc_ARM64CE__) 541 - _mc_switch_accel(arm_aes, 542 - s = mc_aes_rk_size_generic(rounds), 543 - s = Val_int (_mc_arm_aes_rk_size (Int_val (rounds)))) 544 - #else 377 + #if defined(__mc_ACCELERATE__) 545 378 _mc_switch_accel(aesni, 546 379 s = mc_aes_rk_size_generic(rounds), 547 380 s = Val_int (_mc_aesni_rk_size (Int_val (rounds)))) 381 + #else 382 + s = mc_aes_rk_size_generic(rounds); 548 383 #endif 549 384 return s; 550 385 } 551 386 552 387 CAMLprim value 553 388 mc_aes_derive_e_key (value key, value rk, value rounds) { 554 - #if defined(__mc_ARM64CE__) 555 - mc_aes_derive_e_key_generic(key, rk, rounds); 556 - if (mc_detected_cpu_features.arm_aes) 557 - _mc_arm_fixup_rk(_bp_uint8(rk), Int_val(rounds)); 558 - #else 389 + #if defined(__mc_ACCELERATE__) 559 390 _mc_switch_accel(aesni, 560 391 mc_aes_derive_e_key_generic(key, rk, rounds), 561 392 _mc_aesni_derive_e_key (_st_uint8 (key), 562 393 _bp_uint8 (rk), 563 394 Int_val (rounds))) 395 + #else 396 + mc_aes_derive_e_key_generic(key, rk, rounds); 564 397 #endif 565 398 return Val_unit; 566 399 } 567 400 568 401 CAMLprim value 569 402 mc_aes_derive_d_key (value key, value kr, value rounds, value rk) { 570 - #if defined(__mc_ARM64CE__) 571 - if (mc_detected_cpu_features.arm_aes) { 572 - /* First derive encryption keys (byte-swapped for ARM) */ 573 - mc_aes_derive_e_key_generic(key, kr, rounds); 574 - _mc_arm_fixup_rk(_bp_uint8(kr), Int_val(rounds)); 575 - /* Then invert for decryption using vaesimcq_u8 */ 576 - uint8_t tmp[15 * 16]; /* max AES-256: 15 round keys */ 577 - memcpy(tmp, _bp_uint8(kr), (Int_val(rounds) + 1) * 16); 578 - _mc_arm_invert_e_key(tmp, _bp_uint8(kr), Int_val(rounds)); 579 - } else { 580 - mc_aes_derive_d_key_generic(key, kr, rounds, rk); 581 - } 582 - #else 403 + #if defined(__mc_ACCELERATE__) 583 404 _mc_switch_accel(aesni, 584 405 mc_aes_derive_d_key_generic(key, kr, rounds, rk), 585 406 _mc_aesni_derive_d_key (_st_uint8 (key), 586 407 _bp_uint8 (kr), 587 408 Int_val (rounds), 588 409 Is_block(rk) ? _bp_uint8(Field(rk, 0)) : 0)) 410 + #else 411 + mc_aes_derive_d_key_generic(key, kr, rounds, rk); 589 412 #endif 590 413 return Val_unit; 591 414 } 592 415 593 416 CAMLprim value 594 417 mc_aes_enc (value src, value off1, value dst, value off2, value rk, value rounds, value blocks) { 595 - #if defined(__mc_ARM64CE__) 596 - _mc_switch_accel(arm_aes, 597 - mc_aes_enc_generic(src, off1, dst, off2, rk, rounds, blocks), 598 - _mc_arm_aes_enc_blocks ( _st_uint8_off (src, off1), 599 - _bp_uint8_off (dst, off2), 600 - _st_uint8 (rk), 601 - Int_val (rounds), 602 - Int_val (blocks) )) 603 - #else 418 + #if defined(__mc_ACCELERATE__) 604 419 _mc_switch_accel(aesni, 605 420 mc_aes_enc_generic(src, off1, dst, off2, rk, rounds, blocks), 606 421 _mc_aesni_enc_blocks ( _st_uint8_off (src, off1), ··· 608 423 _st_uint8 (rk), 609 424 Int_val (rounds), 610 425 Int_val (blocks) )) 426 + #else 427 + mc_aes_enc_generic(src, off1, dst, off2, rk, rounds, blocks); 611 428 #endif 612 429 return Val_unit; 613 430 } 614 431 615 432 CAMLprim value 616 433 mc_aes_dec (value src, value off1, value dst, value off2, value rk, value rounds, value blocks) { 617 - #if defined(__mc_ARM64CE__) 618 - _mc_switch_accel(arm_aes, 619 - mc_aes_dec_generic(src, off1, dst, off2, rk, rounds, blocks), 620 - _mc_arm_aes_dec_blocks ( _st_uint8_off (src, off1), 621 - _bp_uint8_off (dst, off2), 622 - _st_uint8 (rk), 623 - Int_val (rounds), 624 - Int_val (blocks) )) 625 - #else 434 + #if defined(__mc_ACCELERATE__) 626 435 _mc_switch_accel(aesni, 627 436 mc_aes_dec_generic(src, off1, dst, off2, rk, rounds, blocks), 628 437 _mc_aesni_dec_blocks ( _st_uint8_off (src, off1), ··· 630 439 _st_uint8 (rk), 631 440 Int_val (rounds), 632 441 Int_val (blocks) )) 442 + #else 443 + mc_aes_dec_generic(src, off1, dst, off2, rk, rounds, blocks); 633 444 #endif 634 445 return Val_unit; 635 446 } 636 447 637 448 CAMLprim value mc_aes_mode (__unit ()) { 638 449 value enabled = 0; 639 - #if defined(__mc_ARM64CE__) 640 - _mc_switch_accel(arm_aes, 641 - enabled = 0, 642 - enabled = 1) 643 - #else 450 + #if defined(__mc_ACCELERATE__) 644 451 _mc_switch_accel(aesni, 645 452 enabled = 0, 646 453 enabled = 1)

+134 -434

src/c/aes_generic.c

··· 1 1 /* 2 - * Constant-time AES implementation. 2 + * Constant-time AES -- glue between BearSSL aes_ct64 and OCaml stubs. 3 3 * 4 - * Replaces the T-table (timing-vulnerable) AES with constant-time 5 - * operations. All S-box lookups use arithmetic masking to eliminate 6 - * cache-timing side channels. MixColumns uses explicit GF(2^8) 7 - * arithmetic via xtime (no lookup tables). 8 - * 9 - * Technique: constant-time table scan with arithmetic masks. 10 - * Reference: BearSSL constant-time principles (Thomas Pornin) 11 - * https://bearssl.org/constanttime.html 4 + * All cryptographic operations are performed by BearSSL's aes_ct64 5 + * implementation (Thomas Pornin, MIT license). This file contains only 6 + * the OCaml C stub wrappers and the data format adaptation. 12 7 * 13 - * Key expansion uses the same standard Rijndael schedule. 14 - * S-box values from FIPS 197. 8 + * BearSSL source: https://bearssl.org/ 9 + * Files: src/symcipher/aes_ct64.c, aes_ct64_enc.c, aes_ct64_dec.c 15 10 * 16 - * Copyright (c) 2025 Thomas Gazagnaire 17 - * Based on techniques from BearSSL (Thomas Pornin, MIT license). 18 - * Original Rijndael key schedule: public domain, Philip J. Erdelsky. 11 + * The aes_ct64 implementation uses the Boyar-Peralta S-box circuit 12 + * (bitsliced, constant-time, no table lookups on secret data). 13 + * See: https://eprint.iacr.org/2009/191.pdf 19 14 */ 20 15 21 16 #include "crypto.h" 22 - #include <string.h> 23 - 24 - /* AES forward S-box (FIPS 197, Section 5.1.1) */ 25 - static const uint8_t sbox[256] = { 26 - 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 27 - 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 28 - 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 29 - 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 30 - 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 31 - 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, 32 - 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 33 - 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, 34 - 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 35 - 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 36 - 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 37 - 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 38 - 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 39 - 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 40 - 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 41 - 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 42 - 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 43 - 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 44 - 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 45 - 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 46 - 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 47 - 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 48 - 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 49 - 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 50 - 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 51 - 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 52 - 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 53 - 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 54 - 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 55 - 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, 56 - 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 57 - 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16, 58 - }; 59 - 60 - /* AES inverse S-box (FIPS 197, Section 5.3.2) */ 61 - static const uint8_t isbox[256] = { 62 - 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 63 - 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, 64 - 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 65 - 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, 66 - 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 67 - 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, 68 - 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 69 - 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, 70 - 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 71 - 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, 72 - 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 73 - 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, 74 - 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 75 - 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, 76 - 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 77 - 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, 78 - 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 79 - 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, 80 - 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 81 - 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, 82 - 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 83 - 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, 84 - 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 85 - 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, 86 - 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 87 - 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, 88 - 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 89 - 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, 90 - 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 91 - 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, 92 - 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 93 - 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d, 94 - }; 95 - 96 - static const uint32_t rcon[] = { 97 - 0x01000000, 0x02000000, 0x04000000, 0x08000000, 98 - 0x10000000, 0x20000000, 0x40000000, 0x80000000, 99 - 0x1B000000, 0x36000000, 100 - }; 101 - 102 - /* ------------------------------------------------------------------ */ 103 - /* Constant-time primitives */ 104 - /* ------------------------------------------------------------------ */ 17 + #include "bearssl/inner.h" 105 18 106 19 /* 107 - * Constant-time equality: returns 0xFFFFFFFF if a == b, 0 otherwise. 108 - * Uses the property that for any nonzero x in uint32_t, 109 - * (x | -x) has bit 31 set. 20 + * Key schedule format: BearSSL uses a compact key schedule (comp_skey) 21 + * stored as uint64_t[30]. The expanded key schedule (skey) is 22 + * (num_rounds+1)*8 uint64_t values, computed per-call on the stack. 23 + * 24 + * We store the compact key schedule in the OCaml-allocated round-key 25 + * buffer. The buffer size is (num_rounds+1)*16 bytes = comp_skey size 26 + * of (num_rounds+1)*2 uint64_t = at most 30 uint64_t = 240 bytes. 27 + * This matches the existing rk_size for AES-256 (240 bytes). 110 28 */ 111 - static inline uint32_t ct_eq(uint32_t a, uint32_t b) { 112 - uint32_t q = a ^ b; 113 - return ((q | (~q + 1)) >> 31) - 1; 114 - } 115 - 116 - /* 117 - * Constant-time S-box lookup: scans all 256 entries, selects the 118 - * matching one via arithmetic mask. No secret-dependent memory 119 - * access pattern. 120 - */ 121 - static inline uint8_t ct_sbox_lookup(const uint8_t table[256], uint8_t x) { 122 - uint32_t result = 0; 123 - for (uint32_t i = 0; i < 256; i++) { 124 - result |= table[i] & ct_eq((uint32_t)x, i); 125 - } 126 - return (uint8_t)result; 127 - } 128 - 129 - #define ct_sub(x) ct_sbox_lookup(sbox, (x)) 130 - #define ct_isub(x) ct_sbox_lookup(isbox, (x)) 131 - 132 - /* ------------------------------------------------------------------ */ 133 - /* GF(2^8) arithmetic for MixColumns (no tables) */ 134 - /* ------------------------------------------------------------------ */ 135 29 136 - /* Multiply by 2 in GF(2^8) with reduction polynomial 0x1b. 137 - * Branchless: the mask is 0x1b when bit 7 is set, 0 otherwise. */ 138 - static inline uint8_t xtime(uint8_t x) { 139 - return (uint8_t)((x << 1) ^ (0x1b & (uint8_t)(-(x >> 7)))); 140 - } 141 - 142 - /* GF(2^8) multiplications by fixed constants used in AES. 143 - * All built from xtime (multiply-by-2) and XOR. */ 144 - static inline uint8_t mul_02(uint8_t x) { return xtime(x); } 145 - static inline uint8_t mul_03(uint8_t x) { return xtime(x) ^ x; } 30 + #define keybits_of_r(x) (((x) - 6) * 32) 146 31 147 - static inline uint8_t mul_09(uint8_t x) { 148 - return xtime(xtime(xtime(x))) ^ x; 149 - } 150 - static inline uint8_t mul_0b(uint8_t x) { 151 - return xtime(xtime(xtime(x))) ^ xtime(x) ^ x; 152 - } 153 - static inline uint8_t mul_0d(uint8_t x) { 154 - return xtime(xtime(xtime(x))) ^ xtime(xtime(x)) ^ x; 155 - } 156 - static inline uint8_t mul_0e(uint8_t x) { 157 - return xtime(xtime(xtime(x))) ^ xtime(xtime(x)) ^ xtime(x); 32 + CAMLprim value 33 + mc_aes_rk_size_generic(value rounds) 34 + { 35 + /* BearSSL comp_skey: (num_rounds+1)*2 uint64_t */ 36 + return Val_int((Int_val(rounds) + 1) * 2 * sizeof(uint64_t)); 158 37 } 159 38 160 - /* ------------------------------------------------------------------ */ 161 - /* Byte-level helpers */ 162 - /* ------------------------------------------------------------------ */ 163 - 164 - #define GETU32(p) \ 165 - (((uint32_t)(p)[0] << 24) | ((uint32_t)(p)[1] << 16) | \ 166 - ((uint32_t)(p)[2] << 8) | ((uint32_t)(p)[3])) 167 - 168 - #define PUTU32(p, v) do { \ 169 - (p)[0] = (uint8_t)((v) >> 24); \ 170 - (p)[1] = (uint8_t)((v) >> 16); \ 171 - (p)[2] = (uint8_t)((v) >> 8); \ 172 - (p)[3] = (uint8_t)(v); \ 173 - } while (0) 174 - 175 - #define KEYLENGTH(keybits) ((keybits)/8) 176 - #define RKLENGTH(keybits) ((keybits)/8+28) 177 - #define NROUNDS(keybits) ((keybits)/32+6) 178 - 179 - /* ------------------------------------------------------------------ */ 180 - /* Key expansion (standard Rijndael schedule) */ 181 - /* */ 182 - /* Original: public domain, Philip J. Erdelsky. */ 183 - /* Modified to use constant-time S-box lookups. */ 184 - /* ------------------------------------------------------------------ */ 185 - 186 - /* 187 - * Constant-time SubWord for key expansion: applies S-box to each byte 188 - * of a 32-bit word, with RotWord (rotate left 8 bits) for the main 189 - * key schedule step. 190 - */ 191 - static inline uint32_t ct_subword_rot(uint32_t w) { 192 - return ((uint32_t)ct_sub((w >> 16) & 0xff) << 24) | 193 - ((uint32_t)ct_sub((w >> 8) & 0xff) << 16) | 194 - ((uint32_t)ct_sub((w ) & 0xff) << 8) | 195 - ((uint32_t)ct_sub((w >> 24) ) ); 196 - } 197 - 198 - static inline uint32_t ct_subword(uint32_t w) { 199 - return ((uint32_t)ct_sub((w >> 24) & 0xff) << 24) | 200 - ((uint32_t)ct_sub((w >> 16) & 0xff) << 16) | 201 - ((uint32_t)ct_sub((w >> 8) & 0xff) << 8) | 202 - ((uint32_t)ct_sub((w ) & 0xff) ); 39 + CAMLprim value 40 + mc_aes_derive_e_key_generic(value key, value rk, value rounds) 41 + { 42 + br_aes_ct64_keysched( 43 + (uint64_t *)Bp_val(rk), 44 + _st_uint8(key), 45 + (keybits_of_r(Int_val(rounds))) / 8); 46 + return Val_unit; 203 47 } 204 48 205 - static int mc_rijndaelSetupEncrypt(uint32_t *rk, const uint8_t *key, 206 - int keybits) { 207 - int i = 0; 208 - 209 - rk[0] = GETU32(key ); 210 - rk[1] = GETU32(key + 4); 211 - rk[2] = GETU32(key + 8); 212 - rk[3] = GETU32(key + 12); 213 - 214 - if (keybits == 128) { 215 - for (;;) { 216 - rk[4] = rk[0] ^ ct_subword_rot(rk[3]) ^ rcon[i]; 217 - rk[5] = rk[1] ^ rk[4]; 218 - rk[6] = rk[2] ^ rk[5]; 219 - rk[7] = rk[3] ^ rk[6]; 220 - if (++i == 10) return 10; 221 - rk += 4; 222 - } 223 - } 224 - 225 - rk[4] = GETU32(key + 16); 226 - rk[5] = GETU32(key + 20); 227 - 228 - if (keybits == 192) { 229 - for (;;) { 230 - rk[6] = rk[0] ^ ct_subword_rot(rk[5]) ^ rcon[i]; 231 - rk[7] = rk[1] ^ rk[6]; 232 - rk[8] = rk[2] ^ rk[7]; 233 - rk[9] = rk[3] ^ rk[8]; 234 - if (++i == 8) return 12; 235 - rk[10] = rk[4] ^ rk[9]; 236 - rk[11] = rk[5] ^ rk[10]; 237 - rk += 6; 238 - } 239 - } 240 - 241 - rk[6] = GETU32(key + 24); 242 - rk[7] = GETU32(key + 28); 243 - 244 - if (keybits == 256) { 245 - for (;;) { 246 - rk[8] = rk[0] ^ ct_subword_rot(rk[7]) ^ rcon[i]; 247 - rk[9] = rk[1] ^ rk[8]; 248 - rk[10] = rk[2] ^ rk[9]; 249 - rk[11] = rk[3] ^ rk[10]; 250 - if (++i == 7) return 14; 251 - rk[12] = rk[4] ^ ct_subword(rk[11]); 252 - rk[13] = rk[5] ^ rk[12]; 253 - rk[14] = rk[6] ^ rk[13]; 254 - rk[15] = rk[7] ^ rk[14]; 255 - rk += 8; 256 - } 257 - } 258 - return 0; 49 + CAMLprim value 50 + mc_aes_derive_d_key_generic(value key, value kr, value rounds, 51 + value __unused(rk)) 52 + { 53 + /* BearSSL ct64 uses the same key schedule for encrypt and decrypt */ 54 + br_aes_ct64_keysched( 55 + (uint64_t *)Bp_val(kr), 56 + _st_uint8(key), 57 + (keybits_of_r(Int_val(rounds))) / 8); 58 + return Val_unit; 259 59 } 260 60 261 61 /* 262 - * InvMixColumns on a single 32-bit column (4 bytes). 263 - * Uses explicit GF(2^8) multiplications, no tables. 62 + * Encrypt blocks using BearSSL aes_ct64. 63 + * 64 + * BearSSL processes up to 4 blocks in parallel (bitsliced in 8 x uint64_t). 65 + * For fewer blocks, zero-pad and discard the extra output. 264 66 */ 265 - static inline uint32_t inv_mix_column(uint32_t col) { 266 - uint8_t a = (col >> 24) & 0xff; 267 - uint8_t b = (col >> 16) & 0xff; 268 - uint8_t c = (col >> 8) & 0xff; 269 - uint8_t d = (col ) & 0xff; 270 - return ((uint32_t)(mul_0e(a) ^ mul_0b(b) ^ mul_0d(c) ^ mul_09(d)) << 24) | 271 - ((uint32_t)(mul_09(a) ^ mul_0e(b) ^ mul_0b(c) ^ mul_0d(d)) << 16) | 272 - ((uint32_t)(mul_0d(a) ^ mul_09(b) ^ mul_0e(c) ^ mul_0b(d)) << 8) | 273 - ((uint32_t)(mul_0b(a) ^ mul_0d(b) ^ mul_09(c) ^ mul_0e(d)) ); 274 - } 275 - 276 - static int mc_rijndaelSetupDecrypt(uint32_t *rk, const uint8_t *key, 277 - int keybits) { 278 - int nrounds, i, j; 279 - uint32_t temp; 280 - 281 - nrounds = mc_rijndaelSetupEncrypt(rk, key, keybits); 282 - 283 - /* Reverse the order of the round keys */ 284 - for (i = 0, j = 4 * nrounds; i < j; i += 4, j -= 4) { 285 - temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp; 286 - temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp; 287 - temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp; 288 - temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp; 289 - } 290 - 291 - /* Apply InvMixColumns to rounds 1 through nrounds-1 */ 292 - for (i = 1; i < nrounds; i++) { 293 - rk += 4; 294 - rk[0] = inv_mix_column(rk[0]); 295 - rk[1] = inv_mix_column(rk[1]); 296 - rk[2] = inv_mix_column(rk[2]); 297 - rk[3] = inv_mix_column(rk[3]); 298 - } 299 - return nrounds; 300 - } 301 - 302 - /* ------------------------------------------------------------------ */ 303 - /* Constant-time AES encrypt / decrypt (single block) */ 304 - /* ------------------------------------------------------------------ */ 305 - 306 - static void mc_rijndaelEncrypt(const uint32_t *rk, int nrounds, 307 - const uint8_t in[16], uint8_t out[16]) { 308 - uint8_t s[16], t[16]; 309 - 310 - /* AddRoundKey (initial) */ 311 - for (int i = 0; i < 16; i++) 312 - s[i] = in[i] ^ ((rk[i / 4] >> (24 - 8 * (i % 4))) & 0xff); 313 - rk += 4; 314 - 315 - for (int r = 1; r < nrounds; r++) { 316 - /* SubBytes */ 317 - for (int i = 0; i < 16; i++) 318 - t[i] = ct_sub(s[i]); 319 - 320 - /* ShiftRows */ 321 - s[0] = t[0]; s[1] = t[5]; s[2] = t[10]; s[3] = t[15]; 322 - s[4] = t[4]; s[5] = t[9]; s[6] = t[14]; s[7] = t[3]; 323 - s[8] = t[8]; s[9] = t[13]; s[10] = t[2]; s[11] = t[7]; 324 - s[12] = t[12]; s[13] = t[1]; s[14] = t[6]; s[15] = t[11]; 325 - 326 - /* MixColumns + AddRoundKey */ 327 - for (int c = 0; c < 4; c++) { 328 - int o = c * 4; 329 - uint8_t a = s[o], b = s[o+1], cc_ = s[o+2], d = s[o+3]; 330 - t[o ] = mul_02(a) ^ mul_03(b) ^ cc_ ^ d; 331 - t[o+1] = a ^ mul_02(b) ^ mul_03(cc_) ^ d; 332 - t[o+2] = a ^ b ^ mul_02(cc_) ^ mul_03(d); 333 - t[o+3] = mul_03(a) ^ b ^ cc_ ^ mul_02(d); 334 - /* AddRoundKey */ 335 - t[o ] ^= (rk[c] >> 24) & 0xff; 336 - t[o+1] ^= (rk[c] >> 16) & 0xff; 337 - t[o+2] ^= (rk[c] >> 8) & 0xff; 338 - t[o+3] ^= (rk[c] ) & 0xff; 339 - } 340 - memcpy(s, t, 16); 341 - rk += 4; 342 - } 343 - 344 - /* Final round: SubBytes + ShiftRows + AddRoundKey (no MixColumns) */ 345 - for (int i = 0; i < 16; i++) 346 - t[i] = ct_sub(s[i]); 347 - 348 - s[0] = t[0]; s[1] = t[5]; s[2] = t[10]; s[3] = t[15]; 349 - s[4] = t[4]; s[5] = t[9]; s[6] = t[14]; s[7] = t[3]; 350 - s[8] = t[8]; s[9] = t[13]; s[10] = t[2]; s[11] = t[7]; 351 - s[12] = t[12]; s[13] = t[1]; s[14] = t[6]; s[15] = t[11]; 67 + static void _mc_ct64_enc_blocks(const uint8_t *src, uint8_t *dst, 68 + const uint64_t *comp_skey, unsigned num_rounds, size_t blocks) 69 + { 70 + uint64_t sk_exp[120]; 352 71 353 - for (int i = 0; i < 16; i++) 354 - out[i] = s[i] ^ ((rk[i / 4] >> (24 - 8 * (i % 4))) & 0xff); 355 - } 72 + br_aes_ct64_skey_expand(sk_exp, num_rounds, comp_skey); 73 + while (blocks > 0) { 74 + uint64_t q[8]; 75 + uint32_t w[16]; 76 + unsigned b, j; 356 77 357 - static void mc_rijndaelDecrypt(const uint32_t *rk, int nrounds, 358 - const uint8_t in[16], uint8_t out[16]) { 359 - uint8_t s[16], t[16]; 360 - 361 - /* AddRoundKey (initial) */ 362 - for (int i = 0; i < 16; i++) 363 - s[i] = in[i] ^ ((rk[i / 4] >> (24 - 8 * (i % 4))) & 0xff); 364 - rk += 4; 365 - 366 - for (int r = 1; r < nrounds; r++) { 367 - /* InvSubBytes */ 368 - for (int i = 0; i < 16; i++) 369 - t[i] = ct_isub(s[i]); 370 - 371 - /* InvShiftRows */ 372 - s[0] = t[0]; s[1] = t[13]; s[2] = t[10]; s[3] = t[7]; 373 - s[4] = t[4]; s[5] = t[1]; s[6] = t[14]; s[7] = t[11]; 374 - s[8] = t[8]; s[9] = t[5]; s[10] = t[2]; s[11] = t[15]; 375 - s[12] = t[12]; s[13] = t[9]; s[14] = t[6]; s[15] = t[3]; 376 - 377 - /* InvMixColumns */ 378 - for (int c = 0; c < 4; c++) { 379 - int o = c * 4; 380 - uint8_t a = s[o], b = s[o+1], cc_ = s[o+2], d = s[o+3]; 381 - t[o ] = mul_0e(a) ^ mul_0b(b) ^ mul_0d(cc_) ^ mul_09(d); 382 - t[o+1] = mul_09(a) ^ mul_0e(b) ^ mul_0b(cc_) ^ mul_0d(d); 383 - t[o+2] = mul_0d(a) ^ mul_09(b) ^ mul_0e(cc_) ^ mul_0b(d); 384 - t[o+3] = mul_0b(a) ^ mul_0d(b) ^ mul_09(cc_) ^ mul_0e(d); 385 - } 386 - memcpy(s, t, 16); 387 - 388 - /* AddRoundKey */ 389 - for (int i = 0; i < 16; i++) 390 - s[i] ^= (rk[i / 4] >> (24 - 8 * (i % 4))) & 0xff; 391 - 392 - rk += 4; 393 - } 78 + b = blocks < 4 ? (unsigned)blocks : 4; 394 79 395 - /* Final round: InvSubBytes + InvShiftRows + AddRoundKey */ 396 - for (int i = 0; i < 16; i++) 397 - t[i] = ct_isub(s[i]); 80 + /* Load up to 4 blocks in little-endian 32-bit words */ 81 + for (j = 0; j < b; j++) { 82 + br_range_dec32le(w + (j << 2), 4, src + (j * 16)); 83 + } 84 + /* Zero-pad unused slots */ 85 + for (j = b; j < 4; j++) { 86 + memset(w + (j << 2), 0, 4 * sizeof(uint32_t)); 87 + } 398 88 399 - s[0] = t[0]; s[1] = t[13]; s[2] = t[10]; s[3] = t[7]; 400 - s[4] = t[4]; s[5] = t[1]; s[6] = t[14]; s[7] = t[11]; 401 - s[8] = t[8]; s[9] = t[5]; s[10] = t[2]; s[11] = t[15]; 402 - s[12] = t[12]; s[13] = t[9]; s[14] = t[6]; s[15] = t[3]; 89 + /* Interleave, ortho, encrypt, ortho, deinterleave */ 90 + for (j = 0; j < 4; j++) { 91 + br_aes_ct64_interleave_in(&q[j], &q[j + 4], w + (j << 2)); 92 + } 93 + br_aes_ct64_ortho(q); 94 + br_aes_ct64_bitslice_encrypt(num_rounds, sk_exp, q); 95 + br_aes_ct64_ortho(q); 96 + for (j = 0; j < 4; j++) { 97 + br_aes_ct64_interleave_out(w + (j << 2), q[j], q[j + 4]); 98 + } 403 99 404 - for (int i = 0; i < 16; i++) 405 - out[i] = s[i] ^ ((rk[i / 4] >> (24 - 8 * (i % 4))) & 0xff); 100 + /* Store only the blocks we actually have */ 101 + for (j = 0; j < b; j++) { 102 + br_range_enc32le(dst + (j * 16), w + (j << 2), 4); 103 + } 104 + src += b * 16; 105 + dst += b * 16; 106 + blocks -= b; 107 + } 406 108 } 407 109 408 - /* ------------------------------------------------------------------ */ 409 - /* OCaml front-end (unchanged signatures) */ 410 - /* ------------------------------------------------------------------ */ 110 + static void _mc_ct64_dec_blocks(const uint8_t *src, uint8_t *dst, 111 + const uint64_t *comp_skey, unsigned num_rounds, size_t blocks) 112 + { 113 + uint64_t sk_exp[120]; 411 114 412 - #define keybits_of_r(x) ((x - 6) * 32) 115 + br_aes_ct64_skey_expand(sk_exp, num_rounds, comp_skey); 116 + while (blocks > 0) { 117 + uint64_t q[8]; 118 + uint32_t w[16]; 119 + unsigned b, j; 413 120 414 - static inline void _mc_aes_enc_blocks(const uint8_t *src, uint8_t *dst, 415 - const uint32_t *rk, uint8_t rounds, 416 - size_t blocks) { 417 - while (blocks--) { 418 - mc_rijndaelEncrypt(rk, rounds, src, dst); 419 - src += 16; 420 - dst += 16; 421 - } 422 - } 121 + b = blocks < 4 ? (unsigned)blocks : 4; 423 122 424 - static inline void _mc_aes_dec_blocks(const uint8_t *src, uint8_t *dst, 425 - const uint32_t *rk, uint8_t rounds, 426 - size_t blocks) { 427 - while (blocks--) { 428 - mc_rijndaelDecrypt(rk, rounds, src, dst); 429 - src += 16; 430 - dst += 16; 431 - } 432 - } 433 - 434 - CAMLprim value 435 - mc_aes_rk_size_generic (value rounds) { 436 - return Val_int (RKLENGTH (keybits_of_r (Int_val (rounds))) * sizeof(uint32_t)); 437 - } 123 + for (j = 0; j < b; j++) { 124 + br_range_dec32le(w + (j << 2), 4, src + (j * 16)); 125 + } 126 + for (j = b; j < 4; j++) { 127 + memset(w + (j << 2), 0, 4 * sizeof(uint32_t)); 128 + } 438 129 439 - CAMLprim value 440 - mc_aes_derive_e_key_generic (value key, value rk, value rounds) { 441 - mc_rijndaelSetupEncrypt (_bp_uint32 (rk), 442 - _st_uint8 (key), 443 - keybits_of_r (Int_val (rounds))); 444 - return Val_unit; 445 - } 130 + for (j = 0; j < 4; j++) { 131 + br_aes_ct64_interleave_in(&q[j], &q[j + 4], w + (j << 2)); 132 + } 133 + br_aes_ct64_ortho(q); 134 + br_aes_ct64_bitslice_decrypt(num_rounds, sk_exp, q); 135 + br_aes_ct64_ortho(q); 136 + for (j = 0; j < 4; j++) { 137 + br_aes_ct64_interleave_out(w + (j << 2), q[j], q[j + 4]); 138 + } 446 139 447 - CAMLprim value 448 - mc_aes_derive_d_key_generic (value key, value kr, value rounds, value __unused (rk)) { 449 - mc_rijndaelSetupDecrypt (_bp_uint32 (kr), 450 - _st_uint8 (key), 451 - keybits_of_r (Int_val (rounds))); 452 - return Val_unit; 140 + for (j = 0; j < b; j++) { 141 + br_range_enc32le(dst + (j * 16), w + (j << 2), 4); 142 + } 143 + src += b * 16; 144 + dst += b * 16; 145 + blocks -= b; 146 + } 453 147 } 454 148 455 149 CAMLprim value 456 - mc_aes_enc_generic (value src, value off1, value dst, value off2, value rk, value rounds, value blocks) { 457 - _mc_aes_enc_blocks ( _st_uint8_off (src, off1), 458 - _bp_uint8_off (dst, off2), 459 - _st_uint32 (rk), 460 - Int_val (rounds), 461 - Int_val (blocks) ); 462 - return Val_unit; 150 + mc_aes_enc_generic(value src, value off1, value dst, value off2, 151 + value rk, value rounds, value blocks) 152 + { 153 + _mc_ct64_enc_blocks( 154 + _st_uint8_off(src, off1), 155 + _bp_uint8_off(dst, off2), 156 + (const uint64_t *)_st_uint8(rk), 157 + Int_val(rounds), 158 + Int_val(blocks)); 159 + return Val_unit; 463 160 } 464 161 465 162 CAMLprim value 466 - mc_aes_dec_generic (value src, value off1, value dst, value off2, value rk, value rounds, value blocks) { 467 - _mc_aes_dec_blocks ( _st_uint8_off(src, off1), 468 - _bp_uint8_off(dst, off2), 469 - _st_uint32 (rk), 470 - Int_val (rounds), 471 - Int_val (blocks) ); 472 - return Val_unit; 163 + mc_aes_dec_generic(value src, value off1, value dst, value off2, 164 + value rk, value rounds, value blocks) 165 + { 166 + _mc_ct64_dec_blocks( 167 + _st_uint8_off(src, off1), 168 + _bp_uint8_off(dst, off2), 169 + (const uint64_t *)_st_uint8(rk), 170 + Int_val(rounds), 171 + Int_val(blocks)); 172 + return Val_unit; 473 173 }

+8 -1

src/c/dune

··· 25 25 (names chacha_generic) 26 26 (flags 27 27 (:standard) 28 - (:include ../cflags.sexp)))) 28 + (:include ../cflags.sexp))) 29 + (foreign_stubs 30 + (language c) 31 + (names aes_ct64 aes_ct64_enc aes_ct64_dec ghash_ctmul64 dec32le enc32le) 32 + (flags 33 + (:standard) 34 + (:include ../cflags.sexp) 35 + -Wno-unused-function))) 29 36 30 37 (include_subdirs unqualified)

+32 -91

src/c/ghash_generic.c

··· 1 - /* Copyright (c) 2017 David Kaloper Meršinjak. All rights reserved. 2 - See LICENSE.md. */ 1 + /* 2 + * Constant-time GHASH using BearSSL ghash_ctmul64 (64-bit platforms). 3 + * 4 + * BearSSL source: https://bearssl.org/ 5 + * File: src/hash/ghash_ctmul64.c 6 + * License: MIT (Thomas Pornin) 7 + * 8 + * For 32-bit platforms and MSVC, ghash_ctmul.c (also BearSSL) 9 + * provides the mc_ghash_*_generic functions. 10 + */ 3 11 4 12 #include "crypto.h" 13 + #include "bearssl/inner.h" 5 14 #include <string.h> 6 15 7 - /* Generic table-driven GHASH. 8 - * 9 - * References: 10 - * - The Galois/Counter Mode of Operation. David A. McGrew and John Viega. 11 - * - NIST SP 800-38D. Recommendation for Block Cipher Modes of Operation: 12 - * Galois/Counter Mode (GCM) and GMAC. 13 - */ 14 - 15 - /* LARGE_TABLES -> 65K per key 16 - * !LARGE_TABLES -> 8K per key, ~3x slower. */ 17 - #define __MC_GHASH_LARGE_TABLES 18 - 19 - /* 64-bit Windows sets ARCH_64BIT but 128-bit integers are not supported 20 - * by the Microsoft compiler. Drop down to 32-bit for MSVC; 21 - * ghash_ctmul.c will implement ghash for MSVC. 22 - */ 23 16 #if defined(ARCH_64BIT) && !defined(_MSC_VER) 24 17 25 - #define __set_uint128_t(w1, w0) (((__uint128_t) w1 << 64) | w0) 26 - 27 - static const __uint128_t r = __set_uint128_t (0xe100000000000000, 0); 28 - 29 - static inline __uint128_t __load_128_t (const uint64_t s[2]) { 30 - return __set_uint128_t (be64_to_cpu (s[0]), be64_to_cpu (s[1])); 31 - } 32 - 33 - static inline __uint128_t __load_128_t_with_padding (const uint8_t *src, size_t n) { 34 - uint64_t buf[2] = { 0 }; 35 - memcpy (buf, src, n); 36 - return __load_128_t (buf); 37 - } 38 - 39 - static inline void __store_128_t (uint64_t s[2], __uint128_t x) { 40 - s[0] = cpu_to_be64 (x >> 64); 41 - s[1] = cpu_to_be64 (x); 42 - } 43 - 44 - #if defined (__MC_GHASH_LARGE_TABLES) 45 - #define __t_width 8 // coefficient window 46 - #define __t_tables 16 // 128 / t_width 47 - #define __t_size 4096 // 2^t_width * t_tables 48 - #else 49 - #define __t_width 4 50 - #define __t_tables 32 51 - #define __t_size 512 52 - #endif 53 - 54 - static inline __uint128_t __gfmul (__uint128_t a, __uint128_t b) { 55 - __uint128_t z = 0, 56 - v = a; 57 - for (int i = 0; i < 128; i ++) { 58 - if ((uint64_t) (b >> (127 - i)) & 1) 59 - z = z ^ v; 60 - v = (uint64_t) v & 1 ? (v >> 1) ^ r : v >> 1; 61 - } 62 - return z; 63 - } 64 - 65 - // NB Exponents are reversed. 66 - // TODO: Fast table derivation. 67 - static inline void __derive (uint64_t key[2], __uint128_t m[__t_size]) { 68 - __uint128_t e = 1 << (__t_width - 1), 69 - h = __load_128_t (key); 70 - for (int i = 0; i < __t_tables; i ++, e <<= __t_width) { 71 - __uint128_t exph = __gfmul (h, e); 72 - for (int j = 0; j < (1 << __t_width); j ++) 73 - m[(i << __t_width) | j] = __gfmul (exph, (__uint128_t) j << (128 - __t_width)); 74 - } 75 - } 76 - 77 - #define __t_mask ((1 << __t_width) - 1) 78 - static inline __uint128_t __gfmul_tab (__uint128_t m[__t_size], __uint128_t x) { 79 - __uint128_t r = 0; 80 - for (int i = 0; i < __t_tables; i ++) 81 - r ^= m[(i << __t_width) | ((uint8_t) (x >> (i * __t_width)) & __t_mask)]; 82 - return r; 83 - } 84 - 85 - static inline void __ghash (__uint128_t m[__t_size], uint64_t hash[2], const uint8_t *src, size_t n) { 86 - __uint128_t acc = __load_128_t (hash); 87 - for (; n >= 16; src += 16, n -= 16) 88 - acc = __gfmul_tab (m, acc ^ __load_128_t ((uint64_t *) src)); 89 - if (n > 0) 90 - acc = __gfmul_tab (m, acc ^ __load_128_t_with_padding (src, n)); 91 - __store_128_t (hash, acc); 92 - } 18 + /* 19 + * BearSSL's br_ghash_ctmul64 interface: 20 + * void br_ghash_ctmul64(void *y, const void *h, const void *data, size_t len) 21 + * 22 + * - y: 16-byte accumulator (in/out), big-endian 23 + * - h: 16-byte GHASH subkey, big-endian 24 + * - data: input data 25 + * - len: input length 26 + * 27 + * Our mc_ghash_*_generic interface stores the subkey as 16 raw bytes. 28 + */ 93 29 94 30 CAMLprim value mc_ghash_key_size_generic (__unit ()) { 95 - return Val_int (sizeof (__uint128_t) * __t_size); 31 + /* BearSSL ghash_ctmul64 uses H directly: 16 bytes */ 32 + return Val_int(16); 96 33 } 97 34 98 35 CAMLprim value mc_ghash_init_key_generic (value key, value m) { 99 - __derive ((uint64_t *) _st_uint8 (key), (__uint128_t *) Bp_val (m)); 36 + /* Store the 16-byte key as-is */ 37 + memcpy(Bp_val(m), _st_uint8(key), 16); 100 38 return Val_unit; 101 39 } 102 40 103 41 CAMLprim value 104 42 mc_ghash_generic (value m, value hash, value src, value off, value len) { 105 - __ghash ((__uint128_t *) Bp_val (m), (uint64_t *) Bp_val (hash), 106 - _st_uint8_off (src, off), Int_val (len) ); 43 + br_ghash_ctmul64( 44 + Bp_val(hash), 45 + Bp_val(m), 46 + _st_uint8_off(src, off), 47 + Int_val(len)); 107 48 return Val_unit; 108 49 } 109 50 110 - #endif /* ARCH_64BIT */ 51 + #endif /* ARCH_64BIT && !_MSC_VER */

+5 -159

src/c/ghash_pclmul.c

··· 189 189 #endif /* __mc_ACCELERATE__ */ 190 190 191 191 /* ------------------------------------------------------------------ */ 192 - /* ARM64 PMULL-based GHASH */ 193 - /* */ 194 - /* Direct transliteration of the x86 PCLMULQDQ code above, using */ 195 - /* ARM NEON + PMULL intrinsics. Same algorithm: reflected reduction */ 196 - /* with the polynomial 0xc200000000000000. */ 197 - /* */ 198 - /* Reference: Intel CLMUL white paper (Gueron & Kounavis), adapted */ 199 - /* to ARM PMULL intrinsics. Same pattern as OpenSSL ghashv8-armx.S. */ 200 - /* ------------------------------------------------------------------ */ 201 - 202 - #if defined(__mc_ARM64CE__) 203 - 204 - #include <string.h> 205 - 206 - /* Use uint8x16_t as the 128-bit register type (matches ARM NEON) */ 207 - typedef uint8x16_t v128; 208 - 209 - #define nxor(a, b) veorq_u8((a), (b)) 210 - 211 - static inline v128 __neon_reverse (v128 x) { 212 - static const uint8_t idx[16] = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}; 213 - return vqtbl1q_u8(x, vld1q_u8(idx)); 214 - } 215 - 216 - static inline uint64x2_t __to_u64 (v128 x) { return vreinterpretq_u64_u8(x); } 217 - static inline v128 __from_u64 (uint64x2_t x) { return vreinterpretq_u8_u64(x); } 218 - 219 - /* Swap the two 64-bit halves */ 220 - static inline v128 __neon_swap64 (v128 x) { 221 - return __from_u64(vextq_u64(__to_u64(x), __to_u64(x), 1)); 222 - } 223 - 224 - /* Shift entire 128-bit value left by 1 bit */ 225 - static inline v128 __neon_shiftl1 (v128 x) { 226 - uint64x2_t u = __to_u64(x); 227 - uint64x2_t hi = vshlq_n_u64(u, 1); 228 - uint64x2_t carry = vshrq_n_u64(u, 63); 229 - /* carry from low to high: shift carry left by one 64-bit lane */ 230 - uint64x2_t zero = vdupq_n_u64(0); 231 - uint64x2_t c = vextq_u64(zero, carry, 1); 232 - return __from_u64(vorrq_u64(hi, c)); 233 - } 234 - 235 - /* 236 - * 128-bit carry-less multiply: a * b = (r1:r0) 237 - * Same decomposition as the x86 __clmul_128. 238 - */ 239 - static inline void __neon_clmul_128 (v128 *r1, v128 *r0, v128 a, v128 b) { 240 - uint64x2_t au = __to_u64(a), bu = __to_u64(b); 241 - uint64_t a0 = vgetq_lane_u64(au, 0), a1 = vgetq_lane_u64(au, 1); 242 - uint64_t b0 = vgetq_lane_u64(bu, 0), b1 = vgetq_lane_u64(bu, 1); 243 - 244 - /* w0 = a0*b0, w1 = a1*b1, t = a0*b1 + a1*b0 */ 245 - poly128_t w0p = vmull_p64((poly64_t)a0, (poly64_t)b0); 246 - poly128_t w1p = vmull_p64((poly64_t)a1, (poly64_t)b1); 247 - poly128_t t1p = vmull_p64((poly64_t)a0, (poly64_t)b1); 248 - poly128_t t2p = vmull_p64((poly64_t)a1, (poly64_t)b0); 249 - 250 - v128 w0 = vreinterpretq_u8_p128(w0p); 251 - v128 w1 = vreinterpretq_u8_p128(w1p); 252 - v128 t = nxor(vreinterpretq_u8_p128(t1p), vreinterpretq_u8_p128(t2p)); 253 - 254 - /* t_lo goes to r0_hi, t_hi goes to r1_lo */ 255 - uint64x2_t tu = __to_u64(t); 256 - uint64_t tlo = vgetq_lane_u64(tu, 0), thi = vgetq_lane_u64(tu, 1); 257 - v128 t_shift_left = __from_u64(vcombine_u64(vcreate_u64(0), vcreate_u64(tlo))); 258 - v128 t_shift_right = __from_u64(vcombine_u64(vcreate_u64(thi), vcreate_u64(0))); 259 - 260 - *r0 = nxor(w0, t_shift_left); 261 - *r1 = nxor(w1, t_shift_right); 262 - } 263 - 264 - /* 265 - * Reflected reduction: same algorithm as the x86 __reduce_g. 266 - * Uses PMULL with the reduction polynomial 0xc200000000000000. 267 - */ 268 - static inline v128 __neon_reduce_g (v128 w1, v128 w0) { 269 - /* Shift 256-bit product left by 1 (reflected representation) */ 270 - uint64x2_t w0u = __to_u64(w0); 271 - uint64_t w0_hi_bit = vgetq_lane_u64(vshrq_n_u64(w0u, 63), 1); 272 - w0 = __neon_shiftl1(w0); 273 - w1 = __from_u64(vorrq_u64(__to_u64(__neon_shiftl1(w1)), 274 - vcombine_u64(vcreate_u64(w0_hi_bit), vcreate_u64(0)))); 275 - 276 - /* Two-phase reduction with polynomial 0xc200000000000000 */ 277 - uint64_t w0_lo = vgetq_lane_u64(__to_u64(w0), 0); 278 - poly128_t rp = vmull_p64((poly64_t)w0_lo, (poly64_t)0xc200000000000000ULL); 279 - w0 = nxor(__neon_swap64(w0), vreinterpretq_u8_p128(rp)); 280 - 281 - uint64_t w0_lo2 = vgetq_lane_u64(__to_u64(w0), 0); 282 - poly128_t rp2 = vmull_p64((poly64_t)w0_lo2, (poly64_t)0xc200000000000000ULL); 283 - w0 = nxor(__neon_swap64(w0), vreinterpretq_u8_p128(rp2)); 284 - 285 - return nxor(w1, w0); 286 - } 287 - 288 - static inline v128 __neon_gfmul (v128 a, v128 b) { 289 - v128 w1, w0; 290 - __neon_clmul_128(&w1, &w0, a, b); 291 - return __neon_reduce_g(w1, w0); 292 - } 293 - 294 - /* Same __repr_xform as x86 reflected path: byte-reverse */ 295 - #define __neon_repr_xform __neon_reverse 296 - 297 - static inline v128 __neon_load_xform (const uint8_t *p) { 298 - return __neon_repr_xform(vld1q_u8(p)); 299 - } 300 - 301 - #define __arm_keys 1 302 - 303 - static inline void __arm_derive (const uint8_t *key, v128 *m) { 304 - /* H = GHASH subkey, compute H^1 in reflected representation */ 305 - v128 k = __neon_load_xform(key); 306 - /* acc starts at x^1 in reflected = 0x80...0 reversed */ 307 - uint8_t init[16] = {0}; 308 - init[0] = 0x80; 309 - v128 acc = __neon_repr_xform(vld1q_u8(init)); 310 - m[0] = __neon_gfmul(acc, k); 311 - } 312 - 313 - static inline void __arm_ghash (v128 *m, v128 *hash, 314 - const uint8_t *src, size_t len) { 315 - v128 k = m[0]; 316 - v128 acc = __neon_load_xform((const uint8_t *)hash); 317 - 318 - while (len >= 16) { 319 - acc = __neon_gfmul(k, nxor(acc, __neon_load_xform(src))); 320 - src += 16; 321 - len -= 16; 322 - } 323 - if (len > 0) { 324 - uint8_t tmp[16] = {0}; 325 - memcpy(tmp, src, len); 326 - acc = __neon_gfmul(k, nxor(acc, __neon_load_xform(tmp))); 327 - } 328 - vst1q_u8((uint8_t *)hash, __neon_repr_xform(acc)); 329 - } 330 - 331 - #endif /* __mc_ARM64CE__ */ 332 - 333 - /* ------------------------------------------------------------------ */ 334 192 /* Dispatch */ 193 + /* */ 194 + /* On x86: PCLMULQDQ accelerated path above. */ 195 + /* On ARM64 and other platforms: uses BearSSL ghash_ctmul64 via */ 196 + /* mc_ghash_generic (constant-time, no hardware intrinsics). */ 197 + /* ARM64 PMULL acceleration: TODO (needs audited implementation). */ 335 198 /* ------------------------------------------------------------------ */ 336 199 337 200 CAMLprim value mc_ghash_key_size (__unit ()) { ··· 340 203 _mc_switch_accel(pclmul, 341 204 s = mc_ghash_key_size_generic(Val_unit), 342 205 s = Val_int (__keys * 16)) 343 - #elif defined(__mc_ARM64CE__) 344 - _mc_switch_accel(arm_pmull, 345 - s = mc_ghash_key_size_generic(Val_unit), 346 - s = Val_int (__arm_keys * 16)) 347 206 #else 348 207 s = mc_ghash_key_size_generic(Val_unit); 349 208 #endif ··· 355 214 _mc_switch_accel(pclmul, 356 215 mc_ghash_init_key_generic(key, m), 357 216 __derive ((__m128i *) _st_uint8 (key), (__m128i *) Bp_val (m))) 358 - #elif defined(__mc_ARM64CE__) 359 - _mc_switch_accel(arm_pmull, 360 - mc_ghash_init_key_generic(key, m), 361 - __arm_derive (_st_uint8 (key), (v128 *) Bp_val (m))) 362 217 #else 363 218 mc_ghash_init_key_generic(key, m); 364 219 #endif ··· 372 227 mc_ghash_generic(k, hash, src, off, len), 373 228 __ghash ( (__m128i *) Bp_val (k), (__m128i *) Bp_val (hash), 374 229 (__m128i *) _st_uint8_off (src, off), Int_val (len) )) 375 - #elif defined(__mc_ARM64CE__) 376 - _mc_switch_accel(arm_pmull, 377 - mc_ghash_generic(k, hash, src, off, len), 378 - __arm_ghash ( (v128 *) Bp_val (k), (v128 *) Bp_val (hash), 379 - _st_uint8_off (src, off), Int_val (len) )) 380 230 #else 381 231 mc_ghash_generic(k, hash, src, off, len); 382 232 #endif ··· 387 237 value enabled = 0; 388 238 #if defined(__mc_ACCELERATE__) 389 239 _mc_switch_accel(pclmul, 390 - enabled = 0, 391 - enabled = 1) 392 - #elif defined(__mc_ARM64CE__) 393 - _mc_switch_accel(arm_pmull, 394 240 enabled = 0, 395 241 enabled = 1) 396 242 #endif

+8 -9

src/crypto.mli

··· 382 382 last block of [ciphertext]. Note that 383 383 384 384 {[ 385 - encrypt ~iv msg1 386 - || encrypt ~iv:(next_iv ~iv (encrypt ~iv msg1)) msg2 387 - == encrypt ~iv (msg1 || msg2) 385 + encrypt ~iv msg1 386 + || encrypt ~iv:(next_iv ~iv (encrypt ~iv msg1)) msg2 387 + == encrypt ~iv (msg1 || msg2) 388 388 ]} 389 389 390 390 @raise Invalid_argument if the length of [iv] is not [block_size]. ··· 513 513 if [len msg1 = k * block_size], 514 514 515 515 {[ 516 - encrypt ~ctr msg1 517 - || encrypt ~ctr:(next_ctr ~ctr msg1) msg2 518 - == encrypt ~ctr (msg1 || msg2) 516 + encrypt ~ctr msg1 517 + || encrypt ~ctr:(next_ctr ~ctr msg1) msg2 == encrypt ~ctr (msg1 || msg2) 519 518 ]} *) 520 519 521 520 val ctr_of_octets : string -> ctr ··· 532 531 Note that 533 532 534 533 {[ 535 - stream ~key ~ctr (k * block_size) 536 - || stream ~key ~ctr:(add ctr k) x 537 - == stream ~key ~ctr ((k * block_size) + x) 534 + stream ~key ~ctr (k * block_size) 535 + || stream ~key ~ctr:(add ctr k) x 536 + == stream ~key ~ctr ((k * block_size) + x) 538 537 ]} 539 538 540 539 In other words, it is possible to restart a keystream at [block_size]

+18 -18

test/ec/wycheproof/wycheproof.ml

··· 428 428 let perform_key_exchange curve ~public_key ~raw_private_key = 429 429 to_string_result ~pp_error 430 430 (match curve with 431 - | "secp256r1" -> begin 432 - match P256.Dh.secret_of_octets raw_private_key with 431 + | "secp256r1" -> 432 + begin match P256.Dh.secret_of_octets raw_private_key with 433 433 | Ok (p, _) -> P256.Dh.key_exchange p public_key 434 434 | Error _ -> assert false 435 - end 436 - | "secp384r1" -> begin 437 - match P384.Dh.secret_of_octets raw_private_key with 435 + end 436 + | "secp384r1" -> 437 + begin match P384.Dh.secret_of_octets raw_private_key with 438 438 | Ok (p, _) -> P384.Dh.key_exchange p public_key 439 439 | Error _ -> assert false 440 - end 441 - | "secp521r1" -> begin 442 - match P521.Dh.secret_of_octets raw_private_key with 440 + end 441 + | "secp521r1" -> 442 + begin match P521.Dh.secret_of_octets raw_private_key with 443 443 | Ok (p, _) -> P521.Dh.key_exchange p public_key 444 444 | Error _ -> assert false 445 - end 445 + end 446 446 | _ -> assert false) 447 447 448 448 let interpret_test ~tc_id curve { public_key; raw_private_key; expected } () = ··· 518 518 in 519 519 let verified (r, s) = 520 520 match curve with 521 - | "secp256r1" -> begin 522 - match P256.Dsa.pub_of_octets key with 521 + | "secp256r1" -> 522 + begin match P256.Dsa.pub_of_octets key with 523 523 | Ok key -> P256.Dsa.verify ~key (r, s) msg 524 524 | Error _ -> assert false 525 - end 526 - | "secp384r1" -> begin 527 - match P384.Dsa.pub_of_octets key with 525 + end 526 + | "secp384r1" -> 527 + begin match P384.Dsa.pub_of_octets key with 528 528 | Ok key -> P384.Dsa.verify ~key (r, s) msg 529 529 | Error _ -> assert false 530 - end 531 - | "secp521r1" -> begin 532 - match P521.Dsa.pub_of_octets key with 530 + end 531 + | "secp521r1" -> 532 + begin match P521.Dsa.pub_of_octets key with 533 533 | Ok key -> P521.Dsa.verify ~key (r, s) msg 534 534 | Error _ -> assert false 535 - end 535 + end 536 536 | _ -> assert false 537 537 in 538 538 match tst.result with

Configure Feed

Configure Feed