crypto: fix portability and security bugs in C primitives · gazagnaire.org/ocaml-crypto@f181e63

+17 -1

config/cfg.ml

··· 43 43 | (`x86_64 | `x86), _ -> [ "-DENTROPY"; "-mrdrnd"; "-mrdseed" ] 44 44 | _ -> [] 45 45 in 46 + let hardening_flags = 47 + (* Hardening flags appropriate for cryptographic C code: 48 + -fstack-protector-strong -- canary on functions with stack arrays; 49 + catches stack-smashing in key derivation. 50 + -D_FORTIFY_SOURCE=2 -- compile-time and runtime overflow checks 51 + for memcpy/memset/strcpy and friends. 52 + Requires -O1 or higher (we use -O3). 53 + Not enabled here: 54 + -fPIE / -pie -- conflicts with how dune links foreign stubs 55 + on some targets; relies on the OCaml link 56 + step to set position-independence. *) 57 + match ccomp_type_opt with 58 + | Some "msvc" -> [] 59 + | _ -> [ "-fstack-protector-strong"; "-D_FORTIFY_SOURCE=2" ] 60 + in 46 61 let std_flags = 47 62 match ccomp_type_opt with 48 63 | Some "msvc" -> [ "/Wall" ] 49 - | _ -> [ "--std=c11"; "-Wall"; "-Wextra"; "-Wpedantic"; "-O3" ] 64 + | _ -> 65 + [ "--std=c11"; "-Wall"; "-Wextra"; "-Wpedantic"; "-O3" ] @ hardening_flags 50 66 in 51 67 let warn_flags = 52 68 (* See #178, there may be false positives on ppc&s390 with no-stringop-overflow *)

+14 -6

src/c/aes_aesni.c

··· 142 142 for (i = 0; i <= rounds; i++) { 143 143 _mm_storeu_si128((__m128i*) rk0 + i, rk[i]); 144 144 } 145 + 146 + /* The schedule[] stack array holds derived round-key material. 147 + * Wipe it before return to avoid leaking through stack residue 148 + * (core dumps, swap, transient execution, etc.). */ 149 + mc_secure_bzero(schedule, sizeof schedule); 145 150 } 146 151 147 152 static inline void _mc_aesni_invert_e_key (const uint8_t *rk1, uint8_t *kr0, uint8_t rounds) { ··· 157 162 _mm_storeu_si128((__m128i*) kr0 + i, _mm_aesimc_si128 (rk[rounds - i])); 158 163 159 164 _mm_storeu_si128((__m128i*) kr0 + rounds, rk[0]); 165 + 166 + /* Wipe the temporary round-key copy before return. */ 167 + mc_secure_bzero(rk, sizeof rk); 160 168 } 161 169 162 170 static void _mc_aesni_derive_d_key (const uint8_t *key, uint8_t *kr, uint8_t rounds, uint8_t *rk) { ··· 367 375 value s; 368 376 _mc_switch_accel(aesni, 369 377 s = mc_aes_rk_size_generic(rounds), 370 - s = Val_int (_mc_aesni_rk_size (Int_val (rounds)))) 378 + s = Val_int (_mc_aesni_rk_size (Int_val (rounds)))); 371 379 return s; 372 380 } 373 381 ··· 377 385 mc_aes_derive_e_key_generic(key, rk, rounds), 378 386 _mc_aesni_derive_e_key (_st_uint8 (key), 379 387 _bp_uint8 (rk), 380 - Int_val (rounds))) 388 + Int_val (rounds))); 381 389 return Val_unit; 382 390 } 383 391 ··· 388 396 _mc_aesni_derive_d_key (_st_uint8 (key), 389 397 _bp_uint8 (kr), 390 398 Int_val (rounds), 391 - Is_block(rk) ? _bp_uint8(Field(rk, 0)) : 0)) 399 + Is_block(rk) ? _bp_uint8(Field(rk, 0)) : 0)); 392 400 return Val_unit; 393 401 } 394 402 ··· 400 408 _bp_uint8_off (dst, off2), 401 409 _st_uint8 (rk), 402 410 Int_val (rounds), 403 - Int_val (blocks) )) 411 + Int_val (blocks) )); 404 412 return Val_unit; 405 413 } 406 414 ··· 412 420 _bp_uint8_off (dst, off2), 413 421 _st_uint8 (rk), 414 422 Int_val (rounds), 415 - Int_val (blocks) )) 423 + Int_val (blocks) )); 416 424 return Val_unit; 417 425 } 418 426 ··· 420 428 value enabled = 0; 421 429 _mc_switch_accel(aesni, 422 430 enabled = 0, 423 - enabled = 1) 431 + enabled = 1); 424 432 return Val_int (enabled); 425 433 } 426 434

+17 -4

src/c/aes_generic.c

··· 63 63 * 64 64 * BearSSL processes up to 4 blocks in parallel (bitsliced in 8 x uint64_t). 65 65 * For fewer blocks, zero-pad and discard the extra output. 66 + * 67 + * The expanded key schedule (sk_exp), the bitsliced state (q), and 68 + * the per-block working buffer (w) all hold key-derived material on 69 + * the stack and are wiped before return to avoid leaking through 70 + * stack residue (core dumps, transient execution, etc.). 66 71 */ 67 72 static void _mc_ct64_enc_blocks(const uint8_t *src, uint8_t *dst, 68 73 const uint64_t *comp_skey, unsigned num_rounds, size_t blocks) 69 74 { 70 75 uint64_t sk_exp[120]; 76 + uint64_t q[8]; 77 + uint32_t w[16]; 71 78 72 79 br_aes_ct64_skey_expand(sk_exp, num_rounds, comp_skey); 73 80 while (blocks > 0) { 74 - uint64_t q[8]; 75 - uint32_t w[16]; 76 81 unsigned b, j; 77 82 78 83 b = blocks < 4 ? (unsigned)blocks : 4; ··· 105 110 dst += b * 16; 106 111 blocks -= b; 107 112 } 113 + 114 + mc_secure_bzero(sk_exp, sizeof sk_exp); 115 + mc_secure_bzero(q, sizeof q); 116 + mc_secure_bzero(w, sizeof w); 108 117 } 109 118 110 119 static void _mc_ct64_dec_blocks(const uint8_t *src, uint8_t *dst, 111 120 const uint64_t *comp_skey, unsigned num_rounds, size_t blocks) 112 121 { 113 122 uint64_t sk_exp[120]; 123 + uint64_t q[8]; 124 + uint32_t w[16]; 114 125 115 126 br_aes_ct64_skey_expand(sk_exp, num_rounds, comp_skey); 116 127 while (blocks > 0) { 117 - uint64_t q[8]; 118 - uint32_t w[16]; 119 128 unsigned b, j; 120 129 121 130 b = blocks < 4 ? (unsigned)blocks : 4; ··· 144 153 dst += b * 16; 145 154 blocks -= b; 146 155 } 156 + 157 + mc_secure_bzero(sk_exp, sizeof sk_exp); 158 + mc_secure_bzero(q, sizeof q); 159 + mc_secure_bzero(w, sizeof w); 147 160 } 148 161 149 162 CAMLprim value

+16 -42

src/c/bitfn.h

··· 26 26 #define BITFN_H 27 27 #include <stdint.h> 28 28 29 - # if (defined(__i386__)) 30 - # define ARCH_HAS_SWAP32 31 - static inline uint32_t bitfn_swap32(uint32_t a) 32 - { 33 - __asm__ ("bswap %0" : "=r" (a) : "0" (a)); 34 - return a; 35 - } 36 - /**********************************************************/ 37 - # elif (defined(__arm__)) 38 - # define ARCH_HAS_SWAP32 39 - static inline uint32_t bitfn_swap32(uint32_t a) 40 - { 41 - uint32_t tmp = a; 42 - __asm__ volatile ("eor %1, %0, %0, ror #16\n" 43 - "bic %1, %1, #0xff0000\n" 44 - "mov %0, %0, ror #8\n" 45 - "eor %0, %0, %1, lsr #8\n" 46 - : "=r" (a), "=r" (tmp) : "0" (a), "1" (tmp)); 47 - return a; 48 - } 49 - /**********************************************************/ 50 - # elif defined(__x86_64__) 51 - # define ARCH_HAS_SWAP32 52 - # define ARCH_HAS_SWAP64 53 - static inline uint32_t bitfn_swap32(uint32_t a) 54 - { 55 - __asm__ ("bswap %0" : "=r" (a) : "0" (a)); 56 - return a; 57 - } 58 - 59 - static inline uint64_t bitfn_swap64(uint64_t a) 60 - { 61 - __asm__ ("bswap %0" : "=r" (a) : "0" (a)); 62 - return a; 63 - } 64 - 65 - # endif 66 - 67 - #ifndef ARCH_HAS_SWAP32 29 + /* Byte swapping. 30 + * 31 + * Use the compiler built-ins on GCC/Clang -- they emit the architecture's 32 + * native byte-swap instruction (bswap on x86, rev on ARMv6+, lwbrx on 33 + * POWER, revb.d on RISC-V Zbb, etc.) and are portable across every 34 + * target the compiler supports. MSVC has its own intrinsic. The 35 + * portable C fallback is for any compiler that supports neither, which 36 + * in practice means: nothing we ship on. */ 37 + #if defined(__GNUC__) || defined(__clang__) 38 + # define bitfn_swap32(a) __builtin_bswap32(a) 39 + # define bitfn_swap64(a) __builtin_bswap64(a) 40 + #elif defined(_MSC_VER) 41 + # include <stdlib.h> 42 + # define bitfn_swap32(a) _byteswap_ulong(a) 43 + # define bitfn_swap64(a) _byteswap_uint64(a) 44 + #else 68 45 static inline uint32_t bitfn_swap32(uint32_t a) 69 46 { 70 47 return (a << 24) | ((a & 0xff00) << 8) | ((a >> 8) & 0xff00) | (a >> 24); 71 48 } 72 - #endif 73 - 74 - #ifndef ARCH_HAS_SWAP64 75 49 static inline uint64_t bitfn_swap64(uint64_t a) 76 50 { 77 51 return ((uint64_t) bitfn_swap32((uint32_t) (a >> 32))) |

+25 -7

src/c/crypto.h

··· 16 16 # define __mc_ACCELERATE__ 17 17 # elif defined(__aarch64__) 18 18 # include <arm_neon.h> 19 - # define __mc_ARM64CE__ 19 + # define __mc_ARM64NEON__ 20 20 # endif 21 21 #define __mc_detect_features__ 22 22 #endif ··· 35 35 int ssse3; 36 36 int rdrand; 37 37 int rdseed; 38 - /* ARM64 Cryptography Extensions */ 39 - int arm_aes; 40 - int arm_pmull; 41 38 }; 42 39 43 40 /* Supported accelerations */ ··· 48 45 #ifdef __mc_ACCELERATE__ 49 46 50 47 #define _mc_switch_accel(FEATURE, GENERIC_CALL, ACCELERATED_CALL) \ 51 - if (!(mc_detected_cpu_features.FEATURE)) { GENERIC_CALL; } \ 52 - else { ACCELERATED_CALL; } 48 + do { \ 49 + if (!(mc_detected_cpu_features.FEATURE)) { GENERIC_CALL; } \ 50 + else { ACCELERATED_CALL; } \ 51 + } while (0) 53 52 54 53 #else /* __mc_ACCELERATE__ */ 55 54 56 55 #define _mc_switch_accel(_FEATURE, GENERIC_CALL, _ACCELERATED_CALL) \ 57 - GENERIC_CALL; 56 + do { GENERIC_CALL; } while (0) 58 57 59 58 #endif /* __mc_ACCELERATE__ */ 59 + 60 + /* mc_secure_bzero -- optimization-resistant memory zeroization for 61 + * cryptographic key material on the stack. Plain memset() can be 62 + * elided by the compiler if the buffer is not read afterwards; the 63 + * asm memory barrier prevents that. 64 + * 65 + * INVARIANT for callers: ocaml-crypto C primitives must not yield 66 + * to the OCaml runtime (no caml_alloc, no caml_callback) so that 67 + * String_val/Bp_val pointers remain valid for the duration of the 68 + * call. This is currently respected by every CAMLprim in src/c/. */ 69 + static inline void mc_secure_bzero(void *p, size_t n) { 70 + #if defined(__GNUC__) || defined(__clang__) 71 + memset(p, 0, n); 72 + __asm__ __volatile__ ("" : : "r"(p) : "memory"); 73 + #else 74 + volatile unsigned char *q = (volatile unsigned char *)p; 75 + while (n--) *q++ = 0; 76 + #endif 77 + } 60 78 61 79 #if defined (__x86_64__) || defined (__aarch64__) || defined (__powerpc64__) || defined (__ppc64__) || (64 == __riscv_xlen) || defined (__s390x__) || (defined (__mips__) && _MIPS_SIM==_ABI64) || defined (__loongarch_lp64) || (1 == _WIN64) 62 80 #define ARCH_64BIT

-44

src/c/detect_cpu_features.c

··· 8 8 #endif 9 9 #endif 10 10 11 - #if defined(__aarch64__) && defined(__linux__) 12 - # include <sys/auxval.h> 13 - # ifndef HWCAP_AES 14 - # define HWCAP_AES (1 << 3) 15 - # endif 16 - # ifndef HWCAP_PMULL 17 - # define HWCAP_PMULL (1 << 4) 18 - # endif 19 - #elif defined(__aarch64__) && defined(__FreeBSD__) 20 - # include <sys/auxval.h> 21 - #endif 22 - 23 11 struct _mc_cpu_features mc_detected_cpu_features = { 0 }; 24 12 25 13 #if defined(_MSC_VER) ··· 87 75 mc_detected_cpu_features.rdseed = 1; 88 76 } 89 77 90 - return Val_unit; 91 - } 92 - 93 - #elif defined(__aarch64__) 94 - 95 - /* 96 - * ARM64 Cryptography Extensions detection. 97 - * 98 - * On Linux: getauxval(AT_HWCAP) returns HWCAP_AES and HWCAP_PMULL bits. 99 - * On macOS/iOS: Apple Silicon always has crypto extensions. 100 - * On FreeBSD: elf_aux_info(AT_HWCAP, ...) provides the same. 101 - */ 102 - CAMLprim value 103 - mc_detect_cpu_features (__unit ()) { 104 - #if defined(__APPLE__) 105 - /* Apple Silicon always supports AES and PMULL */ 106 - mc_detected_cpu_features.arm_aes = 1; 107 - mc_detected_cpu_features.arm_pmull = 1; 108 - #elif defined(__linux__) 109 - unsigned long hwcap = getauxval(AT_HWCAP); 110 - if (hwcap & HWCAP_AES) 111 - mc_detected_cpu_features.arm_aes = 1; 112 - if (hwcap & HWCAP_PMULL) 113 - mc_detected_cpu_features.arm_pmull = 1; 114 - #elif defined(__FreeBSD__) 115 - unsigned long hwcap = 0; 116 - elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); 117 - if (hwcap & HWCAP_AES) 118 - mc_detected_cpu_features.arm_aes = 1; 119 - if (hwcap & HWCAP_PMULL) 120 - mc_detected_cpu_features.arm_pmull = 1; 121 - #endif 122 78 return Val_unit; 123 79 } 124 80

+4 -4

src/c/ghash_pclmul.c

··· 192 192 value s; 193 193 _mc_switch_accel(pclmul, 194 194 s = mc_ghash_key_size_generic(Val_unit), 195 - s = Val_int (__keys * 16)) 195 + s = Val_int (__keys * 16)); 196 196 return s; 197 197 } 198 198 199 199 CAMLprim value mc_ghash_init_key (value key, value m) { 200 200 _mc_switch_accel(pclmul, 201 201 mc_ghash_init_key_generic(key, m), 202 - __derive ((__m128i *) _st_uint8 (key), (__m128i *) Bp_val (m))) 202 + __derive ((__m128i *) _st_uint8 (key), (__m128i *) Bp_val (m))); 203 203 return Val_unit; 204 204 } 205 205 ··· 208 208 _mc_switch_accel(pclmul, 209 209 mc_ghash_generic(k, hash, src, off, len), 210 210 __ghash ( (__m128i *) Bp_val (k), (__m128i *) Bp_val (hash), 211 - (__m128i *) _st_uint8_off (src, off), Int_val (len) )) 211 + (__m128i *) _st_uint8_off (src, off), Int_val (len) )); 212 212 return Val_unit; 213 213 } 214 214 ··· 216 216 value enabled = 0; 217 217 _mc_switch_accel(pclmul, 218 218 enabled = 0, 219 - enabled = 1) 219 + enabled = 1); 220 220 return Val_int (enabled); 221 221 }

+33 -14

src/c/misc.c

··· 1 1 #include "crypto.h" 2 2 3 + /* xor_into -- alignment-safe XOR of n bytes from src into dst. 4 + * 5 + * Both src and dst may be unaligned. We use memcpy on both sides 6 + * (not just the source, as previously) so the loop is well-defined 7 + * on architectures that trap unaligned accesses (SPARC, MIPS without 8 + * fixup, strict ARM, etc.). On x86 and other tolerant architectures 9 + * GCC and Clang elide the memcpy into a single load/store. */ 3 10 static inline void xor_into (const uint8_t *src, uint8_t *dst, size_t n) { 4 - /* see issue #70 #81 for alignment considerations (memcpy used below) */ 5 11 #ifdef ARCH_64BIT 6 - uint64_t s; 7 - for (; n >= 8; n -= 8, src += 8, dst += 8) 8 - *(uint64_t*) dst ^= *(uint64_t*)memcpy(&s, src, 8); 12 + uint64_t s, d; 13 + for (; n >= 8; n -= 8, src += 8, dst += 8) { 14 + memcpy(&d, dst, 8); 15 + memcpy(&s, src, 8); 16 + d ^= s; 17 + memcpy(dst, &d, 8); 18 + } 9 19 #endif 10 20 11 - uint32_t t; 12 - for (; n >= 4; n -= 4, src += 4, dst += 4) 13 - *(uint32_t*) dst ^= *(uint32_t*)memcpy(&t, src, 4); 21 + uint32_t s4, d4; 22 + for (; n >= 4; n -= 4, src += 4, dst += 4) { 23 + memcpy(&d4, dst, 4); 24 + memcpy(&s4, src, 4); 25 + d4 ^= s4; 26 + memcpy(dst, &d4, 4); 27 + } 14 28 15 29 for (; n --; ++ src, ++ dst) *dst = *src ^ *dst; 16 30 } ··· 40 54 } 41 55 } 42 56 43 - /* The GCM counter. Counts on the last 32 bits, ignoring carry. */ 57 + /* The GCM counter. Counts on the last 32 bits, ignoring carry. 58 + * 59 + * The counter is laid out as a single 16-byte big-endian block: the 60 + * first 12 bytes are the IV (treated as opaque), and the final 4 61 + * bytes are the big-endian block counter that increments. We use 62 + * memcpy to load/store the four 32-bit words to avoid the strict 63 + * aliasing problem of casting uint64_t* to uint32_t*. */ 44 64 static inline void _mc_count_16_be_4 (uint64_t *init, uint64_t *dst, size_t blocks) { 45 65 46 - uint64_t qw1 = init[0]; 47 - uint32_t dw3 = ((uint32_t*) init)[2], 48 - dw4 = be32_to_cpu (((uint32_t*) init)[3]); 66 + uint32_t w[4]; 67 + memcpy(w, init, 16); 68 + uint32_t ctr = be32_to_cpu(w[3]); 49 69 for (; blocks --; dst += 2) { 50 - dst[0] = qw1; 51 - ((uint32_t*) dst)[2] = dw3; 52 - ((uint32_t*) dst)[3] = cpu_to_be32 (dw4 ++); 70 + w[3] = cpu_to_be32(ctr++); 71 + memcpy(dst, w, 16); 53 72 } 54 73 } 55 74

+28 -13

src/c/misc_sse.c

··· 2 2 3 3 #ifdef __mc_ACCELERATE__ 4 4 5 + /* xor_into -- alignment-safe XOR of n bytes from src into dst. 6 + * 7 + * The __m128i path uses _mm_loadu_si128/_mm_storeu_si128 (the 8 + * unaligned variants) so it tolerates any alignment. The 64-bit 9 + * and 32-bit fallback paths use memcpy on both sides; previously 10 + * only the source was memcpy'd, leaving the destination as a raw 11 + * cast which is undefined on architectures that trap unaligned 12 + * accesses. GCC and Clang elide the memcpy on x86. */ 5 13 static inline void xor_into (const uint8_t *src, uint8_t *dst, size_t n) { 6 - /* see issue #70 #81 for alignment considerations (memcpy used below) */ 7 14 #ifdef ARCH_64BIT 8 15 __m128i r; 9 16 for (; n >= 16; n -= 16, src += 16, dst += 16) ··· 13 20 _mm_loadu_si128 ((__m128i*) memcpy(&r, src, 16)), 14 21 _mm_loadu_si128 ((__m128i*) dst))); 15 22 16 - uint64_t s; 17 - for (; n >= 8; n -= 8, src += 8, dst += 8) 18 - *(uint64_t*) dst ^= *(uint64_t*) memcpy(&s, src, 8); 23 + uint64_t s, d; 24 + for (; n >= 8; n -= 8, src += 8, dst += 8) { 25 + memcpy(&d, dst, 8); 26 + memcpy(&s, src, 8); 27 + d ^= s; 28 + memcpy(dst, &d, 8); 29 + } 19 30 #endif 20 31 21 - uint32_t t; 22 - for (; n >= 4; n -= 4, src += 4, dst += 4) 23 - *(uint32_t*) dst ^= *(uint32_t*)memcpy(&t, src, 4); 32 + uint32_t s4, d4; 33 + for (; n >= 4; n -= 4, src += 4, dst += 4) { 34 + memcpy(&d4, dst, 4); 35 + memcpy(&s4, src, 4); 36 + d4 ^= s4; 37 + memcpy(dst, &d4, 4); 38 + } 24 39 25 40 for (; n --; ++ src, ++ dst) *dst = *src ^ *dst; 26 41 } ··· 44 59 /* NEON is always available on ARM64, no feature check needed. */ 45 60 /* ------------------------------------------------------------------ */ 46 61 47 - #if defined(__mc_ARM64CE__) || (defined(__aarch64__) && !defined(__mc_ACCELERATE__)) 62 + #if defined(__mc_ARM64NEON__) || (defined(__aarch64__) && !defined(__mc_ACCELERATE__)) 48 63 49 64 static inline void xor_into_neon (const uint8_t *src, uint8_t *dst, size_t n) { 50 65 for (; n >= 16; n -= 16, src += 16, dst += 16) { ··· 59 74 60 75 CAMLprim value 61 76 mc_xor_into_bytes (value b1, value off1, value b2, value off2, value n) { 62 - #if defined(__mc_ARM64CE__) || (defined(__aarch64__) && !defined(__mc_ACCELERATE__)) 77 + #if defined(__mc_ARM64NEON__) || (defined(__aarch64__) && !defined(__mc_ACCELERATE__)) 63 78 xor_into_neon (_st_uint8_off (b1, off1), _bp_uint8_off (b2, off2), Int_val (n)); 64 79 #else 65 80 _mc_switch_accel(ssse3, 66 81 mc_xor_into_bytes_generic(b1, off1, b2, off2, n), 67 - xor_into (_st_uint8_off (b1, off1), _bp_uint8_off (b2, off2), Int_val (n))) 82 + xor_into (_st_uint8_off (b1, off1), _bp_uint8_off (b2, off2), Int_val (n))); 68 83 #endif 69 84 return Val_unit; 70 85 } ··· 75 90 _mc_switch_accel(ssse3, 76 91 mc_count_16_be_4_generic (ctr, dst, off, blocks), 77 92 _mc_count_16_be_4 ( (uint64_t*) Bp_val (ctr), 78 - (uint64_t*) _bp_uint8_off (dst, off), Long_val (blocks) )) 93 + (uint64_t*) _bp_uint8_off (dst, off), Long_val (blocks) )); 79 94 #else 80 95 mc_count_16_be_4_generic (ctr, dst, off, blocks); 81 96 #endif ··· 84 99 85 100 CAMLprim value mc_misc_mode (__unit ()) { 86 101 value enabled = 0; 87 - #if defined(__mc_ARM64CE__) || (defined(__aarch64__) && !defined(__mc_ACCELERATE__)) 102 + #if defined(__mc_ARM64NEON__) || (defined(__aarch64__) && !defined(__mc_ACCELERATE__)) 88 103 enabled = 1; 89 104 #else 90 105 _mc_switch_accel(ssse3, 91 106 enabled = 0, 92 - enabled = 1) 107 + enabled = 1); 93 108 #endif 94 109 return Val_int (enabled); 95 110 }

Configure Feed

Configure Feed