Formatting fixes (dune fmt) · gazagnaire.org/ocaml-crypto@5e96327

+12 -4

src/c/detect_cpu_features.c

··· 22 22 23 23 struct _mc_cpu_features mc_detected_cpu_features = { 0 }; 24 24 25 - #ifdef _MSC_VER 25 + #if defined(_MSC_VER) 26 26 #define bit_PCLMUL ((int)1 << 1) 27 27 #define bit_SSSE3 ((int)1 << 9) 28 28 #define bit_AES ((int)1 << 25) ··· 61 61 return Val_unit; 62 62 } 63 63 64 - #else 64 + #elif defined(__x86_64__) || defined(__i386__) 65 65 66 66 CAMLprim value 67 67 mc_detect_cpu_features (__unit ()) { ··· 89 89 90 90 return Val_unit; 91 91 } 92 - #endif /* _MSC_VER */ 93 92 94 93 #elif defined(__aarch64__) 95 94 ··· 123 122 return Val_unit; 124 123 } 125 124 126 - #else /* __mc_detect_features__ but unknown arch */ 125 + #else /* unknown arch with __mc_detect_features__ */ 126 + 127 + CAMLprim value 128 + mc_detect_cpu_features (__unit ()) { 129 + return Val_unit; 130 + } 131 + 132 + #endif /* _MSC_VER / x86 / aarch64 / other */ 133 + 134 + #else /* !__mc_detect_features__ */ 127 135 128 136 CAMLprim value 129 137 mc_detect_cpu_features (__unit ()) {

+122 -92

src/c/ghash_pclmul.c

··· 191 191 /* ------------------------------------------------------------------ */ 192 192 /* ARM64 PMULL-based GHASH */ 193 193 /* */ 194 - /* Uses vmull_p64 for carry-less multiplication (same algorithm as */ 195 - /* the PCLMULQDQ path, different intrinsics). */ 194 + /* Direct transliteration of the x86 PCLMULQDQ code above, using */ 195 + /* ARM NEON + PMULL intrinsics. Same algorithm: reflected reduction */ 196 + /* with the polynomial 0xc200000000000000. */ 196 197 /* */ 197 - /* Reference: ARM Architecture Reference Manual, FEAT_PMULL. */ 198 - /* Same GF(2^128) reduction as Intel CLMUL white paper. */ 198 + /* Reference: Intel CLMUL white paper (Gueron & Kounavis), adapted */ 199 + /* to ARM PMULL intrinsics. Same pattern as OpenSSL ghashv8-armx.S. */ 199 200 /* ------------------------------------------------------------------ */ 200 201 201 202 #if defined(__mc_ARM64CE__) 202 203 203 204 #include <string.h> 204 205 205 - /* 206 - * Carry-less multiply two 64-bit values, producing a 128-bit result. 207 - */ 208 - static inline poly128_t __pmull (uint64_t a, uint64_t b) { 209 - return vmull_p64 ((poly64_t)a, (poly64_t)b); 206 + /* Use uint8x16_t as the 128-bit register type (matches ARM NEON) */ 207 + typedef uint8x16_t v128; 208 + 209 + #define nxor(a, b) veorq_u8((a), (b)) 210 + 211 + static inline v128 __neon_reverse (v128 x) { 212 + static const uint8_t idx[16] = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}; 213 + return vqtbl1q_u8(x, vld1q_u8(idx)); 210 214 } 211 215 212 - /* Extract high/low 64 bits of a uint8x16_t (treating as 128-bit value) */ 213 - static inline uint64_t __lo64 (uint8x16_t v) { 214 - return vgetq_lane_u64(vreinterpretq_u64_u8(v), 0); 216 + static inline uint64x2_t __to_u64 (v128 x) { return vreinterpretq_u64_u8(x); } 217 + static inline v128 __from_u64 (uint64x2_t x) { return vreinterpretq_u8_u64(x); } 218 + 219 + /* Swap the two 64-bit halves */ 220 + static inline v128 __neon_swap64 (v128 x) { 221 + return __from_u64(vextq_u64(__to_u64(x), __to_u64(x), 1)); 215 222 } 216 - static inline uint64_t __hi64 (uint8x16_t v) { 217 - return vgetq_lane_u64(vreinterpretq_u64_u8(v), 1); 223 + 224 + /* Shift entire 128-bit value left by 1 bit */ 225 + static inline v128 __neon_shiftl1 (v128 x) { 226 + uint64x2_t u = __to_u64(x); 227 + uint64x2_t hi = vshlq_n_u64(u, 1); 228 + uint64x2_t carry = vshrq_n_u64(u, 63); 229 + /* carry from low to high: shift carry left by one 64-bit lane */ 230 + uint64x2_t zero = vdupq_n_u64(0); 231 + uint64x2_t c = vextq_u64(zero, carry, 1); 232 + return __from_u64(vorrq_u64(hi, c)); 218 233 } 219 234 220 - /* Construct uint8x16_t from two uint64_t */ 221 - static inline uint8x16_t __from64 (uint64_t hi, uint64_t lo) { 222 - uint64x2_t v = vcombine_u64(vcreate_u64(lo), vcreate_u64(hi)); 223 - return vreinterpretq_u8_u64(v); 224 - } 235 + /* 236 + * 128-bit carry-less multiply: a * b = (r1:r0) 237 + * Same decomposition as the x86 __clmul_128. 238 + */ 239 + static inline void __neon_clmul_128 (v128 *r1, v128 *r0, v128 a, v128 b) { 240 + uint64x2_t au = __to_u64(a), bu = __to_u64(b); 241 + uint64_t a0 = vgetq_lane_u64(au, 0), a1 = vgetq_lane_u64(au, 1); 242 + uint64_t b0 = vgetq_lane_u64(bu, 0), b1 = vgetq_lane_u64(bu, 1); 243 + 244 + /* w0 = a0*b0, w1 = a1*b1, t = a0*b1 + a1*b0 */ 245 + poly128_t w0p = vmull_p64((poly64_t)a0, (poly64_t)b0); 246 + poly128_t w1p = vmull_p64((poly64_t)a1, (poly64_t)b1); 247 + poly128_t t1p = vmull_p64((poly64_t)a0, (poly64_t)b1); 248 + poly128_t t2p = vmull_p64((poly64_t)a1, (poly64_t)b0); 225 249 226 - static inline uint64_t __p128_lo (poly128_t p) { 227 - return (uint64_t)p; 228 - } 229 - static inline uint64_t __p128_hi (poly128_t p) { 230 - return (uint64_t)(p >> 64); 231 - } 250 + v128 w0 = vreinterpretq_u8_p128(w0p); 251 + v128 w1 = vreinterpretq_u8_p128(w1p); 252 + v128 t = nxor(vreinterpretq_u8_p128(t1p), vreinterpretq_u8_p128(t2p)); 253 + 254 + /* t_lo goes to r0_hi, t_hi goes to r1_lo */ 255 + uint64x2_t tu = __to_u64(t); 256 + uint64_t tlo = vgetq_lane_u64(tu, 0), thi = vgetq_lane_u64(tu, 1); 257 + v128 t_shift_left = __from_u64(vcombine_u64(vcreate_u64(0), vcreate_u64(tlo))); 258 + v128 t_shift_right = __from_u64(vcombine_u64(vcreate_u64(thi), vcreate_u64(0))); 232 259 233 - /* Byte-reverse a 128-bit value (GHASH uses reflected bit order) */ 234 - static inline uint8x16_t __bswap128 (uint8x16_t v) { 235 - static const uint8_t rev_idx[16] = { 236 - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 237 - }; 238 - return vqtbl1q_u8(v, vld1q_u8(rev_idx)); 260 + *r0 = nxor(w0, t_shift_left); 261 + *r1 = nxor(w1, t_shift_right); 239 262 } 240 263 241 264 /* 242 - * GF(2^128) multiply with reflected reduction (x^128 + x^7 + x^2 + x + 1). 243 - * 244 - * Reflected representation means the polynomial is bit-reversed compared to 245 - * the standard representation. The reduction constant is 0xc200000000000000 246 - * (same as the PCLMULQDQ path). 265 + * Reflected reduction: same algorithm as the x86 __reduce_g. 266 + * Uses PMULL with the reduction polynomial 0xc200000000000000. 247 267 */ 248 - static inline uint8x16_t __arm_gfmul (uint8x16_t H, uint8x16_t X) { 249 - uint64_t h0 = __lo64(H), h1 = __hi64(H); 250 - uint64_t x0 = __lo64(X), x1 = __hi64(X); 268 + static inline v128 __neon_reduce_g (v128 w1, v128 w0) { 269 + /* Shift 256-bit product left by 1 (reflected representation) */ 270 + uint64x2_t w0u = __to_u64(w0); 271 + uint64_t w0_hi_bit = vgetq_lane_u64(vshrq_n_u64(w0u, 63), 1); 272 + w0 = __neon_shiftl1(w0); 273 + w1 = __from_u64(vorrq_u64(__to_u64(__neon_shiftl1(w1)), 274 + vcombine_u64(vcreate_u64(w0_hi_bit), vcreate_u64(0)))); 251 275 252 - /* Schoolbook multiplication: H * X = (h1:h0) * (x1:x0) 253 - * Product = (z3:z2:z1:z0) 254 - * z0 = h0 * x0 (low) 255 - * z3 = h1 * x1 (high) 256 - * z1,z2 = h0*x1 + h1*x0 (cross terms) */ 257 - poly128_t p_lo = __pmull(h0, x0); 258 - poly128_t p_hi = __pmull(h1, x1); 259 - poly128_t p_m1 = __pmull(h0, x1); 260 - poly128_t p_m2 = __pmull(h1, x0); 276 + /* Two-phase reduction with polynomial 0xc200000000000000 */ 277 + uint64_t w0_lo = vgetq_lane_u64(__to_u64(w0), 0); 278 + poly128_t rp = vmull_p64((poly64_t)w0_lo, (poly64_t)0xc200000000000000ULL); 279 + w0 = nxor(__neon_swap64(w0), vreinterpretq_u8_p128(rp)); 261 280 262 - uint64_t z0 = __p128_lo(p_lo); 263 - uint64_t z1 = __p128_hi(p_lo) ^ __p128_lo(p_m1) ^ __p128_lo(p_m2); 264 - uint64_t z2 = __p128_lo(p_hi) ^ __p128_hi(p_m1) ^ __p128_hi(p_m2); 265 - uint64_t z3 = __p128_hi(p_hi); 281 + uint64_t w0_lo2 = vgetq_lane_u64(__to_u64(w0), 0); 282 + poly128_t rp2 = vmull_p64((poly64_t)w0_lo2, (poly64_t)0xc200000000000000ULL); 283 + w0 = nxor(__neon_swap64(w0), vreinterpretq_u8_p128(rp2)); 266 284 267 - /* Reflected reduction modulo x^128 + x^7 + x^2 + x + 1. 268 - * Reduce (z3:z2:z1:z0) to 128 bits. */ 269 - uint64_t v0, v1; 270 - poly128_t r0 = __pmull(z0, 0xc200000000000000ULL); 271 - v0 = z1 ^ __p128_hi(r0); 272 - v1 = z0 ^ __p128_lo(r0); 285 + return nxor(w1, w0); 286 + } 273 287 274 - poly128_t r1 = __pmull(v1, 0xc200000000000000ULL); 275 - v0 = z2 ^ v0 ^ __p128_hi(r1); 276 - v1 = z3 ^ v1 ^ __p128_lo(r1); 288 + static inline v128 __neon_gfmul (v128 a, v128 b) { 289 + v128 w1, w0; 290 + __neon_clmul_128(&w1, &w0, a, b); 291 + return __neon_reduce_g(w1, w0); 292 + } 277 293 278 - return __from64(v1, v0); 294 + /* Same __repr_xform as x86 reflected path: byte-reverse */ 295 + #define __neon_repr_xform __neon_reverse 296 + 297 + static inline v128 __neon_load_xform (const uint8_t *p) { 298 + return __neon_repr_xform(vld1q_u8(p)); 279 299 } 280 300 281 301 #define __arm_keys 1 282 302 283 - static inline void __arm_derive (const uint8_t *key, uint8x16_t *m) { 284 - m[0] = __bswap128(vld1q_u8(key)); 303 + static inline void __arm_derive (const uint8_t *key, v128 *m) { 304 + /* H = GHASH subkey, compute H^1 in reflected representation */ 305 + v128 k = __neon_load_xform(key); 306 + /* acc starts at x^1 in reflected = 0x80...0 reversed */ 307 + uint8_t init[16] = {0}; 308 + init[0] = 0x80; 309 + v128 acc = __neon_repr_xform(vld1q_u8(init)); 310 + m[0] = __neon_gfmul(acc, k); 285 311 } 286 312 287 - static inline void __arm_ghash (uint8x16_t *m, uint8x16_t *hash, 313 + static inline void __arm_ghash (v128 *m, v128 *hash, 288 314 const uint8_t *src, size_t len) { 289 - uint8x16_t acc = __bswap128(*hash); 290 - uint8x16_t H = m[0]; 315 + v128 k = m[0]; 316 + v128 acc = __neon_load_xform((const uint8_t *)hash); 291 317 292 318 while (len >= 16) { 293 - uint8x16_t block = __bswap128(vld1q_u8(src)); 294 - acc = __arm_gfmul(H, veorq_u8(acc, block)); 319 + acc = __neon_gfmul(k, nxor(acc, __neon_load_xform(src))); 295 320 src += 16; 296 321 len -= 16; 297 322 } 298 323 if (len > 0) { 299 - uint8_t tmp[16] = { 0 }; 324 + uint8_t tmp[16] = {0}; 300 325 memcpy(tmp, src, len); 301 - uint8x16_t block = __bswap128(vld1q_u8(tmp)); 302 - acc = __arm_gfmul(H, veorq_u8(acc, block)); 326 + acc = __neon_gfmul(k, nxor(acc, __neon_load_xform(tmp))); 303 327 } 304 - *hash = __bswap128(acc); 328 + vst1q_u8((uint8_t *)hash, __neon_repr_xform(acc)); 305 329 } 306 330 307 331 #endif /* __mc_ARM64CE__ */ ··· 312 336 313 337 CAMLprim value mc_ghash_key_size (__unit ()) { 314 338 value s; 315 - #if defined(__mc_ARM64CE__) 339 + #if defined(__mc_ACCELERATE__) 340 + _mc_switch_accel(pclmul, 341 + s = mc_ghash_key_size_generic(Val_unit), 342 + s = Val_int (__keys * 16)) 343 + #elif defined(__mc_ARM64CE__) 316 344 _mc_switch_accel(arm_pmull, 317 345 s = mc_ghash_key_size_generic(Val_unit), 318 346 s = Val_int (__arm_keys * 16)) 319 347 #else 320 - _mc_switch_accel(pclmul, 321 - s = mc_ghash_key_size_generic(Val_unit), 322 - s = Val_int (__keys * 16)) 348 + s = mc_ghash_key_size_generic(Val_unit); 323 349 #endif 324 350 return s; 325 351 } 326 352 327 353 CAMLprim value mc_ghash_init_key (value key, value m) { 328 - #if defined(__mc_ARM64CE__) 354 + #if defined(__mc_ACCELERATE__) 355 + _mc_switch_accel(pclmul, 356 + mc_ghash_init_key_generic(key, m), 357 + __derive ((__m128i *) _st_uint8 (key), (__m128i *) Bp_val (m))) 358 + #elif defined(__mc_ARM64CE__) 329 359 _mc_switch_accel(arm_pmull, 330 360 mc_ghash_init_key_generic(key, m), 331 - __arm_derive (_st_uint8 (key), (uint8x16_t *) Bp_val (m))) 361 + __arm_derive (_st_uint8 (key), (v128 *) Bp_val (m))) 332 362 #else 333 - _mc_switch_accel(pclmul, 334 - mc_ghash_init_key_generic(key, m), 335 - __derive ((__m128i *) _st_uint8 (key), (__m128i *) Bp_val (m))) 363 + mc_ghash_init_key_generic(key, m); 336 364 #endif 337 365 return Val_unit; 338 366 } 339 367 340 368 CAMLprim value 341 369 mc_ghash (value k, value hash, value src, value off, value len) { 342 - #if defined(__mc_ARM64CE__) 343 - _mc_switch_accel(arm_pmull, 344 - mc_ghash_generic(k, hash, src, off, len), 345 - __arm_ghash ( (uint8x16_t *) Bp_val (k), (uint8x16_t *) Bp_val (hash), 346 - _st_uint8_off (src, off), Int_val (len) )) 347 - #else 370 + #if defined(__mc_ACCELERATE__) 348 371 _mc_switch_accel(pclmul, 349 372 mc_ghash_generic(k, hash, src, off, len), 350 373 __ghash ( (__m128i *) Bp_val (k), (__m128i *) Bp_val (hash), 351 374 (__m128i *) _st_uint8_off (src, off), Int_val (len) )) 375 + #elif defined(__mc_ARM64CE__) 376 + _mc_switch_accel(arm_pmull, 377 + mc_ghash_generic(k, hash, src, off, len), 378 + __arm_ghash ( (v128 *) Bp_val (k), (v128 *) Bp_val (hash), 379 + _st_uint8_off (src, off), Int_val (len) )) 380 + #else 381 + mc_ghash_generic(k, hash, src, off, len); 352 382 #endif 353 383 return Val_unit; 354 384 } 355 385 356 386 CAMLprim value mc_ghash_mode (__unit ()) { 357 387 value enabled = 0; 358 - #if defined(__mc_ARM64CE__) 359 - _mc_switch_accel(arm_pmull, 388 + #if defined(__mc_ACCELERATE__) 389 + _mc_switch_accel(pclmul, 360 390 enabled = 0, 361 391 enabled = 1) 362 - #else 363 - _mc_switch_accel(pclmul, 392 + #elif defined(__mc_ARM64CE__) 393 + _mc_switch_accel(arm_pmull, 364 394 enabled = 0, 365 395 enabled = 1) 366 396 #endif

+38 -10

src/c/misc_sse.c

··· 39 39 40 40 #endif /* __mc_ACCELERATE__ */ 41 41 42 + /* ------------------------------------------------------------------ */ 43 + /* ARM64 NEON XOR and counters. */ 44 + /* NEON is always available on ARM64, no feature check needed. */ 45 + /* ------------------------------------------------------------------ */ 46 + 47 + #if defined(__mc_ARM64CE__) || (defined(__aarch64__) && !defined(__mc_ACCELERATE__)) 48 + 49 + static inline void xor_into_neon (const uint8_t *src, uint8_t *dst, size_t n) { 50 + for (; n >= 16; n -= 16, src += 16, dst += 16) { 51 + uint8x16_t s = vld1q_u8(src); 52 + uint8x16_t d = vld1q_u8(dst); 53 + vst1q_u8(dst, veorq_u8(s, d)); 54 + } 55 + for (; n--; ++src, ++dst) *dst = *src ^ *dst; 56 + } 57 + 58 + #endif /* aarch64 */ 59 + 42 60 CAMLprim value 43 61 mc_xor_into_bytes (value b1, value off1, value b2, value off2, value n) { 62 + #if defined(__mc_ARM64CE__) || (defined(__aarch64__) && !defined(__mc_ACCELERATE__)) 63 + xor_into_neon (_st_uint8_off (b1, off1), _bp_uint8_off (b2, off2), Int_val (n)); 64 + #else 44 65 _mc_switch_accel(ssse3, 45 66 mc_xor_into_bytes_generic(b1, off1, b2, off2, n), 46 67 xor_into (_st_uint8_off (b1, off1), _bp_uint8_off (b2, off2), Int_val (n))) 68 + #endif 47 69 return Val_unit; 48 70 } 49 71 50 - #define __export_counter(name, f) \ 51 - CAMLprim value name (value ctr, value dst, value off, value blocks) { \ 52 - _mc_switch_accel(ssse3, \ 53 - name##_generic (ctr, dst, off, blocks), \ 54 - f ( (uint64_t*) Bp_val (ctr), \ 55 - (uint64_t*) _bp_uint8_off (dst, off), Long_val (blocks) )) \ 56 - return Val_unit; \ 57 - } 58 - 59 - __export_counter(mc_count_16_be_4, _mc_count_16_be_4) 72 + CAMLprim value 73 + mc_count_16_be_4 (value ctr, value dst, value off, value blocks) { 74 + #if defined(__mc_ACCELERATE__) 75 + _mc_switch_accel(ssse3, 76 + mc_count_16_be_4_generic (ctr, dst, off, blocks), 77 + _mc_count_16_be_4 ( (uint64_t*) Bp_val (ctr), 78 + (uint64_t*) _bp_uint8_off (dst, off), Long_val (blocks) )) 79 + #else 80 + mc_count_16_be_4_generic (ctr, dst, off, blocks); 81 + #endif 82 + return Val_unit; 83 + } 60 84 61 85 CAMLprim value mc_misc_mode (__unit ()) { 62 86 value enabled = 0; 87 + #if defined(__mc_ARM64CE__) || (defined(__aarch64__) && !defined(__mc_ACCELERATE__)) 88 + enabled = 1; 89 + #else 63 90 _mc_switch_accel(ssse3, 64 91 enabled = 0, 65 92 enabled = 1) 93 + #endif 66 94 return Val_int (enabled); 67 95 }

Configure Feed

Configure Feed