this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

experiment

+854
+1
.gitignore
··· 1 + _build
+1
README.md
··· 1 + A sha256 experiment
+173
bench/bench_sha256.ml
open Sha256

(* Run [f] once and return its result paired with the number of bytes the
   GC reports as allocated while it ran. *)
let measure_allocations f =
  let bytes_before = Gc.allocated_bytes () in
  let value = f () in
  let bytes_after = Gc.allocated_bytes () in
  (value, bytes_after -. bytes_before)

(* Throughput and per-call allocation of [hash_string] over a range of
   input sizes. Each entry of the table is (input size, iteration count). *)
let bench_sizes () =
  List.iter print_endline [
    "Benchmarking various input sizes:";
    "Size (B) | Iterations | Time (s) | Throughput (MB/s) | Allocations (B)";
    "---------|------------|----------|-------------------|----------------";
  ];

  let cases = [
    (16, 100000);
    (64, 100000);
    (256, 50000);
    (1024, 20000);
    (4096, 5000);
    (16384, 1000);
    (65536, 250);
    (262144, 60);
    (1048576, 15);
  ] in

  let run_case (size, iterations) =
    let payload = String.make size 'x' in

    (* A few untimed calls so the timed loop does not include first-use costs. *)
    for _ = 1 to 10 do
      ignore (hash_string payload)
    done;

    let t0 = Unix.gettimeofday () in
    let (), allocated =
      measure_allocations (fun () ->
        for _ = 1 to iterations do
          ignore (hash_string payload)
        done)
    in
    let seconds = Unix.gettimeofday () -. t0 in

    let total_bytes = float_of_int (size * iterations) in
    let mb_per_sec = total_bytes /. seconds /. 1_000_000.0 in
    let bytes_per_call = allocated /. float_of_int iterations in

    Printf.printf "%8d | %10d | %8.3f | %17.1f | %14.0f\n"
      size iterations seconds mb_per_sec bytes_per_call
  in
  List.iter run_case cases

(* Compare hashing a fixed list of inputs sequentially against hashing it in
   chunks, two chunks at a time through [Parallel.fork_join2]. *)
let bench_parallel_scaling () =
  print_endline "\nParallel scaling benchmark:";
  print_endline "Threads | Hashes | Time (s) | Hashes/sec | Speedup";
  print_endline "--------|--------|----------|------------|--------";

  let num_hashes = 10000 in
  let data_size = 1024 in
  let make_input i =
    Bytes.of_string (String.make data_size (Char.chr (65 + (i mod 26))))
  in
  let inputs = List.init num_hashes make_input in

  (* One table row; hashes/sec is derived from the elapsed time. *)
  let print_row threads seconds speedup =
    Printf.printf "%7d | %6d | %8.3f | %10.0f | %7.2fx\n"
      threads num_hashes seconds (float_of_int num_hashes /. seconds) speedup
  in

  (* Sequential baseline *)
  let seq_t0 = Unix.gettimeofday () in
  let _ = List.map hash_bytes inputs in
  let time_seq = Unix.gettimeofday () -. seq_t0 in
  print_row 1 time_seq 1.0;

  let hash_all chunk = List.map hash_bytes chunk in

  let run_with threads =
    (* Simulate parallel execution with multiple Parallel.fork_join2 calls *)
    let par = Parallel.create () in
    let chunk_size = num_hashes / threads in

    (* Detach up to [k] leading elements; returns (prefix, remainder). *)
    let rec take k items taken =
      match items with
      | x :: tl when k <> 0 -> take (k - 1) tl (x :: taken)
      | _ -> (List.rev taken, items)
    in

    (* Cut [items] into at most [slots] chunks of [chunk_size] elements. *)
    let rec split_into_chunks items slots acc =
      match items with
      | _ :: _ when slots > 0 ->
        let chunk, rest = take chunk_size items [] in
        split_into_chunks rest (slots - 1) (chunk :: acc)
      | _ -> List.rev acc
    in

    (* Hash chunks pairwise: each fork_join2 call handles two chunks. *)
    let rec process_chunks pending acc =
      match pending with
      | [] -> acc
      | [ last ] -> hash_all last :: acc
      | first :: second :: rest ->
        let r1, r2 =
          Parallel.fork_join2 par
            (fun _ -> hash_all first)
            (fun _ -> hash_all second)
        in
        process_chunks rest (r2 :: r1 :: acc)
    in

    (* Splitting is deliberately inside the timed region, as in the
       sequential case only hashing is timed. *)
    let par_t0 = Unix.gettimeofday () in
    let chunks = split_into_chunks inputs threads [] in
    let _ = process_chunks chunks [] in
    let time_par = Unix.gettimeofday () -. par_t0 in

    print_row threads time_par (time_seq /. time_par)
  in
  List.iter run_with [2; 4; 8]

(* Report GC-visible allocation per hash for the bigarray entry point and
   for the string wrapper. *)
let bench_zero_allocation () =
  print_endline "\nZero-allocation verification:";

  let rounds = 1000 in

  (* Average bytes allocated per call of [f] over [rounds] calls, measured
     after a full major collection. *)
  let allocs_per_call f =
    Gc.full_major ();
    let bytes_before = Gc.allocated_bytes () in
    for _ = 1 to rounds do
      f ()
    done;
    let bytes_after = Gc.allocated_bytes () in
    (bytes_after -. bytes_before) /. 1000.0
  in

  (* Input held in a bigarray, filled with a repeating A..Z pattern. *)
  let size = 1024 in
  let buffer =
    Bigarray.Array1.create Bigarray.int8_unsigned Bigarray.c_layout size
  in
  for i = 0 to size - 1 do
    Bigarray.Array1.set buffer i (65 + (i mod 26))
  done;

  let allocs_per_hash =
    allocs_per_call (fun () -> ignore (oneshot buffer (Int64.of_int size)))
  in
  Printf.printf " Direct oneshot (bigarray): %.1f bytes/hash\n" allocs_per_hash;

  (* Same measurement through the string wrapper. *)
  let str = String.make size 'x' in
  let allocs_per_hash_str =
    allocs_per_call (fun () -> ignore (hash_string str))
  in
  Printf.printf " String wrapper: %.1f bytes/hash\n" allocs_per_hash_str;

  print_endline
    (if allocs_per_hash < 100.0 then " ✓ Near-zero allocation achieved!"
     else " ⚠ Higher than expected allocations")

let () =
  print_endline "SHA256 Performance Benchmark Suite";
  print_endline "===================================\n";

  (* Basic facts about the host the numbers were taken on. *)
  print_endline "System Information:";
  Printf.printf " OCaml version: %s\n" Sys.ocaml_version;
  Printf.printf " Word size: %d bits\n" Sys.word_size;
  Printf.printf " OS: %s\n\n" Sys.os_type;

  bench_sizes ();
  bench_parallel_scaling ();
  bench_zero_allocation ()
+4
bench/dune
··· 1 + (executable 2 + (name bench_sha256) 3 + (libraries sha256 unix) 4 + (modes native))
+13
dune-project
··· 1 + (lang dune 3.0) 2 + (name oxsha) 3 + (version 0.1.0) 4 + 5 + (package 6 + (name oxsha) 7 + (synopsis "Blazingly fast SHA256 using AMD SHA-NI instructions") 8 + (description "Hardware-accelerated SHA256 implementation for OxCaml using AMD SHA-NI instructions with zero-allocation design") 9 + (depends 10 + ocaml 11 + (dune (>= 3.0)) 12 + bigarray 13 + parallel))
+9
lib/dune
··· 1 + (library 2 + (name sha256) 3 + (public_name oxsha) 4 + (libraries bigarray parallel) 5 + (foreign_stubs 6 + (language c) 7 + (names sha256_stubs) 8 + (flags :standard -msha -msse4.1 -O3 -march=native)) 9 + (modes native))
+96
lib/sha256.ml
open Bigarray

type state = (int32, int32_elt, c_layout) Array1.t
type digest = (int, int8_unsigned_elt, c_layout) Array1.t
type buffer = (int, int8_unsigned_elt, c_layout) Array1.t

(* External C functions (implemented in sha256_stubs.c) *)
external init : unit -> state = "oxcaml_sha256_init"
external process_block : state -> buffer -> unit = "oxcaml_sha256_process_block" [@@noalloc]
external finalize : state -> buffer -> int64 -> digest = "oxcaml_sha256_finalize"
external oneshot : buffer -> int64 -> digest = "oxcaml_sha256_oneshot"

(* Number of bytes in a SHA-256 digest. *)
let digest_length = 32

(* High-level interface *)

(* Copy [bytes] into a freshly allocated bigarray so it can be handed to the
   C stubs. Shared by [hash_bytes] and [hash_string], which previously each
   carried their own copy loop. *)
let buffer_of_bytes bytes =
  let len = Bytes.length bytes in
  let buffer = Array1.create int8_unsigned c_layout len in
  for i = 0 to len - 1 do
    Array1.set buffer i (Char.code (Bytes.get bytes i))
  done;
  buffer

(* [hash_bytes b] is the SHA-256 digest of the contents of [b]. *)
let hash_bytes bytes =
  oneshot (buffer_of_bytes bytes) (Int64.of_int (Bytes.length bytes))

(* [hash_string s] is the SHA-256 digest of [s].
   [Bytes.unsafe_of_string] is sound here: the bytes view is only read while
   it is copied into the bigarray and never escapes. *)
let hash_string str =
  hash_bytes (Bytes.unsafe_of_string str)

(* Utilities *)

let hex_digits = "0123456789abcdef"

(* [digest_to_hex d] renders the first 32 bytes of [d] as 64 lowercase hex
   characters. Uses a nibble lookup table instead of running a format string
   per byte. Elements of an [int8_unsigned] bigarray are in 0..255, so both
   nibble indices are in range. *)
let digest_to_hex digest =
  let out = Bytes.create (2 * digest_length) in
  for i = 0 to digest_length - 1 do
    let b = Array1.get digest i in
    Bytes.set out (2 * i) hex_digits.[b lsr 4];
    Bytes.set out ((2 * i) + 1) hex_digits.[b land 0x0f]
  done;
  Bytes.unsafe_to_string out

(* [digest_to_bytes d] copies the first 32 bytes of [d] into fresh [bytes]. *)
let digest_to_bytes digest =
  Bytes.init digest_length (fun i -> Char.chr (Array1.get digest i))

(* [digest_equal d1 d2] is [true] iff the first 32 bytes of both digests
   agree. Stops at the first mismatch, so it is not a constant-time
   comparison. *)
let digest_equal d1 d2 =
  let rec same_from i =
    i >= digest_length
    || (Array1.get d1 i = Array1.get d2 i && same_from (i + 1))
  in
  same_from 0

(* Zero-allocation variants using OxCaml features *)

module Fast = struct
  (* Thin inlined wrapper over the [process_block] stub. *)
  let[@inline] [@zero_alloc assume] process_block_local state block =
    process_block state block

  (* Feed [num_blocks] consecutive 64-byte blocks of [blocks] to the
     compression function, starting at offset 0. [Array1.sub] raises
     [Invalid_argument] if a block would run past the end of [blocks].
     NOTE(review): [Array1.sub] builds a new bigarray value for every block,
     so the [zero_alloc assume] annotation silences the checker rather than
     describing this loop; confirm that is intended. *)
  let[@zero_alloc assume] process_blocks state blocks num_blocks =
    for i = 0 to num_blocks - 1 do
      let block = Array1.sub blocks (i * 64) 64 in
      process_block state block
    done

  (* Hash every element of [inputs], preserving order. Lists of two or more
     elements are split once at the midpoint and the two halves are hashed
     through a single [Parallel.fork_join2] call. *)
  let parallel_hash_many par inputs =
    match inputs with
    | [] -> []
    | [x] -> [hash_bytes x]
    | _ ->
      let process_batch batch = List.map hash_bytes batch in
      let mid = List.length inputs / 2 in
      (* [split n lst] is (first [n] elements, remainder). *)
      let rec split n lst =
        if n = 0 then ([], lst)
        else
          match lst with
          | [] -> ([], [])
          | h :: t ->
            let (l1, l2) = split (n - 1) t in
            (h :: l1, l2)
      in
      let (left, right) = split mid inputs in
      let left_results, right_results =
        Parallel.fork_join2 par
          (fun _ -> process_batch left)
          (fun _ -> process_batch right)
      in
      left_results @ right_results
end
+47
lib/sha256.mli
(** Hardware-accelerated SHA-256 built on the x86 SHA-NI instructions. *)

open Bigarray

(** {1 Types} *)

(** Running hash state: a bigarray of eight 32-bit words. *)
type state = (int32, int32_elt, c_layout) Array1.t

(** A finished digest: a bigarray of 32 unsigned bytes. *)
type digest = (int, int8_unsigned_elt, c_layout) Array1.t

(** Raw input bytes handed to the hashing primitives. *)
type buffer = (int, int8_unsigned_elt, c_layout) Array1.t

(** {1 Low-level interface} *)

(** [init ()] returns a fresh state holding the standard SHA-256 initial
    values. *)
val init : unit -> state

(** [process_block st block] runs the compression function on one 512-bit
    (64-byte) block, updating [st] in place. The caller must supply a buffer
    of exactly 64 bytes. *)
val process_block : state -> buffer -> unit

(** [finalize st data len] absorbs the first [len] bytes of [data] into [st]
    (full blocks first, then the padded tail), updating [st] in place, and
    returns the resulting digest. The length field of the padding is computed
    from [len] alone. *)
val finalize : state -> buffer -> int64 -> digest

(** {1 High-level interface} *)

(** [oneshot data len] hashes the first [len] bytes of [data] starting from
    the initial state and returns the digest. *)
val oneshot : buffer -> int64 -> digest

(** [hash_bytes b] is the SHA-256 digest of the contents of [b]. *)
val hash_bytes : bytes -> digest

(** [hash_string s] is the SHA-256 digest of [s]. *)
val hash_string : string -> digest

(** {1 Utilities} *)

(** [digest_to_hex d] is the digest written as a lowercase hexadecimal
    string. *)
val digest_to_hex : digest -> string

(** [digest_to_bytes d] is a fresh 32-byte copy of the digest. *)
val digest_to_bytes : digest -> bytes

(** [digest_equal a b] is [true] iff the two digests hold the same 32
    bytes. *)
val digest_equal : digest -> digest -> bool
+382
lib/sha256_stubs.c
··· 1 + #include <immintrin.h> 2 + #include <stdint.h> 3 + #include <string.h> 4 + #include <caml/mlvalues.h> 5 + #include <caml/memory.h> 6 + #include <caml/alloc.h> 7 + #include <caml/bigarray.h> 8 + 9 + // Aligned storage for round constants 10 + alignas(64) static const uint32_t K256[64] = { 11 + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 12 + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 13 + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 14 + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 15 + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 16 + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 17 + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 18 + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 19 + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 20 + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 21 + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 22 + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 23 + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 24 + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 25 + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 26 + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 27 + }; 28 + 29 + // Initial SHA256 state values 30 + alignas(16) static const uint32_t H256_INIT[8] = { 31 + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 32 + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 33 + }; 34 + 35 + // Byte swap for endianness 36 + static const __m128i BSWAP_MASK = {0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL}; 37 + 38 + // Process a single 512-bit block using SHA-NI instructions 39 + static void sha256_process_block_shani(uint32_t state[8], const uint8_t block[64]) { 40 + __m128i msg0, msg1, msg2, msg3; 41 + __m128i tmp; 42 + __m128i state0, state1; 43 + __m128i msg; 44 + __m128i abef_save, cdgh_save; 45 + 46 + // Load initial state 47 + tmp = _mm_loadu_si128((const __m128i*)&state[0]); 48 + state1 = _mm_loadu_si128((const __m128i*)&state[4]); 49 + 50 + // Swap byte order for initial state 51 + tmp = 
_mm_shuffle_epi32(tmp, 0xB1); // CDAB 52 + state1 = _mm_shuffle_epi32(state1, 0x1B); // EFGH 53 + state0 = _mm_alignr_epi8(tmp, state1, 8); // ABEF 54 + state1 = _mm_blend_epi16(state1, tmp, 0xF0); // CDGH 55 + 56 + // Save initial state 57 + abef_save = state0; 58 + cdgh_save = state1; 59 + 60 + // Load message blocks with byte swap 61 + msg0 = _mm_loadu_si128((const __m128i*)(block + 0)); 62 + msg1 = _mm_loadu_si128((const __m128i*)(block + 16)); 63 + msg2 = _mm_loadu_si128((const __m128i*)(block + 32)); 64 + msg3 = _mm_loadu_si128((const __m128i*)(block + 48)); 65 + 66 + msg0 = _mm_shuffle_epi8(msg0, BSWAP_MASK); 67 + msg1 = _mm_shuffle_epi8(msg1, BSWAP_MASK); 68 + msg2 = _mm_shuffle_epi8(msg2, BSWAP_MASK); 69 + msg3 = _mm_shuffle_epi8(msg3, BSWAP_MASK); 70 + 71 + // Rounds 0-3 72 + msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[0])); 73 + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 74 + msg = _mm_shuffle_epi32(msg, 0x0E); 75 + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 76 + 77 + // Rounds 4-7 78 + msg = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[4])); 79 + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 80 + msg = _mm_shuffle_epi32(msg, 0x0E); 81 + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 82 + msg0 = _mm_sha256msg1_epu32(msg0, msg1); 83 + 84 + // Rounds 8-11 85 + msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[8])); 86 + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 87 + msg = _mm_shuffle_epi32(msg, 0x0E); 88 + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 89 + msg1 = _mm_sha256msg1_epu32(msg1, msg2); 90 + 91 + // Rounds 12-15 92 + msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[12])); 93 + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 94 + tmp = _mm_alignr_epi8(msg3, msg2, 4); 95 + msg0 = _mm_add_epi32(msg0, tmp); 96 + msg0 = _mm_sha256msg2_epu32(msg0, msg3); 97 + msg = _mm_shuffle_epi32(msg, 0x0E); 98 + state0 = _mm_sha256rnds2_epu32(state0, 
state1, msg); 99 + msg2 = _mm_sha256msg1_epu32(msg2, msg3); 100 + 101 + // Rounds 16-19 102 + msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[16])); 103 + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 104 + tmp = _mm_alignr_epi8(msg0, msg3, 4); 105 + msg1 = _mm_add_epi32(msg1, tmp); 106 + msg1 = _mm_sha256msg2_epu32(msg1, msg0); 107 + msg = _mm_shuffle_epi32(msg, 0x0E); 108 + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 109 + msg3 = _mm_sha256msg1_epu32(msg3, msg0); 110 + 111 + // Rounds 20-23 112 + msg = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[20])); 113 + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 114 + tmp = _mm_alignr_epi8(msg1, msg0, 4); 115 + msg2 = _mm_add_epi32(msg2, tmp); 116 + msg2 = _mm_sha256msg2_epu32(msg2, msg1); 117 + msg = _mm_shuffle_epi32(msg, 0x0E); 118 + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 119 + msg0 = _mm_sha256msg1_epu32(msg0, msg1); 120 + 121 + // Rounds 24-27 122 + msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[24])); 123 + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 124 + tmp = _mm_alignr_epi8(msg2, msg1, 4); 125 + msg3 = _mm_add_epi32(msg3, tmp); 126 + msg3 = _mm_sha256msg2_epu32(msg3, msg2); 127 + msg = _mm_shuffle_epi32(msg, 0x0E); 128 + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 129 + msg1 = _mm_sha256msg1_epu32(msg1, msg2); 130 + 131 + // Rounds 28-31 132 + msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[28])); 133 + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 134 + tmp = _mm_alignr_epi8(msg3, msg2, 4); 135 + msg0 = _mm_add_epi32(msg0, tmp); 136 + msg0 = _mm_sha256msg2_epu32(msg0, msg3); 137 + msg = _mm_shuffle_epi32(msg, 0x0E); 138 + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 139 + msg2 = _mm_sha256msg1_epu32(msg2, msg3); 140 + 141 + // Rounds 32-35 142 + msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[32])); 143 + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 144 + tmp = 
_mm_alignr_epi8(msg0, msg3, 4); 145 + msg1 = _mm_add_epi32(msg1, tmp); 146 + msg1 = _mm_sha256msg2_epu32(msg1, msg0); 147 + msg = _mm_shuffle_epi32(msg, 0x0E); 148 + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 149 + msg3 = _mm_sha256msg1_epu32(msg3, msg0); 150 + 151 + // Rounds 36-39 152 + msg = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[36])); 153 + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 154 + tmp = _mm_alignr_epi8(msg1, msg0, 4); 155 + msg2 = _mm_add_epi32(msg2, tmp); 156 + msg2 = _mm_sha256msg2_epu32(msg2, msg1); 157 + msg = _mm_shuffle_epi32(msg, 0x0E); 158 + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 159 + msg0 = _mm_sha256msg1_epu32(msg0, msg1); 160 + 161 + // Rounds 40-43 162 + msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[40])); 163 + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 164 + tmp = _mm_alignr_epi8(msg2, msg1, 4); 165 + msg3 = _mm_add_epi32(msg3, tmp); 166 + msg3 = _mm_sha256msg2_epu32(msg3, msg2); 167 + msg = _mm_shuffle_epi32(msg, 0x0E); 168 + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 169 + msg1 = _mm_sha256msg1_epu32(msg1, msg2); 170 + 171 + // Rounds 44-47 172 + msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[44])); 173 + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 174 + tmp = _mm_alignr_epi8(msg3, msg2, 4); 175 + msg0 = _mm_add_epi32(msg0, tmp); 176 + msg0 = _mm_sha256msg2_epu32(msg0, msg3); 177 + msg = _mm_shuffle_epi32(msg, 0x0E); 178 + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 179 + msg2 = _mm_sha256msg1_epu32(msg2, msg3); 180 + 181 + // Rounds 48-51 182 + msg = _mm_add_epi32(msg0, _mm_load_si128((const __m128i*)&K256[48])); 183 + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 184 + tmp = _mm_alignr_epi8(msg0, msg3, 4); 185 + msg1 = _mm_add_epi32(msg1, tmp); 186 + msg1 = _mm_sha256msg2_epu32(msg1, msg0); 187 + msg = _mm_shuffle_epi32(msg, 0x0E); 188 + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 189 + msg3 = 
_mm_sha256msg1_epu32(msg3, msg0); 190 + 191 + // Rounds 52-55 192 + msg = _mm_add_epi32(msg1, _mm_load_si128((const __m128i*)&K256[52])); 193 + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 194 + tmp = _mm_alignr_epi8(msg1, msg0, 4); 195 + msg2 = _mm_add_epi32(msg2, tmp); 196 + msg2 = _mm_sha256msg2_epu32(msg2, msg1); 197 + msg = _mm_shuffle_epi32(msg, 0x0E); 198 + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 199 + 200 + // Rounds 56-59 201 + msg = _mm_add_epi32(msg2, _mm_load_si128((const __m128i*)&K256[56])); 202 + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 203 + tmp = _mm_alignr_epi8(msg2, msg1, 4); 204 + msg3 = _mm_add_epi32(msg3, tmp); 205 + msg3 = _mm_sha256msg2_epu32(msg3, msg2); 206 + msg = _mm_shuffle_epi32(msg, 0x0E); 207 + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 208 + 209 + // Rounds 60-63 210 + msg = _mm_add_epi32(msg3, _mm_load_si128((const __m128i*)&K256[60])); 211 + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); 212 + msg = _mm_shuffle_epi32(msg, 0x0E); 213 + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); 214 + 215 + // Add initial state 216 + state0 = _mm_add_epi32(state0, abef_save); 217 + state1 = _mm_add_epi32(state1, cdgh_save); 218 + 219 + // Swap byte order back and store 220 + tmp = _mm_shuffle_epi32(state0, 0x1B); // FEBA 221 + state1 = _mm_shuffle_epi32(state1, 0xB1); // DCHG 222 + state0 = _mm_blend_epi16(tmp, state1, 0xF0); // DCBA 223 + state1 = _mm_alignr_epi8(state1, tmp, 8); // HGFE 224 + 225 + _mm_storeu_si128((__m128i*)&state[0], state0); 226 + _mm_storeu_si128((__m128i*)&state[4], state1); 227 + } 228 + 229 + // OCaml interface functions 230 + 231 + // Initialize SHA256 state 232 + value oxcaml_sha256_init(value unit) { 233 + CAMLparam1(unit); 234 + CAMLlocal1(state); 235 + 236 + // Allocate bigarray for state (8 x int32) 237 + long dims[1] = {8}; 238 + state = caml_ba_alloc_dims(CAML_BA_INT32 | CAML_BA_C_LAYOUT, 1, NULL, dims); 239 + uint32_t* s = 
(uint32_t*)Caml_ba_data_val(state); 240 + 241 + // Copy initial values 242 + memcpy(s, H256_INIT, 32); 243 + 244 + CAMLreturn(state); 245 + } 246 + 247 + // Process a single 512-bit block 248 + value oxcaml_sha256_process_block(value state, value block) { 249 + CAMLparam2(state, block); 250 + 251 + uint32_t* s = (uint32_t*)Caml_ba_data_val(state); 252 + uint8_t* b = (uint8_t*)Caml_ba_data_val(block); 253 + 254 + sha256_process_block_shani(s, b); 255 + 256 + CAMLreturn(Val_unit); 257 + } 258 + 259 + // Finalize hash with padding and return digest 260 + value oxcaml_sha256_finalize(value state, value data, value len_v) { 261 + CAMLparam3(state, data, len_v); 262 + CAMLlocal1(result); 263 + 264 + uint32_t* s = (uint32_t*)Caml_ba_data_val(state); 265 + uint8_t* input = (uint8_t*)Caml_ba_data_val(data); 266 + uint64_t len = Int64_val(len_v); 267 + 268 + // Process full blocks 269 + uint64_t full_blocks = len / 64; 270 + for (uint64_t i = 0; i < full_blocks; i++) { 271 + sha256_process_block_shani(s, input + i * 64); 272 + } 273 + 274 + // Handle final block with padding 275 + uint8_t final_block[128] = {0}; // Max 2 blocks for padding 276 + uint64_t remaining = len % 64; 277 + 278 + // Copy remaining bytes 279 + if (remaining > 0) { 280 + memcpy(final_block, input + full_blocks * 64, remaining); 281 + } 282 + 283 + // Add padding 284 + final_block[remaining] = 0x80; 285 + 286 + // Add length in bits at the end 287 + uint64_t bit_len = len * 8; 288 + if (remaining >= 56) { 289 + // Need two blocks 290 + sha256_process_block_shani(s, final_block); 291 + memset(final_block, 0, 64); 292 + } 293 + 294 + // Add bit length (big-endian) 295 + final_block[56] = (bit_len >> 56) & 0xFF; 296 + final_block[57] = (bit_len >> 48) & 0xFF; 297 + final_block[58] = (bit_len >> 40) & 0xFF; 298 + final_block[59] = (bit_len >> 32) & 0xFF; 299 + final_block[60] = (bit_len >> 24) & 0xFF; 300 + final_block[61] = (bit_len >> 16) & 0xFF; 301 + final_block[62] = (bit_len >> 8) & 0xFF; 302 + 
final_block[63] = bit_len & 0xFF; 303 + 304 + sha256_process_block_shani(s, final_block); 305 + 306 + // Create result bigarray (32 bytes) 307 + long dims[1] = {32}; 308 + result = caml_ba_alloc_dims(CAML_BA_UINT8 | CAML_BA_C_LAYOUT, 1, NULL, dims); 309 + uint8_t* res = (uint8_t*)Caml_ba_data_val(result); 310 + 311 + // Convert to big-endian bytes 312 + for (int i = 0; i < 8; i++) { 313 + res[i*4 + 0] = (s[i] >> 24) & 0xFF; 314 + res[i*4 + 1] = (s[i] >> 16) & 0xFF; 315 + res[i*4 + 2] = (s[i] >> 8) & 0xFF; 316 + res[i*4 + 3] = s[i] & 0xFF; 317 + } 318 + 319 + CAMLreturn(result); 320 + } 321 + 322 + // Fast one-shot SHA256 323 + value oxcaml_sha256_oneshot(value data, value len_v) { 324 + CAMLparam2(data, len_v); 325 + CAMLlocal1(result); 326 + 327 + uint8_t* input = (uint8_t*)Caml_ba_data_val(data); 328 + uint64_t len = Int64_val(len_v); 329 + 330 + // Local state 331 + alignas(16) uint32_t state[8]; 332 + memcpy(state, H256_INIT, 32); 333 + 334 + // Process full blocks 335 + uint64_t full_blocks = len / 64; 336 + for (uint64_t i = 0; i < full_blocks; i++) { 337 + sha256_process_block_shani(state, input + i * 64); 338 + } 339 + 340 + // Handle final block with padding 341 + alignas(64) uint8_t final_block[128] = {0}; 342 + uint64_t remaining = len % 64; 343 + 344 + if (remaining > 0) { 345 + memcpy(final_block, input + full_blocks * 64, remaining); 346 + } 347 + 348 + final_block[remaining] = 0x80; 349 + 350 + uint64_t bit_len = len * 8; 351 + if (remaining >= 56) { 352 + sha256_process_block_shani(state, final_block); 353 + memset(final_block, 0, 64); 354 + } 355 + 356 + // Add bit length (big-endian) 357 + final_block[56] = (bit_len >> 56) & 0xFF; 358 + final_block[57] = (bit_len >> 48) & 0xFF; 359 + final_block[58] = (bit_len >> 40) & 0xFF; 360 + final_block[59] = (bit_len >> 32) & 0xFF; 361 + final_block[60] = (bit_len >> 24) & 0xFF; 362 + final_block[61] = (bit_len >> 16) & 0xFF; 363 + final_block[62] = (bit_len >> 8) & 0xFF; 364 + final_block[63] = bit_len & 
0xFF; 365 + 366 + sha256_process_block_shani(state, final_block); 367 + 368 + // Create result bigarray 369 + long dims[1] = {32}; 370 + result = caml_ba_alloc_dims(CAML_BA_UINT8 | CAML_BA_C_LAYOUT, 1, NULL, dims); 371 + uint8_t* res = (uint8_t*)Caml_ba_data_val(result); 372 + 373 + // Convert to big-endian bytes 374 + for (int i = 0; i < 8; i++) { 375 + res[i*4 + 0] = (state[i] >> 24) & 0xFF; 376 + res[i*4 + 1] = (state[i] >> 16) & 0xFF; 377 + res[i*4 + 2] = (state[i] >> 8) & 0xFF; 378 + res[i*4 + 3] = state[i] & 0xFF; 379 + } 380 + 381 + CAMLreturn(result); 382 + }
+4
test/dune
··· 1 + (executable 2 + (name test_sha256) 3 + (libraries sha256 unix) 4 + (modes native))
+124
test/test_sha256.ml
open Sha256

(* Number of checks that failed; drives the process exit code so a broken
   hash cannot pass unnoticed in CI. *)
let failures = ref 0

(* Test vectors from NIST *)
let test_vectors = [
  ("", "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855");
  ("abc", "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad");
  ("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq",
   "248d6a61d20638b8e5c026930c3e6039a33ce45964ff2167f6ecedd419db06c1");
  ("The quick brown fox jumps over the lazy dog",
   "d7a8fbb307d7809469ca9abcb0082e4f8d5651e46d3cdb762d02d0bf37c9e592");
  (String.make 1000000 'a',
   "cdc76e5c9914fb9281a1c7e284d73e67f1809a48a497200e046d39ccc7112cd0");
]

(* Hash every known-answer vector through [hash_string] and compare the hex
   digest with the expected value. *)
let test_basic () =
  print_endline "Testing basic SHA256 functionality...";
  List.iter (fun (input, expected) ->
    let digest = hash_string input in
    let hex = digest_to_hex digest in
    if hex = expected then
      Printf.printf " ✓ Test passed for input length %d\n" (String.length input)
    else begin
      incr failures;
      Printf.printf " ✗ Test FAILED for input: %S\n"
        (if String.length input > 50 then
           String.sub input 0 50 ^ "..."
         else input);
      Printf.printf " Expected: %s\n" expected;
      Printf.printf " Got: %s\n" hex
    end
  ) test_vectors

(* Rough throughput numbers for [hash_string] at several input sizes. *)
let benchmark () =
  print_endline "\nBenchmarking SHA256 performance...";

  (* Test different input sizes *)
  let sizes = [64; 256; 1024; 4096; 16384; 65536; 1048576] in

  List.iter (fun size ->
    let data = String.make size 'x' in
    let start = Unix.gettimeofday () in
    let iterations = if size > 10000 then 1000 else 10000 in

    for _ = 1 to iterations do
      ignore (hash_string data)
    done;

    let elapsed = Unix.gettimeofday () -. start in
    let throughput = (float_of_int (size * iterations)) /. elapsed /. 1_000_000.0 in
    Printf.printf " Size: %7d bytes | Iterations: %6d | Time: %.3fs | Throughput: %.1f MB/s\n"
      size iterations elapsed throughput
  ) sizes

(* Check that the [init] + [finalize] path agrees with the one-shot path on
   the same message. *)
let test_incremental () =
  print_endline "\nTesting incremental hashing...";

  (* Create test data *)
  let data = "The quick brown fox jumps over the lazy dog" in
  let expected = "d7a8fbb307d7809469ca9abcb0082e4f8d5651e46d3cdb762d02d0bf37c9e592" in
  let len = String.length data in

  (* Hash using oneshot *)
  let digest1 = hash_string data in
  let hex1 = digest_to_hex digest1 in

  (* Hash using incremental API *)
  let state = init () in
  let buffer = Bigarray.Array1.create Bigarray.int8_unsigned Bigarray.c_layout len in
  for i = 0 to len - 1 do
    Bigarray.Array1.set buffer i (Char.code data.[i])
  done;

  let digest2 = finalize state buffer (Int64.of_int len) in
  let hex2 = digest_to_hex digest2 in

  if hex1 = expected && hex2 = expected then
    print_endline " ✓ Incremental hashing works correctly"
  else begin
    incr failures;
    print_endline " ✗ Incremental hashing FAILED";
    Printf.printf " Expected: %s\n" expected;
    Printf.printf " Oneshot: %s\n" hex1;
    Printf.printf " Incremental: %s\n" hex2
  end

(* Check that [Fast.parallel_hash_many] returns the same digests, in the same
   order, as a sequential [List.map hash_bytes]. *)
let test_parallel () =
  print_endline "\nTesting parallel hashing...";

  (* Create test data *)
  let num_hashes = 100 in
  let inputs = List.init num_hashes (fun i ->
    Printf.sprintf "Test string number %d with some padding to make it longer" i
    |> Bytes.of_string
  ) in

  (* Sequential hashing *)
  let start_seq = Unix.gettimeofday () in
  let results_seq = List.map hash_bytes inputs in
  let time_seq = Unix.gettimeofday () -. start_seq in

  (* Parallel hashing *)
  let par = Parallel.create () in
  let start_par = Unix.gettimeofday () in
  let results_par = Fast.parallel_hash_many par inputs in
  let time_par = Unix.gettimeofday () -. start_par in

  (* Verify results match. The length check comes first because
     List.for_all2 raises Invalid_argument on lists of different lengths,
     which would abort the run instead of reporting a failure. *)
  let results_match =
    List.length results_seq = List.length results_par
    && List.for_all2 digest_equal results_seq results_par
  in

  if results_match then begin
    Printf.printf " ✓ Parallel hashing produces correct results\n";
    Printf.printf " Sequential: %.3fs\n" time_seq;
    Printf.printf " Parallel: %.3fs\n" time_par;
    Printf.printf " Speedup: %.2fx\n" (time_seq /. time_par)
  end else begin
    incr failures;
    print_endline " ✗ Parallel hashing produced different results!"
  end

let () =
  print_endline "SHA256 Hardware Accelerated Test Suite";
  print_endline "======================================";
  test_basic ();
  test_incremental ();
  test_parallel ();
  benchmark ();
  (* Non-zero exit status when any check above failed. *)
  if !failures > 0 then begin
    Printf.eprintf "%d check(s) failed\n" !failures;
    exit 1
  end