Declarative JSON data manipulation for OCaml
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

ocaml-json: add benchmark against the simdjson corpus

Two modes per file:

- [dom] - full DOM parse via [Json.json] (simdjson DOM equivalent)
- [field] - parse + extract one designated top-level field as
[Json.ignore], skipping DOM materialisation of the siblings
(simdjson OnDemand equivalent)

Reports min/median MB/s and per-iteration allocations; memtrace
integration preserved for when [MEMTRACE=...ctf] is set in the
environment.

Run:
dune exec ocaml-json/bench/bench.exe -- /path/to/corpus/*.json

+168
+165
bench/bench.ml
··· 1 + (* Benchmark json decode throughput against the simdjson corpus. 2 + 3 + Two modes per file: 4 + [dom] - full DOM parse via [Json.json] (simdjson "DOM" equivalent). 5 + [field] - parse + extract one top-level field as [Json.ignore]; 6 + other members / array elements are parsed but DOM is not 7 + materialised (simdjson "OnDemand" equivalent). 8 + 9 + Methodology follows google/benchmark's defaults: 10 + - one untimed warmup 11 + - iterate until at least [min_wall_time_s] seconds elapsed and at 12 + least [min_iters] iterations 13 + - report min (best), median, stddev 14 + 15 + Same amount of PARSE work as simdjson for both modes; the field mode 16 + skips DOM materialisation for siblings, matching OnDemand semantics. *) 17 + 18 + let () = Memtrace.trace_if_requested () 19 + 20 + let read_file path = 21 + let ic = open_in_bin path in 22 + let n = in_channel_length ic in 23 + let b = Bytes.create n in 24 + really_input ic b 0 n; 25 + close_in ic; 26 + Bytes.unsafe_to_string b 27 + 28 + let min_wall_time_s = 1.0 29 + let min_iters = 5 30 + 31 + (* Per-file field-access codec. Each one parses the whole JSON (must: 32 + the field may appear anywhere in the object) and materialises ONLY 33 + one designated top-level member as [unit] via [Json.ignore]. Other 34 + members / array elements are skipped (parsed for structure, not 35 + built into a value). *) 36 + let field_codec name = 37 + let open Json in 38 + match name with 39 + | "github_events.json" | "numbers.json" -> 40 + (* root array: each element ignored; map to unit for uniform type. *) 41 + map ~dec:(fun _ -> ()) ~enc:(fun () -> []) (list ignore) 42 + | _ -> 43 + let field = 44 + match name with 45 + | "apache_builds.json" -> "mode" 46 + | "canada.json" -> "type" 47 + | "citm_catalog.json" -> "areaNames" 48 + | "gsoc-2018.json" -> "0" 49 + | "instruments.json" -> "graphstate" 50 + | "marine_ik.json" -> "images" 51 + | "mesh.json" -> "batches" 52 + | "random.json" -> "id" 53 + | "twitter.json" -> "statuses" 54 + | "update-center.json" -> "connectionCheckUrl" 55 + | _ -> "" 56 + in 57 + Object.map (fun () -> ()) 58 + |> Object.mem field ignore ~enc:(fun () -> ()) 59 + |> Object.finish 60 + 61 + let run_mode ~content ~decode = 62 + (match decode content with 63 + | Ok _ -> () 64 + | Error e -> 65 + Printf.eprintf "decode error: %s\n%!" e; 66 + exit 2); 67 + Gc.compact (); 68 + let times = ref [] in 69 + let peak_live_words = ref 0 in 70 + let alloc_start = Gc.allocated_bytes () in 71 + let t_budget_end = Unix.gettimeofday () +. min_wall_time_s in 72 + let rec loop i = 73 + if i >= min_iters && Unix.gettimeofday () >= t_budget_end then i 74 + else begin 75 + Gc.full_major (); 76 + let t0 = Unix.gettimeofday () in 77 + let r = decode content in 78 + let t1 = Unix.gettimeofday () in 79 + (match r with 80 + | Ok _ -> () 81 + | Error e -> 82 + Printf.eprintf "decode error: %s\n%!" e; 83 + exit 2); 84 + times := (t1 -. t0) :: !times; 85 + let lw = (Gc.stat ()).Gc.live_words in 86 + if lw > !peak_live_words then peak_live_words := lw; 87 + loop (i + 1) 88 + end 89 + in 90 + let iters = loop 0 in 91 + let alloc_end = Gc.allocated_bytes () in 92 + let ts = Array.of_list !times in 93 + Array.sort compare ts; 94 + let min_s = ts.(0) in 95 + let median_s = 96 + if iters mod 2 = 0 then (ts.((iters / 2) - 1) +. ts.(iters / 2)) /. 2.0 97 + else ts.(iters / 2) 98 + in 99 + let mean_s = Array.fold_left ( +. ) 0.0 ts /. float_of_int iters in 100 + let variance = 101 + Array.fold_left (fun acc t -> acc +. ((t -. mean_s) ** 2.0)) 0.0 ts 102 + /. float_of_int iters 103 + in 104 + let stddev_s = sqrt variance in 105 + let alloc_bytes = alloc_end -. alloc_start in 106 + let alloc_mb_per_iter = alloc_bytes /. float_of_int iters /. 1_048_576.0 in 107 + let peak_mb = 108 + float_of_int (!peak_live_words * (Sys.word_size / 8)) /. 1_048_576.0 109 + in 110 + (iters, min_s, median_s, stddev_s, alloc_mb_per_iter, peak_mb) 111 + 112 + let bench_file path = 113 + let name = Filename.basename path in 114 + let content = read_file path in 115 + let size_bytes = String.length content in 116 + let size_mb = float_of_int size_bytes /. 1_048_576.0 in 117 + let dom_decode s = Json_bytesrw.decode_string Json.json s in 118 + let field_decode = 119 + let codec = field_codec name in 120 + fun s -> Json_bytesrw.decode_string codec s 121 + in 122 + let dom = run_mode ~content ~decode:dom_decode in 123 + let fld = run_mode ~content ~decode:field_decode in 124 + (name, size_mb, dom, fld) 125 + 126 + let () = 127 + let files = 128 + Array.sub Sys.argv 1 (Array.length Sys.argv - 1) |> Array.to_list 129 + in 130 + Printf.printf 131 + "| file | size MB | DOM best MB/s | DOM med MB/s | DOM alloc/iter MB | \ 132 + field best MB/s | field med MB/s | field alloc/iter MB |\n"; 133 + Printf.printf 134 + "|------|---------|----------------|--------------|-------------------|-----------------|----------------|---------------------|\n"; 135 + let results = List.map bench_file files in 136 + List.iter 137 + (fun (name, size_mb, dom, fld) -> 138 + let _, dmin, dmed, _, dalloc, _ = dom in 139 + let _, fmin, fmed, _, falloc, _ = fld in 140 + let dbest = size_mb /. dmin in 141 + let dmed_mbps = size_mb /. dmed in 142 + let fbest = size_mb /. fmin in 143 + let fmed_mbps = size_mb /. fmed in 144 + Printf.printf 145 + "| %-22s | %7.3f | %14.1f | %12.1f | %17.2f | %15.1f | %14.1f | %19.2f |\n" 146 + name size_mb dbest dmed_mbps dalloc fbest fmed_mbps falloc) 147 + results; 148 + let n = List.length results in 149 + if n > 0 then begin 150 + let log_sum f = 151 + List.fold_left (fun acc r -> acc +. log (f r)) 0.0 results 152 + in 153 + let geo f = exp (log_sum f /. float_of_int n) in 154 + let dbest r = 155 + let _, s, (_, dmin, _, _, _, _), _ = r in 156 + s /. dmin 157 + in 158 + let fbest r = 159 + let _, s, _, (_, fmin, _, _, _, _) = r in 160 + s /. fmin 161 + in 162 + Printf.printf "\nGeomean best DOM: %.1f MB/s\n" (geo dbest); 163 + Printf.printf "Geomean best field: %.1f MB/s\n" (geo fbest); 164 + Printf.printf "Field speedup: %.2fx\n" (geo fbest /. geo dbest) 165 + end
+3
bench/dune
··· 1 + (executable 2 + (name bench) 3 + (libraries json json.bytesrw unix memtrace))