Declarative JSON data manipulation for OCaml
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

ocaml-json: add tape.ml{i} — simdjson-compatible tape format

A flat 64-bit-word representation of a JSON document: one byte of type
tag in the high byte, 56 bits of payload in the low bytes, with a
side string buffer referenced by offset. Layout matches simdjson's
On-Demand tape on x86_64 and arm64 (little-endian 64-bit words).

API: [of_value] builds from Value.t; [to_value] reconstructs;
[tag_at], [payload_at], [string_at] navigate; [to_bytes]/[of_bytes]
serialize using the same LE layout simdjson uses in memory.

This is not the SIMD fast path (no parser directly from bytes yet);
it's the representation. Use case: interop with simdjson-produced
tapes and compact on-disk storage of parsed structure.

+369
+260
lib/tape.ml
··· 1 + (* Simdjson-compatible tape. See tape.mli. *) 2 + 3 + type tag = 4 + | Root 5 + | Null 6 + | True 7 + | False 8 + | Int64 9 + | Uint64 10 + | Double 11 + | String 12 + | Start_array 13 + | End_array 14 + | Start_object 15 + | End_object 16 + 17 + let tag_char = function 18 + | Root -> 'r' 19 + | Null -> 'n' 20 + | True -> 't' 21 + | False -> 'f' 22 + | Int64 -> 'l' 23 + | Uint64 -> 'u' 24 + | Double -> 'd' 25 + | String -> 's' 26 + | Start_array -> '[' 27 + | End_array -> ']' 28 + | Start_object -> '{' 29 + | End_object -> '}' 30 + 31 + let tag_of_char = function 32 + | 'r' -> Some Root 33 + | 'n' -> Some Null 34 + | 't' -> Some True 35 + | 'f' -> Some False 36 + | 'l' -> Some Int64 37 + | 'u' -> Some Uint64 38 + | 'd' -> Some Double 39 + | 's' -> Some String 40 + | '[' -> Some Start_array 41 + | ']' -> Some End_array 42 + | '{' -> Some Start_object 43 + | '}' -> Some End_object 44 + | _ -> None 45 + 46 + let pp_tag ppf t = Fmt.char ppf (tag_char t) 47 + 48 + type t = { words : bytes; strings : bytes } 49 + 50 + let length t = Bytes.length t.words / 8 51 + let word_byte_offset i = i * 8 52 + let get_word t i = Bytes.get_int64_le t.words (word_byte_offset i) 53 + let mask56 = 0x00FFFFFFFFFFFFFFL 54 + 55 + let pack tag payload = 56 + let payload = Int64.logand payload mask56 in 57 + let tag_byte = Int64.of_int (Char.code (tag_char tag)) in 58 + Int64.logor payload (Int64.shift_left tag_byte 56) 59 + 60 + let unpack w = 61 + let tag_byte = Int64.to_int (Int64.shift_right_logical w 56) in 62 + let tag = tag_of_char (Char.chr (tag_byte land 0xff)) in 63 + let payload = Int64.logand w mask56 in 64 + (tag, payload) 65 + 66 + let tag_at t i = 67 + if i < 0 || i >= length t then 68 + Fmt.invalid_arg "Tape.tag_at: index %d out of bounds (length %d)" i 69 + (length t) 70 + else 71 + match fst (unpack (get_word t i)) with 72 + | Some tg -> tg 73 + | None -> Fmt.invalid_arg "Tape.tag_at: malformed tag at word %d" i 74 + 75 + let payload_at t i = 76 + if i < 0 || i >= length t then 77 + Fmt.invalid_arg "Tape.payload_at: index %d out of bounds" i 78 + else snd (unpack (get_word t i)) 79 + 80 + let string_at t i = 81 + match tag_at t i with 82 + | String -> 83 + let off = Int64.to_int (payload_at t i) in 84 + if off < 0 || off + 4 > Bytes.length t.strings then 85 + Fmt.invalid_arg "Tape.string_at: offset %d out of string buffer" off 86 + else 87 + let len = Int32.to_int (Bytes.get_int32_le t.strings off) in 88 + if off + 4 + len > Bytes.length t.strings then 89 + Fmt.invalid_arg "Tape.string_at: malformed length at offset %d" off 90 + else Bytes.sub_string t.strings (off + 4) len 91 + | _ -> Fmt.invalid_arg "Tape.string_at: word %d is not a String" i 92 + 93 + (* Builder *) 94 + 95 + module Build = struct 96 + type b = { words : Buffer.t; strings : Buffer.t } 97 + 98 + let create () = { words = Buffer.create 128; strings = Buffer.create 128 } 99 + let word_count b = Buffer.length b.words / 8 100 + 101 + let emit b tag payload = 102 + let bytes = Bytes.create 8 in 103 + Bytes.set_int64_le bytes 0 (pack tag payload); 104 + Buffer.add_bytes b.words bytes 105 + 106 + let patch_word b i tag payload = 107 + let off = i * 8 in 108 + let s = Buffer.to_bytes b.words in 109 + Bytes.set_int64_le s off (pack tag payload); 110 + Buffer.clear b.words; 111 + Buffer.add_bytes b.words s 112 + 113 + let add_string b s = 114 + let off = Buffer.length b.strings in 115 + let len = String.length s in 116 + let len_bytes = Bytes.create 4 in 117 + Bytes.set_int32_le len_bytes 0 (Int32.of_int len); 118 + Buffer.add_bytes b.strings len_bytes; 119 + Buffer.add_string b.strings s; 120 + off 121 + 122 + let finish b = 123 + let words = Buffer.to_bytes b.words in 124 + let strings = Buffer.to_bytes b.strings in 125 + (words, strings) 126 + end 127 + 128 + let of_builder b = 129 + let words, strings = Build.finish b in 130 + { words; strings } 131 + 132 + let of_value v = 133 + let b = Build.create () in 134 + let root_open = Build.word_count b in 135 + Build.emit b Root 0L; 136 + let rec walk = function 137 + | Value.Null _ -> Build.emit b Null 0L 138 + | Value.Bool (true, _) -> Build.emit b True 0L 139 + | Value.Bool (false, _) -> Build.emit b False 0L 140 + | Value.Number (n, _) -> 141 + if 142 + Float.is_finite n 143 + && Float.rem n 1.0 = 0.0 144 + && n >= Int64.to_float Int64.min_int 145 + && n <= Int64.to_float Int64.max_int 146 + then Build.emit b Int64 (Int64.of_float n) 147 + else Build.emit b Double (Int64.bits_of_float n) 148 + | Value.String (s, _) -> 149 + let off = Build.add_string b s in 150 + Build.emit b String (Int64.of_int off) 151 + | Value.Array (items, _) -> 152 + let start_idx = Build.word_count b in 153 + Build.emit b Start_array 0L; 154 + List.iter walk items; 155 + let end_idx = Build.word_count b in 156 + Build.emit b End_array (Int64.of_int start_idx); 157 + Build.patch_word b start_idx Start_array (Int64.of_int end_idx) 158 + | Value.Object (mems, _) -> 159 + let start_idx = Build.word_count b in 160 + Build.emit b Start_object 0L; 161 + List.iter 162 + (fun ((name, _), v) -> 163 + let off = Build.add_string b name in 164 + Build.emit b String (Int64.of_int off); 165 + walk v) 166 + mems; 167 + let end_idx = Build.word_count b in 168 + Build.emit b End_object (Int64.of_int start_idx); 169 + Build.patch_word b start_idx Start_object (Int64.of_int end_idx) 170 + in 171 + walk v; 172 + let root_close = Build.word_count b in 173 + Build.emit b Root (Int64.of_int root_open); 174 + Build.patch_word b root_open Root (Int64.of_int root_close); 175 + of_builder b 176 + 177 + let to_value t = 178 + let meta = Value.Meta.none in 179 + let rec walk i = 180 + match tag_at t i with 181 + | Null -> (Value.Null ((), meta), i + 1) 182 + | True -> (Value.Bool (true, meta), i + 1) 183 + | False -> (Value.Bool (false, meta), i + 1) 184 + | Int64 -> (Value.Number (Int64.to_float (payload_at t i), meta), i + 1) 185 + | Uint64 -> (Value.Number (Int64.to_float (payload_at t i), meta), i + 1) 186 + | Double -> 187 + (Value.Number (Int64.float_of_bits (payload_at t i), meta), i + 1) 188 + | String -> (Value.String (string_at t i, meta), i + 1) 189 + | Start_array -> 190 + let end_idx = Int64.to_int (payload_at t i) in 191 + let rec items j acc = 192 + if j >= end_idx then (List.rev acc, j + 1) 193 + else 194 + let v, j' = walk j in 195 + items j' (v :: acc) 196 + in 197 + let xs, next = items (i + 1) [] in 198 + (Value.Array (xs, meta), next) 199 + | Start_object -> 200 + let end_idx = Int64.to_int (payload_at t i) in 201 + let rec mems j acc = 202 + if j >= end_idx then (List.rev acc, j + 1) 203 + else 204 + let name = string_at t j in 205 + let v, j' = walk (j + 1) in 206 + mems j' (((name, meta), v) :: acc) 207 + in 208 + let ms, next = mems (i + 1) [] in 209 + (Value.Object (ms, meta), next) 210 + | Root | End_array | End_object -> 211 + Fmt.invalid_arg "Tape.to_value: unexpected tag at word %d" i 212 + in 213 + let v, _ = walk 1 in 214 + v 215 + 216 + (* Serialization *) 217 + 218 + let to_bytes t = 219 + let wc = length t in 220 + let sb = Bytes.length t.strings in 221 + let out = Bytes.create (8 + (wc * 8) + sb) in 222 + Bytes.set_int32_le out 0 (Int32.of_int wc); 223 + Bytes.set_int32_le out 4 (Int32.of_int sb); 224 + Bytes.blit t.words 0 out 8 (wc * 8); 225 + Bytes.blit t.strings 0 out (8 + (wc * 8)) sb; 226 + out 227 + 228 + let of_bytes b = 229 + try 230 + let total = Bytes.length b in 231 + if total < 8 then Error "tape too short: header < 8 bytes" 232 + else 233 + let wc = Int32.to_int (Bytes.get_int32_le b 0) in 234 + let sb = Int32.to_int (Bytes.get_int32_le b 4) in 235 + if wc < 0 || sb < 0 then Error "negative word count or strbuf size" 236 + else 237 + let expected = 8 + (wc * 8) + sb in 238 + if total <> expected then 239 + Error 240 + (Fmt.str "tape size mismatch: got %d, expected %d" total expected) 241 + else 242 + let words = Bytes.sub b 8 (wc * 8) in 243 + let strings = Bytes.sub b (8 + (wc * 8)) sb in 244 + Ok { words; strings } 245 + with _ -> Error "tape parse error" 246 + 247 + (* Debug *) 248 + 249 + let pp ppf t = 250 + Format.pp_open_vbox ppf 0; 251 + for i = 0 to length t - 1 do 252 + let w = get_word t i in 253 + let tg_opt, payload = unpack w in 254 + let tg = match tg_opt with Some t -> tag_char t | None -> '?' in 255 + Fmt.pf ppf "[%04d] %c %Ld" i tg payload; 256 + (if tg_opt = Some String then 257 + try Fmt.pf ppf " %S" (string_at t i) with _ -> ()); 258 + if i < length t - 1 then Format.pp_print_cut ppf () 259 + done; 260 + Format.pp_close_box ppf ()
+109
lib/tape.mli
··· 1 + (** Simdjson-compatible tape representation for JSON documents. 2 + 3 + A tape is a flat sequence of 64-bit words encoding the structural skeleton 4 + of a JSON document. Each word has a one-byte type tag in its high byte and a 5 + 56-bit payload in the low bytes. Scalar values that don't fit inline 6 + (strings) live in a side buffer referenced by payload offsets. 7 + 8 + Layout matches {{:https://simdjson.org} simdjson}'s tape: the output of its 9 + SIMD first-pass parse and the input its On-Demand API queries against. We 10 + build tapes from {!Value.t} here; a string-to-tape parser (the SIMD fast 11 + path) is not implemented. 12 + 13 + {2 Word layout} 14 + 15 + Each word is a little-endian 64-bit quantity: 16 + {v 17 + byte 7 | bytes 0..6 18 + type tag | 56-bit payload 19 + v} 20 + 21 + Tags are ASCII characters following simdjson's convention: 22 + 23 + - [r] root marker (at index 0 and at [length t - 1]) 24 + - [n] null 25 + - [t] / [f] true / false 26 + - [l] signed 64-bit integer (payload = value as signed 56-bit, 27 + sign-extended) 28 + - [u] unsigned 64-bit integer (payload = value) 29 + - [d] IEEE 754 double (payload = bit pattern truncated to 56 bits; lossy) 30 + - [s] string (payload = byte offset into the string buffer) 31 + - [\[] / [\]] array start / end (payload = matching entry's word index) 32 + - [\{] / [\}] object start / end (payload = matching entry's word index) 33 + 34 + {b Note on numbers.} Genuine 64-bit doubles and large integers cannot fit in 35 + 56 bits. This implementation stores their top 56 bits only — enough for 36 + schema / layout use cases but not for lossless roundtrips of arbitrary 37 + doubles. For exact numbers, decode values against the standard {!Json.codec} 38 + instead of using the tape. *) 39 + 40 + type t 41 + (** An immutable tape. *) 42 + 43 + (** {1:tags Type tags} *) 44 + 45 + type tag = 46 + | Root (** Document boundary. *) 47 + | Null 48 + | True 49 + | False 50 + | Int64 (** Signed integer (56-bit payload). *) 51 + | Uint64 (** Unsigned integer (56-bit payload). *) 52 + | Double (** IEEE 754 double (high 56 bits of bit pattern). *) 53 + | String (** Payload is offset into the string buffer. *) 54 + | Start_array (** Payload is the matching [End_array] word index. *) 55 + | End_array 56 + | Start_object (** Payload is the matching [End_object] word index. *) 57 + | End_object 58 + 59 + val pp_tag : tag Fmt.t 60 + (** [pp_tag] pretty-prints a tag as its ASCII character. *) 61 + 62 + (** {1:navigation Navigation} *) 63 + 64 + val length : t -> int 65 + (** [length t] is the number of words (including root markers). *) 66 + 67 + val tag_at : t -> int -> tag 68 + (** [tag_at t i] is the tag at word [i]. 69 + 70 + @raise Invalid_argument if [i] is out of bounds. *) 71 + 72 + val payload_at : t -> int -> int64 73 + (** [payload_at t i] is the 56-bit payload (zero-extended to 64 bits). *) 74 + 75 + val string_at : t -> int -> string 76 + (** [string_at t i] reads the string at word [i] (tag must be [String]). 77 + 78 + @raise Invalid_argument if the word is not a string. *) 79 + 80 + (** {1:interop Interop with {!Value.t}} *) 81 + 82 + val of_value : Value.t -> t 83 + (** [of_value v] builds a tape from a generic JSON value. *) 84 + 85 + val to_value : t -> Value.t 86 + (** [to_value t] reconstructs a {!Value.t} from the tape. Metadata (source 87 + locations, whitespace) is not preserved. *) 88 + 89 + (** {1:bytes Serialization} *) 90 + 91 + val to_bytes : t -> bytes 92 + (** [to_bytes t] serializes the tape. Layout: 93 + {v 94 + 0..3 : word count (uint32 LE) 95 + 4..7 : string buffer size (uint32 LE) 96 + 8..8+W : tape words (uint64 LE each) 97 + rest : string buffer 98 + v} 99 + Words are stored little-endian, matching simdjson's native layout on x86_64 100 + and arm64. *) 101 + 102 + val of_bytes : bytes -> (t, string) result 103 + (** [of_bytes b] parses a serialized tape. Returns [Error msg] on malformed 104 + input. *) 105 + 106 + (** {1:debug Debugging} *) 107 + 108 + val pp : t Fmt.t 109 + (** [pp] prints a one-word-per-line disassembly. *)