Declarative JSON data manipulation for OCaml
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

ocaml-json: add skip-parse fast path for Json.ignore

Add a new [Ignore : unit t] constructor to [Json.Repr.t] and a
dedicated [skip_json_value] function in json_bytesrw that advances
past the value one character at a time (still through [nextc]) without:

- allocating token buffers or accumulating characters per byte
- calling float_of_string on numbers
- decoding \", \\ or \uXXXX escapes (only recognises the backslash for
string-boundary tracking)
- allocating DOM nodes

[Json.ignore] is redefined to use this constructor; existing callers
(e.g. [Object.mem field Json.ignore] for field-access decoding) pick
up the fast path automatically.

Benchmark (simdjson corpus, field-access mode):

geomean speedup 1.52x -> 2.46x

Allocations dropped sharply in field-access mode: canada.json from
69 MB/iter to 10 MB/iter (7x), citm_catalog.json from 9.8 MB/iter to
1.9 MB/iter (5x). DOM mode is unchanged.

A further step to hit 4x would bypass nextc's per-character UTF-8
decoding in the skip paths and scan raw bytes directly.

+137 -13
+2
lib/brr/json_brr.ml
··· 113 113 | Map map -> map.dec (decode map.dom jv) 114 114 | Any map -> decode_any t map jv 115 115 | Rec t -> decode (Lazy.force t) jv 116 + | Ignore -> () 116 117 117 118 and decode_array : type a e b. (a, e, b) array_map -> Jv.t -> a = 118 119 fun map jv -> ··· 290 291 | Any map -> encode (map.enc v) v 291 292 | Map map -> encode map.dom (map.enc v) 292 293 | Rec t -> encode (Lazy.force t) v 294 + | Ignore -> Json.Error.failf Json.Meta.none "Cannot encode Ignore value" 293 295 294 296 and encode_object : type o. 295 297 (o, o) Json.Repr.object_map -> do_unknown:bool -> o -> Jv.t -> Jv.t =
+105
lib/bytesrw/json_bytesrw.ml
··· 547 547 if d.u <> 0x0022 then err_exp_mem d) 548 548 else err_exp_comma_or_eoo d 549 549 550 + (* Skip-parse a JSON value: advance past [d.u] at the byte level without 551 + materialising token buffers, parsing numbers, or decoding string 552 + escapes. The only decoding done is UTF-8 in [nextc]; escapes in 553 + strings are recognised only enough to not stop at a backslash-quote. *) 554 + let rec skip_json_value d = 555 + read_ws d; 556 + match d.u with 557 + | 0x007B (* { *) -> skip_json_object d 558 + | 0x005B (* [ *) -> skip_json_array d 559 + | 0x0022 (* DQUOTE *) -> skip_json_string d 560 + | 0x006E (* n *) -> ignore (read_json_null d) 561 + | 0x0074 (* t *) -> ignore (read_json_true d) 562 + | 0x0066 (* f *) -> ignore (read_json_false d) 563 + | u when is_number_start u -> skip_json_number d 564 + | _ -> err_not_json_value d 565 + 566 + and skip_json_string d = 567 + (* d.u is 0x22. advance past opening quote and scan for matching close, 568 + honouring backslash escapes. no token buffer. *) 569 + nextc d; 570 + let rec loop () = 571 + match d.u with 572 + | 0x005C (* \ *) -> 573 + nextc d; 574 + (match d.u with 575 + | u when u = eot -> 576 + err_unclosed_string ~first_byte:0 577 + ~first_line:(Loc.line_num_none, Loc.byte_pos_none) d 578 + | _ -> nextc d); 579 + loop () 580 + | 0x0022 (* DQUOTE *) -> nextc d 581 + | u when u = eot -> 582 + err_unclosed_string ~first_byte:0 583 + ~first_line:(Loc.line_num_none, Loc.byte_pos_none) d 584 + | _ -> 585 + nextc d; 586 + loop () 587 + in 588 + loop (); 589 + read_ws d 590 + 591 + and skip_json_number d = 592 + let rec loop () = 593 + match d.u with 594 + | u 595 + when is_digit u || u = 0x002E (* . 
*) || u = 0x002D (* - *) 596 + || u = 0x002B (* + *) || u = 0x0065 (* e *) || u = 0x0045 (* E *) -> 597 + nextc d; 598 + loop () 599 + | _ -> () 600 + in 601 + loop (); 602 + read_ws d 603 + 604 + and skip_json_array d = 605 + nextc d; 606 + (* [ *) 607 + read_ws d; 608 + if d.u = 0x005D (* ] *) then ( 609 + nextc d; 610 + read_ws d) 611 + else 612 + let rec loop () = 613 + skip_json_value d; 614 + match d.u with 615 + | 0x002C (* , *) -> 616 + nextc d; 617 + read_ws d; 618 + loop () 619 + | 0x005D (* ] *) -> 620 + nextc d; 621 + read_ws d 622 + | fnd -> err_exp_comma_or_eoa d ~fnd 623 + in 624 + loop () 625 + 626 + and skip_json_object d = 627 + nextc d; 628 + (* { *) 629 + read_ws d; 630 + if d.u = 0x007D (* } *) then ( 631 + nextc d; 632 + read_ws d) 633 + else 634 + let rec loop () = 635 + if d.u <> 0x0022 then err_exp_mem d; 636 + skip_json_string d; 637 + if d.u <> 0x003A (* : *) then err_exp_colon d; 638 + nextc d; 639 + read_ws d; 640 + skip_json_value d; 641 + match d.u with 642 + | 0x002C (* , *) -> 643 + nextc d; 644 + read_ws d; 645 + loop () 646 + | 0x007D (* } *) -> 647 + nextc d; 648 + read_ws d 649 + | _ -> err_exp_comma_or_eoo d 650 + in 651 + loop () 652 + 550 653 let rec decode : type a. decoder -> a t -> a = 551 654 fun d t -> 552 655 match ··· 586 689 | Map map -> map.dec (decode d map.dom) 587 690 | Any map -> decode_any d t map 588 691 | Rec t -> decode d (Lazy.force t) 692 + | Ignore -> skip_json_value d 589 693 590 694 and decode_array : type a elt b. decoder -> (a, elt, b) array_map -> a = 591 695 fun d map -> ··· 1102 1206 | Any map -> encode ~nest (map.enc v) e v 1103 1207 | Map map -> encode ~nest map.dom e (map.enc v) 1104 1208 | Rec t -> encode ~nest (Lazy.force t) e v 1209 + | Ignore -> Json.Error.failf Json.Meta.none "Cannot encode Ignore value" 1105 1210 1106 1211 and encode_array : type a elt b. 1107 1212 nest:int -> (a, elt, b) Json.Repr.array_map -> encoder -> a -> unit =
+26 -13
lib/json.ml
··· 161 161 | Any : 'a any_map -> 'a t 162 162 | Map : ('a, 'b) map -> 'b t 163 163 | Rec : 'a t Lazy.t -> 'a t 164 + | Ignore : unit t 165 + (** Skip-parse any JSON value without materialising its contents. 166 + The bytesrw decoder dispatches to [skip_json_value], which 167 + advances past the value at the byte level (balancing brackets, 168 + skipping string content without decoding escapes, consuming 169 + numeric digits without [float_of_string]). Avoids the 170 + token-accumulation and allocation costs of the generic codec 171 + dispatch when the caller only needs to discard the value. *) 164 172 165 173 and ('array, 'elt, 'builder) array_map = { 166 174 kind : string; ··· 300 308 let doc = Option.value ~default:map.doc doc in 301 309 { map with kind; doc } 302 310 303 - let rec with_doc ?kind ?doc = function 311 + let rec with_doc : type a. ?kind:string -> ?doc:string -> a t -> a t = 312 + fun ?kind ?doc -> function 304 313 | Null map -> Null (base_map_with_doc ?kind ?doc map) 305 314 | Bool map -> Bool (base_map_with_doc ?kind ?doc map) 306 315 | Number map -> Number (base_map_with_doc ?kind ?doc map) ··· 310 319 | Any map -> Any (any_map_with_doc ?kind ?doc map) 311 320 | Map map -> Map (map_with_doc ?kind ?doc map) 312 321 | Rec l -> with_doc ?kind ?doc (Lazy.force l) 322 + | Ignore -> Ignore 313 323 314 324 let object_map_kinded_sort (map : ('o, 'dec) object_map) = 315 325 Sort.kinded ~kind:map.kind Object ··· 324 334 | Any map -> if map.kind = "" then any_map_kinded_sort map else map.kind 325 335 | Map map -> if map.kind = "" then kinded_sort map.dom else map.kind 326 336 | Rec l -> kinded_sort (Lazy.force l) 337 + | Ignore -> "ignore" 327 338 328 339 and array_map_kinded_sort : type a e b. 
(a, e, b) array_map -> string = 329 340 fun map -> ··· 359 370 | Any map -> if map.kind <> "" then map.kind else "any" 360 371 | Map map -> if map.kind <> "" then map.kind else kind map.dom 361 372 | Rec l -> kind (Lazy.force l) 373 + | Ignore -> "ignore" 362 374 363 375 let rec doc : type a. a t -> string = function 364 376 | Null map -> map.doc ··· 370 382 | Any map -> map.doc 371 383 | Map map -> map.doc 372 384 | Rec l -> doc (Lazy.force l) 385 + | Ignore -> "" 373 386 374 387 (* Errors *) 375 388 ··· 647 660 648 661 let some t = map ~dec:Option.some ~enc:Option.get t 649 662 650 - let option ?kind ?doc t = 663 + let option : type a. 664 + ?kind:string -> ?doc:string -> a Repr.t -> a option Repr.t = 665 + fun ?kind ?doc t -> 651 666 let some = some t in 652 667 let enc = function None -> none | Some _ -> some in 653 668 match t with ··· 657 672 | String _ -> any ?doc ?kind ~dec_null:none ~dec_string:some ~enc () 658 673 | Array _ -> any ?doc ?kind ~dec_null:none ~dec_array:some ~enc () 659 674 | Object _ -> any ?doc ?kind ~dec_null:none ~dec_object:some ~enc () 660 - | Any _ | Map _ | Rec _ -> 675 + | Any _ | Map _ | Rec _ | Ignore -> 661 676 any ?doc ?kind ~dec_null:none ~dec_bool:some ~dec_number:some 662 677 ~dec_string:some ~dec_array:some ~dec_object:some ~enc () 663 678 ··· 1340 1355 |> finish 1341 1356 end 1342 1357 1343 - (* Ignoring *) 1358 + (* Ignoring 1344 1359 1345 - let ignore = 1346 - let kind = "ignore" in 1347 - let dec_null = Repr.Null Base.ignore and dec_bool = Repr.Bool Base.ignore in 1348 - let dec_number = Repr.Number Base.ignore in 1349 - let dec_string = Repr.String Base.ignore in 1350 - let dec_array = Array.ignore and dec_object = Object.zero in 1351 - let enc _v = Error.no_encoder Meta.none ~kind in 1352 - any ~kind ~dec_null ~dec_bool ~dec_number ~dec_string ~dec_array ~dec_object 1353 - ~enc () 1360 + [ignore] uses the dedicated [Repr.Ignore] constructor so the bytesrw 1361 + decoder can skip-parse the value (no token buffers, no 
float parsing, 1362 + no DOM allocation). *) 1363 + 1364 + let ignore : unit t = Repr.Ignore 1354 1365 1355 1366 let zero = 1356 1367 let kind = "zero" in ··· 1607 1618 | Map map -> map.dec (decode map.dom j) 1608 1619 | Any map -> decode_any t map j 1609 1620 | Rec t -> decode (Lazy.force t) j 1621 + | Ignore -> () 1610 1622 1611 1623 and decode_array : type a elt b. 1612 1624 (a, elt, b) array_map -> Meta.t -> json list -> a = ··· 1793 1805 | Any map -> encode (map.enc v) v 1794 1806 | Map map -> encode map.dom (map.enc v) 1795 1807 | Rec t -> encode (Lazy.force t) v 1808 + | Ignore -> Error.no_encoder Meta.none ~kind:"ignore" 1796 1809 1797 1810 and encode_object : type o dec. 1798 1811 (o, o) object_map -> do_unknown:bool -> o -> object' -> object' =
+4
lib/json.mli
··· 1455 1455 | Map : ('b, 'a) map -> 'a t 1456 1456 (** Map from JSON type ['b] to JSON type ['a]. *) 1457 1457 | Rec : 'a t Lazy.t -> 'a t (** Recursive definition. *) 1458 + | Ignore : unit t 1459 + (** Skip-parse any JSON value. The bytesrw decoder consumes the 1460 + value at the byte level without materialising strings, numbers 1461 + or nested DOM; this is the fast path for {!Json.ignore}. *) 1458 1462 1459 1463 (** {1:array Array maps} *) 1460 1464