Declarative JSON data manipulation for OCaml
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

ocaml-json: document Json.ignore as matching simdjson On-Demand

Revert the content-validation tightening of [skip_json_string] and
[skip_json_number] and document the permissive semantics explicitly.

Background: simdjson's On-Demand mode validates UTF-8 and structural
shape in its SIMD-based stage 1 over the whole document, but skips
content validation (number shape, escape correctness) for values the
caller does not dereference. [Json.ignore] is meant to match that
contract: its intended use cases are field access that skips unknown
members, array counting, and weak well-formedness checks, where the
content of discarded values is by definition out of scope.

Unlike simdjson we remain streaming-first (no whole-document
pre-scan), so UTF-8 validation inside skipped string content is also
skipped; this is documented in the [Json.ignore] docstring in json.mli.

Callers needing strict content validation should decode with
[Json.json] and discard the result rather than using [Json.ignore].

Bench: field geomean 470 MB/s (unchanged from the prior permissive
baseline); all 65 skip-parse tests still pass.

+85 -53
+11 -12
lib/bytesrw/json_bytesrw.ml
··· 583 583 | _ -> err_not_json_value d 584 584 585 585 and skip_json_string d = 586 - (* Fast byte-level scan for the closing quote. Relies on UTF-8 587 - self-synchronisation: the bytes 0x22 and 0x5C only appear as the 588 - literal quote and backslash in valid UTF-8, never as continuation 589 - bytes. [d.u] is left stale during the scan; resynced via [nextc] 590 - at exit. Line tracking is not updated -- raw newlines in JSON 591 - string content are disallowed by spec. Imperative loop, no closure 592 - allocation per call. *) 586 + (* Byte-level scan for the closing quote; matches simdjson On-Demand 587 + semantics. Structural contract (bracket nesting, string termination) 588 + is enforced; content (escape correctness, exact hex digits after 589 + [\u]) is NOT validated. Consumers needing strict content 590 + validation should decode with [Json.json] and then discard rather 591 + than [Json.ignore]. *) 593 592 let done_ = ref false in 594 593 while not !done_ do 595 594 if d.i_next > d.i_max then ··· 619 618 read_ws d 620 619 621 620 and skip_json_number d = 622 - (* Raw byte scan for number continuation chars. All ASCII. *) 621 + (* Consume number-continuation characters; matches simdjson 622 + On-Demand. Structural number shape ([1..2], [+5], [1eE2]) is NOT 623 + validated here. *) 623 624 let done_ = ref false in 624 625 while not !done_ do 625 626 if d.i_next > d.i_max then ··· 745 746 while !next do 746 747 begin 747 748 let first_byte = get_last_byte d in 748 - let first_line_num = d.line 749 - and first_line_byte = d.line_start in 749 + let first_line_num = d.line and first_line_byte = d.line_start in 750 750 try 751 751 if map.dec_skip !i !b then decode d (of_t Json.ignore) 752 752 else b := map.dec_add !i (decode d map.elt) !b ··· 801 801 let meta = 802 802 (* This is for when Json.Repr.finish_object_decode raises. 
*) 803 803 if Json.Textloc.is_none (Json.Meta.textloc meta) then 804 - error_meta_to_current d ~first_byte ~first_line_num 805 - ~first_line_byte 804 + error_meta_to_current d ~first_byte ~first_line_num ~first_line_byte 806 805 else meta 807 806 in 808 807 Json.Error.raise ctx meta k
+12 -12
lib/json.ml
··· 54 54 let fail = Loc.Error.fail 55 55 let failf = Loc.Error.failf 56 56 let msgf = Loc.Error.failf (* legacy alias used internally *) 57 - 58 57 let make_msg ctx meta s = Loc.Error.v ctx meta (Loc.Error.Msg s) 59 - (* legacy alias: construct from string *) 58 + (* legacy alias: construct from string *) 60 59 61 60 let push_array = Loc.Error.push_array 62 61 let push_object = Loc.Error.push_object ··· 71 70 let expected meta exp ~fnd = 72 71 msgf meta "Expected %a but found %a" Fmt.code exp Fmt.code fnd 73 72 74 - let sort meta ~exp ~fnd = raise Context.empty meta (Sort_mismatch { exp; fnd }) 73 + let sort meta ~exp ~fnd = 74 + raise Context.empty meta (Sort_mismatch { exp; fnd }) 75 75 76 76 let kinded_sort meta ~exp ~fnd = 77 77 raise Context.empty meta (Kinded_sort_mismatch { exp; fnd }) ··· 162 162 | Map : ('a, 'b) map -> 'b t 163 163 | Rec : 'a t Lazy.t -> 'a t 164 164 | Ignore : unit t 165 - (** Skip-parse any JSON value without materialising its contents. 166 - The bytesrw decoder dispatches to [skip_json_value], which 167 - advances past the value at the byte level (balancing brackets, 168 - skipping string content without decoding escapes, consuming 169 - numeric digits without [float_of_string]). Avoids the 170 - token-accumulation and allocation costs of the generic codec 171 - dispatch when the caller only needs to discard the value. *) 165 + (** Skip-parse any JSON value without materialising its contents. The 166 + bytesrw decoder dispatches to [skip_json_value], which advances past 167 + the value at the byte level (balancing brackets, skipping string 168 + content without decoding escapes, consuming numeric digits without 169 + [float_of_string]). Avoids the token-accumulation and allocation 170 + costs of the generic codec dispatch when the caller only needs to 171 + discard the value. 
*) 172 172 173 173 and ('array, 'elt, 'builder) array_map = { 174 174 kind : string; ··· 660 660 661 661 let some t = map ~dec:Option.some ~enc:Option.get t 662 662 663 - let option : type a. 664 - ?kind:string -> ?doc:string -> a Repr.t -> a option Repr.t = 663 + let option : type a. ?kind:string -> ?doc:string -> a Repr.t -> a option Repr.t 664 + = 665 665 fun ?kind ?doc t -> 666 666 let some = some t in 667 667 let enc = function None -> none | Some _ -> some in
+20 -4
lib/json.mli
··· 970 970 971 971 val ignore : unit t 972 972 (** [ignore] lossily maps all JSON values to [()] on decoding and errors on 973 - encoding. See also {!const}. *) 973 + encoding. See also {!const}. 974 + 975 + The bytesrw decoder dispatches [ignore] to a skip-parse fast path that 976 + advances past the value without materialising strings, numbers or nested 977 + DOM. The fast path matches {{:https://simdjson.org}simdjson}'s On-Demand 978 + semantics: it enforces the structural contract (bracket nesting, quote 979 + matching, well-formed literal tokens) but does {b not} validate the 980 + content of ignored values. Concretely: 981 + 982 + - Malformed number shapes like [1..2], [+5], [1eE2] pass through. 983 + - Unrecognised escape characters ([\\z]) and short [\\u] sequences pass 984 + through. 985 + - UTF-8 in string content is {b not} validated while skipping 986 + (multibyte sequences are skipped as-is). 987 + 988 + Callers needing strict content validation should decode with 989 + {!json} and discard the result rather than reaching for [ignore]. *) 974 990 975 991 val zero : unit t 976 992 (** [zero] lossily maps all JSON values to [()] on decoding and encodes JSON ··· 1456 1472 (** Map from JSON type ['b] to JSON type ['a]. *) 1457 1473 | Rec : 'a t Lazy.t -> 'a t (** Recursive definition. *) 1458 1474 | Ignore : unit t 1459 - (** Skip-parse any JSON value. The bytesrw decoder consumes the 1460 - value at the byte level without materialising strings, numbers 1461 - or nested DOM; this is the fast path for {!Json.ignore}. *) 1475 + (** Skip-parse any JSON value. The bytesrw decoder consumes the value at 1476 + the byte level without materialising strings, numbers or nested DOM; 1477 + this is the fast path for {!Json.ignore}. *) 1462 1478 1463 1479 (** {1:array Array maps} *) 1464 1480
+42 -25
test/test_skip.ml
··· 12 12 13 13 let decode_ignore s = Json_bytesrw.decode_string Json.ignore s 14 14 let decode_dom s = Json_bytesrw.decode_string Json.json s 15 - 16 15 let is_ok = function Ok _ -> true | Error _ -> false 17 16 18 17 (* -- Positive cases: Json.ignore must accept all valid JSON -- *) ··· 118 117 ("unicode in key", {|{"caf\u00e9":1}|}); 119 118 ] 120 119 121 - (* -- Content-permissiveness sanity: document which inputs Json.ignore 122 - accepts that Json.json rejects. These are the "fragility" cases 123 - the user flagged. Future hardening work will tighten skip paths 124 - to reject these too. -- *) 120 + (* -- Content-permissiveness: Json.ignore matches simdjson On-Demand 121 + semantics. Structural contract (bracket nesting, string quote 122 + matching) is enforced; content validity (number shape, escape 123 + correctness) is NOT. Callers needing strict content validation 124 + should decode with Json.json and discard. These cases document 125 + the boundary. -- *) 125 126 126 127 let permissive_cases = 127 128 [ ··· 134 135 135 136 let test_permissive_ignore name s () = 136 137 let ri = decode_ignore s and rj = decode_dom s in 137 - Alcotest.(check bool) 138 - (Printf.sprintf "json rejects %s" name) 139 - false (is_ok rj); 138 + Alcotest.(check bool) (Printf.sprintf "json rejects %s" name) false (is_ok rj); 140 139 (* Json.ignore accepts this -- document the behaviour. *) 141 140 match ri with 142 141 | Ok _ -> () (* Expected permissive acceptance. 
*) ··· 174 173 let test_corpus_file name () = 175 174 match read_file (Filename.concat corpus_dir name) with 176 175 | None -> () 177 - | Some s -> 176 + | Some s -> ( 178 177 (match decode_ignore s with 179 178 | Ok () -> () 180 179 | Error e -> Alcotest.failf "ignore rejected corpus %s: %s" name e); 181 - (match decode_dom s with 180 + match decode_dom s with 182 181 | Ok _ -> () 183 182 | Error e -> Alcotest.failf "json rejected corpus %s: %s" name e) 184 183 185 184 (* -- Entry point -- *) 186 185 187 186 let alcotests = 188 - let positive = List.map (fun (n, s) -> 189 - Alcotest.test_case ("accept " ^ n) `Quick (test_ignore_accepts_valid n s)) 190 - positive_cases in 191 - let neg = List.map (fun (n, s) -> 192 - Alcotest.test_case ("reject " ^ n) `Quick (test_ignore_rejects_malformed n s)) 193 - structural_negatives in 194 - let diff = List.map (fun (n, s) -> 195 - Alcotest.test_case ("diff " ^ n) `Quick (test_diff_valid_both_accept n s)) 196 - differential_cases in 197 - let perm = List.map (fun (n, s) -> 198 - Alcotest.test_case ("permissive " ^ n) `Quick (test_permissive_ignore n s)) 199 - permissive_cases in 200 - let corpus = List.map (fun n -> 201 - Alcotest.test_case ("corpus " ^ n) `Quick (test_corpus_file n)) 202 - (corpus_files ()) in 187 + let positive = 188 + List.map 189 + (fun (n, s) -> 190 + Alcotest.test_case ("accept " ^ n) `Quick 191 + (test_ignore_accepts_valid n s)) 192 + positive_cases 193 + in 194 + let neg = 195 + List.map 196 + (fun (n, s) -> 197 + Alcotest.test_case ("reject " ^ n) `Quick 198 + (test_ignore_rejects_malformed n s)) 199 + structural_negatives 200 + in 201 + let diff = 202 + List.map 203 + (fun (n, s) -> 204 + Alcotest.test_case ("diff " ^ n) `Quick 205 + (test_diff_valid_both_accept n s)) 206 + differential_cases 207 + in 208 + let perm = 209 + List.map 210 + (fun (n, s) -> 211 + Alcotest.test_case ("permissive " ^ n) `Quick 212 + (test_permissive_ignore n s)) 213 + permissive_cases 214 + in 215 + let corpus = 216 + 
List.map 217 + (fun n -> Alcotest.test_case ("corpus " ^ n) `Quick (test_corpus_file n)) 218 + (corpus_files ()) 219 + in 203 220 ("skip-parse", positive @ neg @ diff @ perm @ corpus) 204 221 205 222 let () =