OCaml Zarr jsont codecs for v2/v3 and common conventions
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

feat: dtype codec with full NumPy typestr parsing

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

+236
+156
src/zarr_jsont.ml
(* ··· diff context: tail of a definition that starts above this excerpt,
   quoted verbatim because its beginning is not visible here:
     | `Bytes _ -> array_codec)
     ()
   ··· *)

(** Byte-order marker taken from the first character of a NumPy typestr. *)
type endian = [ `Little | `Big | `Not_applicable ]

(** A parsed dtype. For the simple kinds the [int] is the number written after
    the kind character in the typestr (e.g. [8] in ["<f8"]); for
    [`Datetime]/[`Timedelta] the [string] is the unit between the brackets
    (e.g. ["ns"] in ["<M8[ns]"]). [`Structured] carries one
    [(name, dtype, optional shape)] triple per field. *)
type dtype = [
  | `Bool
  | `Int of endian * int
  | `Uint of endian * int
  | `Float of endian * int
  | `Complex of endian * int
  | `Timedelta of endian * string
  | `Datetime of endian * string
  | `String of int
  | `Unicode of endian * int
  | `Raw of int
  | `Structured of (string * dtype * int list option) list
]

(** [parse_endian c] maps the leading typestr character to an {!endian}.
    @raise Failure on any character other than ['<'], ['>'], ['|'], ['=']. *)
let parse_endian = function
  | '<' -> `Little
  | '>' -> `Big
  (* NOTE(review): '=' is NumPy's "native byte order" marker; it is folded into
     [`Not_applicable] here, so it is re-encoded as '|'. Confirm this is
     intended for multi-byte types. *)
  | '|' | '=' -> `Not_applicable
  | c -> failwith (Printf.sprintf "dtype: unknown endian char %c" c)

(** [endian_char e] is the typestr character used when encoding [e]. *)
let endian_char = function
  | `Little -> '<'
  | `Big -> '>'
  | `Not_applicable -> '|'

(* Parse the size part of a typestr. Only a non-empty run of ASCII decimal
   digits is accepted: [int_of_string] alone would also let through signs,
   "0x"/"0b" prefixes and '_' separators, and reports failures without any
   context. [int_of_string_opt] still guards against overflow. *)
let parse_size ~typestr digits =
  let is_digit c = c >= '0' && c <= '9' in
  let invalid () =
    failwith
      (Printf.sprintf "dtype: invalid size %S in typestr: %s" digits typestr)
  in
  if digits = "" || not (String.for_all is_digit digits) then invalid ()
  else
    match int_of_string_opt digits with
    | Some n -> n
    | None -> invalid ()

(* Extract the unit from the "8[unit]" tail of a datetime/timedelta typestr.
   The first ']' must also be the last character, so an unterminated unit
   ("8[ns") or trailing text ("8[ns]x") is rejected with [Failure] instead of
   escaping as [Not_found] or being silently dropped. [what] is "datetime" or
   "timedelta" and only feeds the error message. *)
let parse_time_unit ~what ~typestr rest =
  let malformed () =
    failwith (Printf.sprintf "dtype: invalid %s typestr: %s" what typestr)
  in
  let len = String.length rest in
  if len > 2 && rest.[0] = '8' && rest.[1] = '[' then
    match String.index_opt rest ']' with
    | Some close when close = len - 1 -> String.sub rest 2 (close - 2)
    | Some _ | None -> malformed ()
  else malformed ()

(** [parse_typestr s] parses a NumPy typestr like ["<f8"], ["|b1"],
    ["<M8[ns]"], ["|S10"]: one byte-order character, one kind character, then
    a size (or ["8[unit]"] for the ['M'] and ['m'] kinds).
    @raise Failure if [s] is shorter than three characters, uses an unknown
    byte-order or kind character, has a malformed size or unit, or is a
    boolean whose size is not [1]. *)
let parse_typestr s =
  if String.length s < 3 then
    failwith (Printf.sprintf "dtype: typestr too short: %s" s);
  let endian = parse_endian s.[0] in
  let rest = String.sub s 2 (String.length s - 2) in
  let size () = parse_size ~typestr:s rest in
  match s.[1] with
  | 'b' ->
    let n = size () in
    if n = 1 then `Bool
    else failwith (Printf.sprintf "dtype: invalid bool size %d" n)
  | 'i' -> `Int (endian, size ())
  | 'u' -> `Uint (endian, size ())
  | 'f' -> `Float (endian, size ())
  | 'c' -> `Complex (endian, size ())
  | 'M' -> `Datetime (endian, parse_time_unit ~what:"datetime" ~typestr:s rest)
  | 'm' -> `Timedelta (endian, parse_time_unit ~what:"timedelta" ~typestr:s rest)
  | 'S' -> `String (size ())
  | 'U' -> `Unicode (endian, size ())
  | 'V' -> `Raw (size ())
  | c -> failwith (Printf.sprintf "dtype: unknown kind char %c" c)

(** [encode_typestr dt] renders a non-structured dtype back to its typestr.
    [`Bool], [`String] and [`Raw] are always written with the ['|'] marker.
    @raise Failure if [dt] is [`Structured _]; structured dtypes have no
    typestr form and are encoded as JSON arrays by {!dtype_jsont}. *)
let encode_typestr (dt : dtype) : string =
  match dt with
  | `Bool -> "|b1"
  | `Int (e, n) -> Printf.sprintf "%ci%d" (endian_char e) n
  | `Uint (e, n) -> Printf.sprintf "%cu%d" (endian_char e) n
  | `Float (e, n) -> Printf.sprintf "%cf%d" (endian_char e) n
  | `Complex (e, n) -> Printf.sprintf "%cc%d" (endian_char e) n
  | `Datetime (e, u) -> Printf.sprintf "%cM8[%s]" (endian_char e) u
  | `Timedelta (e, u) -> Printf.sprintf "%cm8[%s]" (endian_char e) u
  | `String n -> Printf.sprintf "|S%d" n
  | `Unicode (e, n) -> Printf.sprintf "%cU%d" (endian_char e) n
  | `Raw n -> Printf.sprintf "|V%d" n
  | `Structured _ -> failwith "dtype: encode_typestr called on structured dtype"

(* Forward reference to allow recursive dtype_jsont: field descriptors of a
   structured dtype may themselves contain dtypes. It is filled in right after
   [dtype_jsont] is defined and only dereferenced while decoding/encoding. *)
let dtype_jsont_fwd : dtype Jsont.t ref = ref (Jsont.todo ~kind:"dtype" ())

(** Codec for {!dtype}: JSON strings go through {!parse_typestr} /
    {!encode_typestr}, JSON arrays are read and written as lists of field
    descriptors [["name", dtype]] or [["name", dtype, [dim, ...]]].

    NOTE(review): the mapping functions signal bad input with [Failure], as
    before; confirm how Jsont treats exceptions raised inside [~dec]/[~enc]. *)
let dtype_jsont : dtype Jsont.t = Jsont.rec' (lazy (
  let simple_codec =
    Jsont.map ~kind:"dtype_string"
      ~dec:parse_typestr
      ~enc:encode_typestr
      Jsont.string
  in
  (* A shape entry must be a JSON number holding an integer that fits in an
     OCaml [int]; [int_of_float] on its own would silently truncate 2.5 and is
     unspecified for nan/infinity. *)
  let decode_dim = function
    | Jsont.Number (f, _)
      when Float.is_integer f && Float.abs f < float_of_int max_int ->
      int_of_float f
    | j ->
      failwith
        (Format.asprintf "dtype: expected int in shape, got %a" Jsont.pp_json j)
  in
  let decode_shape items = List.map decode_dim items in
  (* The dtype slot of a field descriptor is either a typestr or a nested
     structured dtype (a JSON array), decoded through the forward reference. *)
  let decode_field_dtype = function
    | Jsont.String (typestr, _) -> parse_typestr typestr
    | Jsont.Array _ as nested_json ->
      (match Jsont.Json.decode !dtype_jsont_fwd nested_json with
       | Ok v -> v
       | Error e ->
         failwith
           (Printf.sprintf "dtype: nested structured decode error: %s" e))
    | _ -> failwith "dtype: invalid field descriptor"
  in
  (* Decode a single field descriptor from a JSON list:
     ["name", "<f4"] or ["name", "<f4", [3, 2]] *)
  let decode_field (json_items : Jsont.json list)
    : string * dtype * int list option =
    match json_items with
    | [ Jsont.String (name, _); dt_json ] ->
      (name, decode_field_dtype dt_json, None)
    | [ Jsont.String (name, _); dt_json; Jsont.Array (shape_items, _) ] ->
      let dt = decode_field_dtype dt_json in
      (name, dt, Some (decode_shape shape_items))
    | _ -> failwith "dtype: invalid field descriptor"
  in
  let encode_field (name, dt, shape_opt) =
    let dtype_json =
      match Jsont.Json.encode !dtype_jsont_fwd dt with
      | Ok j -> j
      | Error e -> failwith (Printf.sprintf "dtype: encode error: %s" e)
    in
    (* The shape is a trailing, optional third element of the descriptor. *)
    let shape_json =
      match shape_opt with
      | None -> []
      | Some shape ->
        [ Jsont.Json.list
            (List.map (fun n -> Jsont.Json.number (float_of_int n)) shape) ]
    in
    Jsont.Json.list (Jsont.Json.string name :: dtype_json :: shape_json)
  in
  let structured_codec =
    Jsont.map ~kind:"dtype_array"
      ~dec:(fun (fields_json : Jsont.json list) ->
        let fields =
          List.map (function
            | Jsont.Array (items, _) -> decode_field items
            | j ->
              failwith
                (Format.asprintf
                   "dtype: expected array field descriptor, got %a"
                   Jsont.pp_json j))
            fields_json
        in
        `Structured fields)
      ~enc:(function
        | `Structured fields -> List.map encode_field fields
        (* Unreachable through [dtype_jsont]: the [~enc] dispatch below only
           selects this codec for [`Structured] values. *)
        | _ -> assert false)
      (Jsont.list Jsont.json)
  in
  Jsont.any ~kind:"dtype"
    ~dec_string:simple_codec
    ~dec_array:structured_codec
    ~enc:(function
      | `Structured _ -> structured_codec
      | _ -> simple_codec)
    ()
))

let () = dtype_jsont_fwd := dtype_jsont

(* ··· diff context: head of the next definition, which continues below this
   excerpt, quoted verbatim:
     module Other_ext = struct
       type t = { name : string; configuration : Jsont.json option; must_understand : bool }
   ··· *)
+27
src/zarr_jsont.mli
val fill_value_jsont : fill_value Jsont.t
(** Codec for {!fill_value}. Dispatches on the JSON sort via {!Jsont.any}. *)

(** Byte order of a NumPy array dtype, i.e. the leading character of a
    typestr such as ["<f8"] or ["|b1"]. *)
type endian =
  [ `Little
  | `Big
  | `Not_applicable
  ]

(** NumPy array dtype as found in Zarr v2 array metadata ([".zarray"]).

    A simple dtype is a JSON string in NumPy typestr format, for example
    ["<f8"] or ["|b1"]. A structured dtype is a JSON array of field
    descriptors; each descriptor is either [["name","<dtype_str"]] or
    [["name","<dtype_str",[dim1,...]]], and maps to one
    [(name, dtype, shape)] triple of [`Structured], where [shape] is [None]
    when the descriptor has no dimension list. *)
type dtype =
  [ `Bool
  | `Int of endian * int
  | `Uint of endian * int
  | `Float of endian * int
  | `Complex of endian * int
  | `Timedelta of endian * string
  | `Datetime of endian * string
  | `String of int
  | `Unicode of endian * int
  | `Raw of int
  | `Structured of (string * dtype * int list option) list
  ]

val dtype_jsont : dtype Jsont.t
(** Codec for {!dtype}. JSON strings are read and written for the simple
    types, JSON arrays for [`Structured]. *)

(* ··· following diff context, cut off inside a doc comment that continues
   beyond this excerpt:
     Catch-all type for unrecognized v2 codecs.
     Represents objects with an ["id"] key plus arbitrary extra fields,
   ··· *)
+53
test/test_zarr_jsont.ml
(* ··· diff context: tail of test_fill_value, which starts above this excerpt,
   quoted verbatim:
     (match v with `Bytes s -> assert (String.length s = 3) | _ -> assert false);
     print_endline "test_fill_value: ok"
   ··· *)

(* Exercises Zarr_jsont.dtype_jsont: one decode per simple kind, one
   structured decode, then encode/decode round trips. *)
let test_dtype () =
  let dt = Zarr_jsont.dtype_jsont in
  (* [check json expected] decodes [json] and compares it to [expected]. *)
  let check json expected = assert (decode dt json = expected) in
  (* [roundtrip v] encodes [v] and checks that decoding gives [v] back. *)
  let roundtrip v = assert (decode dt (encode dt v) = v) in
  (* simple float *)
  check {|"<f8"|} (`Float (`Little, 8));
  (* big-endian int *)
  check {|">i4"|} (`Int (`Big, 4));
  (* boolean *)
  check {|"|b1"|} `Bool;
  (* unsigned int *)
  check {|"<u2"|} (`Uint (`Little, 2));
  (* complex *)
  check {|"<c16"|} (`Complex (`Little, 16));
  (* datetime *)
  check {|"<M8[ns]"|} (`Datetime (`Little, "ns"));
  (* timedelta *)
  check {|"<m8[s]"|} (`Timedelta (`Little, "s"));
  (* fixed string *)
  check {|"|S10"|} (`String 10);
  (* unicode *)
  check {|"<U5"|} (`Unicode (`Little, 5));
  (* void/raw *)
  check {|"|V16"|} (`Raw 16);
  (* structured: exactly two fields, the second one carrying a shape *)
  (match decode dt {|[["x","<f4"],["y","<f4",[3]]]|} with
   | `Structured [ (n1, t1, s1); (n2, t2, s2) ] ->
     assert (n1 = "x" && t1 = `Float (`Little, 4) && s1 = None);
     assert (n2 = "y" && t2 = `Float (`Little, 4) && s2 = Some [3])
   | _ -> assert false);
  (* roundtrip simple *)
  roundtrip (`Float (`Little, 8));
  (* roundtrip structured *)
  roundtrip
    (`Structured
       [ ("x", `Float (`Little, 4), None);
         ("y", `Int (`Big, 2), Some [3; 2]) ]);
  print_endline "test_dtype: ok"

let () = test_other_codec ()
let () = test_other_ext ()
let () = test_fill_value ()
let () = test_dtype ()