OCaml Zarr jsont codecs for v2/v3 and common conventions
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

feat: Conv.Geoemb geoembeddings convention codec

Implements the geoemb: convention from
geo-embeddings/embeddings-zarr-convention v1 with full typed codecs:

- Geoemb: type (pixel/chip), dimensions, model, source_data, data_type
plus optional gsd, spatial_layout, build_version, benchmark
- Chip_layout: layout_type (regular_grid/irregular), chip_size, stride
- Quantization: method, original_dtype, quantized_dtype, scale, link
- Scale: tagged union with Scalar (scale+offset) and Array (array_name+nodata)
using case_mem on the "type" discriminant

The Scale tagged union uses case_mem properly — the "type" field is
the discriminant and each case codec omits it from its own members.

zarr-inspect now displays geoemb metadata inline:
geoemb: pixel 128d model=... dtype=int8 gsd=10 layout=utm_zones
quantization: per_pixel_scale float32 -> int8 scale_array=scales

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

+314 -3
+252 -2
src/zarr_jsont.ml
··· 963 963 |> Jsont.Object.finish 964 964 end 965 965 966 + module Geoemb = struct 967 + module Chip_layout = struct 968 + type t = { 969 + layout_type : [ `Regular_grid | `Irregular ]; 970 + chip_size : int * int; 971 + stride : (int * int) option; 972 + grid_id : string option; 973 + grid_definition : string option; 974 + } 975 + let layout_type t = t.layout_type 976 + let chip_size t = t.chip_size 977 + let stride t = t.stride 978 + let grid_id t = t.grid_id 979 + let grid_definition t = t.grid_definition 980 + 981 + let layout_type_jsont = Jsont.enum ~kind:"layout_type" 982 + ["regular_grid", `Regular_grid; "irregular", `Irregular] 983 + 984 + let int_pair = 985 + Jsont.t2 ~kind:"int_pair" 986 + ~dec:(fun a b -> (a, b)) 987 + ~enc:(fun (a, b) i -> if i = 0 then a else b) 988 + Jsont.int 989 + 990 + let jsont = 991 + Jsont.Object.map ~kind:"Chip_layout" 992 + (fun lt cs st gi gd -> { layout_type = lt; chip_size = cs; 993 + stride = st; grid_id = gi; 994 + grid_definition = gd }) 995 + |> Jsont.Object.mem "layout_type" layout_type_jsont 996 + ~enc:(fun t -> t.layout_type) 997 + |> Jsont.Object.mem "chip_size" int_pair ~enc:(fun t -> t.chip_size) 998 + |> Jsont.Object.opt_mem "stride" int_pair ~enc:(fun t -> t.stride) 999 + |> Jsont.Object.opt_mem "grid_id" Jsont.string ~enc:(fun t -> t.grid_id) 1000 + |> Jsont.Object.opt_mem "grid_definition" Jsont.string 1001 + ~enc:(fun t -> t.grid_definition) 1002 + |> Jsont.Object.skip_unknown 1003 + |> Jsont.Object.finish 1004 + end 1005 + 1006 + module Scale = struct 1007 + module Scalar = struct 1008 + type t = { scale : float; offset : float } 1009 + let scale t = t.scale 1010 + let offset t = t.offset 1011 + let jsont = 1012 + Jsont.Object.map ~kind:"Scale.Scalar" 1013 + (fun scale offset -> 1014 + { scale; offset = Option.value ~default:0.0 offset }) 1015 + |> Jsont.Object.mem "scale" Jsont.number ~enc:(fun t -> t.scale) 1016 + |> Jsont.Object.opt_mem "offset" Jsont.number 1017 + ~enc:(fun t -> 1018 + if t.offset = 0.0 then None else Some t.offset) 1019 + |> Jsont.Object.skip_unknown 1020 + |> Jsont.Object.finish 1021 + end 1022 + module Array_ = struct 1023 + type t = { array_name : string; nodata : string option } 1024 + let array_name t = t.array_name 1025 + let nodata t = t.nodata 1026 + let nodata_jsont = 1027 + let from_string = Jsont.map ~kind:"nodata" 1028 + ~dec:(fun s -> s) ~enc:(fun s -> s) Jsont.string in 1029 + let from_number = Jsont.map ~kind:"nodata" 1030 + ~dec:(fun f -> Printf.sprintf "%g" f) 1031 + ~enc:(fun _ -> 0.0) 1032 + Jsont.number in 1033 + Jsont.any ~kind:"nodata" 1034 + ~dec_string:from_string ~dec_number:from_number 1035 + ~enc:(fun _ -> from_string) () 1036 + let jsont = 1037 + Jsont.Object.map ~kind:"Scale.Array" 1038 + (fun array_name nodata -> { array_name; nodata }) 1039 + |> Jsont.Object.mem "array_name" Jsont.string 1040 + ~enc:(fun t -> t.array_name) 1041 + |> Jsont.Object.opt_mem "nodata" nodata_jsont 1042 + ~enc:(fun t -> t.nodata) 1043 + |> Jsont.Object.skip_unknown 1044 + |> Jsont.Object.finish 1045 + end 1046 + type t = [ `Scalar of Scalar.t | `Array of Array_.t ] 1047 + 1048 + let jsont = 1049 + let scalar = Jsont.Object.Case.map ~dec:(fun s -> `Scalar s) 1050 + "scalar" Scalar.jsont in 1051 + let array_ = Jsont.Object.Case.map ~dec:(fun a -> `Array a) 1052 + "array" Array_.jsont in 1053 + let enc_case = function 1054 + | `Scalar s -> Jsont.Object.Case.value scalar s 1055 + | `Array a -> Jsont.Object.Case.value array_ a 1056 + in 1057 + Jsont.Object.map ~kind:"Scale" Fun.id 1058 + |> Jsont.Object.case_mem "type" Jsont.string 1059 + ~enc:Fun.id ~enc_case 1060 + ~tag_to_string:Fun.id ~tag_compare:String.compare 1061 + Jsont.Object.Case.[make scalar; make array_] 1062 + |> Jsont.Object.finish 1063 + end 1064 + 1065 + module Quantization = struct 1066 + type t = { 1067 + meth : string; 1068 + original_dtype : string; 1069 + quantized_dtype : string option; 1070 + scale : Scale.t option; 1071 + link : string option; 1072 + } 1073 + let meth t = t.meth 1074 + let original_dtype t = t.original_dtype 1075 + let quantized_dtype t = t.quantized_dtype 1076 + let scale t = t.scale 1077 + let link t = t.link 1078 + let jsont = 1079 + Jsont.Object.map ~kind:"Quantization" 1080 + (fun m od qd sc lk -> 1081 + { meth = m; original_dtype = od; quantized_dtype = qd; 1082 + scale = sc; link = lk }) 1083 + |> Jsont.Object.mem "method" Jsont.string ~enc:(fun t -> t.meth) 1084 + |> Jsont.Object.mem "original_dtype" Jsont.string 1085 + ~enc:(fun t -> t.original_dtype) 1086 + |> Jsont.Object.opt_mem "quantized_dtype" Jsont.string 1087 + ~enc:(fun t -> t.quantized_dtype) 1088 + |> Jsont.Object.opt_mem "scale" Scale.jsont ~enc:(fun t -> t.scale) 1089 + |> Jsont.Object.opt_mem "link" Jsont.string ~enc:(fun t -> t.link) 1090 + |> Jsont.Object.skip_unknown 1091 + |> Jsont.Object.finish 1092 + end 1093 + 1094 + type t = { 1095 + type_ : [ `Pixel | `Chip ]; 1096 + dimensions : int; 1097 + model : string; 1098 + source_data : string list; 1099 + data_type : string; 1100 + gsd : float option; 1101 + chip_layout : Chip_layout.t option; 1102 + quantization : Quantization.t option; 1103 + spatial_layout : string option; 1104 + build_version : string option; 1105 + benchmark : string list option; 1106 + } 1107 + let type_ t = t.type_ 1108 + let dimensions t = t.dimensions 1109 + let model t = t.model 1110 + let source_data t = t.source_data 1111 + let data_type t = t.data_type 1112 + let gsd t = t.gsd 1113 + let chip_layout t = t.chip_layout 1114 + let quantization t = t.quantization 1115 + let spatial_layout t = t.spatial_layout 1116 + let build_version t = t.build_version 1117 + let benchmark t = t.benchmark 1118 + 1119 + let meta = { Meta. 1120 + uuid = "61c12cc5-0e28-4056-999a-480cf3fb7e4c"; 1121 + name = "geoemb:"; 1122 + schema_url = Some "https://raw.githubusercontent.com/geo-embeddings/embeddings-zarr-convention/refs/tags/v1/schema.json"; 1123 + spec_url = Some "https://github.com/geo-embeddings/embeddings-zarr-convention/blob/v1/README.md"; 1124 + description = Some "Geoembeddings convention for geospatial embedding arrays with model provenance"; 1125 + } 1126 + 1127 + let type_jsont = Jsont.enum ~kind:"geoemb_type" ["pixel", `Pixel; "chip", `Chip] 1128 + 1129 + let jsont = 1130 + Jsont.Object.map ~kind:"Geoemb" 1131 + (fun ty dim mdl src dt gsd cl qu sl bv bm -> 1132 + { type_ = ty; dimensions = dim; model = mdl; source_data = src; 1133 + data_type = dt; gsd; chip_layout = cl; quantization = qu; 1134 + spatial_layout = sl; build_version = bv; benchmark = bm }) 1135 + |> Jsont.Object.mem "geoemb:type" type_jsont ~enc:(fun t -> t.type_) 1136 + |> Jsont.Object.mem "geoemb:dimensions" Jsont.int 1137 + ~enc:(fun t -> t.dimensions) 1138 + |> Jsont.Object.mem "geoemb:model" Jsont.string ~enc:(fun t -> t.model) 1139 + |> Jsont.Object.mem "geoemb:source_data" (Jsont.list Jsont.string) 1140 + ~enc:(fun t -> t.source_data) 1141 + |> Jsont.Object.mem "geoemb:data_type" Jsont.string 1142 + ~enc:(fun t -> t.data_type) 1143 + |> Jsont.Object.opt_mem "geoemb:gsd" Jsont.number ~enc:(fun t -> t.gsd) 1144 + |> Jsont.Object.opt_mem "geoemb:chip_layout" Chip_layout.jsont 1145 + ~enc:(fun t -> t.chip_layout) 1146 + |> Jsont.Object.opt_mem "geoemb:quantization" Quantization.jsont 1147 + ~enc:(fun t -> t.quantization) 1148 + |> Jsont.Object.opt_mem "geoemb:spatial_layout" Jsont.string 1149 + ~enc:(fun t -> t.spatial_layout) 1150 + |> Jsont.Object.opt_mem "geoemb:build_version" Jsont.string 1151 + ~enc:(fun t -> t.build_version) 1152 + |> Jsont.Object.opt_mem "geoemb:benchmark" (Jsont.list Jsont.string) 1153 + ~enc:(fun t -> t.benchmark) 1154 + |> Jsont.Object.skip_unknown 1155 + |> Jsont.Object.finish 1156 + end 1157 + 966 1158 module Multiscales = struct 967 1159 module Transform = struct 968 1160 type t = { ··· 1043 1235 proj : Conv.Proj.t option; 1044 1236 spatial : Conv.Spatial.t option; 1045 1237 multiscales : Conv.Multiscales.t option; 1238 + geoemb : Conv.Geoemb.t option; 1046 1239 unknown : Jsont.json; 1047 1240 } 1048 1241 let conventions t = t.conventions 1049 1242 let proj t = t.proj 1050 1243 let spatial t = t.spatial 1051 1244 let multiscales t = t.multiscales 1245 + let geoemb t = t.geoemb 1052 1246 let unknown t = t.unknown 1053 1247 let empty = { 1054 1248 conventions = []; proj = None; spatial = None; multiscales = None; 1055 - unknown = Jsont.Json.object' []; 1249 + geoemb = None; unknown = Jsont.Json.object' []; 1056 1250 } 1057 1251 end 1058 1252 ··· 1070 1264 let is_known ((k, _), _) = 1071 1265 k = "zarr_conventions" || k = "multiscales" || 1072 1266 (String.length k > 5 && String.sub k 0 5 = "proj:") || 1073 - (String.length k > 8 && String.sub k 0 8 = "spatial:") 1267 + (String.length k > 8 && String.sub k 0 8 = "spatial:") || 1268 + (String.length k > 7 && String.sub k 0 7 = "geoemb:") 1074 1269 in 1075 1270 (* zarr_conventions *) 1076 1271 let convs = match find_mem "zarr_conventions" with ··· 1105 1300 | Ok m -> Some m | Error _ -> None) 1106 1301 | None -> None 1107 1302 in 1303 + (* geoemb: prefixed keys *) 1304 + let has_geoemb = List.exists (fun ((k, _), _) -> 1305 + k = "geoemb:type") mems in 1306 + let geoemb_val = 1307 + if has_geoemb then 1308 + match Jsont.Json.decode Conv.Geoemb.jsont json with 1309 + | Ok g -> Some g | Error _ -> None 1310 + else None 1311 + in 1108 1312 (* unknown: everything that's not a known convention key *) 1109 1313 let unknown_mems = List.filter (fun m -> not (is_known m)) mems in 1110 1314 let unknown_val = Jsont.Json.object' unknown_mems in ··· 1113 1317 proj = proj_val; 1114 1318 spatial = spatial_val; 1115 1319 multiscales = multiscales_val; 1320 + geoemb = geoemb_val; 1116 1321 unknown = unknown_val; 1117 1322 }) 1118 1323 ~enc:(fun (t : Attrs.t) -> ··· 1120 1325 let add m = mems := m :: !mems in 1121 1326 (* Auto-populate zarr_conventions *) 1122 1327 let conv_metas = 1328 + (match t.geoemb with Some _ -> [Conv.Geoemb.meta] | None -> []) @ 1123 1329 (match t.proj with Some _ -> [Conv.Proj.meta] | None -> []) @ 1124 1330 (match t.spatial with Some _ -> [Conv.Spatial.meta] | None -> []) @ 1125 1331 (match t.multiscales with Some _ -> [Conv.Multiscales.meta] | None -> []) ··· 1154 1360 | Some m -> 1155 1361 (match Jsont.Json.encode Conv.Multiscales.jsont m with 1156 1362 | Ok j -> add (("multiscales", Jsont.Meta.none), j) 1363 + | _ -> ()) 1364 + | None -> ()); 1365 + (* geoemb *) 1366 + (match t.geoemb with 1367 + | Some g -> 1368 + (match Jsont.Json.encode Conv.Geoemb.jsont g with 1369 + | Ok (Jsont.Object (gm, _)) -> 1370 + List.iter (fun ((k, _), _ as m) -> 1371 + if String.length k > 7 && String.sub k 0 7 = "geoemb:" then add m 1372 + ) gm 1157 1373 | _ -> ()) 1158 1374 | None -> ()); 1159 1375 (* unknown *) ··· 1763 1979 pp_opt (pp_labelled "scale" pp_floats) ppf (Conv.Multiscales.Transform.scale t) 1764 1980 ) (Conv.Multiscales.Layout_item.transform item) 1765 1981 ) layout 1982 + | None -> ()); 1983 + (* geoemb *) 1984 + (match Attrs.geoemb attrs with 1985 + | Some g -> 1986 + let ty = match Conv.Geoemb.type_ g with `Pixel -> "pixel" | `Chip -> "chip" in 1987 + nl (); pf ppf "geoemb: %s %dd model=%s dtype=%s" 1988 + ty (Conv.Geoemb.dimensions g) (Conv.Geoemb.model g) (Conv.Geoemb.data_type g); 1989 + Option.iter (pf ppf " gsd=%.4g") (Conv.Geoemb.gsd g); 1990 + Option.iter (pf ppf " layout=%s") (Conv.Geoemb.spatial_layout g); 1991 + Option.iter (pf ppf " build=%s") (Conv.Geoemb.build_version g); 1992 + (match Conv.Geoemb.chip_layout g with 1993 + | Some cl -> 1994 + let lt = match Conv.Geoemb.Chip_layout.layout_type cl with 1995 + | `Regular_grid -> "regular_grid" | `Irregular -> "irregular" in 1996 + let (h, w) = Conv.Geoemb.Chip_layout.chip_size cl in 1997 + nl (); pf ppf " chip: %s %dx%d" lt h w; 1998 + Option.iter (fun (sy, sx) -> pf ppf " stride=%dx%d" sy sx) 1999 + (Conv.Geoemb.Chip_layout.stride cl) 2000 + | None -> ()); 2001 + (match Conv.Geoemb.quantization g with 2002 + | Some q -> 2003 + nl (); pf ppf " quantization: %s %s" (Conv.Geoemb.Quantization.meth q) 2004 + (Conv.Geoemb.Quantization.original_dtype q); 2005 + Option.iter (pf ppf " -> %s") (Conv.Geoemb.Quantization.quantized_dtype q); 2006 + (match Conv.Geoemb.Quantization.scale q with 2007 + | Some (`Scalar s) -> 2008 + pf ppf " scale=%g" (Conv.Geoemb.Scale.Scalar.scale s); 2009 + if Conv.Geoemb.Scale.Scalar.offset s <> 0.0 then 2010 + pf ppf " offset=%g" (Conv.Geoemb.Scale.Scalar.offset s) 2011 + | Some (`Array a) -> 2012 + pf ppf " scale_array=%s" (Conv.Geoemb.Scale.Array_.array_name a); 2013 + Option.iter (pf ppf " nodata=%s") (Conv.Geoemb.Scale.Array_.nodata a) 2014 + | None -> ()) 2015 + | None -> ()) 1766 2016 | None -> ()); 1767 2017 (match Attrs.unknown attrs with 1768 2018 | Jsont.Object (mems, _) ->
+62 -1
src/zarr_jsont.mli
··· 1 1 (** Jsont codecs for Zarr v2 and v3 metadata. *) 2 2 3 - [@@@ai_disclosure "ai-generated"] 3 + [@@@ai_disclosure "ai-assisted"] 4 4 [@@@ai_model "claude-opus-4"] 5 5 [@@@ai_provider "Anthropic"] 6 6 ··· 309 309 val jsont : t Jsont.t 310 310 end 311 311 312 + (** Geoembeddings convention attributes ([geoemb:] prefix). 313 + 314 + Describes geospatial embedding provenance including encoder model, 315 + source data, quantization, and chip layout. Supports pixel and 316 + chip embedding types. *) 317 + module Geoemb : sig 318 + (** Chip layout for chip-type embeddings. *) 319 + module Chip_layout : sig 320 + type t 321 + val layout_type : t -> [ `Regular_grid | `Irregular ] 322 + val chip_size : t -> int * int 323 + val stride : t -> (int * int) option 324 + val grid_id : t -> string option 325 + val grid_definition : t -> string option 326 + end 327 + 328 + (** Scale parameters for dequantization. *) 329 + module Scale : sig 330 + module Scalar : sig 331 + type t 332 + val scale : t -> float 333 + val offset : t -> float 334 + end 335 + module Array_ : sig 336 + type t 337 + val array_name : t -> string 338 + val nodata : t -> string option 339 + end 340 + type t = [ `Scalar of Scalar.t | `Array of Array_.t ] 341 + val jsont : t Jsont.t 342 + end 343 + 344 + (** Quantization details for compressed embeddings. *) 345 + module Quantization : sig 346 + type t 347 + val meth : t -> string 348 + val original_dtype : t -> string 349 + val quantized_dtype : t -> string option 350 + val scale : t -> Scale.t option 351 + val link : t -> string option 352 + end 353 + 354 + type t 355 + val type_ : t -> [ `Pixel | `Chip ] 356 + val dimensions : t -> int 357 + val model : t -> string 358 + val source_data : t -> string list 359 + val data_type : t -> string 360 + val gsd : t -> float option 361 + val chip_layout : t -> Chip_layout.t option 362 + val quantization : t -> Quantization.t option 363 + val spatial_layout : t -> string option 364 + val build_version : t -> string option 365 + val benchmark : t -> string list option 366 + val meta : Meta.t 367 + val jsont : t Jsont.t 368 + end 369 + 312 370 (** Multiscales convention attributes. *) 313 371 module Multiscales : sig 314 372 (** Coordinate transform for a layout item. *) ··· 357 415 358 416 val multiscales : t -> Conv.Multiscales.t option 359 417 (** Multiscales convention attributes, present if [multiscales] key exists. *) 418 + 419 + val geoemb : t -> Conv.Geoemb.t option 420 + (** Geoembeddings convention attributes, present if [geoemb:type] exists. *) 360 421 361 422 val unknown : t -> Jsont.json 362 423 (** Remaining keys not belonging to any known convention. *)