(** Parsed Blosc frame header (the first 16 bytes of a frame). *)
type header = {
  nbytes : int;     (* uncompressed data size in bytes *)
  cbytes : int;     (* compressed size, including the 16-byte header *)
  typesize : int;   (* element size in bytes, used by the shuffle filters *)
  blocksize : int;  (* block size in bytes *)
  flags : int;      (* raw flags byte *)
}

(** Shuffle filter that was applied before compression. *)
type shuffle_mode = No_shuffle | Byte_shuffle | Bit_shuffle
1010+1111+let get_u32_le s off =
1212+ Char.code s.[off]
1313+ lor (Char.code s.[off+1] lsl 8)
1414+ lor (Char.code s.[off+2] lsl 16)
1515+ lor (Char.code s.[off+3] lsl 24)
1616+1717+let parse_header s =
1818+ if String.length s < 16 then failwith "Blosc: frame too short";
1919+ let flags = Char.code s.[2] in
2020+ let typesize = Char.code s.[3] in
2121+ let nbytes = get_u32_le s 4 in
2222+ let blocksize = get_u32_le s 8 in
2323+ let cbytes = get_u32_le s 12 in
2424+ { nbytes; cbytes; typesize; blocksize; flags }
2525+2626+let shuffle_mode h =
2727+ if h.flags land 0x04 <> 0 then Bit_shuffle
2828+ else if h.flags land 0x01 <> 0 then Byte_shuffle
2929+ else No_shuffle
(* Undo the Blosc byte shuffle: in the shuffled layout all first bytes of
   each element are contiguous, then all second bytes, etc., so byte [j]
   of element [i] lives at position [j*n + i] (with [n] whole elements).
   [typesize] is the element size in bytes.
   Trailing bytes that do not fill a whole element are stored verbatim
   at the end of the stream and are copied through unchanged — the
   previous version left them uninitialized ([Bytes.create]). *)
let unshuffle ~typesize data =
  let len = String.length data in
  if typesize <= 1 then data (* no-op for single-byte elements *)
  else begin
    let n = len / typesize in
    let out = Bytes.create len in
    for i = 0 to n - 1 do
      for j = 0 to typesize - 1 do
        Bytes.set out (i * typesize + j) data.[j * n + i]
      done
    done;
    (* Copy the un-shuffled remainder (len mod typesize bytes) verbatim. *)
    let tail = len mod typesize in
    if tail > 0 then
      Bytes.blit_string data (len - tail) out (len - tail) tail;
    Bytes.to_string out
  end
(* Undo the bit shuffle: bits were rearranged across elements so that,
   in the shuffled stream, bit [b] of byte [k] of element [e] lives at
   absolute bit offset [(k*8 + b) * n_elems + e] (stored little-endian
   within each byte: byte = offset/8, bit = offset mod 8).
   This reverses that mapping one output byte at a time.
   NOTE(review): bytes beyond [n_elems * typesize] (a partial trailing
   element) are left as '\x00' rather than copied through — confirm
   inputs are always a whole number of elements. *)
let unbitshuffle ~typesize data =
  let len = String.length data in
  let n_elems = len / typesize in
  let out = Bytes.make len '\x00' in
  (* Rebuild each output byte by gathering its eight scattered bits. *)
  for elem = 0 to n_elems - 1 do
    for byte_in_elem = 0 to typesize - 1 do
      let out_byte_idx = elem * typesize + byte_in_elem in
      let v = ref 0 in
      for bit = 0 to 7 do
        let bit_offset = (byte_in_elem * 8 + bit) * n_elems + elem in
        let src_byte = bit_offset / 8 in
        let src_bit = bit_offset mod 8 in
        (* Guard against reading past the end on ragged inputs. *)
        if src_byte < len then begin
          let src_val = Char.code data.[src_byte] in
          if src_val land (1 lsl src_bit) <> 0 then
            v := !v lor (1 lsl bit)
        end
      done;
      Bytes.set out out_byte_idx (Char.chr !v)
    done
  done;
  Bytes.to_string out
(* Decode a single Blosc frame to its raw uncompressed bytes.

   Memcpy mode ([cbytes = nbytes + 16]): the payload follows the header
   verbatim with no shuffle to undo — return it directly.

   Otherwise the frame is compressed. After the 16-byte header this code
   reads a uint32 LE offset to the first block; at that offset a uint32
   LE compressed size, then the compressed data. [decompress] receives
   the compressed block and the expected decompressed size; the result
   is then unshuffled according to the header flags.

   NOTE(review): only the first block is decoded; a frame whose data
   spans several blocks would be truncated — confirm inputs are
   single-block frames.

   @raise Failure if the frame is shorter than 16 bytes, or if it is
   compressed and no [decompress] callback was supplied. *)
let decode ?decompress s =
  let h = parse_header s in
  if h.cbytes = h.nbytes + 16 then
    (* Memcpy mode: stored uncompressed, no shuffle applied. *)
    String.sub s 16 h.nbytes
  else begin
    let block_offset = get_u32_le s 16 in
    let block_csize = get_u32_le s block_offset in
    let compressed = String.sub s (block_offset + 4) block_csize in
    let decompressed = match decompress with
      | Some f -> f compressed h.nbytes
      | None ->
        failwith (Printf.sprintf
          "Blosc: compressed frame (cbytes=%d, nbytes=%d) but no decompressor provided"
          h.cbytes h.nbytes)
    in
    (* Undo the shuffle filter, if one was applied before compression. *)
    match shuffle_mode h with
    | No_shuffle -> decompressed
    | Byte_shuffle -> unshuffle ~typesize:h.typesize decompressed
    | Bit_shuffle -> unbitshuffle ~typesize:h.typesize decompressed
  end
(** Blosc frame decoder.

    Parses the 16-byte Blosc header, decompresses the payload via a
    pluggable decompressor, and applies unshuffle (byte or bit) if needed.

    For uncompressed (memcpy) frames, the raw data is returned directly. *)

type header = {
  nbytes : int;     (** Uncompressed data size in bytes *)
  cbytes : int;     (** Compressed size including the 16-byte header *)
  typesize : int;   (** Element size in bytes (for shuffle) *)
  blocksize : int;  (** Block size in bytes *)
  flags : int;      (** Raw flags byte *)
}
(** Parsed Blosc frame header. *)

type shuffle_mode = No_shuffle | Byte_shuffle | Bit_shuffle

val parse_header : string -> header
(** Parse a Blosc header from the first 16 bytes of a frame.
    @raise Failure if input is shorter than 16 bytes. *)

val shuffle_mode : header -> shuffle_mode
(** Extract the shuffle mode from the header flags. *)

val decode : ?decompress:(string -> int -> string) -> string -> string
(** Decode a Blosc frame, returning the raw uncompressed data.

    If the frame uses memcpy mode ([cbytes = nbytes + 16]), the payload
    is returned directly without decompression or unshuffle.

    If compressed, [decompress compressed_payload expected_size] is called
    to decompress the inner payload, then unshuffle is applied based on
    the header flags.

    @raise Failure if the frame is too short, or if compressed
    without a [decompress] callback. *)
(* [fetch url ?off ?len ()] fetches bytes from a URL; range semantics
   are documented in the .mli. *)
type fetch = string -> ?off:int -> ?len:int -> unit -> string Lwt.t

(* A decompression codec: compressed bytes -> decompressed bytes. *)
type codec = string -> string

(* Maps codec names (e.g. "zstd") to codecs; [None] for unknown names. *)
type codec_registry = string -> codec option

(* Supported Zarr data types. *)
type data_type = Int8 | Uint8 | Int32 | Float32 | Float64

(* Parsed metadata for a Zarr v3 array. *)
type array_meta = {
  shape : int array;               (* full array shape, per dimension *)
  data_type : data_type;
  chunk_shape : int array;         (* outer (shard) chunk shape *)
  chunk_separator : string;        (* separator in chunk keys, e.g. "/" *)
  is_sharded : bool;               (* true if "sharding_indexed" codec present *)
  inner_chunk_shape : int array option; (* inner chunk shape when sharded *)
  inner_codecs : string list;      (* codec names, in encode order *)
  index_location : [ `Start | `End ];   (* where the shard index lives *)
}

(* An open store: base URL, pluggable I/O, and consolidated metadata. *)
type store = {
  base_url : string;
  fetch : fetch;
  codecs : codec_registry;
  consolidated : (string * Yojson.Safe.t) list;
}

(* An open array handle. *)
type arr = {
  store : store;
  path : string;
  meta : array_meta;
}
(* Size in bytes of one element of the given data type. *)
let data_type_size = function
  | Int8 | Uint8 -> 1
  | Int32 | Float32 -> 4
  | Float64 -> 8
(* Parse a Zarr data-type string (e.g. "int8", "float32").
   @raise Failure for unsupported type names. *)
let data_type_of_string = function
  | "int8" -> Int8
  | "uint8" -> Uint8
  | "int32" -> Int32
  | "float32" -> Float32
  | "float64" -> Float64
  | s -> failwith (Printf.sprintf "Unsupported data type: %s" s)
(* --- JSON helpers --- *)

(* [json_member key j] returns the value bound to [key] in a JSON
   object, or [`Null] if [j] is not an object or lacks the key. *)
let json_member key = function
  | `Assoc l -> (match List.assoc_opt key l with Some v -> v | None -> `Null)
  | _ -> `Null

(* Narrowing helpers; each fails loudly on an unexpected JSON shape. *)
let json_to_int = function `Int i -> i | _ -> failwith "expected int"
let json_to_string = function `String s -> s | _ -> failwith "expected string"
let json_to_list f = function `List l -> List.map f l | _ -> failwith "expected list"
let json_to_int_list j = json_to_list json_to_int j
(* --- Metadata parsing --- *)

(* Parse a Zarr v3 array metadata JSON document.
   Detects the "sharding_indexed" codec; when present, records the inner
   chunk shape, inner codec names and the shard-index location.
   Missing "separator" defaults to "/"; missing "index_location"
   defaults to [`End].
   @raise Failure on missing required fields or unsupported data types. *)
let parse_array_meta json_str =
  let j = Yojson.Safe.from_string json_str in
  let shape = Array.of_list (json_to_int_list (json_member "shape" j)) in
  let data_type = data_type_of_string (json_to_string (json_member "data_type" j)) in
  let chunk_grid = json_member "chunk_grid" j in
  let chunk_shape = Array.of_list (json_to_int_list
    (json_member "chunk_shape" (json_member "configuration" chunk_grid))) in
  let chunk_key = json_member "chunk_key_encoding" j in
  let chunk_separator = match json_member "separator"
      (json_member "configuration" chunk_key) with
    | `String s -> s | _ -> "/" in
  let codecs_json = match json_member "codecs" j with
    | `List l -> l | _ -> [] in
  (* A "sharding_indexed" codec marks the array as sharded. *)
  let sharding = List.find_opt (fun c ->
    json_to_string (json_member "name" c) = "sharding_indexed") codecs_json in
  match sharding with
  | Some shard_codec ->
    let config = json_member "configuration" shard_codec in
    let inner_chunk_shape = Array.of_list
      (json_to_int_list (json_member "chunk_shape" config)) in
    let inner_codecs_json = match json_member "codecs" config with
      | `List l -> l | _ -> [] in
    let inner_codecs = List.map (fun c ->
      json_to_string (json_member "name" c)) inner_codecs_json in
    let index_location = match json_member "index_location" config with
      | `String "start" -> `Start | _ -> `End in
    { shape; data_type; chunk_shape; chunk_separator;
      is_sharded = true; inner_chunk_shape = Some inner_chunk_shape;
      inner_codecs; index_location }
  | None ->
    let codecs = List.map (fun c ->
      json_to_string (json_member "name" c)) codecs_json in
    { shape; data_type; chunk_shape; chunk_separator;
      is_sharded = false; inner_chunk_shape = None;
      inner_codecs = codecs; index_location = `End }
(* Extract consolidated-metadata entries from a root zarr.json document.
   Returns [(path, metadata_json)] pairs, or [] when the document has no
   "consolidated_metadata"/"metadata" object. *)
let parse_consolidated json_str =
  let j = Yojson.Safe.from_string json_str in
  let cm = json_member "consolidated_metadata" j in
  match json_member "metadata" cm with
  | `Assoc entries -> entries
  | _ -> []
(* --- Store and array access --- *)

(* Open a store: fetch the root zarr.json from [base_url] and parse its
   consolidated metadata. *)
let open_store ~(fetch : fetch) ~(codecs : codec_registry) base_url =
  let open Lwt.Syntax in
  let+ root_json = fetch (base_url ^ "/zarr.json") () in
  let consolidated = parse_consolidated root_json in
  { base_url; fetch; codecs; consolidated }
(* All (path, json) consolidated-metadata entries of the store. *)
let store_meta store = store.consolidated

(* Open an array by path, using the store's consolidated metadata.
   @raise Failure if the path has no metadata entry. *)
let open_array store path =
  let meta_json = match List.assoc_opt path store.consolidated with
    | Some j -> Yojson.Safe.to_string j
    | None ->
      failwith (Printf.sprintf "Array %s not found in consolidated metadata" path)
  in
  let meta = parse_array_meta meta_json in
  Lwt.return { store; path; meta }

(* Parsed metadata of an open array. *)
let array_meta arr = arr.meta
(* Attributes of the group at [path], from consolidated metadata.
   Returns [] when the group carries no "attributes" object.
   @raise Failure if the group is not found. *)
let group_attrs store path =
  let j = match List.assoc_opt path store.consolidated with
    | Some j -> j
    | None ->
      failwith (Printf.sprintf "Group %s not found in consolidated metadata" path)
  in
  match json_member "attributes" j with
  | `Assoc l -> l
  | _ -> []
(* --- Shard reading --- *)

(* [get_u64_le s off] reads an unsigned 64-bit little-endian integer at
   byte offset [off] and truncates it to a native int (63 bits). Note
   the all-ones "missing chunk" sentinel therefore comes back as -1. *)
let get_u64_le s off =
  let b i = Int64.of_int (Char.code s.[off + i]) in
  let ( lor ) = Int64.logor in
  let ( lsl ) = Int64.shift_left in
  Int64.to_int (
    (b 0) lor ((b 1) lsl 8) lor ((b 2) lsl 16) lor ((b 3) lsl 24)
    lor ((b 4) lsl 32) lor ((b 5) lsl 40) lor ((b 6) lsl 48) lor ((b 7) lsl 56))
(* Apply the inner codec chain (decode direction) to a raw chunk.
   [codec_names] is in encode order, so [List.fold_right] undoes the
   codecs last-to-first. "bytes" is a pass-through; "crc32c" is a
   pass-through (the checksum is not verified here); "blosc" is handled
   by [Blosc.decode], with the registry's "zstd" codec as the inner
   decompressor when available; any other name is looked up directly.
   @raise Failure on a codec name the registry does not know. *)
let apply_inner_codecs codecs codec_names data =
  List.fold_right (fun name acc ->
    match name with
    | "bytes" -> acc
    | "blosc" ->
      let decompress = match codecs "zstd" with
        | Some f -> Some (fun s _nbytes -> f s)
        | None -> None
      in
      Blosc.decode ?decompress acc
    | "crc32c" -> acc
    | other ->
      (match codecs other with
       | Some f -> f acc
       | None -> failwith (Printf.sprintf "Zarr: unknown codec %s" other))
  ) codec_names data
(* Row-major (C-order) linear index of an inner chunk within its shard.
   [inner_idx] is the chunk's per-dimension position; [inner_per_shard]
   is the per-dimension chunk count; [ndim] the number of dimensions. *)
let linearize_inner_idx inner_idx inner_per_shard ndim =
  let idx = ref 0 in
  let stride = ref 1 in
  (* Accumulate from the fastest-varying (last) dimension outwards. *)
  for d = ndim - 1 downto 0 do
    idx := !idx + inner_idx.(d) * !stride;
    stride := !stride * inner_per_shard.(d)
  done;
  !idx
(* Decompress one inner chunk and copy the pixels that overlap the
   requested region into [out_buf].
   [data] holds shard bytes, with the chunk's compressed payload at
   [local_off], [nbytes] long. [chunk_pixel_start]/[chunk_pixel_stop]
   bound the chunk in global pixel coordinates; [start]/[shape] describe
   the requested region; [inner_chunk_shape] gives the decompressed
   chunk's C-order layout; [elem_size] is bytes per element.
   Copies element-by-element via a recursive walk over the overlap box. *)
let decode_inner codecs codec_names
    data local_off nbytes chunk_pixel_start chunk_pixel_stop
    ndim start shape elem_size inner_chunk_shape out_buf =
  let compressed = String.sub data local_off nbytes in
  let raw = apply_inner_codecs codecs codec_names compressed in
  let stop = Array.init ndim (fun d -> start.(d) + shape.(d)) in
  (* Per-dimension intersection of the chunk with the requested region. *)
  let copy_lo = Array.init ndim (fun d ->
    max chunk_pixel_start.(d) start.(d)) in
  let copy_hi = Array.init ndim (fun d ->
    min chunk_pixel_stop.(d) stop.(d)) in
  let idx = Array.make ndim 0 in
  let rec copy dim =
    if dim = ndim then begin
      (* C-order byte offset of [idx] within the decompressed chunk. *)
      let src_off = ref 0 in
      let src_stride = ref elem_size in
      for d = ndim - 1 downto 0 do
        src_off := !src_off +
          (idx.(d) - chunk_pixel_start.(d)) * !src_stride;
        src_stride := !src_stride * inner_chunk_shape.(d)
      done;
      (* C-order byte offset of [idx] within the output region. *)
      let dst_off = ref 0 in
      let dst_stride = ref elem_size in
      for d = ndim - 1 downto 0 do
        dst_off := !dst_off +
          (idx.(d) - start.(d)) * !dst_stride;
        dst_stride := !dst_stride * shape.(d)
      done;
      Bytes.blit_string raw !src_off out_buf !dst_off elem_size
    end else begin
      for i = copy_lo.(dim) to copy_hi.(dim) - 1 do
        idx.(dim) <- i;
        copy (dim + 1)
      done
    end
  in
  copy 0
(* Read a rectangular region [start, start+shape) of an array, returning
   raw bytes in C-order. [on_shard i n] is invoked after shard [i] of
   [n] has been fetched.
   Per overlapping shard: phase 1 fetches the shard index (suffix range
   for sharded arrays; whole object otherwise — if the server returns
   more than the index, the full shard is kept and reused); phase 2
   collects the overlapping inner chunks from the index, coalesces
   nearby byte ranges (gap < 64 KiB), fetches the groups in parallel,
   and decodes each inner chunk into the output buffer.
   NOTE(review): for non-sharded arrays the whole chunk is fetched and
   its last [index_size] bytes are then parsed as a shard index — a
   plain chunk has no such index, so that path looks suspect; confirm
   non-sharded arrays are actually exercised. *)
let read ?on_shard arr ~start ~shape =
  let open Lwt.Syntax in
  let meta = arr.meta in
  let ndim = Array.length meta.shape in
  let stop = Array.init ndim (fun d -> start.(d) + shape.(d)) in
  let elem_size = data_type_size meta.data_type in
  let chunk_shape = meta.chunk_shape in

  (* For non-sharded arrays, treat the chunk as both shard and inner. *)
  let inner_chunk_shape = match meta.inner_chunk_shape with
    | Some s -> s
    | None -> chunk_shape
  in

  (* Output buffer, zero-filled so unwritten (missing-chunk) regions
     read back as zeros. *)
  let out_elems = Array.fold_left ( * ) 1 shape in
  let out_buf = Bytes.make (out_elems * elem_size) '\x00' in

  (* Inner chunks per shard, per dimension. *)
  let inner_per_shard = Array.init ndim (fun d ->
    chunk_shape.(d) / inner_chunk_shape.(d)) in
  let n_inner_chunks = Array.fold_left ( * ) 1 inner_per_shard in
  let index_entry_size = 16 in (* 2 x uint64: offset, nbytes *)
  let index_size = n_inner_chunks * index_entry_size + 4 (* CRC32C *) in

  let inner_chunk_elems = Array.fold_left ( * ) 1 inner_chunk_shape in
  let _inner_chunk_bytes = inner_chunk_elems * elem_size in

  (* Which shards overlap the requested region? *)
  let shard_start = Array.init ndim (fun d -> start.(d) / chunk_shape.(d)) in
  let shard_stop = Array.init ndim (fun d -> (stop.(d) - 1) / chunk_shape.(d) + 1) in

  let n_shards = Array.init ndim (fun d -> shard_stop.(d) - shard_start.(d))
    |> Array.fold_left ( * ) 1 in
  let shards_done = ref 0 in

  let shard_tasks = ref [] in

  let rec iter_shards shard_idx dim =
    if dim = ndim then begin
      (* Build the shard URL: "c" joined with indices by the separator. *)
      let shard_key = String.concat meta.chunk_separator
        ("c" :: Array.to_list (Array.map string_of_int shard_idx)) in
      let shard_url = Printf.sprintf "%s/%s/%s"
        arr.store.base_url arr.path shard_key in

      let task =
        (* Phase 1: fetch just the shard index via a suffix byte-range
           request (HTTP bytes=-N). For non-sharded arrays, fetch the
           whole chunk. *)
        let* index_data =
          if meta.is_sharded then
            arr.store.fetch shard_url ~len:index_size ()
          else
            arr.store.fetch shard_url ()
        in

        (* The server may ignore suffix ranges and return the full
           shard; keep it so phase 2 can skip its own fetches. *)
        let shard_data_opt, index_data =
          if String.length index_data > index_size then
            let full = index_data in
            let idx_off = String.length full - index_size in
            (Some full, String.sub full idx_off index_size)
          else
            (None, index_data)
        in

        incr shards_done;
        (match on_shard with
         | Some f -> f !shards_done n_shards
         | None -> ());

        (* Phase 2: collect the inner chunks overlapping the region. *)
        let needed_chunks = ref [] in
        let rec collect_inner inner_idx dim =
          if dim = ndim then begin
            let chunk_pixel_start = Array.init ndim (fun d ->
              shard_idx.(d) * chunk_shape.(d) +
              inner_idx.(d) * inner_chunk_shape.(d)) in
            let chunk_pixel_stop = Array.init ndim (fun d ->
              min (chunk_pixel_start.(d) + inner_chunk_shape.(d))
                meta.shape.(d)) in
            let overlaps = ref true in
            for d = 0 to ndim - 1 do
              if chunk_pixel_start.(d) >= stop.(d) ||
                 chunk_pixel_stop.(d) <= start.(d) then
                overlaps := false
            done;
            if !overlaps then begin
              let lin = linearize_inner_idx inner_idx inner_per_shard ndim in
              let offset = get_u64_le index_data (lin * index_entry_size) in
              let nbytes = get_u64_le index_data (lin * index_entry_size + 8) in
              (* Missing chunks carry an all-ones sentinel, which
                 [get_u64_le] truncates to -1; nbytes > 0 filters them. *)
              if offset < max_int && nbytes > 0 then
                needed_chunks := (offset, nbytes,
                  chunk_pixel_start, chunk_pixel_stop) :: !needed_chunks
            end
          end else
            for i = 0 to inner_per_shard.(dim) - 1 do
              inner_idx.(dim) <- i;
              collect_inner (Array.copy inner_idx) (dim + 1)
            done
        in
        collect_inner (Array.make ndim 0) 0;

        let chunks = !needed_chunks in
        if chunks = [] then Lwt.return_unit
        else match shard_data_opt with
        | Some full ->
          (* Already have the full shard — decode in place. *)
          List.iter (fun (offset, nbytes, cps, cpe) ->
            decode_inner arr.store.codecs meta.inner_codecs
              full offset nbytes cps cpe
              ndim start shape elem_size inner_chunk_shape out_buf
          ) chunks;
          Lwt.return_unit
        | None ->
          (* Coalesce nearby chunks into merged byte ranges: sort by
             offset, then merge while the gap stays under 64 KiB. *)
          let sorted = List.sort (fun (a,_,_,_) (b,_,_,_) -> compare a b) chunks in
          let max_gap = 65536 in
          (* Each group is (range_start, range_end, chunk list). *)
          let groups = ref [] in
          let cur_start = ref 0 in
          let cur_end = ref 0 in
          let cur_chunks = ref [] in
          List.iter (fun ((off, nb, _, _) as chunk) ->
            if !cur_chunks = [] then begin
              cur_start := off;
              cur_end := off + nb;
              cur_chunks := [chunk]
            end else if off - !cur_end <= max_gap then begin
              cur_end := max !cur_end (off + nb);
              cur_chunks := chunk :: !cur_chunks
            end else begin
              groups := (!cur_start, !cur_end, !cur_chunks) :: !groups;
              cur_start := off;
              cur_end := off + nb;
              cur_chunks := [chunk]
            end
          ) sorted;
          if !cur_chunks <> [] then
            groups := (!cur_start, !cur_end, !cur_chunks) :: !groups;

          (* Fetch each merged range in parallel and decode its chunks. *)
          let group_tasks = List.map (fun (g_start, g_end, g_chunks) ->
            let+ data = arr.store.fetch shard_url
              ~off:g_start ~len:(g_end - g_start) () in
            List.iter (fun (offset, nbytes, cps, cpe) ->
              decode_inner arr.store.codecs meta.inner_codecs
                data (offset - g_start) nbytes cps cpe
                ndim start shape elem_size inner_chunk_shape out_buf
            ) g_chunks
          ) !groups in
          Lwt.join group_tasks
      in
      shard_tasks := task :: !shard_tasks
    end else begin
      for i = shard_start.(dim) to shard_stop.(dim) - 1 do
        shard_idx.(dim) <- i;
        iter_shards (Array.copy shard_idx) (dim + 1)
      done
    end
  in
  iter_shards (Array.make ndim 0) 0;
  let+ () = Lwt.join !shard_tasks in
  Bytes.to_string out_buf
(** Pure OCaml Zarr v3 store reader.

    {b Warning:} This library was vibe-coded with AI assistance and has not
    been thoroughly reviewed or tested. Use at your own risk and expect
    breaking changes.

    Reads sharded Zarr v3 arrays over HTTP with pluggable codecs and
    fetch functions. Platform-independent — bring your own HTTP client
    and decompressors.

    {2 Example}

    {[
      let store = Zarr_v3.open_store ~fetch ~codecs url in
      let arr = Zarr_v3.open_array store "utm31/embeddings" in
      let data = Zarr_v3.read arr ~start:[|100; 200; 0|] ~shape:[|4; 4; 128|] in
      (* data is a string of raw bytes in C-order *)
    ]}

    {2 Pluggable I/O}

    The [fetch] parameter provides HTTP access. The [codecs] parameter
    provides decompression. Both are passed in by platform backends
    (e.g., zarr-v3-unix for testing, tessera-zarr-jsoo for the browser). *)

(** {1 Pluggable interfaces} *)

type fetch = string -> ?off:int -> ?len:int -> unit -> string Lwt.t
(** [fetch url ?off ?len ()] fetches bytes from [url].
    If [off] and [len] are provided, fetches byte range [off..off+len-1].
    If only [len] is provided (no [off]), fetches the last [len] bytes
    (suffix range, i.e. HTTP [bytes=-len]).
    Returns the response body as a string. *)

type codec = string -> string
(** A decompression codec. Takes compressed bytes, returns decompressed bytes. *)

type codec_registry = string -> codec option
(** Maps codec names (e.g., ["zstd"]) to decompression functions.
    Return [None] for unknown codecs. The built-in [bytes] and [blosc]
    (memcpy mode) codecs are handled internally. *)

(** {1 Metadata types} *)

type data_type =
  | Int8
  | Uint8
  | Int32
  | Float32
  | Float64
(** Supported Zarr data types. *)

type array_meta = {
  shape : int array;
  data_type : data_type;
  chunk_shape : int array;
  chunk_separator : string;
  is_sharded : bool;
  inner_chunk_shape : int array option;
  inner_codecs : string list;
  index_location : [ `Start | `End ];
}
(** Parsed metadata for a Zarr v3 array. *)

(** {1 Store and array handles} *)

type store
(** An open Zarr v3 store backed by HTTP. Holds the base URL,
    fetch function, codec registry, and consolidated metadata. *)

type arr
(** An open Zarr v3 array with parsed metadata and shard access methods. *)

(** {1 Metadata parsing} *)

val parse_array_meta : string -> array_meta
(** Parse array metadata from a JSON string.
    @raise Failure on invalid or unsupported metadata. *)

val parse_consolidated : string -> (string * Yojson.Safe.t) list
(** Parse consolidated metadata from a root [zarr.json] string.
    Returns a list of [(path, metadata_json)] pairs. *)

(** {1 Opening stores and arrays} *)

val open_store : fetch:fetch -> codecs:codec_registry -> string -> store Lwt.t
(** [open_store ~fetch ~codecs base_url] opens a Zarr v3 store.
    Fetches and parses the root [zarr.json], including any
    consolidated metadata. *)

val open_array : store -> string -> arr Lwt.t
(** [open_array store path] opens an array by path (e.g., ["utm31/scales"]).
    Uses consolidated metadata if available.
    @raise Failure if the array is not found. *)

(** {1 Metadata access} *)

val array_meta : arr -> array_meta
(** Get the parsed metadata for an open array. *)

val store_meta : store -> (string * Yojson.Safe.t) list
(** Access the consolidated metadata entries.
    Returns all [(path, json)] pairs from the root [zarr.json]. *)

val group_attrs : store -> string -> (string * Yojson.Safe.t) list
(** [group_attrs store path] returns the attributes of a group
    (e.g., ["utm31"] for spatial transform and CRS info).
    @raise Failure if the group is not found. *)

(** {1 Reading data} *)

val read : ?on_shard:(int -> int -> unit) ->
  arr -> start:int array -> shape:int array -> string Lwt.t
(** [read ?on_shard arr ~start ~shape] reads a rectangular region of an array.

    [start] is the origin (inclusive) in pixel coordinates.
    [shape] is the size of the region in each dimension.
    Returns raw bytes in C-order. The caller must interpret the bytes
    according to {!array_meta.data_type}.

    [on_shard i n] is called when shard [i] of [n] total has been fetched.

    For sharded arrays, fetches only the shards that overlap the
    requested region. Shard fetches run in parallel via [Lwt.join].

    @raise Failure if the region is out of bounds. *)

(** {1 Utility} *)

val data_type_size : data_type -> int
(** Size in bytes of a single element of the given data type. *)

val data_type_of_string : string -> data_type
(** Parse a Zarr data type string (e.g., ["int8"], ["float32"]).
    @raise Failure for unsupported types. *)