My aggregated monorepo of OCaml code, automaintained
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Zarr v3: full pipeline working — Blosc/Zstd decompression + fetch_region

- Blosc decoder: handle block offset table + compressed block size prefix
for genuinely compressed frames (not just memcpy fallback)
- Add bitshuffle/byteshuffle unshuffle in pure OCaml
- Unix backend: Zstd decompression via zstd CLI subprocess
- tessera-zarr fetch_region: bbox → UTM → shards → dequantize → reproject
- Expose Geotessera.Utm module for external use
- Verified against live GeoTessera store: 229x146 mosaic from 0.02° bbox

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

+264 -40
+2
tessera-geotessera/lib/geotessera.ml
··· 1 + module Utm = Utm 2 + 1 3 type bbox = { 2 4 min_lon : float; 3 5 min_lat : float;
+4
tessera-geotessera/lib/geotessera.mli
··· 3 3 Fetches, dequantizes, and mosaics GeoTessera embedding tiles. 4 4 Parameterised over I/O for portability. *) 5 5 6 + (** {1 UTM projection} *) 7 + 8 + module Utm = Utm 9 + 6 10 type bbox = { 7 11 min_lon : float; 8 12 min_lat : float;
+97 -2
tessera-zarr/lib/tessera_zarr.ml
··· 15 15 let zone = match List.assoc_opt "tessera:utm_zone" attrs with 16 16 | Some (`Int z) -> z 17 17 | _ -> 18 - (* Fallback: parse from zone name *) 19 18 int_of_string (String.sub zone_name 3 (String.length zone_name - 3)) 20 19 in 21 20 { ··· 25 24 pixel_size = Float.abs transform.(0); 26 25 } 27 26 28 - let fetch_region ~store:_ _bbox = failwith "TODO: fetch_region not yet implemented" 27 + let read_f32_le s off = 28 + let bits = Int32.logor (Int32.of_int (Char.code s.[off])) 29 + (Int32.logor (Int32.shift_left (Int32.of_int (Char.code s.[off+1])) 8) 30 + (Int32.logor (Int32.shift_left (Int32.of_int (Char.code s.[off+2])) 16) 31 + (Int32.shift_left (Int32.of_int (Char.code s.[off+3])) 24))) in 32 + Int32.float_of_bits bits 33 + 34 + let fetch_region ~store bbox = 35 + let open Lwt.Syntax in 36 + let open Geotessera in 37 + (* 1. Determine UTM zone from bbox centre *) 38 + let center_lon = (bbox.min_lon +. bbox.max_lon) /. 2.0 in 39 + let center_lat = (bbox.min_lat +. bbox.max_lat) /. 2.0 in 40 + let zone = Geotessera.Utm.zone_of_lon center_lon in 41 + let zone_name = Printf.sprintf "utm%d" zone in 42 + 43 + (* 2. Get spatial metadata *) 44 + let zi = zone_info store zone_name in 45 + 46 + (* 3. Convert WGS84 bbox corners to UTM to find pixel range *) 47 + let corners = [ 48 + (bbox.min_lon, bbox.min_lat); (bbox.max_lon, bbox.min_lat); 49 + (bbox.min_lon, bbox.max_lat); (bbox.max_lon, bbox.max_lat); 50 + ] in 51 + let utm_corners = List.map (fun (lon, lat) -> 52 + Geotessera.Utm.wgs84_to_utm ~zone lon lat) corners in 53 + let min_e = List.fold_left (fun acc (e, _) -> Float.min acc e) 54 + Float.max_float utm_corners in 55 + let max_e = List.fold_left (fun acc (e, _) -> Float.max acc e) 56 + Float.neg_infinity utm_corners in 57 + let min_n = List.fold_left (fun acc (_, n) -> Float.min acc n) 58 + Float.max_float utm_corners in 59 + let max_n = List.fold_left (fun acc (_, n) -> Float.max acc n) 60 + Float.neg_infinity utm_corners in 61 + 62 + (* Pixel coordinates (row 0 = max northing, row increases southward) *) 63 + let pixel_size = zi.pixel_size in 64 + let col_start = max 0 65 + (Float.to_int (Float.floor ((min_e -. zi.origin_easting) /. pixel_size))) in 66 + let col_stop = 67 + Float.to_int (Float.ceil ((max_e -. zi.origin_easting) /. pixel_size)) in 68 + let row_start = max 0 69 + (Float.to_int (Float.floor ((zi.origin_northing -. max_n) /. pixel_size))) in 70 + let row_stop = 71 + Float.to_int (Float.ceil ((zi.origin_northing -. min_n) /. pixel_size)) in 72 + 73 + let tile_h = row_stop - row_start in 74 + let tile_w = col_stop - col_start in 75 + let n_features = 128 in 76 + 77 + (* 4. Fetch embeddings and scales in parallel *) 78 + let* emb_arr = Zarr_v3.Store.open_array store (zone_name ^ "/embeddings") in 79 + let* scales_arr = Zarr_v3.Store.open_array store (zone_name ^ "/scales") in 80 + 81 + let emb_fetch = Zarr_v3.Store.read emb_arr 82 + ~start:[| row_start; col_start; 0 |] 83 + ~shape:[| tile_h; tile_w; n_features |] in 84 + let scales_fetch = Zarr_v3.Store.read scales_arr 85 + ~start:[| row_start; col_start |] 86 + ~shape:[| tile_h; tile_w |] in 87 + 88 + let* emb_data = emb_fetch 89 + and* scales_data = scales_fetch in 90 + 91 + (* 5. Dequantize: float32 = int8 × scale *) 92 + let mat = Linalg.create_mat ~rows:(tile_h * tile_w) ~cols:n_features in 93 + for i = 0 to tile_h - 1 do 94 + for j = 0 to tile_w - 1 do 95 + let pixel = i * tile_w + j in 96 + let scale = read_f32_le scales_data (pixel * 4) in 97 + for f = 0 to n_features - 1 do 98 + let e_off = (pixel * n_features + f) in 99 + let emb_val = Char.code emb_data.[e_off] in 100 + let emb_signed = if emb_val >= 128 then emb_val - 256 else emb_val in 101 + Linalg.mat_set mat pixel f (Float.of_int emb_signed *. scale) 102 + done 103 + done 104 + done; 105 + 106 + (* 6. Reproject from UTM to WGS84 *) 107 + let out_h = tile_h in 108 + let out_w = tile_w in 109 + let coord = { lon = center_lon; lat = center_lat } in 110 + let reprojected = Geotessera.reproject_tile coord 111 + ~tile_h ~tile_w ~n_features ~out_h ~out_w mat in 112 + 113 + (* 7. Compute WGS84 bounds *) 114 + let utm_w = zi.origin_easting +. Float.of_int col_start *. pixel_size in 115 + let utm_e = zi.origin_easting +. Float.of_int col_stop *. pixel_size in 116 + let utm_n = zi.origin_northing -. Float.of_int row_start *. pixel_size in 117 + let utm_s = zi.origin_northing -. Float.of_int row_stop *. pixel_size in 118 + let (w_lon, s_lat) = Geotessera.Utm.utm_to_wgs84 ~zone utm_w utm_s in 119 + let (e_lon, n_lat) = Geotessera.Utm.utm_to_wgs84 ~zone utm_e utm_n in 120 + let wgs_bbox = { min_lon = w_lon; min_lat = s_lat; 121 + max_lon = e_lon; max_lat = n_lat } in 122 + 123 + Lwt.return (reprojected, out_h, out_w, wgs_bbox)
+24 -1
zarr-v3-unix/lib/zarr_v3_unix.ml
··· 7 7 in 8 8 Lwt_process.pread (Lwt_process.shell cmd) 9 9 10 - let codecs _name = None 10 + (* Decompress raw Zstd data using the zstd command-line tool *) 11 + let zstd_decompress data _expected_size = 12 + let tmp_in = Filename.temp_file "zstd_in" ".zst" in 13 + let tmp_out = Filename.temp_file "zstd_out" ".bin" in 14 + Fun.protect ~finally:(fun () -> 15 + (try Sys.remove tmp_in with _ -> ()); 16 + (try Sys.remove tmp_out with _ -> ())) 17 + (fun () -> 18 + let oc = open_out_bin tmp_in in 19 + output_string oc data; 20 + close_out oc; 21 + let cmd = Printf.sprintf "zstd -d -q -f -o '%s' '%s' 2>/dev/null" tmp_out tmp_in in 22 + let exit_code = Sys.command cmd in 23 + if exit_code <> 0 then 24 + failwith (Printf.sprintf "zstd decompress failed (exit %d)" exit_code); 25 + let ic = open_in_bin tmp_out in 26 + let result = In_channel.input_all ic in 27 + close_in ic; 28 + result) 29 + 30 + let codecs name = 31 + match name with 32 + | "zstd" -> Some (fun data -> zstd_decompress data 0) 33 + | _ -> None
+81 -10
zarr-v3/lib/blosc.ml
··· 3 3 cbytes : int; 4 4 typesize : int; 5 5 blocksize : int; 6 + flags : int; 6 7 } 8 + 9 + type shuffle_mode = No_shuffle | Byte_shuffle | Bit_shuffle 7 10 8 11 let get_u32_le s off = 9 12 Char.code s.[off] ··· 13 16 14 17 let parse_header s = 15 18 if String.length s < 16 then failwith "Blosc: frame too short"; 19 + let flags = Char.code s.[2] in 16 20 let typesize = Char.code s.[3] in 17 21 let nbytes = get_u32_le s 4 in 18 22 let blocksize = get_u32_le s 8 in 19 23 let cbytes = get_u32_le s 12 in 20 - { nbytes; cbytes; typesize; blocksize } 24 + { nbytes; cbytes; typesize; blocksize; flags } 25 + 26 + let shuffle_mode h = 27 + if h.flags land 0x04 <> 0 then Bit_shuffle 28 + else if h.flags land 0x01 <> 0 then Byte_shuffle 29 + else No_shuffle 30 + 31 + (* Undo byte shuffle: elements were rearranged so that all first bytes 32 + of each element are contiguous, then all second bytes, etc. 33 + typesize = element size in bytes, n = number of elements *) 34 + let unshuffle ~typesize data = 35 + let len = String.length data in 36 + if typesize <= 1 then data (* no-op for single-byte elements *) 37 + else begin 38 + let n = len / typesize in 39 + let out = Bytes.create len in 40 + for i = 0 to n - 1 do 41 + for j = 0 to typesize - 1 do 42 + (* In shuffled layout: byte j of element i is at position j*n + i *) 43 + Bytes.set out (i * typesize + j) data.[j * n + i] 44 + done 45 + done; 46 + Bytes.to_string out 47 + end 48 + 49 + (* Undo bit shuffle: bits were rearranged across elements. 50 + For typesize=1 (int8), each bit position is grouped together: 51 + all bit-0s of every byte, then all bit-1s, etc. *) 52 + let unbitshuffle ~typesize data = 53 + let len = String.length data in 54 + let n_elems = len / typesize in 55 + let out = Bytes.make len '\x00' in 56 + (* Process element by element *) 57 + for elem = 0 to n_elems - 1 do 58 + for byte_in_elem = 0 to typesize - 1 do 59 + let out_byte_idx = elem * typesize + byte_in_elem in 60 + let v = ref 0 in 61 + for bit = 0 to 7 do 62 + (* In bitshuffled layout, bit 'bit' of byte 'byte_in_elem' of element 'elem' 63 + is at: bit_offset = (byte_in_elem * 8 + bit) * n_elems + elem 64 + stored as: byte_offset = bit_offset / 8, bit_within_byte = bit_offset mod 8 *) 65 + let bit_offset = (byte_in_elem * 8 + bit) * n_elems + elem in 66 + let src_byte = bit_offset / 8 in 67 + let src_bit = bit_offset mod 8 in 68 + if src_byte < len then begin 69 + let src_val = Char.code data.[src_byte] in 70 + if src_val land (1 lsl src_bit) <> 0 then 71 + v := !v lor (1 lsl bit) 72 + end 73 + done; 74 + Bytes.set out out_byte_idx (Char.chr !v) 75 + done 76 + done; 77 + Bytes.to_string out 21 78 22 79 let decode ?decompress s = 23 80 let h = parse_header s in 24 81 if h.cbytes = h.nbytes + 16 then 25 - (* Memcpy mode: data is stored uncompressed after header *) 82 + (* Memcpy mode: data is stored uncompressed, no shuffle applied *) 26 83 String.sub s 16 h.nbytes 27 - else match decompress with 28 - | Some f -> 29 - let compressed = String.sub s 16 (h.cbytes - 16) in 30 - f compressed h.nbytes 31 - | None -> 32 - failwith (Printf.sprintf 33 - "Blosc: compressed frame (cbytes=%d, nbytes=%d) but no decompressor provided" 34 - h.cbytes h.nbytes) 84 + else begin 85 + (* Compressed: parse block structure, decompress, then unshuffle. 86 + After the 16-byte header: 87 + - uint32 LE: offset to first block within the frame 88 + - At that offset: uint32 LE compressed block size 89 + - Then: the actual compressed data *) 90 + let block_offset = get_u32_le s 16 in 91 + let block_csize = get_u32_le s block_offset in 92 + let compressed = String.sub s (block_offset + 4) block_csize in 93 + let decompressed = match decompress with 94 + | Some f -> f compressed h.nbytes 95 + | None -> 96 + failwith (Printf.sprintf 97 + "Blosc: compressed frame (cbytes=%d, nbytes=%d) but no decompressor provided" 98 + h.cbytes h.nbytes) 99 + in 100 + (* Apply unshuffle if needed *) 101 + match shuffle_mode h with 102 + | No_shuffle -> decompressed 103 + | Byte_shuffle -> unshuffle ~typesize:h.typesize decompressed 104 + | Bit_shuffle -> unbitshuffle ~typesize:h.typesize decompressed 105 + end
+13 -9
zarr-v3/lib/blosc.mli
··· 1 1 (** Blosc frame decoder. 2 2 3 - Blosc wraps compressed (or uncompressed) data in a 16-byte header 4 - containing sizes and codec metadata. In the GeoTessera Zarr store, 5 - Blosc typically stores data uncompressed (memcpy fallback) with 6 - just the 16-byte header overhead. 3 + Parses the 16-byte Blosc header, decompresses the payload via a 4 + pluggable decompressor, and applies unshuffle (byte or bit) if needed. 7 5 8 - For compressed frames, a [decompress] callback must be provided 9 - that handles the inner compression algorithm (e.g., Zstd). *) 6 + For uncompressed (memcpy) frames, the raw data is returned directly. *) 10 7 11 8 type header = { 12 9 nbytes : int; (** Uncompressed data size in bytes *) 13 10 cbytes : int; (** Compressed size including the 16-byte header *) 14 11 typesize : int; (** Element size in bytes (for shuffle) *) 15 12 blocksize : int; (** Block size in bytes *) 13 + flags : int; (** Raw flags byte *) 16 14 } 17 15 (** Parsed Blosc frame header. *) 18 16 17 + type shuffle_mode = No_shuffle | Byte_shuffle | Bit_shuffle 18 + 19 19 val parse_header : string -> header 20 20 (** Parse a Blosc header from the first 16 bytes of a frame. 21 21 @raise Failure if input is shorter than 16 bytes. *) 22 22 23 + val shuffle_mode : header -> shuffle_mode 24 + (** Extract the shuffle mode from the header flags. *) 25 + 23 26 val decode : ?decompress:(string -> int -> string) -> string -> string 24 27 (** Decode a Blosc frame, returning the raw uncompressed data. 25 28 26 29 If the frame uses memcpy mode ([cbytes = nbytes + 16]), the payload 27 - is returned directly without decompression. 30 + is returned directly without decompression or unshuffle. 28 31 29 - If the frame is actually compressed and [decompress] is provided, 30 - calls [decompress compressed_payload expected_size]. 32 + If compressed, [decompress compressed_payload expected_size] is called 33 + to decompress the inner payload, then unshuffle is applied based on 34 + the header flags. 31 35 32 36 @raise Failure if the frame is too short, or if compressed 33 37 without a [decompress] callback. *)
+4 -2
zarr-v3/lib/store.ml
··· 140 140 (b 0) lor ((b 1) lsl 8) lor ((b 2) lsl 16) lor ((b 3) lsl 24) 141 141 lor ((b 4) lsl 32) lor ((b 5) lsl 40) lor ((b 6) lsl 48) lor ((b 7) lsl 56)) 142 142 143 - (* Apply the inner codec chain to a raw chunk *) 143 + (* Apply the inner codec chain to a raw chunk. 144 + Blosc.decode now handles the full pipeline: decompress + unshuffle. 145 + The "zstd" codec from the registry provides the raw decompressor. *) 144 146 let apply_inner_codecs codecs codec_names data = 145 147 List.fold_right (fun name acc -> 146 148 match name with ··· 151 153 | None -> None 152 154 in 153 155 Blosc.decode ?decompress acc 154 - | "crc32c" -> acc (* skip validation *) 156 + | "crc32c" -> acc 155 157 | other -> 156 158 match codecs other with 157 159 | Some f -> f acc
+14
zarr-v3/test/test_live.ml
··· 60 60 Printf.printf " zone=%d, origin_e=%.1f, origin_n=%.1f, pixel_size=%.1f\n" 61 61 zi.zone zi.origin_easting zi.origin_northing zi.pixel_size; 62 62 63 + (* Test full fetch_region pipeline *) 64 + Printf.printf "\n=== Testing fetch_region (small bbox near Cambridge) ===\n%!"; 65 + let bbox = Geotessera.{ min_lon = 0.11; min_lat = 52.19; 66 + max_lon = 0.13; max_lat = 52.21 } in 67 + let* (mat, h, w, bounds) = Tessera_zarr.fetch_region ~store bbox in 68 + Printf.printf " mosaic: %d x %d (%d rows, %d cols in mat)\n%!" h w mat.Linalg.rows mat.Linalg.cols; 69 + Printf.printf " bounds: S=%.4f N=%.4f W=%.4f E=%.4f\n%!" 70 + bounds.min_lat bounds.max_lat bounds.min_lon bounds.max_lon; 71 + (* Spot-check: first pixel should have non-zero embedding values *) 72 + let v0 = Linalg.mat_get mat 0 0 in 73 + let v1 = Linalg.mat_get mat 0 1 in 74 + Printf.printf " first pixel: feat[0]=%.4f, feat[1]=%.4f\n%!" v0 v1; 75 + Printf.printf " (should be non-zero dequantized values)\n%!"; 76 + 63 77 Printf.printf "\n=== Done ===\n%!"; 64 78 Lwt.return_unit 65 79 end
+25 -16
zarr-v3/test/test_zarr_v3.ml
··· 49 49 (fun () -> ignore (Zarr_v3.Blosc.decode "short")) 50 50 51 51 let test_blosc_decode_compressed_no_decompressor () = 52 - (* Make a frame where cbytes < nbytes + 16 (looks compressed) *) 53 - let buf = Bytes.create 20 in 54 - Bytes.set buf 0 '\x02'; 55 - Bytes.set buf 3 '\x01'; 52 + (* Compressed Blosc frame with block offset table: 53 + 16 bytes header + 4 bytes block offset + 4 bytes block csize + compressed data *) 54 + let compressed_data = "\xDE\xAD\xBE\xEF" in (* 4 bytes fake compressed *) 55 + let cbytes = 16 + 4 + 4 + 4 in (* = 28 *) 56 + let buf = Bytes.create cbytes in 57 + Bytes.set buf 0 '\x02'; (* version *) 58 + Bytes.set buf 3 '\x01'; (* typesize *) 56 59 (* nbytes = 64 *) 57 60 Bytes.set buf 4 '\x40'; 58 61 (* blocksize = 64 *) 59 62 Bytes.set buf 8 '\x40'; 60 - (* cbytes = 20 (compressed!) *) 61 - Bytes.set buf 12 '\x14'; 63 + (* cbytes = 28 *) 64 + Bytes.set buf 12 (Char.chr cbytes); 65 + (* block offset: 20 (= 16 header + 4 offset table) *) 66 + Bytes.set buf 16 '\x14'; 67 + (* block csize: 4 *) 68 + Bytes.set buf 20 '\x04'; 69 + (* compressed payload *) 70 + Bytes.blit_string compressed_data 0 buf 24 4; 62 71 let frame = Bytes.to_string buf in 63 72 Alcotest.check_raises "no decompressor" 64 - (Failure "Blosc: compressed frame (cbytes=20, nbytes=64) but no decompressor provided") 73 + (Failure "Blosc: compressed frame (cbytes=28, nbytes=64) but no decompressor provided") 65 74 (fun () -> ignore (Zarr_v3.Blosc.decode frame)) 66 75 67 76 let test_blosc_decode_with_decompressor () = 68 - (* Simulate compressed frame: cbytes=20, nbytes=64 *) 69 - let buf = Bytes.create 20 in 77 + (* Same structure as above but with a decompressor *) 78 + let compressed_data = "\xDE\xAD\xBE\xEF" in 79 + let cbytes = 16 + 4 + 4 + 4 in 80 + let buf = Bytes.create cbytes in 70 81 Bytes.set buf 0 '\x02'; 71 - Bytes.set buf 3 '\x01'; 82 + Bytes.set buf 3 '\x01'; (* typesize=1 *) 72 83 Bytes.set buf 4 '\x40'; (* nbytes=64 *) 73 84 Bytes.set buf 8 '\x40'; (* blocksize=64 *) 74 - Bytes.set buf 12 '\x14'; (* cbytes=20 *) 75 - (* 4 bytes of "compressed" data after header *) 76 - Bytes.set buf 16 '\xDE'; 77 - Bytes.set buf 17 '\xAD'; 78 - Bytes.set buf 18 '\xBE'; 79 - Bytes.set buf 19 '\xEF'; 85 + Bytes.set buf 12 (Char.chr cbytes); 86 + Bytes.set buf 16 '\x14'; (* block offset=20 *) 87 + Bytes.set buf 20 '\x04'; (* block csize=4 *) 88 + Bytes.blit_string compressed_data 0 buf 24 4; 80 89 let frame = Bytes.to_string buf in 81 90 let decompress _data nbytes = String.make nbytes '\x42' in 82 91 let result = Zarr_v3.Blosc.decode ~decompress frame in