···11+type zone_info = {
22+ zone : int;
33+ origin_easting : float;
44+ origin_northing : float;
55+ pixel_size : float;
66+}
77+88+let zone_info store zone_name =
99+ let attrs = Zarr_v3.Store.group_attrs store zone_name in
1010+ let transform = match List.assoc_opt "spatial:transform" attrs with
1111+ | Some (`List l) -> Array.of_list (List.map (function
1212+ | `Float f -> f | `Int i -> Float.of_int i | _ -> 0.0) l)
1313+ | _ -> failwith (Printf.sprintf "Missing spatial:transform in %s" zone_name)
1414+ in
1515+ let zone = match List.assoc_opt "tessera:utm_zone" attrs with
1616+ | Some (`Int z) -> z
1717+ | _ ->
1818+ int_of_string (String.sub zone_name 3 (String.length zone_name - 3))
1919+ in
2020+ {
2121+ zone;
2222+ origin_easting = transform.(2);
2323+ origin_northing = transform.(5);
2424+ pixel_size = Float.abs transform.(0);
2525+ }
2626+2727+let read_f32_le s off =
2828+ let bits = Int32.logor (Int32.of_int (Char.code s.[off]))
2929+ (Int32.logor (Int32.shift_left (Int32.of_int (Char.code s.[off+1])) 8)
3030+ (Int32.logor (Int32.shift_left (Int32.of_int (Char.code s.[off+2])) 16)
3131+ (Int32.shift_left (Int32.of_int (Char.code s.[off+3])) 24))) in
3232+ Int32.float_of_bits bits
3333+3434+let fetch_region ?(progress = fun (_:string) -> ()) ?(year = 2024) ~store bbox =
3535+ let open Lwt.Syntax in
3636+ let open Geotessera in
3737+ (* 1. Determine UTM zone from bbox centre *)
3838+ let center_lon = (bbox.min_lon +. bbox.max_lon) /. 2.0 in
3939+ let zone = Geotessera.Utm.zone_of_lon center_lon in
4040+ let zone_name = Printf.sprintf "utm%d" zone in
4141+4242+ (* 2. Get spatial metadata *)
4343+ let zi = zone_info store zone_name in
4444+4545+ (* 3. Convert WGS84 bbox corners to UTM to find pixel range *)
4646+ let corners = [
4747+ (bbox.min_lon, bbox.min_lat); (bbox.max_lon, bbox.min_lat);
4848+ (bbox.min_lon, bbox.max_lat); (bbox.max_lon, bbox.max_lat);
4949+ ] in
5050+ let utm_corners = List.map (fun (lon, lat) ->
5151+ Geotessera.Utm.wgs84_to_utm ~zone lon lat) corners in
5252+ let min_e = List.fold_left (fun acc (e, _) -> Float.min acc e)
5353+ Float.max_float utm_corners in
5454+ let max_e = List.fold_left (fun acc (e, _) -> Float.max acc e)
5555+ Float.neg_infinity utm_corners in
5656+ let min_n = List.fold_left (fun acc (_, n) -> Float.min acc n)
5757+ Float.max_float utm_corners in
5858+ let max_n = List.fold_left (fun acc (_, n) -> Float.max acc n)
5959+ Float.neg_infinity utm_corners in
6060+6161+ (* Pixel coordinates (row 0 = max northing, row increases southward) *)
6262+ let pixel_size = zi.pixel_size in
6363+ let col_start = max 0
6464+ (Float.to_int (Float.floor ((min_e -. zi.origin_easting) /. pixel_size))) in
6565+ let col_stop =
6666+ Float.to_int (Float.ceil ((max_e -. zi.origin_easting) /. pixel_size)) in
6767+ let row_start = max 0
6868+ (Float.to_int (Float.floor ((zi.origin_northing -. max_n) /. pixel_size))) in
6969+ let row_stop =
7070+ Float.to_int (Float.ceil ((zi.origin_northing -. min_n) /. pixel_size)) in
7171+7272+ let tile_h = row_stop - row_start in
7373+ let tile_w = col_stop - col_start in
7474+ let n_features = 128 in
7575+7676+ (* 4. Fetch embeddings and scales in parallel.
7777+ Detect layout by the number of dimensions in the embeddings array:
7878+ MegaZarr: [time, band, y, x] — 4D, single store with year as dimension
7979+ Legacy: [y, x, band] — 3D, one store per year *)
8080+ let* emb_arr = Zarr_v3.Store.open_array store (zone_name ^ "/embeddings") in
8181+ let* scales_arr = Zarr_v3.Store.open_array store (zone_name ^ "/scales") in
8282+8383+ let emb_meta = Zarr_v3.Store.array_meta emb_arr in
8484+ let is_megazarr = Array.length emb_meta.shape = 4 in
8585+8686+ let on_emb_shard i n =
8787+ progress (Printf.sprintf "Fetching embeddings: shard %d/%d" i n) in
8888+ let on_scales_shard i n =
8989+ progress (Printf.sprintf "Fetching scales: shard %d/%d" i n) in
9090+9191+ let emb_fetch, scales_fetch =
9292+ if is_megazarr then begin
9393+ (* MegaZarr: embeddings[time, band, y, x], scales[time, y, x] *)
9494+ let time_idx = year - 2017 in
9595+ let n_times = emb_meta.shape.(0) in
9696+ if time_idx < 0 || time_idx >= n_times then
9797+ failwith (Printf.sprintf "Year %d out of range (2017-%d)" year
9898+ (2016 + n_times));
9999+ (* MegaZarr layout *)
100100+ ( Zarr_v3.Store.read ~on_shard:on_emb_shard emb_arr
101101+ ~start:[| time_idx; 0; row_start; col_start |]
102102+ ~shape:[| 1; n_features; tile_h; tile_w |],
103103+ Zarr_v3.Store.read ~on_shard:on_scales_shard scales_arr
104104+ ~start:[| time_idx; row_start; col_start |]
105105+ ~shape:[| 1; tile_h; tile_w |] )
106106+ end else
107107+ (* Legacy per-year layout: embeddings[y, x, band], scales[y, x] *)
108108+ ( Zarr_v3.Store.read ~on_shard:on_emb_shard emb_arr
109109+ ~start:[| row_start; col_start; 0 |]
110110+ ~shape:[| tile_h; tile_w; n_features |],
111111+ Zarr_v3.Store.read ~on_shard:on_scales_shard scales_arr
112112+ ~start:[| row_start; col_start |]
113113+ ~shape:[| tile_h; tile_w |] )
114114+ in
115115+116116+ let* emb_data = emb_fetch
117117+ and* scales_data = scales_fetch in
118118+119119+ progress "Dequantizing...";
120120+ (* 5. Dequantize: float32 = int8 × scale.
121121+ Legacy data is C-order [y, x, band] → offset = (row*W + col)*128 + f
122122+ MegaZarr data is C-order [1, band, y, x] → offset = (f*H*W + row*W + col) *)
123123+ let mat = Linalg.create_mat ~rows:(tile_h * tile_w) ~cols:n_features in
124124+ for i = 0 to tile_h - 1 do
125125+ for j = 0 to tile_w - 1 do
126126+ let pixel = i * tile_w + j in
127127+ (* Scales offset is pixel*4 for both layouts *)
128128+ let scale = read_f32_le scales_data (pixel * 4) in
129129+ for f = 0 to n_features - 1 do
130130+ let e_off =
131131+ if is_megazarr then
132132+ (* MegaZarr: C-order [1, 128, H, W] → offset = f*H*W + pixel *)
133133+ f * tile_h * tile_w + pixel
134134+ else
135135+ (* Legacy: C-order [H, W, 128] → offset = pixel*128 + f *)
136136+ pixel * n_features + f
137137+ in
138138+ let emb_val = Char.code emb_data.[e_off] in
139139+ let emb_signed = if emb_val >= 128 then emb_val - 256 else emb_val in
140140+ Linalg.mat_set mat pixel f (Float.of_int emb_signed *. scale)
141141+ done
142142+ done
143143+ done;
144144+145145+ (* 6. Compute actual UTM bounds of fetched pixels *)
146146+ let utm_w = zi.origin_easting +. Float.of_int col_start *. pixel_size in
147147+ let utm_e = zi.origin_easting +. Float.of_int col_stop *. pixel_size in
148148+ let utm_n = zi.origin_northing -. Float.of_int row_start *. pixel_size in
149149+ let utm_s = zi.origin_northing -. Float.of_int row_stop *. pixel_size in
150150+151151+ (* 7. Compute WGS84 bounds from the UTM corners *)
152152+ let (w_lon, s_lat) = Geotessera.Utm.utm_to_wgs84 ~zone utm_w utm_s in
153153+ let (e_lon, n_lat) = Geotessera.Utm.utm_to_wgs84 ~zone utm_e utm_n in
154154+ let wgs_bbox = { min_lon = w_lon; min_lat = s_lat;
155155+ max_lon = e_lon; max_lat = n_lat } in
156156+157157+ (* 8. Reproject from UTM to WGS84 using the actual bounds.
158158+ For each pixel in the output WGS84 grid, convert to UTM and
159159+ sample the nearest input pixel. *)
160160+ progress "Reprojecting...";
161161+ let out_h = tile_h in
162162+ let out_w = tile_w in
163163+ let reprojected = Linalg.create_mat ~rows:(out_h * out_w) ~cols:n_features in
164164+ for oi = 0 to out_h - 1 do
165165+ let lat = wgs_bbox.max_lat -.
166166+ (Float.of_int oi +. 0.5) *. (wgs_bbox.max_lat -. wgs_bbox.min_lat)
167167+ /. Float.of_int out_h in
168168+ for oj = 0 to out_w - 1 do
169169+ let lon = wgs_bbox.min_lon +.
170170+ (Float.of_int oj +. 0.5) *. (wgs_bbox.max_lon -. wgs_bbox.min_lon)
171171+ /. Float.of_int out_w in
172172+ let (e, n) = Geotessera.Utm.wgs84_to_utm ~zone lon lat in
173173+ (* Map UTM coord to local pixel in mat.
174174+ Row 0 = utm_n (north edge), row increases southward.
175175+ Col 0 = utm_w (west edge), col increases eastward. *)
176176+ let pi = Float.to_int (Float.floor ((utm_n -. n) /. pixel_size)) in
177177+ let pj = Float.to_int (Float.floor ((e -. utm_w) /. pixel_size)) in
178178+ if pi >= 0 && pi < tile_h && pj >= 0 && pj < tile_w then begin
179179+ let in_idx = pi * tile_w + pj in
180180+ let out_idx = oi * out_w + oj in
181181+ for f = 0 to n_features - 1 do
182182+ Linalg.mat_set reprojected out_idx f
183183+ (Linalg.mat_get mat in_idx f)
184184+ done
185185+ end
186186+ done
187187+ done;
188188+189189+ Lwt.return (reprojected, out_h, out_w, wgs_bbox)
+85
lib/tessera_zarr.mli
···11+(** GeoTessera Zarr v3 client.
22+33+ {b Warning:} This library was vibe-coded with AI assistance and has not
44+ been thoroughly reviewed or tested. Use at your own risk and expect
55+ breaking changes.
66+77+ Fetches GeoTessera embeddings from sharded Zarr v3 stores,
88+ mapping WGS84 bounding boxes to UTM pixel ranges. Dequantizes
99+ int8 embeddings using float32 scales and reprojects from the
1010+ native UTM grid to a regular WGS84 grid.
1111+1212+ Supports the MegaZarr store layout (single store with year as
1313+ dimension) as well as legacy per-year stores. The layout is
1414+ detected automatically from the array dimensionality.
1515+1616+ {2 Example}
1717+1818+ {[
1919+ let store = Zarr_v3.Store.open_store ~fetch ~codecs
2020+ "https://dl2.geotessera.org/zarr/v2/store.zarr" in
2121+ let (mat, h, w, bounds) = Tessera_zarr.fetch_region ~year:2024 ~store bbox in
2222+ ]}
2323+2424+ {2 MegaZarr store layout}
2525+2626+ {[
2727+ store.zarr/
2828+ ├── zarr.json (consolidated metadata for all zones)
2929+ ├── utm30/
3030+ │ ├── embeddings (int8, T×128×H×W, sharded 1×128×4096×4096)
3131+ │ ├── scales (float32, T×H×W, sharded 1×4096×4096)
3232+ │ └── ...
3333+ └── utm31/
3434+ └── ...
3535+ ]}
3636+3737+ Each zone group carries [spatial:transform] (6-element affine)
3838+ and [proj:code] (e.g., ["EPSG:32631"]) attributes. *)
3939+4040+(** {1 Spatial metadata} *)
4141+4242+type zone_info = {
4343+ zone : int; (** UTM zone number *)
4444+ origin_easting : float; (** Easting of pixel (0, 0) *)
4545+ origin_northing : float; (** Northing of pixel (0, 0) *)
4646+ pixel_size : float; (** Pixel size in metres (typically 10.0) *)
4747+}
4848+(** Spatial metadata extracted from a zone group's attributes. *)
4949+5050+val zone_info : Zarr_v3.Store.store -> string -> zone_info
5151+(** [zone_info store zone_name] extracts spatial metadata from a zone
5252+ group (e.g., ["utm31"]).
5353+ @raise Failure if the group or required attributes are missing. *)
5454+5555+(** {1 Fetching embeddings} *)
5656+5757+val fetch_region :
5858+ ?progress:(string -> unit) ->
5959+ ?year:int ->
6060+ store:Zarr_v3.Store.store ->
6161+ Geotessera.bbox ->
6262+ (Linalg.mat * int * int * Geotessera.bbox) Lwt.t
6363+(** [fetch_region ?year ~store bbox] fetches dequantized embeddings for
6464+ a WGS84 bounding box.
6565+6666+ Automatically detects the store layout from array dimensionality:
6767+ - {b MegaZarr} stores have 4D embeddings [[time, band, y, x]] — single
6868+ store with year as dimension. [year] selects the time slice
6969+ (default 2024, range 2017–2025).
7070+ - Legacy per-year stores have 3D embeddings [[y, x, band]] —
7171+ [year] is ignored.
7272+7373+ Steps:
7474+ + Determines the UTM zone from the bbox centre
7575+ + Converts bbox corners to UTM pixel coordinates
7676+ + Fetches [embeddings] and [scales] shards (in parallel)
7777+ + Dequantizes: [float32 = int8 × scale]
7878+ + Reprojects from the UTM pixel grid to a regular WGS84 grid
7979+8080+ Returns [(mosaic_mat, height, width, wgs84_bounds)] — the same
8181+ tuple shape as {!Geotessera.fetch_mosaic_sync} for compatibility
8282+ with existing notebook code.
8383+8484+ @raise Failure if the UTM zone is not in the store or [year] is
8585+ out of range for MegaZarr stores. *)