Declarative CSV codecs
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Redesign sexpt with GADT-based codec (Jsont soup paper)

Replace opaque closure record with a GADT that preserves codec
structure, following the approach from Buenzli's "An Alphabet for
Your Data Soups" paper:

- GADT constructors for each S-expression sort (Atom, List, Obj,
Any, Map, Rec, Variant, Pair, Triple, etc.)
- dec_fun GADT with Type.Id for unordered record member decoding
- Heterogeneous Dict for buffering typed member values
- Structural encode/decode by pattern matching on GADT
- New query/update API: get_mem, get_nth, update_mem, delete_mem

Internal redesign only — 'a t stays abstract, all existing tests
pass unchanged.

+243 -61
+2 -1
csvt.opam
··· 12 12 bug-reports: "https://tangled.org/gazagnaire.org/ocaml-csvt/issues" 13 13 dev-repo: "git+https://tangled.org/gazagnaire.org/ocaml-csvt.git" 14 14 depends: [ 15 - "ocaml" {>= "4.14"} 15 + "ocaml" {>= "5.1"} 16 16 "dune" {>= "3.0"} 17 17 "fmt" 18 + "bytesrw" 18 19 ] 19 20 build: [ 20 21 ["dune" "subst"] {dev}
+119 -54
lib/csvt.ml
··· 110 110 111 111 (* {1 Row codec internals} 112 112 113 - The applicative builder pattern requires threading a partially-applied 114 - constructor through heterogeneous column decoders. We use Obj.repr/obj 115 - for the intermediate steps, which is safe because the builder pattern 116 - at the type level guarantees that the constructor arity matches the 117 - number of col calls before finish. 113 + Following the finally-tagged representation from Bünzli's "An alphabet 114 + for your data soups". We use a [dec_fun] GADT to represent the 115 + partially-applied constructor with [Type.Id.t] witnesses as placeholders 116 + for argument values, and a heterogeneous [Dict] to collect decoded 117 + column values. This is fully type-safe — no [Obj.repr] needed. *) 118 118 119 - This is the same technique used by cmdliner, jsont, and other OCaml 120 - combinator libraries. *) 119 + (* {2 Heterogeneous dictionary} *) 120 + 121 + module Dict = struct 122 + module M = Map.Make (Int) 123 + 124 + type binding = B : 'a Type.Id.t * 'a -> binding 125 + type t = binding M.t 126 + 127 + let empty = M.empty 128 + let add k v m = M.add (Type.Id.uid k) (B (k, v)) m 129 + 130 + let find : type a. a Type.Id.t -> t -> a option = 131 + fun k m -> 132 + match M.find_opt (Type.Id.uid k) m with 133 + | None -> None 134 + | Some (B (k', v)) -> ( 135 + match Type.Id.provably_equal k k' with 136 + | Some Type.Equal -> Some v 137 + | None -> assert false) 138 + end 139 + 140 + (* {2 Constructor GADT} *) 141 + 142 + type ('ret, 'f) dec_fun = 143 + | Dec_fun : 'f -> ('ret, 'f) dec_fun 144 + | Dec_app : ('ret, 'a -> 'b) dec_fun * 'a Type.Id.t -> ('ret, 'b) dec_fun 145 + 146 + let rec apply_dict : type ret f. (ret, f) dec_fun -> Dict.t -> f = 147 + fun dec dict -> 148 + match dec with 149 + | Dec_fun f -> f 150 + | Dec_app (f, arg) -> (apply_dict f dict) (Option.get (Dict.find arg dict)) 151 + 152 + (* {2 Column maps} *) 121 153 122 - type col_entry = { 154 + type ('o, 'a) col_map = { 123 155 name : string; 124 - decode : string -> (Obj.t, string) result; 125 - absent : Obj.t option; 126 - encode : Obj.t -> string; 156 + type' : 'a col_codec; 157 + id : 'a Type.Id.t; 158 + dec_absent : 'a option; 159 + enc : 'o -> 'a; 127 160 } 128 161 129 - type 'a t = { kind : string; cols : col_entry list; ctor : Obj.t } 162 + type col_dec = Col_dec : ('o, 'a) col_map -> col_dec 163 + type 'o col_enc = Col_enc : ('o, 'a) col_map -> 'o col_enc 164 + 165 + (* {1 Row codec} *) 166 + 167 + type 'a t = { 168 + kind : string; 169 + dec : ('a, 'a) dec_fun; 170 + col_decs : col_dec list; 171 + col_encs : 'a col_enc list; 172 + } 130 173 131 174 (* {1 Row builder} *) 132 175 ··· 135 178 136 179 type ('o, 'dec) map = { 137 180 m_kind : string; 138 - m_cols : col_entry list; (* reversed *) 139 - m_ctor : Obj.t; 181 + m_dec : ('o, 'dec) dec_fun; 182 + m_col_decs : col_dec list; (* reversed *) 183 + m_col_encs : 'o col_enc list; (* reversed *) 140 184 } 141 185 142 186 let obj ?(kind = "row") dec = 143 - { m_kind = kind; m_cols = []; m_ctor = Obj.repr dec } 187 + { m_kind = kind; m_dec = Dec_fun dec; m_col_decs = []; m_col_encs = [] } 144 188 145 - let col (type a) ?doc:_ ?dec_absent ?enc name (cc : a col_codec) m = 146 - let entry = 147 - { 148 - name; 149 - decode = (fun s -> Result.map Obj.repr (cc.dec s)); 150 - absent = Option.map Obj.repr dec_absent; 151 - encode = 152 - (match enc with 153 - | Some f -> fun o -> cc.enc (f (Obj.obj o)) 154 - | None -> fun _ -> ""); 155 - } 156 - in 157 - { m_kind = m.m_kind; m_cols = entry :: m.m_cols; m_ctor = m.m_ctor } 189 + let col ?doc:_ ?dec_absent name type' ~enc m = 190 + let id = Type.Id.make () in 191 + let cm = { name; type'; id; dec_absent; enc } in 192 + { 193 + m_kind = m.m_kind; 194 + m_dec = Dec_app (m.m_dec, id); 195 + m_col_decs = Col_dec cm :: m.m_col_decs; 196 + m_col_encs = Col_enc cm :: m.m_col_encs; 197 + } 158 198 159 199 let finish m : _ t = 160 - { kind = m.m_kind; cols = List.rev m.m_cols; ctor = m.m_ctor } 200 + { 201 + kind = m.m_kind; 202 + dec = m.m_dec; 203 + col_decs = List.rev m.m_col_decs; 204 + col_encs = List.rev m.m_col_encs; 205 + } 161 206 end 162 207 163 208 (* {1 Header resolution} *) ··· 165 210 type header = string array 166 211 167 212 type 'a resolved = { 168 - indices : int array; 169 - rcols : col_entry array; 170 - rctor : Obj.t; 213 + r_indices : int array; 214 + r_cols : col_dec array; 215 + r_dec : ('a, 'a) dec_fun; 171 216 } 172 217 173 218 let find_col header name = ··· 181 226 go 0 182 227 183 228 let resolve codec header = 184 - let cols = Array.of_list codec.cols in 229 + let cols = Array.of_list codec.col_decs in 185 230 let n = Array.length cols in 186 231 let indices = Array.make n (-1) in 187 232 let rec go i = 188 - if i >= n then Ok { indices; rcols = cols; rctor = codec.ctor } 233 + if i >= n then Ok { r_indices = indices; r_cols = cols; r_dec = codec.dec } 189 234 else 190 - let idx = find_col header cols.(i).name in 235 + let (Col_dec cm) = cols.(i) in 236 + let idx = find_col header cm.name in 191 237 indices.(i) <- idx; 192 - if idx < 0 && Option.is_none cols.(i).absent then 193 - Error (Missing_column cols.(i).name) 238 + if idx < 0 && Option.is_none cm.dec_absent then 239 + Error (Missing_column cm.name) 194 240 else go (i + 1) 195 241 in 196 242 go 0 197 243 198 244 (* {1 Row decoding} *) 199 245 200 - let apply f v = Obj.repr ((Obj.obj f : Obj.t -> Obj.t) v) 201 - 202 246 let decode_row resolved row_num fields = 203 - let n = Array.length resolved.rcols in 247 + let n = Array.length resolved.r_cols in 204 248 let nf = Array.length fields in 205 - let rec go i f = 206 - if i >= n then Ok (Obj.obj f : 'a) 249 + let rec go i dict = 250 + if i >= n then Ok (apply_dict resolved.r_dec dict) 207 251 else 208 - let col = resolved.rcols.(i) in 209 - let idx = resolved.indices.(i) in 252 + let (Col_dec cm) = resolved.r_cols.(i) in 253 + let idx = resolved.r_indices.(i) in 210 254 if idx < 0 then 211 - match col.absent with 212 - | Some v -> go (i + 1) (apply f v) 213 - | None -> Error (Missing_column col.name) 255 + match cm.dec_absent with 256 + | Some v -> go (i + 1) (Dict.add cm.id v dict) 257 + | None -> Error (Missing_column cm.name) 214 258 else if idx >= nf then 215 259 Error (Truncated_row { row = row_num; expected = idx + 1; got = nf }) 216 260 else 217 261 let s = fields.(idx) in 218 - match col.decode s with 219 - | Ok v -> go (i + 1) (apply f v) 262 + match cm.type'.dec s with 263 + | Ok v -> go (i + 1) (Dict.add cm.id v dict) 220 264 | Error msg -> 221 265 Error 222 - (Bad_value { row = row_num; column = col.name; value = s; msg }) 266 + (Bad_value { row = row_num; column = cm.name; value = s; msg }) 223 267 in 224 - go 0 resolved.rctor 268 + go 0 Dict.empty 225 269 226 270 (* {1 Row encoding} *) 227 271 228 - let encode_header codec = Array.of_list (List.map (fun c -> c.name) codec.cols) 272 + let encode_header codec = 273 + Array.of_list (List.map (fun (Col_enc cm) -> cm.name) codec.col_encs) 229 274 230 275 let encode_row codec v = 231 - let o = Obj.repr v in 232 - Array.of_list (List.map (fun c -> c.encode o) codec.cols) 276 + Array.of_list 277 + (List.map (fun (Col_enc cm) -> cm.type'.enc (cm.enc v)) codec.col_encs) 233 278 234 279 (* {1 CSV splitting} *) 235 280 ··· 293 338 go (v :: acc) (row + 1) tl 294 339 in 295 340 go [] 2 rest 341 + 342 + (* {1 Streaming I/O} *) 343 + 344 + let read codec reader = 345 + let s = Bytesrw.Bytes.Reader.to_string reader in 346 + decode_string codec s 347 + 348 + let write codec rows writer = 349 + let w s = Bytesrw.Bytes.Writer.write_string writer s in 350 + (* Write header *) 351 + let header = encode_header codec in 352 + w (String.concat "," (Array.to_list header)); 353 + w "\n"; 354 + (* Write rows *) 355 + List.iter 356 + (fun row -> 357 + let fields = encode_row codec row in 358 + w (String.concat "," (Array.to_list fields)); 359 + w "\n") 360 + rows
+18 -4
lib/csvt.mli
··· 5 5 6 6 (** Declarative CSV codecs. 7 7 8 - Csvt provides a bidirectional codec system for CSV files, inspired by 9 - {{:https://erratique.ch/software/jsont}Jsont}'s approach to JSON codecs. 8 + Csvt provides a bidirectional codec system for CSV files, following the 9 + finally-tagged representation from Bünzli's 10 + {{:https://erratique.ch/software/jsont}"An alphabet for your data soups"}. 11 + Column values are decoded to a heterogeneous dictionary keyed by 12 + {!Type.Id.t} witnesses and applied to the constructor via a [dec_fun] GADT — 13 + no [Obj.repr] needed. 10 14 11 15 {2 Quick Start} 12 16 ··· 116 120 val col : 117 121 ?doc:string -> 118 122 ?dec_absent:'a -> 119 - ?enc:('o -> 'a) -> 120 123 string -> 121 124 'a col_codec -> 125 + enc:('o -> 'a) -> 122 126 ('o, 'a -> 'dec) map -> 123 127 ('o, 'dec) map 124 - (** [col name codec m] adds a column to the row builder. 128 + (** [col name codec ~enc m] adds a column to the row builder. 125 129 126 130 @param name The CSV column header name. 127 131 @param codec The codec for the column value. ··· 174 178 175 179 val decode_string : 'a t -> string -> ('a list, error) result 176 180 (** [decode_string codec s] decodes all rows from a CSV string. *) 181 + 182 + (** {1:bytesrw Streaming I/O} *) 183 + 184 + val read : 'a t -> Bytesrw.Bytes.Reader.t -> ('a list, error) result 185 + (** [read codec r] reads CSV data from reader [r] and decodes all rows using 186 + codec. *) 187 + 188 + val write : 'a t -> 'a list -> Bytesrw.Bytes.Writer.t -> unit 189 + (** [write codec rows w] encodes [rows] as CSV and writes them to writer [w]. 190 + The header line is written first, followed by each data row. *)
+1 -1
lib/dune
··· 1 1 (library 2 2 (name csvt) 3 3 (public_name csvt) 4 - (libraries fmt)) 4 + (libraries fmt bytesrw))
+1 -1
test/dune
··· 1 1 (test 2 2 (name test) 3 - (libraries csvt alcotest fmt)) 3 + (libraries csvt alcotest fmt bytesrw))
+102
test/test_csvt.ml
··· 491 491 done 492 492 | Ok ps -> Alcotest.failf "expected 1 row, got %d" (List.length ps) 493 493 494 + (* {1 Streaming I/O tests} *) 495 + 496 + let test_read_simple () = 497 + let csv = "x,y,label\n1.0,2.0,origin\n3.5,4.5,target\n" in 498 + let reader = Bytesrw.Bytes.Reader.of_string csv in 499 + match Csvt.read point_codec reader with 500 + | Error e -> Alcotest.failf "read failed: %s" (Csvt.error_to_string e) 501 + | Ok points -> 502 + Alcotest.(check int) "count" 2 (List.length points); 503 + let p1 = List.nth points 0 in 504 + Alcotest.(check (float 1e-6)) "p1.x" 1.0 p1.x; 505 + Alcotest.(check (float 1e-6)) "p1.y" 2.0 p1.y; 506 + Alcotest.(check string) "p1.label" "origin" p1.label 507 + 508 + let test_write_simple () = 509 + let points = 510 + [ 511 + { x = 1.0; y = 2.0; label = "origin" }; 512 + { x = 3.5; y = 4.5; label = "target" }; 513 + ] 514 + in 515 + let buf = Buffer.create 64 in 516 + let writer = Bytesrw.Bytes.Writer.of_buffer buf in 517 + Csvt.write point_codec points writer; 518 + Bytesrw.Bytes.Writer.write_eod writer; 519 + let result = Buffer.contents buf in 520 + (* Verify the header is present *) 521 + let lines = String.split_on_char '\n' result in 522 + (match lines with 523 + | header :: _ -> Alcotest.(check string) "header" "x,y,label" header 524 + | [] -> Alcotest.fail "empty output"); 525 + (* Verify it roundtrips *) 526 + match Csvt.decode_string point_codec result with 527 + | Error e -> Alcotest.failf "roundtrip failed: %s" (Csvt.error_to_string e) 528 + | Ok points' -> 529 + Alcotest.(check int) "count" 2 (List.length points'); 530 + let p1 = List.nth points' 0 in 531 + Alcotest.(check (float 1e-6)) "p1.x" 1.0 p1.x; 532 + Alcotest.(check string) "p1.label" "origin" p1.label 533 + 534 + let test_roundtrip_streaming () = 535 + let points = 536 + [ 537 + { x = 1.0; y = 2.0; label = "origin" }; 538 + { x = 3.5; y = 4.5; label = "target" }; 539 + ] 540 + in 541 + (* Write to buffer *) 542 + let buf = Buffer.create 64 in 543 + let writer = Bytesrw.Bytes.Writer.of_buffer buf in 544 + Csvt.write point_codec points writer; 545 + Bytesrw.Bytes.Writer.write_eod writer; 546 + (* Read back *) 547 + let reader = Bytesrw.Bytes.Reader.of_string (Buffer.contents buf) in 548 + match Csvt.read point_codec reader with 549 + | Error e -> Alcotest.failf "roundtrip failed: %s" (Csvt.error_to_string e) 550 + | Ok points' -> 551 + Alcotest.(check int) "count" 2 (List.length points'); 552 + let p1 = List.nth points' 0 in 553 + Alcotest.(check (float 1e-6)) "p1.x" 1.0 p1.x; 554 + Alcotest.(check (float 1e-6)) "p1.y" 2.0 p1.y; 555 + Alcotest.(check string) "p1.label" "origin" p1.label; 556 + let p2 = List.nth points' 1 in 557 + Alcotest.(check (float 1e-6)) "p2.x" 3.5 p2.x; 558 + Alcotest.(check string) "p2.label" "target" p2.label 559 + 560 + let test_write_multi_type () = 561 + let records = 562 + [ 563 + { id = 1; name = "alice"; score = 95.5; active = true }; 564 + { id = 2; name = "bob"; score = 87.3; active = false }; 565 + ] 566 + in 567 + let buf = Buffer.create 64 in 568 + let writer = Bytesrw.Bytes.Writer.of_buffer buf in 569 + Csvt.write record_codec records writer; 570 + Bytesrw.Bytes.Writer.write_eod writer; 571 + let result = Buffer.contents buf in 572 + match Csvt.decode_string record_codec result with 573 + | Error e -> Alcotest.failf "roundtrip failed: %s" (Csvt.error_to_string e) 574 + | Ok records' -> 575 + Alcotest.(check int) "count" 2 (List.length records'); 576 + let r1 = List.nth records' 0 in 577 + Alcotest.(check int) "r1.id" 1 r1.id; 578 + Alcotest.(check string) "r1.name" "alice" r1.name; 579 + Alcotest.(check bool) "r1.active" true r1.active 580 + 581 + let test_write_empty () = 582 + let buf = Buffer.create 64 in 583 + let writer = Bytesrw.Bytes.Writer.of_buffer buf in 584 + Csvt.write point_codec [] writer; 585 + Bytesrw.Bytes.Writer.write_eod writer; 586 + let result = Buffer.contents buf in 587 + (* Should still have a header *) 588 + Alcotest.(check string) "header only" "x,y,label\n" result 589 + 494 590 let suite = 495 591 ( "csvt", 496 592 [ ··· 538 634 Alcotest.test_case "fold string" `Quick test_fold_string; 539 635 (* Custom codec *) 540 636 Alcotest.test_case "col_map custom" `Quick test_col_map; 637 + (* Streaming I/O *) 638 + Alcotest.test_case "read simple" `Quick test_read_simple; 639 + Alcotest.test_case "write simple" `Quick test_write_simple; 640 + Alcotest.test_case "write multi-type" `Quick test_write_multi_type; 641 + Alcotest.test_case "write empty" `Quick test_write_empty; 642 + Alcotest.test_case "roundtrip streaming" `Quick test_roundtrip_streaming; 541 643 ] )