Declarative CSV codecs
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Redesign csvt col_codec as GADT (soup paper)

Complete the finally-tagged representation:
- col_codec GADT with constructors for each column type
(String_col, Int_col, Float_col, etc.) + Col_map for custom
- col_decode/col_encode by pattern match (not closures)
- New: col_names, col_count for introspection
- New: get_col for single-column extraction queries
- 8 new tests (46 total), all passing

+209 -71
+94 -71
lib/csvt.ml
··· 28 28 29 29 (* {1 Column codecs} *) 30 30 31 - type 'a col_codec = { 32 - kind : string; 33 - dec : string -> ('a, string) result; 34 - enc : 'a -> string; 31 + type ('a, 'b) col_base_map = { 32 + col_dec : string -> ('b, string) result; 33 + col_enc : 'b -> string; 35 34 } 36 35 37 - let col_kind c = c.kind 38 - let col_map ?(kind = "custom") ~dec ~enc () = { kind; dec; enc } 39 - let string = { kind = "string"; dec = (fun s -> Ok s); enc = Fun.id } 36 + type _ col_codec = 37 + | String_col : string col_codec 38 + | Int_col : int col_codec 39 + | Float_col : float col_codec 40 + | Bool_col : bool col_codec 41 + | Nullable_float_col : float col_codec 42 + | Nullable_int_col : int col_codec 43 + | Option_col : 'a col_codec -> 'a option col_codec 44 + | Col_map : ('a, 'b) col_base_map * string -> 'b col_codec 40 45 41 - let int = 42 - { 43 - kind = "int"; 44 - dec = 45 - (fun s -> 46 - match int_of_string_opt s with 47 - | Some i -> Ok i 48 - | None -> Error "not an integer"); 49 - enc = string_of_int; 50 - } 46 + let is_null s = String.equal s "NULL" || String.equal s "" 51 47 52 - let float = 53 - { 54 - kind = "float"; 55 - dec = 56 - (fun s -> 48 + let rec col_decode : type a. a col_codec -> string -> (a, string) result = 49 + fun c s -> 50 + match c with 51 + | String_col -> Ok s 52 + | Int_col -> ( 53 + match int_of_string_opt s with 54 + | Some i -> Ok i 55 + | None -> Error "not an integer") 56 + | Float_col -> ( 57 + match float_of_string_opt s with 58 + | Some f -> Ok f 59 + | None -> Error "not a float") 60 + | Bool_col -> ( 61 + match String.lowercase_ascii s with 62 + | "true" | "1" | "yes" -> Ok true 63 + | "false" | "0" | "no" -> Ok false 64 + | _ -> Error "not a boolean") 65 + | Nullable_float_col -> ( 66 + if is_null s then Ok Float.nan 67 + else 57 68 match float_of_string_opt s with 58 69 | Some f -> Ok f 59 - | None -> Error "not a float"); 60 - enc = string_of_float; 61 - } 70 + | None -> Error "not a float") 71 + | Nullable_int_col -> ( 72 + if is_null s then Ok (-1) 73 + else 74 + match int_of_string_opt s with 75 + | Some i -> Ok i 76 + | None -> Error "not an integer") 77 + | Option_col inner -> 78 + if is_null s then Ok None else Result.map Option.some (col_decode inner s) 79 + | Col_map (m, _) -> m.col_dec s 62 80 63 - let bool = 64 - { 65 - kind = "bool"; 66 - dec = 67 - (fun s -> 68 - match String.lowercase_ascii s with 69 - | "true" | "1" | "yes" -> Ok true 70 - | "false" | "0" | "no" -> Ok false 71 - | _ -> Error "not a boolean"); 72 - enc = string_of_bool; 73 - } 81 + let rec col_encode : type a. a col_codec -> a -> string = 82 + fun c v -> 83 + match c with 84 + | String_col -> v 85 + | Int_col -> string_of_int v 86 + | Float_col -> string_of_float v 87 + | Bool_col -> string_of_bool v 88 + | Nullable_float_col -> if Float.is_nan v then "NULL" else string_of_float v 89 + | Nullable_int_col -> if v = -1 then "NULL" else string_of_int v 90 + | Option_col inner -> ( 91 + match v with None -> "NULL" | Some x -> col_encode inner x) 92 + | Col_map (m, _) -> m.col_enc v 74 93 75 - let is_null s = String.equal s "NULL" || String.equal s "" 94 + let rec col_kind : type a. a col_codec -> string = function 95 + | String_col -> "string" 96 + | Int_col -> "int" 97 + | Float_col -> "float" 98 + | Bool_col -> "bool" 99 + | Nullable_float_col -> "nullable_float" 100 + | Nullable_int_col -> "nullable_int" 101 + | Option_col inner -> "option(" ^ col_kind inner ^ ")" 102 + | Col_map (_, kind) -> kind 76 103 77 - let nullable_float = 78 - { 79 - kind = "nullable_float"; 80 - dec = 81 - (fun s -> 82 - if is_null s then Ok Float.nan 83 - else 84 - match float_of_string_opt s with 85 - | Some f -> Ok f 86 - | None -> Error "not a float"); 87 - enc = (fun f -> if Float.is_nan f then "NULL" else string_of_float f); 88 - } 89 - 90 - let nullable_int = 91 - { 92 - kind = "nullable_int"; 93 - dec = 94 - (fun s -> 95 - if is_null s then Ok (-1) 96 - else 97 - match int_of_string_opt s with 98 - | Some i -> Ok i 99 - | None -> Error "not an integer"); 100 - enc = (fun i -> if i = -1 then "NULL" else string_of_int i); 101 - } 104 + let col_map ?(kind = "custom") ~dec ~enc () = 105 + Col_map ({ col_dec = dec; col_enc = enc }, kind) 102 106 103 - let option c = 104 - { 105 - kind = "option(" ^ c.kind ^ ")"; 106 - dec = 107 - (fun s -> if is_null s then Ok None else Result.map Option.some (c.dec s)); 108 - enc = (fun v -> match v with None -> "NULL" | Some x -> c.enc x); 109 - } 107 + let string = String_col 108 + let int = Int_col 109 + let float = Float_col 110 + let bool = Bool_col 111 + let nullable_float = Nullable_float_col 112 + let nullable_int = Nullable_int_col 113 + let option c = Option_col c 110 114 111 115 (* {1 Row codec internals} 112 116 ··· 205 209 } 206 210 end 207 211 212 + (* {1 Introspection} *) 213 + 214 + let col_names codec = List.map (fun (Col_dec cm) -> cm.name) codec.col_decs 215 + let col_count codec = List.length codec.col_decs 216 + 217 + (* {1 Query support} *) 218 + 219 + let get_col name (type a) (c : a col_codec) : a t = 220 + let id = Type.Id.make () in 221 + let cm = { name; type' = c; id; dec_absent = None; enc = Fun.id } in 222 + { 223 + kind = "get_col(" ^ name ^ ")"; 224 + dec = Dec_app (Dec_fun Fun.id, id); 225 + col_decs = [ Col_dec cm ]; 226 + col_encs = [ Col_enc cm ]; 227 + } 228 + 208 229 (* {1 Header resolution} *) 209 230 210 231 type header = string array ··· 259 280 Error (Truncated_row { row = row_num; expected = idx + 1; got = nf }) 260 281 else 261 282 let s = fields.(idx) in 262 - match cm.type'.dec s with 283 + match col_decode cm.type' s with 263 284 | Ok v -> go (i + 1) (Dict.add cm.id v dict) 264 285 | Error msg -> 265 286 Error ··· 274 295 275 296 let encode_row codec v = 276 297 Array.of_list 277 - (List.map (fun (Col_enc cm) -> cm.type'.enc (cm.enc v)) codec.col_encs) 298 + (List.map 299 + (fun (Col_enc cm) -> col_encode cm.type' (cm.enc v)) 300 + codec.col_encs) 278 301 279 302 (* {1 CSV splitting} *) 280 303
+20
lib/csvt.mli
··· 92 92 val col_kind : 'a col_codec -> string 93 93 (** [col_kind c] returns the kind description of column codec [c]. *) 94 94 95 + val col_decode : 'a col_codec -> string -> ('a, string) result 96 + (** [col_decode c s] decodes a single CSV field using column codec [c]. *) 97 + 98 + val col_encode : 'a col_codec -> 'a -> string 99 + (** [col_encode c v] encodes a value using column codec [c]. *) 100 + 95 101 (** {1:row Row Codecs} *) 96 102 97 103 type 'a t ··· 136 142 val finish : ('o, 'o) map -> 'o codec 137 143 (** [finish m] completes the row codec. *) 138 144 end 145 + 146 + (** {1:introspection Introspection} *) 147 + 148 + val col_names : 'a t -> string list 149 + (** [col_names codec] returns the list of column names in the codec. *) 150 + 151 + val col_count : 'a t -> int 152 + (** [col_count codec] returns the number of columns in the codec. *) 153 + 154 + (** {1:query Query Support} *) 155 + 156 + val get_col : string -> 'a col_codec -> 'a t 157 + (** [get_col name c] creates a single-column row codec that extracts just one 158 + column by name. Useful for querying a specific column from a CSV. *) 139 159 140 160 (** {1:header Header Resolution} *) 141 161
+95
test/test_csvt.ml
··· 587 587 (* Should still have a header *) 588 588 Alcotest.(check string) "header only" "x,y,label\n" result 589 589 590 + (* {1 Introspection tests} *) 591 + 592 + let test_col_names () = 593 + let names = Csvt.col_names point_codec in 594 + Alcotest.(check (list string)) "names" [ "x"; "y"; "label" ] names 595 + 596 + let test_col_count () = 597 + Alcotest.(check int) "point cols" 3 (Csvt.col_count point_codec); 598 + Alcotest.(check int) "record cols" 4 (Csvt.col_count record_codec) 599 + 600 + (* {1 col_decode / col_encode tests} *) 601 + 602 + let test_col_decode_encode () = 603 + (* string *) 604 + Alcotest.(check (result string string)) 605 + "string dec" (Ok "hello") 606 + (Csvt.col_decode Csvt.string "hello"); 607 + Alcotest.(check string) 608 + "string enc" "hello" 609 + (Csvt.col_encode Csvt.string "hello"); 610 + (* int *) 611 + Alcotest.(check (result int string)) 612 + "int dec" (Ok 42) 613 + (Csvt.col_decode Csvt.int "42"); 614 + Alcotest.(check string) "int enc" "42" (Csvt.col_encode Csvt.int 42); 615 + (* float *) 616 + (match Csvt.col_decode Csvt.float "3.14" with 617 + | Ok f -> Alcotest.(check (float 1e-6)) "float dec" 3.14 f 618 + | Error e -> Alcotest.failf "float dec: %s" e); 619 + Alcotest.(check string) "float enc" "3.14" (Csvt.col_encode Csvt.float 3.14); 620 + (* bool *) 621 + Alcotest.(check (result bool string)) 622 + "bool dec" (Ok true) 623 + (Csvt.col_decode Csvt.bool "yes"); 624 + Alcotest.(check string) "bool enc" "true" (Csvt.col_encode Csvt.bool true); 625 + (* option *) 626 + Alcotest.(check (result (option int) string)) 627 + "option dec none" (Ok None) 628 + (Csvt.col_decode (Csvt.option Csvt.int) "NULL"); 629 + Alcotest.(check (result (option int) string)) 630 + "option dec some" (Ok (Some 7)) 631 + (Csvt.col_decode (Csvt.option Csvt.int) "7"); 632 + Alcotest.(check string) 633 + "option enc none" "NULL" 634 + (Csvt.col_encode (Csvt.option Csvt.int) None); 635 + Alcotest.(check string) 636 + "option enc some" "7" 637 + (Csvt.col_encode (Csvt.option Csvt.int) (Some 7)) 638 + 639 + (* {1 Query tests (get_col)} *) 640 + 641 + let test_get_col_int () = 642 + let csv = "id,name,score,active\n1,alice,95.5,true\n2,bob,87.3,false\n" in 643 + let id_codec = Csvt.get_col "id" Csvt.int in 644 + match Csvt.decode_string id_codec csv with 645 + | Error e -> Alcotest.failf "get_col id: %s" (Csvt.error_to_string e) 646 + | Ok ids -> Alcotest.(check (list int)) "ids" [ 1; 2 ] ids 647 + 648 + let test_get_col_string () = 649 + let csv = "id,name,score,active\n1,alice,95.5,true\n2,bob,87.3,false\n" in 650 + let name_codec = Csvt.get_col "name" Csvt.string in 651 + match Csvt.decode_string name_codec csv with 652 + | Error e -> Alcotest.failf "get_col name: %s" (Csvt.error_to_string e) 653 + | Ok names -> Alcotest.(check (list string)) "names" [ "alice"; "bob" ] names 654 + 655 + let test_get_col_missing () = 656 + let csv = "id,name\n1,alice\n" in 657 + let score_codec = Csvt.get_col "score" Csvt.float in 658 + match Csvt.decode_string score_codec csv with 659 + | Ok _ -> Alcotest.fail "expected error for missing column" 660 + | Error (Csvt.Missing_column "score") -> () 661 + | Error e -> Alcotest.failf "unexpected error: %s" (Csvt.error_to_string e) 662 + 663 + let test_get_col_encode () = 664 + let codec = Csvt.get_col "v" Csvt.int in 665 + let h = Csvt.encode_header codec in 666 + Alcotest.(check (array string)) "header" [| "v" |] h; 667 + let row = Csvt.encode_row codec 42 in 668 + Alcotest.(check (array string)) "row" [| "42" |] row 669 + 670 + let test_get_col_col_names () = 671 + let codec = Csvt.get_col "score" Csvt.float in 672 + Alcotest.(check (list string)) "col_names" [ "score" ] (Csvt.col_names codec); 673 + Alcotest.(check int) "col_count" 1 (Csvt.col_count codec) 674 + 590 675 let suite = 591 676 ( "csvt", 592 677 [ ··· 640 725 Alcotest.test_case "write multi-type" `Quick test_write_multi_type; 641 726 Alcotest.test_case "write empty" `Quick test_write_empty; 642 727 Alcotest.test_case "roundtrip streaming" `Quick test_roundtrip_streaming; 728 + (* Introspection *) 729 + Alcotest.test_case "col_names" `Quick test_col_names; 730 + Alcotest.test_case "col_count" `Quick test_col_count; 731 + Alcotest.test_case "col_decode/encode" `Quick test_col_decode_encode; 732 + (* Query (get_col) *) 733 + Alcotest.test_case "get_col int" `Quick test_get_col_int; 734 + Alcotest.test_case "get_col string" `Quick test_get_col_string; 735 + Alcotest.test_case "get_col missing" `Quick test_get_col_missing; 736 + Alcotest.test_case "get_col encode" `Quick test_get_col_encode; 737 + Alcotest.test_case "get_col col_names" `Quick test_get_col_col_names; 643 738 ] )