Declarative CSV codecs
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

feat(ocaml-csvt, ocaml-cdm): add declarative CSV codec and CDM libraries

ocaml-csvt: bidirectional CSV codec library following the jsont/sexpt
applicative builder pattern. Column codecs (string, int, float, bool,
nullable, option, custom), header-based column resolution, streaming
fold for large files. 33 unit tests + 9 Crowbar fuzz tests.

ocaml-cdm: Conjunction Data Message (CCSDS 508.0-B-1) built on csvt.
Parses TraCSS CA verification dataset CSV (45 columns) into structured
types with state vectors, covariance matrices, TCA, miss distance, Pc.
Verified on both real datasets (913K + 283K events).

+1330
+22
csvt.opam
··· 1 + opam-version: "2.0" 2 + synopsis: "Declarative CSV codecs" 3 + description: """ 4 + Bidirectional codec system for CSV files, inspired by Jsont's approach to JSON 5 + codecs. Define typed codecs for your OCaml types and use them to decode CSV 6 + rows and encode values back to CSV. 7 + """ 8 + maintainer: ["Thomas Gazagnaire <thomas@gazagnaire.org>"] 9 + authors: ["Thomas Gazagnaire <thomas@gazagnaire.org>"] 10 + license: "ISC" 11 + homepage: "https://tangled.org/gazagnaire.org/ocaml-csvt" 12 + bug-reports: "https://tangled.org/gazagnaire.org/ocaml-csvt/issues" 13 + dev-repo: "git+https://tangled.org/gazagnaire.org/ocaml-csvt.git" 14 + depends: [ 15 + "ocaml" {>= "4.14"} 16 + "dune" {>= "3.0"} 17 + "fmt" 18 + ] 19 + build: [ 20 + ["dune" "subst"] {dev} 21 + ["dune" "build" "-p" name "-j" jobs] 22 + ]
+4
dune-project
··· 1 + (lang dune 3.21) 2 + (name csvt) 3 + (source (tangled gazagnaire.org/ocaml-csvt)) 4 + (formatting (enabled_for ocaml))
+38
fuzz/dune
··· 1 + ; Crowbar fuzz testing for CSV codec roundtripping 2 + ; 3 + ; Quick check (runs tests with random inputs): 4 + ; dune build @fuzz 5 + ; 6 + ; With AFL instrumentation (use crow orchestrator): 7 + ; crow start --cpus=4 8 + 9 + (executable 10 + (name fuzz) 11 + (modules fuzz fuzz_csvt) 12 + (libraries csvt crowbar)) 13 + 14 + (executable 15 + (name gen_corpus) 16 + (modules gen_corpus) 17 + (libraries unix fmt)) 18 + 19 + (rule 20 + (alias runtest) 21 + (enabled_if 22 + (<> %{profile} afl)) 23 + (deps 24 + fuzz.exe 25 + (source_tree corpus)) 26 + (action 27 + (run %{exe:fuzz.exe}))) 28 + 29 + (rule 30 + (alias fuzz) 31 + (enabled_if 32 + (= %{profile} afl)) 33 + (deps 34 + (source_tree corpus) 35 + fuzz.exe 36 + gen_corpus.exe) 37 + (action 38 + (echo "AFL fuzzer built: %{exe:fuzz.exe}\n")))
+1
fuzz/fuzz.ml
··· 1 + let () = Crowbar.run "csvt" [ Fuzz_csvt.suite ]
+233
fuzz/fuzz_csvt.ml
··· 1 + (* Crowbar-based fuzz testing for Csvt *) 2 + 3 + open Crowbar 4 + 5 + (* {1 Generators} *) 6 + 7 + (* Generate a string that doesn't contain commas or newlines (valid CSV field) *) 8 + let csv_field_gen = 9 + map [ bytes ] (fun s -> 10 + String.to_seq s 11 + |> Seq.filter (fun c -> c <> ',' && c <> '\n' && c <> '\r') 12 + |> String.of_seq) 13 + 14 + (* Generate a valid float string *) 15 + let float_str_gen = 16 + map [ float ] (fun f -> 17 + if Float.is_nan f || Float.is_infinite f then "0.0" 18 + else string_of_float f) 19 + 20 + (* Generate a valid int string *) 21 + let int_str_gen = map [ int ] (fun i -> string_of_int i) 22 + 23 + (* Generate a valid bool string *) 24 + let bool_str_gen = 25 + map [ bool ] (fun b -> if b then "true" else "false") 26 + 27 + (* {1 Test codecs} *) 28 + 29 + let int_codec = Csvt.(Row.(obj Fun.id |> col "v" int ~enc:Fun.id |> finish)) 30 + 31 + let float_codec = 32 + Csvt.(Row.(obj Fun.id |> col "v" float ~enc:Fun.id |> finish)) 33 + 34 + let string_codec = 35 + Csvt.(Row.(obj Fun.id |> col "v" string ~enc:Fun.id |> finish)) 36 + 37 + let bool_codec = 38 + Csvt.(Row.(obj Fun.id |> col "v" bool ~enc:Fun.id |> finish)) 39 + 40 + let pair_codec = 41 + Csvt.( 42 + Row.( 43 + obj (fun a b -> (a, b)) 44 + |> col "a" int ~enc:fst 45 + |> col "b" string ~enc:snd 46 + |> finish)) 47 + 48 + let triple_codec = 49 + Csvt.( 50 + Row.( 51 + obj (fun a b c -> (a, b, c)) 52 + |> col "a" int ~enc:(fun (a, _, _) -> a) 53 + |> col "b" float ~enc:(fun (_, b, _) -> b) 54 + |> col "c" string ~enc:(fun (_, _, c) -> c) 55 + |> finish)) 56 + 57 + let nullable_codec = 58 + Csvt.( 59 + Row.(obj Fun.id |> col "v" nullable_float ~enc:Fun.id |> finish)) 60 + 61 + let option_codec = 62 + Csvt.( 63 + Row.(obj Fun.id |> col "v" (option int) ~enc:Fun.id |> finish)) 64 + 65 + (* {1 Crash safety: arbitrary bytes never cause a crash} *) 66 + 67 + let test_no_crash_on_arbitrary_input bytes = 68 + (* Decoding arbitrary bytes should either succeed or return Error, 69 + never crash *) 70 + ignore (Csvt.decode_string int_codec bytes); 71 + ignore (Csvt.decode_string float_codec bytes); 72 + ignore (Csvt.decode_string string_codec bytes); 73 + ignore (Csvt.decode_string bool_codec bytes); 74 + ignore (Csvt.decode_string pair_codec bytes); 75 + ignore (Csvt.decode_string nullable_codec bytes); 76 + ignore (Csvt.decode_string option_codec bytes) 77 + 78 + (* {1 Int roundtrip} *) 79 + 80 + let test_int_roundtrip i = 81 + let csv = "v\n" ^ string_of_int i ^ "\n" in 82 + match Csvt.decode_string int_codec csv with 83 + | Error e -> failf "int decode failed: %s" (Csvt.error_to_string e) 84 + | Ok [ v ] -> 85 + if v <> i then failf "int roundtrip: expected %d, got %d" i v 86 + | Ok vs -> failf "int roundtrip: expected 1 row, got %d" (List.length vs) 87 + 88 + (* {1 Float roundtrip} *) 89 + 90 + let test_float_roundtrip f = 91 + if Float.is_nan f || Float.is_infinite f then () 92 + else 93 + let csv = "v\n" ^ string_of_float f ^ "\n" in 94 + match Csvt.decode_string float_codec csv with 95 + | Error e -> failf "float decode failed: %s" (Csvt.error_to_string e) 96 + | Ok [ v ] -> 97 + (* Float roundtrip through string may lose precision *) 98 + let diff = Float.abs (f -. v) in 99 + let scale = Float.max 1.0 (Float.abs f) in 100 + if diff /. scale > 1e-10 then 101 + failf "float roundtrip: expected %g, got %g (diff %g)" f v diff 102 + | Ok vs -> failf "float roundtrip: expected 1 row, got %d" (List.length vs) 103 + 104 + (* {1 Bool roundtrip} *) 105 + 106 + let test_bool_roundtrip b = 107 + let csv = "v\n" ^ string_of_bool b ^ "\n" in 108 + match Csvt.decode_string bool_codec csv with 109 + | Error e -> failf "bool decode failed: %s" (Csvt.error_to_string e) 110 + | Ok [ v ] -> 111 + if v <> b then failf "bool roundtrip: expected %b, got %b" b v 112 + | Ok vs -> failf "bool roundtrip: expected 1 row, got %d" (List.length vs) 113 + 114 + (* {1 String roundtrip (no commas/newlines)} *) 115 + 116 + let test_string_roundtrip s = 117 + let s = 118 + String.to_seq s 119 + |> Seq.filter (fun c -> c <> ',' && c <> '\n' && c <> '\r') 120 + |> String.of_seq 121 + in 122 + if String.length s = 0 then () (* empty lines are skipped *) 123 + else 124 + let csv = "v\n" ^ s ^ "\n" in 125 + match Csvt.decode_string string_codec csv with 126 + | Error e -> failf "string decode failed: %s" (Csvt.error_to_string e) 127 + | Ok [ v ] -> 128 + if not (String.equal v s) then 129 + failf "string roundtrip: expected %S, got %S" s v 130 + | Ok vs -> 131 + failf "string roundtrip: expected 1 row, got %d" (List.length vs) 132 + 133 + (* {1 Pair roundtrip} *) 134 + 135 + let test_pair_roundtrip i s = 136 + let s = 137 + String.to_seq s 138 + |> Seq.filter (fun c -> c <> ',' && c <> '\n' && c <> '\r') 139 + |> String.of_seq 140 + in 141 + if String.length s = 0 then () 142 + else 143 + let csv = "a,b\n" ^ string_of_int i ^ "," ^ s ^ "\n" in 144 + match Csvt.decode_string pair_codec csv with 145 + | Error e -> failf "pair decode failed: %s" (Csvt.error_to_string e) 146 + | Ok [ (a, b) ] -> 147 + if a <> i then failf "pair.a: expected %d, got %d" i a; 148 + if not (String.equal b s) then 149 + failf "pair.b: expected %S, got %S" s b 150 + | Ok vs -> failf "pair roundtrip: expected 1 row, got %d" (List.length vs) 151 + 152 + (* {1 Column reorder invariance} *) 153 + 154 + let test_reorder_invariance i s = 155 + let s = 156 + String.to_seq s 157 + |> Seq.filter (fun c -> c <> ',' && c <> '\n' && c <> '\r') 158 + |> String.of_seq 159 + in 160 + if String.length s = 0 then () 161 + else 162 + let csv1 = "a,b\n" ^ string_of_int i ^ "," ^ s ^ "\n" in 163 + let csv2 = "b,a\n" ^ s ^ "," ^ string_of_int i ^ "\n" in 164 + match (Csvt.decode_string pair_codec csv1, Csvt.decode_string pair_codec csv2) with 165 + | Ok [ (a1, b1) ], Ok [ (a2, b2) ] -> 166 + if a1 <> a2 then failf "reorder.a: %d vs %d" a1 a2; 167 + if not (String.equal b1 b2) then 168 + failf "reorder.b: %S vs %S" b1 b2 169 + | Error e, _ | _, Error e -> 170 + failf "reorder decode failed: %s" (Csvt.error_to_string e) 171 + | _ -> failf "reorder: unexpected row count" 172 + 173 + (* {1 Multiple rows} *) 174 + 175 + let test_multi_row_int is = 176 + if List.length is = 0 then () 177 + else 178 + let buf = Buffer.create 256 in 179 + Buffer.add_string buf "v\n"; 180 + List.iter 181 + (fun i -> 182 + Buffer.add_string buf (string_of_int i); 183 + Buffer.add_char buf '\n') 184 + is; 185 + match Csvt.decode_string int_codec (Buffer.contents buf) with 186 + | Error e -> failf "multi-row decode failed: %s" (Csvt.error_to_string e) 187 + | Ok vs -> 188 + if List.length vs <> List.length is then 189 + failf "multi-row: expected %d rows, got %d" 190 + (List.length is) (List.length vs); 191 + List.iter2 192 + (fun expected got -> 193 + if expected <> got then 194 + failf "multi-row: expected %d, got %d" expected got) 195 + is vs 196 + 197 + (* {1 Nullable float roundtrip} *) 198 + 199 + let test_nullable_roundtrip f = 200 + let s = 201 + if Float.is_nan f || Float.is_infinite f then "NULL" 202 + else string_of_float f 203 + in 204 + let csv = "v\n" ^ s ^ "\n" in 205 + match Csvt.decode_string nullable_codec csv with 206 + | Error e -> failf "nullable decode failed: %s" (Csvt.error_to_string e) 207 + | Ok [ v ] -> 208 + if String.equal s "NULL" then ( 209 + if not (Float.is_nan v) then 210 + failf "nullable: expected nan, got %g" v) 211 + else 212 + let diff = Float.abs (f -. v) in 213 + let scale = Float.max 1.0 (Float.abs f) in 214 + if diff /. scale > 1e-10 then 215 + failf "nullable roundtrip: expected %g, got %g" f v 216 + | Ok vs -> 217 + failf "nullable roundtrip: expected 1 row, got %d" (List.length vs) 218 + 219 + (* {1 Test suite} *) 220 + 221 + let suite = 222 + ( "csvt", 223 + [ 224 + test_case "no crash on arbitrary" [ bytes ] test_no_crash_on_arbitrary_input; 225 + test_case "int roundtrip" [ int ] test_int_roundtrip; 226 + test_case "float roundtrip" [ float ] test_float_roundtrip; 227 + test_case "bool roundtrip" [ bool ] test_bool_roundtrip; 228 + test_case "string roundtrip" [ bytes ] test_string_roundtrip; 229 + test_case "pair roundtrip" [ int; bytes ] test_pair_roundtrip; 230 + test_case "column reorder" [ int; bytes ] test_reorder_invariance; 231 + test_case "multi-row int" [ list int ] test_multi_row_int; 232 + test_case "nullable roundtrip" [ float ] test_nullable_roundtrip; 233 + ] )
+1
fuzz/fuzz_csvt.mli
··· 1 + val suite : string * Crowbar.test_case list
+37
fuzz/gen_corpus.ml
··· 1 + (* Generate seed corpus for AFL fuzzing *) 2 + 3 + let () = 4 + let dir = "corpus" in 5 + (try Unix.mkdir dir 0o755 with Unix.Unix_error (Unix.EEXIST, _, _) -> ()); 6 + let write name content = 7 + let path = Filename.concat dir name in 8 + let oc = open_out path in 9 + output_string oc content; 10 + close_out oc 11 + in 12 + (* Simple valid CSVs *) 13 + write "simple_int" "v\n42\n"; 14 + write "simple_float" "v\n3.14\n"; 15 + write "simple_string" "v\nhello\n"; 16 + write "simple_bool" "v\ntrue\n"; 17 + write "two_cols" "a,b\n1,hello\n"; 18 + write "three_cols" "a,b,c\n1,2.5,test\n"; 19 + write "multi_row" "v\n1\n2\n3\n4\n5\n"; 20 + write "reordered" "b,a\nhello,42\n"; 21 + write "nullable" "v\nNULL\n1.5\nNULL\n"; 22 + write "empty_field" "a,b\nhello,\n"; 23 + write "extra_cols" "a,extra,b\n1,ignored,hello\n"; 24 + write "whitespace_header" " a , b \n1,hello\n"; 25 + (* Edge cases *) 26 + write "header_only" "a,b,c\n"; 27 + write "empty" ""; 28 + write "just_newlines" "\n\n\n"; 29 + write "bad_int" "v\nabc\n"; 30 + write "bad_float" "v\nxyz\n"; 31 + write "negative" "v\n-999\n"; 32 + write "scientific" "v\n1.5e-3\n"; 33 + write "zero" "v\n0\n"; 34 + write "large_int" "v\n2147483647\n"; 35 + write "bool_variants" "v\ntrue\nfalse\n1\n0\nyes\nno\n"; 36 + write "many_commas" "a,b,c,d,e,f\n1,2,3,4,5,6\n"; 37 + Format.printf "Generated %d corpus files in %s/@." 22 dir
+304
lib/csvt.ml
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Thomas Gazagnaire. All rights reserved. 3 + SPDX-License-Identifier: ISC 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Declarative CSV codecs. *) 7 + 8 + (* {1 Errors} *) 9 + 10 + type error = 11 + | Missing_header 12 + | Missing_column of string 13 + | Bad_value of { row : int; column : string; value : string; msg : string } 14 + | Truncated_row of { row : int; expected : int; got : int } 15 + | Encode_error of string 16 + 17 + let error_to_string = function 18 + | Missing_header -> "missing CSV header row" 19 + | Missing_column name -> "missing required column: " ^ name 20 + | Bad_value { row; column; value; msg } -> 21 + Printf.sprintf "row %d, column %s: bad value %S (%s)" row column value 22 + msg 23 + | Truncated_row { row; expected; got } -> 24 + Printf.sprintf "row %d: expected %d columns, got %d" row expected got 25 + | Encode_error msg -> "encode error: " ^ msg 26 + 27 + let pp_error ppf e = Format.pp_print_string ppf (error_to_string e) 28 + 29 + let ( let* ) = Result.bind 30 + 31 + (* {1 Column codecs} *) 32 + 33 + type 'a col_codec = { 34 + kind : string; 35 + dec : string -> ('a, string) result; 36 + enc : 'a -> string; 37 + } 38 + 39 + let col_kind c = c.kind 40 + 41 + let col_map ?(kind = "custom") ~dec ~enc () = { kind; dec; enc } 42 + 43 + let string = { kind = "string"; dec = (fun s -> Ok s); enc = Fun.id } 44 + 45 + let int = 46 + { 47 + kind = "int"; 48 + dec = 49 + (fun s -> 50 + match int_of_string_opt s with 51 + | Some i -> Ok i 52 + | None -> Error "not an integer"); 53 + enc = string_of_int; 54 + } 55 + 56 + let float = 57 + { 58 + kind = "float"; 59 + dec = 60 + (fun s -> 61 + match float_of_string_opt s with 62 + | Some f -> Ok f 63 + | None -> Error "not a float"); 64 + enc = string_of_float; 65 + } 66 + 67 + let bool = 68 + { 69 + kind = "bool"; 70 + dec = 71 + (fun s -> 72 + match String.lowercase_ascii s with 73 + | "true" | "1" | "yes" -> Ok true 74 + | "false" | "0" | "no" -> Ok false 75 + | _ -> Error "not a boolean"); 76 + enc = string_of_bool; 77 + } 78 + 79 + let is_null s = String.equal s "NULL" || String.equal s "" 80 + 81 + let nullable_float = 82 + { 83 + kind = "nullable_float"; 84 + dec = 85 + (fun s -> 86 + if is_null s then Ok Float.nan 87 + else 88 + match float_of_string_opt s with 89 + | Some f -> Ok f 90 + | None -> Error "not a float"); 91 + enc = (fun f -> if Float.is_nan f then "NULL" else string_of_float f); 92 + } 93 + 94 + let nullable_int = 95 + { 96 + kind = "nullable_int"; 97 + dec = 98 + (fun s -> 99 + if is_null s then Ok (-1) 100 + else 101 + match int_of_string_opt s with 102 + | Some i -> Ok i 103 + | None -> Error "not an integer"); 104 + enc = (fun i -> if i = -1 then "NULL" else string_of_int i); 105 + } 106 + 107 + let option c = 108 + { 109 + kind = "option(" ^ c.kind ^ ")"; 110 + dec = 111 + (fun s -> 112 + if is_null s then Ok None else Result.map Option.some (c.dec s)); 113 + enc = (fun v -> match v with None -> "NULL" | Some x -> c.enc x); 114 + } 115 + 116 + (* {1 Row codec internals} 117 + 118 + The applicative builder pattern requires threading a partially-applied 119 + constructor through heterogeneous column decoders. We use Obj.repr/obj 120 + for the intermediate steps, which is safe because the builder pattern 121 + at the type level guarantees that the constructor arity matches the 122 + number of col calls before finish. 123 + 124 + This is the same technique used by cmdliner, jsont, and other OCaml 125 + combinator libraries. *) 126 + 127 + type col_entry = { 128 + name : string; 129 + decode : string -> (Obj.t, string) result; 130 + absent : Obj.t option; 131 + encode : Obj.t -> string; 132 + } 133 + 134 + type 'a t = { 135 + kind : string; 136 + cols : col_entry list; 137 + ctor : Obj.t; 138 + } 139 + 140 + (* {1 Row builder} *) 141 + 142 + module Row = struct 143 + type 'a codec = 'a t 144 + 145 + type ('o, 'dec) map = { 146 + m_kind : string; 147 + m_cols : col_entry list; (* reversed *) 148 + m_ctor : Obj.t; 149 + } 150 + 151 + let obj ?(kind = "row") dec = 152 + { m_kind = kind; m_cols = []; m_ctor = Obj.repr dec } 153 + 154 + let col (type a) ?doc:_ ?dec_absent ?enc name (cc : a col_codec) m = 155 + let entry = 156 + { 157 + name; 158 + decode = (fun s -> Result.map Obj.repr (cc.dec s)); 159 + absent = Option.map Obj.repr dec_absent; 160 + encode = 161 + (match enc with 162 + | Some f -> fun o -> cc.enc (f (Obj.obj o)) 163 + | None -> fun _ -> ""); 164 + } 165 + in 166 + { m_kind = m.m_kind; m_cols = entry :: m.m_cols; m_ctor = m.m_ctor } 167 + 168 + let finish m : _ t = 169 + { kind = m.m_kind; cols = List.rev m.m_cols; ctor = m.m_ctor } 170 + end 171 + 172 + (* {1 Header resolution} *) 173 + 174 + type header = string array 175 + 176 + type 'a resolved = { 177 + indices : int array; 178 + rcols : col_entry array; 179 + rctor : Obj.t; 180 + } 181 + 182 + let find_col header name = 183 + let name = String.trim name in 184 + let n = Array.length header in 185 + let rec go i = 186 + if i >= n then -1 187 + else if String.equal (String.trim header.(i)) name then i 188 + else go (i + 1) 189 + in 190 + go 0 191 + 192 + let resolve codec header = 193 + let cols = Array.of_list codec.cols in 194 + let n = Array.length cols in 195 + let indices = Array.make n (-1) in 196 + let rec go i = 197 + if i >= n then Ok { indices; rcols = cols; rctor = codec.ctor } 198 + else 199 + let idx = find_col header cols.(i).name in 200 + indices.(i) <- idx; 201 + if idx < 0 && Option.is_none cols.(i).absent then 202 + Error (Missing_column cols.(i).name) 203 + else go (i + 1) 204 + in 205 + go 0 206 + 207 + (* {1 Row decoding} *) 208 + 209 + let apply f v = Obj.repr ((Obj.obj f : Obj.t -> Obj.t) v) 210 + 211 + let decode_row resolved row_num fields = 212 + let n = Array.length resolved.rcols in 213 + let nf = Array.length fields in 214 + let rec go i f = 215 + if i >= n then Ok (Obj.obj f : 'a) 216 + else 217 + let col = resolved.rcols.(i) in 218 + let idx = resolved.indices.(i) in 219 + if idx < 0 then 220 + match col.absent with 221 + | Some v -> go (i + 1) (apply f v) 222 + | None -> Error (Missing_column col.name) 223 + else if idx >= nf then 224 + Error (Truncated_row { row = row_num; expected = idx + 1; got = nf }) 225 + else 226 + let s = fields.(idx) in 227 + match col.decode s with 228 + | Ok v -> go (i + 1) (apply f v) 229 + | Error msg -> 230 + Error 231 + (Bad_value 232 + { row = row_num; column = col.name; value = s; msg }) 233 + in 234 + go 0 resolved.rctor 235 + 236 + (* {1 Row encoding} *) 237 + 238 + let encode_header codec = 239 + Array.of_list (List.map (fun c -> c.name) codec.cols) 240 + 241 + let encode_row codec v = 242 + let o = Obj.repr v in 243 + Array.of_list (List.map (fun c -> c.encode o) codec.cols) 244 + 245 + (* {1 CSV splitting} *) 246 + 247 + let split_csv_line line = 248 + let fields = String.split_on_char ',' line in 249 + Array.of_list fields 250 + 251 + let parse_header line = 252 + let h = split_csv_line line in 253 + if Array.length h = 0 then Error Missing_header else Ok h 254 + 255 + (* {1 File operations} *) 256 + 257 + let fold_channel codec ic f acc = 258 + let line = try Some (input_line ic) with End_of_file -> None in 259 + match line with 260 + | None -> Error Missing_header 261 + | Some header_line -> 262 + let* header = parse_header header_line in 263 + let* resolved = resolve codec header in 264 + let rec go acc row = 265 + match input_line ic with 266 + | exception End_of_file -> Ok acc 267 + | "" -> go acc row 268 + | line -> 269 + let fields = split_csv_line line in 270 + let* v = decode_row resolved row fields in 271 + go (f acc v) (row + 1) 272 + in 273 + go acc 2 274 + 275 + let decode_channel codec ic = 276 + let* rev_list = fold_channel codec ic (fun acc v -> v :: acc) [] in 277 + Ok (List.rev rev_list) 278 + 279 + let fold_file codec path f acc = 280 + let ic = open_in path in 281 + Fun.protect ~finally:(fun () -> close_in ic) (fun () -> 282 + fold_channel codec ic f acc) 283 + 284 + let decode_file codec path = 285 + let ic = open_in path in 286 + Fun.protect ~finally:(fun () -> close_in ic) (fun () -> 287 + decode_channel codec ic) 288 + 289 + let decode_string codec s = 290 + let lines = String.split_on_char '\n' s in 291 + match lines with 292 + | [] | [ "" ] -> Error Missing_header 293 + | header_line :: rest -> 294 + let* header = parse_header header_line in 295 + let* resolved = resolve codec header in 296 + let rec go acc row = function 297 + | [] -> Ok (List.rev acc) 298 + | "" :: tl -> go acc row tl 299 + | line :: tl -> 300 + let fields = split_csv_line line in 301 + let* v = decode_row resolved row fields in 302 + go (v :: acc) (row + 1) tl 303 + in 304 + go [] 2 rest
+175
lib/csvt.mli
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Thomas Gazagnaire. All rights reserved. 3 + SPDX-License-Identifier: ISC 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Declarative CSV codecs. 7 + 8 + Csvt provides a bidirectional codec system for CSV files, inspired by 9 + {{:https://erratique.ch/software/jsont}Jsont}'s approach to JSON codecs. 10 + 11 + {2 Quick Start} 12 + 13 + Define a codec for your OCaml types: 14 + {v 15 + type point = { x : float; y : float; label : string } 16 + 17 + let point_codec = 18 + Csvt.(Row.( 19 + obj (fun x y label -> { x; y; label }) 20 + |> col "x" float ~enc:(fun p -> p.x) 21 + |> col "y" float ~enc:(fun p -> p.y) 22 + |> col "label" string ~enc:(fun p -> p.label) 23 + |> finish 24 + )) 25 + v} 26 + 27 + Decode from a file: 28 + {v 29 + let () = 30 + match Csvt.decode_file point_codec "points.csv" with 31 + | Ok points -> List.iter (fun p -> Printf.printf "%s\n" p.label) points 32 + | Error e -> prerr_endline (Csvt.error_to_string e) 33 + v} *) 34 + 35 + (** {1:errors Errors} *) 36 + 37 + type error = 38 + | Missing_header 39 + | Missing_column of string 40 + | Bad_value of { row : int; column : string; value : string; msg : string } 41 + | Truncated_row of { row : int; expected : int; got : int } 42 + | Encode_error of string 43 + 44 + val error_to_string : error -> string 45 + (** [error_to_string e] returns a human-readable error message. *) 46 + 47 + val pp_error : error Fmt.t 48 + (** [pp_error] pretty-prints an error. *) 49 + 50 + (** {1:codec Column Codecs} *) 51 + 52 + type 'a col_codec 53 + (** The type of column-level codecs. A column codec knows how to convert 54 + between a single CSV field (string) and a typed OCaml value. *) 55 + 56 + val string : string col_codec 57 + (** Codec for string fields (identity). *) 58 + 59 + val int : int col_codec 60 + (** Codec for integer fields. *) 61 + 62 + val float : float col_codec 63 + (** Codec for floating-point fields. *) 64 + 65 + val bool : bool col_codec 66 + (** [bool] decodes "true"/"1" as [true], "false"/"0" as [false]. *) 67 + 68 + val nullable_float : float col_codec 69 + (** Like {!float} but treats "NULL" and empty strings as [nan]. *) 70 + 71 + val nullable_int : int col_codec 72 + (** Like {!int} but treats "NULL" and empty strings as [-1]. *) 73 + 74 + val option : 'a col_codec -> 'a option col_codec 75 + (** [option c] treats "NULL" and empty strings as [None]. *) 76 + 77 + val col_map : 78 + ?kind:string -> 79 + dec:(string -> ('a, string) result) -> 80 + enc:('a -> string) -> 81 + unit -> 82 + 'a col_codec 83 + (** [col_map ~dec ~enc] creates a column codec from raw decode/encode 84 + functions. *) 85 + 86 + val col_kind : 'a col_codec -> string 87 + (** [col_kind c] returns the kind description of column codec [c]. *) 88 + 89 + (** {1:row Row Codecs} *) 90 + 91 + type 'a t 92 + (** The type of row-level codecs. A row codec maps between a CSV row 93 + (resolved through header column names) and a typed OCaml record. *) 94 + 95 + (** Row codec builder. *) 96 + module Row : sig 97 + type 'a codec = 'a t 98 + 99 + type ('o, 'dec) map 100 + (** Builder state for a row codec. ['o] is the record type, ['dec] is the 101 + remaining decoder function type. *) 102 + 103 + val obj : ?kind:string -> 'dec -> ('o, 'dec) map 104 + (** [obj f] starts building a row codec with decoder function [f]. 105 + 106 + {v 107 + obj (fun a b c -> { a; b; c }) 108 + |> col "a" string ~enc:(fun r -> r.a) 109 + |> col "b" int ~enc:(fun r -> r.b) 110 + |> col "c" float ~enc:(fun r -> r.c) 111 + |> finish 112 + v} *) 113 + 114 + val col : 115 + ?doc:string -> 116 + ?dec_absent:'a -> 117 + ?enc:('o -> 'a) -> 118 + string -> 119 + 'a col_codec -> 120 + ('o, 'a -> 'dec) map -> 121 + ('o, 'dec) map 122 + (** [col name codec m] adds a column to the row builder. 123 + 124 + @param name The CSV column header name. 125 + @param codec The codec for the column value. 126 + @param doc Documentation string. 127 + @param dec_absent Default value if the column is missing from the header. 128 + @param enc Extractor function for encoding. *) 129 + 130 + val finish : ('o, 'o) map -> 'o codec 131 + (** [finish m] completes the row codec. *) 132 + end 133 + 134 + (** {1:header Header Resolution} *) 135 + 136 + type header = string array 137 + (** A CSV header: array of column names. *) 138 + 139 + type 'a resolved 140 + (** A codec resolved against a specific header. Column name lookups are 141 + performed once, giving O(1) per-row decoding. *) 142 + 143 + val resolve : 'a t -> header -> ('a resolved, error) result 144 + (** [resolve codec header] resolves column names to indices. 145 + Returns an error if a required column is missing. *) 146 + 147 + val decode_row : 'a resolved -> int -> string array -> ('a, error) result 148 + (** [decode_row resolved row_num fields] decodes a single CSV row. 149 + [row_num] is for error reporting (1-based). *) 150 + 151 + val encode_header : 'a t -> header 152 + (** [encode_header codec] returns the CSV header for encoding. *) 153 + 154 + val encode_row : 'a t -> 'a -> string array 155 + (** [encode_row codec v] encodes a value as a CSV row. *) 156 + 157 + (** {1:file File Operations} *) 158 + 159 + val decode_channel : 'a t -> in_channel -> ('a list, error) result 160 + (** [decode_channel codec ic] decodes all rows from a CSV input channel. *) 161 + 162 + val decode_file : 'a t -> string -> ('a list, error) result 163 + (** [decode_file codec path] decodes all rows from a CSV file. *) 164 + 165 + val fold_channel : 166 + 'a t -> in_channel -> ('b -> 'a -> 'b) -> 'b -> ('b, error) result 167 + (** [fold_channel codec ic f acc] folds over rows without building an 168 + intermediate list. Useful for large files. *) 169 + 170 + val fold_file : 171 + 'a t -> string -> ('b -> 'a -> 'b) -> 'b -> ('b, error) result 172 + (** [fold_file codec path f acc] folds over rows from a file. *) 173 + 174 + val decode_string : 'a t -> string -> ('a list, error) result 175 + (** [decode_string codec s] decodes all rows from a CSV string. *)
+4
lib/dune
··· 1 + (library 2 + (name csvt) 3 + (public_name csvt) 4 + (libraries fmt))
+3
test/dune
··· 1 + (test 2 + (name test) 3 + (libraries csvt alcotest fmt))
+1
test/test.ml
··· 1 + let () = Alcotest.run "csvt" [ Test_csvt.suite ]
+506
test/test_csvt.ml
··· 1 + (* Comprehensive tests for the Csvt library. *) 2 + 3 + (* {1 Test helpers} *) 4 + 5 + let check_error msg expected_kind r = 6 + match r with 7 + | Ok _ -> Alcotest.failf "%s: expected error, got Ok" msg 8 + | Error e -> 9 + if not (expected_kind e) then 10 + Alcotest.failf "%s: unexpected error: %s" msg (Csvt.error_to_string e) 11 + 12 + (* {1 Simple codecs for testing} *) 13 + 14 + type point = { x : float; y : float; label : string } 15 + 16 + let point_codec = 17 + Csvt.( 18 + Row.( 19 + obj (fun x y label -> { x; y; label }) 20 + |> col "x" float ~enc:(fun p -> p.x) 21 + |> col "y" float ~enc:(fun p -> p.y) 22 + |> col "label" string ~enc:(fun p -> p.label) 23 + |> finish)) 24 + 25 + type record = { id : int; name : string; score : float; active : bool } 26 + 27 + let record_codec = 28 + Csvt.( 29 + Row.( 30 + obj (fun id name score active -> { id; name; score; active }) 31 + |> col "id" int ~enc:(fun r -> r.id) 32 + |> col "name" string ~enc:(fun r -> r.name) 33 + |> col "score" float ~enc:(fun r -> r.score) 34 + |> col "active" bool ~enc:(fun r -> r.active) 35 + |> finish)) 36 + 37 + (* {1 Basic decode tests} *) 38 + 39 + let test_decode_simple () = 40 + let csv = "x,y,label\n1.0,2.0,origin\n3.5,4.5,target\n" in 41 + match Csvt.decode_string point_codec csv with 42 + | Error e -> Alcotest.failf "decode failed: %s" (Csvt.error_to_string e) 43 + | Ok points -> 44 + Alcotest.(check int) "count" 2 (List.length points); 45 + let p1 = List.nth points 0 in 46 + Alcotest.(check (float 1e-6)) "p1.x" 1.0 p1.x; 47 + Alcotest.(check (float 1e-6)) "p1.y" 2.0 p1.y; 48 + Alcotest.(check string) "p1.label" "origin" p1.label; 49 + let p2 = List.nth points 1 in 50 + Alcotest.(check (float 1e-6)) "p2.x" 3.5 p2.x; 51 + Alcotest.(check (float 1e-6)) "p2.y" 4.5 p2.y; 52 + Alcotest.(check string) "p2.label" "target" p2.label 53 + 54 + let test_single_row () = 55 + let csv = "x,y,label\n42.0,99.0,only\n" in 56 + match Csvt.decode_string point_codec csv with 57 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 58 + | Ok [ p ] -> 59 + Alcotest.(check (float 1e-6)) "x" 42.0 p.x; 60 + Alcotest.(check string) "label" "only" p.label 61 + | Ok ps -> Alcotest.failf "expected 1 row, got %d" (List.length ps) 62 + 63 + let test_multi_type_record () = 64 + let csv = "id,name,score,active\n1,alice,95.5,true\n2,bob,87.3,false\n" in 65 + match Csvt.decode_string record_codec csv with 66 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 67 + | Ok records -> 68 + Alcotest.(check int) "count" 2 (List.length records); 69 + let r1 = List.nth records 0 in 70 + Alcotest.(check int) "r1.id" 1 r1.id; 71 + Alcotest.(check string) "r1.name" "alice" r1.name; 72 + Alcotest.(check (float 1e-6)) "r1.score" 95.5 r1.score; 73 + Alcotest.(check bool) "r1.active" true r1.active; 74 + let r2 = List.nth records 1 in 75 + Alcotest.(check int) "r2.id" 2 r2.id; 76 + Alcotest.(check bool) "r2.active" false r2.active 77 + 78 + (* {1 Column reordering tests} *) 79 + 80 + let test_column_reorder () = 81 + let csv = "label,y,x\nhello,10.0,20.0\n" in 82 + match Csvt.decode_string point_codec csv with 83 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 84 + | Ok [ p ] -> 85 + Alcotest.(check (float 1e-6)) "x" 20.0 p.x; 86 + Alcotest.(check (float 1e-6)) "y" 10.0 p.y; 87 + Alcotest.(check string) "label" "hello" p.label 88 + | Ok ps -> Alcotest.failf "expected 1 row, got %d" (List.length ps) 89 + 90 + let test_extra_columns () = 91 + (* CSV has extra columns not in the codec — should be ignored *) 92 + let csv = "x,extra1,y,extra2,label\n1.0,foo,2.0,bar,test\n" in 93 + match Csvt.decode_string point_codec csv with 94 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 95 + | Ok [ p ] -> 96 + Alcotest.(check (float 1e-6)) "x" 1.0 p.x; 97 + Alcotest.(check (float 1e-6)) "y" 2.0 p.y; 98 + Alcotest.(check string) "label" "test" p.label 99 + | Ok ps -> Alcotest.failf "expected 1 row, got %d" (List.length ps) 100 + 101 + (* {1 Error handling tests} *) 102 + 103 + let test_missing_column () = 104 + let csv = "x,label\n1.0,foo\n" in 105 + check_error "missing y" (function Csvt.Missing_column "y" -> true | _ -> false) 106 + (Csvt.decode_string point_codec csv) 107 + 108 + let test_bad_float () = 109 + let csv = "x,y,label\nnot_a_number,2.0,foo\n" in 110 + check_error "bad float" (function Csvt.Bad_value { column = "x"; _ } -> true | _ -> false) 111 + (Csvt.decode_string point_codec csv) 112 + 113 + let test_bad_int () = 114 + let csv = "id,name,score,active\nabc,alice,95.5,true\n" in 115 + check_error "bad int" (function Csvt.Bad_value { column = "id"; _ } -> true | _ -> false) 116 + (Csvt.decode_string record_codec csv) 117 + 118 + let test_bad_bool () = 119 + let csv = "id,name,score,active\n1,alice,95.5,maybe\n" in 120 + check_error "bad bool" (function Csvt.Bad_value { column = "active"; _ } -> true | _ -> false) 121 + (Csvt.decode_string record_codec csv) 122 + 123 + let test_empty_csv () = 124 + check_error "empty" (function Csvt.Missing_header -> true | _ -> false) 125 + (Csvt.decode_string point_codec "") 126 + 127 + let test_header_only () = 128 + match Csvt.decode_string point_codec "x,y,label\n" with 129 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 130 + | Ok [] -> () 131 + | Ok ps -> Alcotest.failf "expected 0 rows, got %d" (List.length ps) 132 + 133 + let test_truncated_row () = 134 + let csv = "x,y,label\n1.0\n" in 135 + check_error "truncated" 136 + (function Csvt.Truncated_row _ | Csvt.Bad_value _ -> true | _ -> false) 137 + (Csvt.decode_string point_codec csv) 138 + 139 + (* {1 Nullable and option codecs} *) 140 + 141 + let test_nullable_float () = 142 + let codec = 143 + Csvt.(Row.(obj Fun.id |> col "v" nullable_float ~enc:Fun.id |> finish)) 144 + in 145 + match Csvt.decode_string codec "v\nNULL\n1.5\n" with 146 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 147 + | Ok vals -> 148 + Alcotest.(check int) "count" 2 (List.length vals); 149 + Alcotest.(check bool) "null is nan" true (Float.is_nan (List.nth vals 0)); 150 + Alcotest.(check (float 1e-6)) "1.5" 1.5 (List.nth vals 1) 151 + 152 + let test_nullable_float_empty () = 153 + let codec = 154 + Csvt.(Row.(obj Fun.id |> col "v" nullable_float ~enc:Fun.id |> finish)) 155 + in 156 + (* Empty string should also be treated as NULL *) 157 + match Csvt.decode_string codec "v\n\n3.0\n" with 158 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 159 + | Ok vals -> 160 + Alcotest.(check int) "count" 1 (List.length vals); 161 + (* Empty lines are skipped, so only 3.0 *) 162 + Alcotest.(check (float 1e-6)) "3.0" 3.0 (List.nth vals 0) 163 + 164 + let test_nullable_int () = 165 + let codec = 166 + Csvt.(Row.(obj Fun.id |> col "v" nullable_int ~enc:Fun.id |> finish)) 167 + in 168 + match Csvt.decode_string codec "v\nNULL\n42\n" with 169 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 170 + | Ok vals -> 171 + Alcotest.(check int) "count" 2 (List.length vals); 172 + Alcotest.(check int) "null" (-1) (List.nth vals 0); 173 + Alcotest.(check int) "42" 42 (List.nth vals 1) 174 + 175 + let test_option_int () = 176 + let codec = 177 + Csvt.(Row.(obj Fun.id |> col "v" (option int) ~enc:Fun.id |> finish)) 178 + in 179 + match Csvt.decode_string codec "v\nNULL\n42\n\n" with 180 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 181 + | Ok vals -> 182 + Alcotest.(check int) "count" 2 (List.length vals); 183 + Alcotest.(check (option int)) "null" None (List.nth vals 0); 184 + Alcotest.(check (option int)) "42" (Some 42) (List.nth vals 1) 185 + 186 + let test_option_string () = 187 + let codec = 188 + Csvt.(Row.(obj Fun.id |> col "v" (option string) ~enc:Fun.id |> finish)) 189 + in 190 + match Csvt.decode_string codec "v\nhello\nNULL\n" with 191 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 192 + | Ok vals -> 193 + Alcotest.(check int) "count" 2 (List.length vals); 194 + Alcotest.(check (option string)) "hello" (Some "hello") (List.nth vals 0); 195 + Alcotest.(check (option string)) "null" None (List.nth vals 1) 196 + 197 + (* {1 Default values (dec_absent)} *) 198 + 199 + let test_dec_absent () = 200 + let codec = 201 + Csvt.( 202 + Row.( 203 + obj (fun x y -> (x, y)) 204 + |> col "x" float ~enc:fst 205 + |> col "y" float ~enc:snd ~dec_absent:0.0 206 + |> finish)) 207 + in 208 + match Csvt.decode_string codec "x\n5.0\n" with 209 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 210 + | Ok [ (x, y) ] -> 211 + Alcotest.(check (float 1e-6)) "x" 5.0 x; 212 + Alcotest.(check (float 1e-6)) "y default" 0.0 y 213 + | Ok ps -> Alcotest.failf "expected 1 row, got %d" (List.length ps) 214 + 215 + let test_dec_absent_all_missing () = 216 + let codec = 217 + Csvt.( 218 + Row.( 219 + obj (fun a b -> (a, b)) 220 + |> col "a" int ~enc:fst ~dec_absent:0 221 + |> col "b" string ~enc:snd ~dec_absent:"default" 222 + |> finish)) 223 + in 224 + match Csvt.decode_string codec "unrelated\nfoo\n" with 225 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 226 + | Ok [ (a, b) ] -> 227 + Alcotest.(check int) "a default" 0 a; 228 + Alcotest.(check string) "b default" "default" b 229 + | Ok ps -> Alcotest.failf "expected 1 row, got %d" (List.length ps) 230 + 231 + (* {1 Encoding tests} *) 232 + 233 + let test_encode_header () = 234 + let h = Csvt.encode_header point_codec in 235 + Alcotest.(check int) "ncols" 3 (Array.length h); 236 + Alcotest.(check string) "col0" "x" h.(0); 237 + Alcotest.(check string) "col1" "y" h.(1); 238 + Alcotest.(check string) "col2" "label" h.(2) 239 + 240 + let test_encode_row () = 241 + let row = Csvt.encode_row point_codec { x = 1.5; y = 2.5; label = "test" } in 242 + Alcotest.(check int) "ncols" 3 (Array.length row); 243 + Alcotest.(check string) "label" "test" row.(2) 244 + 245 + let test_encode_record () = 246 + let row = 247 + Csvt.encode_row record_codec 248 + { id = 42; name = "bob"; score = 99.9; active = true } 249 + in 250 + Alcotest.(check string) "id" "42" row.(0); 251 + Alcotest.(check string) "name" "bob" row.(1); 252 + Alcotest.(check string) "active" "true" row.(3) 253 + 254 + (* {1 Bool codec edge cases} *) 255 + 256 + let test_bool_variants () = 257 + let codec = 258 + Csvt.(Row.(obj Fun.id |> col "v" bool ~enc:Fun.id |> finish)) 259 + in 260 + let check input expected = 261 + match Csvt.decode_string codec ("v\n" ^ input ^ "\n") with 262 + | Error e -> Alcotest.failf "%s: %s" input (Csvt.error_to_string e) 263 + | Ok [ v ] -> Alcotest.(check bool) input expected v 264 + | Ok _ -> Alcotest.failf "%s: wrong count" input 265 + in 266 + check "true" true; 267 + check "false" false; 268 + check "1" true; 269 + check "0" false; 270 + check "yes" true; 271 + check "no" false; 272 + check "TRUE" true; 273 + check "FALSE" false; 274 + check "Yes" true; 275 + check "No" false 276 + 277 + (* {1 Float edge cases} *) 278 + 279 + let test_float_edge_cases () = 280 + let codec = 281 + Csvt.(Row.(obj Fun.id |> col "v" float ~enc:Fun.id |> finish)) 282 + in 283 + let check input expected = 284 + match Csvt.decode_string codec ("v\n" ^ input ^ "\n") with 285 + | Error e -> Alcotest.failf "%s: %s" input (Csvt.error_to_string e) 286 + | Ok [ v ] -> Alcotest.(check (float 1e-10)) input expected v 287 + | Ok _ -> Alcotest.failf "%s: wrong count" input 288 + in 289 + check "0" 0.0; 290 + check "0.0" 0.0; 291 + check "-0.0" (-0.0); 292 + check "1e10" 1e10; 293 + check "1.5e-3" 1.5e-3; 294 + check "-999.999" (-999.999); 295 + check "inf" infinity; 296 + check "-inf" neg_infinity; 297 + check "nan" Float.nan (* OCaml float_of_string handles this *) 298 + 299 + let test_float_nan_check () = 300 + let codec = 301 + Csvt.(Row.(obj Fun.id |> col "v" float ~enc:Fun.id |> finish)) 302 + in 303 + match Csvt.decode_string codec "v\nnan\n" with 304 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 305 + | Ok [ v ] -> Alcotest.(check bool) "is nan" true (Float.is_nan v) 306 + | Ok _ -> Alcotest.fail "wrong count" 307 + 308 + (* {1 Int edge cases} *) 309 + 310 + let test_int_edge_cases () = 311 + let codec = 312 + Csvt.(Row.(obj Fun.id |> col "v" int ~enc:Fun.id |> finish)) 313 + in 314 + let check input expected = 315 + match Csvt.decode_string codec ("v\n" ^ input ^ "\n") with 316 + | Error e -> Alcotest.failf "%s: %s" input (Csvt.error_to_string e) 317 + | Ok [ v ] -> Alcotest.(check int) input expected v 318 + | Ok _ -> Alcotest.failf "%s: wrong count" input 319 + in 320 + check "0" 0; 321 + check "-1" (-1); 322 + check "2147483647" 2147483647; 323 + check "-2147483648" (-2147483648) 324 + 325 + (* {1 String edge cases} *) 326 + 327 + let test_string_with_spaces () = 328 + let codec = 329 + Csvt.(Row.(obj Fun.id |> col "v" string ~enc:Fun.id |> finish)) 330 + in 331 + match Csvt.decode_string codec "v\nhello world\n" with 332 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 333 + | Ok [ v ] -> Alcotest.(check string) "spaces" "hello world" v 334 + | Ok _ -> Alcotest.fail "wrong count" 335 + 336 + let test_string_empty_field () = 337 + (* Two-column CSV where one field is empty *) 338 + let codec = 339 + Csvt.( 340 + Row.( 341 + obj (fun a b -> (a, b)) 342 + |> col "a" string ~enc:fst 343 + |> col "b" string ~enc:snd 344 + |> finish)) 345 + in 346 + match Csvt.decode_string codec "a,b\nhello,\n" with 347 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 348 + | Ok [ (a, b) ] -> 349 + Alcotest.(check string) "a" "hello" a; 350 + Alcotest.(check string) "b empty" "" b 351 + | Ok ps -> Alcotest.failf "expected 1 row, got %d" (List.length ps) 352 + 353 + (* {1 Many rows} *) 354 + 355 + let test_many_rows () = 356 + let codec = 357 + Csvt.(Row.(obj Fun.id |> col "v" int ~enc:Fun.id |> finish)) 358 + in 359 + let buf = Buffer.create 1024 in 360 + Buffer.add_string buf "v\n"; 361 + for i = 0 to 999 do 362 + Buffer.add_string buf (string_of_int i); 363 + Buffer.add_char buf '\n' 364 + done; 365 + match Csvt.decode_string codec (Buffer.contents buf) with 366 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 367 + | Ok vals -> 368 + Alcotest.(check int) "count" 1000 (List.length vals); 369 + Alcotest.(check int) "first" 0 (List.hd vals); 370 + Alcotest.(check int) "last" 999 (List.nth vals 999) 371 + 372 + (* {1 Fold tests} *) 373 + 374 + let test_fold_string () = 375 + let csv = "x,y,label\n1.0,2.0,a\n3.0,4.0,b\n5.0,6.0,c\n" in 376 + match Csvt.decode_string point_codec csv with 377 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 378 + | Ok points -> 379 + let sum = List.fold_left (fun acc p -> acc +. p.x +. p.y) 0.0 points in 380 + Alcotest.(check (float 1e-6)) "sum" 21.0 sum 381 + 382 + (* {1 Whitespace in headers} *) 383 + 384 + let test_header_whitespace () = 385 + (* Headers with trailing/leading spaces should still match *) 386 + let csv = " x , y , label \n1.0,2.0,test\n" in 387 + match Csvt.decode_string point_codec csv with 388 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 389 + | Ok [ p ] -> 390 + Alcotest.(check (float 1e-6)) "x" 1.0 p.x; 391 + Alcotest.(check string) "label" "test" p.label 392 + | Ok ps -> Alcotest.failf "expected 1 row, got %d" (List.length ps) 393 + 394 + (* {1 col_map custom codec} *) 395 + 396 + let test_col_map () = 397 + let hex_codec = 398 + Csvt.col_map ~kind:"hex" 399 + ~dec:(fun s -> 400 + match int_of_string_opt ("0x" ^ s) with 401 + | Some i -> Ok i 402 + | None -> Error "not hex") 403 + ~enc:(fun i -> Printf.sprintf "%x" i) 404 + () 405 + in 406 + let codec = 407 + Csvt.(Row.(obj Fun.id |> col "v" hex_codec ~enc:Fun.id |> finish)) 408 + in 409 + match Csvt.decode_string codec "v\nff\n1a\n" with 410 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 411 + | Ok vals -> 412 + Alcotest.(check int) "count" 2 (List.length vals); 413 + Alcotest.(check int) "0xff" 255 (List.nth vals 0); 414 + Alcotest.(check int) "0x1a" 26 (List.nth vals 1) 415 + 416 + (* {1 Large field count} *) 417 + 418 + let test_wide_row () = 419 + (* 20 columns *) 420 + let codec = 421 + Csvt.( 422 + Row.( 423 + obj (fun c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 c16 424 + c17 c18 c19 -> 425 + [| c0; c1; c2; c3; c4; c5; c6; c7; c8; c9; c10; c11; c12; c13; c14; 426 + c15; c16; c17; c18; c19 |]) 427 + |> col "c0" int ~enc:(fun a -> a.(0)) 428 + |> col "c1" int ~enc:(fun a -> a.(1)) 429 + |> col "c2" int ~enc:(fun a -> a.(2)) 430 + |> col "c3" int ~enc:(fun a -> a.(3)) 431 + |> col "c4" int ~enc:(fun a -> a.(4)) 432 + |> col "c5" int ~enc:(fun a -> a.(5)) 433 + |> col "c6" int ~enc:(fun a -> a.(6)) 434 + |> col "c7" int ~enc:(fun a -> a.(7)) 435 + |> col "c8" int ~enc:(fun a -> a.(8)) 436 + |> col "c9" int ~enc:(fun a -> a.(9)) 437 + |> col "c10" int ~enc:(fun a -> a.(10)) 438 + |> col "c11" int ~enc:(fun a -> a.(11)) 439 + |> col "c12" int ~enc:(fun a -> a.(12)) 440 + |> col "c13" int ~enc:(fun a -> a.(13)) 441 + |> col "c14" int ~enc:(fun a -> a.(14)) 442 + |> col "c15" int ~enc:(fun a -> a.(15)) 443 + |> col "c16" int ~enc:(fun a -> a.(16)) 444 + |> col "c17" int ~enc:(fun a -> a.(17)) 445 + |> col "c18" int ~enc:(fun a -> a.(18)) 446 + |> col "c19" int ~enc:(fun a -> a.(19)) 447 + |> finish)) 448 + in 449 + let header = String.concat "," (List.init 20 (fun i -> "c" ^ string_of_int i)) in 450 + let row = String.concat "," (List.init 20 string_of_int) in 451 + let csv = header ^ "\n" ^ row ^ "\n" in 452 + match Csvt.decode_string codec csv with 453 + | Error e -> Alcotest.failf "%s" (Csvt.error_to_string e) 454 + | Ok [ arr ] -> 455 + for i = 0 to 19 do 456 + Alcotest.(check int) ("c" ^ string_of_int i) i arr.(i) 457 + done 458 + | Ok ps -> Alcotest.failf "expected 1 row, got %d" (List.length ps) 459 + 460 + let suite = 461 + ( "csvt", 462 + [ 463 + (* Basic decoding *) 464 + Alcotest.test_case "decode simple" `Quick test_decode_simple; 465 + Alcotest.test_case "single row" `Quick test_single_row; 466 + Alcotest.test_case "multi-type record" `Quick test_multi_type_record; 467 + (* Column handling *) 468 + Alcotest.test_case "column reorder" `Quick test_column_reorder; 469 + Alcotest.test_case "extra columns" `Quick test_extra_columns; 470 + Alcotest.test_case "header whitespace" `Quick test_header_whitespace; 471 + (* Error handling *) 472 + Alcotest.test_case "missing column" `Quick test_missing_column; 473 + Alcotest.test_case "bad float" `Quick test_bad_float; 474 + Alcotest.test_case "bad int" `Quick test_bad_int; 475 + Alcotest.test_case "bad bool" `Quick test_bad_bool; 476 + Alcotest.test_case "empty CSV" `Quick test_empty_csv; 477 + Alcotest.test_case "header only" `Quick test_header_only; 478 + Alcotest.test_case "truncated row" `Quick test_truncated_row; 479 + (* Nullable / option *) 480 + Alcotest.test_case "nullable float" `Quick test_nullable_float; 481 + Alcotest.test_case "nullable float empty" `Quick test_nullable_float_empty; 482 + Alcotest.test_case "nullable int" `Quick test_nullable_int; 483 + Alcotest.test_case "option int" `Quick test_option_int; 484 + Alcotest.test_case "option string" `Quick test_option_string; 485 + (* Defaults *) 486 + Alcotest.test_case "dec_absent" `Quick test_dec_absent; 487 + Alcotest.test_case "dec_absent all missing" `Quick test_dec_absent_all_missing; 488 + (* Encoding *) 489 + Alcotest.test_case "encode header" `Quick test_encode_header; 490 + Alcotest.test_case "encode row" `Quick test_encode_row; 491 + Alcotest.test_case "encode record" `Quick test_encode_record; 492 + (* Type edge cases *) 493 + Alcotest.test_case "bool variants" `Quick test_bool_variants; 494 + Alcotest.test_case "float edge cases" `Quick test_float_edge_cases; 495 + Alcotest.test_case "float nan" `Quick test_float_nan_check; 496 + Alcotest.test_case "int edge cases" `Quick test_int_edge_cases; 497 + Alcotest.test_case "string with spaces" `Quick test_string_with_spaces; 498 + Alcotest.test_case "string empty field" `Quick test_string_empty_field; 499 + (* Scale *) 500 + Alcotest.test_case "many rows" `Quick test_many_rows; 501 + Alcotest.test_case "wide row (20 cols)" `Quick test_wide_row; 502 + (* Fold *) 503 + Alcotest.test_case "fold string" `Quick test_fold_string; 504 + (* Custom codec *) 505 + Alcotest.test_case "col_map custom" `Quick test_col_map; 506 + ] )
+1
test/test_csvt.mli
··· 1 + val suite : string * unit Alcotest.test_case list