Duplicate code detection across OCaml packages
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

fix(lint): resolve E600 and E605 lint issues in dupfind tests

Add .mli files for all dupfind test modules exposing suite signatures.
Create real test_fingerprint.ml (8 tests) and test_similar.ml (5 tests)
with comprehensive coverage for Jaccard similarity and duplicate detection.

+1583 -362
+1 -1
bin/dune
··· 2 2 (name main) 3 3 (public_name dupfind) 4 4 (package dupfind) 5 - (libraries dupfind eio_main cmdliner fpath memtrace)) 5 + (libraries dupfind eio_main cmdliner fpath memtrace vlog tty logs))
+183 -117
bin/main.ml
··· 1 1 open Cmdliner 2 2 3 + let src = Logs.Src.create "dupfind" 4 + 5 + module Log = (val Logs.src_log src) 6 + 3 7 let process_file ~min_size ~package file = 4 8 let path = Fpath.to_string file in 5 9 let structure = Dupfind.Source.parse_file path in 6 10 let bindings = Dupfind.Source.extract_bindings structure in 11 + Log.debug (fun m -> m "parse %s: %d bindings" path (List.length bindings)); 7 12 List.filter_map 8 13 (fun (name, expr, line) -> 9 14 let size = Dupfind.Source.ast_size expr in ··· 11 16 else 12 17 let normalized = Dupfind.Normalize.apply expr in 13 18 let hash = Dupfind.Normalize.hash normalized in 19 + Log.debug (fun m -> m " %s: size=%d hash=%s" name size hash); 14 20 Some 15 21 { 16 22 Dupfind.Fragment.hash; ··· 20 26 }) 21 27 bindings 22 28 23 - (* inter subcommand *) 29 + let collect_files ~fs ~exclude paths = 30 + List.concat_map 31 + (fun root -> 32 + let root = Fpath.v root in 33 + Dupfind.Discover.ml_files ~fs ~exclude root 34 + |> List.map (fun file -> 35 + let package = Dupfind.Discover.package_of_file ~root file in 36 + (package, file))) 37 + paths 24 38 25 - let run_scan eio min_size no_intra format exclude top paths = 26 - let fs = Eio.Stdenv.fs eio in 39 + let index_files ~min_size ~progress files = 27 40 let index = Dupfind.Index.v () in 28 41 List.iter 29 - (fun root -> 30 - let root = Fpath.v root in 31 - let files = Dupfind.Discover.ml_files ~fs ~exclude root in 32 - List.iter 33 - (fun file -> 34 - let package = Dupfind.Discover.package_of_file ~root file in 35 - let fragments = process_file ~min_size ~package file in 36 - List.iter (Dupfind.Index.add index) fragments) 37 - files) 38 - paths; 42 + (fun (package, file) -> 43 + let fragments = process_file ~min_size ~package file in 44 + List.iter (Dupfind.Index.add index) fragments; 45 + Tty.Progress.tick progress) 46 + files; 47 + Tty.Progress.finish progress; 48 + index 49 + 50 + (* scan subcommand *) 51 + 52 + let run_scan eio min_size no_intra format exclude top paths = 53 + let fs = Eio.Stdenv.fs eio in 54 + let files = collect_files ~fs ~exclude paths in 55 + Log.info (fun m -> m "Scanning %d files" (List.length files)); 56 + let progress = 57 + Tty.Progress.create ~enabled:(Tty.is_tty ()) ~total:(List.length files) 58 + "Scanning" 59 + in 60 + let index = index_files ~min_size ~progress files in 39 61 let intra = not no_intra in 40 62 let clusters = Dupfind.Cluster.get index ~intra in 41 63 Dupfind.Report.output ~format ~top clusters 42 64 43 - (* find subcommand: search for duplicates of a specific function *) 65 + (* find subcommand *) 44 66 45 67 let resolve_qualified name = 46 - (* Split Module.func into (module_name, func_name) *) 47 68 match String.rindex_opt name '.' with 48 69 | None -> (None, name) 49 70 | Some i -> ··· 61 82 match mod_name with 62 83 | None -> true 63 84 | Some m -> 64 - (* Check if the file's module name matches *) 65 85 let base = Filename.chop_extension (Filename.basename file) in 66 86 String.capitalize_ascii base = m) 67 87 bindings 68 88 69 - let collect_files ~fs paths = 70 - List.concat_map 71 - (fun root -> 72 - let root = Fpath.v root in 73 - Dupfind.Discover.ml_files ~fs ~exclude:[] root 74 - |> List.map (fun file -> 75 - let package = Dupfind.Discover.package_of_file ~root file in 76 - (package, file))) 77 - paths 89 + let parse_expr s = 90 + let lexbuf = Lexing.from_string s in 91 + match Parse.implementation lexbuf with 92 + | [ { Parsetree.pstr_desc = Pstr_eval (e, _); _ } ] -> e 93 + | [ { Parsetree.pstr_desc = Pstr_value (_, [ vb ]); _ } ] -> vb.pvb_expr 94 + | _ -> Fmt.failwith "Cannot parse expression: %S" s 95 + 96 + let is_hex_hash s = 97 + String.length s = 32 98 + && String.for_all 99 + (fun c -> (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f')) 100 + s 101 + 102 + let is_inline_expr s = 103 + String.length s > 0 104 + && (s.[0] = '(' 105 + || String.starts_with ~prefix:"let " s 106 + || String.starts_with ~prefix:"fun " s 107 + || String.starts_with ~prefix:"function " s) 78 108 79 109 let report_matches ~format ~top ~qualified_name expr matches target_hash = 80 110 if List.length matches <= 1 then ··· 93 123 in 94 124 Dupfind.Report.output ~format ~top [ cluster ] 95 125 96 - let parse_expr s = 97 - let lexbuf = Lexing.from_string s in 98 - match Parse.implementation lexbuf with 99 - | [ { Parsetree.pstr_desc = Pstr_eval (e, _); _ } ] -> e 100 - | [ { Parsetree.pstr_desc = Pstr_value (_, [ vb ]); _ } ] -> vb.pvb_expr 101 - | _ -> Fmt.failwith "Cannot parse expression: %S" s 126 + let report_hash_matches ~format ~top hash matches = 127 + if matches = [] then Fmt.pr "No matches for hash %s.@." hash 128 + else 129 + let ast_size = 130 + match matches with h :: _ -> h.Dupfind.Fragment.ast_size | [] -> 0 131 + in 132 + let cluster : Dupfind.Cluster.t = 133 + { 134 + hash; 135 + fragments = matches; 136 + ast_size; 137 + count = List.length matches; 138 + packages = 139 + List.map (fun (f : Dupfind.Fragment.t) -> f.location.package) matches 140 + |> List.sort_uniq String.compare; 141 + } 142 + in 143 + Dupfind.Report.output ~format ~top [ cluster ] 102 144 103 - let is_hex_hash s = 104 - String.length s = 32 105 - && String.for_all 106 - (fun c -> (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f')) 107 - s 145 + let find_by_expr ~format ~top index query = 146 + let expr = parse_expr query in 147 + let normalized = Dupfind.Normalize.apply expr in 148 + let target_hash = Dupfind.Normalize.hash normalized in 149 + Fmt.pr "AST: %a@." Dupfind.Normalize.pp_expr normalized; 150 + Fmt.pr "Hash: %s@.@." target_hash; 151 + let matches = Dupfind.Index.get index target_hash in 152 + report_matches ~format ~top ~qualified_name:query expr matches target_hash 108 153 109 - let is_inline_expr s = 110 - String.length s > 0 111 - && (s.[0] = '(' 112 - || String.starts_with ~prefix:"let " s 113 - || String.starts_with ~prefix:"fun " s 114 - || String.starts_with ~prefix:"function " s) 154 + let find_by_name ~format ~top index query all_files = 155 + let mod_name, func_name = resolve_qualified query in 156 + let target = 157 + List.find_map 158 + (fun (_package, file) -> 159 + binding_in_file (Fpath.to_string file) mod_name func_name) 160 + all_files 161 + in 162 + match target with 163 + | None -> 164 + Fmt.epr "Error: binding %S not found.@." query; 165 + Stdlib.exit 1 166 + | Some (_name, expr, _line) -> 167 + let normalized = Dupfind.Normalize.apply expr in 168 + let target_hash = Dupfind.Normalize.hash normalized in 169 + let matches = Dupfind.Index.get index target_hash in 170 + report_matches ~format ~top ~qualified_name:query expr matches target_hash 115 171 116 172 let run_find eio min_size format top query paths = 117 173 let fs = Eio.Stdenv.fs eio in 118 - let all_files = collect_files ~fs paths in 119 - let index = Dupfind.Index.v () in 174 + let all_files = collect_files ~fs ~exclude:[] paths in 175 + Log.info (fun m -> m "Indexing %d files" (List.length all_files)); 176 + let progress = 177 + Tty.Progress.create ~enabled:(Tty.is_tty ()) ~total:(List.length all_files) 178 + "Indexing" 179 + in 180 + let index = index_files ~min_size ~progress all_files in 181 + if is_hex_hash query then 182 + report_hash_matches ~format ~top query (Dupfind.Index.get index query) 183 + else if is_inline_expr query then find_by_expr ~format ~top index query 184 + else find_by_name ~format ~top index query all_files 185 + 186 + (* similar subcommand *) 187 + 188 + let run_similar eio min_size min_sub_size threshold no_intra format top paths = 189 + let fs = Eio.Stdenv.fs eio in 190 + let files = collect_files ~fs ~exclude:[] paths in 191 + Log.info (fun m -> m "Fingerprinting %d files" (List.length files)); 192 + let progress = 193 + Tty.Progress.create ~enabled:(Tty.is_tty ()) ~total:(List.length files) 194 + "Fingerprinting" 195 + in 196 + let entries = ref [] in 120 197 List.iter 121 - (fun (package, file) -> 122 - let fragments = process_file ~min_size ~package file in 123 - List.iter (Dupfind.Index.add index) fragments) 124 - all_files; 125 - if is_hex_hash query then begin 126 - let matches = Dupfind.Index.get index query in 127 - if matches = [] then Fmt.pr "No matches for hash %s.@." query 128 - else 129 - let cluster : Dupfind.Cluster.t = 130 - { 131 - hash = query; 132 - fragments = matches; 133 - ast_size = (match matches with h :: _ -> h.ast_size | [] -> 0); 134 - count = List.length matches; 135 - packages = 136 - List.map 137 - (fun (f : Dupfind.Fragment.t) -> f.location.package) 138 - matches 139 - |> List.sort_uniq String.compare; 140 - } 141 - in 142 - Dupfind.Report.output ~format ~top [ cluster ] 143 - end 144 - else if is_inline_expr query then begin 145 - let expr = parse_expr query in 146 - let normalized = Dupfind.Normalize.apply expr in 147 - let target_hash = Dupfind.Normalize.hash normalized in 148 - Fmt.pr "AST: %a@." Dupfind.Normalize.pp_expr normalized; 149 - Fmt.pr "Hash: %s@.@." target_hash; 150 - let matches = Dupfind.Index.get index target_hash in 151 - if matches = [] then Fmt.pr "No matches found.@." 152 - else 153 - let cluster : Dupfind.Cluster.t = 154 - { 155 - hash = target_hash; 156 - fragments = matches; 157 - ast_size = Dupfind.Source.ast_size expr; 158 - count = List.length matches; 159 - packages = 160 - List.map 161 - (fun (f : Dupfind.Fragment.t) -> f.location.package) 162 - matches 163 - |> List.sort_uniq String.compare; 164 - } 165 - in 166 - Dupfind.Report.output ~format ~top [ cluster ] 167 - end 168 - else begin 169 - let mod_name, func_name = resolve_qualified query in 170 - let target = 171 - List.find_map 172 - (fun (_package, file) -> 173 - binding_in_file (Fpath.to_string file) mod_name func_name) 174 - all_files 175 - in 176 - match target with 177 - | None -> 178 - Fmt.epr "Error: binding %S not found.@." query; 179 - Stdlib.exit 1 180 - | Some (_name, expr, _line) -> 181 - let normalized = Dupfind.Normalize.apply expr in 182 - let target_hash = Dupfind.Normalize.hash normalized in 183 - let matches = Dupfind.Index.get index target_hash in 184 - report_matches ~format ~top ~qualified_name:query expr matches 185 - target_hash 186 - end 198 + (fun (_package, file) -> 199 + let path = Fpath.to_string file in 200 + let package = Dupfind.Discover.package_of_file ~root:(Fpath.v ".") file in 201 + let structure = Dupfind.Source.parse_file path in 202 + let bindings = Dupfind.Source.extract_bindings structure in 203 + List.iter 204 + (fun (name, expr, line) -> 205 + let size = Dupfind.Source.ast_size expr in 206 + if size >= min_size then begin 207 + let normalized = Dupfind.Normalize.apply expr in 208 + let hash = Dupfind.Normalize.hash normalized in 209 + let frag : Dupfind.Fragment.t = 210 + { 211 + hash; 212 + location = { file = Fpath.to_string file; line; package }; 213 + ast_size = size; 214 + binding_name = name; 215 + } 216 + in 217 + let sub = 218 + Dupfind.Normalize.sub_hashes ~min_size:min_sub_size normalized 219 + in 220 + let fp = Dupfind.Fingerprint.of_sub_hashes sub in 221 + entries := (frag, fp) :: !entries 222 + end) 223 + bindings; 224 + Tty.Progress.tick progress) 225 + files; 226 + Tty.Progress.finish progress; 227 + let intra = not no_intra in 228 + let pairs = Dupfind.Similar.get ~threshold ~intra !entries in 229 + Dupfind.Report.output_similar ~format ~top pairs 187 230 188 - (* hash subcommand: show normalized AST + hash for an expression *) 231 + (* hash subcommand *) 189 232 190 233 let run_hash input = 191 234 let expr = parse_expr input in ··· 194 237 Fmt.pr "@[<v>AST: %a@,Hash: %s@]@." Dupfind.Normalize.pp_expr normalized h 195 238 196 239 (* Cmdliner terms *) 240 + 241 + let setup = Vlog.setup ~json_reporter:None "dupfind" 197 242 198 243 let min_size = 199 244 Arg.( ··· 251 296 let info = Cmd.info "scan" ~doc in 252 297 Cmd.v info 253 298 Term.( 254 - const (run_scan eio) 255 - $ min_size $ no_intra $ format_opt $ exclude $ top $ paths) 299 + const (fun () -> run_scan eio) 300 + $ setup $ min_size $ no_intra $ format_opt $ exclude $ top $ paths) 256 301 257 302 let search_cmd eio = 258 303 let doc = "Find duplicates of a specific function." in 259 304 let info = Cmd.info "find" ~doc in 260 305 Cmd.v info 261 306 Term.( 262 - const (run_find eio) 263 - $ min_size $ format_opt $ top $ qualified_name $ search_paths) 307 + const (fun () -> run_find eio) 308 + $ setup $ min_size $ format_opt $ top $ qualified_name $ search_paths) 309 + 310 + let threshold = 311 + Arg.( 312 + value & opt float 0.7 313 + & info [ "threshold" ] ~docv:"FLOAT" 314 + ~doc:"Minimum Jaccard similarity (0.0-1.0, default 0.7).") 315 + 316 + let min_sub_size = 317 + Arg.( 318 + value & opt int 3 319 + & info [ "min-sub-size" ] ~docv:"N" 320 + ~doc:"Minimum sub-expression size for fingerprinting (default 3).") 264 321 265 322 let hash_input = 266 323 Arg.( ··· 272 329 let hash_cmd = 273 330 let doc = "Show normalized AST and hash for an expression." in 274 331 let info = Cmd.info "hash" ~doc in 275 - Cmd.v info Term.(const run_hash $ hash_input) 332 + Cmd.v info Term.(const (fun () -> run_hash) $ setup $ hash_input) 333 + 334 + let similar_cmd eio = 335 + let doc = "Find near-duplicate code via sub-expression fingerprinting." in 336 + let info = Cmd.info "similar" ~doc in 337 + Cmd.v info 338 + Term.( 339 + const (fun () -> run_similar eio) 340 + $ setup $ min_size $ min_sub_size $ threshold $ no_intra $ format_opt 341 + $ top $ paths) 276 342 277 343 let cmd eio = 278 344 let doc = "Duplicate code detection for OCaml." in 279 345 let info = Cmd.info "dupfind" ~version:"0.1.0" ~doc in 280 346 Cmd.group info 281 347 ~default:Term.(ret (const (`Help (`Pager, None)))) 282 - [ scan_cmd eio; search_cmd eio; hash_cmd ] 348 + [ scan_cmd eio; search_cmd eio; hash_cmd; similar_cmd eio ] 283 349 284 350 let () = 285 351 Memtrace.trace_if_requested ();
+14
lib/fingerprint.ml
··· 1 + module SS = Set.Make (String) 2 + 3 + type t = SS.t 4 + 5 + let of_sub_hashes pairs = 6 + List.fold_left (fun acc (h, _) -> SS.add h acc) SS.empty pairs 7 + 8 + let jaccard a b = 9 + let inter = SS.cardinal (SS.inter a b) in 10 + let union = SS.cardinal (SS.union a b) in 11 + if union = 0 then 0.0 else Float.of_int inter /. Float.of_int union 12 + 13 + let overlap a b = SS.elements (SS.inter a b) 14 + let cardinal t = SS.cardinal t
+17
lib/fingerprint.mli
··· 1 + (** Sub-expression fingerprints for similarity detection. *) 2 + 3 + type t 4 + (** A set of sub-expression hashes. *) 5 + 6 + val of_sub_hashes : (string * int) list -> t 7 + (** [of_sub_hashes pairs] builds a fingerprint from [(hash, size)] pairs. *) 8 + 9 + val jaccard : t -> t -> float 10 + (** [jaccard a b] computes the Jaccard similarity [|A inter B| / |A union B|]. 11 + Returns [0.0] when both sets are empty. *) 12 + 13 + val overlap : t -> t -> string list 14 + (** [overlap a b] returns the hashes shared by both fingerprints. *) 15 + 16 + val cardinal : t -> int 17 + (** [cardinal t] returns the number of distinct sub-expression hashes. *)
+808 -169
lib/normalize.ml
··· 184 184 let rhs = convert_expr env c.pc_rhs in 185 185 { cpat; guard; rhs } 186 186 187 - let apply expr = 188 - let env = env () in 189 - convert_expr env expr 187 + (* Substitution: replace all occurrences of variable [name] with [replacement] *) 188 + 189 + let rec subst name replacement = function 190 + | E_ident s -> if s = name then replacement else E_ident s 191 + | (E_const _ | E_other) as e -> e 192 + | E_let (rf, bindings, body) -> 193 + let bindings = 194 + List.map 195 + (fun { bpat; bexpr } -> 196 + { bpat; bexpr = subst name replacement bexpr }) 197 + bindings 198 + in 199 + E_let (rf, bindings, subst name replacement body) 200 + | E_fun (params, body) -> 201 + E_fun 202 + (subst_params name replacement params, subst_body name replacement body) 203 + | E_apply (f, args) -> 204 + E_apply 205 + ( subst name replacement f, 206 + List.map (fun (l, e) -> (l, subst name replacement e)) args ) 207 + | E_match (e, cases) -> 208 + E_match 209 + (subst name replacement e, List.map (subst_case name replacement) cases) 210 + | E_tuple es -> E_tuple (List.map (subst name replacement) es) 211 + | E_construct (n, arg) -> 212 + E_construct (n, Option.map (subst name replacement) arg) 213 + | E_variant (l, arg) -> E_variant (l, Option.map (subst name replacement) arg) 214 + | E_record (fs, base) -> 215 + E_record 216 + ( List.map (fun (n, e) -> (n, subst name replacement e)) fs, 217 + Option.map (subst name replacement) base ) 218 + | E_field (e, n) -> E_field (subst name replacement e, n) 219 + | E_ifthenelse (c, t, f) -> 220 + E_ifthenelse 221 + ( subst name replacement c, 222 + subst name replacement t, 223 + Option.map (subst name replacement) f ) 224 + | E_sequence (e1, e2) -> 225 + E_sequence (subst name replacement e1, subst name replacement e2) 226 + | E_assert e -> E_assert (subst name replacement e) 227 + | E_lazy e -> E_lazy (subst name replacement e) 228 + | E_try (e, cases) -> 229 + E_try 230 + (subst name replacement e, List.map (subst_case name replacement) cases) 231 + 232 + and subst_params name replacement params = 233 + List.map 234 + (function 235 + | Param_val (l, default, pat) -> 236 + Param_val (l, Option.map (subst name replacement) default, pat) 237 + | Param_newtype -> Param_newtype) 238 + params 239 + 240 + and subst_body name replacement = function 241 + | Body_expr e -> Body_expr (subst name replacement e) 242 + | Body_cases cases -> 243 + Body_cases (List.map (subst_case name replacement) cases) 244 + 245 + and subst_case name replacement { cpat; guard; rhs } = 246 + { 247 + cpat; 248 + guard = Option.map (subst name replacement) guard; 249 + rhs = subst name replacement rhs; 250 + } 251 + 252 + (* Check if a variable name occurs in an expression *) 253 + 254 + let opt_exists f = function None -> false | Some x -> f x 255 + 256 + let rec ident_in_expr name = function 257 + | E_ident s -> s = name 258 + | E_const _ | E_other -> false 259 + | E_let (_, bindings, body) -> 260 + List.exists (fun { bpat = _; bexpr } -> ident_in_expr name bexpr) bindings 261 + || ident_in_expr name body 262 + | E_fun (params, body) -> 263 + List.exists 264 + (function 265 + | Param_val (_, default, _) -> opt_exists (ident_in_expr name) default 266 + | Param_newtype -> false) 267 + params 268 + || ident_in_body name body 269 + | E_apply (f, args) -> 270 + ident_in_expr name f 271 + || List.exists (fun (_, e) -> ident_in_expr name e) args 272 + | E_match (e, cases) | E_try (e, cases) -> 273 + ident_in_expr name e || List.exists (ident_in_case name) cases 274 + | E_tuple es -> List.exists (ident_in_expr name) es 275 + | E_construct (_, arg) | E_variant (_, arg) -> 276 + opt_exists (ident_in_expr name) arg 277 + | E_record (fs, base) -> 278 + List.exists (fun (_, e) -> ident_in_expr name e) fs 279 + || opt_exists (ident_in_expr name) base 280 + | E_field (e, _) | E_assert e | E_lazy e -> ident_in_expr name e 281 + | E_ifthenelse (c, t, f) -> 282 + ident_in_expr name c || ident_in_expr name t 283 + || opt_exists (ident_in_expr name) f 284 + | E_sequence (e1, e2) -> ident_in_expr name e1 || ident_in_expr name e2 285 + 286 + and ident_in_body name = function 287 + | Body_expr e -> ident_in_expr name e 288 + | Body_cases cases -> List.exists (ident_in_case name) cases 289 + 290 + and ident_in_case name { cpat = _; guard; rhs } = 291 + opt_exists (ident_in_expr name) guard || ident_in_expr name rhs 292 + 293 + (* Pass 1: Inline non-recursive let bindings *) 294 + 295 + let rec inline_lets = function 296 + | E_let (Asttypes.Nonrecursive, bindings, body) -> 297 + let bindings = 298 + List.map 299 + (fun { bpat; bexpr } -> { bpat; bexpr = inline_lets bexpr }) 300 + bindings 301 + in 302 + let body = inline_lets body in 303 + List.fold_right 304 + (fun { bpat; bexpr } acc -> 305 + match bpat with 306 + | P_var name -> subst name bexpr acc 307 + | _ -> E_let (Asttypes.Nonrecursive, [ { bpat; bexpr } ], acc)) 308 + bindings body 309 + | E_let (rf, bindings, body) -> 310 + E_let 311 + ( rf, 312 + List.map 313 + (fun { bpat; bexpr } -> { bpat; bexpr = inline_lets bexpr }) 314 + bindings, 315 + inline_lets body ) 316 + | E_fun (params, body) -> 317 + E_fun (inline_lets_params params, inline_lets_body body) 318 + | E_apply (f, args) -> 319 + E_apply (inline_lets f, List.map (fun (l, e) -> (l, inline_lets e)) args) 320 + | E_match (e, cases) -> 321 + E_match (inline_lets e, List.map inline_lets_case cases) 322 + | E_tuple es -> E_tuple (List.map inline_lets es) 323 + | E_construct (n, arg) -> E_construct (n, Option.map inline_lets arg) 324 + | E_variant (l, arg) -> E_variant (l, Option.map inline_lets arg) 325 + | E_record (fs, base) -> 326 + E_record 327 + ( List.map (fun (n, e) -> (n, inline_lets e)) fs, 328 + Option.map inline_lets base ) 329 + | E_field (e, n) -> E_field (inline_lets e, n) 330 + | E_ifthenelse (c, t, f) -> 331 + E_ifthenelse (inline_lets c, inline_lets t, Option.map inline_lets f) 332 + | E_sequence (e1, e2) -> E_sequence (inline_lets e1, inline_lets e2) 333 + | E_assert e -> E_assert (inline_lets e) 334 + | E_lazy e -> E_lazy (inline_lets e) 335 + | E_try (e, cases) -> E_try (inline_lets e, List.map inline_lets_case cases) 336 + | (E_ident _ | E_const _ | E_other) as e -> e 337 + 338 + and inline_lets_params params = 339 + List.map 340 + (function 341 + | Param_val (l, default, pat) -> 342 + Param_val (l, Option.map inline_lets default, pat) 343 + | Param_newtype -> Param_newtype) 344 + params 345 + 346 + and inline_lets_body = function 347 + | Body_expr e -> Body_expr (inline_lets e) 348 + | Body_cases cases -> Body_cases (List.map inline_lets_case cases) 349 + 350 + and inline_lets_case { cpat; guard; rhs } = 351 + { cpat; guard = Option.map inline_lets guard; rhs = inline_lets rhs } 352 + 353 + (* Pass 2: Beta reduction — (fun x -> body) arg → body[x:=arg] *) 354 + 355 + let rec beta_reduce = function 356 + | E_apply 357 + ( E_fun ([ Param_val (Asttypes.Nolabel, None, P_var x) ], Body_expr body), 358 + [ (Asttypes.Nolabel, arg) ] ) -> 359 + beta_reduce (subst x (beta_reduce arg) (beta_reduce body)) 360 + | E_apply (f, args) -> ( 361 + let f' = beta_reduce f in 362 + let args' = List.map (fun (l, e) -> (l, beta_reduce e)) args in 363 + (* Check if beta-reduction exposed a new redex *) 364 + let result = E_apply (f', args') in 365 + match result with 366 + | E_apply 367 + ( E_fun 368 + ([ Param_val (Asttypes.Nolabel, None, P_var x) ], Body_expr body), 369 + [ (Asttypes.Nolabel, arg) ] ) -> 370 + beta_reduce (subst x arg body) 371 + | _ -> result) 372 + | E_let (rf, bindings, body) -> 373 + E_let 374 + ( rf, 375 + List.map 376 + (fun { bpat; bexpr } -> { bpat; bexpr = beta_reduce bexpr }) 377 + bindings, 378 + beta_reduce body ) 379 + | E_fun (params, body) -> 380 + E_fun (beta_reduce_params params, beta_reduce_body body) 381 + | E_match (e, cases) -> 382 + E_match (beta_reduce e, List.map beta_reduce_case cases) 383 + | E_tuple es -> E_tuple (List.map beta_reduce es) 384 + | E_construct (n, arg) -> E_construct (n, Option.map beta_reduce arg) 385 + | E_variant (l, arg) -> E_variant (l, Option.map beta_reduce arg) 386 + | E_record (fs, base) -> 387 + E_record 388 + ( List.map (fun (n, e) -> (n, beta_reduce e)) fs, 389 + Option.map beta_reduce base ) 390 + | E_field (e, n) -> E_field (beta_reduce e, n) 391 + | E_ifthenelse (c, t, f) -> 392 + E_ifthenelse (beta_reduce c, beta_reduce t, Option.map beta_reduce f) 393 + | E_sequence (e1, e2) -> E_sequence (beta_reduce e1, beta_reduce e2) 394 + | E_assert e -> E_assert (beta_reduce e) 395 + | E_lazy e -> E_lazy (beta_reduce e) 396 + | E_try (e, cases) -> E_try (beta_reduce e, List.map beta_reduce_case cases) 397 + | (E_ident _ | E_const _ | E_other) as e -> e 398 + 399 + and beta_reduce_params params = 400 + List.map 401 + (function 402 + | Param_val (l, default, pat) -> 403 + Param_val (l, Option.map beta_reduce default, pat) 404 + | Param_newtype -> Param_newtype) 405 + params 406 + 407 + and beta_reduce_body = function 408 + | Body_expr e -> Body_expr (beta_reduce e) 409 + | Body_cases cases -> Body_cases (List.map beta_reduce_case cases) 410 + 411 + and beta_reduce_case { cpat; guard; rhs } = 412 + { cpat; guard = Option.map beta_reduce guard; rhs = beta_reduce rhs } 413 + 414 + (* Pass 3: Eta reduction — fun x -> f x → f (when x not free in f) 415 + 416 + Standard eta rule applied bottom-up. For multi-param functions like 417 + fun a b -> f a b, flatten_funs has already merged params, so we split 418 + the last param off: fun a b -> f a b → fun a -> f a → f *) 419 + 420 + let rec eta_reduce = function 421 + | E_fun (params, body) -> 422 + let params = eta_reduce_params params in 423 + let body = eta_reduce_body body in 424 + eta_try_reduce params body 425 + | E_let (rf, bindings, body) -> 426 + E_let 427 + ( rf, 428 + List.map 429 + (fun { bpat; bexpr } -> { bpat; bexpr = eta_reduce bexpr }) 430 + bindings, 431 + eta_reduce body ) 432 + | E_apply (f, args) -> 433 + E_apply (eta_reduce f, List.map (fun (l, e) -> (l, eta_reduce e)) args) 434 + | E_match (e, cases) -> E_match (eta_reduce e, List.map eta_reduce_case cases) 435 + | E_tuple es -> E_tuple (List.map eta_reduce es) 436 + | E_construct (n, arg) -> E_construct (n, Option.map eta_reduce arg) 437 + | E_variant (l, arg) -> E_variant (l, Option.map eta_reduce arg) 438 + | E_record (fs, base) -> 439 + E_record 440 + ( List.map (fun (n, e) -> (n, eta_reduce e)) fs, 441 + Option.map eta_reduce base ) 442 + | E_field (e, n) -> E_field (eta_reduce e, n) 443 + | E_ifthenelse (c, t, f) -> 444 + E_ifthenelse (eta_reduce c, eta_reduce t, Option.map eta_reduce f) 445 + | E_sequence (e1, e2) -> E_sequence (eta_reduce e1, eta_reduce e2) 446 + | E_assert e -> E_assert (eta_reduce e) 447 + | E_lazy e -> E_lazy (eta_reduce e) 448 + | E_try (e, cases) -> E_try (eta_reduce e, List.map eta_reduce_case cases) 449 + | (E_ident _ | E_const _ | E_other) as e -> e 450 + 451 + (* Try the eta rule on a fun node. For a single-param function 452 + fun x -> f x, reduce to f. For multi-param fun a b -> f a b, 453 + split off the last param and recurse: fun a -> f a → f. *) 454 + and eta_try_reduce params body = 455 + match (params, body) with 456 + (* Single param: fun x -> f x → f *) 457 + | ( [ Param_val (Asttypes.Nolabel, None, P_var x) ], 458 + Body_expr (E_apply (f, [ (Asttypes.Nolabel, E_ident y) ])) ) 459 + when x = y && not (ident_in_expr x f) -> 460 + eta_reduce f 461 + (* Multi param: peel last param/arg, recurse *) 462 + | _ :: _ :: _, Body_expr (E_apply (f, (_ :: _ :: _ as args))) -> ( 463 + let params_rev = List.rev params in 464 + let args_rev = List.rev args in 465 + match (params_rev, args_rev) with 466 + | ( Param_val (Asttypes.Nolabel, None, P_var x) :: rest_params_rev, 467 + (Asttypes.Nolabel, E_ident y) :: rest_args_rev ) 468 + when x = y 469 + && (not (ident_in_expr x f)) 470 + && not 471 + (List.exists (fun (_, e) -> ident_in_expr x e) rest_args_rev) 472 + -> 473 + eta_try_reduce (List.rev rest_params_rev) 474 + (Body_expr (E_apply (f, List.rev rest_args_rev))) 475 + | _ -> E_fun (params, body)) 476 + | _ -> E_fun (params, body) 477 + 478 + and eta_reduce_params params = 479 + List.map 480 + (function 481 + | Param_val (l, default, pat) -> 482 + Param_val (l, Option.map eta_reduce default, pat) 483 + | Param_newtype -> Param_newtype) 484 + params 485 + 486 + and eta_reduce_body = function 487 + | Body_expr e -> Body_expr (eta_reduce e) 488 + | Body_cases cases -> Body_cases (List.map eta_reduce_case cases) 489 + 490 + and eta_reduce_case { cpat; guard; rhs } = 491 + { cpat; guard = Option.map eta_reduce guard; rhs = eta_reduce rhs } 492 + 493 + (* Pass 4: Flatten nested funs — fun x -> fun y -> e → fun x y -> e *) 494 + 495 + let rec flatten_funs = function 496 + | E_fun (ps1, Body_expr (E_fun (ps2, body))) -> 497 + flatten_funs (E_fun (ps1 @ ps2, body)) 498 + | E_fun (params, body) -> 499 + E_fun (flatten_funs_params params, flatten_funs_body body) 500 + | E_let (rf, bindings, body) -> 501 + E_let 502 + ( rf, 503 + List.map 504 + (fun { bpat; bexpr } -> { bpat; bexpr = flatten_funs bexpr }) 505 + bindings, 506 + flatten_funs body ) 507 + | E_apply (f, args) -> 508 + E_apply (flatten_funs f, List.map (fun (l, e) -> (l, flatten_funs e)) args) 509 + | E_match (e, cases) -> 510 + E_match (flatten_funs e, List.map flatten_funs_case cases) 511 + | E_tuple es -> E_tuple (List.map flatten_funs es) 512 + | E_construct (n, arg) -> E_construct (n, Option.map flatten_funs arg) 513 + | E_variant (l, arg) -> E_variant (l, Option.map flatten_funs arg) 514 + | E_record (fs, base) -> 515 + E_record 516 + ( List.map (fun (n, e) -> (n, flatten_funs e)) fs, 517 + Option.map flatten_funs base ) 518 + | E_field (e, n) -> E_field (flatten_funs e, n) 519 + | E_ifthenelse (c, t, f) -> 520 + E_ifthenelse (flatten_funs c, flatten_funs t, Option.map flatten_funs f) 521 + | E_sequence (e1, e2) -> E_sequence (flatten_funs e1, flatten_funs e2) 522 + | E_assert e -> E_assert (flatten_funs e) 523 + | E_lazy e -> E_lazy (flatten_funs e) 524 + | E_try (e, cases) -> E_try (flatten_funs e, List.map flatten_funs_case cases) 525 + | (E_ident _ | E_const _ | E_other) as e -> e 526 + 527 + and flatten_funs_params params = 528 + List.map 529 + (function 530 + | Param_val (l, default, pat) -> 531 + Param_val (l, Option.map flatten_funs default, pat) 532 + | Param_newtype -> Param_newtype) 533 + params 534 + 535 + and flatten_funs_body = function 536 + | Body_expr e -> Body_expr (flatten_funs e) 537 + | Body_cases cases -> Body_cases (List.map flatten_funs_case cases) 538 + 539 + and flatten_funs_case { cpat; guard; rhs } = 540 + { cpat; guard = Option.map flatten_funs guard; rhs = flatten_funs rhs } 541 + 542 + let buf_tag buf n = Buffer.add_uint8 buf n 543 + 544 + let buf_str buf s = 545 + Buffer.add_string buf (string_of_int (String.length s)); 546 + Buffer.add_char buf ':'; 547 + Buffer.add_string buf s 548 + 549 + let buf_opt buf f = function 550 + | None -> buf_tag buf 0 551 + | Some x -> 552 + buf_tag buf 1; 553 + f x 554 + 555 + let buf_list buf f xs = 556 + Buffer.add_string buf (string_of_int (List.length xs)); 557 + Buffer.add_char buf ':'; 558 + List.iter f xs 559 + 560 + let buf_label buf = function 561 + | Asttypes.Nolabel -> buf_tag buf 0 562 + | Asttypes.Labelled s -> 563 + buf_tag buf 1; 564 + buf_str buf s 565 + | Asttypes.Optional s -> 566 + buf_tag buf 2; 567 + buf_str buf s 568 + 569 + let rec hash_const buf = function 570 + | Int (s, c) -> 571 + buf_tag buf 0; 572 + buf_str buf s; 573 + buf_opt buf (fun c -> Buffer.add_char buf c) c 574 + | Char c -> 575 + buf_tag buf 1; 576 + Buffer.add_char buf c 577 + | String s -> 578 + buf_tag buf 2; 579 + buf_str buf s 580 + | Float (s, c) -> 581 + buf_tag buf 3; 582 + buf_str buf s; 583 + buf_opt buf (fun c -> Buffer.add_char buf c) c 584 + 585 + and hash_pat buf = function 586 + | P_any -> buf_tag buf 0 587 + | P_var s -> 588 + buf_tag buf 1; 589 + buf_str buf s 590 + | P_const c -> 591 + buf_tag buf 2; 592 + hash_const buf c 593 + | P_tuple ps -> 594 + buf_tag buf 3; 595 + buf_list buf (hash_pat buf) ps 596 + | P_construct (n, p) -> 597 + buf_tag buf 4; 598 + buf_str buf n; 599 + buf_opt buf (hash_pat buf) p 600 + | P_variant (l, p) -> 601 + buf_tag buf 5; 602 + buf_str buf l; 603 + buf_opt buf (hash_pat buf) p 604 + | P_record fs -> 605 + buf_tag buf 6; 606 + buf_list buf 607 + (fun (n, p) -> 608 + buf_str buf n; 609 + hash_pat buf p) 610 + fs 611 + | P_or (p1, p2) -> 612 + buf_tag buf 7; 613 + hash_pat buf p1; 614 + hash_pat buf p2 615 + | P_alias (p, n) -> 616 + buf_tag buf 8; 617 + hash_pat buf p; 618 + buf_str buf n 619 + | P_array ps -> 620 + buf_tag buf 9; 621 + buf_list buf (hash_pat buf) ps 622 + | P_other -> buf_tag buf 10 623 + 624 + and hash_expr buf = function 625 + | E_ident s -> 626 + buf_tag buf 0; 627 + buf_str buf s 628 + | E_const c -> 629 + buf_tag buf 1; 630 + hash_const buf c 631 + | E_let (rf, bs, body) -> 632 + buf_tag buf 2; 633 + buf_tag buf (match rf with Asttypes.Recursive -> 1 | _ -> 0); 634 + buf_list buf 635 + (fun { bpat; bexpr } -> 636 + hash_pat buf bpat; 637 + hash_expr buf bexpr) 638 + bs; 639 + hash_expr buf body 640 + | E_fun (params, body) -> 641 + buf_tag buf 3; 642 + buf_list buf (hash_param buf) params; 643 + hash_body buf body 644 + | E_apply (f, args) -> 645 + buf_tag buf 4; 646 + hash_expr buf f; 647 + buf_list buf 648 + (fun (l, e) -> 649 + buf_label buf l; 650 + hash_expr buf e) 651 + args 652 + | E_match (e, cases) -> 653 + buf_tag buf 5; 654 + hash_expr buf e; 655 + buf_list buf (hash_case buf) cases 656 + | E_tuple es -> 657 + buf_tag buf 6; 658 + buf_list buf (hash_expr buf) es 659 + | E_construct (n, e) -> 660 + buf_tag buf 7; 661 + buf_str buf n; 662 + buf_opt buf (hash_expr buf) e 663 + | E_variant (l, e) -> 664 + buf_tag buf 8; 665 + buf_str buf l; 666 + buf_opt buf (hash_expr buf) e 667 + | E_record (fs, base) -> 668 + buf_tag buf 9; 669 + buf_list buf 670 + (fun (n, e) -> 671 + buf_str buf n; 672 + hash_expr buf e) 673 + fs; 674 + buf_opt buf (hash_expr buf) base 675 + | E_field (e, n) -> 676 + buf_tag buf 10; 677 + hash_expr buf e; 678 + buf_str buf n 679 + | E_ifthenelse (c, t, f) -> 680 + buf_tag buf 11; 681 + hash_expr buf c; 682 + hash_expr buf t; 683 + buf_opt buf (hash_expr buf) f 684 + | E_sequence (e1, e2) -> 685 + buf_tag buf 12; 686 + hash_expr buf e1; 687 + hash_expr buf e2 688 + | E_assert e -> 689 + buf_tag buf 13; 690 + hash_expr buf e 691 + | E_lazy e -> 692 + buf_tag buf 14; 693 + hash_expr buf e 694 + | E_try (e, cases) -> 695 + buf_tag buf 15; 696 + hash_expr buf e; 697 + buf_list buf (hash_case buf) cases 698 + | E_other -> buf_tag buf 16 699 + 700 + and hash_param buf = function 701 + | Param_val (l, default, pat) -> 702 + buf_tag buf 0; 703 + buf_label buf l; 704 + buf_opt buf (hash_expr buf) default; 705 + hash_pat buf pat 706 + | Param_newtype -> buf_tag buf 1 707 + 708 + and hash_body buf = function 709 + | Body_expr e -> 710 + buf_tag buf 0; 711 + hash_expr buf e 712 + | Body_cases cases -> 713 + buf_tag buf 1; 714 + buf_list buf (hash_case buf) cases 715 + 716 + and hash_case buf { cpat; guard; rhs } = 717 + hash_pat buf cpat; 718 + buf_opt buf (hash_expr buf) guard; 719 + hash_expr buf rhs 190 720 191 721 let hash expr = 192 722 let buf = Buffer.create 256 in 193 - let tag n = Buffer.add_uint8 buf n in 194 - let str s = 195 - Buffer.add_string buf (string_of_int (String.length s)); 196 - Buffer.add_char buf ':'; 197 - Buffer.add_string buf s 198 - in 199 - let opt f = function 200 - | None -> tag 0 201 - | Some x -> 202 - tag 1; 203 - f x 723 + hash_expr buf expr; 724 + Digest.to_hex (Digest.string (Buffer.contents buf)) 725 + 726 + (* AST size *) 727 + 728 + let rec expr_size = function 729 + | E_ident _ | E_const _ | E_other -> 1 730 + | E_let (_, bs, body) -> 731 + 1 732 + + List.fold_left (fun acc b -> acc + binding_size b) 0 bs 733 + + expr_size body 734 + | E_fun (params, body) -> 735 + 1 736 + + List.fold_left (fun acc p -> acc + param_size p) 0 params 737 + + body_size body 738 + | E_apply (f, args) -> 739 + 1 + expr_size f 740 + + List.fold_left (fun acc (_, e) -> acc + expr_size e) 0 args 741 + | E_match (e, cases) | E_try (e, cases) -> 742 + 1 + expr_size e + List.fold_left (fun acc c -> acc + case_size c) 0 cases 743 + | E_tuple es -> 1 + List.fold_left (fun acc e -> acc + expr_size e) 0 es 744 + | E_construct (_, arg) | E_variant (_, arg) -> ( 745 + 1 + match arg with None -> 0 | Some e -> expr_size e) 746 + | E_record (fs, base) -> ( 747 + 1 748 + + List.fold_left (fun acc (_, e) -> acc + expr_size e) 0 fs 749 + + match base with None -> 0 | Some e -> expr_size e) 750 + | E_field (e, _) | E_assert e | E_lazy e -> 1 + expr_size e 751 + | E_ifthenelse (c, t, f) -> ( 752 + 1 + expr_size c + expr_size t 753 + + match f with None -> 0 | Some e -> expr_size e) 754 + | E_sequence (e1, e2) -> 1 + expr_size e1 + expr_size e2 755 + 756 + and binding_size { bpat = _; bexpr } = 1 + expr_size bexpr 757 + 758 + and param_size = function 759 + | Param_val (_, default, _) -> ( 760 + 1 + match default with None -> 0 | Some e -> expr_size e) 761 + | Param_newtype -> 1 762 + 763 + and body_size = function 764 + | Body_expr e -> expr_size e 765 + | Body_cases cases -> List.fold_left (fun acc c -> acc + case_size c) 0 cases 766 + 767 + and case_size { cpat = _; guard; rhs } = 768 + 1 + (match guard with None -> 0 | Some e -> expr_size e) + expr_size rhs 769 + 770 + (* Sub-expression hashing *) 771 + 772 + let sub_hashes ~min_size root = 773 + let acc = ref [] in 774 + let rec walk_expr e = 775 + let sz = expr_size e in 776 + if sz >= min_size then acc := (hash e, sz) :: !acc; 777 + descend_expr e 778 + and descend_expr = function 779 + | E_ident _ | E_const _ | E_other -> () 780 + | E_let (_, bs, body) -> 781 + List.iter (fun { bpat = _; bexpr } -> walk_expr bexpr) bs; 782 + walk_expr body 783 + | E_fun (_, body) -> walk_body body 784 + | E_apply (f, args) -> 785 + walk_expr f; 786 + List.iter (fun (_, e) -> walk_expr e) args 787 + | E_match (e, cases) | E_try (e, cases) -> 788 + walk_expr e; 789 + List.iter walk_case cases 790 + | E_tuple es -> List.iter walk_expr es 791 + | E_construct (_, arg) | E_variant (_, arg) -> Option.iter walk_expr arg 792 + | E_record (fs, base) -> 793 + List.iter (fun (_, e) -> walk_expr e) fs; 794 + Option.iter walk_expr base 795 + | E_field (e, _) | E_assert e | E_lazy e -> walk_expr e 796 + | E_ifthenelse (c, t, f) -> 797 + walk_expr c; 798 + walk_expr t; 799 + Option.iter walk_expr f 800 + | E_sequence (e1, e2) -> 801 + walk_expr e1; 802 + walk_expr e2 803 + and walk_body = function 804 + | Body_expr e -> walk_expr e 805 + | Body_cases cases -> List.iter walk_case cases 806 + and walk_case { cpat = _; guard; rhs } = 807 + Option.iter walk_expr guard; 808 + walk_expr rhs 204 809 in 205 - let list f xs = 206 - Buffer.add_string buf (string_of_int (List.length xs)); 207 - Buffer.add_char buf ':'; 208 - List.iter f xs 810 + walk_expr root; 811 + !acc 812 + 813 + (* Pass 5: Common sub-expression elimination *) 814 + 815 + let cse expr = 816 + (* Count occurrences of each sub-expression by hash *) 817 + let counts = Hashtbl.create 64 in 818 + let rec count_expr e = 819 + let sz = expr_size e in 820 + if sz >= 2 then begin 821 + let h = hash e in 822 + let cur = try Hashtbl.find counts h with Not_found -> 0 in 823 + Hashtbl.replace counts h (cur + 1) 824 + end; 825 + count_descend e 826 + and count_descend = function 827 + | E_ident _ | E_const _ | E_other -> () 828 + | E_let (_, bs, body) -> 829 + List.iter (fun { bpat = _; bexpr } -> count_expr bexpr) bs; 830 + count_expr body 831 + | E_fun (params, body) -> 832 + List.iter 833 + (function 834 + | Param_val (_, default, _) -> Option.iter count_expr default 835 + | Param_newtype -> ()) 836 + params; 837 + count_body body 838 + | E_apply (f, args) -> 839 + count_expr f; 840 + List.iter (fun (_, e) -> count_expr e) args 841 + | E_match (e, cases) | E_try (e, cases) -> 842 + count_expr e; 843 + List.iter count_case cases 844 + | E_tuple es -> List.iter count_expr es 845 + | E_construct (_, arg) | E_variant (_, arg) -> Option.iter count_expr arg 846 + | E_record (fs, base) -> 847 + List.iter (fun (_, e) -> count_expr e) fs; 848 + Option.iter count_expr base 849 + | E_field (e, _) | E_assert e | E_lazy e -> count_expr e 850 + | E_ifthenelse (c, t, f) -> 851 + count_expr c; 852 + count_expr t; 853 + Option.iter count_expr f 854 + | E_sequence (e1, e2) -> 855 + count_expr e1; 856 + count_expr e2 857 + and count_body = function 858 + | Body_expr e -> count_expr e 859 + | Body_cases cases -> List.iter count_case cases 860 + and count_case { cpat = _; guard; rhs } = 861 + Option.iter count_expr guard; 862 + count_expr rhs 209 863 in 210 - let label = function 211 - | Asttypes.Nolabel -> tag 0 212 - | Asttypes.Labelled s -> 213 - tag 1; 214 - str s 215 - | Asttypes.Optional s -> 216 - tag 2; 217 - str s 864 + count_expr expr; 865 + (* Extract shared sub-expressions *) 866 + let seen : (string, string) Hashtbl.t = Hashtbl.create 16 in 867 + let bindings = ref [] in 868 + let cse_counter = ref 0 in 869 + let cse_fresh () = 870 + let n = !cse_counter in 871 + incr cse_counter; 872 + Printf.sprintf "_cse%d" n 218 873 in 219 - let rec hash_const = function 220 - | Int (s, c) -> 221 - tag 0; 222 - str s; 223 - opt (fun c -> Buffer.add_char buf c) c 224 - | Char c -> 225 - tag 1; 226 - Buffer.add_char buf c 227 - | String s -> 228 - tag 2; 229 - str s 230 - | Float (s, c) -> 231 - tag 3; 232 - str s; 233 - opt (fun c -> Buffer.add_char buf c) c 234 - and hash_pat = function 235 - | P_any -> tag 0 236 - | P_var s -> 237 - tag 1; 238 - str s 239 - | P_const c -> 240 - tag 2; 241 - hash_const c 242 - | P_tuple ps -> 243 - tag 3; 244 - list hash_pat ps 245 - | P_construct (n, p) -> 246 - tag 4; 247 - str n; 248 - opt hash_pat p 249 - | P_variant (l, p) -> 250 - tag 5; 251 - str l; 252 - opt hash_pat p 253 - | P_record fs -> 254 - tag 6; 255 - list 256 - (fun (n, p) -> 257 - str n; 258 - hash_pat p) 259 - fs 260 - | P_or (p1, p2) -> 261 - tag 7; 262 - hash_pat p1; 263 - hash_pat p2 264 - | P_alias (p, n) -> 265 - tag 8; 266 - hash_pat p; 267 - str n 268 - | P_array ps -> 269 - tag 9; 270 - list hash_pat ps 271 - | P_other -> tag 10 272 - and hash_expr = function 273 - | E_ident s -> 274 - tag 0; 275 - str s 276 - | E_const c -> 277 - tag 1; 278 - hash_const c 874 + let rec extract e = 875 + let sz = expr_size e in 876 + if sz >= 2 then begin 877 + let h = hash e in 878 + let count = try Hashtbl.find counts h with Not_found -> 0 in 879 + if count >= 2 then ( 880 + match Hashtbl.find_opt seen h with 881 + | Some name -> E_ident name 882 + | None -> 883 + let e' = extract_descend e in 884 + let name = cse_fresh () in 885 + Hashtbl.replace seen h name; 886 + bindings := { bpat = P_var name; bexpr = e' } :: !bindings; 887 + E_ident name) 888 + else extract_descend e 889 + end 890 + else e 891 + and extract_descend = function 892 + | (E_ident _ | E_const _ | E_other) as e -> e 279 893 | E_let (rf, bs, body) -> 280 - tag 2; 281 - tag (match rf with Asttypes.Recursive -> 1 | _ -> 0); 282 - list 283 - (fun { bpat; bexpr } -> 284 - hash_pat bpat; 285 - hash_expr bexpr) 286 - bs; 287 - hash_expr body 288 - | E_fun (params, body) -> 289 - tag 3; 290 - list hash_param params; 291 - hash_body body 894 + E_let 895 + ( rf, 896 + List.map (fun { bpat; bexpr } -> { bpat; bexpr = extract bexpr }) bs, 897 + extract body ) 898 + | E_fun (params, body) -> E_fun (extract_params params, extract_body body) 292 899 | E_apply (f, args) -> 293 - tag 4; 294 - hash_expr f; 295 - list 296 - (fun (l, e) -> 297 - label l; 298 - hash_expr e) 299 - args 300 - | E_match (e, cases) -> 301 - tag 5; 302 - hash_expr e; 303 - list hash_case cases 304 - | E_tuple es -> 305 - tag 6; 306 - list hash_expr es 307 - | E_construct (n, e) -> 308 - tag 7; 309 - str n; 310 - opt hash_expr e 311 - | E_variant (l, e) -> 312 - tag 8; 313 - str l; 314 - opt hash_expr e 900 + E_apply (extract f, List.map (fun (l, e) -> (l, extract e)) args) 901 + | E_match (e, cases) -> E_match (extract e, List.map extract_case cases) 902 + | E_tuple es -> E_tuple (List.map extract es) 903 + | E_construct (n, arg) -> E_construct (n, Option.map extract arg) 904 + | E_variant (l, arg) -> E_variant (l, Option.map extract arg) 315 905 | E_record (fs, base) -> 316 - tag 9; 317 - list 318 - (fun (n, e) -> 319 - str n; 320 - hash_expr e) 321 - fs; 322 - opt hash_expr base 323 - | E_field (e, n) -> 324 - tag 10; 325 - hash_expr e; 326 - str n 906 + E_record 907 + (List.map (fun (n, e) -> (n, extract e)) fs, Option.map extract base) 908 + | E_field (e, n) -> E_field (extract e, n) 327 909 | E_ifthenelse (c, t, f) -> 328 - tag 11; 329 - hash_expr c; 330 - hash_expr t; 331 - opt hash_expr f 332 - | E_sequence (e1, e2) -> 333 - tag 12; 334 - hash_expr e1; 335 - hash_expr e2 336 - | E_assert e -> 337 - tag 13; 338 - hash_expr e 339 - | E_lazy e -> 340 - tag 14; 341 - hash_expr e 342 - | E_try (e, cases) -> 343 - tag 15; 344 - hash_expr e; 345 - list hash_case cases 346 - | E_other -> tag 16 347 - and hash_param = function 910 + E_ifthenelse (extract c, extract t, Option.map extract f) 911 + | E_sequence (e1, e2) -> E_sequence (extract e1, extract e2) 912 + | E_assert e -> E_assert (extract e) 913 + | E_lazy e -> E_lazy (extract e) 914 + | E_try (e, cases) -> E_try (extract e, List.map extract_case cases) 915 + and extract_params params = 916 + List.map 917 + (function 918 + | Param_val (l, default, pat) -> 919 + Param_val (l, Option.map extract default, pat) 920 + | Param_newtype -> Param_newtype) 921 + params 922 + and extract_body = function 923 + | Body_expr e -> Body_expr (extract e) 924 + | Body_cases cases -> Body_cases (List.map extract_case cases) 925 + and extract_case { cpat; guard; rhs } = 926 + { cpat; guard = Option.map extract guard; rhs = extract rhs } 927 + in 928 + let body = extract expr in 929 + match List.rev !bindings with 930 + | [] -> body 931 + | bs -> E_let (Asttypes.Nonrecursive, bs, body) 932 + 933 + (* Pass 6: Renumber — assign fresh sequential _0, _1, ... to all binding sites *) 934 + 935 + let renumber expr = 936 + let env = env () in 937 + let ren = lookup env in 938 + let bind name = rename env name in 939 + let rec ren_expr = function 940 + | E_ident s -> E_ident (ren s) 941 + | (E_const _ | E_other) as e -> e 942 + | E_let (rf, bindings, body) -> 943 + let bindings = 944 + List.map 945 + (fun { bpat; bexpr } -> 946 + let bexpr = ren_expr bexpr in 947 + let bpat = ren_pat bpat in 948 + { bpat; bexpr }) 949 + bindings 950 + in 951 + E_let (rf, bindings, ren_expr body) 952 + | E_fun (params, body) -> E_fun (List.map ren_param params, ren_body body) 953 + | E_apply (f, args) -> 954 + E_apply (ren_expr f, List.map (fun (l, e) -> (l, ren_expr e)) args) 955 + | E_match (e, cases) -> E_match (ren_expr e, List.map ren_case cases) 956 + | E_tuple es -> E_tuple (List.map ren_expr es) 957 + | E_construct (n, arg) -> E_construct (n, Option.map ren_expr arg) 958 + | E_variant (l, arg) -> E_variant (l, Option.map ren_expr arg) 959 + | E_record (fs, base) -> 960 + E_record 961 + (List.map (fun (n, e) -> (n, ren_expr e)) fs, Option.map ren_expr base) 962 + | E_field (e, n) -> E_field (ren_expr e, n) 963 + | E_ifthenelse (c, t, f) -> 964 + E_ifthenelse (ren_expr c, ren_expr t, Option.map ren_expr f) 965 + | E_sequence (e1, e2) -> E_sequence (ren_expr e1, ren_expr e2) 966 + | E_assert e -> E_assert (ren_expr e) 967 + | E_lazy e -> E_lazy (ren_expr e) 968 + | E_try (e, cases) -> E_try (ren_expr e, List.map ren_case cases) 969 + and ren_pat = function 970 + | P_var s -> P_var (bind s) 971 + | P_tuple ps -> P_tuple (List.map ren_pat ps) 972 + | P_construct (n, arg) -> P_construct (n, Option.map ren_pat arg) 973 + | P_variant (l, arg) -> P_variant (l, Option.map ren_pat arg) 974 + | P_record fs -> P_record (List.map (fun (n, p) -> (n, ren_pat p)) fs) 975 + | P_or (p1, p2) -> P_or (ren_pat p1, ren_pat p2) 976 + | P_alias (p, n) -> 977 + let p = ren_pat p in 978 + P_alias (p, bind n) 979 + | P_array ps -> P_array (List.map ren_pat ps) 980 + | (P_any | P_const _ | P_other) as p -> p 981 + and ren_param = function 348 982 | Param_val (l, default, pat) -> 349 - tag 0; 350 - label l; 351 - opt hash_expr default; 352 - hash_pat pat 353 - | Param_newtype -> tag 1 354 - and hash_body = function 355 - | Body_expr e -> 356 - tag 0; 357 - hash_expr e 358 - | Body_cases cases -> 359 - tag 1; 360 - list hash_case cases 361 - and hash_case { cpat; guard; rhs } = 362 - hash_pat cpat; 363 - opt hash_expr guard; 364 - hash_expr rhs 983 + let default = Option.map ren_expr default in 984 + let pat = ren_pat pat in 985 + Param_val (l, default, pat) 986 + | Param_newtype -> Param_newtype 987 + and ren_body = function 988 + | Body_expr e -> Body_expr (ren_expr e) 989 + | Body_cases cases -> Body_cases (List.map ren_case cases) 990 + and ren_case { cpat; guard; rhs } = 991 + let cpat = ren_pat cpat in 992 + let guard = Option.map ren_expr guard in 993 + let rhs = ren_expr rhs in 994 + { cpat; guard; rhs } 365 995 in 366 - hash_expr expr; 367 - Digest.to_hex (Digest.string (Buffer.contents buf)) 996 + ren_expr expr 997 + 998 + (* Canonicalization pipeline *) 999 + 1000 + let canonicalize expr = 1001 + expr |> inline_lets |> beta_reduce |> flatten_funs |> eta_reduce |> cse 1002 + |> renumber 1003 + 1004 + let apply expr = 1005 + let env = env () in 1006 + convert_expr env expr |> canonicalize 368 1007 369 1008 (* Pretty-printer *) 370 1009
+7
lib/normalize.mli
··· 15 15 16 16 val pp_expr : expr Fmt.t 17 17 (** [pp_expr] pretty-prints the simplified expression. *) 18 + 19 + val expr_size : expr -> int 20 + (** [expr_size e] counts the number of nodes in the simplified AST. *) 21 + 22 + val sub_hashes : min_size:int -> expr -> (string * int) list 23 + (** [sub_hashes ~min_size e] walks the tree and returns [(hash, size)] pairs for 24 + every sub-expression whose size is at least [min_size]. *)
+117 -18
lib/report.ml
··· 1 + let src = Logs.Src.create "dupfind.report" 2 + 3 + module Log = (val Logs.src_log src) 4 + 1 5 type format = Cli | Json 2 6 3 - let output_cli ~top clusters = 4 - let clusters = 5 - match top with 6 - | Some n -> List.filteri (fun i _ -> i < n) clusters 7 - | None -> clusters 8 - in 9 - List.iteri 10 - (fun i (cluster : Cluster.t) -> 11 - Fmt.pr "@[<v> Clone #%d (%d nodes, %d occurrences)@," (i + 1) 12 - cluster.ast_size cluster.count; 13 - List.iter 14 - (fun (f : Fragment.t) -> 15 - Fmt.pr " %s:%d %s@," f.location.file f.location.line f.binding_name) 16 - cluster.fragments; 17 - Fmt.pr "@]@.") 18 - clusters; 7 + let add_cluster_rows table i (cluster : Cluster.t) = 8 + List.fold_left 9 + (fun table (j, (f : Fragment.t)) -> 10 + let idx = if j = 0 then string_of_int (i + 1) else "" in 11 + let size = if j = 0 then string_of_int cluster.ast_size else "" in 12 + let count = if j = 0 then string_of_int cluster.count else "" in 13 + let loc = Fmt.str "%s:%d" f.location.file f.location.line in 14 + Tty.Table.add_row_strings [ idx; size; count; loc; f.binding_name ] table) 15 + table 16 + (List.mapi (fun j f -> (j, f)) cluster.fragments) 17 + 18 + let print_summary clusters = 19 19 let total = 20 20 List.fold_left 21 21 (fun acc (c : Cluster.t) -> acc + (c.ast_size * c.count)) ··· 26 26 |> List.sort_uniq String.compare 27 27 |> List.length 28 28 in 29 - Fmt.pr 30 - "@.Found %d clone clusters across %d packages (%d total duplicated nodes)@." 29 + Fmt.pr "Found %d clone clusters across %d packages (%d duplicated nodes).@." 31 30 (List.length clusters) n_packages total 32 31 32 + let output_cli ~top clusters = 33 + let clusters = 34 + match top with 35 + | Some n -> List.filteri (fun i _ -> i < n) clusters 36 + | None -> clusters 37 + in 38 + if clusters = [] then Fmt.pr "No clones found.@." 39 + else begin 40 + let table = 41 + Tty.Table.( 42 + create ~border:Tty.Border.rounded 43 + [ 44 + column "#"; 45 + column "Size"; 46 + column ~align:`Right "Count"; 47 + column "File"; 48 + column "Binding"; 49 + ]) 50 + in 51 + let table = 52 + List.fold_left 53 + (fun table (i, cluster) -> add_cluster_rows table i cluster) 54 + table 55 + (List.mapi (fun i c -> (i, c)) clusters) 56 + in 57 + Tty.Table.pp Format.std_formatter table; 58 + Format.pp_print_newline Format.std_formatter (); 59 + print_summary clusters 60 + end 61 + 33 62 let output_json ~top clusters = 34 63 let clusters = 35 64 match top with ··· 52 81 match format with 53 82 | Cli -> output_cli ~top clusters 54 83 | Json -> output_json ~top clusters 84 + 85 + (* Similar output *) 86 + 87 + let output_similar_cli ~top pairs = 88 + let pairs = 89 + match top with 90 + | Some n -> List.filteri (fun i _ -> i < n) pairs 91 + | None -> pairs 92 + in 93 + if pairs = [] then Fmt.pr "No similar pairs found.@." 94 + else begin 95 + let table = 96 + Tty.Table.( 97 + create ~border:Tty.Border.rounded 98 + [ 99 + column "#"; 100 + column ~align:`Right "Similarity"; 101 + column "Left"; 102 + column "Right"; 103 + column ~align:`Right "Shared"; 104 + ]) 105 + in 106 + let table = 107 + List.fold_left 108 + (fun table (i, (m : Similar.match_pair)) -> 109 + let sim = Fmt.str "%.0f%%" (m.similarity *. 100.0) in 110 + let left = 111 + Fmt.str "%s:%d %s" m.left.location.file m.left.location.line 112 + m.left.binding_name 113 + in 114 + let right = 115 + Fmt.str "%s:%d %s" m.right.location.file m.right.location.line 116 + m.right.binding_name 117 + in 118 + let shared = string_of_int m.shared_hashes in 119 + Tty.Table.add_row_strings 120 + [ string_of_int (i + 1); sim; left; right; shared ] 121 + table) 122 + table 123 + (List.mapi (fun i m -> (i, m)) pairs) 124 + in 125 + Tty.Table.pp Format.std_formatter table; 126 + Format.pp_print_newline Format.std_formatter (); 127 + Fmt.pr "Found %d similar pairs.@." (List.length pairs) 128 + end 129 + 130 + let output_similar_json ~top pairs = 131 + let pairs = 132 + match top with 133 + | Some n -> List.filteri (fun i _ -> i < n) pairs 134 + | None -> pairs 135 + in 136 + let json_frag (f : Fragment.t) = 137 + Fmt.str 138 + {|{"file": %S, "line": %d, "package": %S, "binding": %S, "ast_size": %d}|} 139 + f.location.file f.location.line f.location.package f.binding_name 140 + f.ast_size 141 + in 142 + let json_pair (m : Similar.match_pair) = 143 + Fmt.str 144 + {|{"similarity": %.4f, "shared_hashes": %d, "left": %s, "right": %s}|} 145 + m.similarity m.shared_hashes (json_frag m.left) (json_frag m.right) 146 + in 147 + let items = List.map json_pair pairs |> String.concat ",\n " in 148 + Fmt.pr "[%s]@." items 149 + 150 + let output_similar ~format ~top pairs = 151 + match format with 152 + | Cli -> output_similar_cli ~top pairs 153 + | Json -> output_similar_json ~top pairs
+4
lib/report.mli
··· 4 4 5 5 val output : format:format -> top:int option -> Cluster.t list -> unit 6 6 (** [output ~format ~top clusters] prints clone clusters in the given format. *) 7 + 8 + val output_similar : 9 + format:format -> top:int option -> Similar.match_pair list -> unit 10 + (** [output_similar ~format ~top pairs] prints similarity matches. *)
+55
lib/similar.ml
··· 1 + type match_pair = { 2 + left : Fragment.t; 3 + right : Fragment.t; 4 + similarity : float; 5 + shared_hashes : int; 6 + } 7 + 8 + let check_pair ~threshold ~intra ~seen arr results i j = 9 + if i >= j then () 10 + else 11 + let key = (i, j) in 12 + if Hashtbl.mem seen key then () 13 + else begin 14 + Hashtbl.replace seen key (); 15 + let frag_i, fp_i = arr.(i) in 16 + let frag_j, fp_j = arr.(j) in 17 + let sim = Fingerprint.jaccard fp_i fp_j in 18 + let cross_pkg = 19 + frag_i.Fragment.location.package <> frag_j.Fragment.location.package 20 + in 21 + if sim >= threshold && sim < 1.0 && (intra || cross_pkg) then 22 + results := 23 + { 24 + left = frag_i; 25 + right = frag_j; 26 + similarity = sim; 27 + shared_hashes = List.length (Fingerprint.overlap fp_i fp_j); 28 + } 29 + :: !results 30 + end 31 + 32 + let get ~threshold ~intra entries = 33 + let arr = Array.of_list entries in 34 + let inv = Hashtbl.create 256 in 35 + Array.iteri 36 + (fun i (_frag, fp) -> 37 + List.iter 38 + (fun h -> 39 + let prev = 40 + match Hashtbl.find_opt inv h with Some l -> l | None -> [] 41 + in 42 + Hashtbl.replace inv h (i :: prev)) 43 + (Fingerprint.overlap fp fp)) 44 + arr; 45 + let seen = Hashtbl.create 256 in 46 + let results = ref [] in 47 + Hashtbl.iter 48 + (fun _hash indices -> 49 + let indices = List.sort_uniq Int.compare indices in 50 + List.iter 51 + (fun i -> 52 + List.iter (check_pair ~threshold ~intra ~seen arr results i) indices) 53 + indices) 54 + inv; 55 + List.sort (fun a b -> Float.compare b.similarity a.similarity) !results
+19
lib/similar.mli
··· 1 + (** Similarity detection between code fragments. *) 2 + 3 + type match_pair = { 4 + left : Fragment.t; 5 + right : Fragment.t; 6 + similarity : float; 7 + shared_hashes : int; 8 + } 9 + (** A pair of similar fragments with their Jaccard similarity score. *) 10 + 11 + val get : 12 + threshold:float -> 13 + intra:bool -> 14 + (Fragment.t * Fingerprint.t) list -> 15 + match_pair list 16 + (** [get ~threshold ~intra entries] finds pairs of fragments whose 17 + sub-expression fingerprints have Jaccard similarity >= [threshold] and < 1.0 18 + (exact duplicates are excluded). When [intra] is [false], only cross-package 19 + pairs are returned. Results are sorted by descending similarity. *)
+15 -12
test/cram/inter.t
··· 23 23 Scan detects structurally identical functions after alpha-renaming: 24 24 25 25 $ dupfind scan pkg_a pkg_b 26 - Clone #1 (21 nodes, 2 occurrences) 27 - pkg_b/lib/b.ml:1 encode 28 - pkg_a/lib/a.ml:1 encode 29 - 26 + ╭───┬──────┬───────┬──────────────────┬─────────╮ 27 + │ # │ Size │ Count │ File │ Binding │ 28 + ├───┼──────┼───────┼──────────────────┼─────────┤ 29 + │ 1 │ 21 │ 2 │ pkg_b/lib/b.ml:1 │ encode │ 30 + │ │ │ │ pkg_a/lib/a.ml:1 │ encode │ 31 + ╰───┴──────┴───────┴──────────────────┴─────────╯ 30 32 31 - Found 1 clone clusters across 2 packages (42 total duplicated nodes) 33 + Found 1 clone clusters across 2 packages (42 duplicated nodes). 32 34 33 35 34 36 35 37 With --no-intra, only cross-package duplicates are shown: 36 38 37 39 $ dupfind scan --no-intra pkg_a 38 - 39 - Found 0 clone clusters across 0 packages (0 total duplicated nodes) 40 + No clones found. 40 41 41 42 42 43 Find subcommand: 43 44 44 45 $ dupfind find A.encode pkg_a pkg_b 45 - Clone #1 (21 nodes, 2 occurrences) 46 - pkg_b/lib/b.ml:1 encode 47 - pkg_a/lib/a.ml:1 encode 46 + ╭───┬──────┬───────┬──────────────────┬─────────╮ 47 + │ # │ Size │ Count │ File │ Binding │ 48 + ├───┼──────┼───────┼──────────────────┼─────────┤ 49 + │ 1 │ 21 │ 2 │ pkg_b/lib/b.ml:1 │ encode │ 50 + │ │ │ │ pkg_a/lib/a.ml:1 │ encode │ 51 + ╰───┴──────┴───────┴──────────────────┴─────────╯ 48 52 49 - 50 - Found 1 clone clusters across 2 packages (42 total duplicated nodes) 53 + Found 1 clone clusters across 2 packages (42 duplicated nodes). 51 54 52 55 53 56
+60
test/cram/similar.t
··· 1 + Create test files with near-duplicate code across two packages: 2 + 3 + $ mkdir -p pkg_a/lib pkg_b/lib 4 + 5 + $ cat > pkg_a/lib/a.ml << 'EOF' 6 + > let encode buf x = 7 + > Buffer.add_string buf (string_of_int x.length); 8 + > Buffer.add_char buf ' '; 9 + > Buffer.add_string buf x.name 10 + > 11 + > let process x = 12 + > (x + 1, x + 2, x + 3, x + 4, x + 5, 13 + > x + 6, x + 7, x + 8, x + 9, x * 2) 14 + > EOF 15 + 16 + $ cat > pkg_b/lib/b.ml << 'EOF' 17 + > let encode buffer item = 18 + > Buffer.add_string buffer (string_of_int item.length); 19 + > Buffer.add_char buffer ' '; 20 + > Buffer.add_string buffer item.name 21 + > 22 + > let transform y = 23 + > (y + 1, y + 2, y + 3, y + 4, y + 5, 24 + > y + 6, y + 7, y + 8, y + 9, y * 3) 25 + > EOF 26 + 27 + Similar command finds near-duplicates (process/transform share 9 of 12 sub-expressions): 28 + 29 + $ dupfind similar --threshold 0.5 pkg_a pkg_b 30 + ╭───┬────────────┬────────────────────────────┬──────────────────────────┬────────╮ 31 + │ # │ Similarity │ Left │ Right │ Shared │ 32 + ├───┼────────────┼────────────────────────────┼──────────────────────────┼────────┤ 33 + │ 1 │ 60% │ pkg_b/lib/b.ml:6 transform │ pkg_a/lib/a.ml:6 process │ 9 │ 34 + ╰───┴────────────┴────────────────────────────┴──────────────────────────┴────────╯ 35 + 36 + Found 1 similar pairs. 37 + 38 + 39 + 40 + Exact duplicates (encode) are NOT reported by similar (they belong in scan): 41 + 42 + $ dupfind scan pkg_a pkg_b | head -5 43 + ╭───┬──────┬───────┬──────────────────┬─────────╮ 44 + │ # │ Size │ Count │ File │ Binding │ 45 + ├───┼──────┼───────┼──────────────────┼─────────┤ 46 + │ 1 │ 21 │ 2 │ pkg_b/lib/b.ml:1 │ encode │ 47 + │ │ │ │ pkg_a/lib/a.ml:1 │ encode │ 48 + 49 + 50 + High threshold filters out less-similar pairs: 51 + 52 + $ dupfind similar --threshold 0.99 pkg_a pkg_b 53 + No similar pairs found. 54 + 55 + 56 + Cross-package only filter: 57 + 58 + $ dupfind similar --threshold 0.5 --no-intra pkg_a 59 + No similar pairs found. 60 +
+2
test/test.ml
··· 8 8 Test_cluster.suite; 9 9 Test_discover.suite; 10 10 Test_report.suite; 11 + Test_fingerprint.suite; 12 + Test_similar.suite; 11 13 ]
+1
test/test_cluster.mli
··· 1 + val suite : string * unit Alcotest.test_case list
+8 -10
test/test_discover.ml
··· 1 - let test_package_of_file_named_root () = 1 + let test_named_root () = 2 2 let pkg = 3 3 Dupfind.Discover.package_of_file ~root:(Fpath.v "repos") 4 4 (Fpath.v "repos/mylib/lib/foo.ml") 5 5 in 6 6 Alcotest.(check string) "package" "repos" pkg 7 7 8 - let test_package_of_file_deep () = 8 + let test_deep () = 9 9 let pkg = 10 10 Dupfind.Discover.package_of_file ~root:(Fpath.v "/src") 11 11 (Fpath.v "/src/core/lib/sub/deep.ml") 12 12 in 13 13 Alcotest.(check string) "package" "src" pkg 14 14 15 - let test_package_of_file_at_root () = 15 + let test_at_root () = 16 16 let pkg = 17 17 Dupfind.Discover.package_of_file ~root:(Fpath.v ".") 18 18 (Fpath.v "./ocaml-aos/lib/aos.ml") 19 19 in 20 20 Alcotest.(check string) "package" "ocaml-aos" pkg 21 21 22 - let test_package_of_file_flat () = 22 + let test_flat () = 23 23 let pkg = 24 24 Dupfind.Discover.package_of_file ~root:(Fpath.v ".") (Fpath.v "./foo.ml") 25 25 in ··· 28 28 let suite = 29 29 ( "discover", 30 30 [ 31 - Alcotest.test_case "package_of_file named root" `Quick 32 - test_package_of_file_named_root; 33 - Alcotest.test_case "package_of_file deep" `Quick test_package_of_file_deep; 34 - Alcotest.test_case "package_of_file at root" `Quick 35 - test_package_of_file_at_root; 36 - Alcotest.test_case "package_of_file flat" `Quick test_package_of_file_flat; 31 + Alcotest.test_case "package_of_file named root" `Quick test_named_root; 32 + Alcotest.test_case "package_of_file deep" `Quick test_deep; 33 + Alcotest.test_case "package_of_file at root" `Quick test_at_root; 34 + Alcotest.test_case "package_of_file flat" `Quick test_flat; 37 35 ] )
+1
test/test_discover.mli
··· 1 + val suite : string * unit Alcotest.test_case list
+54
test/test_fingerprint.ml
··· 1 + let test_empty () = 2 + let fp = Dupfind.Fingerprint.of_sub_hashes [] in 3 + Alcotest.(check int) "empty" 0 (Dupfind.Fingerprint.cardinal fp) 4 + 5 + let test_of_sub_hashes () = 6 + let fp = Dupfind.Fingerprint.of_sub_hashes [ ("a", 1); ("b", 2); ("c", 3) ] in 7 + Alcotest.(check int) "three hashes" 3 (Dupfind.Fingerprint.cardinal fp) 8 + 9 + let test_of_sub_hashes_dedup () = 10 + let fp = Dupfind.Fingerprint.of_sub_hashes [ ("a", 1); ("a", 2); ("b", 3) ] in 11 + Alcotest.(check int) "deduped" 2 (Dupfind.Fingerprint.cardinal fp) 12 + 13 + let test_jaccard_identical () = 14 + let fp = Dupfind.Fingerprint.of_sub_hashes [ ("a", 1); ("b", 2) ] in 15 + let sim = Dupfind.Fingerprint.jaccard fp fp in 16 + Alcotest.(check (float 0.001)) "identical" 1.0 sim 17 + 18 + let test_jaccard_disjoint () = 19 + let a = Dupfind.Fingerprint.of_sub_hashes [ ("a", 1); ("b", 2) ] in 20 + let b = Dupfind.Fingerprint.of_sub_hashes [ ("c", 3); ("d", 4) ] in 21 + let sim = Dupfind.Fingerprint.jaccard a b in 22 + Alcotest.(check (float 0.001)) "disjoint" 0.0 sim 23 + 24 + let test_jaccard_partial () = 25 + let a = Dupfind.Fingerprint.of_sub_hashes [ ("a", 1); ("b", 2) ] in 26 + let b = Dupfind.Fingerprint.of_sub_hashes [ ("b", 2); ("c", 3) ] in 27 + let sim = Dupfind.Fingerprint.jaccard a b in 28 + Alcotest.(check (float 0.001)) "1/3 overlap" (1.0 /. 3.0) sim 29 + 30 + let test_jaccard_empty () = 31 + let a = Dupfind.Fingerprint.of_sub_hashes [] in 32 + let b = Dupfind.Fingerprint.of_sub_hashes [] in 33 + Alcotest.(check (float 0.001)) "empty" 0.0 (Dupfind.Fingerprint.jaccard a b) 34 + 35 + let test_overlap () = 36 + let a = Dupfind.Fingerprint.of_sub_hashes [ ("a", 1); ("b", 2); ("c", 3) ] in 37 + let b = Dupfind.Fingerprint.of_sub_hashes [ ("b", 2); ("c", 3); ("d", 4) ] in 38 + let shared = Dupfind.Fingerprint.overlap a b in 39 + Alcotest.(check int) "two shared" 2 (List.length shared); 40 + Alcotest.(check bool) "has b" true (List.mem "b" shared); 41 + Alcotest.(check bool) "has c" true (List.mem "c" shared) 42 + 43 + let suite = 44 + ( "fingerprint", 45 + [ 46 + Alcotest.test_case "empty" `Quick test_empty; 47 + Alcotest.test_case "of_sub_hashes" `Quick test_of_sub_hashes; 48 + Alcotest.test_case "of_sub_hashes dedup" `Quick test_of_sub_hashes_dedup; 49 + Alcotest.test_case "jaccard identical" `Quick test_jaccard_identical; 50 + Alcotest.test_case "jaccard disjoint" `Quick test_jaccard_disjoint; 51 + Alcotest.test_case "jaccard partial" `Quick test_jaccard_partial; 52 + Alcotest.test_case "jaccard empty" `Quick test_jaccard_empty; 53 + Alcotest.test_case "overlap" `Quick test_overlap; 54 + ] )
+1
test/test_fingerprint.mli
··· 1 + val suite : string * unit Alcotest.test_case list
+1
test/test_fragment.mli
··· 1 + val suite : string * unit Alcotest.test_case list
+1
test/test_index.mli
··· 1 + val suite : string * unit Alcotest.test_case list
+119 -35
test/test_normalize.ml
··· 7 7 8 8 let hash_of s = Dupfind.Normalize.(hash (apply (parse_expr s))) 9 9 10 - let test_alpha_equivalence () = 11 - let h1 = hash_of "let f x = x + 1" in 12 - let h2 = hash_of "let g y = y + 1" in 13 - Alcotest.(check string) "same hash" h1 h2 10 + (* Helper: assert two expressions produce the same hash *) 11 + let same s1 s2 () = 12 + let h1 = hash_of s1 in 13 + let h2 = hash_of s2 in 14 + Alcotest.(check string) (Fmt.str "%s == %s" s1 s2) h1 h2 14 15 15 - let test_nested_alpha () = 16 - let h1 = hash_of "let f x = let y = x in y + y" in 17 - let h2 = hash_of "let g a = let b = a in b + b" in 18 - Alcotest.(check string) "same hash" h1 h2 19 - 20 - let test_different_structure () = 21 - let h1 = hash_of "let f x = x + 1" in 22 - let h2 = hash_of "let f x = x * 2" in 23 - Alcotest.(check bool) "different hashes" true (h1 <> h2) 24 - 25 - let test_different_constants () = 26 - let h1 = hash_of "let f x = x + 1" in 27 - let h2 = hash_of "let f x = x + 2" in 28 - Alcotest.(check bool) "different hashes" true (h1 <> h2) 29 - 30 - let test_same_structure_different_globals () = 31 - let h1 = hash_of "let f x = List.map x" in 32 - let h2 = hash_of "let f x = List.iter x" in 33 - Alcotest.(check bool) "different hashes" true (h1 <> h2) 34 - 35 - let test_identity_functions () = 36 - let h1 = hash_of "fun x -> x" in 37 - let h2 = hash_of "fun y -> y" in 38 - Alcotest.(check string) "same hash" h1 h2 16 + (* Helper: assert two expressions produce different hashes *) 17 + let diff s1 s2 () = 18 + let h1 = hash_of s1 in 19 + let h2 = hash_of s2 in 20 + Alcotest.(check bool) (Fmt.str "%s != %s" s1 s2) true (h1 <> h2) 39 21 40 22 let test_pp_does_not_crash () = 41 23 let e = Dupfind.Normalize.apply (parse_expr "let f x = x + 1") in ··· 50 32 let suite = 51 33 ( "normalize", 52 34 [ 53 - Alcotest.test_case "alpha equivalence" `Quick test_alpha_equivalence; 54 - Alcotest.test_case "nested alpha" `Quick test_nested_alpha; 55 - Alcotest.test_case "different structure" `Quick test_different_structure; 56 - Alcotest.test_case "different constants" `Quick test_different_constants; 57 - Alcotest.test_case "different globals" `Quick 58 - test_same_structure_different_globals; 59 - Alcotest.test_case "identity functions" `Quick test_identity_functions; 35 + (* --- Alpha-renaming --- *) 36 + Alcotest.test_case "alpha: simple rename" `Quick 37 + (same "let f x = x + 1" "let g y = y + 1"); 38 + Alcotest.test_case "alpha: nested let" `Quick 39 + (same "let f x = let y = x in y + y" "let g a = let b = a in b + b"); 40 + Alcotest.test_case "alpha: identity" `Quick 41 + (same "fun x -> x" "fun y -> y"); 42 + Alcotest.test_case "alpha: multiple params" `Quick 43 + (same "let f a b = a + b" "let g x y = x + y"); 44 + Alcotest.test_case "alpha: match binders" `Quick 45 + (same "fun x -> match x with Some y -> y | None -> 0" 46 + "fun a -> match a with Some b -> b | None -> 0"); 47 + Alcotest.test_case "alpha: tuple param" `Quick 48 + (same "fun (x, y) -> x + y" "fun (a, b) -> a + b"); 49 + (* --- Different structure --- *) 50 + Alcotest.test_case "diff: structure" `Quick 51 + (diff "let f x = x + 1" "let f x = x * 2"); 52 + Alcotest.test_case "diff: constants" `Quick 53 + (diff "let f x = x + 1" "let f x = x + 2"); 54 + Alcotest.test_case "diff: globals" `Quick 55 + (diff "let f x = List.map x" "let f x = List.iter x"); 56 + Alcotest.test_case "diff: arity" `Quick (diff "fun x -> x" "fun x y -> x"); 57 + Alcotest.test_case "diff: body shape" `Quick 58 + (diff "fun x -> x + x" "fun x -> x * x"); 59 + (* --- Let inlining --- *) 60 + Alcotest.test_case "inline: simple" `Quick 61 + (same "let f x = let y = x + 1 in y" "let f x = x + 1"); 62 + Alcotest.test_case "inline: used twice" `Quick 63 + (same "let f x = let y = x + 1 in y + y" "let f x = (x + 1) + (x + 1)"); 64 + Alcotest.test_case "inline: chained lets" `Quick 65 + (same "let f x = let a = x + 1 in let b = a + 2 in b" 66 + "let f x = (x + 1) + 2"); 67 + Alcotest.test_case "inline: nested scope" `Quick 68 + (same "let f x = let a = x + 1 in let b = a * 2 in a + b" 69 + "let f x = (x + 1) + ((x + 1) * 2)"); 70 + Alcotest.test_case "inline: unused binding" `Quick 71 + (same "let f x = let y = 42 in x" "let f x = x"); 72 + Alcotest.test_case "inline: multiple bindings" `Quick 73 + (same "let f x = let a = x + 1 in let b = x + 2 in a + b" 74 + "let f x = (x + 1) + (x + 2)"); 75 + (* --- Beta reduction --- *) 76 + Alcotest.test_case "beta: simple" `Quick 77 + (same "(fun x -> x + 1) 5" "5 + 1"); 78 + Alcotest.test_case "beta: identity" `Quick (same "(fun x -> x) 42" "42"); 79 + Alcotest.test_case "beta: nested" `Quick 80 + (same "(fun x -> (fun y -> x + y) 2) 3" "3 + 2"); 81 + Alcotest.test_case "beta: with subexpr" `Quick 82 + (same "(fun x -> x * x) (1 + 2)" "(1 + 2) * (1 + 2)"); 83 + Alcotest.test_case "beta: in binding" `Quick 84 + (same "let f = (fun x -> x + 1) 5" "let f = 5 + 1"); 85 + (* --- Eta reduction --- *) 86 + Alcotest.test_case "eta: simple" `Quick (same "fun x -> f x" "f"); 87 + Alcotest.test_case "eta: in binding" `Quick 88 + (same "let g = fun x -> f x" "let g = f"); 89 + Alcotest.test_case "eta: no reduce when var in fn" `Quick 90 + (diff "fun x -> x x" "fun x -> x"); 91 + Alcotest.test_case "eta: global function" `Quick 92 + (same "fun x -> List.length x" "List.length"); 93 + Alcotest.test_case "eta: nested" `Quick 94 + (same "fun x -> (fun y -> g y) x" "g"); 95 + (* --- Flatten funs --- *) 96 + Alcotest.test_case "flatten: two params" `Quick 97 + (same "fun x -> fun y -> x + y" "fun x y -> x + y"); 98 + Alcotest.test_case "flatten: three params" `Quick 99 + (same "fun x -> fun y -> fun z -> x + y + z" "fun x y z -> x + y + z"); 100 + Alcotest.test_case "flatten: in binding" `Quick 101 + (same "let f = fun x -> fun y -> x + y" "let f = fun x y -> x + y"); 102 + (* --- CSE --- *) 103 + Alcotest.test_case "cse: repeated subexpr" `Quick 104 + (same "(x + 1) + (x + 1)" "let y = x + 1 in y + y"); 105 + Alcotest.test_case "cse: three occurrences" `Quick 106 + (same "(x + 1) + (x + 1) + (x + 1)" "let y = x + 1 in y + y + y"); 107 + Alcotest.test_case "cse: no cse for single ident" `Quick 108 + (diff "x + y" "x + x"); 109 + Alcotest.test_case "cse: nested repeated" `Quick 110 + (same "let f a = (a * a) + (a * a)" "let f a = let b = a * a in b + b"); 111 + (* --- Combined passes --- *) 112 + Alcotest.test_case "combined: let + cse" `Quick 113 + (same "let f x = let y = x + 1 in y + y" 114 + "let f x = let z = x + 1 in z + z"); 115 + Alcotest.test_case "combined: beta + eta" `Quick 116 + (same "(fun x -> g x) y" "g y"); 117 + Alcotest.test_case "combined: inline + beta" `Quick 118 + (same "let f = fun x -> x + 1 in f 5" "5 + 1"); 119 + Alcotest.test_case "combined: flatten + eta" `Quick 120 + (same "fun x -> fun y -> f x y" "f"); 121 + Alcotest.test_case "combined: complex pipeline" `Quick 122 + (same "let f x = let inc = fun y -> y + 1 in inc x" "let f x = x + 1"); 123 + (* --- Recursive lets preserved --- *) 124 + Alcotest.test_case "rec: preserved" `Quick 125 + (diff "let rec f x = f x" "let f x = g x"); 126 + (* --- Misc --- *) 60 127 Alcotest.test_case "pp does not crash" `Quick test_pp_does_not_crash; 61 128 Alcotest.test_case "deterministic" `Quick test_hash_deterministic; 129 + Alcotest.test_case "misc: if-then-else alpha" `Quick 130 + (same "fun x -> if x > 0 then x else 0" 131 + "fun y -> if y > 0 then y else 0"); 132 + Alcotest.test_case "misc: record alpha" `Quick 133 + (same "fun x -> { a = x; b = x + 1 }" "fun y -> { a = y; b = y + 1 }"); 134 + Alcotest.test_case "misc: constructor alpha" `Quick 135 + (same "fun x -> Some x" "fun y -> Some y"); 136 + Alcotest.test_case "misc: sequence alpha" `Quick 137 + (same "fun x -> print x; x" "fun y -> print y; y"); 138 + Alcotest.test_case "misc: try alpha" `Quick 139 + (same "fun x -> try f x with e -> g e" "fun y -> try f y with e -> g e"); 140 + Alcotest.test_case "misc: assert alpha" `Quick 141 + (same "fun x -> assert (x > 0)" "fun y -> assert (y > 0)"); 142 + Alcotest.test_case "misc: field access alpha" `Quick 143 + (same "fun x -> x.foo" "fun y -> y.foo"); 144 + Alcotest.test_case "misc: lazy alpha" `Quick 145 + (same "fun x -> lazy x" "fun y -> lazy y"); 62 146 ] )
+1
test/test_normalize.mli
··· 1 + val suite : string * unit Alcotest.test_case list
+1
test/test_report.mli
··· 1 + val suite : string * unit Alcotest.test_case list
+91
test/test_similar.ml
··· 1 + let mk_fragment ~package ~name : Dupfind.Fragment.t = 2 + { 3 + hash = name; 4 + location = { file = "test.ml"; line = 1; package }; 5 + ast_size = 5; 6 + binding_name = name; 7 + } 8 + 9 + let test_empty () = 10 + let results = Dupfind.Similar.get ~threshold:0.5 ~intra:true [] in 11 + Alcotest.(check int) "no results" 0 (List.length results) 12 + 13 + let test_identical_excluded () = 14 + let fp = Dupfind.Fingerprint.of_sub_hashes [ ("a", 1); ("b", 2) ] in 15 + let entries = 16 + [ 17 + (mk_fragment ~package:"p" ~name:"f", fp); 18 + (mk_fragment ~package:"p" ~name:"g", fp); 19 + ] 20 + in 21 + let results = Dupfind.Similar.get ~threshold:0.5 ~intra:true entries in 22 + Alcotest.(check int) "exact duplicates excluded" 0 (List.length results) 23 + 24 + let test_similar_found () = 25 + let fp1 = 26 + Dupfind.Fingerprint.of_sub_hashes [ ("a", 1); ("b", 2); ("c", 3) ] 27 + in 28 + let fp2 = 29 + Dupfind.Fingerprint.of_sub_hashes [ ("a", 1); ("b", 2); ("d", 4) ] 30 + in 31 + let entries = 32 + [ 33 + (mk_fragment ~package:"p" ~name:"f", fp1); 34 + (mk_fragment ~package:"p" ~name:"g", fp2); 35 + ] 36 + in 37 + let results = Dupfind.Similar.get ~threshold:0.4 ~intra:true entries in 38 + Alcotest.(check int) "one pair" 1 (List.length results); 39 + let m = List.hd results in 40 + Alcotest.(check (float 0.001)) "similarity 0.5" 0.5 m.similarity; 41 + Alcotest.(check int) "shared" 2 m.shared_hashes 42 + 43 + let test_below_threshold () = 44 + let fp1 = 45 + Dupfind.Fingerprint.of_sub_hashes [ ("a", 1); ("b", 2); ("c", 3) ] 46 + in 47 + let fp2 = 48 + Dupfind.Fingerprint.of_sub_hashes [ ("a", 1); ("d", 4); ("e", 5) ] 49 + in 50 + let entries = 51 + [ 52 + (mk_fragment ~package:"p" ~name:"f", fp1); 53 + (mk_fragment ~package:"p" ~name:"g", fp2); 54 + ] 55 + in 56 + let results = Dupfind.Similar.get ~threshold:0.5 ~intra:true entries in 57 + Alcotest.(check int) "below threshold" 0 (List.length results) 58 + 59 + let test_cross_package_filter () = 60 + let fp1 = 61 + Dupfind.Fingerprint.of_sub_hashes [ ("a", 1); ("b", 2); ("c", 3) ] 62 + in 63 + let fp2 = 64 + Dupfind.Fingerprint.of_sub_hashes [ ("a", 1); ("b", 2); ("d", 4) ] 65 + in 66 + let same_pkg = 67 + [ 68 + (mk_fragment ~package:"p" ~name:"f", fp1); 69 + (mk_fragment ~package:"p" ~name:"g", fp2); 70 + ] 71 + in 72 + let results = Dupfind.Similar.get ~threshold:0.4 ~intra:false same_pkg in 73 + Alcotest.(check int) "same pkg filtered" 0 (List.length results); 74 + let cross_pkg = 75 + [ 76 + (mk_fragment ~package:"a" ~name:"f", fp1); 77 + (mk_fragment ~package:"b" ~name:"g", fp2); 78 + ] 79 + in 80 + let results = Dupfind.Similar.get ~threshold:0.4 ~intra:false cross_pkg in 81 + Alcotest.(check int) "cross pkg found" 1 (List.length results) 82 + 83 + let suite = 84 + ( "similar", 85 + [ 86 + Alcotest.test_case "empty" `Quick test_empty; 87 + Alcotest.test_case "identical excluded" `Quick test_identical_excluded; 88 + Alcotest.test_case "similar found" `Quick test_similar_found; 89 + Alcotest.test_case "below threshold" `Quick test_below_threshold; 90 + Alcotest.test_case "cross-package filter" `Quick test_cross_package_filter; 91 + ] )
+1
test/test_similar.mli
··· 1 + val suite : string * unit Alcotest.test_case list
+1
test/test_source.mli
··· 1 + val suite : string * unit Alcotest.test_case list