Duplicate code detection across OCaml packages
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

json: rename mem -> member / finish -> seal across the codec + value API

Object combinators: [Object.mem] -> [Object.member], [Object.opt_mem]
-> [Object.opt_member], [Object.case_mem] -> [Object.case_member]. The
sibling submodules [Object.Mem] / [Object.Mems] become
[Object.Member] / [Object.Members]. RFC 8259 §4 calls these
"name/value pairs, referred to as the members", so mirror the spec
name rather than the shortened [mem].

[Object.finish] -> [Object.seal]. "Seal" reads as "close the map, no
more members added", which is what the operation does.

Value constructors/queries: [Value.mem] (function) -> [Value.member];
[Value.mem_find] -> [Value.member_key]; [Value.mem_names] ->
[Value.member_names]; [Value.mem_keys] -> [Value.member_keys].
[type mem = ...] -> [type member = ...]; [type object'] still points
at [member list].

Downstream (~80 files across slack, sbom, stripe, sigstore, requests,
claude, irmin, freebox) updated via perl-pie. dune build clean,
dune test ocaml-json clean.

+426 -420
+140
bin/cmd_find.ml
··· 1 + open Cmdliner 2 + 3 + let resolve_qualified name = 4 + match String.rindex_opt name '.' with 5 + | None -> (None, name) 6 + | Some i -> 7 + let modname = String.sub name 0 i in 8 + let func = String.sub name (i + 1) (String.length name - i - 1) in 9 + (Some modname, func) 10 + 11 + let binding_in_file file mod_name func_name = 12 + let structure = Dupfind.Source.parse_file file in 13 + let bindings = Dupfind.Source.extract_bindings structure in 14 + List.find_opt 15 + (fun (name, _expr, _line) -> 16 + name = func_name 17 + && 18 + match mod_name with 19 + | None -> true 20 + | Some m -> 21 + let base = Filename.chop_extension (Filename.basename file) in 22 + String.capitalize_ascii base = m) 23 + bindings 24 + 25 + let is_hex_hash s = 26 + String.length s = 32 27 + && String.for_all 28 + (fun c -> (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f')) 29 + s 30 + 31 + let is_inline_expr s = 32 + String.length s > 0 33 + && (s.[0] = '(' 34 + || String.starts_with ~prefix:"let " s 35 + || String.starts_with ~prefix:"fun " s 36 + || String.starts_with ~prefix:"function " s) 37 + 38 + let report_matches ~format ~top ~qualified_name expr matches target_hash = 39 + if List.length matches <= 1 then 40 + Fmt.pr "No duplicates found for %s.@." qualified_name 41 + else 42 + let cluster : Dupfind.Cluster.t = 43 + { 44 + hash = target_hash; 45 + fragments = matches; 46 + ast_size = Dupfind.Source.ast_size expr; 47 + count = List.length matches; 48 + packages = 49 + List.map (fun (f : Dupfind.Fragment.t) -> f.location.package) matches 50 + |> List.sort_uniq String.compare; 51 + } 52 + in 53 + Dupfind.Report.output ~format ~top [ cluster ] 54 + 55 + let report_hash_matches ~format ~top hash matches = 56 + if matches = [] then Fmt.pr "No matches for hash %s.@." hash 57 + else 58 + let ast_size = 59 + match matches with h :: _ -> h.Dupfind.Fragment.ast_size | [] -> 0 60 + in 61 + let cluster : Dupfind.Cluster.t = 62 + { 63 + hash; 64 + fragments = matches; 65 + ast_size; 66 + count = List.length matches; 67 + packages = 68 + List.map (fun (f : Dupfind.Fragment.t) -> f.location.package) matches 69 + |> List.sort_uniq String.compare; 70 + } 71 + in 72 + Dupfind.Report.output ~format ~top [ cluster ] 73 + 74 + let by_expr ~format ~top index query = 75 + let expr = 76 + match Common.parse_expr query with 77 + | Ok e -> e 78 + | Error msg -> Fmt.failwith "%s" msg 79 + in 80 + let normalized = Dupfind.Normalize.apply expr in 81 + let target_hash = Dupfind.Normalize.hash normalized in 82 + Fmt.pr "AST: %a@." Dupfind.Normalize.pp_expr normalized; 83 + Fmt.pr "Hash: %s@.@." target_hash; 84 + let matches = Dupfind.Index.get index target_hash in 85 + report_matches ~format ~top ~qualified_name:query expr matches target_hash 86 + 87 + let by_name ~format ~top index query all_files = 88 + let mod_name, func_name = resolve_qualified query in 89 + let target = 90 + List.find_map 91 + (fun (_package, file) -> 92 + binding_in_file (Fpath.to_string file) mod_name func_name) 93 + all_files 94 + in 95 + match target with 96 + | None -> 97 + Fmt.epr "Error: binding %S not found.@." query; 98 + Stdlib.exit 1 99 + | Some (_name, expr, _line) -> 100 + let normalized = Dupfind.Normalize.apply expr in 101 + let target_hash = Dupfind.Normalize.hash normalized in 102 + let matches = Dupfind.Index.get index target_hash in 103 + report_matches ~format ~top ~qualified_name:query expr matches target_hash 104 + 105 + let run_find eio min_size format top query paths = 106 + let fs = Eio.Stdenv.fs eio in 107 + let all_files = Common.collect_files ~fs ~exclude:[] paths in 108 + Common.Log.info (fun m -> m "Indexing %d files" (List.length all_files)); 109 + let progress = 110 + Tty.Progress.v ~enabled:(Tty.is_tty ()) ~total:(List.length all_files) 111 + "Indexing" 112 + in 113 + let index = Common.index_files ~min_size ~progress all_files in 114 + if is_hex_hash query then 115 + report_hash_matches ~format ~top query (Dupfind.Index.get index query) 116 + else if is_inline_expr query then by_expr ~format ~top index query 117 + else by_name ~format ~top index query all_files 118 + 119 + let qualified_name = 120 + Arg.( 121 + required 122 + & pos 0 (some string) None 123 + & info [] ~docv:"QUERY" 124 + ~doc: 125 + "A qualified name (e.g. $(b,Aos.encode)), a hex hash, or an inline \ 126 + OCaml expression (e.g. $(b,\"let f x = x\")).") 127 + 128 + let search_paths = 129 + Arg.( 130 + value & pos_right 0 string [ "." ] 131 + & info [] ~docv:"DIR" ~doc:"Directories to scan.") 132 + 133 + let cmd eio = 134 + let doc = "Find duplicates of a specific function." in 135 + let info = Cmd.info "find" ~doc in 136 + Cmd.v info 137 + Term.( 138 + const (fun () -> run_find eio) 139 + $ Common.setup $ Common.min_size $ Common.format_opt $ Common.top 140 + $ qualified_name $ search_paths)
+24
bin/cmd_hash.ml
··· 1 + open Cmdliner 2 + 3 + let run_hash input = 4 + match Common.parse_expr input with 5 + | Error msg -> 6 + Fmt.epr "Error: %s@." msg; 7 + Stdlib.exit 1 8 + | Ok expr -> 9 + let normalized = Dupfind.Normalize.apply expr in 10 + let h = Dupfind.Normalize.hash normalized in 11 + Fmt.pr "@[<v>AST: %a@,Hash: %s@]@." Dupfind.Normalize.pp_expr normalized 12 + h 13 + 14 + let hash_input = 15 + Arg.( 16 + required 17 + & pos 0 (some string) None 18 + & info [] ~docv:"EXPR" 19 + ~doc:"OCaml expression to hash (e.g. $(b,\"let f x = x + 1\")).") 20 + 21 + let cmd = 22 + let doc = "Show normalized AST and hash for an expression." in 23 + let info = Cmd.info "hash" ~doc in 24 + Cmd.v info Term.(const (fun () -> run_hash) $ Common.setup $ hash_input)
+23
bin/cmd_scan.ml
··· 1 + open Cmdliner 2 + 3 + let run_scan eio min_size no_intra format exclude top paths = 4 + let fs = Eio.Stdenv.fs eio in 5 + let files = Common.collect_files ~fs ~exclude paths in 6 + Common.Log.info (fun m -> m "Scanning %d files" (List.length files)); 7 + let progress = 8 + Tty.Progress.v ~enabled:(Tty.is_tty ()) ~total:(List.length files) 9 + "Scanning" 10 + in 11 + let index = Common.index_files ~min_size ~progress files in 12 + let intra = not no_intra in 13 + let clusters = Dupfind.Cluster.get index ~intra in 14 + Dupfind.Report.output ~format ~top clusters 15 + 16 + let cmd eio = 17 + let doc = "Scan directories for duplicate code." in 18 + let info = Cmd.info "scan" ~doc in 19 + Cmd.v info 20 + Term.( 21 + const (fun () -> run_scan eio) 22 + $ Common.setup $ Common.min_size $ Common.no_intra $ Common.format_opt 23 + $ Common.exclude $ Common.top $ Common.paths)
+64
bin/cmd_show.ml
··· 1 + open Cmdliner 2 + 3 + let run_show eio min_size hash_query paths = 4 + let fs = Eio.Stdenv.fs eio in 5 + let all_files = Common.collect_files ~fs ~exclude:[] paths in 6 + Common.Log.info (fun m -> m "Indexing %d files" (List.length all_files)); 7 + let progress = 8 + Tty.Progress.v ~enabled:(Tty.is_tty ()) ~total:(List.length all_files) 9 + "Indexing" 10 + in 11 + let index = Common.index_files ~min_size ~progress all_files in 12 + let matches = Dupfind.Index.get index hash_query in 13 + if matches = [] then ( 14 + Fmt.epr "No matches for hash %s.@." hash_query; 15 + Stdlib.exit 1) 16 + else begin 17 + let first = List.hd matches in 18 + let file = first.Dupfind.Fragment.location.file in 19 + let structure = Dupfind.Source.parse_file file in 20 + let bindings = Dupfind.Source.extract_bindings structure in 21 + let target = 22 + List.find_opt 23 + (fun (name, _, line) -> 24 + name = first.binding_name && line = first.location.line) 25 + bindings 26 + in 27 + (match target with 28 + | Some (_name, expr, _line) -> 29 + let normalized = Dupfind.Normalize.apply expr in 30 + Fmt.pr 31 + "@[<v>Hash: %s@,\ 32 + @,\ 33 + Normalized AST:@,\ 34 + \ %a@,\ 35 + @,\ 36 + Locations (%d matches):@]@." 37 + hash_query Dupfind.Normalize.pp_expr normalized (List.length matches) 38 + | None -> 39 + Fmt.pr "Hash: %s@.@.Locations (%d matches):@." hash_query 40 + (List.length matches)); 41 + List.iter 42 + (fun (f : Dupfind.Fragment.t) -> 43 + Fmt.pr " %s:%d %s@." f.location.file f.location.line f.binding_name) 44 + matches 45 + end 46 + 47 + let show_hash = 48 + Arg.( 49 + required 50 + & pos 0 (some string) None 51 + & info [] ~docv:"HASH" ~doc:"The hex hash of the cluster to show.") 52 + 53 + let show_paths = 54 + Arg.( 55 + value & pos_right 0 string [ "." ] 56 + & info [] ~docv:"DIR" ~doc:"Directories to scan.") 57 + 58 + let cmd eio = 59 + let doc = "Show normalized AST for a hash cluster." in 60 + let info = Cmd.info "show" ~doc in 61 + Cmd.v info 62 + Term.( 63 + const (fun () -> run_show eio) 64 + $ Common.setup $ Common.min_size $ show_hash $ show_paths)
+65
bin/cmd_similar.ml
··· 1 + open Cmdliner 2 + 3 + let run_similar eio min_size min_sub_size threshold no_intra format top paths = 4 + let fs = Eio.Stdenv.fs eio in 5 + let files = Common.collect_files ~fs ~exclude:[] paths in 6 + Common.Log.info (fun m -> m "Fingerprinting %d files" (List.length files)); 7 + let progress = 8 + Tty.Progress.v ~enabled:(Tty.is_tty ()) ~total:(List.length files) 9 + "Fingerprinting" 10 + in 11 + let entries = ref [] in 12 + List.iter 13 + (fun (_package, file) -> 14 + let path = Fpath.to_string file in 15 + let package = Dupfind.Discover.package_of_file ~root:(Fpath.v ".") file in 16 + let structure = Dupfind.Source.parse_file path in 17 + let bindings = Dupfind.Source.extract_bindings structure in 18 + List.iter 19 + (fun (name, expr, line) -> 20 + let size = Dupfind.Source.ast_size expr in 21 + if size >= min_size then begin 22 + let normalized = Dupfind.Normalize.apply expr in 23 + let hash = Dupfind.Normalize.hash normalized in 24 + let frag : Dupfind.Fragment.t = 25 + { 26 + hash; 27 + location = { file = Fpath.to_string file; line; package }; 28 + ast_size = size; 29 + binding_name = name; 30 + } 31 + in 32 + let sub = 33 + Dupfind.Normalize.sub_hashes ~min_size:min_sub_size normalized 34 + in 35 + let fp = Dupfind.Fingerprint.of_sub_hashes sub in 36 + entries := (frag, fp) :: !entries 37 + end) 38 + bindings; 39 + Tty.Progress.tick progress) 40 + files; 41 + Tty.Progress.finish progress; 42 + let intra = not no_intra in 43 + let pairs = Dupfind.Similar.get ~threshold ~intra !entries in 44 + Dupfind.Report.output_similar ~format ~top pairs 45 + 46 + let threshold = 47 + Arg.( 48 + value & opt float 0.7 49 + & info [ "threshold" ] ~docv:"FLOAT" 50 + ~doc:"Minimum Jaccard similarity (0.0-1.0, default 0.7).") 51 + 52 + let min_sub_size = 53 + Arg.( 54 + value & opt int 3 55 + & info [ "min-sub-size" ] ~docv:"N" 56 + ~doc:"Minimum sub-expression size for fingerprinting (default 3).") 57 + 58 + let cmd eio = 59 + let doc = "Find near-duplicate code via sub-expression fingerprinting." in 60 + let info = Cmd.info "similar" ~doc in 61 + Cmd.v info 62 + Term.( 63 + const (fun () -> run_similar eio) 64 + $ Common.setup $ Common.min_size $ min_sub_size $ threshold 65 + $ Common.no_intra $ Common.format_opt $ Common.top $ Common.paths)
+100
bin/common.ml
··· 1 + open Cmdliner 2 + 3 + let src = Logs.Src.create "dupfind" 4 + 5 + module Log = (val Logs.src_log src) 6 + 7 + let process_file ~min_size ~package file = 8 + let path = Fpath.to_string file in 9 + let structure = Dupfind.Source.parse_file path in 10 + let bindings = Dupfind.Source.extract_bindings structure in 11 + Log.debug (fun m -> m "parse %s: %d bindings" path (List.length bindings)); 12 + List.filter_map 13 + (fun (name, expr, line) -> 14 + let size = Dupfind.Source.ast_size expr in 15 + if size < min_size then None 16 + else 17 + let normalized = Dupfind.Normalize.apply expr in 18 + let hash = Dupfind.Normalize.hash normalized in 19 + Log.debug (fun m -> m " %s: size=%d hash=%s" name size hash); 20 + Some 21 + { 22 + Dupfind.Fragment.hash; 23 + location = { file = Fpath.to_string file; line; package }; 24 + ast_size = size; 25 + binding_name = name; 26 + }) 27 + bindings 28 + 29 + let collect_files ~fs ~exclude paths = 30 + List.concat_map 31 + (fun root -> 32 + let root = Fpath.v root in 33 + Dupfind.Discover.ml_files ~fs ~exclude root 34 + |> List.map (fun file -> 35 + let package = Dupfind.Discover.package_of_file ~root file in 36 + (package, file))) 37 + paths 38 + 39 + let index_files ~min_size ~progress files = 40 + let index = Dupfind.Index.v () in 41 + List.iter 42 + (fun (package, file) -> 43 + let fragments = process_file ~min_size ~package file in 44 + List.iter (Dupfind.Index.add index) fragments; 45 + Tty.Progress.tick progress) 46 + files; 47 + Tty.Progress.finish progress; 48 + index 49 + 50 + let err_parse s = Error (Fmt.str "cannot parse expression: %S" s) 51 + let err_syntax s = Error (Fmt.str "syntax error in: %S" s) 52 + let err_lexer s = Error (Fmt.str "lexer error in: %S" s) 53 + 54 + let parse_expr s = 55 + let lexbuf = Lexing.from_string s in 56 + match Parse.implementation lexbuf with 57 + | [ { Parsetree.pstr_desc = Pstr_eval (e, _); _ } ] -> Ok e 58 + | [ { Parsetree.pstr_desc = Pstr_value (_, [ vb ]); _ } ] -> Ok vb.pvb_expr 59 + | _ -> err_parse s 60 + | exception Syntaxerr.Error _ -> err_syntax s 61 + | exception Lexer.Error (_, _) -> err_lexer s 62 + 63 + let setup = Vlog.setup ~json_reporter:None "dupfind" 64 + 65 + let min_size = 66 + Arg.( 67 + value & opt int 30 68 + & info [ "min-size" ] ~docv:"N" ~doc:"Minimum AST node count for fragments.") 69 + 70 + let no_intra = 71 + Arg.( 72 + value & flag 73 + & info [ "no-intra" ] ~doc:"Only show cross-package duplicates.") 74 + 75 + let format_opt = 76 + let formats = 77 + [ ("cli", Dupfind.Report.Cli); ("json", Dupfind.Report.Json) ] 78 + in 79 + Arg.( 80 + value 81 + & opt (enum formats) Dupfind.Report.Cli 82 + & info [ "format" ] ~docv:"FORMAT" 83 + ~doc:"Output format: $(b,cli) or $(b,json).") 84 + 85 + let exclude = 86 + Arg.( 87 + value & opt_all string [] 88 + & info [ "exclude" ] ~docv:"PATTERN" 89 + ~doc:"Exclude paths matching glob pattern.") 90 + 91 + let top = 92 + Arg.( 93 + value 94 + & opt (some int) None 95 + & info [ "top" ] ~docv:"N" ~doc:"Show top N clone clusters.") 96 + 97 + let paths = 98 + Arg.( 99 + value & pos_all string [ "." ] 100 + & info [] ~docv:"DIR" ~doc:"Directories to scan.")
+7 -419
bin/main.ml
··· 1 1 open Cmdliner 2 2 3 - let src = Logs.Src.create "dupfind" 4 - 5 - module Log = (val Logs.src_log src) 6 - 7 - let process_file ~min_size ~package file = 8 - let path = Fpath.to_string file in 9 - let structure = Dupfind.Source.parse_file path in 10 - let bindings = Dupfind.Source.extract_bindings structure in 11 - Log.debug (fun m -> m "parse %s: %d bindings" path (List.length bindings)); 12 - List.filter_map 13 - (fun (name, expr, line) -> 14 - let size = Dupfind.Source.ast_size expr in 15 - if size < min_size then None 16 - else 17 - let normalized = Dupfind.Normalize.apply expr in 18 - let hash = Dupfind.Normalize.hash normalized in 19 - Log.debug (fun m -> m " %s: size=%d hash=%s" name size hash); 20 - Some 21 - { 22 - Dupfind.Fragment.hash; 23 - location = { file = Fpath.to_string file; line; package }; 24 - ast_size = size; 25 - binding_name = name; 26 - }) 27 - bindings 28 - 29 - let collect_files ~fs ~exclude paths = 30 - List.concat_map 31 - (fun root -> 32 - let root = Fpath.v root in 33 - Dupfind.Discover.ml_files ~fs ~exclude root 34 - |> List.map (fun file -> 35 - let package = Dupfind.Discover.package_of_file ~root file in 36 - (package, file))) 37 - paths 38 - 39 - let index_files ~min_size ~progress files = 40 - let index = Dupfind.Index.v () in 41 - List.iter 42 - (fun (package, file) -> 43 - let fragments = process_file ~min_size ~package file in 44 - List.iter (Dupfind.Index.add index) fragments; 45 - Tty.Progress.tick progress) 46 - files; 47 - Tty.Progress.finish progress; 48 - index 49 - 50 - (* scan subcommand *) 51 - 52 - let run_scan eio min_size no_intra format exclude top paths = 53 - let fs = Eio.Stdenv.fs eio in 54 - let files = collect_files ~fs ~exclude paths in 55 - Log.info (fun m -> m "Scanning %d files" (List.length files)); 56 - let progress = 57 - Tty.Progress.v ~enabled:(Tty.is_tty ()) ~total:(List.length files) 58 - "Scanning" 59 - in 60 - let index = index_files ~min_size ~progress files in 61 - let intra = not no_intra in 62 - let clusters = Dupfind.Cluster.get index ~intra in 63 - Dupfind.Report.output ~format ~top clusters 64 - 65 - (* find subcommand *) 66 - 67 - let resolve_qualified name = 68 - match String.rindex_opt name '.' with 69 - | None -> (None, name) 70 - | Some i -> 71 - let modname = String.sub name 0 i in 72 - let func = String.sub name (i + 1) (String.length name - i - 1) in 73 - (Some modname, func) 74 - 75 - let binding_in_file file mod_name func_name = 76 - let structure = Dupfind.Source.parse_file file in 77 - let bindings = Dupfind.Source.extract_bindings structure in 78 - List.find_opt 79 - (fun (name, _expr, _line) -> 80 - name = func_name 81 - && 82 - match mod_name with 83 - | None -> true 84 - | Some m -> 85 - let base = Filename.chop_extension (Filename.basename file) in 86 - String.capitalize_ascii base = m) 87 - bindings 88 - 89 - let err_parse s = Error (Fmt.str "cannot parse expression: %S" s) 90 - let err_syntax s = Error (Fmt.str "syntax error in: %S" s) 91 - let err_lexer s = Error (Fmt.str "lexer error in: %S" s) 92 - 93 - let parse_expr s = 94 - let lexbuf = Lexing.from_string s in 95 - match Parse.implementation lexbuf with 96 - | [ { Parsetree.pstr_desc = Pstr_eval (e, _); _ } ] -> Ok e 97 - | [ { Parsetree.pstr_desc = Pstr_value (_, [ vb ]); _ } ] -> Ok vb.pvb_expr 98 - | _ -> err_parse s 99 - | exception Syntaxerr.Error _ -> err_syntax s 100 - | exception Lexer.Error (_, _) -> err_lexer s 101 - 102 - let is_hex_hash s = 103 - String.length s = 32 104 - && String.for_all 105 - (fun c -> (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f')) 106 - s 107 - 108 - let is_inline_expr s = 109 - String.length s > 0 110 - && (s.[0] = '(' 111 - || String.starts_with ~prefix:"let " s 112 - || String.starts_with ~prefix:"fun " s 113 - || String.starts_with ~prefix:"function " s) 114 - 115 - let report_matches ~format ~top ~qualified_name expr matches target_hash = 116 - if List.length matches <= 1 then 117 - Fmt.pr "No duplicates found for %s.@." qualified_name 118 - else 119 - let cluster : Dupfind.Cluster.t = 120 - { 121 - hash = target_hash; 122 - fragments = matches; 123 - ast_size = Dupfind.Source.ast_size expr; 124 - count = List.length matches; 125 - packages = 126 - List.map (fun (f : Dupfind.Fragment.t) -> f.location.package) matches 127 - |> List.sort_uniq String.compare; 128 - } 129 - in 130 - Dupfind.Report.output ~format ~top [ cluster ] 131 - 132 - let report_hash_matches ~format ~top hash matches = 133 - if matches = [] then Fmt.pr "No matches for hash %s.@." hash 134 - else 135 - let ast_size = 136 - match matches with h :: _ -> h.Dupfind.Fragment.ast_size | [] -> 0 137 - in 138 - let cluster : Dupfind.Cluster.t = 139 - { 140 - hash; 141 - fragments = matches; 142 - ast_size; 143 - count = List.length matches; 144 - packages = 145 - List.map (fun (f : Dupfind.Fragment.t) -> f.location.package) matches 146 - |> List.sort_uniq String.compare; 147 - } 148 - in 149 - Dupfind.Report.output ~format ~top [ cluster ] 150 - 151 - let by_expr ~format ~top index query = 152 - let expr = 153 - match parse_expr query with Ok e -> e | Error msg -> Fmt.failwith "%s" msg 154 - in 155 - let normalized = Dupfind.Normalize.apply expr in 156 - let target_hash = Dupfind.Normalize.hash normalized in 157 - Fmt.pr "AST: %a@." Dupfind.Normalize.pp_expr normalized; 158 - Fmt.pr "Hash: %s@.@." target_hash; 159 - let matches = Dupfind.Index.get index target_hash in 160 - report_matches ~format ~top ~qualified_name:query expr matches target_hash 161 - 162 - let by_name ~format ~top index query all_files = 163 - let mod_name, func_name = resolve_qualified query in 164 - let target = 165 - List.find_map 166 - (fun (_package, file) -> 167 - binding_in_file (Fpath.to_string file) mod_name func_name) 168 - all_files 169 - in 170 - match target with 171 - | None -> 172 - Fmt.epr "Error: binding %S not found.@." query; 173 - Stdlib.exit 1 174 - | Some (_name, expr, _line) -> 175 - let normalized = Dupfind.Normalize.apply expr in 176 - let target_hash = Dupfind.Normalize.hash normalized in 177 - let matches = Dupfind.Index.get index target_hash in 178 - report_matches ~format ~top ~qualified_name:query expr matches target_hash 179 - 180 - let run_find eio min_size format top query paths = 181 - let fs = Eio.Stdenv.fs eio in 182 - let all_files = collect_files ~fs ~exclude:[] paths in 183 - Log.info (fun m -> m "Indexing %d files" (List.length all_files)); 184 - let progress = 185 - Tty.Progress.v ~enabled:(Tty.is_tty ()) ~total:(List.length all_files) 186 - "Indexing" 187 - in 188 - let index = index_files ~min_size ~progress all_files in 189 - if is_hex_hash query then 190 - report_hash_matches ~format ~top query (Dupfind.Index.get index query) 191 - else if is_inline_expr query then by_expr ~format ~top index query 192 - else by_name ~format ~top index query all_files 193 - 194 - (* similar subcommand *) 195 - 196 - let run_similar eio min_size min_sub_size threshold no_intra format top paths = 197 - let fs = Eio.Stdenv.fs eio in 198 - let files = collect_files ~fs ~exclude:[] paths in 199 - Log.info (fun m -> m "Fingerprinting %d files" (List.length files)); 200 - let progress = 201 - Tty.Progress.v ~enabled:(Tty.is_tty ()) ~total:(List.length files) 202 - "Fingerprinting" 203 - in 204 - let entries = ref [] in 205 - List.iter 206 - (fun (_package, file) -> 207 - let path = Fpath.to_string file in 208 - let package = Dupfind.Discover.package_of_file ~root:(Fpath.v ".") file in 209 - let structure = Dupfind.Source.parse_file path in 210 - let bindings = Dupfind.Source.extract_bindings structure in 211 - List.iter 212 - (fun (name, expr, line) -> 213 - let size = Dupfind.Source.ast_size expr in 214 - if size >= min_size then begin 215 - let normalized = Dupfind.Normalize.apply expr in 216 - let hash = Dupfind.Normalize.hash normalized in 217 - let frag : Dupfind.Fragment.t = 218 - { 219 - hash; 220 - location = { file = Fpath.to_string file; line; package }; 221 - ast_size = size; 222 - binding_name = name; 223 - } 224 - in 225 - let sub = 226 - Dupfind.Normalize.sub_hashes ~min_size:min_sub_size normalized 227 - in 228 - let fp = Dupfind.Fingerprint.of_sub_hashes sub in 229 - entries := (frag, fp) :: !entries 230 - end) 231 - bindings; 232 - Tty.Progress.tick progress) 233 - files; 234 - Tty.Progress.finish progress; 235 - let intra = not no_intra in 236 - let pairs = Dupfind.Similar.get ~threshold ~intra !entries in 237 - Dupfind.Report.output_similar ~format ~top pairs 238 - 239 - (* hash subcommand *) 240 - 241 - let run_hash input = 242 - match parse_expr input with 243 - | Error msg -> 244 - Fmt.epr "Error: %s@." msg; 245 - Stdlib.exit 1 246 - | Ok expr -> 247 - let normalized = Dupfind.Normalize.apply expr in 248 - let h = Dupfind.Normalize.hash normalized in 249 - Fmt.pr "@[<v>AST: %a@,Hash: %s@]@." Dupfind.Normalize.pp_expr normalized 250 - h 251 - 252 - (* show subcommand *) 253 - 254 - let run_show eio min_size hash_query paths = 255 - let fs = Eio.Stdenv.fs eio in 256 - let all_files = collect_files ~fs ~exclude:[] paths in 257 - Log.info (fun m -> m "Indexing %d files" (List.length all_files)); 258 - let progress = 259 - Tty.Progress.v ~enabled:(Tty.is_tty ()) ~total:(List.length all_files) 260 - "Indexing" 261 - in 262 - let index = index_files ~min_size ~progress all_files in 263 - let matches = Dupfind.Index.get index hash_query in 264 - if matches = [] then ( 265 - Fmt.epr "No matches for hash %s.@." hash_query; 266 - Stdlib.exit 1) 267 - else begin 268 - (* Show normalized AST from the first match *) 269 - let first = List.hd matches in 270 - let file = first.Dupfind.Fragment.location.file in 271 - let structure = Dupfind.Source.parse_file file in 272 - let bindings = Dupfind.Source.extract_bindings structure in 273 - let target = 274 - List.find_opt 275 - (fun (name, _, line) -> 276 - name = first.binding_name && line = first.location.line) 277 - bindings 278 - in 279 - (match target with 280 - | Some (_name, expr, _line) -> 281 - let normalized = Dupfind.Normalize.apply expr in 282 - Fmt.pr 283 - "@[<v>Hash: %s@,\ 284 - @,\ 285 - Normalized AST:@,\ 286 - \ %a@,\ 287 - @,\ 288 - Locations (%d matches):@]@." 289 - hash_query Dupfind.Normalize.pp_expr normalized (List.length matches) 290 - | None -> 291 - Fmt.pr "Hash: %s@.@.Locations (%d matches):@." hash_query 292 - (List.length matches)); 293 - List.iter 294 - (fun (f : Dupfind.Fragment.t) -> 295 - Fmt.pr " %s:%d %s@." f.location.file f.location.line f.binding_name) 296 - matches 297 - end 298 - 299 - (* Cmdliner terms *) 300 - 301 - let setup = Vlog.setup ~json_reporter:None "dupfind" 302 - 303 - let min_size = 304 - Arg.( 305 - value & opt int 30 306 - & info [ "min-size" ] ~docv:"N" ~doc:"Minimum AST node count for fragments.") 307 - 308 - let no_intra = 309 - Arg.( 310 - value & flag 311 - & info [ "no-intra" ] ~doc:"Only show cross-package duplicates.") 312 - 313 - let format_opt = 314 - let formats = 315 - [ ("cli", Dupfind.Report.Cli); ("json", Dupfind.Report.Json) ] 316 - in 317 - Arg.( 318 - value 319 - & opt (enum formats) Dupfind.Report.Cli 320 - & info [ "format" ] ~docv:"FORMAT" 321 - ~doc:"Output format: $(b,cli) or $(b,json).") 322 - 323 - let exclude = 324 - Arg.( 325 - value & opt_all string [] 326 - & info [ "exclude" ] ~docv:"PATTERN" 327 - ~doc:"Exclude paths matching glob pattern.") 328 - 329 - let top = 330 - Arg.( 331 - value 332 - & opt (some int) None 333 - & info [ "top" ] ~docv:"N" ~doc:"Show top N clone clusters.") 334 - 335 - let paths = 336 - Arg.( 337 - value & pos_all string [ "." ] 338 - & info [] ~docv:"DIR" ~doc:"Directories to scan.") 339 - 340 - let qualified_name = 341 - Arg.( 342 - required 343 - & pos 0 (some string) None 344 - & info [] ~docv:"QUERY" 345 - ~doc: 346 - "A qualified name (e.g. $(b,Aos.encode)), a hex hash, or an inline \ 347 - OCaml expression (e.g. $(b,\"let f x = x\")).") 348 - 349 - let search_paths = 350 - Arg.( 351 - value & pos_right 0 string [ "." ] 352 - & info [] ~docv:"DIR" ~doc:"Directories to scan.") 353 - 354 - let scan_cmd eio = 355 - let doc = "Scan directories for duplicate code." in 356 - let info = Cmd.info "scan" ~doc in 357 - Cmd.v info 358 - Term.( 359 - const (fun () -> run_scan eio) 360 - $ setup $ min_size $ no_intra $ format_opt $ exclude $ top $ paths) 361 - 362 - let search_cmd eio = 363 - let doc = "Find duplicates of a specific function." in 364 - let info = Cmd.info "find" ~doc in 365 - Cmd.v info 366 - Term.( 367 - const (fun () -> run_find eio) 368 - $ setup $ min_size $ format_opt $ top $ qualified_name $ search_paths) 369 - 370 - let threshold = 371 - Arg.( 372 - value & opt float 0.7 373 - & info [ "threshold" ] ~docv:"FLOAT" 374 - ~doc:"Minimum Jaccard similarity (0.0-1.0, default 0.7).") 375 - 376 - let min_sub_size = 377 - Arg.( 378 - value & opt int 3 379 - & info [ "min-sub-size" ] ~docv:"N" 380 - ~doc:"Minimum sub-expression size for fingerprinting (default 3).") 381 - 382 - let hash_input = 383 - Arg.( 384 - required 385 - & pos 0 (some string) None 386 - & info [] ~docv:"EXPR" 387 - ~doc:"OCaml expression to hash (e.g. $(b,\"let f x = x + 1\")).") 388 - 389 - let show_hash = 390 - Arg.( 391 - required 392 - & pos 0 (some string) None 393 - & info [] ~docv:"HASH" ~doc:"The hex hash of the cluster to show.") 394 - 395 - let show_paths = 396 - Arg.( 397 - value & pos_right 0 string [ "." ] 398 - & info [] ~docv:"DIR" ~doc:"Directories to scan.") 399 - 400 - let show_cmd eio = 401 - let doc = "Show normalized AST for a hash cluster." in 402 - let info = Cmd.info "show" ~doc in 403 - Cmd.v info 404 - Term.( 405 - const (fun () -> run_show eio) $ setup $ min_size $ show_hash $ show_paths) 406 - 407 - let hash_cmd = 408 - let doc = "Show normalized AST and hash for an expression." in 409 - let info = Cmd.info "hash" ~doc in 410 - Cmd.v info Term.(const (fun () -> run_hash) $ setup $ hash_input) 411 - 412 - let similar_cmd eio = 413 - let doc = "Find near-duplicate code via sub-expression fingerprinting." in 414 - let info = Cmd.info "similar" ~doc in 415 - Cmd.v info 416 - Term.( 417 - const (fun () -> run_similar eio) 418 - $ setup $ min_size $ min_sub_size $ threshold $ no_intra $ format_opt 419 - $ top $ paths) 420 - 421 3 let cmd eio = 422 4 let doc = "Duplicate code detection for OCaml." in 423 5 let info = Cmd.info "dupfind" ~version:"0.1.0" ~doc in 424 6 Cmd.group info 425 7 ~default:Term.(ret (const (`Help (`Pager, None)))) 426 - [ scan_cmd eio; search_cmd eio; show_cmd eio; hash_cmd; similar_cmd eio ] 8 + [ 9 + Cmd_scan.cmd eio; 10 + Cmd_find.cmd eio; 11 + Cmd_show.cmd eio; 12 + Cmd_hash.cmd; 13 + Cmd_similar.cmd eio; 14 + ] 427 15 428 16 let () = 429 17 Memtrace.trace_if_requested ();
+2 -1
dune-project
··· 27 27 fpath 28 28 bos 29 29 re 30 - json memtrace)) 30 + json memtrace 31 + loc))
+1
dupfind.opam
··· 22 22 "re" 23 23 "json" 24 24 "memtrace" 25 + "loc" 25 26 "odoc" {with-doc} 26 27 ] 27 28 build: [