Duplicate code detection across OCaml packages
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

fix(dupfind): handle parse errors gracefully, add Pexp_open support, tune defaults

- parse_expr returns Result instead of crashing on syntax errors
- Handle Pexp_open (local module opens like Arg.(...)) in normalize
- Change default min-size from 5 to 30 for more actionable results
- Add explicit --min-size 5 to cram tests for stability

+27 -17
+18 -9
bin/main.ml
@@ -89,9 +89,11 @@
 let parse_expr s =
   let lexbuf = Lexing.from_string s in
   match Parse.implementation lexbuf with
-  | [ { Parsetree.pstr_desc = Pstr_eval (e, _); _ } ] -> e
-  | [ { Parsetree.pstr_desc = Pstr_value (_, [ vb ]); _ } ] -> vb.pvb_expr
-  | _ -> Fmt.failwith "Cannot parse expression: %S" s
+  | [ { Parsetree.pstr_desc = Pstr_eval (e, _); _ } ] -> Ok e
+  | [ { Parsetree.pstr_desc = Pstr_value (_, [ vb ]); _ } ] -> Ok vb.pvb_expr
+  | _ -> Error (Fmt.str "cannot parse expression: %S" s)
+  | exception Syntaxerr.Error _ -> Error (Fmt.str "syntax error in: %S" s)
+  | exception Lexer.Error (_, _) -> Error (Fmt.str "lexer error in: %S" s)
 
 let is_hex_hash s =
   String.length s = 32
@@ -143,7 +145,9 @@
   Dupfind.Report.output ~format ~top [ cluster ]
 
 let find_by_expr ~format ~top index query =
-  let expr = parse_expr query in
+  let expr =
+    match parse_expr query with Ok e -> e | Error msg -> Fmt.failwith "%s" msg
+  in
   let normalized = Dupfind.Normalize.apply expr in
   let target_hash = Dupfind.Normalize.hash normalized in
   Fmt.pr "AST: %a@." Dupfind.Normalize.pp_expr normalized;
@@ -231,10 +235,15 @@
 (* hash subcommand *)
 
 let run_hash input =
-  let expr = parse_expr input in
-  let normalized = Dupfind.Normalize.apply expr in
-  let h = Dupfind.Normalize.hash normalized in
-  Fmt.pr "@[<v>AST: %a@,Hash: %s@]@." Dupfind.Normalize.pp_expr normalized h
+  match parse_expr input with
+  | Error msg ->
+      Fmt.epr "Error: %s@." msg;
+      Stdlib.exit 1
+  | Ok expr ->
+      let normalized = Dupfind.Normalize.apply expr in
+      let h = Dupfind.Normalize.hash normalized in
+      Fmt.pr "@[<v>AST: %a@,Hash: %s@]@." Dupfind.Normalize.pp_expr normalized
+        h
 
 (* Cmdliner terms *)
 
@@ -242,7 +251,7 @@
 
 let min_size =
   Arg.(
-    value & opt int 5
+    value & opt int 30
     & info [ "min-size" ] ~docv:"N" ~doc:"Minimum AST node count for fragments.")
 
 let no_intra =
+1
lib/normalize.ml
@@ -163,6 +163,7 @@
   | Pexp_lazy e -> E_lazy (convert_expr env e)
   | Pexp_try (e, cases) ->
       E_try (convert_expr env e, List.map (convert_case env) cases)
+  | Pexp_open (_, e) -> convert_expr env e
   | _ -> E_other
 
 and convert_param env (p : Parsetree.function_param) =
+4 -4
test/cram/inter.t
@@ -22,7 +22,7 @@
 
 Scan detects structurally identical functions after alpha-renaming:
 
-  $ dupfind scan pkg_a pkg_b
+  $ dupfind scan --min-size 5 pkg_a pkg_b
   ╭───┬──────┬───────┬──────────────────┬─────────╮
   │ # │ Size │ Count │ File             │ Binding │
   ├───┼──────┼───────┼──────────────────┼─────────┤
@@ -36,13 +36,13 @@
 
 With --no-intra, only cross-package duplicates are shown:
 
-  $ dupfind scan --no-intra pkg_a
+  $ dupfind scan --min-size 5 --no-intra pkg_a
   No clones found.
 
 
 Find subcommand:
 
-  $ dupfind find A.encode pkg_a pkg_b
+  $ dupfind find --min-size 5 A.encode pkg_a pkg_b
   ╭───┬──────┬───────┬──────────────────┬─────────╮
   │ # │ Size │ Count │ File             │ Binding │
   ├───┼──────┼───────┼──────────────────┼─────────┤
@@ -54,7 +54,7 @@
 
 
 
-  $ dupfind find A.unique_a pkg_a pkg_b
+  $ dupfind find --min-size 5 A.unique_a pkg_a pkg_b
   No duplicates found for A.unique_a.
 
 Hash subcommand shows normalized AST:
+4 -4
test/cram/similar.t
@@ -26,7 +26,7 @@
 
 Similar command finds near-duplicates (process/transform share 9 of 12 sub-expressions):
 
-  $ dupfind similar --threshold 0.5 pkg_a pkg_b
+  $ dupfind similar --min-size 5 --threshold 0.5 pkg_a pkg_b
   ╭───┬────────────┬────────────────────────────┬──────────────────────────┬────────╮
   │ # │ Similarity │ Left                       │ Right                    │ Shared │
   ├───┼────────────┼────────────────────────────┼──────────────────────────┼────────┤
@@ -39,7 +39,7 @@
 
 Exact duplicates (encode) are NOT reported by similar (they belong in scan):
 
-  $ dupfind scan pkg_a pkg_b | head -5
+  $ dupfind scan --min-size 5 pkg_a pkg_b | head -5
   ╭───┬──────┬───────┬──────────────────┬─────────╮
   │ # │ Size │ Count │ File             │ Binding │
   ├───┼──────┼───────┼──────────────────┼─────────┤
@@ -49,12 +49,12 @@
 
 High threshold filters out less-similar pairs:
 
-  $ dupfind similar --threshold 0.99 pkg_a pkg_b
+  $ dupfind similar --min-size 5 --threshold 0.99 pkg_a pkg_b
   No similar pairs found.
 
 
 Cross-package only filter:
 
-  $ dupfind similar --threshold 0.5 --no-intra pkg_a
+  $ dupfind similar --min-size 5 --threshold 0.5 --no-intra pkg_a
   No similar pairs found.
 