this repo has no description
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

search-labels: add --raw flag for literal substring patterns

Needed for patterns that contain non-word characters (e.g. ^prime), where the default \b word-boundary anchors do not match reliably.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

+24 -13
+24 -13
src/search_labels.py
··· 1 1 """ 2 2 Search and optionally rewrite labels across all manifests. 3 3 4 + By default the pattern matches whole words only (\b boundaries). 5 + Use --raw for literal substring match (needed when pattern contains 6 + non-word chars like ^). 7 + 4 8 Usage: 5 - uv run search-labels gt # find labels containing 'gt' 6 - uv run search-labels prime # find labels containing 'prime' 7 - uv run search-labels gt --replace > # replace whole-word 'gt' with '>' 8 - uv run search-labels prime --replace "'" # replace 'prime' with ' 9 - uv run search-labels gt --replace > --dry-run # preview only 10 - uv run search-labels gt --replace > --split crohme_gen_2019 9 + uv run search-labels gt # whole-word search 10 + uv run search-labels "^prime" --raw # literal substring 11 + uv run search-labels gt --replace ">" # whole-word replace 12 + uv run search-labels "^prime" --raw --replace "'" # literal replace 13 + uv run search-labels "^prime" --raw --replace "'" --dry-run 14 + uv run search-labels gt --replace ">" --split crohme_gen_2019 11 15 """ 12 16 13 17 import argparse ··· 23 27 return TRAIN_SPLITS + VAL_SPLITS + TEST_SPLITS 24 28 25 29 26 - def search(pattern: str, splits: list[str], context: int = 0) -> None: 27 - rx = re.compile(r"\b" + re.escape(pattern) + r"\b") 30 + def _compile(pattern: str, raw: bool) -> re.Pattern: 31 + escaped = re.escape(pattern) 32 + return re.compile(escaped if raw else r"\b" + escaped + r"\b") 33 + 34 + 35 + def search(pattern: str, splits: list[str], raw: bool = False) -> None: 36 + rx = _compile(pattern, raw) 28 37 total = 0 29 38 for split_name in splits: 30 39 manifest = DATA_ROOT / split_name / "manifest.jsonl" ··· 46 55 print(f"\nTotal: {total} matches for '{pattern}'") 47 56 48 57 49 - def replace(pattern: str, replacement: str, splits: list[str], dry_run: bool) -> None: 50 - rx = re.compile(r"\b" + re.escape(pattern) + r"\b") 58 + def replace(pattern: str, replacement: str, splits: list[str], dry_run: bool, raw: bool = False) -> None: 
59 + rx = _compile(pattern, raw) 51 60 total_changed = 0 52 61 53 62 for split_name in splits: ··· 93 102 parser = argparse.ArgumentParser( 94 103 description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter 95 104 ) 96 - parser.add_argument("pattern", help="String to search for (whole-word match)") 105 + parser.add_argument("pattern", help="String to search for") 106 + parser.add_argument("--raw", action="store_true", 107 + help="Literal substring match (default: whole-word)") 97 108 parser.add_argument("--replace", metavar="STR", 98 109 help="Replace matched pattern with this string") 99 110 parser.add_argument("--dry-run", action="store_true", ··· 105 116 splits = [args.split] if args.split else _all_splits() 106 117 107 118 if args.replace is not None: 108 - replace(args.pattern, args.replace, splits, dry_run=args.dry_run) 119 + replace(args.pattern, args.replace, splits, dry_run=args.dry_run, raw=args.raw) 109 120 else: 110 - search(args.pattern, splits) 121 + search(args.pattern, splits, raw=args.raw) 111 122 112 123 113 124 if __name__ == "__main__":