this repo has no description
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Add search-labels CLI for inspecting and bulk-replacing label patterns

uv run search-labels gt
uv run search-labels prime --replace "'" --dry-run
uv run search-labels gt --replace ">"

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

+115
+1
pyproject.toml
··· 33 33 probe-deepseek = "src.probe_deepseek:main" 34 34 review = "src.review_app:main" 35 35 apply-edits = "src.apply_edits:main" 36 + search-labels = "src.search_labels:main" 36 37 eff-mer-evaluate = "src.eff_mer.infer:main" 37 38 38 39 [build-system]
+114
src/search_labels.py
"""
Search and optionally rewrite labels across all manifests.

Usage:
    uv run search-labels gt                          # find labels containing 'gt'
    uv run search-labels prime                       # find labels containing 'prime'
    uv run search-labels gt --replace ">"            # replace whole-word 'gt' with '>'
    uv run search-labels prime --replace "'"         # replace 'prime' with '
    uv run search-labels gt --replace ">" --dry-run  # preview only
    uv run search-labels gt --replace ">" --split crohme_gen_2019

Quote shell metacharacters such as '>' -- unquoted, the shell treats them as
redirections and they never reach this program.
"""

import argparse
import json
import re
import shutil
from pathlib import Path

from .data import DATA_ROOT, TRAIN_SPLITS, VAL_SPLITS, TEST_SPLITS


def _all_splits():
    """Return TRAIN_SPLITS, VAL_SPLITS and TEST_SPLITS concatenated in that order."""
    return TRAIN_SPLITS + VAL_SPLITS + TEST_SPLITS


def _compile_word_pattern(pattern: str) -> re.Pattern[str]:
    """Compile *pattern* as a literal, whole-word regular expression.

    A word boundary is only enforced on an edge of *pattern* that is itself a
    word character.  Wrapping a symbol such as ``>`` in ``\\b...\\b`` would
    require word characters directly on both sides, so ``a > b`` would be
    missed; symbols are therefore matched wherever they occur.

    Raises:
        ValueError: if *pattern* is empty (it would match at every position).
    """
    if not pattern:
        raise ValueError("pattern must not be empty")
    # For a word-character edge these lookarounds are equivalent to \b.
    prefix = r"(?<!\w)" if re.match(r"\w", pattern[0]) else ""
    suffix = r"(?!\w)" if re.match(r"\w", pattern[-1]) else ""
    return re.compile(prefix + re.escape(pattern) + suffix)


def _read_lines(manifest: Path) -> list[str]:
    """Return the lines of *manifest* (UTF-8) without their line terminators.

    The file is iterated in text mode rather than split with
    ``str.splitlines()``: the latter also breaks on characters such as U+2028
    that may appear unescaped inside a JSON string, which would cut a record
    in half.
    """
    with manifest.open(encoding="utf-8") as fh:
        return [raw.rstrip("\n") for raw in fh]


def search(pattern: str, splits: list[str], context: int = 0) -> None:
    """Print every label that contains *pattern* as a whole word.

    For each split, ``DATA_ROOT / <split> / "manifest.jsonl"`` is read as JSON
    Lines and the ``"typst"`` value of each record is tested.  Splits without
    a manifest are skipped silently.

    Args:
        pattern: Literal text to look for (not a regular expression).
        splits: Names of the splits to scan.
        context: Accepted for interface compatibility; currently unused.

    Raises:
        ValueError: if *pattern* is empty.
    """
    rx = _compile_word_pattern(pattern)
    total = 0
    for split_name in splits:
        manifest = DATA_ROOT / split_name / "manifest.jsonl"
        if not manifest.exists():
            continue
        hits = []
        for line in _read_lines(manifest):
            if not line.strip():
                continue
            label = json.loads(line).get("typst", "")
            # A missing or non-string label cannot contain the pattern.
            if isinstance(label, str) and rx.search(label):
                hits.append(label)
        if hits:
            print(f"\n── {split_name} ({len(hits)} hits) ──")
            for label in hits:
                print(f" {label}")
            total += len(hits)
    print(f"\nTotal: {total} matches for '{pattern}'")


def replace(pattern: str, replacement: str, splits: list[str], dry_run: bool) -> None:
    """Replace whole-word occurrences of *pattern* in every ``"typst"`` label.

    For each split whose manifest contains at least one match, the manifest is
    copied to ``manifest.jsonl.bak`` (overwriting any earlier backup) and then
    rewritten.  Records without a match are written back byte-for-byte; only
    modified records are re-serialised with ``json.dumps``.

    Args:
        pattern: Literal text to look for (not a regular expression).
        replacement: Literal text to insert; backslashes and ``\\g<...>`` are
            NOT interpreted as regex template syntax.
        splits: Names of the splits to process.
        dry_run: If true, print each change and write nothing.

    Raises:
        ValueError: if *pattern* is empty.
    """
    rx = _compile_word_pattern(pattern)
    total_changed = 0

    def _literal(_match: re.Match[str]) -> str:
        # A callable bypasses re's template processing of the replacement.
        return replacement

    for split_name in splits:
        manifest = DATA_ROOT / split_name / "manifest.jsonl"
        if not manifest.exists():
            continue

        new_lines = []
        changed = 0
        for line in _read_lines(manifest):
            if not line.strip():
                new_lines.append(line)
                continue
            record = json.loads(line)
            label = record.get("typst", "")
            if not isinstance(label, str):
                new_lines.append(line)
                continue
            new_label, n = rx.subn(_literal, label)
            if not n:
                # Keep untouched records exactly as they were on disk.
                new_lines.append(line)
                continue
            changed += n
            if dry_run:
                print(f" [{split_name}] {label!r} -> {new_label!r}")
            record["typst"] = new_label
            new_lines.append(json.dumps(record))

        if changed:
            print(f"[{split_name}] {changed} replacements")
            if not dry_run:
                bak = manifest.with_suffix(".jsonl.bak")
                shutil.copy2(manifest, bak)
                manifest.write_text("\n".join(new_lines) + "\n", encoding="utf-8")

        total_changed += changed

    print(f"\nTotal: {total_changed} replacements")
    if dry_run:
        print("(dry run -- no files written)")
    else:
        print("Run dvc add + git commit to record changes.")


def main() -> None:
    """Command-line entry point: search, or replace when ``--replace`` is given."""
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("pattern", help="String to search for (whole-word match)")
    parser.add_argument("--replace", metavar="STR",
                        help="Replace matched pattern with this string")
    parser.add_argument("--dry-run", action="store_true",
                        help="With --replace: show changes without writing")
    parser.add_argument("--split", metavar="NAME",
                        help="Restrict to one split")
    args = parser.parse_args()

    if not args.pattern:
        parser.error("pattern must not be empty")

    splits = [args.split] if args.split else _all_splits()

    # An empty --replace string is valid (it deletes the match), so test for None.
    if args.replace is not None:
        replace(args.pattern, args.replace, splits, dry_run=args.dry_run)
    else:
        search(args.pattern, splits)


if __name__ == "__main__":
    main()