this repo has no description
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Add replace-dif script and apply d→dif label rewrites

Replaces `d` with `dif` (Typst differential operator) in two contexts:
1. Derivative fractions: (d A)/(d B) and operator form d/(d z)
2. Integral differentials: d-tokens following an `integral` keyword

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

+190 -14
+2 -2
data/crohme_gen_2019.dvc
··· 1 1 outs: 2 - - md5: 96bb848d0ff0aacb0ebb62067c7c50d9.dir 3 - size: 181407618 2 + - md5: d8a381cf883dabe8a8f131e70c0269d2.dir 3 + size: 181404734 4 4 nfiles: 51857 5 5 hash: md5 6 6 path: crohme_gen_2019
+2 -2
data/crohme_gen_2023.dvc
··· 1 1 outs: 2 - - md5: cdfd3e8c3130f36f3687eb02ae4490df.dir 3 - size: 11502682 2 + - md5: fbbb877913a7ed6b918467b4eebf3b2a.dir 3 + size: 11502787 4 4 nfiles: 3075 5 5 hash: md5 6 6 path: crohme_gen_2023
+2 -2
data/crohme_gen_syntactic.dvc
··· 1 1 outs: 2 - - md5: 7822a5ba28b6f3a8ceb9a779a00a9f2f.dir 3 - size: 181222780 2 + - md5: 5daecbc28ee8193fa863dba69b6f9855.dir 3 + size: 181221786 4 4 nfiles: 69399 5 5 hash: md5 6 6 path: crohme_gen_syntactic
+2 -2
data/mathwriting_synthetic.dvc
··· 1 1 outs: 2 - - md5: a24754333fad346800a1c29852f7bd11.dir 3 - size: 426140893 2 + - md5: 8c846987ce355e8d9da2227ee6bab4ee.dir 3 + size: 426144703 4 4 nfiles: 85964 5 5 hash: md5 6 6 path: mathwriting_synthetic
+2 -2
data/mathwriting_test.dvc
··· 1 1 outs: 2 - - md5: 376507f833503a110a82c310e3b35954.dir 3 - size: 22825064 2 + - md5: f2b7d8d3f2744f35598557df99829f7e.dir 3 + size: 22824247 4 4 nfiles: 5741 5 5 hash: md5 6 6 path: mathwriting_test
+2 -2
data/mathwriting_train.dvc
··· 1 1 outs: 2 - - md5: 77b3d0acf005b729d3b316f0f0954f4a.dir 3 - size: 544481338 2 + - md5: 0bb3ca50d114f4d12e58084cd3ea5c90.dir 3 + size: 544534742 4 4 nfiles: 143108 5 5 hash: md5 6 6 path: mathwriting_train
+2 -2
data/mathwriting_val.dvc
··· 1 1 outs: 2 - - md5: 7400f070567b7dbd19811469556938e4.dir 3 - size: 34595652 2 + - md5: 970529379a6cbf4863973fb8ec45127d.dir 3 + size: 34599918 4 4 nfiles: 9338 5 5 hash: md5 6 6 path: mathwriting_val
+1
pyproject.toml
··· 35 35 apply-edits = "src.apply_edits:main" 36 36 search-labels = "src.search_labels:main" 37 37 replace-frac = "src.replace_frac:main" 38 + replace-dif = "src.replace_dif:main" 38 39 eff-mer-evaluate = "src.eff_mer.infer:main" 39 40 40 41 [build-system]
+175
src/replace_dif.py
"""
Replace `d` → `dif` in two Typst differential contexts:

1. Derivative fractions:
     (d A) / (d B)          →  (dif A) / (dif B)
     (d^2 y) / (d x^2)      →  (dif^2 y) / (dif x^2)
     (d^{n} y) / (d x^n)    →  (dif^{n} y) / (dif x^n)

2. Integral differentials (d-token that follows an `integral` keyword):
     integral f(x) d x      →  integral f(x) dif x
     integral v f d^3 v     →  integral v f dif^3 v
     integral f(x) d_q x    →  integral f(x) dif_q x

Usage:
    uv run replace-dif [--split NAME] [--dry-run] [--limit N]
"""

import argparse
import json
import re
import shutil
from pathlib import Path

from .data import DATA_ROOT, TRAIN_SPLITS, VAL_SPLITS, TEST_SPLITS


def _all_splits():
    """Return every known split name: train splits, then val, then test."""
    return TRAIN_SPLITS + VAL_SPLITS + TEST_SPLITS


# Optional superscript/subscript modifier: ^2, _q, ^{n+1}, _{k=0}, …
_MOD = r'([_^](?:\{[^}]*\}|[^\s,()=+*/\\-]+))?'

# Pattern 1a: (d ...) / (d ...) — both sides parenthesised
# Groups: mod1, rest1, mod2, rest2
# Limitation: `[^)]+` stops at the first ')', so operands that themselves
# contain parentheses, e.g. (d f(x)) / (d x), are not matched.
_DERIV_RE = re.compile(
    r'\(d' + _MOD + r'(\s+[^)]+)\)'
    r'\s*/\s*'
    r'\(d' + _MOD + r'(\s+[^)]+)\)'
)

# Pattern 1b: bare d / (d ...) — operator notation, e.g. d/(d z) or d / (d^2 z)
# Groups: slash_span, mod_denom, rest_denom
_D_OVER_PAREN_D_RE = re.compile(
    r'\bd(\s*/\s*)\(d' + _MOD + r'(\s+[^)]+)\)'
)

# d-token as integral differential: standalone d (with optional modifier)
# followed by whitespace then a variable / open-paren / backslash command.
# The (?<!\^) lookbehind rejects a `d` that is itself a superscript, such as
# the upper limit in `integral_c^d f(x) d x`; `\b` alone lets it through
# because `^` is not a word character.  (`_d` is already rejected by `\b`,
# since `_` is a word character.)
_D_TOKEN_RE = re.compile(r'(?<!\^)\bd' + _MOD + r'(?=\s+[a-zA-Z(\\])')

# integral keyword
_INTEGRAL_RE = re.compile(r'\bintegral\b')

# Replacement templates.  An optional modifier group that did not participate
# in the match expands to the empty string.
# NOTE: the parenthesised form normalises the spacing around the slash to
# ' / ', whereas the operator form re-emits the original slash span (group 1).
_DERIV_TEMPLATE = r'(dif\g<1>\g<2>) / (dif\g<3>\g<4>)'
_D_OVER_PAREN_D_TEMPLATE = r'dif\g<1>(dif\g<2>\g<3>)'


def _replace_deriv(s: str) -> tuple[str, int]:
    """Rewrite derivative fractions in *s*.

    Handles both the parenthesised form ``(d A) / (d B)`` and the operator
    form ``d/(d z)``.

    Returns:
        ``(new_string, n)`` where ``n`` is the number of fractions rewritten
        (each fraction counts once, even though two ``d`` tokens change).
    """
    s, n_full = _DERIV_RE.subn(_DERIV_TEMPLATE, s)
    s, n_op = _D_OVER_PAREN_D_RE.subn(_D_OVER_PAREN_D_TEMPLATE, s)
    return s, n_full + n_op


def _replace_integral_d(s: str) -> tuple[str, int]:
    """Replace d-tokens that appear after an integral keyword.

    Every d-token located after the first ``integral`` keyword in *s* has its
    leading ``d`` turned into ``dif``; any modifier (``^3``, ``_q``) is kept.

    Returns:
        ``(new_string, n)`` where ``n`` is the number of d-tokens replaced.
    """
    first_integral = _INTEGRAL_RE.search(s)
    if first_integral is None:
        return s, 0

    # A d-token qualifies when at least one `integral` starts before it, i.e.
    # when it lies after the first `integral`; scanning from the end of that
    # keyword selects exactly those tokens.
    pieces = []
    cursor = 0
    replaced = 0
    for token in _D_TOKEN_RE.finditer(s, first_integral.end()):
        # token spans the full d-token (e.g. "d^3"); swap only the leading "d"
        pieces.append(s[cursor:token.start()])
        pieces.append('dif')
        cursor = token.start() + 1
        replaced += 1

    if not replaced:
        return s, 0

    pieces.append(s[cursor:])
    return ''.join(pieces), replaced


def replace_dif_all(s: str) -> tuple[str, int]:
    """Apply both replacement passes. Returns (new_string, n_replacements).

    The derivative pass runs first, then the integral-differential pass runs
    on its output.  Tokens already rewritten to ``dif`` are not matched again,
    because every pattern requires the ``d`` to be followed by a modifier,
    whitespace or a slash rather than by ``if``.
    """
    s, n_deriv = _replace_deriv(s)
    s, n_integral = _replace_integral_d(s)
    return s, n_deriv + n_integral


def main() -> None:
    """Command-line entry point: rewrite the ``typst`` field of manifests.

    For each selected split, reads ``DATA_ROOT/<split>/manifest.jsonl``,
    applies :func:`replace_dif_all` to every record's ``typst`` value and,
    unless ``--dry-run`` is given, copies the manifest to
    ``manifest.jsonl.bak`` and writes the updated manifest in place.
    Records without a replacement are written back byte-for-byte.
    """
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument('--split', default=None, help='Restrict to one split')
    parser.add_argument('--dry-run', action='store_true',
                        help='Print changes without writing')
    parser.add_argument('--limit', type=int, default=None, metavar='N',
                        help='In dry-run: stop after N changed records')
    args = parser.parse_args()

    splits = [args.split] if args.split else _all_splits()
    grand_total = 0
    shown = 0

    for split_name in splits:
        manifest = DATA_ROOT / split_name / 'manifest.jsonl'
        if not manifest.exists():
            # Missing manifests are skipped quietly when sweeping all splits,
            # but an explicitly requested split deserves a note.
            if args.split:
                print(f'[{split_name}] no manifest at {manifest}; nothing to do')
            continue

        # Iterate the file object rather than str.splitlines(): the latter
        # also splits on separators such as U+2028, which may appear
        # unescaped inside a JSON string.
        with manifest.open(encoding='utf-8') as fh:
            lines = [raw.rstrip('\n') for raw in fh]

        new_lines = []
        changed = 0

        for line in lines:
            if not line.strip():
                new_lines.append(line)
                continue
            rec = json.loads(line)
            orig = rec.get('typst', '')
            if isinstance(orig, str):
                new_t, n = replace_dif_all(orig)
            else:
                # Non-string labels (e.g. null) cannot be rewritten.
                new_t, n = orig, 0
            if not n:
                # Keep untouched records exactly as they were on disk.
                new_lines.append(line)
                continue

            changed += 1
            if args.dry_run and (args.limit is None or shown < args.limit):
                print(f'  [{split_name}]')
                print(f'    before: {orig!r}')
                print(f'    after:  {new_t!r}')
                shown += 1
            rec['typst'] = new_t
            new_lines.append(json.dumps(rec))

        if changed:
            print(f'[{split_name}] {changed} records would change'
                  if args.dry_run else
                  f'[{split_name}] {changed} records updated')
            if not args.dry_run:
                # Keep a copy of the previous manifest next to the new one.
                bak = manifest.with_suffix('.jsonl.bak')
                shutil.copy2(manifest, bak)
                manifest.write_text('\n'.join(new_lines) + '\n',
                                    encoding='utf-8')

        grand_total += changed

        if args.dry_run and args.limit is not None and shown >= args.limit:
            print(f'(stopped after {args.limit} shown)')
            break

    print(f'\nTotal: {grand_total} records')
    if args.dry_run:
        print('(dry run -- no files written)')
    else:
        print('Run dvc add + git commit to record changes.')


if __name__ == '__main__':
    main()