this repo has no description
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Add typeset_prose splits and augmentation improvements

Adds a prose-only synthetic split (typeset_prose_{train,val,test}), rendered
in handwriting-style fonts and containing no math delimiters. It counters the
prior bias P(Math | Handwritten) ≈ 1, which is learned from the real
handwriting datasets because they contain only math.

Also extends _augment() with three randomly applied steps: ElasticTransform
(baseline wobble, p=0.35), RandomPerspective (photographed-paper effect,
p=0.25), and a faint ruled-line overlay that simulates notebook paper (p=0.20).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

+131 -11
+3
data/.gitignore
··· 15 15 /typeset_mixed_train 16 16 /typeset_mixed_val 17 17 /typeset_mixed_test 18 + /typeset_prose_test 19 + /typeset_prose_train 20 + /typeset_prose_val
+6
data/typeset_prose_test.dvc
··· 1 + outs: 2 + - md5: 2fbe1559f3565093f631321c78ad6e7d.dir 3 + size: 55356034 4 + nfiles: 501 5 + hash: md5 6 + path: typeset_prose_test
+6
data/typeset_prose_train.dvc
··· 1 + outs: 2 + - md5: 7df77d9559943538754fa8fcec1d39cc.dir 3 + size: 643225173 4 + nfiles: 6001 5 + hash: md5 6 + path: typeset_prose_train
+6
data/typeset_prose_val.dvc
··· 1 + outs: 2 + - md5: 3ee69f81a366cae891f81d2e84106219.dir 3 + size: 53767934 4 + nfiles: 501 5 + hash: md5 6 + path: typeset_prose_val
+39 -3
src/data.py
··· 15 15 from pathlib import Path 16 16 17 17 from torch.utils.data import Dataset 18 - from PIL import Image, ImageFilter 18 + from PIL import Image, ImageDraw, ImageFilter 19 + from torchvision.transforms import ElasticTransform, RandomPerspective 19 20 import torchvision.transforms.functional as TF 20 21 21 22 DATA_ROOT = Path(__file__).parent.parent / "data" ··· 29 30 "mathwriting_synthetic", 30 31 "mathwriting_symbols", 31 32 "typeset_uniform_train", "typeset_mixed_train", 33 + "typeset_prose_train", 32 34 ] 33 35 VAL_SPLITS = [ 34 36 "mathwriting_val", 35 37 "typeset_uniform_val", "typeset_mixed_val", 38 + "typeset_prose_val", 36 39 ] 37 40 TEST_SPLITS = [ 38 41 "mathwriting_test", 39 42 "typeset_uniform_test", "typeset_mixed_test", 43 + "typeset_prose_test", 40 44 ] 41 45 42 46 # Splits whose manifest typst field is a bare math expression (no $ delimiters). ··· 60 64 _VAR_RE = re.compile(r"\b[a-zA-Z]\b") 61 65 _MAX_PER_KEY = 5 62 66 67 + _ELASTIC = ElasticTransform(alpha=30.0, sigma=6.0, fill=255) 68 + _PERSPECTIVE = RandomPerspective(distortion_scale=0.04, p=1.0, fill=255) 69 + 63 70 64 71 def _structural_key(typst: str) -> str: 65 72 """Normalize numbers and single-letter variables to expose structural pattern. 
··· 112 119 return records 113 120 114 121 122 + def _add_ruled_lines(img: Image.Image) -> Image.Image: 123 + """Overlay faint horizontal ruled lines simulating notebook paper.""" 124 + orig_mode = img.mode 125 + work = img.convert("RGBA") 126 + overlay = Image.new("RGBA", work.size, (255, 255, 255, 0)) 127 + draw = ImageDraw.Draw(overlay) 128 + spacing = random.randint(18, 28) 129 + opacity = random.randint(28, 55) 130 + for y in range(spacing, work.height, spacing): 131 + draw.line([(0, y), (work.width, y)], fill=(160, 160, 210, opacity), width=1) 132 + return Image.alpha_composite(work, overlay).convert(orig_mode) 133 + 134 + 115 135 def _augment(img: Image.Image) -> Image.Image: 116 136 """ 117 - Mild augmentation for synthetic-to-phone-photo robustness. 118 - Mirrors eff-mer's augmentation plus brightness/contrast jitter. 137 + Augmentation for synthetic-to-real-notes robustness. 138 + 139 + Geometric: affine (existing) + elastic deformation + mild perspective. 140 + Photometric: blur + brightness/contrast (existing) + optional ruled lines. 119 141 """ 120 142 angle = random.uniform(-5, 5) 121 143 scale = random.uniform(0.9, 1.1) ··· 123 145 ty = int(random.uniform(-0.05, 0.05) * img.height) 124 146 img = TF.affine(img, angle=angle, translate=(tx, ty), scale=scale, shear=0, fill=255) 125 147 148 + # Elastic deformation: smooth warp mimics baseline wobble without 149 + # corrupting subscript/superscript spatial relationships (sigma=6 keeps 150 + # the displacement field globally smooth). 151 + if random.random() < 0.35: 152 + img = _ELASTIC(img) 153 + 154 + # Mild perspective: simulates photographed note-paper rather than scanned. 
155 + if random.random() < 0.25: 156 + img = _PERSPECTIVE(img) 157 + 126 158 if random.random() < 0.6: 127 159 img = img.filter(ImageFilter.GaussianBlur(random.uniform(0.0, 0.8))) 128 160 129 161 img = TF.adjust_brightness(img, random.uniform(0.75, 1.25)) 130 162 img = TF.adjust_contrast(img, random.uniform(0.75, 1.25)) 163 + 164 + if random.random() < 0.20: 165 + img = _add_ruled_lines(img) 166 + 131 167 return img 132 168 133 169
+57
src/generate_mixed.py
··· 656 656 return _inline_seq(rng, n, require_math=True), "auto" 657 657 658 658 659 + # ── Prose-only body generation ──────────────────────────────────────────────── 660 + 661 + _HEADINGS = [ 662 + "Definition", "Theorem", "Proof", "Lemma", "Corollary", 663 + "Example", "Remark", "Note", "Observation", "Claim", 664 + "Proposition", "Notation", "Convention", "Exercise", 665 + ] 666 + 667 + 668 + def _prose_sentence(rng: random.Random) -> str: 669 + """One clause of mathematical prose; no $ delimiters.""" 670 + words: list[str] = [] 671 + if rng.random() < 0.5: 672 + words.append(rng.choice(_STARTERS)) 673 + words += rng.sample(_WORDS, k=rng.randint(2, 4)) 674 + # ~30%: mention a variable name as italic text, not math mode 675 + if rng.random() < 0.3: 676 + var = rng.choice(["x", "y", "f", "n", "g", "h", "T", "A", "V"]) 677 + pos = rng.randint(0, len(words)) 678 + words.insert(pos, f"_{var}_") 679 + if rng.random() < 0.4: 680 + words.append(rng.choice(_CONNECTORS)) 681 + words += rng.sample(_WORDS, k=rng.randint(1, 2)) 682 + return " ".join(words) 683 + 684 + 685 + def generate_prose_body(rng: random.Random) -> tuple[str, str]: 686 + """Body with no math delimiters; rendered in handwriting fonts for negative training.""" 687 + r = rng.random() 688 + width = rng.choice(_PARA_WIDTHS) 689 + if r < 0.20: 690 + body = " ".join(_prose_sentence(rng) for _ in range(rng.randint(2, 4))) 691 + return body, f"{width}pt" 692 + elif r < 0.40: 693 + paras = [ 694 + " ".join(_prose_sentence(rng) for _ in range(rng.randint(1, 3))) 695 + for _ in range(rng.randint(2, 4)) 696 + ] 697 + return "\n\n".join(paras), f"{width}pt" 698 + elif r < 0.58: 699 + marker = rng.choice(["- ", "+ "]) 700 + items = [f"{marker}{_prose_sentence(rng)}" for _ in range(rng.randint(3, 6))] 701 + return "\n".join(items), f"{width}pt" 702 + elif r < 0.74: 703 + heading = rng.choice(_HEADINGS) 704 + body = " ".join(_prose_sentence(rng) for _ in range(rng.randint(1, 3))) 705 + return f"*{heading}*\n\n{body}", 
f"{width}pt" 706 + elif r < 0.87: 707 + items = [f"{i + 1}. {_prose_sentence(rng)}" for i in range(rng.randint(3, 5))] 708 + return "\n".join(items), f"{width}pt" 709 + else: 710 + heading = rng.choice(_HEADINGS) 711 + marker = rng.choice(["- ", "+ "]) 712 + items = [f"{marker}{_prose_sentence(rng)}" for _ in range(rng.randint(2, 4))] 713 + return f"*{heading}*\n\n" + "\n".join(items), f"{width}pt" 714 + 715 + 659 716 # ── Rendering ───────────────────────────────────────────────────────────────── 660 717 661 718 # Page widths (pt) used for multi-paragraph bodies; sampled uniformly.
+11 -6
src/generate_typeset.py
··· 39 39 40 40 from tqdm import tqdm 41 41 42 - from .generate_mixed import generate_body 42 + from .generate_mixed import generate_body, generate_prose_body 43 43 44 44 45 45 # ── Font pool ───────────────────────────────────────────────────────────────── ··· 200 200 parser = argparse.ArgumentParser( 201 201 description="Generate handwriting-font structured math+text data." 202 202 ) 203 - parser.add_argument("--mode", choices=["uniform", "mixed"], default="uniform", 204 - help="uniform=one font per doc; mixed=per-block font sampling") 203 + parser.add_argument("--mode", choices=["uniform", "mixed", "prose"], default="uniform", 204 + help="uniform=one font per doc; mixed=per-block font sampling; prose=prose-only no math") 205 205 parser.add_argument("--count", type=int, default=15_000) 206 206 parser.add_argument("--out", default="data/hw_structured_train") 207 207 parser.add_argument("--jobs", type=int, default=4) ··· 241 241 with tqdm(total=args.count, unit="body") as pbar: 242 242 while len(candidates) < args.count: 243 243 attempts += 1 244 - body, page_width = generate_body(rng) 244 + if args.mode == "prose": 245 + body, page_width = generate_prose_body(rng) 246 + else: 247 + body, page_width = generate_body(rng) 245 248 246 249 if body in seen: 247 250 continue ··· 270 273 def _task( 271 274 clean_body: str, page_width: str, font: str, size: int, ink: str 272 275 ) -> tuple[str, str, bool, str]: 273 - if args.mode == "uniform": 276 + # prose renders as a uniform-font document (one font, whole page) 277 + if args.mode in ("uniform", "prose"): 274 278 render_body = clean_body 275 279 else: 276 280 render_body = _apply_mixed_fonts( ··· 282 286 h = hashlib.sha1(h_key.encode()).hexdigest()[:16] 283 287 out_path = img_dir / f"{h}.png" 284 288 289 + render_mode = "uniform" if args.mode == "prose" else args.mode 285 290 ok, err = _render( 286 291 render_body, out_path, page_width, 287 - mode=args.mode, font=font, font_dir=font_dir, size=size, ink=ink, 292 + 
mode=render_mode, font=font, font_dir=font_dir, size=size, ink=ink, 288 293 rng=rng, 289 294 ) 290 295 # Manifest always records the CLEAN body (no font directives).
+2 -2
src/static/index.html
··· 47 47 button#run:hover { background: #335533; } 48 48 button#run:disabled { opacity: 0.5; cursor: wait; } 49 49 #result { 50 - width: 512px; 50 + width: 1024px; 51 51 min-height: 2.5rem; 52 52 background: #111; 53 53 border: 1px solid #333; ··· 65 65 </head> 66 66 <body> 67 67 <h1>Typst Math OCR</h1> 68 - <canvas id="canvas" width="512" height="512"></canvas> 68 + <canvas id="canvas" width="1024" height="1024"></canvas> 69 69 <div class="controls"> 70 70 <button id="run">Run</button> 71 71 <button id="clear">Clear</button>
+1
src/train.py
··· 22 22 "crohme_gen_2019": 15_000, 23 23 "crohme_gen_syntactic": 15_000, 24 24 "mathwriting_train": 10_000, 25 + "typeset_prose_train": 5_000, 25 26 } 26 27 27 28