this repo has no description
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Regen typeset splits at 250 PPI; rename structured->uniform; strengthen prompt

- Bump render PPI 150->250 in generate_typeset.py; increase margins y:8->12pt
- Rename typeset_structured_* splits to typeset_uniform_* (mode=uniform vs mixed)
- Consolidate generate_handwritten.py into generate_typeset.py; drop dead
generate_mixed.py and generate_handwritten.py entrypoints
- Regenerate typeset_uniform (10k train) and typeset_mixed (20k train) at new PPI
- Strengthen PROMPT: require raw Typst output, explicitly forbid LaTeX

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

+658 -775
+8 -6
README.md
··· 124 124 uv run download-hw-fonts 125 125 126 126 # Generate hw splits 127 - uv run generate-hw --mode hw --count 15000 --out data/hw_structured_train 128 - uv run generate-hw --mode mix --count 10000 --out data/hw_mixed_train 129 - uv run generate-hw --mode hw --count 500 --out data/hw_structured_val --seed 100 130 - uv run generate-hw --mode mix --count 500 --out data/hw_mixed_val --seed 100 131 - uv run generate-hw --mode hw --count 500 --out data/hw_structured_test --seed 200 132 - uv run generate-hw --mode mix --count 500 --out data/hw_mixed_test --seed 200 127 + 128 + ``` 129 + uv run generate-typeset --mode uniform --count 10000 --out data/typeset_uniform_train 130 + uv run generate-typeset --mode uniform --count 500 --out data/typeset_uniform_val --seed 100 131 + uv run generate-typeset --mode uniform --count 500 --out data/typeset_uniform_test --seed 200 132 + uv run generate-typeset --mode mixed --count 20000 --out data/typeset_mixed_train 133 + uv run generate-typeset --mode mixed --count 500 --out data/typeset_mixed_val --seed 100 134 + uv run generate-typeset --mode mixed --count 500 --out data/typeset_mixed_test --seed 200 133 135 ``` 134 136 135 137 ### Train
+4 -4
data/.gitignore
··· 9 9 /mathwriting_symbols 10 10 /mathwriting_val 11 11 /mathwriting_test 12 - /typeset_mixed_test 12 + /typeset_uniform_train 13 + /typeset_uniform_val 14 + /typeset_uniform_test 13 15 /typeset_mixed_train 14 16 /typeset_mixed_val 15 - /typeset_structured_test 16 - /typeset_structured_train 17 - /typeset_structured_val 17 + /typeset_mixed_test
+2 -2
data/typeset_mixed_test.dvc
··· 1 1 outs: 2 - - md5: c1e3cf8782f435b3135f6918cfbb03c2.dir 3 - size: 32044732 2 + - md5: 24e917ef40fa0c4f323f11a7997f45b5.dir 3 + size: 60886374 4 4 nfiles: 501 5 5 hash: md5 6 6 path: typeset_mixed_test
+3 -3
data/typeset_mixed_train.dvc
··· 1 1 outs: 2 - - md5: 5cce037102c5c521ec574edfd0d562d3.dir 3 - size: 618883250 4 - nfiles: 10001 2 + - md5: 1251d930791048937e518c6bb931a531.dir 3 + size: 2404966441 4 + nfiles: 20001 5 5 hash: md5 6 6 path: typeset_mixed_train
+2 -2
data/typeset_mixed_val.dvc
··· 1 1 outs: 2 - - md5: f3c01f8cc4352d02b2c15fc4ab1a0815.dir 3 - size: 30922657 2 + - md5: 31b5d8765478c3f4d0512aec7a9f2a50.dir 3 + size: 60279129 4 4 nfiles: 501 5 5 hash: md5 6 6 path: typeset_mixed_val
-6
data/typeset_structured_test.dvc
··· 1 - outs: 2 - - md5: cae0dd567ebd51e42a63553200135d33.dir 3 - size: 17131859 4 - nfiles: 501 5 - hash: md5 6 - path: typeset_structured_test
-6
data/typeset_structured_train.dvc
··· 1 - outs: 2 - - md5: ac0f899cdc71e9e397b453b5a3574ed5.dir 3 - size: 506504324 4 - nfiles: 15001 5 - hash: md5 6 - path: typeset_structured_train
-6
data/typeset_structured_val.dvc
··· 1 - outs: 2 - - md5: c568259a74b9d0814763fb39d0db3855.dir 3 - size: 17717698 4 - nfiles: 501 5 - hash: md5 6 - path: typeset_structured_val
+6
data/typeset_uniform_test.dvc
··· 1 + outs: 2 + - md5: 65b5718d0ccbc7bac93ba5f9bdabb88e.dir 3 + size: 33164561 4 + nfiles: 501 5 + hash: md5 6 + path: typeset_uniform_test
+6
data/typeset_uniform_train.dvc
··· 1 + outs: 2 + - md5: 926406071435ac5af54990b674166766.dir 3 + size: 644650286 4 + nfiles: 10001 5 + hash: md5 6 + path: typeset_uniform_train
+6
data/typeset_uniform_val.dvc
··· 1 + outs: 2 + - md5: 85df052205572241341ed2390adc3486.dir 3 + size: 32904580 4 + nfiles: 501 5 + hash: md5 6 + path: typeset_uniform_val
-2
pyproject.toml
··· 33 33 train-hnm = "src.train_hnm:main" 34 34 export = "src.export:main" 35 35 generate-typeset = "src.generate_typeset:main" 36 - generate-mixed = "src.generate_mixed:main" 37 - generate-hw = "src.generate_handwritten:main" 38 36 download-hw-fonts = "src.download_hw_fonts:main" 39 37 probe = "src.probe:main" 40 38 app = "src.app:main"
+4 -4
src/data.py
··· 28 28 "mathwriting_train", 29 29 "mathwriting_synthetic", 30 30 "mathwriting_symbols", 31 - "typeset_structured_train", "typeset_mixed_train", 31 + "typeset_uniform_train", "typeset_mixed_train", 32 32 ] 33 33 VAL_SPLITS = [ 34 34 "mathwriting_val", 35 - "typeset_structured_val", "typeset_mixed_val", 35 + "typeset_uniform_val", "typeset_mixed_val", 36 36 ] 37 37 TEST_SPLITS = [ 38 38 "mathwriting_test", 39 - "typeset_structured_test", "typeset_mixed_test", 39 + "typeset_uniform_test", "typeset_mixed_test", 40 40 ] 41 41 42 42 # Splits whose manifest typst field is a bare math expression (no $ delimiters). ··· 49 49 "typeset_train", "typeset_val", "typeset_test", 50 50 } 51 51 52 - PROMPT = "Transcribe this image to Typst notation." 52 + PROMPT = "Transcribe this image to Typst notation. Output only the raw Typst, without explanation. No LaTeX, only Typst." 53 53 BASE_MODEL = "unsloth/gemma-4-E2B-it" 54 54 55 55
-328
src/generate_handwritten.py
··· 1 - """ 2 - Generate structured text+math documents rendered in handwriting-style fonts. 3 - 4 - Fills two training data gaps: 5 - 6 - hw -- all text rendered in a handwriting font; math stays typeset. 7 - Realistic: handwritten notes typically have careful notation. 8 - Target splits: hw_structured_{train,val,test} 9 - 10 - mix -- per-paragraph font mixing within the same document: each block 11 - independently gets a handwriting font or the default typeset font. 12 - Models the appearance of partially annotated / hybrid documents. 13 - Target splits: hw_mixed_{train,val,test} 14 - 15 - In both modes the manifest typst field records the CLEAN body (no font 16 - directives), matching the format of typeset_mixed_* splits so data.py can 17 - load them without changes. Add the new split names to TRAIN_SPLITS / 18 - VAL_SPLITS / TEST_SPLITS in data.py when ready to train. 19 - 20 - Prerequisites: 21 - uv run download-hw-fonts # fetch fonts into data/fonts/ 22 - 23 - Usage: 24 - uv run generate-hw --mode hw --count 15000 --out data/hw_structured_train 25 - uv run generate-hw --mode mix --count 10000 --out data/hw_mixed_train 26 - uv run generate-hw --mode hw --count 500 --out data/hw_structured_val --seed 100 27 - uv run generate-hw --mode mix --count 500 --out data/hw_mixed_val --seed 100 28 - uv run generate-hw --mode hw --count 500 --out data/hw_structured_test --seed 200 29 - uv run generate-hw --mode mix --count 500 --out data/hw_mixed_test --seed 200 30 - """ 31 - 32 - import argparse 33 - import hashlib 34 - import json 35 - import random 36 - import subprocess 37 - import tempfile 38 - from concurrent.futures import ThreadPoolExecutor, as_completed 39 - from pathlib import Path 40 - 41 - from tqdm import tqdm 42 - 43 - from .generate_mixed import generate_body 44 - 45 - 46 - # ── Font pool ───────────────────────────────────────────────────────────────── 47 - 48 - _DEFAULT_FONT_DIR = Path("data/fonts") 49 - 50 - # Maps the Typst family name (what Typst sees 
in font metadata) to the filename 51 - # slug used when checking whether the font was downloaded. 52 - _FONTS: dict[str, str] = { 53 - "Comic Neue": "ComicNeue-Regular.ttf", 54 - "Gochi Hand": "GochiHand-Regular.ttf", 55 - "Handlee": "Handlee-Regular.ttf", 56 - "Oswald": "Oswald-Regular.ttf", 57 - "Dancing Script": "DancingScript-Regular.ttf", 58 - "Special Elite": "SpecialElite-Regular.ttf", 59 - } 60 - 61 - # Font sizes (pt) sampled for diversity. Heavier/larger fonts look more 62 - # "blackboard-style"; smaller sizes give tighter notes appearance. 63 - _FONT_SIZES = [10, 10, 11, 11, 11, 12, 12, 13, 14] 64 - 65 - # Ink colours: mostly black, occasional dark blue / dark grey 66 - _INK_COLOURS = [ 67 - "#000000", "#000000", "#000000", "#000000", # 4/7 pure black 68 - "#1a1a1a", # near-black 69 - "#0d0d4d", # dark navy (pen ink) 70 - "#2b2b2b", # dark grey 71 - ] 72 - 73 - 74 - def _available_fonts(font_dir: Path) -> list[str]: 75 - """Return Typst family names whose TTF files exist in font_dir.""" 76 - if not font_dir.exists(): 77 - return [] 78 - present = {p.name for p in font_dir.iterdir()} 79 - return [family for family, fname in _FONTS.items() if fname in present] 80 - 81 - 82 - def _pick_font(rng: random.Random, available: list[str]) -> str | None: 83 - """Sample uniformly from available handwriting fonts + Typst default (None).""" 84 - pool = available + [None] # None -> New Computer Modern (Typst default) 85 - return rng.choice(pool) 86 - 87 - 88 - # ── Typst templates ─────────────────────────────────────────────────────────── 89 - 90 - # hw mode: override the text font for the whole document. 91 - # Math equations continue using Typst's built-in math fonts (realistic). 
92 - _TEMPLATE_HW = ( 93 - "#set page(width: {width}, height: auto, " 94 - "margin: (x: 10pt, y: 8pt), fill: white)\n" 95 - '#set text(font: ("{font}", "New Computer Modern"), ' 96 - "size: {size}pt, fill: rgb(\"{ink}\"), fallback: true)\n" 97 - "#set list(spacing: 1.2em)\n" 98 - "#set enum(spacing: 1.2em)\n" 99 - "{body}\n" 100 - ) 101 - 102 - # mix mode: default document font (typeset), individual blocks may be 103 - # wrapped in #text(font: ...) -- handled in _apply_mixed_fonts(). 104 - _TEMPLATE_MIX = ( 105 - "#set page(width: {width}, height: auto, " 106 - "margin: (x: 10pt, y: 8pt), fill: white)\n" 107 - "#set list(spacing: 1.0em)\n" 108 - "#set enum(spacing: 1.0em)\n" 109 - "{body}\n" 110 - ) 111 - 112 - 113 - # ── Block-level font mixing ─────────────────────────────────────────────────── 114 - 115 - def _apply_mixed_fonts(body: str, rng: random.Random, font: str, 116 - hw_prob: float = 0.5, size: int = 11, 117 - ink: str = "#000000") -> str: 118 - """ 119 - Wrap random paragraph-level blocks in a Typst scoped content block that 120 - overrides the text font. Returns a modified body for rendering only -- 121 - the label (manifest typst) records the original clean body. 122 - 123 - Blocks are delimited by double newlines (Typst paragraph breaks). Single- 124 - block bodies (inline sequences, simple tables) are treated as one unit. 125 - """ 126 - blocks = body.split("\n\n") 127 - 128 - hw_style = ( 129 - f'#set text(font: ("{font}", "New Computer Modern"), ' 130 - f'size: {size}pt, fill: rgb("{ink}"), fallback: true)' 131 - ) 132 - 133 - result: list[str] = [] 134 - for block in blocks: 135 - stripped = block.strip() 136 - if stripped and rng.random() < hw_prob: 137 - # Scoped content block: #[#set text(...); content] 138 - # This keeps list/table markup valid inside the block. 
139 - result.append(f"#[{hw_style}; {block}]") 140 - else: 141 - result.append(block) 142 - 143 - return "\n\n".join(result) 144 - 145 - 146 - # ── Rendering ───────────────────────────────────────────────────────────────── 147 - 148 - def _render( 149 - body: str, 150 - out_path: Path, 151 - page_width: str, 152 - mode: str, 153 - font: str | None, 154 - font_dir: Path, 155 - size: int, 156 - ink: str, 157 - rng: random.Random, 158 - ) -> tuple[bool, str]: 159 - """ 160 - Compile one sample to PNG. 161 - 162 - mode='hw' -- set hw font globally in the page header; body is used as-is. 163 - mode='mix' -- body may contain per-block #[#set text(...); ...] wrappers. 164 - """ 165 - if mode == "hw" and font is not None: 166 - src = _TEMPLATE_HW.format( 167 - width=page_width, font=font, size=size, ink=ink, body=body 168 - ) 169 - else: 170 - # font=None (default) or mix mode: use plain template 171 - src = _TEMPLATE_MIX.format(width=page_width, body=body) 172 - 173 - with tempfile.NamedTemporaryFile(suffix=".typ", mode="w", delete=False) as f: 174 - f.write(src) 175 - typ_path = Path(f.name) 176 - 177 - cmd = [ 178 - "typst", "compile", 179 - "--format", "png", 180 - "--ppi", "150", 181 - str(typ_path), 182 - str(out_path), 183 - ] 184 - if font is not None and font_dir.exists(): 185 - cmd += ["--font-path", str(font_dir.resolve())] 186 - 187 - try: 188 - result = subprocess.run(cmd, capture_output=True, timeout=15) 189 - return result.returncode == 0, result.stderr.decode(errors="replace") 190 - except subprocess.TimeoutExpired: 191 - return False, "timeout" 192 - except FileNotFoundError: 193 - return False, "typst not found" 194 - finally: 195 - typ_path.unlink(missing_ok=True) 196 - 197 - 198 - # ── Main ────────────────────────────────────────────────────────────────────── 199 - 200 - def main() -> None: 201 - parser = argparse.ArgumentParser( 202 - description="Generate handwriting-font structured math+text data." 
203 - ) 204 - parser.add_argument("--mode", choices=["hw", "mix"], default="hw", 205 - help="hw=whole doc handwritten; mix=per-block mixing") 206 - parser.add_argument("--count", type=int, default=15_000) 207 - parser.add_argument("--out", default="data/hw_structured_train") 208 - parser.add_argument("--jobs", type=int, default=4) 209 - parser.add_argument("--seed", type=int, default=42) 210 - parser.add_argument("--font-dir", default=str(_DEFAULT_FONT_DIR), 211 - help="Directory containing downloaded handwriting TTFs") 212 - parser.add_argument("--hw-prob", type=float, default=0.55, 213 - help="(mix mode) probability each block gets hw font") 214 - parser.add_argument("--show-failures", type=int, default=0, metavar="N") 215 - args = parser.parse_args() 216 - 217 - font_dir = Path(args.font_dir) 218 - available = _available_fonts(font_dir) 219 - if not available: 220 - print( 221 - f"No handwriting fonts found in {font_dir}.\n" 222 - "Run `uv run download-hw-fonts` first." 223 - ) 224 - raise SystemExit(1) 225 - 226 - print(f"Available fonts ({len(available)}): {', '.join(available)}") 227 - 228 - out = Path(args.out) 229 - img_dir = out / "images" 230 - img_dir.mkdir(parents=True, exist_ok=True) 231 - 232 - rng = random.Random(args.seed) 233 - 234 - # ── Phase 1: generate unique bodies ────────────────────────────────────── 235 - print(f"Generating {args.count:,} unique bodies (mode={args.mode}) ...") 236 - 237 - seen: set[str] = set() 238 - # (clean_body, page_width, font, size, ink) 239 - candidates: list[tuple[str, str, str, int, str]] = [] 240 - attempts = 0 241 - 242 - with tqdm(total=args.count, unit="body") as pbar: 243 - while len(candidates) < args.count: 244 - attempts += 1 245 - body, page_width = generate_body(rng) 246 - 247 - if body in seen: 248 - continue 249 - 250 - # mix mode: require at least two blocks so within-doc mixing is 251 - # meaningful (single-block bodies would always be fully hw or typeset). 
252 - if args.mode == "mix" and "\n\n" not in body: 253 - continue 254 - 255 - seen.add(body) 256 - font = _pick_font(rng, available) 257 - size = rng.choice(_FONT_SIZES) 258 - ink = rng.choice(_INK_COLOURS) 259 - candidates.append((body, page_width, font, size, ink)) 260 - pbar.update(1) 261 - 262 - print(f" {attempts:,} attempts ({attempts / len(candidates):.1f}x overhead)") 263 - 264 - # ── Phase 2: render ─────────────────────────────────────────────────────── 265 - print(f"Rendering {len(candidates):,} images with {args.jobs} workers ...") 266 - 267 - records: list[dict] = [] 268 - failures = 0 269 - shown_failures = 0 270 - 271 - def _task( 272 - clean_body: str, page_width: str, font: str, size: int, ink: str 273 - ) -> tuple[str, str, bool, str]: 274 - if args.mode == "hw": 275 - render_body = clean_body 276 - else: 277 - render_body = _apply_mixed_fonts( 278 - clean_body, rng, font, hw_prob=args.hw_prob, size=size, ink=ink 279 - ) 280 - 281 - # Hash over render body + font + size + ink + mode for uniqueness 282 - h_key = f"{args.mode}:{font}:{size}:{ink}:{page_width}:{render_body}" 283 - h = hashlib.sha1(h_key.encode()).hexdigest()[:16] 284 - out_path = img_dir / f"{h}.png" 285 - 286 - ok, err = _render( 287 - render_body, out_path, page_width, 288 - mode=args.mode, font=font, font_dir=font_dir, size=size, ink=ink, 289 - rng=rng, 290 - ) 291 - # Manifest always records the CLEAN body (no font directives). 
292 - return clean_body, f"images/{h}.png", ok, err, font 293 - 294 - with ThreadPoolExecutor(max_workers=args.jobs) as pool: 295 - futs = { 296 - pool.submit(_task, body, pw, font, size, ink): body 297 - for body, pw, font, size, ink in candidates 298 - } 299 - with tqdm(total=len(candidates), unit="img") as pbar: 300 - for fut in as_completed(futs): 301 - clean_body, rel_path, ok, err, used_font = fut.result() 302 - if ok: 303 - records.append({"image": rel_path, "typst": clean_body}) 304 - else: 305 - failures += 1 306 - if shown_failures < args.show_failures: 307 - tqdm.write(f"\n--- failure ---\nbody: {clean_body!r}\n{err.strip()}") 308 - shown_failures += 1 309 - pbar.update(1) 310 - 311 - # ── Phase 3: manifest ───────────────────────────────────────────────────── 312 - manifest = out / "manifest.jsonl" 313 - with manifest.open("w") as f: 314 - for r in records: 315 - f.write(json.dumps(r) + "\n") 316 - 317 - split_hint = out.name # e.g. hw_structured_train 318 - print(f"Wrote {len(records):,} records to {manifest} ({failures} render failures)") 319 - print( 320 - f"\nNext steps:\n" 321 - f" 1. Add '{split_hint}' to TRAIN_SPLITS in data.py\n" 322 - f" (it is NOT in _MATH_ONLY_SPLITS -- body already contains $ delimiters)\n" 323 - f" 2. Set a sampling cap in train.py if needed (e.g. 15k–20k)" 324 - ) 325 - 326 - 327 - if __name__ == "__main__": 328 - main()
+384 -1
src/generate_mixed.py
··· 25 25 26 26 from tqdm import tqdm 27 27 28 - from .generate_typeset import generate_expr 28 + 29 + # ── Grammar ─────────────────────────────────────────────────────────────────── 30 + 31 + _VARS = ["x", "y", "z", "n", "a", "b", "t", "k", "m", "i", "j", "r", "s", 32 + "u", "v", "f", "g", "h", "p", "q"] 33 + _GREEK = ["alpha", "beta", "theta", "lambda", "mu", "sigma", "pi", "phi", 34 + "omega", "epsilon", "delta", "gamma", "rho", "nu", "eta", "xi", "zeta", 35 + "Phi", "Psi", "Lambda", "Sigma", "Omega", "Delta", "Gamma", "Theta", 36 + # Alternate glyph variants (LaTeX \var* forms) 37 + "phi.alt", "epsilon.alt", "theta.alt"] 38 + _NUMS = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "100"] 39 + _FUNCS = ["sin", "cos", "tan", "ln", "log", "exp", "arcsin", "arccos", 40 + "arctan", "sinh", "cosh", "tanh", "Re", "Im", "det", "tr", "ker", 41 + "arg"] 42 + _RELS = ["=", "!=", "<=", ">=", "<", ">", "approx", "equiv", "tilde.eq", 43 + "prec", "succ"] 44 + _OPS = ["+", "-", "dot", "times"] 45 + _TO = ["0", "oo", "1", "-oo", "infinity"] 46 + 47 + # Blackboard bold (set/space names) 48 + _BLACKBOARD = ["RR", "ZZ", "NN", "QQ", "CC", "FF"] 49 + 50 + # Vector/decorator functions applied to atoms 51 + # Accent functions usable as f(atom) in Typst math 52 + _DECORATORS = ["arrow", "hat", "tilde", "overline", "bold", "underline"] 53 + # Time-derivative accents via accent(base, sym) -- separate pool 54 + _DOT_ACCENTS = ["dot", "dot.double", "dot.triple"] 55 + 56 + # Set-theoretic binary operators 57 + _SET_OPS = ["union", "inter", "without", "times"] 58 + 59 + # Physical vector fields (used with arrow() in Maxwell branches) 60 + _FIELDS = ["E", "B", "J", "D", "H", "F", "A", "v"] 61 + 62 + # Physical constants commonly appearing as subscripted atoms 63 + _PHYS_ATOMS = ["epsilon_0", "mu_0", "sigma", "rho", "J"] 64 + 65 + # Custom math operators (op("name") gives proper operatorname spacing/limits) 66 + _CUSTOM_OPS = ["rank", "span", "coker", "im", 67 + "supp", "diam", 
"vol", "codim", "sgn", "sign"] 68 + 69 + # Math spacing atoms (skipping med -- visually indistinct from thin at normal zoom) 70 + _SPACES = ["thin", "thick", "quad", "wide"] 71 + 72 + # Calligraphic / script letters (Typst: cal(P), cal(H), etc.) 73 + # Used for powerset cal(P), sigma-algebras cal(F), Hilbert spaces cal(H), Lagrangians cal(L), etc. 74 + _CALLIGRAPHIC = ["A", "B", "C", "F", "G", "H", "L", "M", "N", "P", "R", "S", "T"] 75 + 76 + 77 + def _polynomial(rng: random.Random) -> str: 78 + """Explicit polynomial in one variable: a x^d + ... + c, indexed, or monic general.""" 79 + # Capital letters give operator polynomials: p(T), q(A), etc. 80 + var = rng.choice(["x", "t", "z", "lambda", "T", "A", "D", "L", "X", "S"]) 81 + r = rng.random() 82 + if r < 0.45: 83 + # Explicit coefficients: a x^3 + b x^2 + c x + d 84 + degree = rng.randint(2, 4) 85 + pool = ["a", "b", "c", "p", "q", "alpha", "beta"] 86 + coeffs = rng.sample(pool, k=min(degree + 1, len(pool))) 87 + while len(coeffs) < degree + 1: 88 + coeffs.append(rng.choice(pool)) 89 + terms: list[str] = [] 90 + for k in range(degree, -1, -1): 91 + coeff = coeffs[degree - k] 92 + if k == 0: 93 + terms.append(coeff) 94 + elif k == 1: 95 + terms.append(f"{coeff} {var}") 96 + else: 97 + terms.append(f"{coeff} {var}^{k}") 98 + return " + ".join(terms) 99 + elif r < 0.75: 100 + # Indexed form: a_0 + a_1 x + dots.c + a_n x^n 101 + letter = rng.choice(["a", "b", "c"]) 102 + n_sym = rng.choice(["n", "m", "N"]) 103 + return f"{letter}_0 + {letter}_1 {var} + dots.c + {letter}_{n_sym} {var}^{n_sym}" 104 + else: 105 + # General monic: x^n + a_(n-1) x^(n-1) + dots.c + a_0 106 + n_sym = rng.choice(["n", "m"]) 107 + letter = rng.choice(["a", "b", "c"]) 108 + return (f"{var}^{n_sym} + {letter}_({n_sym}-1) {var}^({n_sym}-1) " 109 + f"+ dots.c + {letter}_0") 110 + 111 + 112 + def _schematic_matrix(rng: random.Random) -> str: 113 + """Generic n×m matrix with ellipsis dots showing structure.""" 114 + letter = rng.choice(["a", "b", 
"c", "m"]) 115 + row_sym = rng.choice(["m", "p", "r"]) 116 + col_sym = rng.choice(["n", "k", "q"]) 117 + r1 = f"{letter}_(1,1), dots.c, {letter}_(1,{col_sym})" 118 + r2 = "dots.v, dots.down, dots.v" 119 + r3 = f"{letter}_({row_sym},1), dots.c, {letter}_({row_sym},{col_sym})" 120 + return f"mat({r1}; {r2}; {r3})" 121 + 122 + 123 + def _atom(rng: random.Random) -> str: 124 + r = rng.random() 125 + if r < 0.36: 126 + return rng.choice(_VARS) 127 + elif r < 0.54: 128 + return rng.choice(_GREEK) 129 + elif r < 0.65: 130 + return rng.choice(_NUMS) 131 + elif r < 0.78: 132 + return rng.choice(_BLACKBOARD) 133 + elif r < 0.90: 134 + return rng.choice(_PHYS_ATOMS) 135 + else: 136 + return f"cal({rng.choice(_CALLIGRAPHIC)})" 137 + 138 + 139 + def _expr(rng: random.Random, depth: int = 0, max_depth: int = 3) -> str: 140 + if depth >= max_depth or (depth > 0 and rng.random() < 0.35): 141 + return _atom(rng) 142 + 143 + c = rng.random() 144 + 145 + # ── Core algebra / calculus ─────────────────────────────────────────────── 146 + 147 + if c < 0.09: 148 + return f"({_expr(rng, depth+1)}) / ({_expr(rng, depth+1)})" 149 + elif c < 0.15: 150 + return f"{_atom(rng)}^({_expr(rng, depth+1)})" 151 + elif c < 0.20: 152 + return f"{_atom(rng)}^{_atom(rng)}" 153 + elif c < 0.25: 154 + a, s, p = _atom(rng), _atom(rng), _atom(rng) 155 + return f"{a}_{s}^{p}" 156 + elif c < 0.30: 157 + v, lo, hi = _atom(rng), _atom(rng), _atom(rng) 158 + return f"sum_({v} = {lo})^{hi} {_expr(rng, depth+1)}" 159 + elif c < 0.35: 160 + lo, hi = _atom(rng), _atom(rng) 161 + dv = rng.choice(["x", "y", "t", "r"]) 162 + return f"integral_{lo}^{hi} {_expr(rng, depth+1)} dif {dv}" 163 + elif c < 0.39: 164 + return f"{rng.choice(_FUNCS)}({_expr(rng, depth+1)})" 165 + elif c < 0.43: 166 + return f"sqrt({_expr(rng, depth+1)})" 167 + elif c < 0.46: 168 + n = rng.randint(2, 5) 169 + return f"root({n}, {_expr(rng, depth+1)})" 170 + elif c < 0.50: 171 + v = rng.choice(_VARS) 172 + to = rng.choice(_TO) 173 + return 
f"lim_({v} -> {to}) {_expr(rng, depth+1)}" 174 + elif c < 0.53: 175 + return f"binom({_atom(rng)}, {_atom(rng)})" 176 + elif c < 0.59: 177 + op = rng.choice(_OPS) 178 + return f"({_expr(rng, depth+1)} {op} {_expr(rng, depth+1)})" 179 + elif c < 0.63: 180 + rel = rng.choice(_RELS) 181 + return f"{_expr(rng, depth+1)} {rel} {_expr(rng, depth+1)}" 182 + elif c < 0.66: 183 + v, lo, hi = _atom(rng), _atom(rng), _atom(rng) 184 + return f"product_({v} = {lo})^{hi} {_expr(rng, depth+1)}" 185 + elif c < 0.69: 186 + # Matrix / row-vector shapes; column vectors handled by vec() instead 187 + nrows, ncols = rng.choices( 188 + [(1, 2), (1, 3), (1, 4), (2, 2), (2, 3), (3, 2), (3, 3)], 189 + weights=[12, 12, 6, 20, 15, 10, 25], 190 + )[0] 191 + rows = [", ".join(_atom(rng) for _ in range(ncols)) 192 + for _ in range(nrows)] 193 + return f"mat({'; '.join(rows)})" 194 + elif c < 0.71: 195 + f_var = rng.choice(_VARS) 196 + x_var = rng.choice(_VARS) 197 + return f"(dif {f_var}) / (dif {x_var})" 198 + elif c < 0.73: 199 + f_var = rng.choice(_VARS) 200 + x_var = rng.choice(_VARS) 201 + return f"(partial {f_var}) / (partial {x_var})" 202 + elif c < 0.75: 203 + return _polynomial(rng) 204 + elif c < 0.76: 205 + return _schematic_matrix(rng) 206 + elif c < 0.79: 207 + return f"norm({_expr(rng, depth+1)})" 208 + 209 + # ── Logic / sequent calculus ────────────────────────────────────────────── 210 + 211 + elif c < 0.82: 212 + r2 = rng.random() 213 + A = _expr(rng, depth+1) 214 + if r2 < 0.25: 215 + # Sequent: Gamma ⊢ A 216 + ctx = rng.choice(["Gamma", "Delta", "Sigma"]) 217 + return f"({ctx} tack.r {A})" 218 + elif r2 < 0.50: 219 + # Propositional connective 220 + op = rng.choice(["and", "or", "=>", "<=>", "xor"]) 221 + return f"({A} {op} {_expr(rng, depth+1)})" 222 + elif r2 < 0.65: 223 + # Negation 224 + return f"not {A}" 225 + elif r2 < 0.78: 226 + # Semantic entailment / models 227 + return f"({A} models {_expr(rng, depth+1)})" 228 + elif r2 < 0.88: 229 + # Proposition with top/bot 
constant 230 + tb = rng.choice(["top", "bot"]) 231 + op = rng.choice(["=>", "<=>", "and", "or"]) 232 + return f"({A} {op} {tb})" 233 + else: 234 + # Type judgment: a : tau 235 + return f"({_atom(rng)} : {A})" 236 + 237 + # ── Decorators / vector notation ────────────────────────────────────────── 238 + 239 + elif c < 0.84: 240 + dec = rng.choice(_DECORATORS) 241 + return f"{dec}({_atom(rng)})" 242 + elif c < 0.86: 243 + # dot / dot.double / dot.triple accents (time derivatives) 244 + return f"accent({rng.choice(_VARS)}, {rng.choice(_DOT_ACCENTS)})" 245 + 246 + # ── Sets and types ──────────────────────────────────────────────────────── 247 + 248 + elif c < 0.865: 249 + # membership: x in RR, k in ZZ, ... 250 + return f"{_atom(rng)} in {rng.choice(_BLACKBOARD)}" 251 + elif c < 0.875: 252 + op = rng.choice(_SET_OPS) 253 + return f"({_expr(rng, depth+1)} {op} {_expr(rng, depth+1)})" 254 + 255 + # ── Vectors and annotated expressions ───────────────────────────────────── 256 + 257 + elif c < 0.885: 258 + # Column vector: vec(a, b) or vec(a, b, c) 259 + n = rng.choices([2, 3], weights=[3, 2])[0] 260 + return f"vec({', '.join(_atom(rng) for _ in range(n))})" 261 + elif c < 0.895: 262 + # Over/under brace, bracket, paren with annotation label 263 + fn = rng.choice(["underbrace", "overbrace", 264 + "underbracket", "overbracket", 265 + "underparen", "overparen"]) 266 + label = _atom(rng) if rng.random() < 0.6 else f"{_atom(rng)} + {_atom(rng)}" 267 + return f"{fn}({_expr(rng, depth+1)}, {label})" 268 + 269 + # ── Function signatures / arrows ────────────────────────────────────────── 270 + 271 + elif c < 0.915: 272 + # f: RR^n -> RR or f: A -> B or f: A times B -> C 273 + fname = rng.choice(_VARS) 274 + dom = rng.choice(_BLACKBOARD) 275 + cod = rng.choice(_BLACKBOARD) 276 + r2 = rng.random() 277 + if r2 < 0.35: 278 + exp = rng.choice(["n", "m", "k", "2", "3"]) 279 + return f"({fname}: {dom}^{exp} -> {cod})" 280 + elif r2 < 0.60: 281 + return f"({fname}: {dom} -> {cod})" 282 
+ elif r2 < 0.80: 283 + d2 = rng.choice(_BLACKBOARD) 284 + return f"({fname}: {dom} times {d2} -> {cod})" 285 + else: 286 + d2 = rng.choice(_BLACKBOARD) 287 + d3 = rng.choice(_BLACKBOARD) 288 + return f"({fname}: {dom} times {d2} times {d3} -> {cod})" 289 + 290 + # ── Logic / type theory ─────────────────────────────────────────────────── 291 + 292 + elif c < 0.935: 293 + q = rng.choice(["forall", "exists"]) 294 + v = rng.choice(_VARS) 295 + return f"({q} {v}, {_expr(rng, depth+1)})" 296 + elif c < 0.945: 297 + v = rng.choice(_VARS) 298 + return f"lambda {v}. {_expr(rng, depth+1)}" 299 + 300 + # ── Vector calculus / Maxwell ───────────────────────────────────────────── 301 + 302 + elif c < 0.96: 303 + # grad, div, curl, Laplacian -- nabla and text forms 304 + r2 = rng.random() 305 + vec = f"arrow({rng.choice(_FIELDS)})" 306 + if r2 < 0.20: 307 + return f"nabla dot {vec}" # divergence (nabla form) 308 + elif r2 < 0.40: 309 + return f"nabla times {vec}" # curl (nabla form) 310 + elif r2 < 0.55: 311 + return f"nabla^2 {_atom(rng)}" # Laplacian 312 + elif r2 < 0.68: 313 + return f"nabla {_expr(rng, depth+1)}" # gradient 314 + elif r2 < 0.78: 315 + return f'op("div") {vec}' # divergence (text form) 316 + elif r2 < 0.88: 317 + return f'op("curl") {vec}' # curl (text form) 318 + else: 319 + # mixed: e.g. 
(nabla dot E) = rho / epsilon_0 320 + lhs = f"nabla dot {vec}" 321 + rhs = _expr(rng, depth+1) 322 + return f"{lhs} = {rhs}" 323 + 324 + elif c < 0.97: 325 + # Maxwell integral form: closed line / surface / volume integrals 326 + r2 = rng.random() 327 + vec = rng.choice(_FIELDS) 328 + if r2 < 0.50: 329 + curve = rng.choice(["C", "partial S", "partial V"]) 330 + dl = rng.choice(["l", "s"]) 331 + return f"integral.cont_{curve} arrow({vec}) dot dif arrow({dl})" 332 + elif r2 < 0.85: 333 + surf = rng.choice(["S", "partial V", "partial Omega"]) 334 + return f"integral.surf_{surf} arrow({vec}) dot dif arrow(A)" 335 + else: 336 + vol = rng.choice(["V", "Omega"]) 337 + return f"integral.vol_{vol} {_expr(rng, depth+1)} dif V" 338 + 339 + # ── Miscellaneous useful constructs ─────────────────────────────────────── 340 + 341 + elif c < 0.974: 342 + fn = rng.choice(["abs", "floor", "ceil"]) 343 + return f"{fn}({_expr(rng, depth+1)})" 344 + elif c < 0.979: 345 + # piecewise / cases 346 + e1 = _expr(rng, depth+1) 347 + e2 = _expr(rng, depth+1) 348 + cond = f"{_atom(rng)} > {_atom(rng)}" 349 + return f'cases({e1} "if" {cond}, {e2} "otherwise")' 350 + elif c < 0.984: 351 + # Bra-ket / Dirac notation: ket |ψ⟩, bra ⟨ψ|, braket ⟨φ|ψ⟩, expectation ⟨A⟩ 352 + # lr() auto-sizes delimiters; | is a plain vertical bar in math 353 + r2 = rng.random() 354 + psi = _atom(rng) 355 + if r2 < 0.30: 356 + return f"lr(| {psi} chevron.r)" 357 + elif r2 < 0.60: 358 + return f"lr(chevron.l {psi} |)" 359 + elif r2 < 0.82: 360 + return f"lr(chevron.l {_atom(rng)} | {psi} chevron.r)" 361 + else: 362 + return f"lr(chevron.l {_expr(rng, depth+1)} chevron.r)" 363 + elif c < 0.988: 364 + # Intervals: lr() handles mismatched delimiters for half-open forms 365 + a, b = _atom(rng), _atom(rng) 366 + r2 = rng.random() 367 + if r2 < 0.25: 368 + return f"lr([{a}, {b}])" # closed [a, b] 369 + elif r2 < 0.50: 370 + return f"lr(({a}, {b}))" # open (a, b) 371 + elif r2 < 0.75: 372 + return f"lr([{a}, {b}))" # half-open 
[a, b) 373 + else: 374 + return f"lr(({a}, {b}])" # half-open (a, b] 375 + elif c < 0.993: 376 + # indexed sequence with ellipsis: (a_1, dots.c, a_n) 377 + # parens required -- bare commas break multi-arg contexts (sqrt, norm, etc.) 378 + v = rng.choice(_VARS) 379 + n = rng.choice(["n", "m", "N", "k"]) 380 + return f"({v}_1, dots.c, {v}_{n})" 381 + elif c < 0.995: 382 + # chevron bracket pair (inner product, type constructors) 383 + # outer parens prevent comma from being parsed as extra function arg 384 + return f"(chevron.l {_expr(rng, depth+1)}, {_expr(rng, depth+1)} chevron.r)" 385 + elif c < 0.9975: 386 + # custom math operators: op("rank")(x) gives proper operatorname spacing 387 + return f'op("{rng.choice(_CUSTOM_OPS)}")({_expr(rng, depth+1)})' 388 + elif c < 0.999: 389 + # calligraphic applied to expr: cal(P)(X) powerset, cal(H) Hilbert space, etc. 390 + return f"cal({rng.choice(['P', 'F', 'H', 'L', 'B'])})({_expr(rng, depth+1)})" 391 + return f"({v}_1, dots.c, {v}_{n})" 392 + elif c < 0.995: 393 + # chevron bracket pair (inner product, type constructors) 394 + # outer parens prevent comma from being parsed as extra function arg 395 + return f"(chevron.l {_expr(rng, depth+1)}, {_expr(rng, depth+1)} chevron.r)" 396 + elif c < 0.9975: 397 + # custom math operators: op("rank")(x) gives proper operatorname spacing 398 + return f'op("{rng.choice(_CUSTOM_OPS)}")({_expr(rng, depth+1)})' 399 + elif c < 0.999: 400 + # calligraphic applied to expr: cal(P)(X) powerset, cal(H) Hilbert space, etc. 
401 + return f"cal({rng.choice(['P', 'F', 'H', 'L', 'B'])})({_expr(rng, depth+1)})" 402 + elif c < 0.9999: 403 + # explicit math spacing between two sub-expressions 404 + sp = rng.choice(_SPACES) 405 + return f"{_atom(rng)} {sp} {_atom(rng)}" 406 + else: 407 + return _atom(rng) 408 + 409 + def generate_expr(rng: random.Random) -> str: 410 + return _expr(rng, depth=0, max_depth=3) 411 + 29 412 30 413 31 414 # ── Emoji pools ───────────────────────────────────────────────────────────────
+233 -405
src/generate_typeset.py
··· 1 1 """ 2 - Generate typeset math images from a recursive grammar. 2 + Generate structured text+math documents rendered in handwriting-style fonts. 3 3 4 - Renders diverse Typst math expressions to PNG via `typst compile` and writes 5 - a manifest.jsonl compatible with load_records(). 4 + Fills two training data gaps: 6 5 7 - After running, add "typeset" to TYPESET_SPLITS in data.py and mix into training. 6 + uniform -- whole document rendered in one randomly sampled font (6 custom 7 + fonts + Typst default); math stays typeset regardless of font. 8 + Target splits: typeset_uniform_{train,val,test} 8 9 9 - Usage: uv run generate-typeset [--count 8000] [--out data/typeset] 10 - [--jobs 4] [--seed 42] 10 + mixed -- per-paragraph font mixing: each block independently gets a 11 + randomly sampled font or the default typeset font. 12 + Target splits: typeset_mixed_{train,val,test} 13 + 14 + In both modes the manifest typst field records the CLEAN body (no font 15 + directives), matching the format of typeset_mixed_* splits so data.py can 16 + load them without changes. Add the new split names to TRAIN_SPLITS / 17 + VAL_SPLITS / TEST_SPLITS in data.py when ready to train. 
18 + 19 + Prerequisites: 20 + uv run download-hw-fonts # fetch fonts into data/fonts/ 21 + 22 + Usage: 23 + uv run generate-typeset --mode uniform --count 10000 --out data/typeset_uniform_train 24 + uv run generate-typeset --mode mixed --count 25000 --out data/typeset_mixed_train 25 + uv run generate-typeset --mode uniform --count 500 --out data/typeset_uniform_val --seed 100 26 + uv run generate-typeset --mode mixed --count 500 --out data/typeset_mixed_val --seed 100 27 + uv run generate-typeset --mode uniform --count 500 --out data/typeset_uniform_test --seed 200 28 + uv run generate-typeset --mode mixed --count 500 --out data/typeset_mixed_test --seed 200 11 29 """ 12 30 13 31 import argparse ··· 21 39 22 40 from tqdm import tqdm 23 41 24 - from .data import _structural_key 25 - 26 - 27 - # ── Grammar ─────────────────────────────────────────────────────────────────── 28 - 29 - _VARS = ["x", "y", "z", "n", "a", "b", "t", "k", "m", "i", "j", "r", "s", 30 - "u", "v", "f", "g", "h", "p", "q"] 31 - _GREEK = ["alpha", "beta", "theta", "lambda", "mu", "sigma", "pi", "phi", 32 - "omega", "epsilon", "delta", "gamma", "rho", "nu", "eta", "xi", "zeta", 33 - "Phi", "Psi", "Lambda", "Sigma", "Omega", "Delta", "Gamma", "Theta", 34 - # Alternate glyph variants (LaTeX \var* forms) 35 - "phi.alt", "epsilon.alt", "theta.alt"] 36 - _NUMS = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "100"] 37 - _FUNCS = ["sin", "cos", "tan", "ln", "log", "exp", "arcsin", "arccos", 38 - "arctan", "sinh", "cosh", "tanh", "Re", "Im", "det", "tr", "ker", 39 - "arg"] 40 - _RELS = ["=", "!=", "<=", ">=", "<", ">", "approx", "equiv", "tilde.eq", 41 - "prec", "succ"] 42 - _OPS = ["+", "-", "dot", "times"] 43 - _TO = ["0", "oo", "1", "-oo", "infinity"] 42 + from .generate_mixed import generate_body 44 43 45 - # Blackboard bold (set/space names) 46 - _BLACKBOARD = ["RR", "ZZ", "NN", "QQ", "CC", "FF"] 47 44 48 - # Vector/decorator functions applied to atoms 49 - # Accent functions usable as f(atom) in Typst math 50 - _DECORATORS = ["arrow", "hat", "tilde", 
"overline", "bold", "underline"] 51 - # Time-derivative accents via accent(base, sym) -- separate pool 52 - _DOT_ACCENTS = ["dot", "dot.double", "dot.triple"] 53 - 54 - # Set-theoretic binary operators 55 - _SET_OPS = ["union", "inter", "without", "times"] 45 + # ── Font pool ───────────────────────────────────────────────────────────────── 56 46 57 - # Physical vector fields (used with arrow() in Maxwell branches) 58 - _FIELDS = ["E", "B", "J", "D", "H", "F", "A", "v"] 47 + _DEFAULT_FONT_DIR = Path("data/fonts") 59 48 60 - # Physical constants commonly appearing as subscripted atoms 61 - _PHYS_ATOMS = ["epsilon_0", "mu_0", "sigma", "rho", "J"] 49 + # Maps the Typst family name (what Typst sees in font metadata) to the filename 50 + # slug used when checking whether the font was downloaded. 51 + _FONTS: dict[str, str] = { 52 + "Comic Neue": "ComicNeue-Regular.ttf", 53 + "Gochi Hand": "GochiHand-Regular.ttf", 54 + "Handlee": "Handlee-Regular.ttf", 55 + "Oswald": "Oswald-Regular.ttf", 56 + "Dancing Script": "DancingScript-Regular.ttf", 57 + "Special Elite": "SpecialElite-Regular.ttf", 58 + } 62 59 63 - # Custom math operators (op("name") gives proper operatorname spacing/limits) 64 - _CUSTOM_OPS = ["rank", "span", "coker", "im", 65 - "supp", "diam", "vol", "codim", "sgn", "sign"] 60 + # Font sizes (pt) sampled for diversity. Heavier/larger fonts look more 61 + # "blackboard-style"; smaller sizes give tighter notes appearance. 62 + _FONT_SIZES = [10, 10, 11, 11, 11, 12, 12, 13, 14] 66 63 67 - # Math spacing atoms (skipping med -- visually indistinct from thin at normal zoom) 68 - _SPACES = ["thin", "thick", "quad", "wide"] 64 + # Ink colours: mostly black, occasional dark blue / dark grey 65 + _INK_COLOURS = [ 66 + "#000000", "#000000", "#000000", "#000000", # 4/7 pure black 67 + "#1a1a1a", # near-black 68 + "#0d0d4d", # dark navy (pen ink) 69 + "#2b2b2b", # dark grey 70 + ] 69 71 70 - # Calligraphic / script letters (Typst: cal(P), cal(H), etc.) 
71 - # Used for powerset cal(P), sigma-algebras cal(F), Hilbert spaces cal(H), Lagrangians cal(L), etc. 72 - _CALLIGRAPHIC = ["A", "B", "C", "F", "G", "H", "L", "M", "N", "P", "R", "S", "T"] 73 72 73 + def _available_fonts(font_dir: Path) -> list[str]: 74 + """Return Typst family names whose TTF files exist in font_dir.""" 75 + if not font_dir.exists(): 76 + return [] 77 + present = {p.name for p in font_dir.iterdir()} 78 + return [family for family, fname in _FONTS.items() if fname in present] 74 79 75 - def _polynomial(rng: random.Random) -> str: 76 - """Explicit polynomial in one variable: a x^d + ... + c, indexed, or monic general.""" 77 - # Capital letters give operator polynomials: p(T), q(A), etc. 78 - var = rng.choice(["x", "t", "z", "lambda", "T", "A", "D", "L", "X", "S"]) 79 - r = rng.random() 80 - if r < 0.45: 81 - # Explicit coefficients: a x^3 + b x^2 + c x + d 82 - degree = rng.randint(2, 4) 83 - pool = ["a", "b", "c", "p", "q", "alpha", "beta"] 84 - coeffs = rng.sample(pool, k=min(degree + 1, len(pool))) 85 - while len(coeffs) < degree + 1: 86 - coeffs.append(rng.choice(pool)) 87 - terms: list[str] = [] 88 - for k in range(degree, -1, -1): 89 - coeff = coeffs[degree - k] 90 - if k == 0: 91 - terms.append(coeff) 92 - elif k == 1: 93 - terms.append(f"{coeff} {var}") 94 - else: 95 - terms.append(f"{coeff} {var}^{k}") 96 - return " + ".join(terms) 97 - elif r < 0.75: 98 - # Indexed form: a_0 + a_1 x + dots.c + a_n x^n 99 - letter = rng.choice(["a", "b", "c"]) 100 - n_sym = rng.choice(["n", "m", "N"]) 101 - return f"{letter}_0 + {letter}_1 {var} + dots.c + {letter}_{n_sym} {var}^{n_sym}" 102 - else: 103 - # General monic: x^n + a_(n-1) x^(n-1) + dots.c + a_0 104 - n_sym = rng.choice(["n", "m"]) 105 - letter = rng.choice(["a", "b", "c"]) 106 - return (f"{var}^{n_sym} + {letter}_({n_sym}-1) {var}^({n_sym}-1) " 107 - f"+ dots.c + {letter}_0") 108 80 81 + def _pick_font(rng: random.Random, available: list[str]) -> str | None: 82 + """Sample uniformly from 
available handwriting fonts + Typst default (None).""" 83 + pool = available + [None] # None -> New Computer Modern (Typst default) 84 + return rng.choice(pool) 109 85 110 - def _schematic_matrix(rng: random.Random) -> str: 111 - """Generic n×m matrix with ellipsis dots showing structure.""" 112 - letter = rng.choice(["a", "b", "c", "m"]) 113 - row_sym = rng.choice(["m", "p", "r"]) 114 - col_sym = rng.choice(["n", "k", "q"]) 115 - r1 = f"{letter}_(1,1), dots.c, {letter}_(1,{col_sym})" 116 - r2 = "dots.v, dots.down, dots.v" 117 - r3 = f"{letter}_({row_sym},1), dots.c, {letter}_({row_sym},{col_sym})" 118 - return f"mat({r1}; {r2}; {r3})" 119 86 87 + # ── Typst templates ─────────────────────────────────────────────────────────── 120 88 121 - def _atom(rng: random.Random) -> str: 122 - r = rng.random() 123 - if r < 0.36: 124 - return rng.choice(_VARS) 125 - elif r < 0.54: 126 - return rng.choice(_GREEK) 127 - elif r < 0.65: 128 - return rng.choice(_NUMS) 129 - elif r < 0.78: 130 - return rng.choice(_BLACKBOARD) 131 - elif r < 0.90: 132 - return rng.choice(_PHYS_ATOMS) 133 - else: 134 - return f"cal({rng.choice(_CALLIGRAPHIC)})" 135 - 136 - 137 - def _expr(rng: random.Random, depth: int = 0, max_depth: int = 3) -> str: 138 - if depth >= max_depth or (depth > 0 and rng.random() < 0.35): 139 - return _atom(rng) 140 - 141 - c = rng.random() 142 - 143 - # ── Core algebra / calculus ─────────────────────────────────────────────── 144 - 145 - if c < 0.09: 146 - return f"({_expr(rng, depth+1)}) / ({_expr(rng, depth+1)})" 147 - elif c < 0.15: 148 - return f"{_atom(rng)}^({_expr(rng, depth+1)})" 149 - elif c < 0.20: 150 - return f"{_atom(rng)}^{_atom(rng)}" 151 - elif c < 0.25: 152 - a, s, p = _atom(rng), _atom(rng), _atom(rng) 153 - return f"{a}_{s}^{p}" 154 - elif c < 0.30: 155 - v, lo, hi = _atom(rng), _atom(rng), _atom(rng) 156 - return f"sum_({v} = {lo})^{hi} {_expr(rng, depth+1)}" 157 - elif c < 0.35: 158 - lo, hi = _atom(rng), _atom(rng) 159 - dv = rng.choice(["x", "y", 
"t", "r"]) 160 - return f"integral_{lo}^{hi} {_expr(rng, depth+1)} dif {dv}" 161 - elif c < 0.39: 162 - return f"{rng.choice(_FUNCS)}({_expr(rng, depth+1)})" 163 - elif c < 0.43: 164 - return f"sqrt({_expr(rng, depth+1)})" 165 - elif c < 0.46: 166 - n = rng.randint(2, 5) 167 - return f"root({n}, {_expr(rng, depth+1)})" 168 - elif c < 0.50: 169 - v = rng.choice(_VARS) 170 - to = rng.choice(_TO) 171 - return f"lim_({v} -> {to}) {_expr(rng, depth+1)}" 172 - elif c < 0.53: 173 - return f"binom({_atom(rng)}, {_atom(rng)})" 174 - elif c < 0.59: 175 - op = rng.choice(_OPS) 176 - return f"({_expr(rng, depth+1)} {op} {_expr(rng, depth+1)})" 177 - elif c < 0.63: 178 - rel = rng.choice(_RELS) 179 - return f"{_expr(rng, depth+1)} {rel} {_expr(rng, depth+1)}" 180 - elif c < 0.66: 181 - v, lo, hi = _atom(rng), _atom(rng), _atom(rng) 182 - return f"product_({v} = {lo})^{hi} {_expr(rng, depth+1)}" 183 - elif c < 0.69: 184 - # Matrix / row-vector shapes; column vectors handled by vec() instead 185 - nrows, ncols = rng.choices( 186 - [(1, 2), (1, 3), (1, 4), (2, 2), (2, 3), (3, 2), (3, 3)], 187 - weights=[12, 12, 6, 20, 15, 10, 25], 188 - )[0] 189 - rows = [", ".join(_atom(rng) for _ in range(ncols)) 190 - for _ in range(nrows)] 191 - return f"mat({'; '.join(rows)})" 192 - elif c < 0.71: 193 - f_var = rng.choice(_VARS) 194 - x_var = rng.choice(_VARS) 195 - return f"(dif {f_var}) / (dif {x_var})" 196 - elif c < 0.73: 197 - f_var = rng.choice(_VARS) 198 - x_var = rng.choice(_VARS) 199 - return f"(partial {f_var}) / (partial {x_var})" 200 - elif c < 0.75: 201 - return _polynomial(rng) 202 - elif c < 0.76: 203 - return _schematic_matrix(rng) 204 - elif c < 0.79: 205 - return f"norm({_expr(rng, depth+1)})" 206 - 207 - # ── Logic / sequent calculus ────────────────────────────────────────────── 208 - 209 - elif c < 0.82: 210 - r2 = rng.random() 211 - A = _expr(rng, depth+1) 212 - if r2 < 0.25: 213 - # Sequent: Gamma ⊢ A 214 - ctx = rng.choice(["Gamma", "Delta", "Sigma"]) 215 - return 
f"({ctx} tack.r {A})" 216 - elif r2 < 0.50: 217 - # Propositional connective 218 - op = rng.choice(["and", "or", "=>", "<=>", "xor"]) 219 - return f"({A} {op} {_expr(rng, depth+1)})" 220 - elif r2 < 0.65: 221 - # Negation 222 - return f"not {A}" 223 - elif r2 < 0.78: 224 - # Semantic entailment / models 225 - return f"({A} models {_expr(rng, depth+1)})" 226 - elif r2 < 0.88: 227 - # Proposition with top/bot constant 228 - tb = rng.choice(["top", "bot"]) 229 - op = rng.choice(["=>", "<=>", "and", "or"]) 230 - return f"({A} {op} {tb})" 231 - else: 232 - # Type judgment: a : tau 233 - return f"({_atom(rng)} : {A})" 234 - 235 - # ── Decorators / vector notation ────────────────────────────────────────── 236 - 237 - elif c < 0.84: 238 - dec = rng.choice(_DECORATORS) 239 - return f"{dec}({_atom(rng)})" 240 - elif c < 0.86: 241 - # dot / dot.double / dot.triple accents (time derivatives) 242 - return f"accent({rng.choice(_VARS)}, {rng.choice(_DOT_ACCENTS)})" 243 - 244 - # ── Sets and types ──────────────────────────────────────────────────────── 245 - 246 - elif c < 0.865: 247 - # membership: x in RR, k in ZZ, ... 
248 - return f"{_atom(rng)} in {rng.choice(_BLACKBOARD)}" 249 - elif c < 0.875: 250 - op = rng.choice(_SET_OPS) 251 - return f"({_expr(rng, depth+1)} {op} {_expr(rng, depth+1)})" 252 - 253 - # ── Vectors and annotated expressions ───────────────────────────────────── 254 - 255 - elif c < 0.885: 256 - # Column vector: vec(a, b) or vec(a, b, c) 257 - n = rng.choices([2, 3], weights=[3, 2])[0] 258 - return f"vec({', '.join(_atom(rng) for _ in range(n))})" 259 - elif c < 0.895: 260 - # Over/under brace, bracket, paren with annotation label 261 - fn = rng.choice(["underbrace", "overbrace", 262 - "underbracket", "overbracket", 263 - "underparen", "overparen"]) 264 - label = _atom(rng) if rng.random() < 0.6 else f"{_atom(rng)} + {_atom(rng)}" 265 - return f"{fn}({_expr(rng, depth+1)}, {label})" 266 - 267 - # ── Function signatures / arrows ────────────────────────────────────────── 268 - 269 - elif c < 0.915: 270 - # f: RR^n -> RR or f: A -> B or f: A times B -> C 271 - fname = rng.choice(_VARS) 272 - dom = rng.choice(_BLACKBOARD) 273 - cod = rng.choice(_BLACKBOARD) 274 - r2 = rng.random() 275 - if r2 < 0.35: 276 - exp = rng.choice(["n", "m", "k", "2", "3"]) 277 - return f"({fname}: {dom}^{exp} -> {cod})" 278 - elif r2 < 0.60: 279 - return f"({fname}: {dom} -> {cod})" 280 - elif r2 < 0.80: 281 - d2 = rng.choice(_BLACKBOARD) 282 - return f"({fname}: {dom} times {d2} -> {cod})" 283 - else: 284 - d2 = rng.choice(_BLACKBOARD) 285 - d3 = rng.choice(_BLACKBOARD) 286 - return f"({fname}: {dom} times {d2} times {d3} -> {cod})" 89 + # hw mode: override the text font for the whole document. 90 + # Math equations continue using Typst's built-in math fonts (realistic). 
91 + _TEMPLATE_HW = ( 92 + "#set page(width: {width}, height: auto, " 93 + "margin: (x: 12pt, y: 12pt), fill: white)\n" 94 + '#set text(font: ("{font}", "New Computer Modern"), ' 95 + "size: {size}pt, fill: rgb(\"{ink}\"), fallback: true)\n" 96 + "#set list(spacing: 1.2em)\n" 97 + "#set enum(spacing: 1.2em)\n" 98 + "{body}\n" 99 + ) 287 100 288 - # ── Logic / type theory ─────────────────────────────────────────────────── 101 + # mix mode: default document font (typeset), individual blocks may be 102 + # wrapped in #text(font: ...) -- handled in _apply_mixed_fonts(). 103 + _TEMPLATE_MIX = ( 104 + "#set page(width: {width}, height: auto, " 105 + "margin: (x: 12pt, y: 12pt), fill: white)\n" 106 + "#set list(spacing: 1.0em)\n" 107 + "#set enum(spacing: 1.0em)\n" 108 + "{body}\n" 109 + ) 289 110 290 - elif c < 0.935: 291 - q = rng.choice(["forall", "exists"]) 292 - v = rng.choice(_VARS) 293 - return f"({q} {v}, {_expr(rng, depth+1)})" 294 - elif c < 0.945: 295 - v = rng.choice(_VARS) 296 - return f"lambda {v}. {_expr(rng, depth+1)}" 297 111 298 - # ── Vector calculus / Maxwell ───────────────────────────────────────────── 112 + # ── Block-level font mixing ─────────────────────────────────────────────────── 299 113 300 - elif c < 0.96: 301 - # grad, div, curl, Laplacian -- nabla and text forms 302 - r2 = rng.random() 303 - vec = f"arrow({rng.choice(_FIELDS)})" 304 - if r2 < 0.20: 305 - return f"nabla dot {vec}" # divergence (nabla form) 306 - elif r2 < 0.40: 307 - return f"nabla times {vec}" # curl (nabla form) 308 - elif r2 < 0.55: 309 - return f"nabla^2 {_atom(rng)}" # Laplacian 310 - elif r2 < 0.68: 311 - return f"nabla {_expr(rng, depth+1)}" # gradient 312 - elif r2 < 0.78: 313 - return f'op("div") {vec}' # divergence (text form) 314 - elif r2 < 0.88: 315 - return f'op("curl") {vec}' # curl (text form) 316 - else: 317 - # mixed: e.g. 
(nabla dot E) = rho / epsilon_0 318 - lhs = f"nabla dot {vec}" 319 - rhs = _expr(rng, depth+1) 320 - return f"{lhs} = {rhs}" 114 + def _apply_mixed_fonts(body: str, rng: random.Random, font: str, 115 + hw_prob: float = 0.5, size: int = 11, 116 + ink: str = "#000000") -> str: 117 + """ 118 + Wrap random paragraph-level blocks in a Typst scoped content block that 119 + overrides the text font. Returns a modified body for rendering only -- 120 + the label (manifest typst) records the original clean body. 321 121 322 - elif c < 0.97: 323 - # Maxwell integral form: closed line / surface / volume integrals 324 - r2 = rng.random() 325 - vec = rng.choice(_FIELDS) 326 - if r2 < 0.50: 327 - curve = rng.choice(["C", "partial S", "partial V"]) 328 - dl = rng.choice(["l", "s"]) 329 - return f"integral.cont_{curve} arrow({vec}) dot dif arrow({dl})" 330 - elif r2 < 0.85: 331 - surf = rng.choice(["S", "partial V", "partial Omega"]) 332 - return f"integral.surf_{surf} arrow({vec}) dot dif arrow(A)" 333 - else: 334 - vol = rng.choice(["V", "Omega"]) 335 - return f"integral.vol_{vol} {_expr(rng, depth+1)} dif V" 122 + Blocks are delimited by double newlines (Typst paragraph breaks). Single- 123 + block bodies (inline sequences, simple tables) are treated as one unit. 
124 + """ 125 + blocks = body.split("\n\n") 336 126 337 - # ── Miscellaneous useful constructs ─────────────────────────────────────── 127 + hw_style = ( 128 + f'#set text(font: ("{font}", "New Computer Modern"), ' 129 + f'size: {size}pt, fill: rgb("{ink}"), fallback: true)' 130 + ) 338 131 339 - elif c < 0.974: 340 - fn = rng.choice(["abs", "floor", "ceil"]) 341 - return f"{fn}({_expr(rng, depth+1)})" 342 - elif c < 0.979: 343 - # piecewise / cases 344 - e1 = _expr(rng, depth+1) 345 - e2 = _expr(rng, depth+1) 346 - cond = f"{_atom(rng)} > {_atom(rng)}" 347 - return f'cases({e1} "if" {cond}, {e2} "otherwise")' 348 - elif c < 0.984: 349 - # Bra-ket / Dirac notation: ket |ψ⟩, bra ⟨ψ|, braket ⟨φ|ψ⟩, expectation ⟨A⟩ 350 - # lr() auto-sizes delimiters; | is a plain vertical bar in math 351 - r2 = rng.random() 352 - psi = _atom(rng) 353 - if r2 < 0.30: 354 - return f"lr(| {psi} chevron.r)" 355 - elif r2 < 0.60: 356 - return f"lr(chevron.l {psi} |)" 357 - elif r2 < 0.82: 358 - return f"lr(chevron.l {_atom(rng)} | {psi} chevron.r)" 359 - else: 360 - return f"lr(chevron.l {_expr(rng, depth+1)} chevron.r)" 361 - elif c < 0.988: 362 - # Intervals: lr() handles mismatched delimiters for half-open forms 363 - a, b = _atom(rng), _atom(rng) 364 - r2 = rng.random() 365 - if r2 < 0.25: 366 - return f"lr([{a}, {b}])" # closed [a, b] 367 - elif r2 < 0.50: 368 - return f"lr(({a}, {b}))" # open (a, b) 369 - elif r2 < 0.75: 370 - return f"lr([{a}, {b}))" # half-open [a, b) 132 + result: list[str] = [] 133 + for block in blocks: 134 + stripped = block.strip() 135 + if stripped and rng.random() < hw_prob: 136 + # Scoped content block: #[#set text(...); content] 137 + # This keeps list/table markup valid inside the block. 
138 + result.append(f"#[{hw_style}; {block}]") 371 139 else: 372 - return f"lr(({a}, {b}])" # half-open (a, b] 373 - elif c < 0.993: 374 - # indexed sequence with ellipsis: (a_1, dots.c, a_n) 375 - # parens required -- bare commas break multi-arg contexts (sqrt, norm, etc.) 376 - v = rng.choice(_VARS) 377 - n = rng.choice(["n", "m", "N", "k"]) 378 - return f"({v}_1, dots.c, {v}_{n})" 379 - elif c < 0.995: 380 - # chevron bracket pair (inner product, type constructors) 381 - # outer parens prevent comma from being parsed as extra function arg 382 - return f"(chevron.l {_expr(rng, depth+1)}, {_expr(rng, depth+1)} chevron.r)" 383 - elif c < 0.9975: 384 - # custom math operators: op("rank")(x) gives proper operatorname spacing 385 - return f'op("{rng.choice(_CUSTOM_OPS)}")({_expr(rng, depth+1)})' 386 - elif c < 0.999: 387 - # calligraphic applied to expr: cal(P)(X) powerset, cal(H) Hilbert space, etc. 388 - return f"cal({rng.choice(['P', 'F', 'H', 'L', 'B'])})({_expr(rng, depth+1)})" 389 - elif c < 0.9999: 390 - # explicit math spacing between two sub-expressions 391 - sp = rng.choice(_SPACES) 392 - return f"{_atom(rng)} {sp} {_atom(rng)}" 393 - else: 394 - return _atom(rng) 395 - 140 + result.append(block) 396 141 397 - def generate_expr(rng: random.Random) -> str: 398 - return _expr(rng, depth=0, max_depth=3) 142 + return "\n\n".join(result) 399 143 400 144 401 145 # ── Rendering ───────────────────────────────────────────────────────────────── 402 146 403 - _TEMPLATE = ( 404 - "#set page(width: auto, height: auto, " 405 - "margin: (x: 10pt, y: 8pt), fill: white)\n" 406 - "$ {math} $\n" 407 - ) 147 + def _render( 148 + body: str, 149 + out_path: Path, 150 + page_width: str, 151 + mode: str, 152 + font: str | None, 153 + font_dir: Path, 154 + size: int, 155 + ink: str, 156 + rng: random.Random, 157 + ) -> tuple[bool, str]: 158 + """ 159 + Compile one sample to PNG. 408 160 161 + mode='hw' -- set hw font globally in the page header; body is used as-is. 
162 + mode='mix' -- body may contain per-block #[#set text(...); ...] wrappers. 163 + """ 164 + if mode == "uniform" and font is not None: 165 + src = _TEMPLATE_HW.format( 166 + width=page_width, font=font, size=size, ink=ink, body=body 167 + ) 168 + else: 169 + # font=None (default) or mix mode: use plain template 170 + src = _TEMPLATE_MIX.format(width=page_width, body=body) 409 171 410 - def render_one(math: str, out_path: Path) -> bool: 411 - """Compile one Typst math string to PNG. Returns True on success.""" 412 - src = _TEMPLATE.format(math=math) 413 172 with tempfile.NamedTemporaryFile(suffix=".typ", mode="w", delete=False) as f: 414 173 f.write(src) 415 174 typ_path = Path(f.name) 175 + 176 + cmd = [ 177 + "typst", "compile", 178 + "--format", "png", 179 + "--ppi", "250", 180 + str(typ_path), 181 + str(out_path), 182 + ] 183 + if font is not None and font_dir.exists(): 184 + cmd += ["--font-path", str(font_dir.resolve())] 185 + 416 186 try: 417 - result = subprocess.run( 418 - ["typst", "compile", "--format", "png", "--ppi", "150", 419 - str(typ_path), str(out_path)], 420 - capture_output=True, 421 - timeout=15, 422 - ) 423 - return result.returncode == 0 424 - except (subprocess.TimeoutExpired, FileNotFoundError): 425 - return False 187 + result = subprocess.run(cmd, capture_output=True, timeout=15) 188 + return result.returncode == 0, result.stderr.decode(errors="replace") 189 + except subprocess.TimeoutExpired: 190 + return False, "timeout" 191 + except FileNotFoundError: 192 + return False, "typst not found" 426 193 finally: 427 194 typ_path.unlink(missing_ok=True) 428 195 ··· 430 197 # ── Main ────────────────────────────────────────────────────────────────────── 431 198 432 199 def main() -> None: 433 - parser = argparse.ArgumentParser() 434 - parser.add_argument("--count", type=int, default=8_000, 435 - help="Target unique structural templates to generate") 436 - parser.add_argument("--out", default="data/typeset") 437 - parser.add_argument("--jobs", 
type=int, default=4) 438 - parser.add_argument("--seed", type=int, default=42) 200 + parser = argparse.ArgumentParser( 201 + description="Generate handwriting-font structured math+text data." 202 + ) 203 + parser.add_argument("--mode", choices=["uniform", "mixed"], default="uniform", 204 + help="uniform=one font per doc; mixed=per-block font sampling") 205 + parser.add_argument("--count", type=int, default=15_000) 206 + parser.add_argument("--out", default="data/hw_structured_train") 207 + parser.add_argument("--jobs", type=int, default=4) 208 + parser.add_argument("--seed", type=int, default=42) 209 + parser.add_argument("--font-dir", default=str(_DEFAULT_FONT_DIR), 210 + help="Directory containing downloaded handwriting TTFs") 211 + parser.add_argument("--hw-prob", type=float, default=0.55, 212 + help="(mixed mode) probability each block gets a non-default font") 213 + parser.add_argument("--show-failures", type=int, default=0, metavar="N") 439 214 args = parser.parse_args() 440 215 216 + font_dir = Path(args.font_dir) 217 + available = _available_fonts(font_dir) 218 + if not available: 219 + print( 220 + f"No handwriting fonts found in {font_dir}.\n" 221 + "Run `uv run download-hw-fonts` first." 
222 + ) 223 + raise SystemExit(1) 224 + 225 + print(f"Available fonts ({len(available)}): {', '.join(available)}") 226 + 441 227 out = Path(args.out) 442 228 img_dir = out / "images" 443 229 img_dir.mkdir(parents=True, exist_ok=True) 444 230 445 231 rng = random.Random(args.seed) 446 232 447 - # Phase 1: sample until we hit --count unique structural keys 448 - print(f"Generating {args.count:,} structurally unique expressions ...") 449 - seen_struct: set[str] = set() 450 - candidates: list[str] = [] 233 + # ── Phase 1: generate unique bodies ────────────────────────────────────── 234 + print(f"Generating {args.count:,} unique bodies (mode={args.mode}) ...") 235 + 236 + seen: set[str] = set() 237 + # (clean_body, page_width, font, size, ink) 238 + candidates: list[tuple[str, str, str, int, str]] = [] 451 239 attempts = 0 452 240 453 - with tqdm(total=args.count, unit="expr") as pbar: 241 + with tqdm(total=args.count, unit="body") as pbar: 454 242 while len(candidates) < args.count: 455 243 attempts += 1 456 - math = generate_expr(rng) 457 - sk = _structural_key(math) 458 - if sk in seen_struct: 244 + body, page_width = generate_body(rng) 245 + 246 + if body in seen: 459 247 continue 460 - seen_struct.add(sk) 461 - candidates.append(math) 248 + 249 + # mix mode: require at least two blocks so within-doc mixing is 250 + # meaningful (single-block bodies would always be fully hw or typeset). 
251 + if args.mode == "mixed" and "\n\n" not in body: 252 + continue 253 + 254 + seen.add(body) 255 + font = _pick_font(rng, available) 256 + size = rng.choice(_FONT_SIZES) 257 + ink = rng.choice(_INK_COLOURS) 258 + candidates.append((body, page_width, font, size, ink)) 462 259 pbar.update(1) 463 260 464 - print(f" {attempts:,} attempts, {attempts / len(candidates):.1f}x overhead") 261 + print(f" {attempts:,} attempts ({attempts / len(candidates):.1f}x overhead)") 465 262 466 - # Phase 2: render in parallel 263 + # ── Phase 2: render ─────────────────────────────────────────────────────── 467 264 print(f"Rendering {len(candidates):,} images with {args.jobs} workers ...") 468 - records: list[dict] = [] 469 - failures: int = 0 265 + 266 + records: list[dict] = [] 267 + failures = 0 268 + shown_failures = 0 269 + 270 + def _task( 271 + clean_body: str, page_width: str, font: str, size: int, ink: str 272 + ) -> tuple[str, str, bool, str]: 273 + if args.mode == "uniform": 274 + render_body = clean_body 275 + else: 276 + render_body = _apply_mixed_fonts( 277 + clean_body, rng, font, hw_prob=args.hw_prob, size=size, ink=ink 278 + ) 470 279 471 - def _task(math: str) -> tuple[str, str, bool]: 472 - h = hashlib.sha1(math.encode()).hexdigest()[:16] 280 + # Hash over render body + font + size + ink + mode for uniqueness 281 + h_key = f"{args.mode}:{font}:{size}:{ink}:{page_width}:{render_body}" 282 + h = hashlib.sha1(h_key.encode()).hexdigest()[:16] 473 283 out_path = img_dir / f"{h}.png" 474 - ok = render_one(math, out_path) 475 - return math, f"images/{h}.png", ok 284 + 285 + ok, err = _render( 286 + render_body, out_path, page_width, 287 + mode=args.mode, font=font, font_dir=font_dir, size=size, ink=ink, 288 + rng=rng, 289 + ) 290 + # Manifest always records the CLEAN body (no font directives). 
291 + return clean_body, f"images/{h}.png", ok, err, font 476 292 477 293 with ThreadPoolExecutor(max_workers=args.jobs) as pool: 478 - futs = {pool.submit(_task, m): m for m in candidates} 294 + futs = { 295 + pool.submit(_task, body, pw, font, size, ink): body 296 + for body, pw, font, size, ink in candidates 297 + } 479 298 with tqdm(total=len(candidates), unit="img") as pbar: 480 299 for fut in as_completed(futs): 481 - math, rel_path, ok = fut.result() 300 + clean_body, rel_path, ok, err, used_font = fut.result() 482 301 if ok: 483 - records.append({"image": rel_path, "typst": math}) 302 + records.append({"image": rel_path, "typst": clean_body}) 484 303 else: 485 304 failures += 1 305 + if shown_failures < args.show_failures: 306 + tqdm.write(f"\n--- failure ---\nbody: {clean_body!r}\n{err.strip()}") 307 + shown_failures += 1 486 308 pbar.update(1) 487 309 488 - # Phase 3: manifest 310 + # ── Phase 3: manifest ───────────────────────────────────────────────────── 489 311 manifest = out / "manifest.jsonl" 490 312 with manifest.open("w") as f: 491 313 for r in records: 492 314 f.write(json.dumps(r) + "\n") 493 315 316 + split_hint = out.name # e.g. hw_structured_train 494 317 print(f"Wrote {len(records):,} records to {manifest} ({failures} render failures)") 495 - print("Add 'typeset' to TYPESET_SPLITS in data.py and mix into train_records.") 318 + print( 319 + f"\nNext steps:\n" 320 + f" 1. Add '{split_hint}' to TRAIN_SPLITS in data.py\n" 321 + f" (it is NOT in _MATH_ONLY_SPLITS -- body already contains $ delimiters)\n" 322 + f" 2. Set a sampling cap in train.py if needed (e.g. 15k–20k)" 323 + ) 496 324 497 325 498 326 if __name__ == "__main__":