This repository has no description.
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Consolidate dataset under data/, init DVC, track all 14 splits as v1

- Copied CROHME+MathWriting raster splits from eff-mer with clean names
- Renamed data/typeset -> data/typeset_train for consistency
- Unified RASTER_ROOT+TYPESET_ROOT into single DATA_ROOT in src/data.py
- typeset_train now in TRAIN_SPLITS directly; train.py simplified accordingly
- DVC initialized; all 14 splits tracked as .dvc pointer files

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

+118 -29
+3
.dvc/.gitignore
··· 1 + /config.local 2 + /tmp 3 + /cache
.dvc/config

This is a binary file and will not be displayed.

+3
.dvcignore
··· 1 + # Add patterns of files dvc should ignore, which could improve 2 + # the performance. Learn more at 3 + # https://dvc.org/doc/user-guide/dvcignore
-1
.gitignore
··· 1 1 __pycache__/ 2 - data/ 3 2 unsloth_compiled_cache/
+14
data/.gitignore
··· 1 + /crohme_gen_2019 2 + /crohme_gen_2023 3 + /crohme_gen_syntactic 4 + /crohme_real_train 5 + /crohme_val 6 + /crohme_test 7 + /mathwriting_train 8 + /mathwriting_synthetic 9 + /mathwriting_symbols 10 + /mathwriting_val 11 + /mathwriting_test 12 + /typeset_train 13 + /typeset_val 14 + /typeset_test
+6
data/crohme_gen_2019.dvc
··· 1 + outs: 2 + - md5: fa3f58f60d08d2c53c816ea0a1f022b5.dir 3 + size: 179876011 4 + nfiles: 51857 5 + hash: md5 6 + path: crohme_gen_2019
+6
data/crohme_gen_2023.dvc
··· 1 + outs: 2 + - md5: 21a4f5b4faad9372eb4541fe67b9c1ed.dir 3 + size: 11405707 4 + nfiles: 3075 5 + hash: md5 6 + path: crohme_gen_2023
+6
data/crohme_gen_syntactic.dvc
··· 1 + outs: 2 + - md5: 18f1a54ca6e47359f69c6014d1aa0c78.dir 3 + size: 179794605 4 + nfiles: 69399 5 + hash: md5 6 + path: crohme_gen_syntactic
+6
data/crohme_real_train.dvc
··· 1 + outs: 2 + - md5: a3feaa4834fceb75ad3f7e72823e6faa.dir 3 + size: 33687 4 + nfiles: 11 5 + hash: md5 6 + path: crohme_real_train
+6
data/crohme_test.dvc
··· 1 + outs: 2 + - md5: ba219a65a7f6abff693ea2d082e2b756.dir 3 + size: 37152 4 + nfiles: 10 5 + hash: md5 6 + path: crohme_test
+6
data/crohme_val.dvc
··· 1 + outs: 2 + - md5: 1bc8c4fff7f440b661671723b15c922a.dir 3 + size: 24493 4 + nfiles: 9 5 + hash: md5 6 + path: crohme_val
+6
data/mathwriting_symbols.dvc
··· 1 + outs: 2 + - md5: a42633d01c62323dfd02cd09db6940cd.dir 3 + size: 11651854 4 + nfiles: 6278 5 + hash: md5 6 + path: mathwriting_symbols
+6
data/mathwriting_synthetic.dvc
··· 1 + outs: 2 + - md5: ff8b28cd99abf076b2ff7ecf7c877899.dir 3 + size: 422432396 4 + nfiles: 85964 5 + hash: md5 6 + path: mathwriting_synthetic
+6
data/mathwriting_test.dvc
··· 1 + outs: 2 + - md5: b4f63acd8a1b5302b9f41f8cd4c19321.dir 3 + size: 22615179 4 + nfiles: 5741 5 + hash: md5 6 + path: mathwriting_test
+6
data/mathwriting_train.dvc
··· 1 + outs: 2 + - md5: 3636fe933379177582b311b471f5742c.dir 3 + size: 539227480 4 + nfiles: 143108 5 + hash: md5 6 + path: mathwriting_train
+6
data/mathwriting_val.dvc
··· 1 + outs: 2 + - md5: c62716de9799135660dc55770a2888be.dir 3 + size: 34272982 4 + nfiles: 9338 5 + hash: md5 6 + path: mathwriting_val
+6
data/typeset_test.dvc
··· 1 + outs: 2 + - md5: 757b324902e64cd32cbb123b73ef82ea.dir 3 + size: 4655339 4 + nfiles: 1001 5 + hash: md5 6 + path: typeset_test
+6
data/typeset_train.dvc
··· 1 + outs: 2 + - md5: ed521af94ca7386b8e8fa168e6c98ede.dir 3 + size: 39300757 4 + nfiles: 8001 5 + hash: md5 6 + path: typeset_train
+6
data/typeset_val.dvc
··· 1 + outs: 2 + - md5: 5007a8486396e9e0b78b2c26a5a9ad00.dir 3 + size: 4508603 4 + nfiles: 1001 5 + hash: md5 6 + path: typeset_val
+12 -15
src/data.py
··· 1 1 """ 2 2 Data loading for Gemma 4 vision fine-tuning. 3 3 4 - Manifests live in ../eff-mer/data/raster/<split>/manifest.jsonl. 4 + Manifests live in data/<split>/manifest.jsonl. 5 5 Each line: {"image": "images/xxx.png", "latex": "...", "typst": "..."} 6 6 7 7 Images are loaded lazily via Dataset.set_transform -- never all in memory. ··· 18 18 from PIL import Image, ImageFilter 19 19 import torchvision.transforms.functional as TF 20 20 21 - RASTER_ROOT = Path(__file__).parent.parent / "../eff-mer/data/raster" 22 - TYPESET_ROOT = Path(__file__).parent.parent / "data" 21 + DATA_ROOT = Path(__file__).parent.parent / "data" 23 22 24 23 TRAIN_SPLITS = [ 25 - "crohme_gen_LaTeX_data_CROHME_2019", 26 - "crohme_gen_LaTeX_data_CROHME_2023_corpus", 27 - "crohme_gen_syntactic_data", 24 + "crohme_gen_2019", 25 + "crohme_gen_2023", 26 + "crohme_gen_syntactic", 28 27 "crohme_real_train", 29 - "mathwriting_Train", 30 - "mathwriting_Synthetic", 31 - "mathwriting_Symbols", 28 + "mathwriting_train", 29 + "mathwriting_synthetic", 30 + "mathwriting_symbols", 31 + "typeset_train", 32 32 ] 33 - VAL_SPLITS = ["mathwriting_Val"] 34 - TEST_SPLITS = ["mathwriting_Test"] 35 - TYPESET_SPLITS = ["typeset"] # uv run generate-typeset 36 - TYPESET_VAL_SPLITS = ["typeset_val"] # uv run generate-typeset --out data/typeset_val --count 1000 --seed 43 37 - TYPESET_TEST_SPLITS = ["typeset_test"] # uv run generate-typeset --out data/typeset_test --count 500 --seed 44 33 + VAL_SPLITS = ["mathwriting_val", "typeset_val"] 34 + TEST_SPLITS = ["mathwriting_test", "typeset_test"] 38 35 39 36 PROMPT = "Transcribe this mathematical expression to Typst math notation." 40 37 BASE_MODEL = "unsloth/gemma-4-E2B-it" ··· 57 54 58 55 59 56 def load_records(split_names: list[str], dedupe: bool = True, 60 - root: Path = RASTER_ROOT) -> list[dict]: 57 + root: Path = DATA_ROOT) -> list[dict]: 61 58 """ 62 59 Load records from manifests. 63 60
+2 -13
src/train.py
··· 8 8 from unsloth.trainer import UnslothVisionDataCollator 9 9 from trl import SFTTrainer, SFTConfig 10 10 11 - from .data import (BASE_MODEL, TRAIN_SPLITS, VAL_SPLITS, TYPESET_SPLITS, 12 - TYPESET_VAL_SPLITS, TYPESET_ROOT, load_records, make_dataset) 11 + from .data import (BASE_MODEL, TRAIN_SPLITS, VAL_SPLITS, load_records, make_dataset) 13 12 14 13 15 14 def main() -> None: ··· 32 31 ) 33 32 34 33 train_records = load_records(TRAIN_SPLITS, dedupe=True) 35 - typeset_manifest = TYPESET_ROOT / "typeset" / "manifest.jsonl" 36 - if typeset_manifest.exists(): 37 - typeset_records = load_records(TYPESET_SPLITS, dedupe=False, root=TYPESET_ROOT) 38 - train_records += typeset_records 39 - print(f"Typeset train: {len(typeset_records):,} records mixed in") 40 - val_records = load_records(VAL_SPLITS, dedupe=False) 41 - typeset_val_manifest = TYPESET_ROOT / "typeset_val" / "manifest.jsonl" 42 - if typeset_val_manifest.exists(): 43 - typeset_val_records = load_records(TYPESET_VAL_SPLITS, dedupe=False, root=TYPESET_ROOT) 44 - val_records += typeset_val_records 45 - print(f"Typeset val: {len(typeset_val_records):,} records mixed in") 34 + val_records = load_records(VAL_SPLITS, dedupe=False) 46 35 47 36 import random as _random 48 37 _rng = _random.Random(42)