Add image-stats command · oscillatory.net/ocr-to-typst@0971daf

+100

2 changed files

expand all

src

pyproject.toml

··· 43 43 replace-frac = "src.replace_frac:main" 44 44 replace-dif = "src.replace_dif:main" 45 45 eff-mer-evaluate = "src.eff_mer.infer:main" 46 + image-stats = "src.image_stats:main" 46 47 47 48 [build-system] 48 49 requires = ["hatchling"]

+99

src/image_stats.py

"""
Report average image dimensions (width, height, area, aspect ratio) per split.

Usage:
    uv run image-stats [--splits mathwriting_train typeset_mixed_train ...]
                       [--sample 500]

Default: all mathwriting_* and typeset_* splits found in data/.
--sample N: randomly sample N images per split (faster; 0 = all).
"""

import argparse
import json
import random
import statistics
from pathlib import Path

from PIL import Image
from tqdm import tqdm

DATA_ROOT = Path(__file__).parent.parent / "data"

# Directory-name prefixes that make a split eligible for auto-discovery.
_SPLIT_PREFIXES = ("mathwriting_", "typeset_")


def split_stats(split_name: str, sample: int, rng: random.Random) -> dict | None:
    """Compute image-size statistics for one split directory under DATA_ROOT.

    The split's ``manifest.jsonl`` is read as one JSON object per non-blank
    line; each object's ``"image"`` value is joined onto the split directory
    to locate the image file.

    Args:
        split_name: Name of the split directory inside ``DATA_ROOT``.
        sample: Maximum number of manifest records to measure. Values <= 0
            mean "use every record".
        rng: Random generator used to draw the sample, so callers control
            reproducibility.

    Returns:
        A dict with keys ``n``, ``w_mean``, ``w_median``, ``h_mean``,
        ``h_median``, ``area_mean``, ``area_median`` and ``aspect_mean``, or
        ``None`` when the manifest is missing, has no records, or no image
        could be measured.

    Raises:
        KeyError: If a manifest record has no ``"image"`` key.
        json.JSONDecodeError: If a non-blank manifest line is not valid JSON.
    """
    split_dir = DATA_ROOT / split_name
    manifest = split_dir / "manifest.jsonl"
    if not manifest.exists():
        return None
    base = split_dir.resolve()

    # Decode explicitly as UTF-8 so results do not depend on the locale.
    records = [
        json.loads(line)
        for line in manifest.read_text(encoding="utf-8").splitlines()
        if line.strip()
    ]
    if not records:
        return None
    # ``sample > 0`` (rather than truthiness) keeps a negative value from
    # reaching rng.sample(), which would raise ValueError.
    if sample > 0 and len(records) > sample:
        records = rng.sample(records, sample)

    widths, heights, areas, aspects = [], [], [], []
    for record in tqdm(records, desc=split_name, leave=False):
        path = base / record["image"]
        try:
            with Image.open(path) as img:
                w, h = img.size
        except Exception:
            # Deliberate best effort: a missing or undecodable image is left
            # out of the statistics instead of aborting the whole report.
            continue
        if w <= 0 or h <= 0:
            # A degenerate size would make the aspect ratio divide by zero.
            continue
        widths.append(w)
        heights.append(h)
        areas.append(w * h)
        aspects.append(w / h)

    if not widths:
        return None
    return {
        "n": len(widths),
        "w_mean": statistics.mean(widths),
        "w_median": statistics.median(widths),
        "h_mean": statistics.mean(heights),
        "h_median": statistics.median(heights),
        "area_mean": statistics.mean(areas),
        "area_median": statistics.median(areas),
        "aspect_mean": statistics.mean(aspects),
    }


def main() -> None:
    """Parse CLI arguments and print one row of image statistics per split.

    Without ``--splits``, every directory in ``DATA_ROOT`` whose name starts
    with one of ``_SPLIT_PREFIXES`` and that contains a ``manifest.jsonl`` is
    reported, in sorted order. Exits with a usage error when ``--sample`` is
    negative, when ``DATA_ROOT`` is missing, or when no split is found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--splits", nargs="+", default=None,
                        help="Split names (default: all mathwriting_* and typeset_* in data/)")
    parser.add_argument("--sample", type=int, default=500,
                        help="Images per split to sample (0 = all)")
    args = parser.parse_args()

    if args.sample < 0:
        parser.error("--sample must be >= 0 (0 = all)")

    if args.splits:
        splits = args.splits
    else:
        if not DATA_ROOT.is_dir():
            parser.error(f"data directory not found: {DATA_ROOT}")
        splits = sorted(
            p.name for p in DATA_ROOT.iterdir()
            if p.is_dir()
            and p.name.startswith(_SPLIT_PREFIXES)
            and (p / "manifest.jsonl").exists()
        )
        if not splits:
            # Without this guard, max() below raises ValueError on an empty list.
            parser.error(f"no mathwriting_* or typeset_* splits found in {DATA_ROOT}")

    # Fixed seed so repeated runs sample the same images.
    rng = random.Random(0)

    col_w = max(len(s) for s in splits) + 2
    header = (
        f"{'split':<{col_w}} {'n':>6} {'w_mean':>7} {'w_med':>6} "
        f"{'h_mean':>7} {'h_med':>6} {'area_mean':>10} {'aspect':>7}"
    )
    print(header)
    print("-" * len(header))

    for split in splits:
        s = split_stats(split, args.sample, rng)
        if s is None:
            print(f"{split:<{col_w}} (no manifest or images)")
            continue
        print(
            f"{split:<{col_w}} {s['n']:>6} "
            f"{s['w_mean']:>7.0f} {s['w_median']:>6.0f} "
            f"{s['h_mean']:>7.0f} {s['h_median']:>6.0f} "
            f"{s['area_mean']:>10.0f} {s['aspect_mean']:>7.2f}"
        )


if __name__ == "__main__":
    main()

Configure Feed

Configure Feed