Fix dark-mode augmentation and simplify augment-vis · oscillatory.net/ocr-to-typst@39541e8

+2 -2

data/typeset_mixed_test.dvc

··· 1 1 outs: 2 - - md5: a0dfa361d8fec376845cdf7ee9727b7a.dir 3 - size: 59523396 2 + - md5: 9d98f424f6ebebb988b5c724087f8df1.dir 3 + size: 60332779 4 4 nfiles: 501 5 5 hash: md5 6 6 path: typeset_mixed_test

+2 -2

data/typeset_mixed_train.dvc

··· 1 1 outs: 2 - - md5: fb6fedb3763860680cc334906e2f38b4.dir 3 - size: 2434231075 2 + - md5: 74e026da8722c0e1172b22c7a8b00018.dir 3 + size: 2414417882 4 4 nfiles: 20001 5 5 hash: md5 6 6 path: typeset_mixed_train

+2 -2

data/typeset_mixed_val.dvc

··· 1 1 outs: 2 - - md5: 8005f35c2af54519d08b6a5d6b485f44.dir 3 - size: 59104539 2 + - md5: 86ec7a7aa0c137ae78dcf39afd40514f.dir 3 + size: 58933544 4 4 nfiles: 501 5 5 hash: md5 6 6 path: typeset_mixed_val

+2 -2

data/typeset_prose_test.dvc

··· 1 1 outs: 2 - - md5: 153507a2b98e3059ab4e43d9e5fb24b2.dir 3 - size: 54081499 2 + - md5: 4a8f5765f46a5bd542686a0059d57be1.dir 3 + size: 55029911 4 4 nfiles: 501 5 5 hash: md5 6 6 path: typeset_prose_test

+3 -3

data/typeset_prose_train.dvc

··· 1 1 outs: 2 - - md5: 17a437739b6d317f5a992cc754f2fde2.dir 3 - size: 1079329830 4 - nfiles: 10001 2 + - md5: 6191bbec561c52e37ad4a32344498b7f.dir 3 + size: 535085673 4 + nfiles: 5001 5 5 hash: md5 6 6 path: typeset_prose_train

+2 -2

data/typeset_prose_val.dvc

··· 1 1 outs: 2 - - md5: 001f6ca0abfcb61a62bdf842c5e48406.dir 3 - size: 54159095 2 + - md5: ceb61783e0737541c5afcb1160a72d3f.dir 3 + size: 54061824 4 4 nfiles: 501 5 5 hash: md5 6 6 path: typeset_prose_val

+2 -2

data/typeset_uniform_test.dvc

··· 1 1 outs: 2 - - md5: e345b759d0dc60764d77113eaf6d3e1a.dir 3 - size: 32757475 2 + - md5: 64877c925375d14197551c7ed5ce9355.dir 3 + size: 31380795 4 4 nfiles: 501 5 5 hash: md5 6 6 path: typeset_uniform_test

+2 -2

data/typeset_uniform_train.dvc

··· 1 1 outs: 2 - - md5: 7a4a6bd5396343cf11887e11abba17d0.dir 3 - size: 667750358 2 + - md5: 1f898e5f406a3d271504d1022df14a6c.dir 3 + size: 652183761 4 4 nfiles: 10001 5 5 hash: md5 6 6 path: typeset_uniform_train

+2 -2

data/typeset_uniform_val.dvc

··· 1 1 outs: 2 - - md5: 4260a1b3f9064e1d77699f5ba7854c98.dir 3 - size: 34456319 2 + - md5: 44ea5c7588c96de79f4c4c7b77bd421d.dir 3 + size: 31748396 4 4 nfiles: 501 5 5 hash: md5 6 6 path: typeset_uniform_val

+13 -26

src/augment_vis.py

··· 1 1 """ 2 2 Visualise augmentation on a sample of training images. 3 3 4 - Saves three images per example: 5 - NN_orig.png -- original 6 - NN_aug.png -- full page-level pipeline (no region jitter) 7 - NN_aug_jitter.png -- same page transforms + region jitter applied on top 4 + Saves two images per example: 5 + NN_orig.png -- original 6 + NN_aug.png -- exactly one draw of the training augmentation pipeline 7 + NN_typst.txt -- Typst source label 8 8 9 9 Usage: 10 - uv run augment-vis # 12 images from typeset_mixed_train 11 - uv run augment-vis --split typeset_prose_train --n 12 10 + uv run augment-vis # 20 images from typeset_mixed_train 11 + uv run augment-vis --split typeset_prose_train --n 20 12 12 uv run augment-vis --split mathwriting_train --out /tmp/aug_mw 13 13 uv run augment-vis --images data/foo/images/abc.png ... 14 14 """ ··· 20 20 21 21 from PIL import Image 22 22 23 - from .data import ( 24 - DATA_ROOT, _augment, _find_blocks, _region_jitter, 25 - _LIST_RE, AUG_JITTER_MAX_DX, AUG_JITTER_LIST_MAX_DX, 26 - ) 23 + from .data import DATA_ROOT, _augment 27 24 28 25 29 26 def main() -> None: ··· 55 52 for i, entry in enumerate(entries): 56 53 path, typst = entry["path"], entry["typst"] 57 54 img = Image.open(path).convert("RGB") 58 - blocks = _find_blocks(img) 59 - is_list = bool(_LIST_RE.search(typst)) 60 - 61 - aug_base = _augment(img, region_jitter=False, typst=typst) 62 - max_dx = AUG_JITTER_LIST_MAX_DX if is_list else AUG_JITTER_MAX_DX 63 - aug_jitter = _region_jitter(aug_base, max_dx=max_dx) 55 + aug = _augment(img, region_jitter=True, typst=typst) 64 56 65 57 img.save(out / f"{i:02d}_orig.png") 66 - aug_base.save(out / f"{i:02d}_aug.png") 67 - aug_jitter.save(out / f"{i:02d}_aug_jitter.png") 58 + aug.save(out / f"{i:02d}_aug.png") 68 59 (out / f"{i:02d}_typst.txt").write_text(typst) 69 60 70 - list_tag = " [list]" if is_list else "" 71 - ranges = " ".join(f"{t}-{b}({b-t}px)" for t, b in blocks) 72 61 preview = (typst[:80] + "…") if len(typst) > 80 else typst 73 - print(f"{i}: {path.name} size={img.size} blocks={len(blocks)}{list_tag}") 74 - print(f" {ranges}") 62 + print(f"{i}: {path.name} size={img.size}") 75 63 print(f" {preview}") 76 64 77 65 print(f"\nSaved to {out}/") 78 - print(" NN_orig.png -- original") 79 - print(" NN_aug.png -- page-level augmentation only (no jitter)") 80 - print(" NN_aug_jitter.png -- same transforms + region jitter") 81 - print(" NN_typst.txt -- Typst source label") 66 + print(" NN_orig.png -- original") 67 + print(" NN_aug.png -- one training augmentation draw") 68 + print(" NN_typst.txt -- Typst source label") 82 69 83 70 84 71 if __name__ == "__main__":

+43 -16

src/data.py

··· 178 178 return Image.alpha_composite(work, overlay).convert(orig_mode) 179 179 180 180 181 + def _bg_color(img: Image.Image) -> tuple[int, int, int]: 182 + """Estimate background RGB by sampling the image corners (10px radius).""" 183 + arr = np.array(img.convert("RGB")) 184 + H, W = arr.shape[:2] 185 + s = max(1, min(10, H // 4, W // 4)) 186 + corners = np.concatenate([ 187 + arr[:s, :s].reshape(-1, 3), 188 + arr[:s, W - s:].reshape(-1, 3), 189 + arr[H - s:, :s].reshape(-1, 3), 190 + arr[H - s:, W - s:].reshape(-1, 3), 191 + ]) 192 + return tuple(int(v) for v in np.median(corners, axis=0)) 193 + 194 + 181 195 def _find_blocks(img: Image.Image, threshold: float = 0.005, min_gap: int = 16) -> list[tuple[int, int]]: 182 196 """Return (top, bottom) pixel ranges for content blocks via horizontal projection. 183 197 184 - threshold: fraction of dark pixels per row below which a row is "empty". 185 - 0.005 captures rows with as few as ~5 dark pixels per 1000px width, 198 + threshold: fraction of ink pixels per row below which a row is "empty". 199 + 0.005 captures rows with as few as ~5 ink pixels per 1000px width, 186 200 which preserves sparse tops of superscripts and integral limits. 187 201 min_gap: consecutive empty rows required to count as a block separator. 188 202 16px sits above intra-character gaps (i-dot, diacritic) but below 189 203 typical inter-paragraph whitespace at 250 PPI. 190 204 """ 191 205 gray = np.array(img.convert("L")) 192 - norm = (gray < 200).sum(axis=1) / max(gray.shape[1], 1) 206 + bg_luma = int(np.median(np.concatenate([gray[:10, :], gray[-10:, :]]))) 207 + if bg_luma < 128: # dark mode: light ink on dark background 208 + norm = (gray > bg_luma + 56).sum(axis=1) / max(gray.shape[1], 1) 209 + else: # light mode: dark ink on light background 210 + norm = (gray < 200).sum(axis=1) / max(gray.shape[1], 1) 193 211 is_gap = norm < threshold 194 212 195 213 H = len(is_gap) ··· 227 245 return [(t, b) for t, b in merged] 228 246 229 247 230 - def _transform_patch(patch: Image.Image) -> Image.Image: 248 + def _transform_patch(patch: Image.Image, 249 + bg: tuple[int, int, int] = (255, 255, 255)) -> Image.Image: 231 250 """Per-block affine + elastic, simulating independently written chunks.""" 232 251 pw, ph = patch.width, patch.height 233 252 ··· 237 256 a_rad = abs(math.radians(angle)) 238 257 cos_a, sin_a = math.cos(a_rad), math.sin(a_rad) 239 258 rot_pad = int(max(pw, ph) * sin_a) + 4 240 - canvas = Image.new("RGB", (pw + 2 * rot_pad, ph + 2 * rot_pad), (255, 255, 255)) 259 + canvas = Image.new("RGB", (pw + 2 * rot_pad, ph + 2 * rot_pad), bg) 241 260 canvas.paste(patch, (rot_pad, rot_pad)) 242 - canvas = canvas.rotate(angle, resample=Image.BICUBIC, fillcolor=(255, 255, 255)) 261 + canvas = canvas.rotate(angle, resample=Image.BICUBIC, fillcolor=bg) 243 262 bb_w = int(pw * cos_a + ph * sin_a) + 2 244 263 bb_h = int(pw * sin_a + ph * cos_a) + 2 245 264 cx, cy = rot_pad + pw // 2, rot_pad + ph // 2 ··· 252 271 patch = patch.resize((new_w, new_h), Image.BICUBIC) 253 272 254 273 if ph >= AUG_PATCH_MIN_H and random.random() < AUG_P_PATCH_ELASTIC: 255 - patch = _PATCH_ELASTIC(patch) 274 + ep = 8 275 + e_pad = Image.new("RGB", (patch.width + 2 * ep, patch.height + 2 * ep), bg) 276 + e_pad.paste(patch, (ep, ep)) 277 + e_pad = _PATCH_ELASTIC(e_pad) 278 + patch = e_pad.crop((ep, ep, ep + patch.width, ep + patch.height)) 256 279 257 280 return patch 258 281 ··· 266 289 if len(blocks) < 2: 267 290 return img 268 291 292 + bg = _bg_color(img) 269 293 src = np.array(img) 270 294 H, W = src.shape[:2] 271 295 MY = AUG_JITTER_MAX_DY + 2 272 296 MX = AUG_JITTER_MAX_DX + 2 273 297 274 - out = np.full((H + 2 * MY, W + 2 * MX, 3), 255, dtype=np.uint8) 298 + out = np.full((H + 2 * MY, W + 2 * MX, 3), bg, dtype=np.uint8) 275 299 for i, (top, bot) in enumerate(blocks): 276 - patch = _transform_patch(Image.fromarray(src[top:bot])) 300 + patch = _transform_patch(Image.fromarray(src[top:bot]), bg) 277 301 strip = np.array(patch) 278 302 sh, sw = strip.shape[:2] 279 303 ··· 315 339 region_jitter: when True, applies _region_jitter with AUG_P_JITTER probability. 316 340 Pass False to run the page-level pipeline without block jitter. 317 341 """ 342 + bg = _bg_color(img) 343 + 318 344 angle = random.uniform(-AUG_ANGLE_DEG, AUG_ANGLE_DEG) 319 345 scale = random.uniform(1.0 - AUG_SCALE_RANGE, 1.0 + AUG_SCALE_RANGE) 320 346 tx = int(random.uniform(-AUG_TRANSLATE_FRAC, AUG_TRANSLATE_FRAC) * img.width) ··· 330 356 int(scale * max(orig_w, orig_h) * a_sin), 331 357 rot_extra + scale_extra, 332 358 ) + abs(tx) + abs(ty) + 8 333 - padded = Image.new("RGB", (orig_w + 2 * pad, orig_h + 2 * pad), (255, 255, 255)) 359 + padded = Image.new("RGB", (orig_w + 2 * pad, orig_h + 2 * pad), bg) 334 360 padded.paste(img, (pad, pad)) 335 - padded = TF.affine(padded, angle=angle, translate=(tx, ty), scale=scale, shear=0, fill=(255, 255, 255)) 361 + padded = TF.affine(padded, angle=angle, translate=(tx, ty), scale=scale, shear=0, fill=bg) 336 362 img = padded.crop(( 337 363 pad - rot_extra - scale_extra - max(0, -tx), 338 364 pad - rot_extra - scale_extra - max(0, -ty), ··· 340 366 pad + orig_h + rot_extra + scale_extra + max(0, ty), 341 367 )) 342 368 343 - # Elastic deformation: smooth warp mimics baseline wobble without 344 - # corrupting subscript/superscript spatial relationships (sigma=6 keeps 345 - # the displacement field globally smooth). 346 369 if random.random() < AUG_P_ELASTIC: 347 - img = _ELASTIC(img) 370 + ep = 20 371 + e_pad = Image.new("RGB", (img.width + 2 * ep, img.height + 2 * ep), bg) 372 + e_pad.paste(img, (ep, ep)) 373 + e_pad = _ELASTIC(e_pad) 374 + img = e_pad.crop((ep, ep, ep + img.width, ep + img.height)) 348 375 349 376 if random.random() < AUG_P_PERSPECTIVE: 350 377 persp_pad = int(AUG_PERSP_DISTORTION * min(img.width, img.height)) + 4 351 378 pw, ph = img.width, img.height 352 - persp_canvas = Image.new("RGB", (pw + 2 * persp_pad, ph + 2 * persp_pad), (255, 255, 255)) 379 + persp_canvas = Image.new("RGB", (pw + 2 * persp_pad, ph + 2 * persp_pad), bg) 353 380 persp_canvas.paste(img, (persp_pad, persp_pad)) 354 381 persp_canvas = _PERSPECTIVE(persp_canvas) 355 382 img = persp_canvas.crop((persp_pad, persp_pad, persp_pad + pw, persp_pad + ph))

+3 -2

src/generate_mixed.py

··· 413 413 414 414 # ── Emoji pools ─────────────────────────────────────────────────────────────── 415 415 416 - # Tier 1 (annotation + colored circles) weighted 2x via repetition 417 416 _EMOJI = [ 417 + # Tier 1 (annotation + colored circles) weighted 2x via repetition 418 418 "✅", "✅", "❌", "❌", "⚠️", "⚠️", "💡", "💡", 419 419 "🔴", "🔴", "🟠", "🟡", "🟢", "🟢", "🔵", "🔵", "🟣", "⚫", "⚪", 420 420 # Tier 2 (colored squares, misc) 421 421 "🟥", "🟧", "🟨", "🟩", "🟦", "🟪", 422 - "⭐", "🔥", "🏆", 422 + "⭐", "🔥", "🏆", "➡️ ", "⬅️ ", "⬆️ ", "⬇️ ", "📌", 423 + "🍌", "🍎", "🍇", "🍍", "🥕", "🥔", "🥦", "🧅" 423 424 ] 424 425 425 426 # ── Text pools ────────────────────────────────────────────────────────────────

Configure Feed

Configure Feed