this repo has no description
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Move probe_deepseek to src package, drop stale scripts/ copy

src/probe_deepseek.py adds quantization support (--bits 4/8/16),
integrates with src.data.load_records/TEST_SPLITS and src.eval.normalize.
pyproject.toml entrypoint already pointed here.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

+121 -122
-122
scripts/probe_deepseek.py
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.13"
# dependencies = [
#     "torch==2.6.0",
#     "torchvision",
#     "transformers==4.46.3",
#     "accelerate",
#     "einops",
#     "addict",
#     "easydict",
#     "pillow",
# ]
# ///
"""
Standalone DeepSeek-OCR-2 probe. Managed by uv as an isolated script env.

Usage:
    uv run scripts/probe_deepseek.py --images a.png b.png
    uv run scripts/probe_deepseek.py --test-dir ../eff-mer/data/raster/mathwriting_Test --n 5
"""

import argparse
import json
import os
import tempfile
from pathlib import Path

import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

# Custom-code HF model; loading requires trust_remote_code=True throughout.
MODEL_ID = "deepseek-ai/DeepSeek-OCR-2"
DEFAULT_PROMPT = "<image>\nConvert this mathematical expression to Typst math notation. "


def load_model(model_id: str = MODEL_ID):
    """Load the OCR model and tokenizer; returns (model, tokenizer) in eval mode."""
    print(f"Loading {model_id} ...")
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModel.from_pretrained(
        model_id,
        trust_remote_code=True,
        use_safetensors=True,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    model.eval()
    print("Model ready.\n")
    return model, tokenizer


def run_image(img: Image.Image, model, tokenizer, prompt: str) -> str:
    """Run one image through model.infer and return the stripped text result.

    model.infer only accepts file paths, so the PIL image is saved to a
    temp dir first; the dir (and the unused output dir) is cleaned up on exit.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        img_path = os.path.join(tmpdir, "input.png")
        out_dir = os.path.join(tmpdir, "out")
        os.makedirs(out_dir)
        img.save(img_path)
        res = model.infer(
            tokenizer,
            prompt=prompt,
            image_file=img_path,
            output_path=out_dir,
            base_size=1024,
            image_size=768,
            crop_mode=False,
            save_results=False,
            eval_mode=True,
        )
        return str(res).strip() if res is not None else ""


def load_test_records(test_dir: Path, n: int) -> list[dict]:
    """Read up to n usable records from test_dir/manifest.jsonl.

    Skips entries whose "typst" field is empty or starts with "ERROR:".
    Each returned dict has "image_path" (absolute-ish str) and "typst".
    """
    manifest = test_dir / "manifest.jsonl"
    records = []
    for line in manifest.read_text().splitlines():
        r = json.loads(line)
        if r.get("typst", "").startswith("ERROR:") or not r.get("typst"):
            continue
        records.append({
            "image_path": str(test_dir / r["image"]),
            "typst": r["typst"],
        })
        if len(records) >= n:
            break
    return records


def main():
    """CLI entry point: probe arbitrary images or a manifest test split."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default=MODEL_ID)
    parser.add_argument("--prompt", default=DEFAULT_PROMPT)
    parser.add_argument("--images", nargs="+", metavar="IMG",
                        help="Arbitrary image files")
    parser.add_argument("--test-dir", metavar="DIR",
                        help="Path to a manifest.jsonl split dir (e.g. mathwriting_Test)")
    parser.add_argument("--n", type=int, default=5)
    args = parser.parse_args()

    model, tokenizer = load_model(args.model)

    if args.images:
        for path in args.images:
            img = Image.open(path).convert("RGB")
            pred = run_image(img, model, tokenizer, args.prompt)
            print(f"{path}:\n  {pred}\n")

    elif args.test_dir:
        records = load_test_records(Path(args.test_dir), args.n)
        for i, r in enumerate(records):
            img = Image.open(r["image_path"]).convert("RGB")
            pred = run_image(img, model, tokenizer, args.prompt)
            print(f"\n{'='*60}")
            print(f"[{i}] {Path(r['image_path']).name}")
            print(f"  EXPECTED : {repr(r['typst'])}")
            print(f"  PREDICTED: {repr(pred)}")

    else:
        parser.error("Provide --images or --test-dir")


if __name__ == "__main__":
    main()
+121
src/probe_deepseek.py
"""
Inference probe for DeepSeek-OCR-2 on Typst OCR.

The model outputs LaTeX / plain text (not Typst), so MATCH will almost
always be False -- this is for qualitative inspection, not ExpRate.

Quantisation (--bits):
    16  bf16, no quant       ~6.8 GB  (official recipe)
     8  INT8 bitsandbytes    ~3.4 GB  (experimental -- custom model, may fail)
     4  NF4 bitsandbytes     ~1.7 GB  (experimental)

Usage:
    uv run probe-deepseek [--bits 16] [--n 5]
    uv run probe-deepseek --images path/to/a.png path/to/b.png
"""

import argparse
import os
import tempfile
from pathlib import Path

import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

from .data import TEST_SPLITS, load_records
from .eval import normalize

MODEL_ID = "deepseek-ai/DeepSeek-OCR-2"

DEFAULT_PROMPT = "<image>\nConvert this mathematical expression to Typst math notation. "


def load_model(model_id: str = MODEL_ID, bits: int = 16):
    """Load DeepSeek-OCR-2.

    Args:
        model_id: HF model id (custom model class; needs trust_remote_code).
        bits: 16 = bf16 on GPU (official recipe); 8/4 = bitsandbytes
            quantisation, experimental with this custom model class.

    Returns:
        (model, tokenizer), with the model in eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    kwargs = dict(
        trust_remote_code=True,
        use_safetensors=True,
        _attn_implementation="flash_attention_2",
    )

    if bits in (4, 8):
        # bitsandbytes dispatches quantised weights to the GPU itself;
        # do NOT .cuda()/.to(dtype) a quantised model afterwards.
        from transformers import BitsAndBytesConfig

        if bits == 4:
            bnb_cfg = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
            )
        else:
            bnb_cfg = BitsAndBytesConfig(load_in_8bit=True)
        model = AutoModel.from_pretrained(model_id, quantization_config=bnb_cfg, **kwargs)
    else:
        model = AutoModel.from_pretrained(model_id, **kwargs)
        model = model.cuda().to(torch.bfloat16)

    # Fix: the original chained .eval() before the device/dtype move and then
    # called model.eval() a second time; one call here covers both paths.
    model.eval()
    return model, tokenizer


def run_image(img: Image.Image, model, tokenizer, prompt: str = DEFAULT_PROMPT) -> str:
    """Run inference on a PIL image. Saves to a tempfile (model.infer needs a path)."""
    with tempfile.TemporaryDirectory() as tmpdir:
        img_path = os.path.join(tmpdir, "input.png")
        out_dir = os.path.join(tmpdir, "out")
        os.makedirs(out_dir)
        img.save(img_path)

        res = model.infer(
            tokenizer,
            prompt=prompt,
            image_file=img_path,
            output_path=out_dir,
            base_size=1024,
            image_size=768,
            crop_mode=False,  # math expressions are small -- don't crop
            save_results=False,
        )

        # res is typically a string or list; normalise to str
        if isinstance(res, list):
            return "\n".join(str(x) for x in res).strip()
        return str(res).strip()


def main() -> None:
    """CLI: probe explicit --images, or the first --n records of the test splits."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default=MODEL_ID)
    parser.add_argument("--bits", type=int, default=16, choices=[4, 8, 16])
    parser.add_argument("--prompt", default=DEFAULT_PROMPT,
                        help="Full prompt string; must start with '<image>\\n'")
    parser.add_argument("--n", type=int, default=5)
    parser.add_argument("--images", nargs="+", metavar="IMG")
    args = parser.parse_args()

    print(f"Loading {args.model} at {args.bits}-bit ...")
    model, tokenizer = load_model(args.model, args.bits)
    print("Model ready.\n")

    if args.images:
        for path in args.images:
            img = Image.open(path).convert("RGB")
            pred = run_image(img, model, tokenizer, args.prompt)
            print(f"{path}:\n  {pred}\n")
    else:
        records = load_records(TEST_SPLITS, dedupe=False)[: args.n]
        for i, r in enumerate(records):
            img = Image.open(r["image_path"]).convert("RGB")
            pred = run_image(img, model, tokenizer, args.prompt)
            print(f"\n{'='*60}")
            print(f"[{i}] {r['image_path']}")
            print(f"  EXPECTED : {repr(r['typst'])}")
            print(f"  PREDICTED: {repr(pred)}")
            # normalize() comes from src.eval; exact-match after normalisation
            print(f"  MATCH    : {normalize(pred) == normalize(r['typst'])}")


if __name__ == "__main__":
    main()