this repo has no description
1#!/usr/bin/env python3
2"""
3PDF to Markdown Converter for LLM Context
4
5Extracts entire PDF content as clean, structured markdown.
6Images are extracted to cache directory and copied to output location.
7
8Features:
9- High-accuracy table extraction using IBM Docling (TableFormer AI model)
10- Aggressive persistent caching (extracts once, reuses forever)
11- Cache only cleared on explicit request or source file change
12
13Usage:
14 python pdf_to_md.py <input.pdf> [output.md]
15 python pdf_to_md.py <input.pdf> --docling # Accurate tables (slower)
16 python pdf_to_md.py <input.pdf> --clear-cache # Re-extract
17 python pdf_to_md.py --clear-all-cache # Clear entire cache
18
19Dependencies:
20 uv pip install pymupdf pymupdf4llm # Fast mode
21 uv pip install docling docling-core # Docling mode (optional)
22"""
23
24import argparse
25import sys
26import os
27import re
28import json
29import hashlib
30import shutil
31import tempfile
32from dataclasses import dataclass
33from pathlib import Path
34from datetime import datetime
35
36
37# =============================================================================
38# DATACLASSES
39# =============================================================================
40
41
@dataclass
class ExtractionConfig:
    """Configuration for PDF extraction.

    The extraction mode (and, in Docling mode, ``images_scale``) is folded
    into the cache key, so results produced with different settings are
    cached separately.
    """

    # Path to the source PDF file.
    pdf_path: str
    # True selects the Docling extractor; False selects the fast PyMuPDF path.
    docling: bool = False
    # Image scale forwarded to the Docling extractor (unused in fast mode).
    images_scale: float = 4.0
49
50
@dataclass
class ExtractionResult:
    """Result of PDF extraction or cache load."""

    # Extracted markdown text.
    markdown: str
    # Directory holding the extracted images, or None when there are none.
    image_dir: Path | None
    # Page count of the source PDF (0 when cached metadata lacks the value).
    total_pages: int
    # True when the result was loaded from the cache instead of freshly extracted.
    from_cache: bool = False
59
60
# Suppress PyMuPDF's "Consider using pymupdf_layout" recommendation.
# setdefault() keeps any value already present in the environment.
os.environ.setdefault("PYMUPDF_SUGGEST_LAYOUT_ANALYZER", "0")

# Default cache root (~/.cache/pdf-to-markdown); each cache entry is stored
# in a subdirectory named after its cache key.
DEFAULT_CACHE_DIR = Path.home() / ".cache" / "pdf-to-markdown"
66
67
68# =============================================================================
69# CACHE MANAGER
70# =============================================================================
71
72
class CacheManager:
    """Manage the on-disk cache of PDF extraction results.

    Every entry lives in ``<cache_dir>/<cache_key>/`` and consists of
    ``full_output.md`` (the markdown), ``metadata.json`` (source size and
    mtime, extractor version, page count, mode) and an optional ``images/``
    directory.
    """

    # Markdown image reference: group 1 is the alt text, group 2 the target.
    _IMAGE_REF_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")

    def __init__(self, cache_dir: Path | None = None):
        """Use *cache_dir* as the cache root, or ``DEFAULT_CACHE_DIR`` if omitted."""
        self.cache_dir = cache_dir or DEFAULT_CACHE_DIR

    @staticmethod
    def _mode_label(config: ExtractionConfig) -> str:
        """Return the mode string that is part of the cache key and metadata."""
        return f"docling_{config.images_scale}" if config.docling else "fast"

    def get_key(self, config: ExtractionConfig) -> str:
        """Generate the cache key for a PDF and extraction mode.

        The key is derived from the file size, a SHA-256 over the file's
        content (the whole file when it is at most 128 KB, otherwise only the
        first and last 64 KB) and the extraction mode.

        Raises:
            OSError: If the PDF cannot be stat'ed or read.
        """
        p = Path(config.pdf_path).resolve()
        file_size = p.stat().st_size

        chunk_size = 65536  # 64KB
        hasher = hashlib.sha256()

        with open(p, "rb") as f:
            if file_size <= chunk_size * 2:
                hasher.update(f.read())
            else:
                # Large file: sample head and tail instead of hashing it all.
                hasher.update(f.read(chunk_size))
                f.seek(-chunk_size, os.SEEK_END)
                hasher.update(f.read(chunk_size))

        raw = f"{file_size}|{hasher.hexdigest()}|{self._mode_label(config)}"
        return hashlib.sha256(raw.encode()).hexdigest()[:16]

    def _get_dir(self, cache_key: str) -> Path:
        """Get cache directory for a given cache key."""
        return self.cache_dir / cache_key

    def is_valid(self, config: ExtractionConfig) -> tuple[bool, str]:
        """Check whether a usable cache entry exists for this PDF.

        Returns:
            ``(valid, cache_key)``. ``cache_key`` is ``""`` when the PDF
            itself could not be read. An entry is valid only if both cache
            files exist and the recorded source size, source mtime and
            extractor version all match the current values.
        """
        from extractor import EXTRACTOR_VERSION

        try:
            cache_key = self.get_key(config)
        except OSError:
            return False, ""

        cache_dir = self._get_dir(cache_key)
        metadata_file = cache_dir / "metadata.json"
        output_file = cache_dir / "full_output.md"

        if not metadata_file.exists() or not output_file.exists():
            return False, cache_key

        try:
            with open(metadata_file, encoding="utf-8") as f:
                metadata = json.load(f)
            stat = Path(config.pdf_path).resolve().stat()
        except (OSError, ValueError):
            # ValueError covers both malformed JSON and undecodable bytes.
            return False, cache_key

        if not isinstance(metadata, dict):
            # Valid JSON but not an object: treat as a corrupted entry.
            return False, cache_key

        if (
            metadata.get("source_size") != stat.st_size
            or metadata.get("source_mtime") != stat.st_mtime
        ):
            return False, cache_key

        if metadata.get("extractor_version") != EXTRACTOR_VERSION:
            return False, cache_key

        return True, cache_key

    def _discard(self, cache_dir: Path) -> None:
        """Best-effort removal of a broken cache entry (errors are ignored)."""
        shutil.rmtree(cache_dir, ignore_errors=True)

    def load(self, cache_key: str) -> ExtractionResult | None:
        """Load a cached extraction.

        Returns:
            The cached result, or ``None`` when the entry is unreadable,
            corrupted, or references images that are no longer present. In
            those cases a warning is printed to stderr and the entry is
            deleted so that it gets regenerated.
        """
        cache_dir = self._get_dir(cache_key)

        try:
            full_md = (cache_dir / "full_output.md").read_text(encoding="utf-8")
            with open(cache_dir / "metadata.json", encoding="utf-8") as f:
                metadata = json.load(f)
            if not isinstance(metadata, dict):
                raise ValueError("metadata.json does not contain a JSON object")
            total_pages = metadata.get("total_pages", 0)
        except (OSError, ValueError) as e:
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            print(
                f"WARNING: Cache corrupted ({e.__class__.__name__}), regenerating...",
                file=sys.stderr,
            )
            self._discard(cache_dir)
            return None

        # Check if markdown references images
        has_image_refs = bool(self._IMAGE_REF_RE.search(full_md))

        # Get cached images directory
        cached_image_dir = cache_dir / "images"
        has_images = cached_image_dir.is_dir() and any(cached_image_dir.iterdir())

        # If markdown expects images but they're missing, invalidate cache
        if has_image_refs and not has_images:
            print(
                "WARNING: Cache missing images, regenerating...",
                file=sys.stderr,
            )
            self._discard(cache_dir)
            return None

        return ExtractionResult(
            markdown=full_md,
            image_dir=cached_image_dir if has_images else None,
            total_pages=total_pages,
            from_cache=True,
        )

    def _normalize_image_paths(self, markdown: str, source_image_dir: Path) -> str:
        """Rewrite image references to the relative ``images/<filename>`` form.

        Only references whose file name exists in *source_image_dir* are
        rewritten (the alt text is preserved); all other references are left
        untouched. A falsy *source_image_dir* returns *markdown* unchanged.
        """
        if not source_image_dir:
            return markdown

        source_image_dir = Path(source_image_dir)

        def normalize_ref(match):
            alt_text = match.group(1)
            filename = Path(match.group(2)).name
            if (source_image_dir / filename).exists():
                return f"![{alt_text}](images/{filename})"
            return match.group(0)

        return self._IMAGE_REF_RE.sub(normalize_ref, markdown)

    def save(self, cache_key: str, result: ExtractionResult, config: ExtractionConfig):
        """Save a full extraction to the cache using atomic writes.

        Markdown and metadata are written to temporary files inside the cache
        entry and moved into place with ``os.replace``; images are copied to
        ``images.tmp`` and then renamed to ``images``.

        Raises:
            OSError: If the source PDF cannot be stat'ed or the cache cannot
                be written.
        """
        from extractor import EXTRACTOR_VERSION

        cache_dir = self._get_dir(cache_key)
        cache_dir.mkdir(parents=True, exist_ok=True)

        markdown = result.markdown
        if result.image_dir:
            markdown = self._normalize_image_paths(markdown, result.image_dir)

        p = Path(config.pdf_path).resolve()
        stat = p.stat()

        metadata = {
            "source_path": str(p),
            "source_mtime": stat.st_mtime,
            "source_size": stat.st_size,
            "cache_key": cache_key,
            "cached_at": datetime.now().isoformat(),
            "total_pages": result.total_pages,
            "extractor_version": EXTRACTOR_VERSION,
            "mode": self._mode_label(config),
            "images_scale": config.images_scale if config.docling else None,
        }

        temp_md = None
        temp_json = None
        try:
            with tempfile.NamedTemporaryFile(
                mode="w",
                dir=cache_dir,
                suffix=".md.tmp",
                delete=False,
                encoding="utf-8",
            ) as f:
                # Record the name first so a failed write is still cleaned up.
                temp_md = f.name
                f.write(markdown)

            with tempfile.NamedTemporaryFile(
                mode="w",
                dir=cache_dir,
                suffix=".json.tmp",
                delete=False,
                encoding="utf-8",
            ) as f:
                temp_json = f.name
                json.dump(metadata, f, indent=2)

            os.replace(temp_md, cache_dir / "full_output.md")
            temp_md = None
            os.replace(temp_json, cache_dir / "metadata.json")
            temp_json = None

            if result.image_dir and Path(result.image_dir).exists():
                temp_images = cache_dir / "images.tmp"
                final_images = cache_dir / "images"

                # Remove leftovers of an earlier, interrupted save.
                if temp_images.exists():
                    shutil.rmtree(temp_images)

                shutil.copytree(result.image_dir, temp_images)

                if final_images.exists():
                    shutil.rmtree(final_images)
                os.rename(temp_images, final_images)

        finally:
            # Anything still named here was never moved into place.
            if temp_md and os.path.exists(temp_md):
                os.unlink(temp_md)
            if temp_json and os.path.exists(temp_json):
                os.unlink(temp_json)

    def clear(self, pdf_path: str | None = None) -> bool:
        """Clear the cache for one PDF or the entire cache.

        With *pdf_path*, the fast-mode entry and the Docling entry for the
        default ``images_scale`` are removed. Without it, the whole cache
        directory is removed.

        Returns:
            ``True`` if anything was deleted, ``False`` otherwise.
        """
        if not pdf_path:
            if self.cache_dir.exists():
                shutil.rmtree(self.cache_dir)
                return True
            return False

        cleared = False
        for docling_mode in (False, True):
            try:
                config = ExtractionConfig(pdf_path=pdf_path, docling=docling_mode)
                cache_dir = self._get_dir(self.get_key(config))
                if cache_dir.exists():
                    shutil.rmtree(cache_dir)
                    cleared = True
            except OSError:
                # Unreadable PDF or undeletable entry: nothing cleared here.
                pass
        return cleared

    def get_stats(self) -> dict:
        """Return entry count, total size in MB and location of the cache."""
        if not self.cache_dir.exists():
            return {"entries": 0, "total_size_mb": 0, "cache_dir": str(self.cache_dir)}

        entry_dirs = [entry for entry in self.cache_dir.iterdir() if entry.is_dir()]
        total_size = sum(
            f.stat().st_size
            for entry in entry_dirs
            for f in entry.rglob("*")
            if f.is_file()
        )

        return {
            "entries": len(entry_dirs),
            "total_size_mb": round(total_size / (1024 * 1024), 2),
            "cache_dir": str(self.cache_dir),
        }
319
320
321# =============================================================================
322# IMAGE MANAGER
323# =============================================================================
324
325
326class ImageManager:
327 """Manages image extraction and cleanup."""
328
329 def __init__(self):
330 self._temp_dirs: list[Path] = []
331
332 def create_temp_dir(self, pdf_path: str) -> Path:
333 """Create tracked temp directory for image extraction."""
334 pdf_name = Path(pdf_path).stem
335 safe_name = re.sub(r"[^\w\-_]", "_", pdf_name)
336 temp_dir = Path(tempfile.mkdtemp(prefix=f"pdf_images_{safe_name}_"))
337 self._temp_dirs.append(temp_dir)
338 return temp_dir
339
340 def cleanup(self):
341 """Clean up all tracked temp directories."""
342 for temp_dir in self._temp_dirs:
343 if temp_dir.exists():
344 shutil.rmtree(temp_dir)
345 self._temp_dirs.clear()
346
347 def extract_references(self, markdown: str) -> set:
348 """Extract the set of image filenames referenced in markdown."""
349 pattern = r"!\[[^\]]*\]\(([^)]+)\)"
350 matches = re.findall(pattern, markdown)
351 return {Path(m).name for m in matches}
352
353 def get_info(self, image_dir: Path, referenced_only: set = None) -> list:
354 """Get information about extracted images."""
355 if not image_dir or not Path(image_dir).exists():
356 return []
357
358 image_dir = Path(image_dir)
359 images = []
360
361 for img_path in sorted(image_dir.glob("*")):
362 if img_path.suffix.lower() in (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp"):
363 if referenced_only is not None and img_path.name not in referenced_only:
364 continue
365
366 try:
367 size_bytes = img_path.stat().st_size
368 size_kb = size_bytes / 1024
369
370 try:
371 import pymupdf
372 pix = pymupdf.Pixmap(str(img_path))
373 dimensions = f"{pix.width}x{pix.height}"
374 pix = None
375 except Exception:
376 dimensions = "unknown"
377
378 images.append({
379 "filename": img_path.name,
380 "path": str(img_path),
381 "size_kb": round(size_kb, 1),
382 "dimensions": dimensions,
383 })
384 except Exception:
385 pass
386
387 return images
388
389 def enhance_markdown(self, markdown: str, image_dir: Path) -> str:
390 """Rewrite image references to use relative paths (portable, Windows-safe)."""
391 if not image_dir:
392 return markdown
393
394 image_dir = Path(image_dir)
395
396 def replace_image_ref(match):
397 alt_text = match.group(1)
398 filename_raw = match.group(2)
399 filename = Path(filename_raw).name
400 full_path = image_dir / filename
401
402 # Use relative path for portability (POSIX format for Windows compatibility)
403 relative_path = Path("images") / filename
404
405 if full_path.exists():
406 try:
407 size_kb = round(full_path.stat().st_size / 1024, 1)
408 try:
409 import pymupdf
410 pix = pymupdf.Pixmap(str(full_path))
411 dims = f"{pix.width}x{pix.height}"
412 pix = None
413 except Exception:
414 dims = "?"
415
416 return f"})\n\n**[Image: {filename} ({dims}, {size_kb}KB)]**"
417 except Exception:
418 return f"})\n\n**[Image: {filename}]**"
419
420 return match.group(0)
421
422 pattern = r"!\[([^\]]*)\]\(([^)]+)\)"
423 return re.sub(pattern, replace_image_ref, markdown)
424
425 def create_summary(self, images: list) -> str:
426 """Create a summary section listing all extracted images."""
427 if not images:
428 return ""
429
430 lines = [
431 "",
432 "---",
433 "",
434 "## Extracted Images",
435 "",
436 "| # | File | Dimensions | Size |",
437 "|---|------|------------|------|",
438 ]
439
440 for i, img in enumerate(images, 1):
441 lines.append(
442 f"| {i} | {img['filename']} | {img['dimensions']} | {img['size_kb']}KB |"
443 )
444
445 lines.append("")
446 return "\n".join(lines)
447
448 def finalize_images(
449 self, temp_dir: Path, cache_dir: Path, output_path: Path, show_progress: bool = False
450 ) -> Path | None:
451 """Finalize image directory after extraction.
452
453 Copies images from cache to output location (next to the markdown file).
454 Cleans up temp directories.
455
456 Returns the final image directory (next to output) for reference.
457 """
458 if not temp_dir:
459 return None
460
461 temp_dir = Path(temp_dir)
462
463 # Clean up empty temp directories
464 if not temp_dir.exists() or not any(temp_dir.iterdir()):
465 if temp_dir.exists():
466 shutil.rmtree(temp_dir)
467 if temp_dir in self._temp_dirs:
468 self._temp_dirs.remove(temp_dir)
469 return None
470
471 # Clean up temp directory (images are saved to cache)
472 if temp_dir.exists():
473 shutil.rmtree(temp_dir)
474 if temp_dir in self._temp_dirs:
475 self._temp_dirs.remove(temp_dir)
476
477 # Copy images from cache to output location
478 if cache_dir:
479 cached_image_dir = cache_dir / "images"
480 if cached_image_dir.exists() and any(cached_image_dir.iterdir()):
481 return self._copy_images_to_output(cached_image_dir, output_path, show_progress)
482
483 return None
484
485 def _copy_images_to_output(
486 self, source_dir: Path, output_path: Path, show_progress: bool = False
487 ) -> Path | None:
488 """Copy images from cache to output location (next to markdown file)."""
489 output_path = Path(output_path)
490
491 # Determine output images directory (sibling to markdown file)
492 if output_path.suffix: # It's a file path like "output.md"
493 output_images_dir = output_path.parent / "images"
494 else: # It's a directory
495 output_images_dir = output_path / "images"
496
497 # Don't copy if already at output location
498 if output_images_dir.resolve() == Path(source_dir).resolve():
499 return output_images_dir
500
501 # Copy images to output location
502 output_images_dir.mkdir(parents=True, exist_ok=True)
503 copied_count = 0
504 for img in source_dir.iterdir():
505 if img.is_file():
506 shutil.copy2(img, output_images_dir / img.name)
507 copied_count += 1
508
509 if show_progress and copied_count > 0:
510 print(f"Copied {copied_count} images to: {output_images_dir}", file=sys.stderr)
511
512 return output_images_dir
513
514
515# =============================================================================
516# PDF PROCESSING
517# =============================================================================
518
519
def check_dependencies(docling_mode: bool = False):
    """Verify that the packages needed for the chosen mode can be imported.

    Fast mode needs ``pymupdf`` and ``pymupdf4llm``; Docling mode needs
    ``pymupdf``, ``docling`` and ``docling_core``. When something is missing,
    the missing package names and an install command are printed to stderr.

    Returns:
        True when every required module imports, False otherwise.
    """
    # (importable module name, package name shown to the user)
    if docling_mode:
        required = [
            ("pymupdf", "pymupdf"),
            ("docling", "docling"),
            ("docling_core", "docling-core"),
        ]
        install_cmd = "uv pip install pymupdf docling docling-core"
    else:
        required = [
            ("pymupdf", "pymupdf"),
            ("pymupdf4llm", "pymupdf4llm"),
        ]
        install_cmd = "uv pip install pymupdf pymupdf4llm"

    absent = []
    for module_name, package_name in required:
        try:
            __import__(module_name)
        except ImportError:
            absent.append(package_name)

    if not absent:
        return True

    print(f"ERROR: Missing dependencies: {', '.join(absent)}", file=sys.stderr)
    print(f"Install with: {install_cmd}", file=sys.stderr)
    return False
555
556
def convert_pdf(pdf_path, image_dir, show_progress=False, docling=False, images_scale=4.0):
    """Run the selected extractor on a PDF and return its markdown.

    Args:
        pdf_path: Path of the PDF to convert.
        image_dir: Directory handed to the extractor for extracted images.
        show_progress: Forwarded to the extractor.
        docling: Use ``extract_pdf_docling`` instead of ``extract_pdf_fast``.
        images_scale: Forwarded to the Docling extractor only.

    Returns:
        The markdown produced by the extractor.
    """
    # The extractor module is imported lazily, per mode.
    if not docling:
        from extractor import extract_pdf_fast

        return extract_pdf_fast(
            pdf_path,
            image_dir=image_dir,
            show_progress=show_progress,
        )

    from extractor import extract_pdf_docling

    # The Docling extractor returns a pair; only the markdown is used here.
    docling_markdown, _unused_image_paths = extract_pdf_docling(
        pdf_path,
        output_dir=image_dir,
        images_scale=images_scale,
        show_progress=show_progress,
    )
    return docling_markdown
578
579
def add_metadata_header(markdown, pdf_path, total_pages, image_dir=None, cached=False):
    """Prepend a ``---``-delimited metadata block to the markdown.

    The block records the PDF's base name, the page count and the current
    timestamp, plus ``from_cache: true`` when *cached* is set and
    ``images_dir: images`` when *image_dir* is truthy. A blank line separates
    the block from the markdown body.
    """
    fields = [
        "---",
        f"source: {os.path.basename(pdf_path)}",
        f"total_pages: {total_pages}",
        f"extracted_at: {datetime.now().isoformat()}",
    ]

    if cached:
        fields.append("from_cache: true")

    if image_dir:
        # Always the relative directory name, for portability.
        fields.append("images_dir: images")

    front_matter = "\n".join(fields) + "\n---\n\n"
    return front_matter + markdown
601
602
603# =============================================================================
604# MAIN
605# =============================================================================
606
607
def main():
    """Command-line entry point.

    Flow: parse arguments; handle the cache-maintenance flags; then either
    load the markdown from the cache or extract it (saving the result to the
    cache); copy images next to the output; finally write the markdown with
    a metadata header and print a one-line summary to stderr.

    Exits with status 1 when the input is missing, dependencies are not
    installed, or the conversion fails.
    """
    parser = argparse.ArgumentParser(
        description="Convert PDF to Markdown for LLM context (with persistent caching)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python pdf_to_md.py document.pdf                    # Output to document.md (cached)
  python pdf_to_md.py document.pdf output.md          # Custom output path
  python pdf_to_md.py document.pdf --docling          # Accurate tables (slower)
  python pdf_to_md.py document.pdf --clear-cache      # Clear cache and re-extract
  python pdf_to_md.py --clear-all-cache               # Clear entire cache

Caching:
  PDFs are cached in ~/.cache/pdf-to-markdown/
  Cache is keyed by file content hash + extraction mode.
  Cache persists until explicitly cleared or source PDF changes.
        """,
    )

    parser.add_argument("input", nargs="?", help="Input PDF file path")
    parser.add_argument("output", nargs="?", help="Output markdown file path (default: <input>.md)")
    parser.add_argument(
        "--docling",
        "--accurate",
        action="store_true",
        dest="docling",
        help="Use Docling AI for complex/borderless tables (slower, ~1 sec/page)",
    )
    parser.add_argument("--no-progress", action="store_true", help="Disable progress indicator")

    # Cache options
    parser.add_argument(
        "--clear-cache",
        action="store_true",
        help="Clear cache for this PDF before processing",
    )
    parser.add_argument(
        "--clear-all-cache",
        action="store_true",
        help="Clear entire cache directory and exit",
    )
    parser.add_argument("--cache-stats", action="store_true", help="Show cache statistics and exit")

    args = parser.parse_args()

    cache_mgr = CacheManager()

    # Handle cache management commands
    if args.clear_all_cache:
        if cache_mgr.clear():
            print(f"Cache cleared: {cache_mgr.cache_dir}", file=sys.stderr)
        else:
            print("Cache was already empty.", file=sys.stderr)
        sys.exit(0)

    if args.cache_stats:
        stats = cache_mgr.get_stats()
        print(f"Cache directory: {stats['cache_dir']}", file=sys.stderr)
        print(f"Cached PDFs: {stats['entries']}", file=sys.stderr)
        print(f"Total size: {stats['total_size_mb']} MB", file=sys.stderr)
        sys.exit(0)

    # Require input for all other operations
    if not args.input:
        parser.error("the following arguments are required: input")

    # Handle --clear-cache
    if args.clear_cache:
        if cache_mgr.clear(args.input):
            print(f"Cache cleared for: {args.input}", file=sys.stderr)
        else:
            print(f"No cache found for: {args.input}", file=sys.stderr)

    # Validate input exists
    if not os.path.exists(args.input):
        print(f"ERROR: File not found: {args.input}", file=sys.stderr)
        sys.exit(1)

    if not args.input.lower().endswith(".pdf"):
        print(f"WARNING: File may not be a PDF: {args.input}", file=sys.stderr)

    show_progress = sys.stderr.isatty() and not args.no_progress

    # Resolved once; used for image placement and for the markdown file.
    output_path = args.output or os.path.splitext(args.input)[0] + ".md"

    # Check cache
    config = ExtractionConfig(pdf_path=args.input, docling=args.docling)
    valid, cache_key = cache_mgr.is_valid(config)

    result = None
    image_dir = None
    cache_hit = False

    if valid:
        if show_progress:
            mode = "docling" if args.docling else "fast"
            print(f"Loading from cache ({mode} mode)...", file=sys.stderr)

        # load() returns None for a corrupted entry; extraction then runs below.
        cache_result = cache_mgr.load(cache_key)
        if cache_result:
            result = cache_result.markdown
            total_pages = cache_result.total_pages
            cache_hit = True

            # Copy images from cache to output location
            if cache_result.image_dir:
                image_dir = ImageManager()._copy_images_to_output(
                    cache_result.image_dir, output_path, show_progress
                )

    # Extract if no cache hit
    if not cache_hit:
        if not check_dependencies(docling_mode=args.docling):
            sys.exit(1)

        from extractor import get_page_count

        total_pages = get_page_count(args.input)

        if not cache_key:
            cache_key = cache_mgr.get_key(config)

        img_mgr = ImageManager()
        temp_image_dir = img_mgr.create_temp_dir(args.input)

        try:
            if show_progress:
                if args.docling:
                    print(
                        f"Extracting {total_pages} pages with Docling AI (~1 sec/page)...",
                        file=sys.stderr,
                    )
                else:
                    print(
                        f"Extracting {total_pages} pages with PyMuPDF (fast mode)...",
                        file=sys.stderr,
                    )

            try:
                result = convert_pdf(
                    args.input,
                    image_dir=temp_image_dir,
                    show_progress=show_progress,
                    docling=args.docling,
                    images_scale=config.images_scale,
                )
            except Exception as e:
                print(f"ERROR: Conversion failed: {e}", file=sys.stderr)
                sys.exit(1)

            # Save to cache
            extraction_result = ExtractionResult(
                markdown=result,
                image_dir=temp_image_dir,
                total_pages=total_pages,
            )
            cache_mgr.save(cache_key, extraction_result, config)
            if show_progress:
                print(f"Cached: {cache_mgr._get_dir(cache_key)}", file=sys.stderr)

            # Finalize images
            image_dir = img_mgr.finalize_images(
                temp_dir=temp_image_dir,
                cache_dir=cache_mgr._get_dir(cache_key),
                output_path=output_path,
                show_progress=show_progress,
            )
        finally:
            # Remove the temp image directory on every exit path, including
            # failures while saving to the cache. After a successful
            # finalize_images() nothing is tracked any more, so this is a no-op.
            img_mgr.cleanup()

    # Format output
    output = result
    img_mgr_for_output = ImageManager()  # Fresh instance for output processing
    images = []

    if image_dir:
        referenced_images = img_mgr_for_output.extract_references(result) if result else set()
        output = img_mgr_for_output.enhance_markdown(output, image_dir)
        # Scanned once; reused for the summary table and the final message.
        images = img_mgr_for_output.get_info(image_dir, referenced_only=referenced_images)
        if images:
            output += img_mgr_for_output.create_summary(images)

    output = add_metadata_header(output, args.input, total_pages, image_dir, cached=cache_hit)

    # Write output
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(output)

    msg = f"Converted {total_pages} pages to: {output_path}"
    if cache_hit:
        msg += " (from cache)"
    if images:
        msg += f" ({len(images)} images)"
    print(msg, file=sys.stderr)
803
804
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()