this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 806 lines 27 kB view raw
1#!/usr/bin/env python3 2""" 3PDF to Markdown Converter for LLM Context 4 5Extracts entire PDF content as clean, structured markdown. 6Images are extracted to cache directory and copied to output location. 7 8Features: 9- High-accuracy table extraction using IBM Docling (TableFormer AI model) 10- Aggressive persistent caching (extracts once, reuses forever) 11- Cache only cleared on explicit request or source file change 12 13Usage: 14 python pdf_to_md.py <input.pdf> [output.md] 15 python pdf_to_md.py <input.pdf> --docling # Accurate tables (slower) 16 python pdf_to_md.py <input.pdf> --clear-cache # Re-extract 17 python pdf_to_md.py --clear-all-cache # Clear entire cache 18 19Dependencies: 20 uv pip install pymupdf pymupdf4llm # Fast mode 21 uv pip install docling docling-core # Docling mode (optional) 22""" 23 24import argparse 25import sys 26import os 27import re 28import json 29import hashlib 30import shutil 31import tempfile 32from dataclasses import dataclass 33from pathlib import Path 34from datetime import datetime 35 36 37# ============================================================================= 38# DATACLASSES 39# ============================================================================= 40 41 42@dataclass 43class ExtractionConfig: 44 """Configuration for PDF extraction.""" 45 46 pdf_path: str 47 docling: bool = False 48 images_scale: float = 4.0 49 50 51@dataclass 52class ExtractionResult: 53 """Result of PDF extraction or cache load.""" 54 55 markdown: str 56 image_dir: Path | None 57 total_pages: int 58 from_cache: bool = False 59 60 61# Suppress PyMuPDF's "Consider using pymupdf_layout" recommendation 62os.environ.setdefault("PYMUPDF_SUGGEST_LAYOUT_ANALYZER", "0") 63 64# Default cache directory 65DEFAULT_CACHE_DIR = Path.home() / ".cache" / "pdf-to-markdown" 66 67 68# ============================================================================= 69# CACHE MANAGER 70# ============================================================================= 
71 72 73class CacheManager: 74 """Manages PDF extraction cache.""" 75 76 def __init__(self, cache_dir: Path = None): 77 self.cache_dir = cache_dir or DEFAULT_CACHE_DIR 78 79 def get_key(self, config: ExtractionConfig) -> str: 80 """Generate cache key from file content + size + mode.""" 81 p = Path(config.pdf_path).resolve() 82 stat = p.stat() 83 file_size = stat.st_size 84 85 chunk_size = 65536 # 64KB 86 hasher = hashlib.sha256() 87 88 with open(p, "rb") as f: 89 if file_size <= chunk_size * 2: 90 hasher.update(f.read()) 91 else: 92 hasher.update(f.read(chunk_size)) 93 f.seek(-chunk_size, 2) 94 hasher.update(f.read(chunk_size)) 95 96 mode = f"docling_{config.images_scale}" if config.docling else "fast" 97 raw = f"{file_size}|{hasher.hexdigest()}|{mode}" 98 return hashlib.sha256(raw.encode()).hexdigest()[:16] 99 100 def _get_dir(self, cache_key: str) -> Path: 101 """Get cache directory for a given cache key.""" 102 return self.cache_dir / cache_key 103 104 def is_valid(self, config: ExtractionConfig) -> tuple[bool, str]: 105 """Check if valid cache exists for this PDF.""" 106 from extractor import EXTRACTOR_VERSION 107 108 try: 109 cache_key = self.get_key(config) 110 except (FileNotFoundError, OSError): 111 return False, "" 112 113 cache_dir = self._get_dir(cache_key) 114 metadata_file = cache_dir / "metadata.json" 115 output_file = cache_dir / "full_output.md" 116 117 if not metadata_file.exists() or not output_file.exists(): 118 return False, cache_key 119 120 try: 121 with open(metadata_file) as f: 122 metadata = json.load(f) 123 124 p = Path(config.pdf_path).resolve() 125 stat = p.stat() 126 127 if ( 128 metadata.get("source_size") != stat.st_size 129 or metadata.get("source_mtime") != stat.st_mtime 130 ): 131 return False, cache_key 132 133 if metadata.get("extractor_version") != EXTRACTOR_VERSION: 134 return False, cache_key 135 136 return True, cache_key 137 except (json.JSONDecodeError, KeyError, OSError): 138 return False, cache_key 139 140 def load(self, 
cache_key: str) -> ExtractionResult | None: 141 """Load markdown from cache.""" 142 cache_dir = self._get_dir(cache_key) 143 144 try: 145 full_md = (cache_dir / "full_output.md").read_text(encoding="utf-8") 146 with open(cache_dir / "metadata.json") as f: 147 metadata = json.load(f) 148 total_pages = metadata.get("total_pages", 0) 149 except (FileNotFoundError, IOError, json.JSONDecodeError, OSError) as e: 150 print( 151 f"WARNING: Cache corrupted ({e.__class__.__name__}), regenerating...", 152 file=sys.stderr, 153 ) 154 try: 155 if cache_dir.exists(): 156 shutil.rmtree(cache_dir) 157 except OSError: 158 pass 159 return None 160 161 # Check if markdown references images 162 has_image_refs = bool(re.search(r"!\[[^\]]*\]\([^)]+\)", full_md)) 163 164 # Get cached images directory 165 cached_image_dir = cache_dir / "images" 166 has_images = cached_image_dir.exists() and any(cached_image_dir.iterdir()) 167 168 # If markdown expects images but they're missing, invalidate cache 169 if has_image_refs and not has_images: 170 print( 171 "WARNING: Cache missing images, regenerating...", 172 file=sys.stderr, 173 ) 174 try: 175 shutil.rmtree(cache_dir) 176 except OSError: 177 pass 178 return None 179 180 image_dir = cached_image_dir if has_images else None 181 182 return ExtractionResult( 183 markdown=full_md, 184 image_dir=image_dir, 185 total_pages=total_pages, 186 from_cache=True, 187 ) 188 189 def _normalize_image_paths(self, markdown: str, source_image_dir: Path) -> str: 190 """Normalize image paths in markdown to use relative 'images/' prefix.""" 191 if not source_image_dir: 192 return markdown 193 194 source_image_dir = Path(source_image_dir) 195 196 def normalize_ref(match): 197 alt_text = match.group(1) 198 filename_raw = match.group(2) 199 filename = Path(filename_raw).name 200 if (source_image_dir / filename).exists(): 201 return f"![{alt_text}](images/{filename})" 202 return match.group(0) 203 204 pattern = r"!\[([^\]]*)\]\(([^)]+)\)" 205 return re.sub(pattern, 
normalize_ref, markdown) 206 207 def save(self, cache_key: str, result: ExtractionResult, config: ExtractionConfig): 208 """Save full extraction to cache using atomic writes.""" 209 from extractor import EXTRACTOR_VERSION 210 211 cache_dir = self._get_dir(cache_key) 212 cache_dir.mkdir(parents=True, exist_ok=True) 213 214 markdown = result.markdown 215 if result.image_dir: 216 markdown = self._normalize_image_paths(markdown, result.image_dir) 217 218 p = Path(config.pdf_path).resolve() 219 stat = p.stat() 220 mode = f"docling_{config.images_scale}" if config.docling else "fast" 221 222 metadata = { 223 "source_path": str(p), 224 "source_mtime": stat.st_mtime, 225 "source_size": stat.st_size, 226 "cache_key": cache_key, 227 "cached_at": datetime.now().isoformat(), 228 "total_pages": result.total_pages, 229 "extractor_version": EXTRACTOR_VERSION, 230 "mode": mode, 231 "images_scale": config.images_scale if config.docling else None, 232 } 233 234 temp_md = None 235 temp_json = None 236 try: 237 with tempfile.NamedTemporaryFile( 238 mode="w", 239 dir=cache_dir, 240 suffix=".md.tmp", 241 delete=False, 242 encoding="utf-8", 243 ) as f: 244 f.write(markdown) 245 temp_md = f.name 246 247 with tempfile.NamedTemporaryFile( 248 mode="w", dir=cache_dir, suffix=".json.tmp", delete=False 249 ) as f: 250 json.dump(metadata, f, indent=2) 251 temp_json = f.name 252 253 os.replace(temp_md, cache_dir / "full_output.md") 254 temp_md = None 255 os.replace(temp_json, cache_dir / "metadata.json") 256 temp_json = None 257 258 if result.image_dir and Path(result.image_dir).exists(): 259 temp_images = cache_dir / "images.tmp" 260 final_images = cache_dir / "images" 261 262 if temp_images.exists(): 263 shutil.rmtree(temp_images) 264 265 shutil.copytree(result.image_dir, temp_images) 266 267 if final_images.exists(): 268 shutil.rmtree(final_images) 269 os.rename(temp_images, final_images) 270 271 finally: 272 if temp_md and os.path.exists(temp_md): 273 os.unlink(temp_md) 274 if temp_json and 
os.path.exists(temp_json): 275 os.unlink(temp_json) 276 277 def clear(self, pdf_path: str = None) -> bool: 278 """Clear cache for specific PDF (both fast and docling modes) or entire cache.""" 279 if pdf_path: 280 # Clear BOTH fast and docling caches for this PDF 281 cleared = False 282 for docling_mode in [False, True]: 283 try: 284 config = ExtractionConfig(pdf_path=pdf_path, docling=docling_mode) 285 cache_key = self.get_key(config) 286 cache_dir = self._get_dir(cache_key) 287 if cache_dir.exists(): 288 shutil.rmtree(cache_dir) 289 cleared = True 290 except (FileNotFoundError, OSError): 291 pass 292 return cleared 293 else: 294 if self.cache_dir.exists(): 295 shutil.rmtree(self.cache_dir) 296 return True 297 return False 298 299 def get_stats(self) -> dict: 300 """Get statistics about the cache.""" 301 if not self.cache_dir.exists(): 302 return {"entries": 0, "total_size_mb": 0, "cache_dir": str(self.cache_dir)} 303 304 entries = 0 305 total_size = 0 306 307 for entry in self.cache_dir.iterdir(): 308 if entry.is_dir(): 309 entries += 1 310 for f in entry.rglob("*"): 311 if f.is_file(): 312 total_size += f.stat().st_size 313 314 return { 315 "entries": entries, 316 "total_size_mb": round(total_size / (1024 * 1024), 2), 317 "cache_dir": str(self.cache_dir), 318 } 319 320 321# ============================================================================= 322# IMAGE MANAGER 323# ============================================================================= 324 325 326class ImageManager: 327 """Manages image extraction and cleanup.""" 328 329 def __init__(self): 330 self._temp_dirs: list[Path] = [] 331 332 def create_temp_dir(self, pdf_path: str) -> Path: 333 """Create tracked temp directory for image extraction.""" 334 pdf_name = Path(pdf_path).stem 335 safe_name = re.sub(r"[^\w\-_]", "_", pdf_name) 336 temp_dir = Path(tempfile.mkdtemp(prefix=f"pdf_images_{safe_name}_")) 337 self._temp_dirs.append(temp_dir) 338 return temp_dir 339 340 def cleanup(self): 341 """Clean 
up all tracked temp directories.""" 342 for temp_dir in self._temp_dirs: 343 if temp_dir.exists(): 344 shutil.rmtree(temp_dir) 345 self._temp_dirs.clear() 346 347 def extract_references(self, markdown: str) -> set: 348 """Extract the set of image filenames referenced in markdown.""" 349 pattern = r"!\[[^\]]*\]\(([^)]+)\)" 350 matches = re.findall(pattern, markdown) 351 return {Path(m).name for m in matches} 352 353 def get_info(self, image_dir: Path, referenced_only: set = None) -> list: 354 """Get information about extracted images.""" 355 if not image_dir or not Path(image_dir).exists(): 356 return [] 357 358 image_dir = Path(image_dir) 359 images = [] 360 361 for img_path in sorted(image_dir.glob("*")): 362 if img_path.suffix.lower() in (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp"): 363 if referenced_only is not None and img_path.name not in referenced_only: 364 continue 365 366 try: 367 size_bytes = img_path.stat().st_size 368 size_kb = size_bytes / 1024 369 370 try: 371 import pymupdf 372 pix = pymupdf.Pixmap(str(img_path)) 373 dimensions = f"{pix.width}x{pix.height}" 374 pix = None 375 except Exception: 376 dimensions = "unknown" 377 378 images.append({ 379 "filename": img_path.name, 380 "path": str(img_path), 381 "size_kb": round(size_kb, 1), 382 "dimensions": dimensions, 383 }) 384 except Exception: 385 pass 386 387 return images 388 389 def enhance_markdown(self, markdown: str, image_dir: Path) -> str: 390 """Rewrite image references to use relative paths (portable, Windows-safe).""" 391 if not image_dir: 392 return markdown 393 394 image_dir = Path(image_dir) 395 396 def replace_image_ref(match): 397 alt_text = match.group(1) 398 filename_raw = match.group(2) 399 filename = Path(filename_raw).name 400 full_path = image_dir / filename 401 402 # Use relative path for portability (POSIX format for Windows compatibility) 403 relative_path = Path("images") / filename 404 405 if full_path.exists(): 406 try: 407 size_kb = round(full_path.stat().st_size / 
1024, 1) 408 try: 409 import pymupdf 410 pix = pymupdf.Pixmap(str(full_path)) 411 dims = f"{pix.width}x{pix.height}" 412 pix = None 413 except Exception: 414 dims = "?" 415 416 return f"![{alt_text}]({relative_path.as_posix()})\n\n**[Image: {filename} ({dims}, {size_kb}KB)]**" 417 except Exception: 418 return f"![{alt_text}]({relative_path.as_posix()})\n\n**[Image: {filename}]**" 419 420 return match.group(0) 421 422 pattern = r"!\[([^\]]*)\]\(([^)]+)\)" 423 return re.sub(pattern, replace_image_ref, markdown) 424 425 def create_summary(self, images: list) -> str: 426 """Create a summary section listing all extracted images.""" 427 if not images: 428 return "" 429 430 lines = [ 431 "", 432 "---", 433 "", 434 "## Extracted Images", 435 "", 436 "| # | File | Dimensions | Size |", 437 "|---|------|------------|------|", 438 ] 439 440 for i, img in enumerate(images, 1): 441 lines.append( 442 f"| {i} | {img['filename']} | {img['dimensions']} | {img['size_kb']}KB |" 443 ) 444 445 lines.append("") 446 return "\n".join(lines) 447 448 def finalize_images( 449 self, temp_dir: Path, cache_dir: Path, output_path: Path, show_progress: bool = False 450 ) -> Path | None: 451 """Finalize image directory after extraction. 452 453 Copies images from cache to output location (next to the markdown file). 454 Cleans up temp directories. 455 456 Returns the final image directory (next to output) for reference. 
457 """ 458 if not temp_dir: 459 return None 460 461 temp_dir = Path(temp_dir) 462 463 # Clean up empty temp directories 464 if not temp_dir.exists() or not any(temp_dir.iterdir()): 465 if temp_dir.exists(): 466 shutil.rmtree(temp_dir) 467 if temp_dir in self._temp_dirs: 468 self._temp_dirs.remove(temp_dir) 469 return None 470 471 # Clean up temp directory (images are saved to cache) 472 if temp_dir.exists(): 473 shutil.rmtree(temp_dir) 474 if temp_dir in self._temp_dirs: 475 self._temp_dirs.remove(temp_dir) 476 477 # Copy images from cache to output location 478 if cache_dir: 479 cached_image_dir = cache_dir / "images" 480 if cached_image_dir.exists() and any(cached_image_dir.iterdir()): 481 return self._copy_images_to_output(cached_image_dir, output_path, show_progress) 482 483 return None 484 485 def _copy_images_to_output( 486 self, source_dir: Path, output_path: Path, show_progress: bool = False 487 ) -> Path | None: 488 """Copy images from cache to output location (next to markdown file).""" 489 output_path = Path(output_path) 490 491 # Determine output images directory (sibling to markdown file) 492 if output_path.suffix: # It's a file path like "output.md" 493 output_images_dir = output_path.parent / "images" 494 else: # It's a directory 495 output_images_dir = output_path / "images" 496 497 # Don't copy if already at output location 498 if output_images_dir.resolve() == Path(source_dir).resolve(): 499 return output_images_dir 500 501 # Copy images to output location 502 output_images_dir.mkdir(parents=True, exist_ok=True) 503 copied_count = 0 504 for img in source_dir.iterdir(): 505 if img.is_file(): 506 shutil.copy2(img, output_images_dir / img.name) 507 copied_count += 1 508 509 if show_progress and copied_count > 0: 510 print(f"Copied {copied_count} images to: {output_images_dir}", file=sys.stderr) 511 512 return output_images_dir 513 514 515# ============================================================================= 516# PDF PROCESSING 517# 
def check_dependencies(docling_mode: bool = False):
    """Check if required packages are installed.

    Args:
        docling_mode: Check the Docling stack instead of pymupdf4llm.

    Returns:
        True when every required package imports; otherwise False, after
        printing the missing packages and an install command to stderr.
    """
    import importlib

    # (importable module name, package name shown to the user)
    if docling_mode:
        required = [
            ("pymupdf", "pymupdf"),
            ("docling", "docling"),
            ("docling_core", "docling-core"),
        ]
        install_cmd = "uv pip install pymupdf docling docling-core"
    else:
        required = [
            ("pymupdf", "pymupdf"),
            ("pymupdf4llm", "pymupdf4llm"),
        ]
        install_cmd = "uv pip install pymupdf pymupdf4llm"

    missing = []
    for module_name, package_name in required:
        try:
            importlib.import_module(module_name)
        except ImportError:
            missing.append(package_name)

    if missing:
        print(f"ERROR: Missing dependencies: {', '.join(missing)}", file=sys.stderr)
        print(f"Install with: {install_cmd}", file=sys.stderr)
        return False

    return True


def convert_pdf(pdf_path, image_dir, show_progress=False, docling=False, images_scale=4.0):
    """Convert PDF to markdown.

    Dispatches to ``extractor.extract_pdf_docling`` when ``docling`` is true
    and to ``extractor.extract_pdf_fast`` otherwise. ``images_scale`` is only
    passed to the Docling extractor.

    Returns:
        The markdown produced by the selected extractor.
    """
    if docling:
        from extractor import extract_pdf_docling

        markdown, _image_paths = extract_pdf_docling(
            pdf_path,
            output_dir=image_dir,
            images_scale=images_scale,
            show_progress=show_progress,
        )
        return markdown

    from extractor import extract_pdf_fast

    return extract_pdf_fast(
        pdf_path,
        image_dir=image_dir,
        show_progress=show_progress,
    )


def add_metadata_header(markdown, pdf_path, total_pages, image_dir=None, cached=False):
    """Add metadata header to markdown output.

    The header is a ``---`` delimited block with ``source`` (base name of
    ``pdf_path``), ``total_pages`` and ``extracted_at`` (current local time),
    plus ``from_cache: true`` when ``cached`` and ``images_dir: images`` when
    ``image_dir`` is truthy. A blank line separates it from the markdown.
    """
    header_lines = [
        "---",
        f"source: {os.path.basename(pdf_path)}",
        f"total_pages: {total_pages}",
        f"extracted_at: {datetime.now().isoformat()}",
    ]

    if cached:
        header_lines.append("from_cache: true")

    if image_dir:
        # Use relative path for portability
        header_lines.append("images_dir: images")

    header_lines.extend(["---", "", ""])

    return "\n".join(header_lines) + markdown


# =============================================================================
# MAIN
# =============================================================================


def main():
    """Command-line entry point.

    Handles the cache maintenance flags, then converts the input PDF (from
    cache when a valid entry exists, otherwise by extracting and caching),
    copies images next to the output file and writes the markdown.
    Exits with status 1 on missing input, missing dependencies, or a failed
    conversion / cache write.
    """
    parser = argparse.ArgumentParser(
        description="Convert PDF to Markdown for LLM context (with persistent caching)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python pdf_to_md.py document.pdf                # Output to document.md (cached)
  python pdf_to_md.py document.pdf output.md      # Custom output path
  python pdf_to_md.py document.pdf --docling      # Accurate tables (slower)
  python pdf_to_md.py document.pdf --clear-cache  # Clear cache and re-extract
  python pdf_to_md.py --clear-all-cache           # Clear entire cache

Caching:
  PDFs are cached in ~/.cache/pdf-to-markdown/
  Cache is keyed by file content hash + extraction mode.
  Cache persists until explicitly cleared or source PDF changes.
        """,
    )

    parser.add_argument("input", nargs="?", help="Input PDF file path")
    parser.add_argument("output", nargs="?", help="Output markdown file path (default: <input>.md)")
    parser.add_argument(
        "--docling",
        "--accurate",
        action="store_true",
        dest="docling",
        help="Use Docling AI for complex/borderless tables (slower, ~1 sec/page)",
    )
    parser.add_argument("--no-progress", action="store_true", help="Disable progress indicator")

    # Cache options
    parser.add_argument(
        "--clear-cache",
        action="store_true",
        help="Clear cache for this PDF before processing",
    )
    parser.add_argument(
        "--clear-all-cache",
        action="store_true",
        help="Clear entire cache directory and exit",
    )
    parser.add_argument("--cache-stats", action="store_true", help="Show cache statistics and exit")

    args = parser.parse_args()

    cache_mgr = CacheManager()

    # Handle cache management commands
    if args.clear_all_cache:
        if cache_mgr.clear():
            print(f"Cache cleared: {cache_mgr.cache_dir}", file=sys.stderr)
        else:
            print("Cache was already empty.", file=sys.stderr)
        sys.exit(0)

    if args.cache_stats:
        stats = cache_mgr.get_stats()
        print(f"Cache directory: {stats['cache_dir']}", file=sys.stderr)
        print(f"Cached PDFs: {stats['entries']}", file=sys.stderr)
        print(f"Total size: {stats['total_size_mb']} MB", file=sys.stderr)
        sys.exit(0)

    # Require input for all other operations
    if not args.input:
        parser.error("the following arguments are required: input")

    # Handle --clear-cache
    if args.clear_cache:
        if cache_mgr.clear(args.input):
            print(f"Cache cleared for: {args.input}", file=sys.stderr)
        else:
            print(f"No cache found for: {args.input}", file=sys.stderr)

    # Validate input exists
    if not os.path.exists(args.input):
        print(f"ERROR: File not found: {args.input}", file=sys.stderr)
        sys.exit(1)

    if not args.input.lower().endswith(".pdf"):
        print(f"WARNING: File may not be a PDF: {args.input}", file=sys.stderr)

    output_path = args.output or os.path.splitext(args.input)[0] + ".md"

    # Writing the markdown over the source (e.g. "tool a.pdf a.pdf", or an
    # input that already ends in ".md") would destroy the input file.
    if os.path.realpath(output_path) == os.path.realpath(args.input):
        print(
            f"ERROR: Output path would overwrite the input file: {output_path}",
            file=sys.stderr,
        )
        sys.exit(1)

    show_progress = sys.stderr.isatty() and not args.no_progress

    # Check cache
    config = ExtractionConfig(pdf_path=args.input, docling=args.docling)
    valid, cache_key = cache_mgr.is_valid(config)

    result = None
    image_dir = None
    cache_hit = False
    total_pages = 0

    if valid:
        if show_progress:
            mode = "docling" if args.docling else "fast"
            print(f"Loading from cache ({mode} mode)...", file=sys.stderr)

        cache_result = cache_mgr.load(cache_key)
        if cache_result:
            result = cache_result.markdown
            total_pages = cache_result.total_pages
            cache_hit = True

            # Copy images from cache to output location
            if cache_result.image_dir:
                image_dir = ImageManager()._copy_images_to_output(
                    cache_result.image_dir, output_path, show_progress
                )

    # Extract if no cache hit
    if not cache_hit:
        if not check_dependencies(docling_mode=args.docling):
            sys.exit(1)

        from extractor import get_page_count

        if not cache_key:
            cache_key = cache_mgr.get_key(config)

        img_mgr = ImageManager()
        temp_image_dir = img_mgr.create_temp_dir(args.input)

        try:
            try:
                total_pages = get_page_count(args.input)

                if show_progress:
                    if args.docling:
                        print(
                            f"Extracting {total_pages} pages with Docling AI (~1 sec/page)...",
                            file=sys.stderr,
                        )
                    else:
                        print(
                            f"Extracting {total_pages} pages with PyMuPDF (fast mode)...",
                            file=sys.stderr,
                        )

                result = convert_pdf(
                    args.input,
                    image_dir=temp_image_dir,
                    show_progress=show_progress,
                    docling=args.docling,
                    # Keep the extraction scale in sync with the cache key.
                    images_scale=config.images_scale,
                )
            except Exception as e:
                print(f"ERROR: Conversion failed: {e}", file=sys.stderr)
                sys.exit(1)

            # Save to cache
            extraction_result = ExtractionResult(
                markdown=result,
                image_dir=temp_image_dir,
                total_pages=total_pages,
            )
            try:
                cache_mgr.save(cache_key, extraction_result, config)
            except OSError as e:
                print(f"ERROR: Could not write cache: {e}", file=sys.stderr)
                sys.exit(1)
            if show_progress:
                print(f"Cached: {cache_mgr._get_dir(cache_key)}", file=sys.stderr)

            # Finalize images
            image_dir = img_mgr.finalize_images(
                temp_dir=temp_image_dir,
                cache_dir=cache_mgr._get_dir(cache_key),
                output_path=output_path,
                show_progress=show_progress,
            )
        finally:
            # Runs on sys.exit() too, so the temp image directory never leaks;
            # a no-op once finalize_images() has removed it.
            img_mgr.cleanup()

    # Format output
    output = result
    img_mgr_for_output = ImageManager()  # Fresh instance for output processing

    referenced_images = img_mgr_for_output.extract_references(result) if result else set()

    images = []
    if image_dir:
        output = img_mgr_for_output.enhance_markdown(output, image_dir)
        # Collected once and reused for both the summary and the final message.
        images = img_mgr_for_output.get_info(image_dir, referenced_only=referenced_images)
        if images:
            output += img_mgr_for_output.create_summary(images)

    output = add_metadata_header(output, args.input, total_pages, image_dir, cached=cache_hit)

    # Write output
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(output)

    msg = f"Converted {total_pages} pages to: {output_path}"
    if cache_hit:
        msg += " (from cache)"
    if images:
        msg += f" ({len(images)} images)"
    print(msg, file=sys.stderr)


if __name__ == "__main__":
    main()