scripts/pdf_to_md.py at main · alice.mosphere.at/claude-skill-pdf-to-markdown

alice.mosphere.at / claude-skill-pdf-to-markdown
fork
this repo has no description
fork
claude-skill-pdf-to-markdown / scripts / pdf_to_md.py
at main 806 lines 27 kB view raw
wrap content
alice Fix code review findings: cache clearing, missing images, and portable paths 5mo ago
9299034c
  1#!/usr/bin/env python3
  2"""
  3PDF to Markdown Converter for LLM Context
  4
  5Extracts entire PDF content as clean, structured markdown.
  6Images are extracted to cache directory and copied to output location.
  7
  8Features:
  9- High-accuracy table extraction using IBM Docling (TableFormer AI model)
 10- Aggressive persistent caching (extracts once, reuses forever)
 11- Cache only cleared on explicit request or source file change
 12
 13Usage:
 14    python pdf_to_md.py <input.pdf> [output.md]
 15    python pdf_to_md.py <input.pdf> --docling      # Accurate tables (slower)
 16    python pdf_to_md.py <input.pdf> --clear-cache  # Re-extract
 17    python pdf_to_md.py --clear-all-cache          # Clear entire cache
 18
 19Dependencies:
 20    uv pip install pymupdf pymupdf4llm   # Fast mode
 21    uv pip install docling docling-core  # Docling mode (optional)
 22"""
 23
 24import argparse
 25import sys
 26import os
 27import re
 28import json
 29import hashlib
 30import shutil
 31import tempfile
 32from dataclasses import dataclass
 33from pathlib import Path
 34from datetime import datetime
 35
 36
 37# =============================================================================
 38# DATACLASSES
 39# =============================================================================
 40
 41
 42@dataclass
 43class ExtractionConfig:
 44    """Configuration for PDF extraction."""
 45
 46    pdf_path: str
 47    docling: bool = False
 48    images_scale: float = 4.0
 49
 50
 51@dataclass
 52class ExtractionResult:
 53    """Result of PDF extraction or cache load."""
 54
 55    markdown: str
 56    image_dir: Path | None
 57    total_pages: int
 58    from_cache: bool = False
 59
 60
 61# Suppress PyMuPDF's "Consider using pymupdf_layout" recommendation
 62os.environ.setdefault("PYMUPDF_SUGGEST_LAYOUT_ANALYZER", "0")
 63
 64# Default cache directory
 65DEFAULT_CACHE_DIR = Path.home() / ".cache" / "pdf-to-markdown"
 66
 67
 68# =============================================================================
 69# CACHE MANAGER
 70# =============================================================================
 71
 72
 73class CacheManager:
 74    """Manages PDF extraction cache."""
 75
 76    def __init__(self, cache_dir: Path = None):
 77        self.cache_dir = cache_dir or DEFAULT_CACHE_DIR
 78
 79    def get_key(self, config: ExtractionConfig) -> str:
 80        """Generate cache key from file content + size + mode."""
 81        p = Path(config.pdf_path).resolve()
 82        stat = p.stat()
 83        file_size = stat.st_size
 84
 85        chunk_size = 65536  # 64KB
 86        hasher = hashlib.sha256()
 87
 88        with open(p, "rb") as f:
 89            if file_size <= chunk_size * 2:
 90                hasher.update(f.read())
 91            else:
 92                hasher.update(f.read(chunk_size))
 93                f.seek(-chunk_size, 2)
 94                hasher.update(f.read(chunk_size))
 95
 96        mode = f"docling_{config.images_scale}" if config.docling else "fast"
 97        raw = f"{file_size}|{hasher.hexdigest()}|{mode}"
 98        return hashlib.sha256(raw.encode()).hexdigest()[:16]
 99
100    def _get_dir(self, cache_key: str) -> Path:
101        """Get cache directory for a given cache key."""
102        return self.cache_dir / cache_key
103
104    def is_valid(self, config: ExtractionConfig) -> tuple[bool, str]:
105        """Check if valid cache exists for this PDF."""
106        from extractor import EXTRACTOR_VERSION
107
108        try:
109            cache_key = self.get_key(config)
110        except (FileNotFoundError, OSError):
111            return False, ""
112
113        cache_dir = self._get_dir(cache_key)
114        metadata_file = cache_dir / "metadata.json"
115        output_file = cache_dir / "full_output.md"
116
117        if not metadata_file.exists() or not output_file.exists():
118            return False, cache_key
119
120        try:
121            with open(metadata_file) as f:
122                metadata = json.load(f)
123
124            p = Path(config.pdf_path).resolve()
125            stat = p.stat()
126
127            if (
128                metadata.get("source_size") != stat.st_size
129                or metadata.get("source_mtime") != stat.st_mtime
130            ):
131                return False, cache_key
132
133            if metadata.get("extractor_version") != EXTRACTOR_VERSION:
134                return False, cache_key
135
136            return True, cache_key
137        except (json.JSONDecodeError, KeyError, OSError):
138            return False, cache_key
139
140    def load(self, cache_key: str) -> ExtractionResult | None:
141        """Load markdown from cache."""
142        cache_dir = self._get_dir(cache_key)
143
144        try:
145            full_md = (cache_dir / "full_output.md").read_text(encoding="utf-8")
146            with open(cache_dir / "metadata.json") as f:
147                metadata = json.load(f)
148            total_pages = metadata.get("total_pages", 0)
149        except (FileNotFoundError, IOError, json.JSONDecodeError, OSError) as e:
150            print(
151                f"WARNING: Cache corrupted ({e.__class__.__name__}), regenerating...",
152                file=sys.stderr,
153            )
154            try:
155                if cache_dir.exists():
156                    shutil.rmtree(cache_dir)
157            except OSError:
158                pass
159            return None
160
161        # Check if markdown references images
162        has_image_refs = bool(re.search(r"!\[[^\]]*\]\([^)]+\)", full_md))
163
164        # Get cached images directory
165        cached_image_dir = cache_dir / "images"
166        has_images = cached_image_dir.exists() and any(cached_image_dir.iterdir())
167
168        # If markdown expects images but they're missing, invalidate cache
169        if has_image_refs and not has_images:
170            print(
171                "WARNING: Cache missing images, regenerating...",
172                file=sys.stderr,
173            )
174            try:
175                shutil.rmtree(cache_dir)
176            except OSError:
177                pass
178            return None
179
180        image_dir = cached_image_dir if has_images else None
181
182        return ExtractionResult(
183            markdown=full_md,
184            image_dir=image_dir,
185            total_pages=total_pages,
186            from_cache=True,
187        )
188
189    def _normalize_image_paths(self, markdown: str, source_image_dir: Path) -> str:
190        """Normalize image paths in markdown to use relative 'images/' prefix."""
191        if not source_image_dir:
192            return markdown
193
194        source_image_dir = Path(source_image_dir)
195
196        def normalize_ref(match):
197            alt_text = match.group(1)
198            filename_raw = match.group(2)
199            filename = Path(filename_raw).name
200            if (source_image_dir / filename).exists():
201                return f"![{alt_text}](images/{filename})"
202            return match.group(0)
203
204        pattern = r"!\[([^\]]*)\]\(([^)]+)\)"
205        return re.sub(pattern, normalize_ref, markdown)
206
207    def save(self, cache_key: str, result: ExtractionResult, config: ExtractionConfig):
208        """Save full extraction to cache using atomic writes."""
209        from extractor import EXTRACTOR_VERSION
210
211        cache_dir = self._get_dir(cache_key)
212        cache_dir.mkdir(parents=True, exist_ok=True)
213
214        markdown = result.markdown
215        if result.image_dir:
216            markdown = self._normalize_image_paths(markdown, result.image_dir)
217
218        p = Path(config.pdf_path).resolve()
219        stat = p.stat()
220        mode = f"docling_{config.images_scale}" if config.docling else "fast"
221
222        metadata = {
223            "source_path": str(p),
224            "source_mtime": stat.st_mtime,
225            "source_size": stat.st_size,
226            "cache_key": cache_key,
227            "cached_at": datetime.now().isoformat(),
228            "total_pages": result.total_pages,
229            "extractor_version": EXTRACTOR_VERSION,
230            "mode": mode,
231            "images_scale": config.images_scale if config.docling else None,
232        }
233
234        temp_md = None
235        temp_json = None
236        try:
237            with tempfile.NamedTemporaryFile(
238                mode="w",
239                dir=cache_dir,
240                suffix=".md.tmp",
241                delete=False,
242                encoding="utf-8",
243            ) as f:
244                f.write(markdown)
245                temp_md = f.name
246
247            with tempfile.NamedTemporaryFile(
248                mode="w", dir=cache_dir, suffix=".json.tmp", delete=False
249            ) as f:
250                json.dump(metadata, f, indent=2)
251                temp_json = f.name
252
253            os.replace(temp_md, cache_dir / "full_output.md")
254            temp_md = None
255            os.replace(temp_json, cache_dir / "metadata.json")
256            temp_json = None
257
258            if result.image_dir and Path(result.image_dir).exists():
259                temp_images = cache_dir / "images.tmp"
260                final_images = cache_dir / "images"
261
262                if temp_images.exists():
263                    shutil.rmtree(temp_images)
264
265                shutil.copytree(result.image_dir, temp_images)
266
267                if final_images.exists():
268                    shutil.rmtree(final_images)
269                os.rename(temp_images, final_images)
270
271        finally:
272            if temp_md and os.path.exists(temp_md):
273                os.unlink(temp_md)
274            if temp_json and os.path.exists(temp_json):
275                os.unlink(temp_json)
276
277    def clear(self, pdf_path: str = None) -> bool:
278        """Clear cache for specific PDF (both fast and docling modes) or entire cache."""
279        if pdf_path:
280            # Clear BOTH fast and docling caches for this PDF
281            cleared = False
282            for docling_mode in [False, True]:
283                try:
284                    config = ExtractionConfig(pdf_path=pdf_path, docling=docling_mode)
285                    cache_key = self.get_key(config)
286                    cache_dir = self._get_dir(cache_key)
287                    if cache_dir.exists():
288                        shutil.rmtree(cache_dir)
289                        cleared = True
290                except (FileNotFoundError, OSError):
291                    pass
292            return cleared
293        else:
294            if self.cache_dir.exists():
295                shutil.rmtree(self.cache_dir)
296                return True
297            return False
298
299    def get_stats(self) -> dict:
300        """Get statistics about the cache."""
301        if not self.cache_dir.exists():
302            return {"entries": 0, "total_size_mb": 0, "cache_dir": str(self.cache_dir)}
303
304        entries = 0
305        total_size = 0
306
307        for entry in self.cache_dir.iterdir():
308            if entry.is_dir():
309                entries += 1
310                for f in entry.rglob("*"):
311                    if f.is_file():
312                        total_size += f.stat().st_size
313
314        return {
315            "entries": entries,
316            "total_size_mb": round(total_size / (1024 * 1024), 2),
317            "cache_dir": str(self.cache_dir),
318        }
319
320
321# =============================================================================
322# IMAGE MANAGER
323# =============================================================================
324
325
326class ImageManager:
327    """Manages image extraction and cleanup."""
328
329    def __init__(self):
330        self._temp_dirs: list[Path] = []
331
332    def create_temp_dir(self, pdf_path: str) -> Path:
333        """Create tracked temp directory for image extraction."""
334        pdf_name = Path(pdf_path).stem
335        safe_name = re.sub(r"[^\w\-_]", "_", pdf_name)
336        temp_dir = Path(tempfile.mkdtemp(prefix=f"pdf_images_{safe_name}_"))
337        self._temp_dirs.append(temp_dir)
338        return temp_dir
339
340    def cleanup(self):
341        """Clean up all tracked temp directories."""
342        for temp_dir in self._temp_dirs:
343            if temp_dir.exists():
344                shutil.rmtree(temp_dir)
345        self._temp_dirs.clear()
346
347    def extract_references(self, markdown: str) -> set:
348        """Extract the set of image filenames referenced in markdown."""
349        pattern = r"!\[[^\]]*\]\(([^)]+)\)"
350        matches = re.findall(pattern, markdown)
351        return {Path(m).name for m in matches}
352
353    def get_info(self, image_dir: Path, referenced_only: set = None) -> list:
354        """Get information about extracted images."""
355        if not image_dir or not Path(image_dir).exists():
356            return []
357
358        image_dir = Path(image_dir)
359        images = []
360
361        for img_path in sorted(image_dir.glob("*")):
362            if img_path.suffix.lower() in (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp"):
363                if referenced_only is not None and img_path.name not in referenced_only:
364                    continue
365
366                try:
367                    size_bytes = img_path.stat().st_size
368                    size_kb = size_bytes / 1024
369
370                    try:
371                        import pymupdf
372                        pix = pymupdf.Pixmap(str(img_path))
373                        dimensions = f"{pix.width}x{pix.height}"
374                        pix = None
375                    except Exception:
376                        dimensions = "unknown"
377
378                    images.append({
379                        "filename": img_path.name,
380                        "path": str(img_path),
381                        "size_kb": round(size_kb, 1),
382                        "dimensions": dimensions,
383                    })
384                except Exception:
385                    pass
386
387        return images
388
389    def enhance_markdown(self, markdown: str, image_dir: Path) -> str:
390        """Rewrite image references to use relative paths (portable, Windows-safe)."""
391        if not image_dir:
392            return markdown
393
394        image_dir = Path(image_dir)
395
396        def replace_image_ref(match):
397            alt_text = match.group(1)
398            filename_raw = match.group(2)
399            filename = Path(filename_raw).name
400            full_path = image_dir / filename
401
402            # Use relative path for portability (POSIX format for Windows compatibility)
403            relative_path = Path("images") / filename
404
405            if full_path.exists():
406                try:
407                    size_kb = round(full_path.stat().st_size / 1024, 1)
408                    try:
409                        import pymupdf
410                        pix = pymupdf.Pixmap(str(full_path))
411                        dims = f"{pix.width}x{pix.height}"
412                        pix = None
413                    except Exception:
414                        dims = "?"
415
416                    return f"![{alt_text}]({relative_path.as_posix()})\n\n**[Image: {filename} ({dims}, {size_kb}KB)]**"
417                except Exception:
418                    return f"![{alt_text}]({relative_path.as_posix()})\n\n**[Image: {filename}]**"
419
420            return match.group(0)
421
422        pattern = r"!\[([^\]]*)\]\(([^)]+)\)"
423        return re.sub(pattern, replace_image_ref, markdown)
424
425    def create_summary(self, images: list) -> str:
426        """Create a summary section listing all extracted images."""
427        if not images:
428            return ""
429
430        lines = [
431            "",
432            "---",
433            "",
434            "## Extracted Images",
435            "",
436            "| # | File | Dimensions | Size |",
437            "|---|------|------------|------|",
438        ]
439
440        for i, img in enumerate(images, 1):
441            lines.append(
442                f"| {i} | {img['filename']} | {img['dimensions']} | {img['size_kb']}KB |"
443            )
444
445        lines.append("")
446        return "\n".join(lines)
447
448    def finalize_images(
449        self, temp_dir: Path, cache_dir: Path, output_path: Path, show_progress: bool = False
450    ) -> Path | None:
451        """Finalize image directory after extraction.
452
453        Copies images from cache to output location (next to the markdown file).
454        Cleans up temp directories.
455
456        Returns the final image directory (next to output) for reference.
457        """
458        if not temp_dir:
459            return None
460
461        temp_dir = Path(temp_dir)
462
463        # Clean up empty temp directories
464        if not temp_dir.exists() or not any(temp_dir.iterdir()):
465            if temp_dir.exists():
466                shutil.rmtree(temp_dir)
467            if temp_dir in self._temp_dirs:
468                self._temp_dirs.remove(temp_dir)
469            return None
470
471        # Clean up temp directory (images are saved to cache)
472        if temp_dir.exists():
473            shutil.rmtree(temp_dir)
474            if temp_dir in self._temp_dirs:
475                self._temp_dirs.remove(temp_dir)
476
477        # Copy images from cache to output location
478        if cache_dir:
479            cached_image_dir = cache_dir / "images"
480            if cached_image_dir.exists() and any(cached_image_dir.iterdir()):
481                return self._copy_images_to_output(cached_image_dir, output_path, show_progress)
482
483        return None
484
485    def _copy_images_to_output(
486        self, source_dir: Path, output_path: Path, show_progress: bool = False
487    ) -> Path | None:
488        """Copy images from cache to output location (next to markdown file)."""
489        output_path = Path(output_path)
490
491        # Determine output images directory (sibling to markdown file)
492        if output_path.suffix:  # It's a file path like "output.md"
493            output_images_dir = output_path.parent / "images"
494        else:  # It's a directory
495            output_images_dir = output_path / "images"
496
497        # Don't copy if already at output location
498        if output_images_dir.resolve() == Path(source_dir).resolve():
499            return output_images_dir
500
501        # Copy images to output location
502        output_images_dir.mkdir(parents=True, exist_ok=True)
503        copied_count = 0
504        for img in source_dir.iterdir():
505            if img.is_file():
506                shutil.copy2(img, output_images_dir / img.name)
507                copied_count += 1
508
509        if show_progress and copied_count > 0:
510            print(f"Copied {copied_count} images to: {output_images_dir}", file=sys.stderr)
511
512        return output_images_dir
513
514
515# =============================================================================
516# PDF PROCESSING
517# =============================================================================
518
519
def check_dependencies(docling_mode: bool = False):
    """Verify that the packages needed for the chosen mode can be imported.

    pymupdf is always required. Docling mode additionally needs docling and
    docling-core; fast mode needs pymupdf4llm instead.

    Returns:
        True when every required package imports. Otherwise the missing
        packages and an install command are printed to stderr and False is
        returned.
    """
    # (importable module name, package name shown to the user), in report order.
    required = [("pymupdf", "pymupdf")]
    if docling_mode:
        required += [("docling", "docling"), ("docling_core", "docling-core")]
        install_cmd = "uv pip install pymupdf docling docling-core"
    else:
        required.append(("pymupdf4llm", "pymupdf4llm"))
        install_cmd = "uv pip install pymupdf pymupdf4llm"

    missing = []
    for module_name, package_name in required:
        try:
            __import__(module_name)
        except ImportError:
            missing.append(package_name)

    if not missing:
        return True

    print(f"ERROR: Missing dependencies: {', '.join(missing)}", file=sys.stderr)
    print(f"Install with: {install_cmd}", file=sys.stderr)
    return False
555
556
def convert_pdf(pdf_path, image_dir, show_progress=False, docling=False, images_scale=4.0):
    """Convert a PDF to markdown with the selected extractor.

    Delegates to ``extractor.extract_pdf_docling`` when *docling* is true and
    to ``extractor.extract_pdf_fast`` otherwise; the extractor is imported
    only when it is needed. *images_scale* is forwarded in Docling mode only.

    Returns:
        The markdown produced by the extractor.
    """
    if not docling:
        from extractor import extract_pdf_fast

        return extract_pdf_fast(
            pdf_path,
            image_dir=image_dir,
            show_progress=show_progress,
        )

    from extractor import extract_pdf_docling

    # The Docling extractor returns a pair; only its first element is used here.
    markdown, _unused_image_paths = extract_pdf_docling(
        pdf_path,
        output_dir=image_dir,
        images_scale=images_scale,
        show_progress=show_progress,
    )
    return markdown
578
579
def add_metadata_header(markdown, pdf_path, total_pages, image_dir=None, cached=False):
    """Prepend a ``---``-delimited metadata block to *markdown*.

    The block records the PDF's base name, the page count and the current
    timestamp, plus ``from_cache: true`` when *cached* is set and
    ``images_dir: images`` when *image_dir* is truthy. One blank line
    separates the block from the markdown.
    """
    fields = [
        f"source: {os.path.basename(pdf_path)}",
        f"total_pages: {total_pages}",
        f"extracted_at: {datetime.now().isoformat()}",
    ]
    if cached:
        fields.append("from_cache: true")
    if image_dir:
        # Always the relative directory name, so the output stays portable.
        fields.append("images_dir: images")

    front_matter = "\n".join(["---", *fields, "---"])
    return front_matter + "\n\n" + markdown
601
602
603# =============================================================================
604# MAIN
605# =============================================================================
606
607
def main():
    """Command-line entry point: convert a PDF to markdown, using the cache.

    Flow:
      1. Parse arguments and handle the cache-only commands
         (``--clear-all-cache``, ``--cache-stats``), which exit immediately.
      2. Optionally clear the cache for the input, then validate the input.
      3. Load the result from the cache, or extract it, save it to the cache
         and copy the images next to the output file.
      4. Enhance image references, append the image summary, prepend the
         metadata header and write the markdown file.

    Exits with status 1 when the input is missing, dependencies are missing
    or the conversion fails. Progress and status messages go to stderr.
    """
    parser = argparse.ArgumentParser(
        description="Convert PDF to Markdown for LLM context (with persistent caching)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python pdf_to_md.py document.pdf                    # Output to document.md (cached)
  python pdf_to_md.py document.pdf output.md         # Custom output path
  python pdf_to_md.py document.pdf --docling         # Accurate tables (slower)
  python pdf_to_md.py document.pdf --clear-cache     # Clear cache and re-extract
  python pdf_to_md.py --clear-all-cache              # Clear entire cache

Caching:
  PDFs are cached in ~/.cache/pdf-to-markdown/
  Cache is keyed by file content hash + extraction mode.
  Cache persists until explicitly cleared or source PDF changes.
        """,
    )

    parser.add_argument("input", nargs="?", help="Input PDF file path")
    parser.add_argument("output", nargs="?", help="Output markdown file path (default: <input>.md)")
    parser.add_argument(
        "--docling",
        "--accurate",
        action="store_true",
        dest="docling",
        help="Use Docling AI for complex/borderless tables (slower, ~1 sec/page)",
    )
    parser.add_argument("--no-progress", action="store_true", help="Disable progress indicator")

    # Cache options
    parser.add_argument(
        "--clear-cache",
        action="store_true",
        help="Clear cache for this PDF before processing",
    )
    parser.add_argument(
        "--clear-all-cache",
        action="store_true",
        help="Clear entire cache directory and exit",
    )
    parser.add_argument("--cache-stats", action="store_true", help="Show cache statistics and exit")

    args = parser.parse_args()

    cache_mgr = CacheManager()

    # Handle cache management commands
    if args.clear_all_cache:
        if cache_mgr.clear():
            print(f"Cache cleared: {cache_mgr.cache_dir}", file=sys.stderr)
        else:
            print("Cache was already empty.", file=sys.stderr)
        sys.exit(0)

    if args.cache_stats:
        stats = cache_mgr.get_stats()
        print(f"Cache directory: {stats['cache_dir']}", file=sys.stderr)
        print(f"Cached PDFs: {stats['entries']}", file=sys.stderr)
        print(f"Total size: {stats['total_size_mb']} MB", file=sys.stderr)
        sys.exit(0)

    # Require input for all other operations
    if not args.input:
        parser.error("the following arguments are required: input")

    # Handle --clear-cache
    if args.clear_cache:
        if cache_mgr.clear(args.input):
            print(f"Cache cleared for: {args.input}", file=sys.stderr)
        else:
            print(f"No cache found for: {args.input}", file=sys.stderr)

    # Validate input exists
    if not os.path.exists(args.input):
        print(f"ERROR: File not found: {args.input}", file=sys.stderr)
        sys.exit(1)

    if not args.input.lower().endswith(".pdf"):
        print(f"WARNING: File may not be a PDF: {args.input}", file=sys.stderr)

    show_progress = sys.stderr.isatty() and not args.no_progress

    # Resolve the destination once: image copying and the final write must
    # agree on where the output lives.
    output_path = args.output or os.path.splitext(args.input)[0] + ".md"

    # Check cache
    config = ExtractionConfig(pdf_path=args.input, docling=args.docling)
    valid, cache_key = cache_mgr.is_valid(config)

    result = None
    image_dir = None
    cache_hit = False

    if valid:
        if show_progress:
            mode = "docling" if args.docling else "fast"
            print(f"Loading from cache ({mode} mode)...", file=sys.stderr)

        # load() returns None for a corrupt entry; extraction then runs below.
        cache_result = cache_mgr.load(cache_key)
        if cache_result:
            result = cache_result.markdown
            total_pages = cache_result.total_pages
            cache_hit = True

            # Copy images from cache to output location
            if cache_result.image_dir:
                image_dir = ImageManager()._copy_images_to_output(
                    cache_result.image_dir, output_path, show_progress
                )

    # Extract if no cache hit
    if not cache_hit:
        if not check_dependencies(docling_mode=args.docling):
            sys.exit(1)

        from extractor import get_page_count

        total_pages = get_page_count(args.input)

        if not cache_key:
            cache_key = cache_mgr.get_key(config)

        img_mgr = ImageManager()
        temp_image_dir = img_mgr.create_temp_dir(args.input)

        try:
            if show_progress:
                if args.docling:
                    print(
                        f"Extracting {total_pages} pages with Docling AI (~1 sec/page)...",
                        file=sys.stderr,
                    )
                else:
                    print(
                        f"Extracting {total_pages} pages with PyMuPDF (fast mode)...",
                        file=sys.stderr,
                    )

            try:
                result = convert_pdf(
                    args.input,
                    image_dir=temp_image_dir,
                    show_progress=show_progress,
                    docling=args.docling,
                    images_scale=config.images_scale,
                )
            except Exception as e:
                # Top-level boundary: report the failure and exit non-zero.
                print(f"ERROR: Conversion failed: {e}", file=sys.stderr)
                sys.exit(1)

            # Save to cache
            extraction_result = ExtractionResult(
                markdown=result,
                image_dir=temp_image_dir,
                total_pages=total_pages,
            )
            cache_mgr.save(cache_key, extraction_result, config)
            if show_progress:
                print(f"Cached: {cache_mgr._get_dir(cache_key)}", file=sys.stderr)

            # Finalize images
            image_dir = img_mgr.finalize_images(
                temp_dir=temp_image_dir,
                cache_dir=cache_mgr._get_dir(cache_key),
                output_path=output_path,
                show_progress=show_progress,
            )
        finally:
            # Removes the temp image dir on every exit path, including a
            # failed conversion or cache save; no-op after finalize_images.
            img_mgr.cleanup()

    # Format output
    output = result
    img_mgr_for_output = ImageManager()  # Fresh instance for output processing

    referenced_images = img_mgr_for_output.extract_references(result) if result else set()

    images = []
    if image_dir:
        output = img_mgr_for_output.enhance_markdown(output, image_dir)
        images = img_mgr_for_output.get_info(image_dir, referenced_only=referenced_images)
        if images:
            output += img_mgr_for_output.create_summary(images)

    output = add_metadata_header(output, args.input, total_pages, image_dir, cached=cache_hit)

    # Write output
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(output)

    msg = f"Converted {total_pages} pages to: {output_path}"
    if cache_hit:
        msg += " (from cache)"
    if images:
        msg += f" ({len(images)} images)"
    print(msg, file=sys.stderr)


if __name__ == "__main__":
    main()
Configure Feed

Configure Feed