this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Refactor pdf_to_md.py: extract CacheManager and ImageManager classes

Major refactoring to reduce the complexity of the 414-line main() function:
- Add ExtractionConfig and ExtractionResult dataclasses
- Extract CacheManager class (consolidates 10 cache functions)
- Extract ImageManager class with context manager for cleanup

Bug fixes:
- Fix --no-images emitting broken image links from cache fallback
- Fix fast_noimages cache entries never being reused (the fast-mode filter in the source-path lookup only matched a mode of exactly "fast")
- Fix temp directory leak with --no-cache --stdout

Also adds missing --images-scale and --no-progress options to README.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

alice e20deddd 0f200655

+583 -595
+2
README.md
··· 51 51 | `--stdout` | Print to stdout instead of file | 52 52 | `--pages RANGE` | Page range (e.g., "1-5" or "1,3,5-7") | 53 53 | `--docling` | Use Docling AI for high-accuracy tables | 54 + | `--images-scale N` | Image resolution multiplier for Docling mode (default: 4.0) | 54 55 | `--no-images` | Skip image extraction | 55 56 | `--no-metadata` | Skip metadata header | 57 + | `--no-progress` | Disable progress indicator | 56 58 | `--no-cache` | Bypass cache entirely (no read or write) | 57 59 | `--clear-cache` | Clear cache for this PDF (works even if PDF deleted) | 58 60 | `--clear-all-cache` | Clear entire cache |
+581 -595
scripts/pdf_to_md.py
··· 29 29 import hashlib 30 30 import shutil 31 31 import tempfile 32 + from dataclasses import dataclass 32 33 from pathlib import Path 33 34 from datetime import datetime 34 35 36 + 37 + # ============================================================================= 38 + # DATACLASSES 39 + # ============================================================================= 40 + 41 + 42 + @dataclass 43 + class ExtractionConfig: 44 + """Configuration for PDF extraction.""" 45 + 46 + pdf_path: str 47 + docling: bool = False 48 + images_scale: float = 4.0 49 + no_images: bool = False 50 + 51 + 52 + @dataclass 53 + class ExtractionResult: 54 + """Result of PDF extraction or cache load.""" 55 + 56 + markdown: str 57 + image_dir: Path | None 58 + total_pages: int 59 + from_cache: bool = False 60 + 35 61 # Suppress PyMuPDF's "Consider using pymupdf_layout" recommendation 36 62 # This must be set before any pymupdf imports to take effect 37 63 os.environ.setdefault("PYMUPDF_SUGGEST_LAYOUT_ANALYZER", "0") 38 64 39 - # Cache directory 40 - CACHE_DIR = Path.home() / ".cache" / "pdf-to-markdown" 65 + # Default cache directory 66 + DEFAULT_CACHE_DIR = Path.home() / ".cache" / "pdf-to-markdown" 41 67 42 68 43 69 # ============================================================================= 44 - # CACHING FUNCTIONS 70 + # CACHE MANAGER 45 71 # ============================================================================= 46 72 47 73 48 - def get_cache_key( 49 - pdf_path: str, 50 - docling: bool = False, 51 - images_scale: float = 4.0, 52 - no_images: bool = False, 53 - ) -> str: 54 - """Generate cache key from file content + size + mode (path-independent). 
74 + class CacheManager: 75 + """Manages PDF extraction cache with clear ownership.""" 76 + 77 + def __init__(self, cache_dir: Path = None): 78 + self.cache_dir = cache_dir or DEFAULT_CACHE_DIR 79 + 80 + def get_key(self, config: ExtractionConfig) -> str: 81 + """Generate cache key from file content + size + mode (path-independent).""" 82 + p = Path(config.pdf_path).resolve() 83 + stat = p.stat() 84 + file_size = stat.st_size 55 85 56 - Args: 57 - pdf_path: Path to the PDF file 58 - docling: Whether Docling mode is used 59 - images_scale: Image resolution multiplier (only affects cache key in Docling mode) 60 - no_images: Whether image extraction is disabled 61 - """ 62 - p = Path(pdf_path).resolve() 63 - stat = p.stat() 64 - file_size = stat.st_size 86 + # Hash content for cache key identity 87 + # For files <= 128KB: hash entire content (avoids collision for similar templates) 88 + # For larger files: hash first 64KB + last 64KB for speed 89 + chunk_size = 65536 # 64KB 90 + hasher = hashlib.sha256() 65 91 66 - # Hash content for cache key identity 67 - # For files <= 128KB: hash entire content (avoids collision for similar templates) 68 - # For larger files: hash first 64KB + last 64KB for speed 69 - chunk_size = 65536 # 64KB 70 - hasher = hashlib.sha256() 92 + with open(p, "rb") as f: 93 + if file_size <= chunk_size * 2: 94 + hasher.update(f.read()) 95 + else: 96 + hasher.update(f.read(chunk_size)) 97 + f.seek(-chunk_size, 2) 98 + hasher.update(f.read(chunk_size)) 71 99 72 - with open(p, "rb") as f: 73 - if file_size <= chunk_size * 2: 74 - # Small/medium files: hash entire content 75 - hasher.update(f.read()) 100 + # Include images_scale in mode for Docling (affects extracted image resolution) 101 + if config.docling: 102 + mode = f"docling_{config.images_scale}" 76 103 else: 77 - # Large files: hash first + last chunks 78 - hasher.update(f.read(chunk_size)) 79 - f.seek(-chunk_size, 2) # Seek from end 80 - hasher.update(f.read(chunk_size)) 104 + mode = "fast" 
81 105 82 - # Include images_scale in mode for Docling (affects extracted image resolution) 83 - if docling: 84 - mode = f"docling_{images_scale}" 85 - else: 86 - mode = "fast" 106 + # Include no_images flag to avoid cache contamination 107 + if config.no_images: 108 + mode += "_noimages" 87 109 88 - # Include no_images flag to avoid cache contamination 89 - if no_images: 90 - mode += "_noimages" 110 + raw = f"{file_size}|{hasher.hexdigest()}|{mode}" 111 + return hashlib.sha256(raw.encode()).hexdigest()[:16] 91 112 92 - raw = f"{file_size}|{hasher.hexdigest()}|{mode}" 93 - return hashlib.sha256(raw.encode()).hexdigest()[:16] 113 + def _get_dir(self, cache_key: str) -> Path: 114 + """Get cache directory for a given cache key.""" 115 + return self.cache_dir / cache_key 94 116 117 + def _get_cached_total_pages(self, cache_key: str) -> int: 118 + """Get total_pages from cache metadata without loading full content.""" 119 + cache_dir = self._get_dir(cache_key) 120 + metadata_file = cache_dir / "metadata.json" 121 + try: 122 + with open(metadata_file) as f: 123 + metadata = json.load(f) 124 + return metadata.get("total_pages", 0) 125 + except (FileNotFoundError, json.JSONDecodeError, OSError): 126 + return 0 95 127 96 - def get_cache_dir(cache_key: str) -> Path: 97 - """Get cache directory for a given cache key.""" 98 - return CACHE_DIR / cache_key 128 + def is_valid(self, config: ExtractionConfig) -> tuple[bool, str]: 129 + """Check if valid cache exists for this PDF. 99 130 131 + Returns: 132 + (is_valid: bool, cache_key: str) 133 + """ 134 + from extractor import EXTRACTOR_VERSION 100 135 101 - def get_cached_total_pages(cache_key: str) -> int: 102 - """Get total_pages from cache metadata without loading the full content. 136 + try: 137 + cache_key = self.get_key(config) 138 + except (FileNotFoundError, OSError): 139 + return False, "" 103 140 104 - Used to enable page range parsing before loading the full cache. 105 - Returns 0 if cache metadata is unavailable. 
106 - """ 107 - cache_dir = get_cache_dir(cache_key) 108 - metadata_file = cache_dir / "metadata.json" 109 - try: 110 - with open(metadata_file) as f: 111 - metadata = json.load(f) 112 - return metadata.get("total_pages", 0) 113 - except (FileNotFoundError, json.JSONDecodeError, OSError): 114 - return 0 141 + cache_dir = self._get_dir(cache_key) 142 + metadata_file = cache_dir / "metadata.json" 143 + output_file = cache_dir / "full_output.md" 115 144 145 + if not metadata_file.exists() or not output_file.exists(): 146 + return False, cache_key 116 147 117 - def is_cache_valid( 118 - pdf_path: str, 119 - docling: bool = False, 120 - images_scale: float = 4.0, 121 - no_images: bool = False, 122 - ) -> tuple: 123 - """ 124 - Check if valid cache exists for this PDF. 148 + try: 149 + with open(metadata_file) as f: 150 + metadata = json.load(f) 151 + 152 + p = Path(config.pdf_path).resolve() 153 + stat = p.stat() 154 + 155 + if ( 156 + metadata.get("source_size") != stat.st_size 157 + or metadata.get("source_mtime") != stat.st_mtime 158 + ): 159 + return False, cache_key 160 + 161 + if metadata.get("extractor_version") != EXTRACTOR_VERSION: 162 + return False, cache_key 163 + 164 + return True, cache_key 165 + except (json.JSONDecodeError, KeyError, OSError): 166 + return False, cache_key 125 167 126 - Returns: 127 - (is_valid: bool, cache_key: str) 128 - """ 129 - from extractor import EXTRACTOR_VERSION 168 + def load( 169 + self, cache_key: str, pages: list = None, no_images: bool = False 170 + ) -> ExtractionResult | None: 171 + """Load markdown from cache, optionally slice specific pages. 130 172 131 - try: 132 - cache_key = get_cache_key( 133 - pdf_path, docling=docling, images_scale=images_scale, no_images=no_images 173 + Returns ExtractionResult or None if cache is corrupted. 
174 + """ 175 + cache_dir = self._get_dir(cache_key) 176 + 177 + try: 178 + full_md = (cache_dir / "full_output.md").read_text(encoding="utf-8") 179 + with open(cache_dir / "metadata.json") as f: 180 + metadata = json.load(f) 181 + total_pages = metadata.get("total_pages", 0) 182 + except (FileNotFoundError, IOError, json.JSONDecodeError, OSError) as e: 183 + print( 184 + f"WARNING: Cache corrupted ({e.__class__.__name__}), regenerating...", 185 + file=sys.stderr, 186 + ) 187 + try: 188 + if cache_dir.exists(): 189 + shutil.rmtree(cache_dir) 190 + except OSError: 191 + pass 192 + return None 193 + 194 + # Check for cached images (skip if no_images flag is set) 195 + image_dir = None 196 + if not no_images: 197 + cached_image_dir = cache_dir / "images" 198 + if cached_image_dir.exists() and any(cached_image_dir.iterdir()): 199 + image_dir = cached_image_dir 200 + else: 201 + # Strip image references when --no-images requested but cache has them 202 + # This prevents broken links when loading from a cache with images 203 + full_md = self._strip_image_references(full_md) 204 + 205 + # Slice pages if requested 206 + if pages: 207 + full_md = slice_pages_from_markdown(full_md, pages, total_pages) 208 + 209 + return ExtractionResult( 210 + markdown=full_md, 211 + image_dir=image_dir, 212 + total_pages=total_pages, 213 + from_cache=True, 134 214 ) 135 - except (FileNotFoundError, OSError): 136 - return False, "" 137 215 138 - cache_dir = get_cache_dir(cache_key) 139 - metadata_file = cache_dir / "metadata.json" 140 - output_file = cache_dir / "full_output.md" 216 + def _strip_image_references(self, markdown: str) -> str: 217 + """Remove image references from markdown, leaving alt text as placeholder.""" 141 218 142 - if not metadata_file.exists() or not output_file.exists(): 143 - return False, cache_key 219 + def replace_image(match): 220 + alt_text = match.group(1) 221 + if alt_text: 222 + return f"[Image: {alt_text}]" 223 + return "[Image]" 144 224 145 - # Verify 
metadata matches current file 146 - try: 147 - with open(metadata_file) as f: 148 - metadata = json.load(f) 225 + pattern = r"!\[([^\]]*)\]\([^)]+\)" 226 + return re.sub(pattern, replace_image, markdown) 149 227 150 - p = Path(pdf_path).resolve() 151 - stat = p.stat() 228 + def _normalize_image_paths(self, markdown: str, source_image_dir: Path) -> str: 229 + """Normalize image paths in markdown to use relative 'images/' prefix.""" 230 + if not source_image_dir: 231 + return markdown 152 232 153 - if ( 154 - metadata.get("source_size") != stat.st_size 155 - or metadata.get("source_mtime") != stat.st_mtime 156 - ): 157 - return False, cache_key 233 + source_image_dir = Path(source_image_dir) 158 234 159 - # Check extractor version - invalidate if extraction logic changed 160 - if metadata.get("extractor_version") != EXTRACTOR_VERSION: 161 - return False, cache_key 235 + def normalize_ref(match): 236 + alt_text = match.group(1) 237 + filename_raw = match.group(2) 238 + filename = Path(filename_raw).name 239 + if (source_image_dir / filename).exists(): 240 + return f"![{alt_text}](images/{filename})" 241 + return match.group(0) 162 242 163 - return True, cache_key 164 - except (json.JSONDecodeError, KeyError, OSError): 165 - return False, cache_key 243 + pattern = r"!\[([^\]]*)\]\(([^)]+)\)" 244 + return re.sub(pattern, normalize_ref, markdown) 166 245 246 + def save( 247 + self, 248 + cache_key: str, 249 + result: ExtractionResult, 250 + config: ExtractionConfig, 251 + ): 252 + """Save full extraction to cache using atomic writes.""" 253 + from extractor import EXTRACTOR_VERSION 167 254 168 - def load_from_cache( 169 - cache_key: str, pages: list = None, no_images: bool = False 170 - ) -> tuple: 171 - """ 172 - Load markdown from cache, optionally slice specific pages. 
255 + cache_dir = self._get_dir(cache_key) 256 + cache_dir.mkdir(parents=True, exist_ok=True) 173 257 174 - Args: 175 - cache_key: The cache key to load from 176 - pages: Optional list of page numbers to slice 177 - no_images: If True, skip loading image directory even if cached 258 + markdown = result.markdown 259 + if result.image_dir: 260 + markdown = self._normalize_image_paths(markdown, result.image_dir) 178 261 179 - Returns: 180 - (markdown: str, image_dir: Path or None, total_pages: int) 181 - Returns (None, None, 0) if cache is corrupted (caller should treat as cache miss) 182 - """ 183 - cache_dir = get_cache_dir(cache_key) 262 + # Build metadata 263 + p = Path(config.pdf_path).resolve() 264 + stat = p.stat() 265 + mode = f"docling_{config.images_scale}" if config.docling else "fast" 266 + if config.no_images: 267 + mode += "_noimages" 184 268 185 - try: 186 - # Load full markdown 187 - full_md = (cache_dir / "full_output.md").read_text(encoding="utf-8") 269 + metadata = { 270 + "source_path": str(p), 271 + "source_mtime": stat.st_mtime, 272 + "source_size": stat.st_size, 273 + "cache_key": cache_key, 274 + "cached_at": datetime.now().isoformat(), 275 + "total_pages": result.total_pages, 276 + "extractor_version": EXTRACTOR_VERSION, 277 + "mode": mode, 278 + "images_scale": config.images_scale if config.docling else None, 279 + "no_images": config.no_images, 280 + } 188 281 189 - # Load metadata for total pages 190 - with open(cache_dir / "metadata.json") as f: 191 - metadata = json.load(f) 192 - total_pages = metadata.get("total_pages", 0) 193 - except (FileNotFoundError, IOError, json.JSONDecodeError, OSError) as e: 194 - # Cache is corrupted or incomplete - delete it and return cache miss 195 - print( 196 - f"WARNING: Cache corrupted ({e.__class__.__name__}), regenerating...", 197 - file=sys.stderr, 198 - ) 282 + temp_md = None 283 + temp_json = None 199 284 try: 200 - if cache_dir.exists(): 201 - shutil.rmtree(cache_dir) 202 - except OSError: 203 - 
pass # Best effort cleanup 204 - return None, None, 0 285 + with tempfile.NamedTemporaryFile( 286 + mode="w", 287 + dir=cache_dir, 288 + suffix=".md.tmp", 289 + delete=False, 290 + encoding="utf-8", 291 + ) as f: 292 + f.write(markdown) 293 + temp_md = f.name 205 294 206 - # Check for cached images (skip if no_images flag is set) 207 - image_dir = None 208 - if not no_images: 209 - cached_image_dir = cache_dir / "images" 210 - if cached_image_dir.exists() and any(cached_image_dir.iterdir()): 211 - image_dir = cached_image_dir 295 + with tempfile.NamedTemporaryFile( 296 + mode="w", dir=cache_dir, suffix=".json.tmp", delete=False 297 + ) as f: 298 + json.dump(metadata, f, indent=2) 299 + temp_json = f.name 212 300 213 - # Slice pages if requested 214 - if pages: 215 - full_md = slice_pages_from_markdown(full_md, pages, total_pages) 301 + os.replace(temp_md, cache_dir / "full_output.md") 302 + temp_md = None 303 + os.replace(temp_json, cache_dir / "metadata.json") 304 + temp_json = None 216 305 217 - return full_md, image_dir, total_pages 306 + if result.image_dir and Path(result.image_dir).exists(): 307 + temp_images = cache_dir / "images.tmp" 308 + final_images = cache_dir / "images" 218 309 310 + if temp_images.exists(): 311 + shutil.rmtree(temp_images) 219 312 220 - def normalize_image_paths_for_cache(markdown: str, source_image_dir: Path) -> str: 221 - """ 222 - Normalize image paths in markdown to use relative 'images/' prefix. 313 + shutil.copytree(result.image_dir, temp_images) 223 314 224 - This ensures cached markdown has stable paths that work when loaded later, 225 - regardless of the original temp directory used during extraction. 
226 - """ 227 - if not source_image_dir: 228 - return markdown 315 + if final_images.exists(): 316 + shutil.rmtree(final_images) 317 + os.rename(temp_images, final_images) 229 318 230 - source_image_dir = Path(source_image_dir) 319 + finally: 320 + if temp_md and os.path.exists(temp_md): 321 + os.unlink(temp_md) 322 + if temp_json and os.path.exists(temp_json): 323 + os.unlink(temp_json) 231 324 232 - def normalize_ref(match): 233 - alt_text = match.group(1) 234 - filename_raw = match.group(2) 235 - filename = Path(filename_raw).name 325 + def find_by_source( 326 + self, pdf_path: str, docling: bool = None, images_scale: float = None 327 + ) -> list: 328 + """Find cache entries by source path in metadata. 236 329 237 - # Check if file exists in source directory 238 - if (source_image_dir / filename).exists(): 239 - # Rewrite to relative path for cache storage 240 - return f"![{alt_text}](images/{filename})" 241 - return match.group(0) 330 + Used as fallback when the source PDF no longer exists. 331 + Returns list of (cache_dir, metadata) tuples sorted by cached_at (freshest first). 332 + """ 333 + if not self.cache_dir.exists(): 334 + return [] 242 335 243 - pattern = r"!\[([^\]]*)\]\(([^)]+)\)" 244 - return re.sub(pattern, normalize_ref, markdown) 336 + pdf_path_resolved = str(Path(pdf_path).resolve()) 337 + matching = [] 245 338 339 + for entry in self.cache_dir.iterdir(): 340 + if not entry.is_dir(): 341 + continue 342 + metadata_file = entry / "metadata.json" 343 + if not metadata_file.exists(): 344 + continue 345 + try: 346 + with open(metadata_file) as f: 347 + metadata = json.load(f) 246 348 247 - def save_to_cache( 248 - cache_key: str, 249 - markdown: str, 250 - image_dir: Path, 251 - pdf_path: str, 252 - total_pages: int, 253 - docling: bool = False, 254 - images_scale: float = 4.0, 255 - no_images: bool = False, 256 - ): 257 - """Save full extraction to cache using atomic writes. 
349 + if metadata.get("source_path") != pdf_path_resolved: 350 + continue 258 351 259 - Uses temp files + os.replace() to ensure cache integrity even if 260 - the process is interrupted mid-write (power loss, Ctrl+C, etc.). 352 + if docling is not None: 353 + cached_mode = metadata.get("mode") 354 + if cached_mode is None: 355 + continue 356 + if docling and not cached_mode.startswith("docling"): 357 + continue 358 + # Match fast mode including fast_noimages variant 359 + if not docling and not cached_mode.startswith("fast"): 360 + continue 261 361 262 - Image paths in markdown are normalized to 'images/<filename>' before 263 - caching so they work correctly when loaded later. 264 - """ 265 - from extractor import EXTRACTOR_VERSION 362 + if docling and images_scale is not None: 363 + cached_scale = metadata.get("images_scale") 364 + if cached_scale is not None and cached_scale != images_scale: 365 + continue 266 366 267 - cache_dir = get_cache_dir(cache_key) 268 - cache_dir.mkdir(parents=True, exist_ok=True) 367 + matching.append((entry, metadata)) 368 + except (json.JSONDecodeError, OSError): 369 + continue 269 370 270 - # Normalize image paths before caching (convert temp paths to relative) 271 - if image_dir: 272 - markdown = normalize_image_paths_for_cache(markdown, image_dir) 371 + matching.sort(key=lambda x: x[1].get("cached_at", ""), reverse=True) 372 + return matching 273 373 274 - # Build metadata 275 - p = Path(pdf_path).resolve() 276 - stat = p.stat() 277 - mode = f"docling_{images_scale}" if docling else "fast" 278 - if no_images: 279 - mode += "_noimages" 374 + def clear(self, pdf_path: str = None) -> bool: 375 + """Clear cache for specific PDF (all modes and scale variants) or entire cache.""" 376 + if pdf_path: 377 + cleared = False 280 378 281 - metadata = { 282 - "source_path": str(p), 283 - "source_mtime": stat.st_mtime, 284 - "source_size": stat.st_size, 285 - "cache_key": cache_key, 286 - "cached_at": datetime.now().isoformat(), 287 - 
"total_pages": total_pages, 288 - "extractor_version": EXTRACTOR_VERSION, 289 - "mode": mode, 290 - "images_scale": images_scale if docling else None, 291 - "no_images": no_images, 292 - } 379 + try: 380 + config = ExtractionConfig(pdf_path=pdf_path) 381 + cache_key = self.get_key(config) 382 + cache_dir = self._get_dir(cache_key) 383 + if cache_dir.exists(): 384 + shutil.rmtree(cache_dir) 385 + cleared = True 386 + except (FileNotFoundError, OSError): 387 + pass 293 388 294 - # Write to temp files first, then atomic rename 295 - # (same filesystem guarantees atomicity via os.replace) 296 - temp_md = None 297 - temp_json = None 298 - try: 299 - # Write markdown to temp file 300 - with tempfile.NamedTemporaryFile( 301 - mode="w", dir=cache_dir, suffix=".md.tmp", delete=False, encoding="utf-8" 302 - ) as f: 303 - f.write(markdown) 304 - temp_md = f.name 389 + matching_caches = self.find_by_source(pdf_path) 390 + for cache_dir, _metadata in matching_caches: 391 + if cache_dir.exists(): 392 + shutil.rmtree(cache_dir) 393 + cleared = True 305 394 306 - # Write metadata to temp file 307 - with tempfile.NamedTemporaryFile( 308 - mode="w", dir=cache_dir, suffix=".json.tmp", delete=False 309 - ) as f: 310 - json.dump(metadata, f, indent=2) 311 - temp_json = f.name 395 + return cleared 396 + else: 397 + if self.cache_dir.exists(): 398 + shutil.rmtree(self.cache_dir) 399 + return True 400 + return False 312 401 313 - # Atomic moves (os.replace is atomic on POSIX when same filesystem) 314 - os.replace(temp_md, cache_dir / "full_output.md") 315 - temp_md = None # Successfully moved, don't cleanup 316 - os.replace(temp_json, cache_dir / "metadata.json") 317 - temp_json = None # Successfully moved, don't cleanup 402 + def get_stats(self) -> dict: 403 + """Get statistics about the cache.""" 404 + if not self.cache_dir.exists(): 405 + return {"entries": 0, "total_size_mb": 0, "cache_dir": str(self.cache_dir)} 318 406 319 - # Copy images to temp dir, then rename atomically 320 - if 
image_dir and Path(image_dir).exists(): 321 - temp_images = cache_dir / "images.tmp" 322 - final_images = cache_dir / "images" 407 + entries = 0 408 + total_size = 0 323 409 324 - # Clean up any stale temp directory 325 - if temp_images.exists(): 326 - shutil.rmtree(temp_images) 410 + for entry in self.cache_dir.iterdir(): 411 + if entry.is_dir(): 412 + entries += 1 413 + for f in entry.rglob("*"): 414 + if f.is_file(): 415 + total_size += f.stat().st_size 327 416 328 - shutil.copytree(image_dir, temp_images) 417 + return { 418 + "entries": entries, 419 + "total_size_mb": round(total_size / (1024 * 1024), 2), 420 + "cache_dir": str(self.cache_dir), 421 + } 329 422 330 - # Remove old images dir and rename temp to final 331 - if final_images.exists(): 332 - shutil.rmtree(final_images) 333 - os.rename(temp_images, final_images) 334 423 335 - finally: 336 - # Cleanup temp files on failure 337 - if temp_md and os.path.exists(temp_md): 338 - os.unlink(temp_md) 339 - if temp_json and os.path.exists(temp_json): 340 - os.unlink(temp_json) 424 + # ============================================================================= 425 + # IMAGE MANAGER 426 + # ============================================================================= 341 427 342 428 343 - def slice_pages_from_markdown(full_md: str, pages: list, total_pages: int) -> str: 344 - """ 345 - Extract specific pages from full markdown. 429 + class ImageManager: 430 + """Manages image extraction and cleanup with proper lifecycle.""" 346 431 347 - Uses explicit <!-- PAGE_BREAK --> sentinels inserted during extraction. 348 - This is more reliable than matching "-----" which could appear in content. 
349 - """ 350 - # Split on explicit page break sentinel 351 - page_pattern = r"\n<!-- PAGE_BREAK -->\n" 352 - parts = re.split(page_pattern, full_md) 432 + def __init__(self): 433 + self._temp_dirs: list[Path] = [] 353 434 354 - if len(parts) <= 1: 355 - # No page separators found (single page or Docling mode) 356 - return full_md 435 + def create_temp_dir(self, pdf_path: str) -> Path: 436 + """Create tracked temp directory for image extraction.""" 437 + pdf_name = Path(pdf_path).stem 438 + safe_name = re.sub(r"[^\w\-_]", "_", pdf_name) 439 + temp_dir = Path(tempfile.mkdtemp(prefix=f"pdf_images_{safe_name}_")) 440 + self._temp_dirs.append(temp_dir) 441 + return temp_dir 357 442 358 - # Convert 0-indexed pages to parts indices 359 - selected_parts = [] 360 - for page_num in pages: 361 - if 0 <= page_num < len(parts): 362 - selected_parts.append(parts[page_num]) 443 + def cleanup(self): 444 + """Clean up all tracked temp directories.""" 445 + for temp_dir in self._temp_dirs: 446 + if temp_dir.exists(): 447 + shutil.rmtree(temp_dir) 448 + self._temp_dirs.clear() 363 449 364 - if not selected_parts: 365 - return full_md 450 + def __enter__(self): 451 + return self 366 452 367 - return "\n<!-- PAGE_BREAK -->\n".join(selected_parts) 453 + def __exit__(self, *args): 454 + self.cleanup() 368 455 456 + def extract_references(self, markdown: str) -> set: 457 + """Extract the set of image filenames referenced in markdown.""" 458 + pattern = r"!\[[^\]]*\]\(([^)]+)\)" 459 + matches = re.findall(pattern, markdown) 460 + return {Path(m).name for m in matches} 369 461 370 - def find_cache_by_source_path( 371 - pdf_path: str, docling: bool = None, images_scale: float = None 372 - ) -> list: 373 - """ 374 - Find cache entries by source path in metadata. 
462 + def get_info(self, image_dir: Path, referenced_only: set = None) -> list: 463 + """Get information about extracted images.""" 464 + if not image_dir: 465 + return [] 375 466 376 - Used as fallback when the source PDF no longer exists (can't compute hash). 467 + image_dir = Path(image_dir) 468 + if not image_dir.exists(): 469 + return [] 377 470 378 - Args: 379 - pdf_path: Path to the PDF file (used to match source_path in metadata) 380 - docling: If specified, filter to only caches matching this mode 381 - images_scale: If specified (and docling=True), filter to matching scale 471 + images = [] 472 + for img_path in sorted(image_dir.glob("*")): 473 + if img_path.suffix.lower() in ( 474 + ".png", 475 + ".jpg", 476 + ".jpeg", 477 + ".gif", 478 + ".bmp", 479 + ".webp", 480 + ): 481 + if referenced_only is not None and img_path.name not in referenced_only: 482 + continue 382 483 383 - Returns: 384 - List of (cache_dir, metadata) tuples sorted by cached_at (freshest first) 385 - """ 386 - if not CACHE_DIR.exists(): 387 - return [] 484 + try: 485 + size_bytes = img_path.stat().st_size 486 + size_kb = size_bytes / 1024 388 487 389 - pdf_path_resolved = str(Path(pdf_path).resolve()) 390 - matching = [] 488 + try: 489 + import pymupdf 391 490 392 - for entry in CACHE_DIR.iterdir(): 393 - if not entry.is_dir(): 394 - continue 395 - metadata_file = entry / "metadata.json" 396 - if not metadata_file.exists(): 397 - continue 398 - try: 399 - with open(metadata_file) as f: 400 - metadata = json.load(f) 491 + pix = pymupdf.Pixmap(str(img_path)) 492 + dimensions = f"{pix.width}x{pix.height}" 493 + pix = None 494 + except Exception: 495 + dimensions = "unknown" 401 496 402 - if metadata.get("source_path") != pdf_path_resolved: 403 - continue 497 + images.append( 498 + { 499 + "filename": img_path.name, 500 + "path": str(img_path), 501 + "size_kb": round(size_kb, 1), 502 + "dimensions": dimensions, 503 + } 504 + ) 505 + except Exception: 506 + pass 404 507 405 - # Filter by 
mode if specified 406 - if docling is not None: 407 - # Determine mode from cache_key pattern in metadata or by checking the key 408 - cache_key = metadata.get("cache_key", entry.name) 409 - # Docling cache keys contain "docling" in the mode component 410 - # We can infer mode from whether the cache was created with docling 411 - # Check if this looks like a docling cache by examining the entry 412 - # Actually, we need to store mode in metadata - let's check if it exists 413 - cached_mode = metadata.get("mode") 414 - if cached_mode is None: 415 - # Legacy cache without mode - try to infer from cache key 416 - # This is imperfect but better than nothing 417 - continue # Skip caches without mode info for filtering 418 - if docling and not cached_mode.startswith("docling"): 419 - continue 420 - if not docling and cached_mode != "fast": 421 - continue 508 + return images 422 509 423 - # Filter by images_scale if docling mode and scale specified 424 - if docling and images_scale is not None: 425 - cached_scale = metadata.get("images_scale") 426 - if cached_scale is not None and cached_scale != images_scale: 427 - continue 510 + def enhance_markdown(self, markdown: str, image_dir: Path) -> str: 511 + """Rewrite image references to point to actual image location.""" 512 + if not image_dir: 513 + return markdown 428 514 429 - matching.append((entry, metadata)) 430 - except (json.JSONDecodeError, OSError): 431 - continue 515 + image_dir = Path(image_dir) 432 516 433 - # Sort by cached_at (freshest first) 434 - matching.sort( 435 - key=lambda x: x[1].get("cached_at", ""), 436 - reverse=True, 437 - ) 517 + def replace_image_ref(match): 518 + alt_text = match.group(1) 519 + filename_raw = match.group(2) 520 + filename = Path(filename_raw).name 521 + full_path = image_dir / filename 438 522 439 - return matching 523 + if full_path.exists(): 524 + try: 525 + size_kb = round(full_path.stat().st_size / 1024, 1) 526 + try: 527 + import pymupdf 440 528 529 + pix = 
pymupdf.Pixmap(str(full_path)) 530 + dims = f"{pix.width}x{pix.height}" 531 + pix = None 532 + except Exception: 533 + dims = "?" 441 534 442 - def clear_cache(pdf_path: str = None): 443 - """Clear cache for specific PDF (all modes and scale variants) or entire cache.""" 444 - if pdf_path: 445 - cleared = False 535 + return f"![{alt_text}]({full_path})\n\n**[Image: {filename} ({dims}, {size_kb}KB)]**" 536 + except Exception: 537 + return f"![{alt_text}]({full_path})\n\n**[Image: {filename}]**" 446 538 447 - # Try to clear fast mode cache by computing key (if file exists) 448 - try: 449 - cache_key = get_cache_key(pdf_path, docling=False) 450 - cache_dir = get_cache_dir(cache_key) 451 - if cache_dir.exists(): 452 - shutil.rmtree(cache_dir) 453 - cleared = True 454 - except (FileNotFoundError, OSError): 455 - pass 539 + return match.group(0) 456 540 457 - # Use source_path lookup to find all variants (docling with any scale, etc.) 458 - # This handles: docling mode with any images_scale, and fast mode if file was moved/deleted 459 - matching_caches = find_cache_by_source_path(pdf_path) 460 - for cache_dir, _metadata in matching_caches: 461 - if cache_dir.exists(): # May have been cleared above 462 - shutil.rmtree(cache_dir) 463 - cleared = True 541 + pattern = r"!\[([^\]]*)\]\(([^)]+)\)" 542 + return re.sub(pattern, replace_image_ref, markdown) 464 543 465 - return cleared 466 - else: 467 - # Clear all cache 468 - if CACHE_DIR.exists(): 469 - shutil.rmtree(CACHE_DIR) 470 - return True 471 - return False 544 + def create_summary(self, images: list) -> str: 545 + """Create a summary section listing all extracted images.""" 546 + if not images: 547 + return "" 472 548 549 + lines = [ 550 + "", 551 + "---", 552 + "", 553 + "## Extracted Images", 554 + "", 555 + "| # | File | Dimensions | Size | Path |", 556 + "|---|------|------------|------|------|", 557 + ] 473 558 474 - def get_cache_stats() -> dict: 475 - """Get statistics about the cache.""" 476 - if not 
CACHE_DIR.exists(): 477 - return {"entries": 0, "total_size_mb": 0, "cache_dir": str(CACHE_DIR)} 559 + for i, img in enumerate(images, 1): 560 + lines.append( 561 + f"| {i} | {img['filename']} | {img['dimensions']} | {img['size_kb']}KB | `{img['path']}` |" 562 + ) 563 + 564 + lines.append("") 565 + return "\n".join(lines) 566 + 567 + def finalize_images( 568 + self, 569 + temp_dir: Path, 570 + cache_dir: Path = None, 571 + output_dir: Path = None, 572 + no_cache: bool = False, 573 + show_progress: bool = False, 574 + ) -> Path | None: 575 + """Finalize image directory after extraction. 478 576 479 - entries = 0 480 - total_size = 0 577 + Handles: 578 + - Copying to output location when --no-cache 579 + - Returning cached location when caching enabled 580 + - Cleaning up temp directories 481 581 482 - for entry in CACHE_DIR.iterdir(): 483 - if entry.is_dir(): 484 - entries += 1 485 - for f in entry.rglob("*"): 486 - if f.is_file(): 487 - total_size += f.stat().st_size 582 + Returns the final image directory to use for output. 
583 + """ 584 + if not temp_dir: 585 + return None 586 + 587 + temp_dir = Path(temp_dir) 588 + if not temp_dir.exists() or not any(temp_dir.iterdir()): 589 + return None 488 590 489 - return { 490 - "entries": entries, 491 - "total_size_mb": round(total_size / (1024 * 1024), 2), 492 - "cache_dir": str(CACHE_DIR), 493 - } 591 + if no_cache: 592 + if output_dir: 593 + # Copy images to output location 594 + output_images_dir = Path(str(output_dir).rsplit(".", 1)[0] + "_images") 595 + if output_images_dir.exists(): 596 + shutil.rmtree(output_images_dir) 597 + shutil.copytree(temp_dir, output_images_dir) 598 + # Clean up temp directory 599 + if temp_dir.exists(): 600 + shutil.rmtree(temp_dir) 601 + # Remove from tracking since we cleaned it up 602 + if temp_dir in self._temp_dirs: 603 + self._temp_dirs.remove(temp_dir) 604 + if show_progress: 605 + print(f"Images copied to: {output_images_dir}", file=sys.stderr) 606 + return output_images_dir 607 + else: 608 + # Outputting to stdout with --no-cache: clean up temp dir 609 + # (--no-cache contract: don't leave any files behind) 610 + print( 611 + "WARNING: --no-cache with stdout: images not available (would require temp files).", 612 + file=sys.stderr, 613 + ) 614 + if temp_dir.exists(): 615 + shutil.rmtree(temp_dir) 616 + if temp_dir in self._temp_dirs: 617 + self._temp_dirs.remove(temp_dir) 618 + return None 619 + else: 620 + # Caching enabled - use cached location if available 621 + if cache_dir: 622 + cached_image_dir = cache_dir / "images" 623 + if cached_image_dir.exists() and any(cached_image_dir.iterdir()): 624 + # Clean up temp directory 625 + if temp_dir.exists(): 626 + shutil.rmtree(temp_dir) 627 + if temp_dir in self._temp_dirs: 628 + self._temp_dirs.remove(temp_dir) 629 + return cached_image_dir 630 + return None 631 + 632 + 633 + # ============================================================================= 634 + # HELPER FUNCTIONS 635 + # 
============================================================================= 636 + 637 + 638 + def slice_pages_from_markdown(full_md: str, pages: list, total_pages: int) -> str: 639 + """Extract specific pages from full markdown. 640 + 641 + Uses explicit <!-- PAGE_BREAK --> sentinels inserted during extraction. 642 + """ 643 + page_pattern = r"\n<!-- PAGE_BREAK -->\n" 644 + parts = re.split(page_pattern, full_md) 645 + 646 + if len(parts) <= 1: 647 + return full_md 648 + 649 + selected_parts = [] 650 + for page_num in pages: 651 + if 0 <= page_num < len(parts): 652 + selected_parts.append(parts[page_num]) 653 + 654 + if not selected_parts: 655 + return full_md 656 + 657 + return "\n<!-- PAGE_BREAK -->\n".join(selected_parts) 494 658 495 659 496 660 # ============================================================================= ··· 620 784 return result 621 785 622 786 623 - def extract_referenced_images(markdown: str) -> set: 624 - """ 625 - Extract the set of image filenames referenced in markdown. 626 - 627 - Returns: 628 - Set of image filenames (without directory path) 629 - """ 630 - # Match markdown image references: ![alt](path) 631 - pattern = r"!\[[^\]]*\]\(([^)]+)\)" 632 - matches = re.findall(pattern, markdown) 633 - # Extract just the filename from each path 634 - return {Path(m).name for m in matches} 635 - 636 - 637 - def get_image_info(image_dir, referenced_only: set = None): 638 - """ 639 - Get information about extracted images. 
640 - 641 - Args: 642 - image_dir: Directory containing images 643 - referenced_only: If provided, only include images with filenames in this set 644 - 645 - Returns: 646 - List of dicts with image metadata 647 - """ 648 - if not image_dir: 649 - return [] 650 - 651 - image_dir = Path(image_dir) 652 - if not image_dir.exists(): 653 - return [] 654 - 655 - images = [] 656 - for img_path in sorted(image_dir.glob("*")): 657 - if img_path.suffix.lower() in ( 658 - ".png", 659 - ".jpg", 660 - ".jpeg", 661 - ".gif", 662 - ".bmp", 663 - ".webp", 664 - ): 665 - # Filter to only referenced images if specified 666 - if referenced_only is not None and img_path.name not in referenced_only: 667 - continue 668 - 669 - try: 670 - # Get file size 671 - size_bytes = img_path.stat().st_size 672 - size_kb = size_bytes / 1024 673 - 674 - # Try to get dimensions using pymupdf 675 - try: 676 - import pymupdf 677 - 678 - pix = pymupdf.Pixmap(str(img_path)) 679 - dimensions = f"{pix.width}x{pix.height}" 680 - pix = None 681 - except: 682 - dimensions = "unknown" 683 - 684 - images.append( 685 - { 686 - "filename": img_path.name, 687 - "path": str(img_path), 688 - "size_kb": round(size_kb, 1), 689 - "dimensions": dimensions, 690 - } 691 - ) 692 - except Exception: 693 - pass 694 - 695 - return images 696 - 697 - 698 - def enhance_markdown_with_image_paths(markdown, image_dir): 699 - """ 700 - Rewrite image references in markdown to point to the actual image location. 701 - 702 - This fixes broken references that point to temp directories by rewriting 703 - the actual ![alt](path) to use the durable image_dir location. 
704 - """ 705 - if not image_dir: 706 - return markdown 707 - 708 - image_dir = Path(image_dir) 709 - 710 - def replace_image_ref(match): 711 - alt_text = match.group(1) 712 - filename_raw = match.group(2) 713 - # Strip any directory components (e.g., "images/figure_0001.png" -> "figure_0001.png") 714 - # This handles Docling's output which includes "images/" prefix 715 - filename = Path(filename_raw).name 716 - full_path = image_dir / filename 717 - 718 - if full_path.exists(): 719 - try: 720 - size_kb = round(full_path.stat().st_size / 1024, 1) 721 - try: 722 - import pymupdf 723 - 724 - pix = pymupdf.Pixmap(str(full_path)) 725 - dims = f"{pix.width}x{pix.height}" 726 - pix = None 727 - except: 728 - dims = "?" 729 - 730 - # FIXED: Rewrite the actual image reference to the correct path 731 - return f"![{alt_text}]({full_path})\n\n**[Image: {filename} ({dims}, {size_kb}KB)]**" 732 - except: 733 - # FIXED: Rewrite even on metadata failure 734 - return f"![{alt_text}]({full_path})\n\n**[Image: {filename}]**" 735 - 736 - return match.group(0) 737 - 738 - pattern = r"!\[([^\]]*)\]\(([^)]+)\)" 739 - return re.sub(pattern, replace_image_ref, markdown) 740 - 741 - 742 - def create_image_summary(images): 743 - """Create a summary section listing all extracted images.""" 744 - if not images: 745 - return "" 746 - 747 - lines = [ 748 - "", 749 - "---", 750 - "", 751 - "## Extracted Images", 752 - "", 753 - "| # | File | Dimensions | Size | Path |", 754 - "|---|------|------------|------|------|", 755 - ] 756 - 757 - for i, img in enumerate(images, 1): 758 - lines.append( 759 - f"| {i} | {img['filename']} | {img['dimensions']} | {img['size_kb']}KB | `{img['path']}` |" 760 - ) 761 - 762 - lines.append("") 763 - return "\n".join(lines) 764 - 765 - 766 787 def convert_pdf( 767 788 pdf_path, 768 789 image_dir=None, ··· 829 850 header_lines.extend(["---", "", ""]) 830 851 831 852 return "\n".join(header_lines) + markdown 832 - 833 - 834 - def setup_temp_image_dir(pdf_path): 835 - 
""" 836 - Create temporary image directory for extraction. 837 - 838 - Uses tempfile.mkdtemp() for: 839 - - Unique directory names (safe for concurrent runs) 840 - - Cross-platform compatibility (works on Windows) 841 - """ 842 - pdf_name = Path(pdf_path).stem 843 - safe_name = re.sub(r"[^\w\-_]", "_", pdf_name) 844 - # Create unique temp directory with prefix based on PDF name 845 - image_dir = tempfile.mkdtemp(prefix=f"pdf_images_{safe_name}_") 846 - return image_dir 847 853 848 854 849 855 # ============================================================================= ··· 933 939 934 940 args = parser.parse_args() 935 941 942 + # Initialize cache manager 943 + cache_mgr = CacheManager() 944 + 936 945 # Handle cache management commands first 937 946 if args.clear_all_cache: 938 - if clear_cache(): 939 - print(f"Cache cleared: {CACHE_DIR}", file=sys.stderr) 947 + if cache_mgr.clear(): 948 + print(f"Cache cleared: {cache_mgr.cache_dir}", file=sys.stderr) 940 949 else: 941 950 print("Cache was already empty.", file=sys.stderr) 942 951 sys.exit(0) 943 952 944 953 if args.cache_stats: 945 - stats = get_cache_stats() 954 + stats = cache_mgr.get_stats() 946 955 print(f"Cache directory: {stats['cache_dir']}", file=sys.stderr) 947 956 print(f"Cached PDFs: {stats['entries']}", file=sys.stderr) 948 957 print(f"Total size: {stats['total_size_mb']} MB", file=sys.stderr) ··· 954 963 955 964 # Handle --clear-cache before existence check (allows clearing cache for deleted PDFs) 956 965 if args.clear_cache: 957 - if clear_cache(args.input): 966 + if cache_mgr.clear(args.input): 958 967 print(f"Cache cleared for: {args.input}", file=sys.stderr) 959 968 else: 960 969 print(f"No cache found for: {args.input}", file=sys.stderr) ··· 971 980 # Try to find cached extraction by source path, filtered by requested mode 972 981 if not args.no_cache: 973 982 # First try to find cache matching the requested mode/scale 974 - matching_caches = find_cache_by_source_path( 983 + matching_caches = 
cache_mgr.find_by_source( 975 984 args.input, 976 985 docling=args.docling, 977 986 images_scale=args.images_scale if args.docling else None, 978 987 ) 979 988 # If no exact match, try any cache for this file 980 989 if not matching_caches: 981 - matching_caches = find_cache_by_source_path(args.input) 990 + matching_caches = cache_mgr.find_by_source(args.input) 982 991 983 992 if matching_caches: 984 993 # Use the freshest matching cache (already sorted by cached_at desc) ··· 1054 1063 cache_hit = True # Will load below after page range parsing 1055 1064 elif not args.no_cache: 1056 1065 # Check if we have a valid cache for this PDF 1057 - valid, cache_key = is_cache_valid( 1058 - args.input, 1066 + config = ExtractionConfig( 1067 + pdf_path=args.input, 1059 1068 docling=args.docling, 1060 1069 images_scale=args.images_scale, 1061 1070 no_images=args.no_images, 1062 1071 ) 1072 + valid, cache_key = cache_mgr.is_valid(config) 1063 1073 if valid: 1064 1074 # Get total_pages from cache metadata (doesn't load full content) 1065 - total_pages = get_cached_total_pages(cache_key) 1075 + total_pages = cache_mgr._get_cached_total_pages(cache_key) 1066 1076 if total_pages > 0: 1067 1077 cache_hit = True # Will load below after page range parsing 1068 1078 ··· 1096 1106 if show_progress: 1097 1107 mode = "docling" if args.docling else "fast" 1098 1108 print(f"Loading from cache ({mode} mode)...", file=sys.stderr) 1099 - result, image_dir, cached_total = load_from_cache( 1109 + cache_result = cache_mgr.load( 1100 1110 cache_key, requested_pages, no_images=args.no_images 1101 1111 ) 1102 - if result is None: 1112 + if cache_result is None: 1103 1113 if cache_fallback: 1104 1114 # Cache was corrupted and PDF doesn't exist - can't recover 1105 1115 print( ··· 1113 1123 # Need to check dependencies now since we're going to extract 1114 1124 if not check_dependencies(docling_mode=args.docling): 1115 1125 sys.exit(1) 1126 + else: 1127 + result = cache_result.markdown 1128 + 
image_dir = cache_result.image_dir 1129 + 1130 + # Determine output path early (needed for image handling) 1131 + output_path = None 1132 + if not args.stdout: 1133 + output_path = args.output or os.path.splitext(args.input)[0] + ".md" 1134 + 1135 + # Use ImageManager for extraction with automatic cleanup 1136 + img_mgr = ImageManager() 1116 1137 1117 1138 # If no cache hit, extract full PDF 1118 1139 if not cache_hit: 1119 - # Get cache key if we don't have it 1120 - if not cache_key: 1121 - cache_key = get_cache_key( 1122 - args.input, 1140 + # Build config if we don't have it 1141 + if "config" not in dir() or config is None: 1142 + config = ExtractionConfig( 1143 + pdf_path=args.input, 1123 1144 docling=args.docling, 1124 1145 images_scale=args.images_scale, 1125 1146 no_images=args.no_images, 1126 1147 ) 1127 1148 1149 + # Get cache key if we don't have it 1150 + if not cache_key: 1151 + cache_key = cache_mgr.get_key(config) 1152 + 1128 1153 # Setup image directory for extraction (temporary) 1129 1154 temp_image_dir = None 1130 1155 if not args.no_images: 1131 - temp_image_dir = setup_temp_image_dir(args.input) 1156 + temp_image_dir = img_mgr.create_temp_dir(args.input) 1132 1157 1133 1158 # Extract FULL PDF 1134 1159 try: ··· 1153 1178 images_scale=args.images_scale, 1154 1179 ) 1155 1180 except Exception as e: 1181 + img_mgr.cleanup() # Clean up on error 1156 1182 print(f"ERROR: Conversion failed: {e}", file=sys.stderr) 1157 1183 sys.exit(1) 1158 1184 1159 1185 # Save to cache (full result) 1160 1186 if not args.no_cache: 1161 - save_to_cache( 1162 - cache_key, 1163 - result, 1164 - temp_image_dir, 1165 - args.input, 1166 - total_pages, 1167 - docling=args.docling, 1168 - images_scale=args.images_scale, 1169 - no_images=args.no_images, 1187 + extraction_result = ExtractionResult( 1188 + markdown=result, 1189 + image_dir=temp_image_dir, 1190 + total_pages=total_pages, 1170 1191 ) 1192 + cache_mgr.save(cache_key, extraction_result, config) 1171 1193 if 
show_progress: 1172 - print(f"Cached: {get_cache_dir(cache_key)}", file=sys.stderr) 1194 + print(f"Cached: {cache_mgr._get_dir(cache_key)}", file=sys.stderr) 1173 1195 1174 - # Set image_dir for output 1175 - # When --no-cache is set, always use temp_image_dir (don't reference stale cache) 1176 - # Otherwise, prefer cached location if available 1196 + # Finalize image directory 1177 1197 if not args.no_images: 1178 - if args.no_cache: 1179 - image_dir = temp_image_dir 1180 - else: 1181 - cached_image_dir = get_cache_dir(cache_key) / "images" 1182 - if cached_image_dir.exists() and any(cached_image_dir.iterdir()): 1183 - image_dir = cached_image_dir 1184 - else: 1185 - # No images found (text-only PDF), don't set image_dir 1186 - image_dir = None 1187 - 1188 - # Always clean up temp directory when caching is enabled 1189 - if temp_image_dir and os.path.exists(temp_image_dir): 1190 - shutil.rmtree(temp_image_dir) 1198 + image_dir = img_mgr.finalize_images( 1199 + temp_dir=temp_image_dir, 1200 + cache_dir=cache_mgr._get_dir(cache_key) if not args.no_cache else None, 1201 + output_dir=output_path, 1202 + no_cache=args.no_cache, 1203 + show_progress=show_progress, 1204 + ) 1191 1205 1192 1206 # Slice pages if requested (after caching full result) 1193 1207 if requested_pages: 1194 1208 result = slice_pages_from_markdown(result, requested_pages, total_pages) 1195 1209 1196 - # Determine output path early (needed for --no-cache image handling) 1197 - output_path = None 1198 - if not args.stdout: 1199 - output_path = args.output or os.path.splitext(args.input)[0] + ".md" 1200 - 1201 - # Handle --no-cache image directory: copy to output location or warn 1202 - if args.no_cache and image_dir and not args.no_images: 1203 - if output_path: 1204 - # Copy images to a directory next to the output file (e.g., doc_images/) 1205 - output_images_dir = Path(str(output_path).rsplit(".", 1)[0] + "_images") 1206 - if Path(image_dir).exists() and any(Path(image_dir).iterdir()): 1207 - if 
output_images_dir.exists(): 1208 - shutil.rmtree(output_images_dir) 1209 - shutil.copytree(image_dir, output_images_dir) 1210 - # Clean up temp directory 1211 - if os.path.exists(image_dir): 1212 - shutil.rmtree(image_dir) 1213 - image_dir = output_images_dir 1214 - if show_progress: 1215 - print(f"Images copied to: {output_images_dir}", file=sys.stderr) 1216 - else: 1217 - # Outputting to stdout with --no-cache: warn about ephemeral paths 1218 - print( 1219 - f"WARNING: --no-cache with stdout: image paths reference temporary directory {image_dir} which may be cleaned up by the system.", 1220 - file=sys.stderr, 1221 - ) 1222 - 1223 1210 # Format output 1224 1211 output = result 1225 1212 1226 1213 # Extract referenced images before enhancement (for filtering summary) 1227 - # This ensures we only show images that are actually in the sliced output 1228 - referenced_images = extract_referenced_images(result) if result else set() 1214 + referenced_images = img_mgr.extract_references(result) if result else set() 1229 1215 1230 1216 # Enhance image references with full paths (skip if --no-images) 1231 1217 if image_dir and not args.no_images: 1232 - output = enhance_markdown_with_image_paths(output, image_dir) 1218 + output = img_mgr.enhance_markdown(output, image_dir) 1233 1219 1234 1220 # Add image summary table at the end (filtered to referenced images only) 1235 - images = get_image_info(image_dir, referenced_only=referenced_images) 1221 + images = img_mgr.get_info(image_dir, referenced_only=referenced_images) 1236 1222 if images: 1237 - output += create_image_summary(images) 1223 + output += img_mgr.create_summary(images) 1238 1224 1239 1225 if not args.no_metadata: 1240 1226 output = add_metadata_header( ··· 1258 1244 msg += " (from cache)" 1259 1245 if image_dir and not args.no_images: 1260 1246 # Use the same filtered image set for consistency 1261 - images = get_image_info(image_dir, referenced_only=referenced_images) 1247 + images = img_mgr.get_info(image_dir, 
referenced_only=referenced_images) 1262 1248 if images: 1263 1249 msg += f" ({len(images)} images)" 1264 1250 print(msg, file=sys.stderr)