Fix code review findings: cache clearing, missing images, and portable paths · alice.mosphere.at/claude-skill-pdf-to-markdown@9299034

+98 -39

2 changed files

expand all

SKILL.md

scripts

pdf_to_md.py

+12 -11

SKILL.md

··· 12 12 - Lists (ordered and unordered) 13 13 - Multi-column layouts (correct reading order) 14 14 - Code blocks 15 - - **Images** (always extracted to cache with paths in output) 15 + - **Images** (extracted and copied next to output with relative paths) 16 16 17 17 ## When to Use This Skill 18 18 ··· 54 54 # Convert PDF to markdown (always extracts images) 55 55 ~/.claude/skills/pdf-to-markdown/.venv/bin/python ~/.claude/skills/pdf-to-markdown/scripts/pdf_to_md.py document.pdf 56 56 57 - # Output: document.md + images in cache 57 + # Output: document.md + images/ folder (next to the .md file) 58 58 ``` 59 59 60 60 ## Standard Workflow ··· 112 112 ## Image Handling 113 113 114 114 Images are always extracted. They are: 115 - 1. **Extracted** to cache directory `~/.cache/pdf-to-markdown/<cache_key>/images/` 116 - 2. **Referenced** in the markdown with full paths 117 - 3. **Summarized** in a table at the end of the document 115 + 1. **Cached** in `~/.cache/pdf-to-markdown/<cache_key>/images/` 116 + 2. **Copied** to `images/` folder next to the output `.md` file 117 + 3. **Referenced** in the markdown with relative paths (`images/filename.png`) 118 + 4. **Summarized** in a table at the end of the document 118 119 119 120 ### Auto-View Behavior for Images 120 121 ··· 143 144 total_pages: 42 144 145 extracted_at: 2025-01-15T10:30:00 145 146 from_cache: true 146 - images_dir: /Users/.../.cache/pdf-to-markdown/abc123/images 147 + images_dir: images 147 148 --- 148 149 ``` 149 150 ··· 155 156 156 157 Regular paragraph text with **bold**, *italic*, and `code` formatting. 157 158 158 - ![Figure 1](/Users/.../.cache/pdf-to-markdown/abc123/images/figure_1.png) 159 + ![Figure 1](images/figure_1.png) 159 160 160 161 **[Image: figure_1.png (800x600, 45.2KB)]** 161 162 ··· 170 171 171 172 ## Extracted Images 172 173 173 - | # | File | Dimensions | Size | Path | 174 - |---|------|------------|------|------| 175 - | 1 | figure_1.png | 800x600 | 45.2KB | `~/.cache/.../images/figure_1.png` | 176 - | 2 | chart_2.png | 1200x800 | 89.1KB | `~/.cache/.../images/chart_2.png` | 174 + | # | File | Dimensions | Size | 175 + |---|------|------------|------| 176 + | 1 | figure_1.png | 800x600 | 45.2KB | 177 + | 2 | chart_2.png | 1200x800 | 89.1KB | 177 178 ``` 178 179 179 180 ## Script Reference

+86 -28

scripts/pdf_to_md.py

··· 158 158 pass 159 159 return None 160 160 161 + # Check if markdown references images 162 + has_image_refs = bool(re.search(r"!\[[^\]]*\]\([^)]+\)", full_md)) 163 + 161 164 # Get cached images directory 162 - image_dir = None 163 165 cached_image_dir = cache_dir / "images" 164 - if cached_image_dir.exists() and any(cached_image_dir.iterdir()): 165 - image_dir = cached_image_dir 166 + has_images = cached_image_dir.exists() and any(cached_image_dir.iterdir()) 167 + 168 + # If markdown expects images but they're missing, invalidate cache 169 + if has_image_refs and not has_images: 170 + print( 171 + "WARNING: Cache missing images, regenerating...", 172 + file=sys.stderr, 173 + ) 174 + try: 175 + shutil.rmtree(cache_dir) 176 + except OSError: 177 + pass 178 + return None 179 + 180 + image_dir = cached_image_dir if has_images else None 166 181 167 182 return ExtractionResult( 168 183 markdown=full_md, ··· 260 275 os.unlink(temp_json) 261 276 262 277 def clear(self, pdf_path: str = None) -> bool: 263 - """Clear cache for specific PDF or entire cache.""" 278 + """Clear cache for specific PDF (both fast and docling modes) or entire cache.""" 264 279 if pdf_path: 265 - try: 266 - config = ExtractionConfig(pdf_path=pdf_path) 267 - cache_key = self.get_key(config) 268 - cache_dir = self._get_dir(cache_key) 269 - if cache_dir.exists(): 270 - shutil.rmtree(cache_dir) 271 - return True 272 - except (FileNotFoundError, OSError): 273 - pass 274 - return False 280 + # Clear BOTH fast and docling caches for this PDF 281 + cleared = False 282 + for docling_mode in [False, True]: 283 + try: 284 + config = ExtractionConfig(pdf_path=pdf_path, docling=docling_mode) 285 + cache_key = self.get_key(config) 286 + cache_dir = self._get_dir(cache_key) 287 + if cache_dir.exists(): 288 + shutil.rmtree(cache_dir) 289 + cleared = True 290 + except (FileNotFoundError, OSError): 291 + pass 292 + return cleared 275 293 else: 276 294 if self.cache_dir.exists(): 277 295 shutil.rmtree(self.cache_dir) ··· 369 387 return images 370 388 371 389 def enhance_markdown(self, markdown: str, image_dir: Path) -> str: 372 - """Rewrite image references to point to actual image location.""" 390 + """Rewrite image references to use relative paths (portable, Windows-safe).""" 373 391 if not image_dir: 374 392 return markdown 375 393 ··· 381 399 filename = Path(filename_raw).name 382 400 full_path = image_dir / filename 383 401 402 + # Use relative path for portability (POSIX format for Windows compatibility) 403 + relative_path = Path("images") / filename 404 + 384 405 if full_path.exists(): 385 406 try: 386 407 size_kb = round(full_path.stat().st_size / 1024, 1) ··· 392 413 except Exception: 393 414 dims = "?" 394 415 395 - return f"![{alt_text}]({full_path})\n\n**[Image: {filename} ({dims}, {size_kb}KB)]**" 416 + return f"![{alt_text}]({relative_path.as_posix()})\n\n**[Image: {filename} ({dims}, {size_kb}KB)]**" 396 417 except Exception: 397 - return f"![{alt_text}]({full_path})\n\n**[Image: {filename}]**" 418 + return f"![{alt_text}]({relative_path.as_posix()})\n\n**[Image: {filename}]**" 398 419 399 420 return match.group(0) 400 421 ··· 412 433 "", 413 434 "## Extracted Images", 414 435 "", 415 - "| # | File | Dimensions | Size | Path |", 416 - "|---|------|------------|------|------|", 436 + "| # | File | Dimensions | Size |", 437 + "|---|------|------------|------|", 417 438 ] 418 439 419 440 for i, img in enumerate(images, 1): 420 441 lines.append( 421 - f"| {i} | {img['filename']} | {img['dimensions']} | {img['size_kb']}KB | `{img['path']}` |" 442 + f"| {i} | {img['filename']} | {img['dimensions']} | {img['size_kb']}KB |" 422 443 ) 423 444 424 445 lines.append("") 425 446 return "\n".join(lines) 426 447 427 448 def finalize_images( 428 - self, temp_dir: Path, cache_dir: Path, output_dir: Path, show_progress: bool = False 449 + self, temp_dir: Path, cache_dir: Path, output_path: Path, show_progress: bool = False 429 450 ) -> Path | None: 430 451 """Finalize image directory after extraction. 431 452 432 - Copies images from cache to output location. 453 + Copies images from cache to output location (next to the markdown file). 433 454 Cleans up temp directories. 434 455 435 - Returns the final image directory to use for output. 456 + Returns the final image directory (next to output) for reference. 436 457 """ 437 458 if not temp_dir: 438 459 return None ··· 453 474 if temp_dir in self._temp_dirs: 454 475 self._temp_dirs.remove(temp_dir) 455 476 456 - # Use cached images 477 + # Copy images from cache to output location 457 478 if cache_dir: 458 479 cached_image_dir = cache_dir / "images" 459 480 if cached_image_dir.exists() and any(cached_image_dir.iterdir()): 460 - return cached_image_dir 481 + return self._copy_images_to_output(cached_image_dir, output_path, show_progress) 461 482 462 483 return None 463 484 485 + def _copy_images_to_output( 486 + self, source_dir: Path, output_path: Path, show_progress: bool = False 487 + ) -> Path | None: 488 + """Copy images from cache to output location (next to markdown file).""" 489 + output_path = Path(output_path) 490 + 491 + # Determine output images directory (sibling to markdown file) 492 + if output_path.suffix: # It's a file path like "output.md" 493 + output_images_dir = output_path.parent / "images" 494 + else: # It's a directory 495 + output_images_dir = output_path / "images" 496 + 497 + # Don't copy if already at output location 498 + if output_images_dir.resolve() == Path(source_dir).resolve(): 499 + return output_images_dir 500 + 501 + # Copy images to output location 502 + output_images_dir.mkdir(parents=True, exist_ok=True) 503 + copied_count = 0 504 + for img in source_dir.iterdir(): 505 + if img.is_file(): 506 + shutil.copy2(img, output_images_dir / img.name) 507 + copied_count += 1 508 + 509 + if show_progress and copied_count > 0: 510 + print(f"Copied {copied_count} images to: {output_images_dir}", file=sys.stderr) 511 + 512 + return output_images_dir 513 + 464 514 465 515 # ============================================================================= 466 516 # PDF PROCESSING ··· 542 592 header_lines.append("from_cache: true") 543 593 544 594 if image_dir: 545 - header_lines.append(f"images_dir: {image_dir}") 595 + # Use relative path for portability 596 + header_lines.append("images_dir: images") 546 597 547 598 header_lines.extend(["---", "", ""]) 548 599 ··· 653 704 cache_result = cache_mgr.load(cache_key) 654 705 if cache_result: 655 706 result = cache_result.markdown 656 - image_dir = cache_result.image_dir 657 707 total_pages = cache_result.total_pages 658 708 cache_hit = True 709 + 710 + # Copy images from cache to output location 711 + if cache_result.image_dir: 712 + output_path = args.output or os.path.splitext(args.input)[0] + ".md" 713 + img_mgr = ImageManager() 714 + image_dir = img_mgr._copy_images_to_output( 715 + cache_result.image_dir, output_path, show_progress 716 + ) 659 717 660 718 # Extract if no cache hit 661 719 if not cache_hit: ··· 711 769 image_dir = img_mgr.finalize_images( 712 770 temp_dir=temp_image_dir, 713 771 cache_dir=cache_mgr._get_dir(cache_key), 714 - output_dir=output_path, 772 + output_path=output_path, 715 773 show_progress=show_progress, 716 774 ) 717 775

Configure Feed

Configure Feed