feat: implement PDS endpoint resolution for web DIDs and enhance media cache file tracking

+138 -30

2 changed files

expand all

scrapers

blueskyScraper.js

utils

mediaCache.js

+88 -30

scrapers/blueskyScraper.js

··· 30 30 } 31 31 32 32 /** 33 + * Resolve the PDS service endpoint for a web DID 34 + * @param {string} did - The web DID (e.g., did:web:didd.uk) 35 + * @returns {Promise<string>} - The PDS service endpoint URL 36 + */ 37 + async resolveWebDidPdsEndpoint(did) { 38 + if (!did.startsWith('did:web:')) { 39 + throw new Error('Not a web DID'); 40 + } 41 + 42 + try { 43 + // Extract domain from web DID (e.g., did:web:didd.uk -> didd.uk) 44 + const webDomain = did.replace('did:web:', '').replace(/:/g, '/'); 45 + const didDocUrl = `https://${webDomain}/.well-known/did.json`; 46 + 47 + console.log(`Fetching DID document from: ${didDocUrl}`); 48 + const response = await axios.get(didDocUrl, { 49 + headers: { 'User-Agent': 'Mozilla/5.0 Stagehand/1.1.0' }, 50 + timeout: 10000 51 + }); 52 + 53 + const didDoc = response.data; 54 + 55 + // Look for the PDS service endpoint in the DID document 56 + // It should have type "AtprotoPersonalDataServer" 57 + const pdsService = didDoc.service?.find(s => 58 + s.type === 'AtprotoPersonalDataServer' || 59 + s.id === '#atproto_pds' 60 + ); 61 + 62 + if (pdsService && pdsService.serviceEndpoint) { 63 + console.log(`Resolved PDS endpoint: ${pdsService.serviceEndpoint}`); 64 + return pdsService.serviceEndpoint; 65 + } 66 + 67 + // Fallback to the domain itself if no service endpoint found 68 + const fallbackEndpoint = `https://${webDomain.split('/')[0]}`; 69 + console.log(`No PDS service in DID document, using fallback: ${fallbackEndpoint}`); 70 + return fallbackEndpoint; 71 + } catch (error) { 72 + console.error(`Failed to resolve web DID PDS endpoint: ${error.message}`); 73 + // Fallback to direct domain 74 + const webDomain = did.replace('did:web:', '').split(':')[0]; 75 + return `https://${webDomain}`; 76 + } 77 + } 78 + 79 + /** 33 80 * Parse a Bluesky URL to extract handle/DID and rkey (post ID) 34 81 * @param {string} url - Bluesky URL 35 82 * @returns {{repo: string, rkey: string}} - Extracted repo (handle/DID) and rkey ··· 548 595 const thumbUrl = `https://video.bsky.app/watch/${did}/${cid}/thumbnail.jpg`; 549 596 const thumbnailProcessed = await mediaCache.processMediaUrl(thumbUrl, false, thumbUrl); 550 597 598 + // Determine the correct PDS endpoint for web DIDs 599 + let pdsEndpoint = this.serviceEndpoint; 600 + if (did.startsWith('did:web:')) { 601 + console.log(`Detected web DID: ${did}`); 602 + // Resolve the actual PDS endpoint from the DID document 603 + pdsEndpoint = await this.resolveWebDidPdsEndpoint(did); 604 + console.log(`Using resolved PDS endpoint: ${pdsEndpoint}`); 605 + } 606 + 551 607 // Direct blob access is most reliable for video 552 - const videoBlobUrl = `${this.serviceEndpoint}/xrpc/com.atproto.sync.getBlob?did=${did}&cid=${cid}`; 608 + const videoBlobUrl = `${pdsEndpoint}/xrpc/com.atproto.sync.getBlob?did=${encodeURIComponent(did)}&cid=${cid}`; 609 + console.log(`Constructed blob URL: ${videoBlobUrl}`); 553 610 let videoProcessed = null; 554 - let sourceVideoUrl = `https://video.bsky.app/watch/${did}/${cid}/video.mp4`; 611 + let sourceVideoUrl = videoBlobUrl; 555 612 556 613 try { 557 614 console.log(`Downloading video from blob URL: ${videoBlobUrl}`); ··· 575 632 console.error(`Failed to download video from blob URL: ${blobError.message}`); 576 633 577 634 // If direct blob access fails, try alternative URLs 578 - const videoUrls = [ 579 - `https://video.bsky.app/watch/${did}/${cid}/video.mp4`, 580 - `https://video.bsky.app/watch/${did}/${cid}/480.mp4`, 581 - `https://video.bsky.app/watch/${did}/${cid}/720.mp4`, 582 - `https://video.bsky.app/watch/${did}/${cid}/1080.mp4`, 583 - `https://video.bsky.app/watch/${did}/${cid}/${cid}.mp4` 584 - ]; 635 + // Note: video.bsky.app URLs only work for standard Bluesky DIDs, not web DIDs 636 + const isWebDid = did.startsWith('did:web:'); 585 637 586 - for (const videoUrl of videoUrls) { 587 - try { 588 - console.log(`Trying alternative video URL: ${videoUrl}`); 589 - videoProcessed = await mediaCache.processMediaUrl(videoUrl, true, videoUrl); 590 - console.log(`Successfully downloaded video from: ${videoUrl}`); 591 - sourceVideoUrl = videoUrl; 592 - break; 593 - } catch (e) { 594 - console.log(`Failed with URL ${videoUrl}: ${e.message}`); 638 + if (!isWebDid) { 639 + const videoUrls = [ 640 + `https://video.bsky.app/watch/${did}/${cid}/video.mp4`, 641 + `https://video.bsky.app/watch/${did}/${cid}/480.mp4`, 642 + `https://video.bsky.app/watch/${did}/${cid}/720.mp4`, 643 + `https://video.bsky.app/watch/${did}/${cid}/1080.mp4`, 644 + `https://video.bsky.app/watch/${did}/${cid}/${cid}.mp4` 645 + ]; 646 + 647 + for (const videoUrl of videoUrls) { 648 + try { 649 + console.log(`Trying alternative video URL: ${videoUrl}`); 650 + videoProcessed = await mediaCache.processMediaUrl(videoUrl, true, videoUrl); 651 + console.log(`Successfully downloaded video from: ${videoUrl}`); 652 + sourceVideoUrl = videoUrl; 653 + break; 654 + } catch (e) { 655 + console.log(`Failed with URL ${videoUrl}: ${e.message}`); 656 + } 595 657 } 658 + } else { 659 + console.log(`Web DID detected - video.bsky.app URLs not available, blob API is the only option`); 596 660 } 597 661 598 662 if (videoProcessed) { ··· 612 676 } 613 677 } 614 678 615 - // If all video attempts fail, fall back to thumbnail only 616 - console.warn('Could not download Bluesky video, using thumbnail only'); 617 - return { 618 - imageUrl: thumbnailProcessed.localPath, 619 - imageUrls: [thumbnailProcessed.localPath], // Single thumbnail 620 - sourceUrl: url, 621 - title: `Bluesky Video by ${displayName}`, 622 - siteName: 'Bluesky', 623 - isVideo: false, 624 - originalImageUrl: thumbUrl, 625 - sourceImgUrl: thumbUrl // Add the new sourceImgUrl field 626 - }; 679 + // If all video attempts fail, throw an error instead of falling back to thumbnail 680 + const errorMsg = did.startsWith('did:web:') 681 + ? `Failed to download video from web DID (${did}). The blob API at the custom PDS server is not accessible or returned an error.` 682 + : `Failed to download Bluesky video from all attempted sources (blob API and CDN URLs).`; 683 + console.error(errorMsg); 684 + throw new Error(errorMsg); 627 685 } catch (error) { 628 686 console.error('Error processing Bluesky video:', error); 629 687 throw new Error(`Could not process Bluesky video: ${error.message}`);

+50

utils/mediaCache.js

··· 70 70 } 71 71 72 72 /** 73 + * Get a set of all file paths currently in the queue 74 + */ 75 + async getQueuedFiles() { 76 + try { 77 + const queueFile = config.queueFilePath || path.join(__dirname, '..', 'queue', 'queue.json'); 78 + 79 + if (await fs.pathExists(queueFile)) { 80 + const queueData = await fs.readJson(queueFile); 81 + const filesInUse = new Set(); 82 + 83 + if (queueData.queue && Array.isArray(queueData.queue)) { 84 + for (const item of queueData.queue) { 85 + // Add all file paths from queue items 86 + if (item.imageUrl) filesInUse.add(path.resolve(item.imageUrl)); 87 + if (item.videoUrl) filesInUse.add(path.resolve(item.videoUrl)); 88 + 89 + // Handle multiple images 90 + if (item.imageUrls && Array.isArray(item.imageUrls)) { 91 + item.imageUrls.forEach(url => filesInUse.add(path.resolve(url))); 92 + } 93 + } 94 + } 95 + 96 + console.log(`Found ${filesInUse.size} files currently in queue`); 97 + return filesInUse; 98 + } 99 + } catch (error) { 100 + console.error('Error reading queue file for cleanup:', error); 101 + } 102 + 103 + return new Set(); 104 + } 105 + 106 + /** 73 107 * Clean up old cache files 74 108 */ 75 109 async cleanupCache() { ··· 77 111 const now = Date.now(); 78 112 const maxAgeMs = this.maxCacheAgeDays * 24 * 60 * 60 * 1000; 79 113 114 + // Get files that are currently in the queue 115 + const queuedFiles = await this.getQueuedFiles(); 116 + 80 117 // Helper function to clean a specific directory 81 118 const cleanDir = async (dir) => { 82 119 try { 83 120 const files = await fs.readdir(dir); 121 + let skippedCount = 0; 84 122 85 123 for (const file of files) { 86 124 const filePath = path.join(dir, file); 125 + const resolvedPath = path.resolve(filePath); 126 + 127 + // Skip files that are in the queue 128 + if (queuedFiles.has(resolvedPath)) { 129 + skippedCount++; 130 + continue; 131 + } 132 + 87 133 const stats = await fs.stat(filePath); 88 134 89 135 // Check if file is older than max cache age ··· 91 137 await fs.remove(filePath); 92 138 console.log(`Removed old cache file: ${file}`); 93 139 } 140 + } 141 + 142 + if (skippedCount > 0) { 143 + console.log(`Skipped ${skippedCount} files in queue from ${path.basename(dir)}`); 94 144 } 95 145 } catch (error) { 96 146 console.error(`Error cleaning directory ${dir}:`, error);

Configure Feed

Configure Feed