Add ticket to avoid duplicate asset queuing · jauntywk.bsky.social/dexport@6052560

+1

.beads/issues.jsonl

··· 1 + {"id":"de-duplicate-asset-queue","title":"Avoid queueing an asset that has already been queued once","description":"The system should track which assets have been queued and avoid queueing the same asset multiple times.","status":"open","priority":2,"issue_type":"bug","created_at":"2026-02-10T13:19:57.21449169-05:00","updated_at":"2026-02-10T13:19:57.21449169-05:00"} 1 2 {"id":"de-optional-output-smart-url-naming","title":"Make output directory optional with smart URL-based naming","description":"Currently, output directory is required. We should make it optional and support a special value (e.g., `true` or `auto`) that uses the URL to determine the output directory name.\n\nThe `getDefaultOutput()` function already implements smart directory naming:\n- `https://deepwiki.com/open-telemetry/otel-arrow` → `otel-arrow`\n- `https://deepwiki.com/` → `deepwiki.com`\n\n## Requirements\n\n1. Make `--output` argument optional (currently required)\n2. Support special value `--output true` or `--output auto` that enables smart naming\n3. When multiple URLs are provided with smart naming enabled, each URL should get its own subdirectory:\n - `node cli.ts https://example.com/foo https://example.com/bar --output true`\n - Creates: `./foo/` and `./bar/` in current working directory\n\n## Open Questions\n\n- How should this work when mixing custom output dir with smart naming?\n - Should `--output ./output true` be invalid?\n - Or should it create `./output/foo/`?\n- What about hash fragments in URLs?\n - Should `https://example.com/page#section` use `page` or `page#section`?","status":"open","priority":2,"issue_type":"task","owner":"rektide+git@voodoowarez.com","created_at":"2026-02-09T20:07:27.284140403-05:00","created_by":"rektide de la faye","updated_at":"2026-02-09T20:07:27.284140403-05:00"}

+6 -308

src/cli.ts

··· 1 1 #!/usr/bin/env node 2 2 import { cli, define } from "gunshi"; 3 - import { HTMLRewriter } from "html-rewriter-wasm"; 4 - import { access, mkdir, stat, writeFile } from "node:fs/promises"; 5 - import { dirname, join, resolve } from "node:path"; 6 - 7 - type OverwriteMode = "ignore" | "cache" | "update"; 8 - 9 - type LinkPredicate = (url: URL, baseUrl: URL) => boolean; 10 - 11 - interface PageStats { 12 - url: string; 13 - path: string; 14 - size: number; 15 - links: number; 16 - assets: number; 17 - cached: boolean; 18 - } 19 - 20 - function originPredicate(baseUrl: URL): LinkPredicate { 21 - return (url) => url.origin === baseUrl.origin; 22 - } 23 - 24 - function subtreePredicate(baseUrl: URL): LinkPredicate { 25 - return (url) => url.href.startsWith(baseUrl.href); 26 - } 27 - 28 - function ensureUrl(urlString: string): URL { 29 - if (!urlString.includes("://")) { 30 - urlString = "https://" + urlString; 31 - } 32 - return new URL(urlString); 33 - } 34 - 35 - function getDefaultOutput(url: URL): string { 36 - const pathname = url.pathname; 37 - const cleanedPath = pathname.endsWith("/") ? pathname.slice(0, -1) : pathname; 38 - const parts = cleanedPath.split("/").filter(Boolean); 39 - return parts.length > 0 ? parts[parts.length - 1] : url.hostname; 40 - } 41 - 42 - function applyReplaceHost(url: URL, newHost: string): URL { 43 - const newUrl = new URL(url.href); 44 - newUrl.hostname = newHost; 45 - return newUrl; 46 - } 47 - 48 - async function logFileStats(stats: PageStats) { 49 - console.log(JSON.stringify(stats)); 50 - } 51 - 52 - async function downloadAsset( 53 - url: URL, 54 - outputDir: string, 55 - overwrite: OverwriteMode, 56 - stripHost: boolean, 57 - replacementHost?: string, 58 - ): Promise<boolean> { 59 - const assetOutputPath = stripHost 60 - ? 
join(outputDir, url.pathname) 61 - : join(outputDir, url.hostname, url.pathname); 62 - 63 - try { 64 - await access(assetOutputPath); 65 - 66 - if (overwrite === "ignore") { 67 - const assetStats = await stat(assetOutputPath); 68 - logFileStats({ 69 - url: url.href, 70 - path: assetOutputPath, 71 - size: assetStats.size, 72 - links: 0, 73 - assets: 0, 74 - cached: true, 75 - }); 76 - return true; 77 - } 78 - 79 - if (overwrite === "cache") { 80 - const assetStats = await stat(assetOutputPath); 81 - logFileStats({ 82 - url: url.href, 83 - path: assetOutputPath, 84 - size: assetStats.size, 85 - links: 0, 86 - assets: 0, 87 - cached: true, 88 - }); 89 - return true; 90 - } 91 - } catch {} 92 - 93 - const fetchUrl = replacementHost ? applyReplaceHost(url, replacementHost) : url; 94 - const response = await fetch(fetchUrl.href); 95 - if (!response.ok) { 96 - return false; 97 - } 98 - 99 - const assetData = await response.arrayBuffer(); 100 - await mkdir(dirname(assetOutputPath), { recursive: true }); 101 - await writeFile(assetOutputPath, Buffer.from(assetData)); 102 - 103 - logFileStats({ 104 - url: url.href, 105 - path: assetOutputPath, 106 - size: assetData.byteLength, 107 - links: 0, 108 - assets: 0, 109 - cached: false, 110 - }); 111 - 112 - return true; 113 - } 114 - 115 - async function scrapePage( 116 - url: URL, 117 - outputDir: string, 118 - predicate: LinkPredicate, 119 - queue: Set<string>, 120 - processed: Set<string>, 121 - overwrite: OverwriteMode, 122 - stripHost: boolean, 123 - replacementHost?: string, 124 - ): Promise<void> { 125 - const urlWithoutHash = url.href.split("#")[0]; 126 - if (processed.has(urlWithoutHash)) return; 127 - 128 - processed.add(urlWithoutHash); 129 - 130 - const response = await fetch(url.href); 131 - if (!response.ok) { 132 - return; 133 - } 134 - 135 - const html = await response.text(); 136 - 137 - const links = new Set<string>(); 138 - const assets = new Set<string>(); 139 - 140 - const rewriter = new HTMLRewriter((_chunk) 
=> {}); 141 - 142 - rewriter.on("a[href]", { 143 - element(element) { 144 - const href = element.getAttribute("href"); 145 - if (!href) return; 146 - 147 - try { 148 - let linkUrl = new URL(href, url); 149 - if (replacementHost) { 150 - linkUrl = applyReplaceHost(linkUrl, replacementHost); 151 - } 152 - if (predicate(linkUrl, url) && !processed.has(linkUrl.href.split("#")[0])) { 153 - links.add(linkUrl.href); 154 - } 155 - } catch {} 156 - }, 157 - }); 158 - 159 - rewriter.on("img[src]", { 160 - element(element) { 161 - const src = element.getAttribute("src"); 162 - if (!src) return; 163 - 164 - try { 165 - let assetUrl = new URL(src, url); 166 - if (replacementHost) { 167 - assetUrl = applyReplaceHost(assetUrl, replacementHost); 168 - } 169 - assets.add(assetUrl.href); 170 - } catch {} 171 - }, 172 - }); 173 - 174 - rewriter.on("link[href]", { 175 - element(element) { 176 - const href = element.getAttribute("href"); 177 - if (!href) return; 178 - 179 - try { 180 - let assetUrl = new URL(href, url); 181 - if (replacementHost) { 182 - assetUrl = applyReplaceHost(assetUrl, replacementHost); 183 - } 184 - assets.add(assetUrl.href); 185 - } catch {} 186 - }, 187 - }); 188 - 189 - rewriter.on("script[src]", { 190 - element(element) { 191 - const src = element.getAttribute("src"); 192 - if (!src) return; 193 - 194 - try { 195 - let assetUrl = new URL(src, url); 196 - if (replacementHost) { 197 - assetUrl = applyReplaceHost(assetUrl, replacementHost); 198 - } 199 - assets.add(assetUrl.href); 200 - } catch {} 201 - }, 202 - }); 203 - 204 - rewriter.on("link[href]", { 205 - element(element) { 206 - const href = element.getAttribute("href"); 207 - if (!href) return; 208 - 209 - try { 210 - let assetUrl = new URL(src, url); 211 - if (replacementHost) { 212 - assetUrl = applyReplaceHost(assetUrl, replacementHost); 213 - } 214 - assets.add(assetUrl.href); 215 - } catch {} 216 - }, 217 - }); 218 - 219 - rewriter.on("script[src]", { 220 - element(element) { 221 - const src = 
element.getAttribute("src"); 222 - if (!src) return; 223 - 224 - try { 225 - let assetUrl = new URL(src, url); 226 - if (replaceHost) { 227 - assetUrl = replaceHost(assetUrl, replaceHost); 228 - } 229 - assets.add(assetUrl.href); 230 - } catch {} 231 - }, 232 - }); 233 - 234 - const encoder = new TextEncoder(); 235 - await rewriter.write(encoder.encode(html)); 236 - await rewriter.end(); 237 - rewriter.free(); 238 - 239 - const urlPrefix = url.href.endsWith("/") ? url.href.slice(0, -1) : url.href; 240 - const hasChildResources = [...links, ...assets].some((resource) => { 241 - return resource.startsWith(urlPrefix + "/"); 242 - }); 243 - 244 - let filePath = url.pathname; 245 - if (filePath.endsWith("/") || filePath === "") { 246 - filePath = filePath + "index.html"; 247 - } else if (hasChildResources) { 248 - filePath = filePath + "/index.html"; 249 - } else if (!filePath.endsWith(".html") && !filePath.endsWith(".htm")) { 250 - filePath = filePath + ".html"; 251 - } 252 - 253 - const outputPath = stripHost 254 - ? 
join(outputDir, filePath) 255 - : join(outputDir, url.hostname, filePath); 256 - 257 - let cached = false; 258 - 259 - try { 260 - await access(outputPath); 261 - 262 - if (overwrite === "ignore") { 263 - const fileStats = await stat(outputPath); 264 - logFileStats({ 265 - url: url.href, 266 - path: outputPath, 267 - size: fileStats.size, 268 - links: links.size, 269 - assets: assets.size, 270 - cached: true, 271 - }); 272 - return; 273 - } 274 - 275 - if (overwrite === "cache") { 276 - cached = true; 277 - } else { 278 - await mkdir(dirname(outputPath), { recursive: true }); 279 - await writeFile(outputPath, html); 280 - } 281 - } catch { 282 - await mkdir(dirname(outputPath), { recursive: true }); 283 - await writeFile(outputPath, html); 284 - } 285 - 286 - for (const link of links) { 287 - const linkWithoutHash = link.split("#")[0]; 288 - if (!queue.has(linkWithoutHash) && !processed.has(linkWithoutHash)) { 289 - queue.add(linkWithoutHash); 290 - } 291 - } 292 - 293 - for (const asset of assets) { 294 - try { 295 - await downloadAsset(new URL(asset), outputDir, overwrite, stripHost, replacementHost); 296 - } catch {} 297 - } 298 - 299 - const fileStats = await stat(outputPath); 300 - logFileStats({ 301 - url: url.href, 302 - path: outputPath, 303 - size: fileStats.size, 304 - links: links.size, 305 - assets: assets.size, 306 - cached, 307 - }); 308 - } 3 + import type { OverwriteMode, LinkPredicate } from "./types.ts"; 4 + import { ensureUrl, applyReplaceHost } from "./url/util.ts"; 5 + import { originPredicate, subtreePredicate } from "./url/predicates.ts"; 6 + import { scrapePage } from "./scraper/scraper.ts"; 309 7 310 8 const command = define({ 311 9 args: { ··· 345 43 return; 346 44 } 347 45 348 - const outputDir = resolve(process.cwd(), output ?? "."); 46 + const outputDir = output ?? 
"."; 349 47 const parsedUrls = urlList.map((u) => ensureUrl(u)); 350 48 351 49 if (replacementHost) { ··· 354 52 } 355 53 } 356 54 357 - const linkPredicate = 55 + const linkPredicate: LinkPredicate = 358 56 predicate === "origin" ? originPredicate(parsedUrls[0]) : subtreePredicate(parsedUrls[0]); 359 57 360 58 const queue = new Set<string>(parsedUrls.map((u) => u.href));

+69

src/scraper/downloader.ts

import { access, mkdir, stat, writeFile } from "node:fs/promises";
import { dirname } from "node:path";
import { logFileStats } from "../stats.ts";
import type { OverwriteMode } from "../types.ts";
import { applyReplaceHost } from "../url/util.ts";
import { getAssetOutputPath } from "../url/paths.ts";

/**
 * Fetch one asset and store it under `outputDir`, logging a stats record.
 *
 * @param url - Asset URL; also used (via `getAssetOutputPath`) to derive the on-disk path.
 * @param outputDir - Root directory the asset is written beneath.
 * @param overwrite - `"ignore"` and `"cache"` both reuse a file that is already on disk;
 *   `"update"` always re-fetches.
 * @param stripHost - Forwarded to `getAssetOutputPath` to decide whether the hostname
 *   becomes a path segment.
 * @param replacementHost - When set, the request is sent to this host instead of `url`'s
 *   host; the output path and the logged URL still use the original `url`.
 * @returns `true` when the asset is on disk afterwards (reused or freshly written),
 *   `false` when the server answered with a non-OK status.
 */
async function downloadAsset(
  url: URL,
  outputDir: string,
  overwrite: OverwriteMode,
  stripHost: boolean,
  replacementHost?: string,
): Promise<boolean> {
  const assetOutputPath = getAssetOutputPath(url, outputDir, stripHost);

  // Only the two reuse modes care whether the file already exists.
  if (overwrite === "ignore" || overwrite === "cache") {
    try {
      await access(assetOutputPath);
      const assetStats = await stat(assetOutputPath);
      logFileStats({
        url: url.href,
        path: assetOutputPath,
        size: assetStats.size,
        links: 0,
        assets: 0,
        cached: true,
      });
      return true;
    } catch {
      // Not on disk (or not readable): fall through and download it.
    }
  }

  const fetchUrl = replacementHost ? applyReplaceHost(url, replacementHost) : url;
  const response = await fetch(fetchUrl.href);
  if (!response.ok) {
    return false;
  }

  const assetData = await response.arrayBuffer();
  await mkdir(dirname(assetOutputPath), { recursive: true });
  await writeFile(assetOutputPath, Buffer.from(assetData));

  logFileStats({
    url: url.href,
    path: assetOutputPath,
    size: assetData.byteLength,
    links: 0,
    assets: 0,
    cached: false,
  });

  return true;
}

export { downloadAsset };

+83

src/scraper/html.ts

import { HTMLRewriter } from "html-rewriter-wasm";
import { applyReplaceHost } from "../url/util.ts";

/** Absolute URLs collected from one HTML document, split by how they were referenced. */
interface ExtractedResources {
  links: Set<string>;
  assets: Set<string>;
}

/**
 * Scan `html` for referenced URLs.
 *
 * `a[href]` targets are collected as links; `img[src]`, `link[href]` and `script[src]`
 * targets are collected as assets. Each attribute value is resolved against `url`, and
 * when `replacementHost` is given the resolved URL's host is rewritten before it is
 * stored. Attribute values that cannot be parsed as URLs are skipped.
 *
 * NOTE(review): the results of `write()` and `end()` are not awaited here, whereas the
 * inline code this was extracted from awaited both. If those calls are asynchronous in
 * html-rewriter-wasm, the returned sets may still be empty when this function returns —
 * confirm against the library before relying on the synchronous signature.
 *
 * @param html - Document source to scan.
 * @param url - URL the document was loaded from; base for relative references.
 * @param replacementHost - Optional host to substitute into every collected URL.
 * @returns The collected link and asset URL strings.
 */
function extractResources(html: string, url: URL, replacementHost?: string): ExtractedResources {
  const links = new Set<string>();
  const assets = new Set<string>();

  // [selector, attribute to read, set that receives the resolved URL], in registration order.
  const rules: Array<[string, string, Set<string>]> = [
    ["a[href]", "href", links],
    ["img[src]", "src", assets],
    ["link[href]", "href", assets],
    ["script[src]", "src", assets],
  ];

  // The rewritten output is discarded; the rewriter is used purely as a streaming parser.
  const rewriter = new HTMLRewriter((_chunk) => {});

  for (const [selector, attribute, bucket] of rules) {
    rewriter.on(selector, {
      element(element) {
        const raw = element.getAttribute(attribute);
        if (!raw) return;

        try {
          const resolved = new URL(raw, url);
          const target = replacementHost ? applyReplaceHost(resolved, replacementHost) : resolved;
          bucket.add(target.href);
        } catch {}
      },
    });
  }

  const encoder = new TextEncoder();
  rewriter.write(encoder.encode(html));
  rewriter.end();
  rewriter.free();

  return { links, assets };
}

export { extractResources };

+104

src/scraper/scraper.ts

import { access, mkdir, stat, writeFile } from "node:fs/promises";
import { dirname } from "node:path";
import type { OverwriteMode, PageStats, LinkPredicate } from "../types.ts";
import { logFileStats } from "../stats.ts";
import { applyReplaceHost } from "../url/util.ts";
import { getPageOutputPath } from "../url/paths.ts";
import { extractResources } from "./html.ts";
import { downloadAsset } from "./downloader.ts";

/** Drop the fragment so URLs that differ only by `#...` share one key. */
function stripHash(href: string): string {
  const hashIndex = href.indexOf("#");
  return hashIndex === -1 ? href : href.slice(0, hashIndex);
}

/** Report whether `path` is accessible, without throwing. */
async function pathExists(path: string): Promise<boolean> {
  try {
    await access(path);
    return true;
  } catch {
    return false;
  }
}

/**
 * Fetch one page, save it, queue its in-scope links and download its assets.
 *
 * The page is marked as processed before it is fetched, so a URL is attempted at most
 * once per `processed` set even if the request fails.
 *
 * Overwrite handling when the output file already exists:
 * - `"ignore"`: log the existing file and return; links are not queued and assets are
 *   not downloaded.
 * - `"cache"`: keep the existing file but still queue links and download assets.
 * - `"update"`: rewrite the file.
 *
 * @param url - Page to scrape.
 * @param outputDir - Root directory for saved pages and assets.
 * @param predicate - Decides which discovered links are in scope for crawling.
 * @param queue - Pending page URLs (fragment-free); new links are added here.
 * @param processed - Page URLs (fragment-free) already attempted; updated in place.
 * @param overwrite - Policy for files that already exist on disk.
 * @param stripHost - Forwarded to the path helpers and the asset downloader.
 * @param replacementHost - Forwarded to resource extraction and the asset downloader.
 */
async function scrapePage(
  url: URL,
  outputDir: string,
  predicate: LinkPredicate,
  queue: Set<string>,
  processed: Set<string>,
  overwrite: OverwriteMode,
  stripHost: boolean,
  replacementHost?: string,
): Promise<void> {
  const urlWithoutHash = stripHash(url.href);
  if (processed.has(urlWithoutHash)) return;

  processed.add(urlWithoutHash);

  const response = await fetch(url.href);
  if (!response.ok) {
    return;
  }

  const html = await response.text();

  const { links, assets } = extractResources(html, url, replacementHost);

  const filteredLinks = new Set<string>();
  for (const link of links) {
    try {
      const linkUrl = new URL(link);
      if (predicate(linkUrl, url) && !processed.has(stripHash(linkUrl.href))) {
        filteredLinks.add(link);
      }
    } catch {
      // Unparseable link: skip it.
    }
  }

  // A page with resources "below" it must be saved as <path>/index.html so that the
  // path can also exist as a directory. The prefix does not depend on the resource,
  // so build it once.
  const childPrefix = (url.href.endsWith("/") ? url.href.slice(0, -1) : url.href) + "/";
  const hasChildResources = [...filteredLinks, ...assets].some((resource) =>
    resource.startsWith(childPrefix),
  );

  const outputPath = getPageOutputPath(url, outputDir, stripHost, hasChildResources);

  // Existence is probed on its own so that a failing stat/write is reported to the
  // caller instead of being mistaken for "file missing" and silently retried.
  const alreadyOnDisk = await pathExists(outputPath);

  if (alreadyOnDisk && overwrite === "ignore") {
    const existingStats = await stat(outputPath);
    logFileStats({
      url: url.href,
      path: outputPath,
      size: existingStats.size,
      links: filteredLinks.size,
      assets: assets.size,
      cached: true,
    });
    return;
  }

  const cached = alreadyOnDisk && overwrite === "cache";
  if (!cached) {
    await mkdir(dirname(outputPath), { recursive: true });
    await writeFile(outputPath, html);
  }

  for (const link of filteredLinks) {
    const linkWithoutHash = stripHash(link);
    if (!queue.has(linkWithoutHash) && !processed.has(linkWithoutHash)) {
      queue.add(linkWithoutHash);
    }
  }

  for (const asset of assets) {
    try {
      await downloadAsset(new URL(asset), outputDir, overwrite, stripHost, replacementHost);
    } catch {
      // Best effort: one failing asset must not abort the page.
    }
  }

  const fileStats = await stat(outputPath);
  logFileStats({
    url: url.href,
    path: outputPath,
    size: fileStats.size,
    links: filteredLinks.size,
    assets: assets.size,
    cached,
  });
}

export { scrapePage };

+7

src/stats.ts

import type { PageStats } from "./types.ts";

/**
 * Write one statistics record to stdout as a single line of JSON.
 *
 * @param stats - Record to serialise; it is passed to `JSON.stringify` unchanged.
 * @returns A promise that resolves once the line has been handed to `console.log`.
 */
async function logFileStats(stats: PageStats): Promise<void> {
  const line = JSON.stringify(stats);
  console.log(line);
}

export { logFileStats };

+12

src/types.ts

/** Policy for a file that already exists on disk: one of "ignore", "cache" or "update". */
type OverwriteMode = "ignore" | "cache" | "update";

/** Decides whether `url`, discovered while crawling from `baseUrl`, should be followed. */
type LinkPredicate = (url: URL, baseUrl: URL) => boolean;

/** One statistics record describing a saved page or asset. */
interface PageStats {
  /** URL the record describes. */
  url: string;
  /** Location of the file on disk. */
  path: string;
  /** File size in bytes. */
  size: number;
  /** Number of links found on the page (0 for assets). */
  links: number;
  /** Number of assets found on the page (0 for assets). */
  assets: number;
  /** Whether an existing file was reused instead of being written. */
  cached: boolean;
}

// Without an export this file is not a module, and the `import type { ... } from
// "./types.ts"` statements in the other source files cannot resolve these names.
export type { OverwriteMode, LinkPredicate, PageStats };

+40

src/url/paths.ts

import { join } from "node:path";

/**
 * Map an asset URL to its location under `outputDir`.
 *
 * @param url - Asset URL; only its hostname and pathname are used.
 * @param outputDir - Root output directory.
 * @param stripHost - When `true` the hostname is not included as a path segment.
 * @returns `outputDir[/hostname]/pathname`, joined with the platform separator.
 */
function getAssetOutputPath(url: URL, outputDir: string, stripHost: boolean): string {
  return stripHost ? join(outputDir, url.pathname) : join(outputDir, url.hostname, url.pathname);
}

/**
 * Map a page URL to its file path, assuming the page has no child resources.
 *
 * Equivalent to `getPageOutputPath(url, outputDir, stripHost, false)`.
 */
function getPageFilePath(url: URL, outputDir: string, stripHost: boolean): string {
  return getPageOutputPath(url, outputDir, stripHost, false);
}

/**
 * Map a page URL to the HTML file it is saved as.
 *
 * - A pathname ending in `/` (or empty) gets `index.html` appended.
 * - Otherwise, when `hasChildResources` is set, the page becomes `<pathname>/index.html`
 *   so the pathname can also serve as a directory.
 * - Otherwise `.html` is appended unless the pathname already ends in `.html` or `.htm`.
 *
 * @param url - Page URL; only its hostname and pathname are used.
 * @param outputDir - Root output directory.
 * @param stripHost - When `true` the hostname is not included as a path segment.
 * @param hasChildResources - Whether other resources live below this page's URL.
 * @returns The file path, joined with the platform separator.
 */
function getPageOutputPath(
  url: URL,
  outputDir: string,
  stripHost: boolean,
  hasChildResources: boolean,
): string {
  let filePath = url.pathname;
  if (filePath.endsWith("/") || filePath === "") {
    filePath += "index.html";
  } else if (hasChildResources) {
    filePath += "/index.html";
  } else if (!filePath.endsWith(".html") && !filePath.endsWith(".htm")) {
    filePath += ".html";
  }

  return stripHost ? join(outputDir, filePath) : join(outputDir, url.hostname, filePath);
}

export { getAssetOutputPath, getPageOutputPath };

+11

src/url/predicates.ts

import type { LinkPredicate } from "../types.ts";

/**
 * Build a predicate that accepts any URL on the same origin as `baseUrl`.
 *
 * @param baseUrl - URL whose origin (scheme, host and port) defines the crawl scope.
 */
function originPredicate(baseUrl: URL): LinkPredicate {
  return (url: URL) => url.origin === baseUrl.origin;
}

/**
 * Build a predicate that accepts `baseUrl`'s own path and everything below it.
 *
 * The comparison is made per path segment rather than on the raw `href` string, so a
 * base of `/docs` admits `/docs` and `/docs/intro` but not the sibling `/docs-old/intro`.
 * Query strings and fragments do not affect the result.
 *
 * @param baseUrl - URL whose origin and pathname define the root of the subtree.
 */
function subtreePredicate(baseUrl: URL): LinkPredicate {
  const basePath = baseUrl.pathname;
  // Guarantee a trailing slash so the prefix test stops at a segment boundary.
  const childPrefix = basePath.endsWith("/") ? basePath : basePath + "/";

  return (url: URL) =>
    url.origin === baseUrl.origin &&
    (url.pathname === basePath || url.pathname.startsWith(childPrefix));
}

export { originPredicate, subtreePredicate };

+21

src/url/util.ts

/** Matches a leading `scheme://`, e.g. `https://` or `file://`. */
const SCHEME_PREFIX = /^[a-z][a-z0-9+.-]*:\/\//i;

/**
 * Parse user input as a URL, defaulting to `https://` when no scheme is given.
 *
 * Only a scheme at the very start of the input counts, so a bare host whose query
 * string itself contains a URL (`example.com/go?to=http://other`) is still prefixed.
 *
 * @param urlString - URL with or without a scheme; surrounding whitespace is ignored.
 * @returns The parsed URL.
 * @throws TypeError when the resulting string is not a valid URL.
 */
function ensureUrl(urlString: string): URL {
  const trimmed = urlString.trim();
  return new URL(SCHEME_PREFIX.test(trimmed) ? trimmed : "https://" + trimmed);
}

/**
 * Return a copy of `url` that points at a different host.
 *
 * `newHost` may be a bare hostname (`mirror.example.org`), in which case the port of
 * `url` is kept, or `hostname:port` (`localhost:8080`), in which case both are replaced.
 * The input URL is not modified.
 *
 * @param url - URL to copy.
 * @param newHost - Replacement host, optionally including a port.
 * @returns A new URL object with the host substituted.
 */
function applyReplaceHost(url: URL, newHost: string): URL {
  const newUrl = new URL(url.href);
  // `host` accepts an optional port; the `hostname` setter cannot apply `host:port`.
  newUrl.host = newHost;
  return newUrl;
}

/**
 * Derive a default output directory name from a URL.
 *
 * @param url - URL being exported.
 * @returns The last non-empty path segment, or the hostname when the path has none.
 */
function getDefaultOutput(url: URL): string {
  // Filtering empty segments already discards leading and trailing slashes.
  const segments = url.pathname.split("/").filter(Boolean);
  return segments.pop() ?? url.hostname;
}

export { ensureUrl, applyReplaceHost, getDefaultOutput };

Configure Feed

Configure Feed