Small wget-like mirroring utility.
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Add --replace-host option and default https:// protocol

rektide 59c735c2 f2be6bd7

+54 -11
+54 -11
src/cli.ts
··· 25 25 return (url) => url.href.startsWith(baseUrl.href); 26 26 } 27 27 28 + function ensureUrl(urlString: string): URL { 29 + if (!urlString.includes("://")) { 30 + urlString = "https://" + urlString; 31 + } 32 + return new URL(urlString); 33 + } 34 + 28 35 function getDefaultOutput(url: URL): string { 29 36 const pathname = url.pathname; 30 37 const cleanedPath = pathname.endsWith("/") ? pathname.slice(0, -1) : pathname; ··· 32 39 return parts.length > 0 ? parts[parts.length - 1] : url.hostname; 33 40 } 34 41 42 + function replaceHost(url: URL, newHost: string): URL { 43 + const newUrl = new URL(url.href); 44 + newUrl.hostname = newHost; 45 + return newUrl; 46 + } 47 + 35 48 async function logFileStats(stats: PageStats) { 36 49 console.log(JSON.stringify(stats)); 37 50 } ··· 41 54 outputDir: string, 42 55 overwrite: OverwriteMode, 43 56 stripHost: boolean, 57 + replaceHost?: string, 44 58 ): Promise<boolean> { 45 59 const assetOutputPath = stripHost 46 60 ? join(outputDir, url.pathname) ··· 76 90 } 77 91 } catch {} 78 92 79 - const response = await fetch(url.href); 93 + const fetchUrl = replaceHost ? 
replaceHost(url, replaceHost) : url; 94 + const response = await fetch(fetchUrl.href); 80 95 if (!response.ok) { 81 96 return false; 82 97 } ··· 105 120 processed: Set<string>, 106 121 overwrite: OverwriteMode, 107 122 stripHost: boolean, 123 + replaceHost?: string, 108 124 ): Promise<void> { 109 125 const urlWithoutHash = url.href.split("#")[0]; 110 126 if (processed.has(urlWithoutHash)) return; ··· 129 145 if (!href) return; 130 146 131 147 try { 132 - const linkUrl = new URL(href, url); 148 + let linkUrl = new URL(href, url); 149 + if (replaceHost) { 150 + linkUrl = replaceHost(linkUrl, replaceHost); 151 + } 133 152 if (predicate(linkUrl, url) && !processed.has(linkUrl.href.split("#")[0])) { 134 153 links.add(linkUrl.href); 135 154 } ··· 143 162 if (!src) return; 144 163 145 164 try { 146 - const assetUrl = new URL(src, url); 165 + let assetUrl = new URL(src, url); 166 + if (replaceHost) { 167 + assetUrl = replaceHost(assetUrl, replaceHost); 168 + } 147 169 assets.add(assetUrl.href); 148 170 } catch {} 149 171 }, ··· 155 177 if (!href) return; 156 178 157 179 try { 158 - const assetUrl = new URL(href, url); 180 + let assetUrl = new URL(href, url); 181 + if (replaceHost) { 182 + assetUrl = replaceHost(assetUrl, replaceHost); 183 + } 159 184 assets.add(assetUrl.href); 160 185 } catch {} 161 186 }, ··· 167 192 if (!src) return; 168 193 169 194 try { 170 - const assetUrl = new URL(src, url); 195 + let assetUrl = new URL(src, url); 196 + if (replaceHost) { 197 + assetUrl = replaceHost(assetUrl, replaceHost); 198 + } 171 199 assets.add(assetUrl.href); 172 200 } catch {} 173 201 }, ··· 234 262 235 263 for (const asset of assets) { 236 264 try { 237 - await downloadAsset(new URL(asset), outputDir, overwrite, stripHost); 265 + await downloadAsset(new URL(asset), outputDir, overwrite, stripHost, replaceHost); 238 266 } catch {} 239 267 } 240 268 ··· 265 293 default: "cache", 266 294 }, 267 295 "strip-host": { type: "boolean", description: "Remove hostname from output 
paths" }, 296 + "replace-host": { type: "string", description: "Replace hostname in URLs with specified hostname" }, 268 297 }, 269 298 async run(ctx) { 270 - const { url, urls, output, predicate, overwrite, "strip-host": stripHost } = ctx.values; 299 + const { url, urls, output, predicate, overwrite, "strip-host": stripHost, "replace-host": replaceHost } = ctx.values; 271 300 const urlList = url ? [url] : (urls ?? []); 272 301 273 302 if (urlList.length === 0) { ··· 276 305 } 277 306 278 307 const outputDir = resolve(process.cwd(), output ?? "."); 308 + const parsedUrls = urlList.map((u) => ensureUrl(u)); 309 + 310 + if (replaceHost) { 311 + for (let i = 0; i < parsedUrls.length; i++) { 312 + parsedUrls[i] = replaceHost(parsedUrls[i], replaceHost); 313 + } 314 + } 315 + 279 316 const linkPredicate = 280 317 predicate === "origin" 281 - ? originPredicate(new URL(urlList[0])) 282 - : subtreePredicate(new URL(urlList[0])); 318 + ? originPredicate(parsedUrls[0]) 319 + : subtreePredicate(parsedUrls[0]); 283 320 284 - const queue = new Set<string>(urlList); 321 + const queue = new Set<string>(parsedUrls.map((u) => u.href)); 285 322 const processed = new Set<string>(); 286 323 287 324 while (queue.size > 0) { 288 325 const nextUrl = Array.from(queue)[0]; 289 326 queue.delete(nextUrl); 290 327 328 + let urlObj = ensureUrl(nextUrl); 329 + if (replaceHost) { 330 + urlObj = replaceHost(urlObj, replaceHost); 331 + } 332 + 291 333 await scrapePage( 292 - new URL(nextUrl), 334 + urlObj, 293 335 outputDir, 294 336 linkPredicate, 295 337 queue, 296 338 processed, 297 339 overwrite as OverwriteMode, 298 340 stripHost, 341 + replaceHost, 299 342 ); 300 343 } 301 344