Small wget-like mirroring utility.
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Add --stripHost flag to remove hostname from output paths

rektide d7a6a7d6 e678c38c

+8 -7
+8 -7
src/cli.ts
··· 36 36 console.log(JSON.stringify(stats)); 37 37 } 38 38 39 - async function downloadAsset(url: URL, outputDir: string, overwrite: OverwriteMode): Promise<boolean> { 40 - const assetOutputPath = join(outputDir, url.hostname, url.pathname); 39 + async function downloadAsset(url: URL, outputDir: string, overwrite: OverwriteMode, stripHost: boolean): Promise<boolean> { 40 + const assetOutputPath = stripHost ? join(outputDir, url.pathname) : join(outputDir, url.hostname, url.pathname); 41 41 42 42 try { 43 43 await access(assetOutputPath); ··· 90 90 return true; 91 91 } 92 92 93 - async function scrapePage(url: URL, outputDir: string, predicate: LinkPredicate, queue: Set<string>, processed: Set<string>, overwrite: OverwriteMode): Promise<void> { 93 + async function scrapePage(url: URL, outputDir: string, predicate: LinkPredicate, queue: Set<string>, processed: Set<string>, overwrite: OverwriteMode, stripHost: boolean): Promise<void> { 94 94 const urlWithoutHash = url.href.split('#')[0]; 95 95 if (processed.has(urlWithoutHash)) return; 96 96 ··· 177 177 filePath = filePath + '.html'; 178 178 } 179 179 180 - const outputPath = join(outputDir, url.hostname, filePath); 180 + const outputPath = stripHost ? 
join(outputDir, filePath) : join(outputDir, url.hostname, filePath); 181 181 182 182 let cached = false; 183 183 ··· 217 217 218 218 for (const asset of assets) { 219 219 try { 220 - await downloadAsset(new URL(asset), outputDir, overwrite); 220 + await downloadAsset(new URL(asset), outputDir, overwrite, stripHost); 221 221 } catch {} 222 222 } 223 223 ··· 239 239 output: { type: 'string', description: 'Output directory' }, 240 240 predicate: { type: 'string', description: 'Link predicate: origin or subtree', default: 'subtree' }, 241 241 overwrite: { type: 'string', description: 'Overwrite mode: ignore, cache, or update', default: 'cache' }, 242 + stripHost: { type: 'boolean', description: 'Remove hostname from output paths' }, 242 243 }, 243 244 async run(ctx) { 244 - const { url, urls, output, predicate, overwrite } = ctx.values; 245 + const { url, urls, output, predicate, overwrite, stripHost } = ctx.values; 245 246 const urlList = url ? [url] : (urls ?? []); 246 247 247 248 if (urlList.length === 0) { ··· 259 260 const nextUrl = Array.from(queue)[0]; 260 261 queue.delete(nextUrl); 261 262 262 - await scrapePage(new URL(nextUrl), outputDir, linkPredicate, queue, processed, overwrite as OverwriteMode); 263 + await scrapePage(new URL(nextUrl), outputDir, linkPredicate, queue, processed, overwrite as OverwriteMode, stripHost); 263 264 } 264 265 265 266 console.error(`Scraping complete. ${processed.size} pages processed.`);