Small wget-like mirroring utility.
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Add Gunshi CLI web scraper with html-rewriter-wasm

rektide 106eb735

+403
+5
.prettierrc.json
··· 1 + { 2 + "useTabs": true, 3 + "semi": false, 4 + "max_line_length": 2047 5 + }
README.md

This is a binary file and will not be displayed.

+33
package.json
··· 1 + { 2 + "name": "dexport", 3 + "version": "1.0.0", 4 + "description": "", 5 + "main": "index.js", 6 + "scripts": { 7 + "test": "echo \"Error: no test specified\" && exit 1", 8 + "lint": "oxlint", 9 + "fmt": "oxfmt ." 10 + }, 11 + "bin": { 12 + "dexport": "src/cli.ts" 13 + }, 14 + "repository": { 15 + "type": "git", 16 + "url": "git+https://github.com/rektide/dexport.git" 17 + }, 18 + "keywords": [], 19 + "author": "rektide de la faye", 20 + "license": "MIT", 21 + "type": "module", 22 + "bugs": { 23 + "url": "https://github.com/rektide/dexport/issues" 24 + }, 25 + "homepage": "https://github.com/rektide/dexport#readme", 26 + "module": "dexport.js", 27 + "devDependencies": { 28 + "gunshi": "^0.28.0", 29 + "html-rewriter-wasm": "^0.4.1", 30 + "oxfmt": "^0.28.0", 31 + "oxlint": "^1.43.0" 32 + } 33 + }
+218
pnpm-lock.yaml
··· 1 + lockfileVersion: '9.0' 2 + 3 + settings: 4 + autoInstallPeers: true 5 + excludeLinksFromLockfile: false 6 + 7 + importers: 8 + 9 + .: 10 + devDependencies: 11 + gunshi: 12 + specifier: ^0.28.0 13 + version: 0.28.0 14 + html-rewriter-wasm: 15 + specifier: ^0.4.1 16 + version: 0.4.1 17 + oxfmt: 18 + specifier: ^0.28.0 19 + version: 0.28.0 20 + oxlint: 21 + specifier: ^1.43.0 22 + version: 1.43.0 23 + 24 + packages: 25 + 26 + '@oxfmt/darwin-arm64@0.28.0': 27 + resolution: {integrity: sha512-jmUfF7cNJPw57bEK7sMIqrYRgn4LH428tSgtgLTCtjuGuu1ShREyrkeB7y8HtkXRfhBs4lVY+HMLhqElJvZ6ww==} 28 + cpu: [arm64] 29 + os: [darwin] 30 + 31 + '@oxfmt/darwin-x64@0.28.0': 32 + resolution: {integrity: sha512-S6vlV8S7jbjzJOSjfVg2CimUC0r7/aHDLdUm/3+/B/SU/s1jV7ivqWkMv1/8EB43d1BBwT9JQ60ZMTkBqeXSFA==} 33 + cpu: [x64] 34 + os: [darwin] 35 + 36 + '@oxfmt/linux-arm64-gnu@0.28.0': 37 + resolution: {integrity: sha512-TfJkMZjePbLiskmxFXVAbGI/OZtD+y+fwS0wyW8O6DWG0ARTf0AipY9zGwGoOdpFuXOJceXvN4SHGLbYNDMY4Q==} 38 + cpu: [arm64] 39 + os: [linux] 40 + libc: [glibc] 41 + 42 + '@oxfmt/linux-arm64-musl@0.28.0': 43 + resolution: {integrity: sha512-7fyQUdW203v4WWGr1T3jwTz4L7KX9y5DeATryQ6fLT6QQp9GEuct8/k0lYhd+ys42iTV/IkJF20e3YkfSOOILg==} 44 + cpu: [arm64] 45 + os: [linux] 46 + libc: [musl] 47 + 48 + '@oxfmt/linux-x64-gnu@0.28.0': 49 + resolution: {integrity: sha512-sRKqAvEonuz0qr1X1ncUZceOBJerKzkO2gZIZmosvy/JmqyffpIFL3OE2tqacFkeDhrC+dNYQpusO8zsfHo3pw==} 50 + cpu: [x64] 51 + os: [linux] 52 + libc: [glibc] 53 + 54 + '@oxfmt/linux-x64-musl@0.28.0': 55 + resolution: {integrity: sha512-fW6czbXutX/tdQe8j4nSIgkUox9RXqjyxwyWXUDItpoDkoXllq17qbD7GVc0whrEhYQC6hFE1UEAcDypLJoSzw==} 56 + cpu: [x64] 57 + os: [linux] 58 + libc: [musl] 59 + 60 + '@oxfmt/win32-arm64@0.28.0': 61 + resolution: {integrity: sha512-D/HDeQBAQRjTbD9OLV6kRDcStrIfO+JsUODDCdGmhRfNX8LPCx95GpfyybpZfn3wVF8Jq/yjPXV1xLkQ+s7RcA==} 62 + cpu: [arm64] 63 + os: [win32] 64 + 65 + '@oxfmt/win32-x64@0.28.0': 66 + resolution: {integrity: 
sha512-4+S2j4OxOIyo8dz5osm5dZuL0yVmxXvtmNdHB5xyGwAWVvyWNvf7tCaQD7w2fdSsAXQLOvK7KFQrHFe33nJUCA==} 67 + cpu: [x64] 68 + os: [win32] 69 + 70 + '@oxlint/darwin-arm64@1.43.0': 71 + resolution: {integrity: sha512-C/GhObv/pQZg34NOzB6Mk8x0wc9AKj8fXzJF8ZRKTsBPyHusC6AZ6bba0QG0TUufw1KWuD0j++oebQfWeiFXNw==} 72 + cpu: [arm64] 73 + os: [darwin] 74 + 75 + '@oxlint/darwin-x64@1.43.0': 76 + resolution: {integrity: sha512-4NjfUtEEH8ewRQ2KlZGmm6DyrvypMdHwBnQT92vD0dLScNOQzr0V9O8Ua4IWXdeCNl/XMVhAV3h4/3YEYern5A==} 77 + cpu: [x64] 78 + os: [darwin] 79 + 80 + '@oxlint/linux-arm64-gnu@1.43.0': 81 + resolution: {integrity: sha512-75tf1HvwdZ3ebk83yMbSB+moAEWK98mYqpXiaFAi6Zshie7r+Cx5PLXZFUEqkscenoZ+fcNXakHxfn94V6nf1g==} 82 + cpu: [arm64] 83 + os: [linux] 84 + libc: [glibc] 85 + 86 + '@oxlint/linux-arm64-musl@1.43.0': 87 + resolution: {integrity: sha512-BHV4fb36T2p/7bpA9fiJ5ayt7oJbiYX10nklW5arYp4l9/9yG/FQC5J4G1evzbJ/YbipF9UH0vYBAm5xbqGrvw==} 88 + cpu: [arm64] 89 + os: [linux] 90 + libc: [musl] 91 + 92 + '@oxlint/linux-x64-gnu@1.43.0': 93 + resolution: {integrity: sha512-1l3nvnzWWse1YHibzZ4HQXdF/ibfbKZhp9IguElni3bBqEyPEyurzZ0ikWynDxKGXqZa+UNXTFuU1NRVX1RJ3g==} 94 + cpu: [x64] 95 + os: [linux] 96 + libc: [glibc] 97 + 98 + '@oxlint/linux-x64-musl@1.43.0': 99 + resolution: {integrity: sha512-+jNYgLGRFTJxJuaSOZJBwlYo5M0TWRw0+3y5MHOL4ArrIdHyCthg6r4RbVWrsR1qUfUE1VSSHQ2bfbC99RXqMg==} 100 + cpu: [x64] 101 + os: [linux] 102 + libc: [musl] 103 + 104 + '@oxlint/win32-arm64@1.43.0': 105 + resolution: {integrity: sha512-dvs1C/HCjCyGTURMagiHprsOvVTT3omDiSzi5Qw0D4QFJ1pEaNlfBhVnOUYgUfS6O7Mcmj4+G+sidRsQcWQ/kA==} 106 + cpu: [arm64] 107 + os: [win32] 108 + 109 + '@oxlint/win32-x64@1.43.0': 110 + resolution: {integrity: sha512-bSuItSU8mTSDsvmmLTepTdCL2FkJI6dwt9tot/k0EmiYF+ArRzmsl4lXVLssJNRV5lJEc5IViyTrh7oiwrjUqA==} 111 + cpu: [x64] 112 + os: [win32] 113 + 114 + gunshi@0.28.0: 115 + resolution: {integrity: sha512-xR/iKAT4zMeYsboTDHOVSEGS1BNyYReUE48vXdNIT6ujcZLqE+BR/p9T6n0N7ugNdfoZtHm3NTuya9jc7cdsbw==} 116 + engines: 
{node: '>= 20'} 117 + 118 + html-rewriter-wasm@0.4.1: 119 + resolution: {integrity: sha512-lNovG8CMCCmcVB1Q7xggMSf7tqPCijZXaH4gL6iE8BFghdQCbaY5Met9i1x2Ex8m/cZHDUtXK9H6/znKamRP8Q==} 120 + 121 + oxfmt@0.28.0: 122 + resolution: {integrity: sha512-3+hhBqPE6Kp22KfJmnstrZbl+KdOVSEu1V0ABaFIg1rYLtrMgrupx9znnHgHLqKxAVHebjTdiCJDk30CXOt6cw==} 123 + engines: {node: ^20.19.0 || >=22.12.0} 124 + hasBin: true 125 + 126 + oxlint@1.43.0: 127 + resolution: {integrity: sha512-xiqTCsKZch+R61DPCjyqUVP2MhkQlRRYxLRBeBDi+dtQJ90MOgdcjIktvDCgXz0bgtx94EQzHEndsizZjMX2OA==} 128 + engines: {node: ^20.19.0 || >=22.12.0} 129 + hasBin: true 130 + peerDependencies: 131 + oxlint-tsgolint: '>=0.11.2' 132 + peerDependenciesMeta: 133 + oxlint-tsgolint: 134 + optional: true 135 + 136 + tinypool@2.1.0: 137 + resolution: {integrity: sha512-Pugqs6M0m7Lv1I7FtxN4aoyToKg1C4tu+/381vH35y8oENM/Ai7f7C4StcoK4/+BSw9ebcS8jRiVrORFKCALLw==} 138 + engines: {node: ^20.0.0 || >=22.0.0} 139 + 140 + snapshots: 141 + 142 + '@oxfmt/darwin-arm64@0.28.0': 143 + optional: true 144 + 145 + '@oxfmt/darwin-x64@0.28.0': 146 + optional: true 147 + 148 + '@oxfmt/linux-arm64-gnu@0.28.0': 149 + optional: true 150 + 151 + '@oxfmt/linux-arm64-musl@0.28.0': 152 + optional: true 153 + 154 + '@oxfmt/linux-x64-gnu@0.28.0': 155 + optional: true 156 + 157 + '@oxfmt/linux-x64-musl@0.28.0': 158 + optional: true 159 + 160 + '@oxfmt/win32-arm64@0.28.0': 161 + optional: true 162 + 163 + '@oxfmt/win32-x64@0.28.0': 164 + optional: true 165 + 166 + '@oxlint/darwin-arm64@1.43.0': 167 + optional: true 168 + 169 + '@oxlint/darwin-x64@1.43.0': 170 + optional: true 171 + 172 + '@oxlint/linux-arm64-gnu@1.43.0': 173 + optional: true 174 + 175 + '@oxlint/linux-arm64-musl@1.43.0': 176 + optional: true 177 + 178 + '@oxlint/linux-x64-gnu@1.43.0': 179 + optional: true 180 + 181 + '@oxlint/linux-x64-musl@1.43.0': 182 + optional: true 183 + 184 + '@oxlint/win32-arm64@1.43.0': 185 + optional: true 186 + 187 + '@oxlint/win32-x64@1.43.0': 188 + optional: true 189 + 
190 + gunshi@0.28.0: {} 191 + 192 + html-rewriter-wasm@0.4.1: {} 193 + 194 + oxfmt@0.28.0: 195 + dependencies: 196 + tinypool: 2.1.0 197 + optionalDependencies: 198 + '@oxfmt/darwin-arm64': 0.28.0 199 + '@oxfmt/darwin-x64': 0.28.0 200 + '@oxfmt/linux-arm64-gnu': 0.28.0 201 + '@oxfmt/linux-arm64-musl': 0.28.0 202 + '@oxfmt/linux-x64-gnu': 0.28.0 203 + '@oxfmt/linux-x64-musl': 0.28.0 204 + '@oxfmt/win32-arm64': 0.28.0 205 + '@oxfmt/win32-x64': 0.28.0 206 + 207 + oxlint@1.43.0: 208 + optionalDependencies: 209 + '@oxlint/darwin-arm64': 1.43.0 210 + '@oxlint/darwin-x64': 1.43.0 211 + '@oxlint/linux-arm64-gnu': 1.43.0 212 + '@oxlint/linux-arm64-musl': 1.43.0 213 + '@oxlint/linux-x64-gnu': 1.43.0 214 + '@oxlint/linux-x64-musl': 1.43.0 215 + '@oxlint/win32-arm64': 1.43.0 216 + '@oxlint/win32-x64': 1.43.0 217 + 218 + tinypool@2.1.0: {}
+147
src/cli.ts
#!/usr/bin/env node
import { cli, define } from 'gunshi';
import { HTMLRewriter } from 'html-rewriter-wasm';
import { realpathSync } from 'node:fs';
import { mkdir, writeFile } from 'node:fs/promises';
import { dirname, join } from 'node:path';
import { pathToFileURL } from 'node:url';

/**
 * Decides whether a discovered link should be followed by the crawler.
 * `url` is the resolved link target; `baseUrl` is the page it was found on.
 */
type LinkPredicate = (url: URL, baseUrl: URL) => boolean;

/** Builds a predicate that follows only links sharing `baseUrl`'s origin. */
function originPredicate(baseUrl: URL): LinkPredicate {
	return (url) => url.origin === baseUrl.origin;
}

/**
 * Builds a predicate that follows only links whose full href starts with
 * `baseUrl`'s href (a plain string-prefix match).
 */
function subtreePredicate(baseUrl: URL): LinkPredicate {
	return (url) => url.href.startsWith(baseUrl.href);
}

/**
 * Resolves `reference` against `base` and returns it only when it is an
 * http(s) URL. The fragment is removed so that `page#a` and `page#b` map to
 * the same resource. Returns `undefined` for unparsable references and for
 * schemes that cannot be mirrored to disk (`data:`, `mailto:`, `javascript:`…).
 */
function resolveHttpUrl(reference: string, base: URL): URL | undefined {
	let resolved: URL;
	try {
		resolved = new URL(reference, base);
	} catch {
		// Malformed hrefs are common in real-world HTML; ignoring them is intentional.
		return undefined;
	}
	if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
		return undefined;
	}
	resolved.hash = '';
	return resolved;
}

/**
 * Maps a page URL to the file it is saved as: directory-style paths get
 * `index.html`, and paths without an `.html`/`.htm` suffix get `.html` appended.
 */
function pageOutputPath(url: URL, outputDir: string): string {
	let filePath = url.pathname;
	if (filePath.endsWith('/') || filePath === '') {
		filePath = `${filePath}index.html`;
	} else if (!filePath.endsWith('.html') && !filePath.endsWith('.htm')) {
		filePath = `${filePath}.html`;
	}
	return join(outputDir, url.hostname, filePath);
}

/**
 * Parses `html` and collects the absolute hrefs it references.
 *
 * @returns `links` from `a[href]`, and `assets` from `img[src]`,
 *   `link[href]` and `script[src]`, all resolved against `pageUrl`.
 */
async function extractReferences(html: string, pageUrl: URL): Promise<{ links: Set<string>; assets: Set<string> }> {
	const links = new Set<string>();
	const assets = new Set<string>();

	// Creates an element handler that resolves one attribute and stores the result.
	const collect = (attribute: string, into: Set<string>) => ({
		element(element: { getAttribute(name: string): string | null }) {
			const value = element.getAttribute(attribute);
			if (!value) return;
			const resolved = resolveHttpUrl(value, pageUrl);
			if (resolved) into.add(resolved.href);
		},
	});

	// The rewritten output is not needed; the rewriter is used purely as a streaming parser.
	const rewriter = new HTMLRewriter((_chunk) => {});
	try {
		rewriter.on('a[href]', collect('href', links));
		rewriter.on('img[src]', collect('src', assets));
		rewriter.on('link[href]', collect('href', assets));
		rewriter.on('script[src]', collect('src', assets));

		await rewriter.write(new TextEncoder().encode(html));
		await rewriter.end();
	} finally {
		// Always release the rewriter, even when parsing throws.
		rewriter.free();
	}

	return { links, assets };
}

/**
 * Downloads one asset into `outputDir/<hostname>/<pathname>`.
 * Best-effort: failures are reported as warnings and never abort the crawl.
 */
async function downloadAsset(asset: string, outputDir: string): Promise<void> {
	try {
		const response = await fetch(asset);
		if (!response.ok) {
			console.warn(`Skipping asset ${asset}: HTTP ${response.status}`);
			return;
		}
		const { hostname, pathname } = new URL(asset);
		const assetOutputPath = join(outputDir, hostname, pathname);
		await mkdir(dirname(assetOutputPath), { recursive: true });
		await writeFile(assetOutputPath, Buffer.from(await response.arrayBuffer()));
	} catch (error) {
		console.warn(`Failed to save asset ${asset}:`, error);
	}
}

/**
 * Fetches one page, writes it to disk, enqueues the links accepted by
 * `predicate`, and downloads the assets the page references.
 *
 * @param url - Page to fetch.
 * @param outputDir - Root directory of the mirror.
 * @param predicate - Decides which discovered links are followed.
 * @param queue - Pending page hrefs; newly discovered links are added here.
 * @param processed - Hrefs already visited; `url` is added before fetching.
 * @param fetchedAssets - Asset hrefs already attempted. Pass one shared set
 *   for the whole crawl to avoid re-downloading assets used by several pages.
 */
async function scrapePage(
	url: URL,
	outputDir: string,
	predicate: LinkPredicate,
	queue: Set<string>,
	processed: Set<string>,
	fetchedAssets: Set<string> = new Set<string>(),
) {
	if (processed.has(url.href)) return;
	processed.add(url.href);

	let response: Response;
	try {
		response = await fetch(url.href);
	} catch (error) {
		// One unreachable page must not abort the rest of the crawl.
		console.warn(`Failed to fetch ${url.href}:`, error);
		return;
	}
	if (!response.ok) {
		console.warn(`Skipping ${url.href}: HTTP ${response.status}`);
		return;
	}

	const html = await response.text();

	const outputPath = pageOutputPath(url, outputDir);
	await mkdir(dirname(outputPath), { recursive: true });
	await writeFile(outputPath, html);

	const { links, assets } = await extractReferences(html, url);

	for (const link of links) {
		if (!processed.has(link) && predicate(new URL(link), url)) {
			queue.add(link);
		}
	}

	// Assets are fetched one at a time to stay gentle on the remote host.
	for (const asset of assets) {
		if (fetchedAssets.has(asset)) continue;
		fetchedAssets.add(asset);
		await downloadAsset(asset, outputDir);
	}
}

const command = define({
	args: {
		url: { type: 'string', description: 'URL to start scraping', required: true },
		output: { type: 'string', description: 'Output directory', required: true },
		predicate: { type: 'string', description: 'Link predicate: origin or subtree', default: 'subtree' },
	},
	async run(ctx) {
		const { url, output, predicate } = ctx.values;
		const baseUrl = new URL(url);
		// A fragment on the start URL would defeat the subtree prefix match.
		baseUrl.hash = '';
		const linkPredicate = predicate === 'origin' ? originPredicate(baseUrl) : subtreePredicate(baseUrl);

		const queue = new Set<string>([baseUrl.href]);
		const processed = new Set<string>();
		const fetchedAssets = new Set<string>();

		while (queue.size > 0) {
			// Sets iterate in insertion order, so this takes the oldest entry without copying the queue.
			const [nextUrl] = queue;
			queue.delete(nextUrl);

			await scrapePage(new URL(nextUrl), output, linkPredicate, queue, processed, fetchedAssets);
		}

		console.log(`Scraping complete. ${processed.size} pages processed.`);
	},
});

/**
 * Reports whether this module is the script Node was started with.
 * The entry path is resolved through symlinks and converted with
 * `pathToFileURL` so the comparison also holds for `bin` shims and for paths
 * that need percent-encoding.
 */
function isMainModule(): boolean {
	const entry = process.argv[1];
	if (!entry) return false;
	try {
		return import.meta.url === pathToFileURL(realpathSync(entry)).href;
	} catch {
		// The entry path could not be resolved on disk; treat as "not main".
		return false;
	}
}

if (isMainModule()) {
	await cli(process.argv.slice(2), command);
}