Add decorator system for HTML metadata extraction · jauntywk.bsky.social/dexport@f756e0a

+5 -1

.beads/README.md

··· 33 33 ### Working with Issues 34 34 35 35 Issues in Beads are: 36 + 36 37 - **Git-native**: Stored in `.beads/issues.jsonl` and synced like code 37 38 - **AI-friendly**: CLI-first design works perfectly with AI coding agents 38 39 - **Branch-aware**: Issues can follow your branch workflow ··· 41 42 ## Why Beads? 42 43 43 44 ✨ **AI-Native Design** 45 + 44 46 - Built specifically for AI-assisted development workflows 45 47 - CLI-first interface works seamlessly with AI coding agents 46 48 - No context switching to web UIs 47 49 48 50 🚀 **Developer Focused** 51 + 49 52 - Issues live in your repo, right next to your code 50 53 - Works offline, syncs when you push 51 54 - Fast, lightweight, and stays out of your way 52 55 53 56 🔧 **Git Integration** 57 + 54 58 - Automatic sync with git commits 55 59 - Branch-aware issue tracking 56 60 - Intelligent JSONL merge resolution ··· 78 82 79 83 --- 80 84 81 - *Beads: Issue tracking that moves at the speed of thought* ⚡ 85 + _Beads: Issue tracking that moves at the speed of thought_ ⚡

+1 -1

.beads/metadata.json

··· 1 1 { 2 2 "database": "beads.db", 3 3 "jsonl_export": "issues.jsonl" 4 - } 4 + }

+1 -1

AGENTS.md

··· 33 33 7. **Hand off** - Provide context for next session 34 34 35 35 **CRITICAL RULES:** 36 + 36 37 - Work is NOT complete until `git push` succeeds 37 38 - NEVER stop before pushing - that leaves work stranded locally 38 39 - NEVER say "ready to push when you are" - YOU must push 39 40 - If push fails, resolve and retry until it succeeds 40 -

+31 -31

package.json

··· 1 1 { 2 - "name": "dexport", 3 - "version": "1.0.0", 4 - "description": "", 5 - "main": "src/cli.js", 6 - "scripts": { 7 - "test": "echo \"Error: no test specified\" && exit 1", 8 - "lint": "oxlint", 9 - "fmt": "oxfmt ." 10 - }, 11 - "bin": { 12 - "dexport": "src/cli.ts" 13 - }, 14 - "repository": { 15 - "type": "git", 16 - "url": "git+https://github.com/rektide/dexport.git" 17 - }, 18 - "keywords": [], 19 - "author": "rektide de la faye", 20 - "license": "MIT", 21 - "type": "module", 22 - "bugs": { 23 - "url": "https://github.com/rektide/dexport/issues" 24 - }, 25 - "homepage": "https://github.com/rektide/dexport#readme", 26 - "module": "dexport.js", 27 - "devDependencies": { 28 - "gunshi": "^0.28.0", 29 - "html-rewriter-wasm": "^0.4.1", 30 - "oxfmt": "^0.28.0", 31 - "oxlint": "^1.43.0" 32 - } 2 + "name": "dexport", 3 + "version": "1.0.0", 4 + "description": "", 5 + "keywords": [], 6 + "homepage": "https://github.com/rektide/dexport#readme", 7 + "bugs": { 8 + "url": "https://github.com/rektide/dexport/issues" 9 + }, 10 + "license": "MIT", 11 + "author": "rektide de la faye", 12 + "repository": { 13 + "type": "git", 14 + "url": "git+https://github.com/rektide/dexport.git" 15 + }, 16 + "bin": { 17 + "dexport": "src/cli.ts" 18 + }, 19 + "type": "module", 20 + "main": "src/cli.js", 21 + "module": "dexport.js", 22 + "scripts": { 23 + "test": "echo \"Error: no test specified\" && exit 1", 24 + "lint": "oxlint", 25 + "fmt": "oxfmt ." 26 + }, 27 + "devDependencies": { 28 + "gunshi": "^0.28.0", 29 + "html-rewriter-wasm": "^0.4.1", 30 + "oxfmt": "^0.28.0", 31 + "oxlint": "^1.43.0" 32 + } 33 33 }

+121 -85

src/cli.ts

··· 1 1 #!/usr/bin/env node 2 - import { cli, define } from 'gunshi'; 3 - import { HTMLRewriter } from 'html-rewriter-wasm'; 4 - import { access, mkdir, stat, writeFile } from 'node:fs/promises'; 5 - import { dirname, join, resolve } from 'node:path'; 2 + import { cli, define } from "gunshi"; 3 + import { HTMLRewriter } from "html-rewriter-wasm"; 4 + import { access, mkdir, stat, writeFile } from "node:fs/promises"; 5 + import { dirname, join, resolve } from "node:path"; 6 6 7 - type OverwriteMode = 'ignore' | 'cache' | 'update'; 7 + type OverwriteMode = "ignore" | "cache" | "update"; 8 8 9 9 type LinkPredicate = (url: URL, baseUrl: URL) => boolean; 10 10 ··· 27 27 28 28 function getDefaultOutput(url: URL): string { 29 29 const pathname = url.pathname; 30 - const cleanedPath = pathname.endsWith('/') ? pathname.slice(0, -1) : pathname; 31 - const parts = cleanedPath.split('/').filter(Boolean); 30 + const cleanedPath = pathname.endsWith("/") ? pathname.slice(0, -1) : pathname; 31 + const parts = cleanedPath.split("/").filter(Boolean); 32 32 return parts.length > 0 ? parts[parts.length - 1] : url.hostname; 33 33 } 34 34 ··· 36 36 console.log(JSON.stringify(stats)); 37 37 } 38 38 39 - async function downloadAsset(url: URL, outputDir: string, overwrite: OverwriteMode, stripHost: boolean): Promise<boolean> { 40 - const assetOutputPath = stripHost ? join(outputDir, url.pathname) : join(outputDir, url.hostname, url.pathname); 41 - 39 + async function downloadAsset( 40 + url: URL, 41 + outputDir: string, 42 + overwrite: OverwriteMode, 43 + stripHost: boolean, 44 + ): Promise<boolean> { 45 + const assetOutputPath = stripHost 46 + ? 
join(outputDir, url.pathname) 47 + : join(outputDir, url.hostname, url.pathname); 48 + 42 49 try { 43 50 await access(assetOutputPath); 44 - 45 - if (overwrite === 'ignore') { 51 + 52 + if (overwrite === "ignore") { 46 53 const assetStats = await stat(assetOutputPath); 47 54 logFileStats({ 48 55 url: url.href, ··· 54 61 }); 55 62 return true; 56 63 } 57 - 58 - if (overwrite === 'cache') { 64 + 65 + if (overwrite === "cache") { 59 66 const assetStats = await stat(assetOutputPath); 60 67 logFileStats({ 61 68 url: url.href, ··· 68 75 return true; 69 76 } 70 77 } catch {} 71 - 78 + 72 79 const response = await fetch(url.href); 73 80 if (!response.ok) { 74 81 return false; 75 82 } 76 - 83 + 77 84 const assetData = await response.arrayBuffer(); 78 85 await mkdir(dirname(assetOutputPath), { recursive: true }); 79 86 await writeFile(assetOutputPath, Buffer.from(assetData)); 80 - 87 + 81 88 logFileStats({ 82 89 url: url.href, 83 90 path: assetOutputPath, ··· 86 93 assets: 0, 87 94 cached: false, 88 95 }); 89 - 96 + 90 97 return true; 91 98 } 92 99 93 - async function scrapePage(url: URL, outputDir: string, predicate: LinkPredicate, queue: Set<string>, processed: Set<string>, overwrite: OverwriteMode, stripHost: boolean): Promise<void> { 94 - const urlWithoutHash = url.href.split('#')[0]; 100 + async function scrapePage( 101 + url: URL, 102 + outputDir: string, 103 + predicate: LinkPredicate, 104 + queue: Set<string>, 105 + processed: Set<string>, 106 + overwrite: OverwriteMode, 107 + stripHost: boolean, 108 + ): Promise<void> { 109 + const urlWithoutHash = url.href.split("#")[0]; 95 110 if (processed.has(urlWithoutHash)) return; 96 - 111 + 97 112 processed.add(urlWithoutHash); 98 - 113 + 99 114 const response = await fetch(url.href); 100 115 if (!response.ok) { 101 116 return; 102 117 } 103 - 118 + 104 119 const html = await response.text(); 105 - 120 + 106 121 const links = new Set<string>(); 107 122 const assets = new Set<string>(); 108 - 123 + 109 124 const rewriter = 
new HTMLRewriter((_chunk) => {}); 110 - 111 - rewriter.on('a[href]', { 125 + 126 + rewriter.on("a[href]", { 112 127 element(element) { 113 - const href = element.getAttribute('href'); 128 + const href = element.getAttribute("href"); 114 129 if (!href) return; 115 - 130 + 116 131 try { 117 132 const linkUrl = new URL(href, url); 118 - if (predicate(linkUrl, url) && !processed.has(linkUrl.href.split('#')[0])) { 133 + if (predicate(linkUrl, url) && !processed.has(linkUrl.href.split("#")[0])) { 119 134 links.add(linkUrl.href); 120 135 } 121 136 } catch {} 122 - } 137 + }, 123 138 }); 124 - 125 - rewriter.on('img[src]', { 139 + 140 + rewriter.on("img[src]", { 126 141 element(element) { 127 - const src = element.getAttribute('src'); 142 + const src = element.getAttribute("src"); 128 143 if (!src) return; 129 - 144 + 130 145 try { 131 146 const assetUrl = new URL(src, url); 132 147 assets.add(assetUrl.href); 133 148 } catch {} 134 - } 149 + }, 135 150 }); 136 - 137 - rewriter.on('link[href]', { 151 + 152 + rewriter.on("link[href]", { 138 153 element(element) { 139 - const href = element.getAttribute('href'); 154 + const href = element.getAttribute("href"); 140 155 if (!href) return; 141 - 156 + 142 157 try { 143 158 const assetUrl = new URL(href, url); 144 159 assets.add(assetUrl.href); 145 160 } catch {} 146 - } 161 + }, 147 162 }); 148 - 149 - rewriter.on('script[src]', { 163 + 164 + rewriter.on("script[src]", { 150 165 element(element) { 151 - const src = element.getAttribute('src'); 166 + const src = element.getAttribute("src"); 152 167 if (!src) return; 153 - 168 + 154 169 try { 155 170 const assetUrl = new URL(src, url); 156 171 assets.add(assetUrl.href); 157 172 } catch {} 158 - } 173 + }, 159 174 }); 160 - 175 + 161 176 const encoder = new TextEncoder(); 162 177 await rewriter.write(encoder.encode(html)); 163 178 await rewriter.end(); 164 179 rewriter.free(); 165 - 166 - const urlPrefix = url.href.endsWith('/') ? 
url.href.slice(0, -1) : url.href; 180 + 181 + const urlPrefix = url.href.endsWith("/") ? url.href.slice(0, -1) : url.href; 167 182 const hasChildResources = [...links, ...assets].some((resource) => { 168 - return resource.startsWith(urlPrefix + '/'); 183 + return resource.startsWith(urlPrefix + "/"); 169 184 }); 170 - 185 + 171 186 let filePath = url.pathname; 172 - if (filePath.endsWith('/') || filePath === '') { 173 - filePath = filePath + 'index.html'; 187 + if (filePath.endsWith("/") || filePath === "") { 188 + filePath = filePath + "index.html"; 174 189 } else if (hasChildResources) { 175 - filePath = filePath + '/index.html'; 176 - } else if (!filePath.endsWith('.html') && !filePath.endsWith('.htm')) { 177 - filePath = filePath + '.html'; 190 + filePath = filePath + "/index.html"; 191 + } else if (!filePath.endsWith(".html") && !filePath.endsWith(".htm")) { 192 + filePath = filePath + ".html"; 178 193 } 179 - 180 - const outputPath = stripHost ? join(outputDir, filePath) : join(outputDir, url.hostname, filePath); 181 - 194 + 195 + const outputPath = stripHost 196 + ? 
join(outputDir, filePath) 197 + : join(outputDir, url.hostname, filePath); 198 + 182 199 let cached = false; 183 - 200 + 184 201 try { 185 202 await access(outputPath); 186 - 187 - if (overwrite === 'ignore') { 203 + 204 + if (overwrite === "ignore") { 188 205 const fileStats = await stat(outputPath); 189 206 logFileStats({ 190 207 url: url.href, ··· 196 213 }); 197 214 return; 198 215 } 199 - 200 - if (overwrite === 'cache') { 216 + 217 + if (overwrite === "cache") { 201 218 cached = true; 202 219 } else { 203 220 await mkdir(dirname(outputPath), { recursive: true }); ··· 207 224 await mkdir(dirname(outputPath), { recursive: true }); 208 225 await writeFile(outputPath, html); 209 226 } 210 - 227 + 211 228 for (const link of links) { 212 - const linkWithoutHash = link.split('#')[0]; 229 + const linkWithoutHash = link.split("#")[0]; 213 230 if (!queue.has(linkWithoutHash) && !processed.has(linkWithoutHash)) { 214 231 queue.add(linkWithoutHash); 215 232 } 216 233 } 217 - 234 + 218 235 for (const asset of assets) { 219 236 try { 220 237 await downloadAsset(new URL(asset), outputDir, overwrite, stripHost); 221 238 } catch {} 222 239 } 223 - 240 + 224 241 const fileStats = await stat(outputPath); 225 242 logFileStats({ 226 243 url: url.href, ··· 234 251 235 252 const command = define({ 236 253 args: { 237 - urls: { type: 'positional', description: 'URLs to scrape', multiple: true }, 238 - url: { type: 'string', description: 'URL to start scraping' }, 239 - output: { type: 'string', description: 'Output directory' }, 240 - predicate: { type: 'string', description: 'Link predicate: origin or subtree', default: 'subtree' }, 241 - overwrite: { type: 'string', description: 'Overwrite mode: ignore, cache, or update', default: 'cache' }, 242 - 'strip-host': { type: 'boolean', description: 'Remove hostname from output paths' }, 254 + urls: { type: "positional", description: "URLs to scrape", multiple: true }, 255 + url: { type: "string", description: "URL to start scraping" }, 
256 + output: { type: "string", description: "Output directory" }, 257 + predicate: { 258 + type: "string", 259 + description: "Link predicate: origin or subtree", 260 + default: "subtree", 261 + }, 262 + overwrite: { 263 + type: "string", 264 + description: "Overwrite mode: ignore, cache, or update", 265 + default: "cache", 266 + }, 267 + "strip-host": { type: "boolean", description: "Remove hostname from output paths" }, 243 268 }, 244 269 async run(ctx) { 245 - const { url, urls, output, predicate, overwrite, 'strip-host': stripHost } = ctx.values; 270 + const { url, urls, output, predicate, overwrite, "strip-host": stripHost } = ctx.values; 246 271 const urlList = url ? [url] : (urls ?? []); 247 - 272 + 248 273 if (urlList.length === 0) { 249 - console.error('No URLs provided. Use positional arguments or --url.'); 274 + console.error("No URLs provided. Use positional arguments or --url."); 250 275 return; 251 276 } 252 - 253 - const outputDir = resolve(process.cwd(), output ?? '.'); 254 - const linkPredicate = predicate === 'origin' ? originPredicate(new URL(urlList[0])) : subtreePredicate(new URL(urlList[0])); 255 - 277 + 278 + const outputDir = resolve(process.cwd(), output ?? "."); 279 + const linkPredicate = 280 + predicate === "origin" 281 + ? originPredicate(new URL(urlList[0])) 282 + : subtreePredicate(new URL(urlList[0])); 283 + 256 284 const queue = new Set<string>(urlList); 257 285 const processed = new Set<string>(); 258 - 286 + 259 287 while (queue.size > 0) { 260 288 const nextUrl = Array.from(queue)[0]; 261 289 queue.delete(nextUrl); 262 - 263 - await scrapePage(new URL(nextUrl), outputDir, linkPredicate, queue, processed, overwrite as OverwriteMode, stripHost); 290 + 291 + await scrapePage( 292 + new URL(nextUrl), 293 + outputDir, 294 + linkPredicate, 295 + queue, 296 + processed, 297 + overwrite as OverwriteMode, 298 + stripHost, 299 + ); 264 300 } 265 - 301 + 266 302 console.error(`Scraping complete. 
${processed.size} pages processed.`); 267 303 }, 268 304 });

+30

src/decorate/registry.ts

import { HTMLRewriter } from "html-rewriter-wasm";
// Type-only import: `Decorator` and `DecoratorResult` are interfaces and have no
// runtime export, so a value import fails when the file is run with types
// stripped rather than compiled.
import type { Decorator, DecoratorResult } from "./types.ts";

/** Registered decorators, keyed by their `name`. */
const decorators = new Map<string, Decorator>();

/**
 * Adds a decorator to the registry.
 *
 * Registering a second decorator with the same `name` replaces the first.
 */
export function registerDecorator(decorator: Decorator): void {
  decorators.set(decorator.name, decorator);
}

/** Looks up a registered decorator by name; `undefined` when none is registered. */
export function getDecorator(name: string): Decorator | undefined {
  return decorators.get(name);
}

/**
 * Runs every registered decorator over `html` and merges their results.
 *
 * Each decorator gets its own `HTMLRewriter`. The decorator attaches its
 * handlers in `decorate`, the full document is then streamed through the
 * rewriter, and only after the stream has ended are the properties of the
 * decorator's result object copied into the combined result. Later decorators
 * overwrite keys written by earlier ones.
 *
 * @param html - Complete HTML document to inspect.
 * @returns The merged key/value metadata from all decorators.
 */
export async function decorateHtml(html: string): Promise<DecoratorResult> {
  const result: DecoratorResult = {};
  // The input is identical for every decorator, so encode it once.
  const encoded = new TextEncoder().encode(html);

  for (const decorator of decorators.values()) {
    const rewriter = new HTMLRewriter(() => {});
    let decoratorResult: DecoratorResult;

    try {
      decoratorResult = decorator.decorate(rewriter);
      await rewriter.write(encoded);
      await rewriter.end();
    } finally {
      // Release the rewriter even when a handler or the parser throws.
      rewriter.free();
    }

    // Read the result only after parsing has finished so that lazily
    // evaluated properties (getters) observe what the handlers collected.
    Object.assign(result, decoratorResult);
  }

  return result;
}

+10

src/decorate/types.ts

// `HTMLRewriter` is referenced only in a type position here, so import it as a
// type; this keeps the module free of runtime dependencies.
import type { HTMLRewriter } from "html-rewriter-wasm";

/** Free-form metadata produced by a decorator, keyed by metadata name. */
export interface DecoratorResult {
  [key: string]: unknown;
}

/**
 * A named metadata extractor that hooks into an `HTMLRewriter` pass.
 */
export interface Decorator {
  /** Key under which the decorator is stored in the registry. */
  name: string;

  /**
   * Attaches handlers to `rewriter` and returns the object that will carry the
   * extracted metadata.
   *
   * `decorateHtml` in the registry calls this *before* any HTML is written to
   * the rewriter and copies the returned object's properties only after the
   * stream has ended. Values that depend on what the handlers see must
   * therefore be exposed lazily (for example through getters); a plain
   * property captures whatever the value was at call time.
   */
  decorate(rewriter: HTMLRewriter): DecoratorResult;
}

+29

src/decorator/last-indexed.ts

// Both imports are used purely as types; `import type` keeps them out of the
// runtime module graph (the interfaces have no runtime export to bind to).
import type { HTMLRewriter } from "html-rewriter-wasm";
import type { Decorator, DecoratorResult } from "../decorate/types.ts";

/**
 * Extracts the first string `dateModified` found in a page's JSON-LD
 * (`<script type="application/ld+json">`) and reports it as `"last-indexed"`.
 */
export const lastIndexedDecorator: Decorator = {
  name: "last-indexed",

  /**
   * Registers a text handler on `rewriter` and returns a result object whose
   * `"last-indexed"` property is evaluated lazily.
   *
   * @param rewriter - Rewriter that will subsequently be fed the HTML.
   * @returns An object whose `"last-indexed"` getter yields the captured
   *   `dateModified` string, or `undefined` if none has been seen.
   */
  decorate(rewriter: HTMLRewriter): DecoratorResult {
    let lastIndexed: string | undefined;
    // Script bodies arrive as one or more text chunks; collect them until the
    // chunk flagged as the last one in the text node.
    let pending = "";

    rewriter.on('script[type="application/ld+json"]', {
      text(text) {
        pending += text.text;
        if (!text.lastInTextNode) return;

        const content = pending;
        pending = "";
        if (!content) return;

        try {
          const data: unknown = JSON.parse(content);
          // Optional chaining keeps `null` and primitive JSON values harmless.
          const dateModified = (data as { dateModified?: unknown } | null)?.dateModified;

          if (typeof dateModified === "string" && !lastIndexed) {
            lastIndexed = dateModified;
          }
        } catch {
          // Malformed JSON-LD is ignored on purpose: extraction is best-effort.
        }
      },
    });

    return {
      // A getter, not a plain property: this object is created before any HTML
      // has been parsed, so the value must be read when the caller copies it.
      get "last-indexed"(): string | undefined {
        return lastIndexed;
      },
    };
  },
};

Configure Feed

Configure Feed