this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Add prompt rewrite for markdown sections

+188 -24
+2
.env.example
··· 1 + OPENAI_API_KEY= 2 + OPENAI_API_URL=https://api.cloudflare.com/client/v4/accounts/ID/ai/v1
+50
bun.lock
··· 4 4 "": { 5 5 "name": "llms-txt-gen", 6 6 "dependencies": { 7 + "@ai-sdk/openai": "^1.3.22", 7 8 "@mozilla/readability": "^0.6.0", 8 9 "@tsconfig/bun": "^1.0.8", 10 + "ai": "^4.3.16", 11 + "ai-fallback": "^0.1.5", 9 12 "debug": "^4.4.1", 10 13 "happy-dom": "^18.0.1", 11 14 "hast-util-sanitize": "^5.0.2", 12 15 "js-graph-algorithms": "^1.0.18", 13 16 "mdast": "^3.0.0", 17 + "ollama-ai-provider": "^1.2.0", 14 18 "prettier": "^3.5.3", 15 19 "rehype-parse": "^9.0.1", 16 20 "rehype-remark": "^10.0.1", ··· 30 34 }, 31 35 }, 32 36 "packages": { 37 + "@ai-sdk/openai": ["@ai-sdk/openai@1.3.22", "", { "dependencies": { "@ai-sdk/provider": "1.1.3", "@ai-sdk/provider-utils": "2.2.8" }, "peerDependencies": { "zod": "^3.0.0" } }, "sha512-QwA+2EkG0QyjVR+7h6FE7iOu2ivNqAVMm9UJZkVxxTk5OIq5fFJDTEI/zICEMuHImTTXR2JjsL6EirJ28Jc4cw=="], 38 + 39 + "@ai-sdk/provider": ["@ai-sdk/provider@1.1.3", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-qZMxYJ0qqX/RfnuIaab+zp8UAeJn/ygXXAffR5I4N0n1IrvA6qBsjc8hXLmBiMV2zoXlifkacF7sEFnYnjBcqg=="], 40 + 41 + "@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@2.2.8", "", { "dependencies": { "@ai-sdk/provider": "1.1.3", "nanoid": "^3.3.8", "secure-json-parse": "^2.7.0" }, "peerDependencies": { "zod": "^3.23.8" } }, "sha512-fqhG+4sCVv8x7nFzYnFo19ryhAa3w096Kmc3hWxMQfW/TubPOmt3A6tYZhl4mUfQWWQMsuSkLrtjlWuXBVSGQA=="], 42 + 43 + "@ai-sdk/react": ["@ai-sdk/react@1.2.12", "", { "dependencies": { "@ai-sdk/provider-utils": "2.2.8", "@ai-sdk/ui-utils": "1.2.11", "swr": "^2.2.5", "throttleit": "2.1.0" }, "peerDependencies": { "react": "^18 || ^19 || ^19.0.0-rc", "zod": "^3.23.8" }, "optionalPeers": ["zod"] }, "sha512-jK1IZZ22evPZoQW3vlkZ7wvjYGYF+tRBKXtrcolduIkQ/m/sOAVcVeVDUDvh1T91xCnWCdUGCPZg2avZ90mv3g=="], 44 + 45 + "@ai-sdk/ui-utils": ["@ai-sdk/ui-utils@1.2.11", "", { "dependencies": { "@ai-sdk/provider": "1.1.3", "@ai-sdk/provider-utils": "2.2.8", "zod-to-json-schema": "^3.24.1" }, "peerDependencies": { "zod": "^3.23.8" } }, 
"sha512-3zcwCc8ezzFlwp3ZD15wAPjf2Au4s3vAbKsXQVyhxODHcmu0iyPO2Eua6D/vicq/AUm/BAo60r97O6HU+EI0+w=="], 46 + 33 47 "@mozilla/readability": ["@mozilla/readability@0.6.0", "", {}, "sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ=="], 48 + 49 + "@opentelemetry/api": ["@opentelemetry/api@1.9.0", "", {}, "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg=="], 34 50 35 51 "@tsconfig/bun": ["@tsconfig/bun@1.0.8", "", {}, "sha512-JlJaRaS4hBTypxtFe8WhnwV8blf0R+3yehLk8XuyxUYNx6VXsKCjACSCvOYEFUiqlhlBWxtYCn/zRlOb8BzBQg=="], 36 52 37 53 "@types/debug": ["@types/debug@4.1.12", "", { "dependencies": { "@types/ms": "*" } }, "sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ=="], 54 + 55 + "@types/diff-match-patch": ["@types/diff-match-patch@1.0.36", "", {}, "sha512-xFdR6tkm0MWvBfO8xXCSsinYxHcqkQUlcHeSpMC2ukzOb6lwQAfDmW+Qt0AvlGd8HpsS28qKsB+oPeJn9I39jg=="], 38 56 39 57 "@types/hast": ["@types/hast@3.0.4", "", { "dependencies": { "@types/unist": "*" } }, "sha512-WPs+bbQw5aCj+x6laNGWLH3wviHtoCv/P3+otBhbOhJgG8qtpdAMlTCxLtsTWA7LH1Oh/bFCHsBn0TPS5m30EQ=="], 40 58 ··· 50 68 51 69 "@ungap/structured-clone": ["@ungap/structured-clone@1.3.0", "", {}, "sha512-WmoN8qaIAo7WTYWbAZuG8PYEhn5fkz7dZrqTBZ7dtt//lL2Gwms1IcnQ5yHqjDfX8Ft5j4YzDM23f87zBfDe9g=="], 52 70 71 + "ai": ["ai@4.3.16", "", { "dependencies": { "@ai-sdk/provider": "1.1.3", "@ai-sdk/provider-utils": "2.2.8", "@ai-sdk/react": "1.2.12", "@ai-sdk/ui-utils": "1.2.11", "@opentelemetry/api": "1.9.0", "jsondiffpatch": "0.6.0" }, "peerDependencies": { "react": "^18 || ^19 || ^19.0.0-rc", "zod": "^3.23.8" }, "optionalPeers": ["react"] }, "sha512-KUDwlThJ5tr2Vw0A1ZkbDKNME3wzWhuVfAOwIvFUzl1TPVDFAXDFTXio3p+jaKneB+dKNCvFFlolYmmgHttG1g=="], 72 + 73 + "ai-fallback": ["ai-fallback@0.1.5", "", { "dependencies": { "@ai-sdk/provider": "^1", "@ai-sdk/provider-utils": "^2" } }, 
"sha512-/FhTd9SGMEUDYBKbO3ZyfS0CBGglJByMbMRQOGjjDYlxZinFZtn99w1SPh4NZYJWIP5jjoewytfZjp+30QPT1A=="], 74 + 53 75 "bail": ["bail@2.0.2", "", {}, "sha512-0xO6mYd7JB2YesxDKplafRpsiOzPt9V02ddPCLbY1xYGPOX24NTyN50qnUxgCPcSoYMhKpAuBTjQoRZCAkUDRw=="], 54 76 55 77 "ccount": ["ccount@2.0.1", "", {}, "sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg=="], 78 + 79 + "chalk": ["chalk@5.4.1", "", {}, "sha512-zgVZuo2WcZgfUEmsn6eO3kINexW8RAE4maiQ8QNs8CtpPCSyMiYsULR3HQYkm3w8FIA3SberyMJMSldGsW+U3w=="], 56 80 57 81 "character-entities": ["character-entities@2.0.2", "", {}, "sha512-shx7oQ0Awen/BRIdkjkvz54PnEEI/EjwXDSIZp86/KKdbafHh1Df/RYGBhn4hbe2+uKC9FnT5UCEdyPz3ai9hQ=="], 58 82 ··· 70 94 71 95 "devlop": ["devlop@1.1.0", "", { "dependencies": { "dequal": "^2.0.0" } }, "sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA=="], 72 96 97 + "diff-match-patch": ["diff-match-patch@1.0.5", "", {}, "sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw=="], 98 + 73 99 "entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="], 74 100 75 101 "escape-string-regexp": ["escape-string-regexp@5.0.0", "", {}, "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw=="], ··· 113 139 "is-plain-obj": ["is-plain-obj@4.1.0", "", {}, "sha512-+Pgi+vMuUNkJyExiMBt5IlFoMyKnr5zhJ4Uspz58WOhBF5QoIZkFyNHIbBAtHwzVAgk5RtndVNsDRN61/mmDqg=="], 114 140 115 141 "js-graph-algorithms": ["js-graph-algorithms@1.0.18", "", { "bin": { "js-graphs": "./src/jsgraphs.js" } }, "sha512-Gu1wtWzXBzGeye/j9BuyplGHscwqKRZodp/0M1vyBc19RJpblSwKGu099KwwaTx9cRIV+Qupk8xUMfEiGfFqSA=="], 142 + 143 + "json-schema": ["json-schema@0.4.0", "", {}, "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA=="], 144 + 145 + "jsondiffpatch": ["jsondiffpatch@0.6.0", "", { 
"dependencies": { "@types/diff-match-patch": "^1.0.36", "chalk": "^5.3.0", "diff-match-patch": "^1.0.5" }, "bin": { "jsondiffpatch": "bin/jsondiffpatch.js" } }, "sha512-3QItJOXp2AP1uv7waBkao5nCvhEv+QmJAd38Ybq7wNI74Q+BBmnLn4EDKz6yI9xGAIQoUF87qHt+kc1IVxB4zQ=="], 116 146 117 147 "longest-streak": ["longest-streak@3.1.0", "", {}, "sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g=="], 118 148 ··· 206 236 207 237 "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="], 208 238 239 + "nanoid": ["nanoid@3.3.11", "", { "bin": { "nanoid": "bin/nanoid.cjs" } }, "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w=="], 240 + 241 + "ollama-ai-provider": ["ollama-ai-provider@1.2.0", "", { "dependencies": { "@ai-sdk/provider": "^1.0.0", "@ai-sdk/provider-utils": "^2.0.0", "partial-json": "0.1.7" }, "peerDependencies": { "zod": "^3.0.0" }, "optionalPeers": ["zod"] }, "sha512-jTNFruwe3O/ruJeppI/quoOUxG7NA6blG3ZyQj3lei4+NnJo7bi3eIRWqlVpRlu/mbzbFXeJSBuYQWF6pzGKww=="], 242 + 209 243 "parse5": ["parse5@7.3.0", "", { "dependencies": { "entities": "^6.0.0" } }, "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw=="], 210 244 245 + "partial-json": ["partial-json@0.1.7", "", {}, "sha512-Njv/59hHaokb/hRUjce3Hdv12wd60MtM9Z5Olmn+nehe0QDAsRtRbJPvJ0Z91TusF0SuZRIvnM+S4l6EIP8leA=="], 246 + 211 247 "prettier": ["prettier@3.5.3", "", { "bin": { "prettier": "bin/prettier.cjs" } }, "sha512-QQtaxnoDJeAkDvDKWCLiwIXkTgRhwYDEQCghU9Z6q03iyek/rxRh/2lC3HB7P8sWT2xC/y5JDctPLBIGzHKbhw=="], 212 248 213 249 "property-information": ["property-information@7.1.0", "", {}, "sha512-TwEZ+X+yCJmYfL7TPUOcvBZ4QfoT5YenQiJuX//0th53DE6w0xxLEtfK3iyryQFddXuvkIk51EEgrJQ0WJkOmQ=="], 250 + 251 + "react": ["react@19.1.0", "", {}, "sha512-FS+XFBNvn3GTAWq26joslQgWNoFu08F4kl0J4CgdNKADkdSGXQyTCnKteIAJy96Br6YbpEU1LSzV5dYtjMkMDg=="], 214 
252 215 253 "rehype-minify-whitespace": ["rehype-minify-whitespace@6.0.2", "", { "dependencies": { "@types/hast": "^3.0.0", "hast-util-minify-whitespace": "^1.0.0" } }, "sha512-Zk0pyQ06A3Lyxhe9vGtOtzz3Z0+qZ5+7icZ/PL/2x1SHPbKao5oB/g/rlc6BCTajqBb33JcOe71Ye1oFsuYbnw=="], 216 254 ··· 234 272 235 273 "remark-unlink": ["remark-unlink@5.0.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "mdast-squeeze-paragraphs": "^6.0.0", "unist-util-visit": "^5.0.0" } }, "sha512-8NFrI3SecxhOLb734tKaxcU//lNDABabz1I26MGjdlpkUg1I+Fr7lyqL9ckxaCB4kErXD10mScPD7yhCXX4Pfw=="], 236 274 275 + "secure-json-parse": ["secure-json-parse@2.7.0", "", {}, "sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw=="], 276 + 237 277 "space-separated-tokens": ["space-separated-tokens@2.0.2", "", {}, "sha512-PEGlAwrG8yXGXRjW32fGbg66JAlOAwbObuqVoJpv/mRgoWDQfgH1wDPvtzWyUSNAXBGSk8h755YDbbcEy3SH2Q=="], 238 278 239 279 "stringify-entities": ["stringify-entities@4.0.4", "", { "dependencies": { "character-entities-html4": "^2.0.0", "character-entities-legacy": "^3.0.0" } }, "sha512-IwfBptatlO+QCJUo19AqvrPNqlVMpW9YEL2LIVY+Rpv2qsjCGxaDLNRgeGsQWJhfItebuJhsGSLjaBbNSQ+ieg=="], 280 + 281 + "swr": ["swr@2.3.3", "", { "dependencies": { "dequal": "^2.0.3", "use-sync-external-store": "^1.4.0" }, "peerDependencies": { "react": "^16.11.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" } }, "sha512-dshNvs3ExOqtZ6kJBaAsabhPdHyeY4P2cKwRCniDVifBMoG/SVI7tfLWqPXriVspf2Rg4tPzXJTnwaihIeFw2A=="], 282 + 283 + "throttleit": ["throttleit@2.1.0", "", {}, "sha512-nt6AMGKW1p/70DF/hGBdJB57B8Tspmbp5gfJ8ilhLnt7kkr2ye7hzD6NVG8GGErk2HWF34igrL2CXmNIkzKqKw=="], 240 284 241 285 "trim-lines": ["trim-lines@3.0.1", "", {}, "sha512-kRj8B+YHZCc9kQYdWfJB2/oUl9rA99qbowYYBtr4ui4mZyAQ2JpvVBd/6U2YloATfqBhBTSMhTpgBHtU0Mf3Rg=="], 242 286 ··· 264 308 265 309 "urlpattern-polyfill": ["urlpattern-polyfill@10.1.0", "", {}, "sha512-IGjKp/o0NL3Bso1PymYURCJxMPNAf/ILOpendP9f5B6e1rTJgdgiOvgfoT8VxCAdY+Wisb9uhGaJJf3yZ2V9nw=="], 266 310 311 + 
"use-sync-external-store": ["use-sync-external-store@1.5.0", "", { "peerDependencies": { "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" } }, "sha512-Rb46I4cGGVBmjamjphe8L/UnvJD+uPPtTkNvX5mZgqdbavhI4EbgIWJiIHXJ8bc/i9EQGPRh4DwEURJ552Do0A=="], 312 + 267 313 "vfile": ["vfile@6.0.3", "", { "dependencies": { "@types/unist": "^3.0.0", "vfile-message": "^4.0.0" } }, "sha512-KzIbH/9tXat2u30jf+smMwFCsno4wHVdNmzFyL+T/L3UGqqk6JKfVqOFOZEpZSHADH1k40ab6NUIXZq422ov3Q=="], 268 314 269 315 "vfile-location": ["vfile-location@5.0.3", "", { "dependencies": { "@types/unist": "^3.0.0", "vfile": "^6.0.0" } }, "sha512-5yXvWDEgqeiYiBe1lbxYF7UMAIm/IcopxMHrMQDq3nvKcjPKIhZklUKL+AE7J7uApI4kwe2snsK+eI6UTj9EHg=="], ··· 273 319 "web-namespaces": ["web-namespaces@2.0.1", "", {}, "sha512-bKr1DkiNa2krS7qxNtdrtHAmzuYGFQLiQ13TsorsdT6ULTkPLKuu5+GsFpDlg6JFjUTwX2DyhMPG2be8uPrqsQ=="], 274 320 275 321 "whatwg-mimetype": ["whatwg-mimetype@3.0.0", "", {}, "sha512-nt+N2dzIutVRxARx1nghPKGv1xHikU7HKdfafKkLNLindmPU/ch3U31NOCGGA/dmPcmb1VlofO0vnKAcsm0o/Q=="], 322 + 323 + "zod": ["zod@3.25.61", "", {}, "sha512-fzfJgUw78LTNnHujj9re1Ov/JJQkRZZGDMcYqSx7Hp4rPOkKywaFHq0S6GoHeXs0wGNE/sIOutkXgnwzrVOGCQ=="], 324 + 325 + "zod-to-json-schema": ["zod-to-json-schema@3.24.5", "", { "peerDependencies": { "zod": "^3.24.1" } }, "sha512-/AuWwMP+YqiPbsJx5D6TfgRTc4kTLjsh5SOcd4bLsfUg2RcEXrFMJl1DGgdHy2aCfsIA/cr/1JM0xcB2GZji8g=="], 276 326 277 327 "zwitch": ["zwitch@2.0.4", "", {}, "sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A=="], 278 328 }
+4
package.json
··· 7 7 "start": "DEBUG=* bun src/index.ts" 8 8 }, 9 9 "dependencies": { 10 + "@ai-sdk/openai": "^1.3.22", 10 11 "@mozilla/readability": "^0.6.0", 11 12 "@tsconfig/bun": "^1.0.8", 13 + "ai": "^4.3.16", 14 + "ai-fallback": "^0.1.5", 12 15 "debug": "^4.4.1", 13 16 "happy-dom": "^18.0.1", 14 17 "hast-util-sanitize": "^5.0.2", 15 18 "js-graph-algorithms": "^1.0.18", 16 19 "mdast": "^3.0.0", 20 + "ollama-ai-provider": "^1.2.0", 17 21 "prettier": "^3.5.3", 18 22 "rehype-parse": "^9.0.1", 19 23 "rehype-remark": "^10.0.1",
+3 -10
src/fetch.ts
··· 1 1 import { debug } from 'debug'; 2 2 import * as fs from 'node:fs/promises'; 3 3 import * as path from 'node:path'; 4 + import { makeCacheFileHelper } from './path'; 4 5 5 6 const log = debug('llms-txt-gen.fetch'); 6 7 7 - const cacheDir = path.join(process.cwd(), '.cache'); 8 + const cacheDir = path.join(process.cwd(), '.cache/fetch'); 8 9 await fs.mkdir(cacheDir, { recursive: true }); 9 - 10 - const getCacheFile = async (url: URL) => { 11 - const { hostname, pathname } = url; 12 - const name = pathname.split('/').filter(Boolean).join('_'); 13 - const targetDir = path.join(cacheDir, hostname); 14 - const basename = path.basename(name, path.extname(name)); 15 - await fs.mkdir(targetDir, { recursive: true }); 16 - return path.join(targetDir, `${basename}.html`); 17 - }; 10 + const getCacheFile = makeCacheFileHelper(cacheDir); 18 11 19 12 export async function fetchHtml(url: URL): Promise<string | null> { 20 13 const cacheFile = await getCacheFile(url);
+7 -4
src/index.ts
/**
 * Crawls a site, collects the markdown content of every crawled page,
 * concatenates and formats it, and writes the result to
 * `llms-full-<site.name>.txt`.
 *
 * @param site - The site to crawl; `site.name` is used for logging and for
 *   the name of the generated file.
 */
async function generate(site: Site) {
  log('crawl', site.name);
  const pages = await crawl(site);

  // Pages are awaited one at a time and pages without content are skipped.
  const contents: string[] = [];
  for (const page of pages) {
    const content = await page.getContent();
    if (content) contents.push(content);
  }

  // This local must NOT be called `output`: that name would shadow the outer
  // `output` binding used as the destination directory in `path.join` below
  // (declared outside this hunk; presumably the output directory), and the
  // concatenated markdown would then be used as a directory path.
  const markdown = await concatMarkdown(contents);
  const formatted = await formatMarkdown(markdown);
  const file = path.join(output, `llms-full-${site.name}.txt`);
  await fs.writeFile(file, formatted, 'utf-8');
}
+34 -8
src/page.ts
··· 1 1 import { debug } from 'debug'; 2 + import * as fs from 'node:fs/promises'; 3 + import * as path from 'node:path'; 2 4 import { URLPattern } from 'urlpattern-polyfill/urlpattern'; 3 5 import { WeightedDiGraph, KruskalMST, Edge } from 'js-graph-algorithms'; 4 6 import { extractContent, extractLinks, parseBody } from "./dom"; 5 7 import { fetchHtml } from "./fetch"; 6 8 import { htmlToMarkdown, sanitizeHtml } from "./unified"; 9 + import { rewriteMarkdown } from './rewrite'; 10 + import { makeCacheFileHelper } from './path'; 7 11 8 12 const log = debug('llms-txt-gen.graph'); 9 13 ··· 11 15 baseURL: URL | string; 12 16 include?: string[]; 13 17 exclude?: string[]; 18 + } 19 + 20 + const cacheDir = path.join(process.cwd(), '.cache/page'); 21 + await fs.mkdir(cacheDir, { recursive: true }); 22 + const getCacheFile = makeCacheFileHelper(cacheDir, '.md'); 23 + 24 + async function extractContentToMarkdown(url: URL, html: string): Promise<string | null> { 25 + const cacheFile = await getCacheFile(url); 26 + try { 27 + const content = await fs.readFile(cacheFile, 'utf-8'); 28 + if (content) { 29 + log('extracted output from cache', url.pathname); 30 + return content; 31 + } 32 + } catch {} 33 + log('extracting content', url.pathname); 34 + const doc = parseBody(url, html); 35 + const output = extractContent(doc); 36 + if (output) { 37 + const markdown = await htmlToMarkdown(output); 38 + await fs.writeFile(cacheFile, markdown, 'utf-8'); 39 + return markdown; 40 + } else { 41 + return null; 42 + } 14 43 } 15 44 16 45 class Root { ··· 106 135 if (this.#content !== null || !this.isPage) 107 136 return this.#content; 108 137 const html = await this.getHTML(); 109 - if (!html) return null; 110 - const doc = parseBody(this.url, html); 111 - const content = extractContent(doc); 112 - if (content) { 113 - return (this.#content = await htmlToMarkdown(content)); 114 - } else { 115 - return (this.#content = null); 116 - } 138 + if (!html) return (this.#content = null); 139 + 
const markdown = await extractContentToMarkdown(this.url, html); 140 + if (!markdown) return (this.#content = null); 141 + const rewritten = await rewriteMarkdown(this.url, markdown); 142 + return (this.#content = rewritten); 117 143 } 118 144 } 119 145
+11
src/path.ts
import * as fs from 'node:fs/promises';
import * as path from 'node:path';

/**
 * Creates a helper that maps a URL to a cache file path below `baseDir`.
 *
 * The returned helper places files in a per-hostname directory (created on
 * demand) and derives the file name from the URL's path segments joined with
 * `_`, with any trailing extension replaced by `ext`.
 *
 * @param baseDir - Root directory of the cache.
 * @param ext - Extension (including the dot) given to every cache file.
 * @returns An async function resolving to the cache file path for a URL.
 */
export const makeCacheFileHelper =
  (baseDir: string, ext = '.html') =>
  async (url: URL): Promise<string> => {
    const { hostname, pathname } = url;
    const name = pathname.split('/').filter(Boolean).join('_');
    const targetDir = path.join(baseDir, hostname);
    // A root URL ("/") has no path segments; fall back to "index" so the
    // cache file is not an extension-only dotfile such as ".html".
    // `||` is intentional here: the empty string is exactly the case to replace.
    const basename = path.basename(name, path.extname(name)) || 'index';
    await fs.mkdir(targetDir, { recursive: true });
    return path.join(targetDir, `${basename}${ext}`);
  };
+66
src/rewrite.ts
import { debug } from 'debug';
import { createFallback } from 'ai-fallback';
import { generateText } from 'ai';
import { createOpenAI } from '@ai-sdk/openai';
import { createOllama } from 'ollama-ai-provider';
import * as fs from 'node:fs/promises';
import * as path from 'node:path';

import { makeCacheFileHelper } from './path';

const log = debug('llms-txt-gen.rewrite');

// Rewritten output is cached on disk under `.cache/rewrite` as `.txt` files.
const cacheDir = path.join(process.cwd(), '.cache/rewrite');
await fs.mkdir(cacheDir, { recursive: true });
const getCacheFile = makeCacheFileHelper(cacheDir, '.txt');

// Both variables are required at module load time.
const { OPENAI_API_KEY, OPENAI_API_URL } = process.env;
if (!OPENAI_API_KEY) {
  throw new Error('Missing OPENAI_API_KEY env var');
}
if (!OPENAI_API_URL) {
  throw new Error('Missing OPENAI_API_URL env var');
}

const SYSTEM_PROMPT = `
Reformat markdown content you're given into an llms-full.txt file, also in markdown format
- Where the format isn't easily understandable by AI, reformat it faithfully to make it processable
- Reformat for an AI and paraphrase where necessary, but don't add interpretations
- Preserve code snippets and keep them in TypeScript or TypeScript typings format
- Avoid using emphasis or excessive markdown syntax, but keep code snippets where they are
- Don't mention other content, pages, or external content (Remove sentences such as "Refer to", "Read more")
- When encountering a markdown table, ensure that you don't output a separate legend, and keep all relevant information in the table
- Don't use any knowledge you may have on the subject. Only output what you're given.
`;

// Provider for the OpenAI-compatible endpoint configured through the environment.
const ai = createOpenAI({
  apiKey: OPENAI_API_KEY,
  baseURL: OPENAI_API_URL,
});

// Provider for an Ollama server on localhost.
const ollama = createOllama({
  baseURL: 'http://localhost:11434/api',
});

/**
 * Reads a cache file, treating any read failure as a cache miss.
 */
async function readCachedText(file: string): Promise<string | null> {
  try {
    return await fs.readFile(file, 'utf-8');
  } catch {
    return null;
  }
}

/**
 * Rewrites a page's markdown with a language model, using an on-disk cache.
 *
 * A non-empty cache file for `url` is returned as-is. Otherwise the markdown
 * is sent to the model together with the system prompt, and the generated
 * text is written to the cache file before being returned.
 *
 * @param url - Page URL; selects the cache file and is used in log output.
 * @param input - Markdown content to rewrite.
 * @returns The cached or newly generated text.
 */
export async function rewriteMarkdown(url: URL, input: string) {
  const cacheFile = await getCacheFile(url);

  const cached = await readCachedText(cacheFile);
  if (cached) {
    log('prompt output from cache', url.pathname);
    return cached;
  }

  log('prompting to rewrite', url.pathname);
  // A fresh fallback model is built per call; presumably the listed models
  // are tried in order (confirm against the ai-fallback documentation).
  const model = createFallback({
    models: [
      ollama('gemma:7b'),
      ai('@hf/google/gemma-7b-it'),
    ],
    onError(error, modelId) {
      log(`error using model ${modelId}`, error);
    },
  });
  const result = await generateText({
    model,
    system: SYSTEM_PROMPT.trim(),
    prompt: input,
  });

  await fs.writeFile(cacheFile, result.text, 'utf-8');
  return result.text;
}
+11 -2
src/unified.ts
··· 58 58 parent.children.splice(index, 1); 59 59 if (node.children.length > 1 || !child || child.type !== 'text') 60 60 return; 61 - switch (child.value.trim()) { 61 + const value = child.value.trim(); 62 + switch (value) { 62 63 case 'Example': 63 64 case 'Remarks': 64 65 case 'Note': ··· 71 72 } else if (node.type === 'text') { 72 73 if (!parent || parent.type !== 'paragraph' || parent.children.length > 1) 73 74 return; 74 - switch (node.value.trim()) { 75 + const value = node.value.trim(); 76 + if ( 77 + value.startsWith('Last updated on ') || 78 + value.startsWith('Copyright ') 79 + ) { 80 + parent.children.splice(index, 1); 81 + return; 82 + } 83 + switch (value) { 75 84 case 'Loading...': 76 85 case 'Caution': 77 86 case 'tsx':