Sync articles marked toread in kipclip to Crosspoint Reader (Xteink X4)
5
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 133 lines 3.6 kB view raw
import { parseHTML } from "npm:linkedom@0.18.9";

/** XHTML void elements that must be self-closing */
const VOID_ELEMENTS = new Set([
  "area",
  "base",
  "br",
  "col",
  "embed",
  "hr",
  "img",
  "input",
  "link",
  "meta",
  "param",
  "source",
  "track",
  "wbr",
]);

/** Elements removed outright, together with their content. */
const STRIPPED_TAGS = ["script", "style", "iframe", "noscript"];

/**
 * Matches an opening (or already self-closed) tag in serialized markup.
 *
 * Group 1 is the tag name, group 2 the raw attribute text. Quoted attribute
 * values are consumed as a unit so that a `>` inside a value (for example
 * `alt="a > b"`) does not end the tag early. Optional whitespace and an
 * optional `/` before the closing `>` are left out of group 2.
 */
const OPEN_TAG = /<([\w-]+)((?:"[^"]*"|'[^']*'|[^>"'])*?)\s*\/?\s*>/g;

/**
 * Matches the `src` attribute of an `<img>` tag.
 *
 * `src` must be preceded by whitespace so that `data-src` and similar
 * attributes are not mistaken for it. Group 1 is the opening quote and
 * group 2 the value up to the matching closing quote.
 */
const IMG_SRC = /<img\b(?:"[^"]*"|'[^']*'|[^>"'])*?\ssrc\s*=\s*(["'])(.*?)\1/gi;

/** True when `src` is an absolute http(s) URL. */
function isHttpUrl(src: string): boolean {
  return src.startsWith("http://") || src.startsWith("https://");
}

/** Escapes every regular-expression metacharacter in `text`. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Sanitize Readability HTML output into valid EPUB XHTML.
 *
 * Fixes:
 * - Strips <script>, <style>, <iframe> and <noscript> elements
 * - Unwraps <picture> to just <img> (keeps fallback img, drops <source>)
 * - Removes any remaining <source> elements
 * - Optionally removes http(s) images and the empty <figure>s they leave behind
 * - Removes data-* attributes (bloat)
 * - Ensures void elements are self-closing
 * - Rewrites `&nbsp;` as `&#160;` (the named form is not declared in XML)
 * - Escapes unescaped & characters
 *
 * @param html - HTML fragment; it is wrapped in a document skeleton for parsing.
 * @param options - `stripRemoteImages`: drop `<img>` elements whose `src`
 *   starts with `http://` or `https://`.
 * @returns The sanitized body markup as a string.
 */
export function sanitizeForXhtml(
  html: string,
  options?: { stripRemoteImages?: boolean },
): string {
  const { document } = parseHTML(
    `<!DOCTYPE html><html><body>${html}</body></html>`,
  );

  // Strip dangerous/useless elements. Query results are copied into arrays
  // before mutating so removal cannot disturb the iteration.
  for (const tag of STRIPPED_TAGS) {
    for (const el of [...document.querySelectorAll(tag)]) {
      el.remove();
    }
  }

  // Unwrap <picture> → keep <img>, drop <source>
  for (const picture of [...document.querySelectorAll("picture")]) {
    const img = picture.querySelector("img");
    if (img) {
      picture.replaceWith(img);
    } else {
      picture.remove();
    }
  }

  // Remove <source> elements that may be orphaned
  for (const source of [...document.querySelectorAll("source")]) {
    source.remove();
  }

  // Optionally strip remote images (for EPUB without embedded images)
  if (options?.stripRemoteImages) {
    for (const img of [...document.querySelectorAll("img")]) {
      if (isHttpUrl(img.getAttribute("src") || "")) {
        img.remove();
      }
    }
    // Remove empty <figure> elements left behind after image removal
    for (const figure of [...document.querySelectorAll("figure")]) {
      if (!figure.querySelector("img") && figure.textContent?.trim() === "") {
        figure.remove();
      }
    }
  }

  // Strip data-* attributes from all elements
  for (const el of [...document.querySelectorAll("*")]) {
    for (const attr of [...el.attributes]) {
      if (attr.name.startsWith("data-")) {
        el.removeAttribute(attr.name);
      }
    }
  }

  // Serialize body innerHTML
  let xhtml = document.body.innerHTML;

  // Fix void elements: ensure self-closing (e.g., <img src="..."> → <img src="..." />).
  // Non-void tags are returned untouched.
  xhtml = xhtml.replace(OPEN_TAG, (match: string, tag: string, attrs: string) =>
    VOID_ELEMENTS.has(tag.toLowerCase()) ? `<${tag}${attrs} />` : match
  );

  // XML predefines only amp, lt, gt, quot and apos. `&nbsp;` would pass the
  // entity check below yet is an undeclared entity in XHTML, so rewrite it as
  // a numeric reference first.
  // NOTE(review): other HTML-only named entities are still passed through
  // unchanged — confirm the serializer never emits them.
  xhtml = xhtml.replace(/&nbsp;/g, "&#160;");

  // Fix unescaped & that aren't part of entities (& not followed by #word;)
  xhtml = xhtml.replace(
    /&(?!(?:#\d+|#x[\da-fA-F]+|[a-zA-Z]\w{0,30});)/g,
    "&amp;",
  );

  return xhtml;
}

/**
 * Extract all remote image URLs from HTML content.
 *
 * Only the `src` attribute of `<img>` tags is considered, and only values
 * starting with `http://` or `https://` are returned. Values are returned
 * exactly as they appear in the markup (character references such as `&amp;`
 * are not decoded), so they can be used as keys for `replaceImageUrls`.
 *
 * @param html - Markup to scan.
 * @returns De-duplicated URLs in order of first appearance.
 */
export function extractImageUrls(html: string): string[] {
  const urls = new Set<string>();
  for (const match of html.matchAll(IMG_SRC)) {
    const src = match[2] ?? "";
    if (isHttpUrl(src)) {
      urls.add(src);
    }
  }
  return [...urls];
}

/**
 * Replace image URLs in HTML with local EPUB paths.
 *
 * Every occurrence of each key of `urlToPath` in `html` is replaced by its
 * mapped path. All URLs are substituted in a single pass, longest first, so a
 * URL that is a prefix of another (`a.png` vs `a.png?w=2`) cannot corrupt the
 * longer one, and an inserted path is never rewritten by a later entry.
 * Paths are inserted literally: `$` sequences in a path have no special
 * meaning. Empty-string keys are ignored.
 *
 * @param html - Markup containing the URLs.
 * @param urlToPath - Map from URL (as it appears in `html`) to local path.
 * @returns The markup with all mapped URLs replaced.
 */
export function replaceImageUrls(
  html: string,
  urlToPath: Map<string, string>,
): string {
  const urls = [...urlToPath.keys()]
    .filter((url) => url !== "")
    .sort((a, b) => b.length - a.length);
  if (urls.length === 0) {
    return html;
  }

  // Alternation order matters: the regex engine takes the first alternative
  // that matches at a position, so longer URLs must come first.
  const anyUrl = new RegExp(urls.map(escapeRegExp).join("|"), "g");
  return html.replace(anyUrl, (hit: string) => urlToPath.get(hit) ?? hit);
}