Sync articles marked toread in kipclip to Crosspoint Reader (Xteink X4)
1import { parseHTML } from "npm:linkedom@0.18.9";
2
/**
 * Tag names of HTML void elements. These never take content or a closing
 * tag, so XHTML requires them in self-closing form (e.g. `<br />`).
 */
const VOID_ELEMENTS = new Set<string>([
  "area", "base", "br", "col", "embed", "hr", "img",
  "input", "link", "meta", "param", "source", "track", "wbr",
]);
20
/**
 * Sanitize Readability HTML output into EPUB-friendly XHTML.
 *
 * The fragment is parsed into a DOM, cleaned there, serialized back to a
 * string, and then patched textually:
 * - Removes <script>, <style>, <iframe>, and <noscript> elements
 * - Unwraps <picture> to its fallback <img> (drops it when there is none)
 * - Removes any remaining <source> elements
 * - Optionally removes <img> elements whose src is http(s), plus any
 *   <figure> left with neither an image nor text
 * - Removes data-* attributes (bloat)
 * - Rewrites void elements in self-closing form
 * - Escapes `&` characters that do not begin an entity reference
 *
 * @param html - HTML fragment (body content) to sanitize.
 * @param options - `stripRemoteImages`: when true, remove images whose `src`
 *   starts with `http://` or `https://`.
 * @returns The sanitized body markup.
 */
export function sanitizeForXhtml(
  html: string,
  options?: { stripRemoteImages?: boolean },
): string {
  const { document } = parseHTML(
    `<!DOCTYPE html><html><body>${html}</body></html>`,
  );

  // Strip dangerous/useless elements
  for (const tag of ["script", "style", "iframe", "noscript"]) {
    for (const el of [...document.querySelectorAll(tag)]) {
      el.remove();
    }
  }

  // Unwrap <picture> → keep <img>, drop <source>
  for (const picture of [...document.querySelectorAll("picture")]) {
    const img = picture.querySelector("img");
    if (img) {
      picture.replaceWith(img);
    } else {
      picture.remove();
    }
  }

  // Remove <source> elements that may be orphaned
  for (const source of [...document.querySelectorAll("source")]) {
    source.remove();
  }

  // Optionally strip remote images (for EPUB without embedded images)
  if (options?.stripRemoteImages) {
    for (const img of [...document.querySelectorAll("img")]) {
      const src = img.getAttribute("src") || "";
      if (src.startsWith("http://") || src.startsWith("https://")) {
        img.remove();
      }
    }
    // Remove empty <figure> elements left behind after image removal
    for (const figure of [...document.querySelectorAll("figure")]) {
      if (!figure.querySelector("img") && figure.textContent?.trim() === "") {
        figure.remove();
      }
    }
  }

  // Strip data-* attributes from all elements. The attribute list is copied
  // first because removing attributes while iterating the live list is unsafe.
  for (const el of [...document.querySelectorAll("*")]) {
    for (const attr of [...el.attributes]) {
      if (attr.name.startsWith("data-")) {
        el.removeAttribute(attr.name);
      }
    }
  }

  // Serialize body innerHTML
  let xhtml = document.body.innerHTML;

  // Fix void elements: ensure self-closing (<img src="..."> → <img src="..." />).
  // Quoted attribute values are consumed as whole tokens so a `>` inside a
  // value does not end the tag early, and an existing ` /` is not captured
  // into the attributes (which would produce a double space before `/>`).
  xhtml = xhtml.replace(
    /<([\w-]+)((?:"[^"]*"|'[^']*'|[^>"'])*?)\s*\/?\s*>/g,
    (match: string, tag: string, attrs: string) =>
      VOID_ELEMENTS.has(tag.toLowerCase()) ? `<${tag}${attrs} />` : match,
  );

  // Escape every & that does not start an entity reference (decimal, hex, or
  // named). A bare & is not well-formed XML and makes EPUB readers reject the
  // document.
  xhtml = xhtml.replace(
    /&(?!(?:#\d+|#x[\da-fA-F]+|[a-zA-Z]\w{0,30});)/g,
    "&amp;",
  );

  return xhtml;
}
103
/**
 * Collect the remote image URLs referenced by `<img>` tags in an HTML string.
 *
 * Only sources starting with `http://` or `https://` are returned; relative
 * paths, `data:` URIs, and other schemes are skipped. URLs are returned
 * exactly as written in the markup (no entity decoding), so they can be used
 * as literal search keys against the same HTML.
 *
 * @param html - HTML or XHTML markup to scan.
 * @returns Unique URLs in order of first appearance.
 */
export function extractImageUrls(html: string): string[] {
  // The `\s` before `src` stops look-alike attributes such as `data-src` from
  // matching. The back-reference requires the closing quote to be the same
  // character as the opening one, so a URL containing the other quote
  // character is not truncated.
  const imgSrcPattern = /<img\b[^>]*?\ssrc\s*=\s*(["'])(.*?)\1/gi;

  // A Set preserves insertion order, which gives de-duplication for free.
  const found = new Set<string>();
  for (const match of html.matchAll(imgSrcPattern)) {
    const src = match[2] ?? "";
    if (src.startsWith("http://") || src.startsWith("https://")) {
      found.add(src);
    }
  }
  return [...found];
}
120
/**
 * Replace image URLs in HTML with local EPUB paths.
 *
 * Every literal occurrence of a key of `urlToPath` in `html` is replaced by
 * its mapped path, in a single pass over the input. When one URL is a prefix
 * of another (e.g. the same image with and without a query string), the
 * longer URL takes precedence. Replacement paths are inserted verbatim;
 * `$` sequences in them are not treated as replacement patterns.
 *
 * @param html - Markup containing the URLs to rewrite.
 * @param urlToPath - Map from original URL to local path. Empty-string keys
 *   are ignored.
 * @returns The markup with all mapped URLs replaced.
 */
export function replaceImageUrls(
  html: string,
  urlToPath: Map<string, string>,
): string {
  // Longest first: regex alternation picks the first alternative that matches
  // at a position, so this ordering makes a longer URL beat its own prefix.
  const urls = [...urlToPath.keys()]
    .filter((url) => url.length > 0)
    .sort((a, b) => b.length - a.length);
  if (urls.length === 0) {
    return html;
  }

  // Escape regex metacharacters so each URL is matched literally.
  const literalPatterns = urls.map((url) =>
    url.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
  );
  const anyUrl = new RegExp(literalPatterns.join("|"), "g");

  // A function replacer inserts the path as-is, with no `$` interpretation.
  return html.replace(anyUrl, (found) => urlToPath.get(found) ?? found);
}