backend/html-sanitizer.ts at main · tijs.org/crosspoint-articles

tijs.org / crosspoint-articles
fork
Sync articles marked toread in kipclip to Crosspoint Reader (Xteink X4)
fork
crosspoint-articles / backend / html-sanitizer.ts
at main 133 lines 3.6 kB view raw
wrap content
Tijs Teulings Initial commit: crosspoint articles backend 3w ago
3a9df645
  1import { parseHTML } from "npm:linkedom@0.18.9";
  2
  3/** XHTML void elements that must be self-closing */
  4const VOID_ELEMENTS = new Set([
  5  "area",
  6  "base",
  7  "br",
  8  "col",
  9  "embed",
 10  "hr",
 11  "img",
 12  "input",
 13  "link",
 14  "meta",
 15  "param",
 16  "source",
 17  "track",
 18  "wbr",
 19]);
 20
 21/**
 22 * Sanitize Readability HTML output into valid EPUB XHTML.
 23 *
 24 * Fixes:
 25 * - Unwraps <picture> to just <img> (keeps fallback img, drops <source>)
 26 * - Ensures void elements are self-closing
 27 * - Escapes unescaped & characters
 28 * - Removes data-* attributes (bloat)
 29 * - Strips <script>, <style>, <iframe> elements
 30 */
 31export function sanitizeForXhtml(
 32  html: string,
 33  options?: { stripRemoteImages?: boolean },
 34): string {
 35  const { document } = parseHTML(`<!DOCTYPE html><html><body>${html}</body></html>`);
 36
 37  // Strip dangerous/useless elements
 38  for (const tag of ["script", "style", "iframe", "noscript"]) {
 39    for (const el of [...document.querySelectorAll(tag)]) {
 40      el.remove();
 41    }
 42  }
 43
 44  // Unwrap <picture> → keep <img>, drop <source>
 45  for (const picture of [...document.querySelectorAll("picture")]) {
 46    const img = picture.querySelector("img");
 47    if (img) {
 48      picture.replaceWith(img);
 49    } else {
 50      picture.remove();
 51    }
 52  }
 53
 54  // Remove <source> elements that may be orphaned
 55  for (const source of [...document.querySelectorAll("source")]) {
 56    source.remove();
 57  }
 58
 59  // Optionally strip remote images (for EPUB without embedded images)
 60  if (options?.stripRemoteImages) {
 61    for (const img of [...document.querySelectorAll("img")]) {
 62      const src = img.getAttribute("src") || "";
 63      if (src.startsWith("http://") || src.startsWith("https://")) {
 64        img.remove();
 65      }
 66    }
 67    // Remove empty <figure> elements left behind after image removal
 68    for (const figure of [...document.querySelectorAll("figure")]) {
 69      if (!figure.querySelector("img") && figure.textContent?.trim() === "") {
 70        figure.remove();
 71      }
 72    }
 73  }
 74
 75  // Strip data-* attributes from all elements
 76  for (const el of [...document.querySelectorAll("*")]) {
 77    const attrs = [...el.attributes];
 78    for (const attr of attrs) {
 79      if (attr.name.startsWith("data-")) {
 80        el.removeAttribute(attr.name);
 81      }
 82    }
 83  }
 84
 85  // Serialize body innerHTML
 86  let xhtml = document.body.innerHTML;
 87
 88  // Fix void elements: ensure self-closing (e.g., <img src="..."> → <img src="..." />)
 89  xhtml = xhtml.replace(/<([\w-]+)([^>]*?)\/?\s*>/g, (match, tag, attrs) => {
 90    const tagLower = tag.toLowerCase();
 91    if (VOID_ELEMENTS.has(tagLower)) {
 92      // Ensure self-closing
 93      return `<${tag}${attrs} />`;
 94    }
 95    return match;
 96  });
 97
 98  // Fix unescaped & that aren't part of entities (& not followed by #word;)
 99  xhtml = xhtml.replace(/&(?!(?:#\d+|#x[\da-fA-F]+|[a-zA-Z]\w{0,30});)/g, "&amp;");
100
101  return xhtml;
102}
103
/**
 * Extract the remote image URLs referenced by <img> tags in HTML content.
 *
 * Only the first real `src` attribute of each tag is read: attributes whose
 * name merely ends in "src" (such as `data-src`) and text inside other quoted
 * attribute values are ignored. The value must be wrapped in double or single
 * quotes and may contain the other quote character.
 *
 * @param html - HTML markup to scan.
 * @returns The unique URLs starting with http:// or https://, in order of
 *   first appearance.
 */
export function extractImageUrls(html: string): string[] {
  // Prefix: quoted values are skipped as a unit so a "src=" or ">" inside them
  // cannot be mistaken for markup. Then a whitespace-delimited `src` attribute
  // with its value captured in group 1 (double-quoted) or 2 (single-quoted).
  const imgSrcPattern =
    /<img\b(?:"[^"]*"|'[^']*'|[^>"'])*?\ssrc\s*=\s*(?:"([^"]*)"|'([^']*)')/gi;

  // A Set keeps insertion order, so this deduplicates without reordering.
  const urls = new Set<string>();
  for (const match of html.matchAll(imgSrcPattern)) {
    const src = match[1] ?? match[2] ?? "";
    if (src.startsWith("http://") || src.startsWith("https://")) {
      urls.add(src);
    }
  }
  return [...urls];
}
120
/**
 * Replace image URLs in HTML with local EPUB paths.
 *
 * All URLs are substituted in a single pass over the input, trying longer
 * URLs first. This means a URL that is a prefix of another mapped URL cannot
 * corrupt the longer one, the result does not depend on the map's insertion
 * order, and text produced by one replacement is never replaced again.
 * Local paths are inserted literally (sequences such as `$&` are not
 * interpreted). Empty-string keys are ignored.
 *
 * @param html - Markup in which to substitute URLs.
 * @param urlToPath - Map from original URL to the path that should replace it.
 * @returns The markup with every occurrence of each mapped URL replaced.
 */
export function replaceImageUrls(
  html: string,
  urlToPath: Map<string, string>,
): string {
  // Longest first: regex alternation takes the first alternative that matches.
  const urls = [...urlToPath.keys()]
    .filter((url) => url.length > 0)
    .sort((a, b) => b.length - a.length);
  if (urls.length === 0) {
    return html;
  }

  const escapeForRegExp = (text: string): string =>
    text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const anyUrl = new RegExp(urls.map(escapeForRegExp).join("|"), "g");

  // A replacer function is used so the local path is inserted verbatim.
  return html.replace(anyUrl, (found) => urlToPath.get(found) ?? found);
}
Configure Feed

Configure Feed