backend/article-processor.ts at main · tijs.org/crosspoint-articles

tijs.org / crosspoint-articles

fork

Sync articles marked toread in kipclip to Crosspoint Reader (Xteink X4)

fork

crosspoint-articles / backend / article-processor.ts

at main 94 lines 3.0 kB view raw

wrap content

Tijs Teulings Initial commit: crosspoint articles backend 3w ago

3a9df645

 1import { Readability } from "npm:@mozilla/readability@0.5.0";
 2import { parseHTML } from "npm:linkedom@0.18.9";
 3import { extractImageUrls, replaceImageUrls, sanitizeForXhtml } from "./html-sanitizer.ts";
 4import { type ProcessedImage, processImage } from "./image-processor.ts";
 5
 /** Result returned by `processArticle` for a successfully extracted article. */
 export interface ProcessedArticle {
   /** Readability's title, else the caller's fallback title, else the URL's hostname. */
   title: string;
   /** Sanitized XHTML with local image paths. */
   content: string;
   /** Readability's excerpt; left undefined when Readability reports none. */
   excerpt?: string;
   /** Readability's byline; left undefined when Readability reports none. */
   byline?: string;
   /** Images that were processed for this article; empty when images were not requested. */
   images: ProcessedImage[];
 }
13
/**
 * Fetch an article URL and clean it up with Mozilla Readability.
 *
 * @param url - Address of the article to download.
 * @param fallbackTitle - Title to use when Readability does not report one.
 * @param options - `includeImages` (default `false`) enables extracting,
 *   downloading and processing the images referenced by the article.
 * @returns The processed article, or `null` when the page could not be
 *   fetched or Readability produced no content.
 */
export async function processArticle(
  url: string,
  fallbackTitle?: string,
  options?: { includeImages?: boolean },
): Promise<ProcessedArticle | null> {
  const rawHtml = await fetchArticleHtml(url);
  if (!rawHtml) return null;

  const { document } = parseHTML(rawHtml);
  // Readability uses documentURI for URL resolution, so point it at the source page.
  Object.defineProperty(document, "documentURI", { value: url });

  const parsed = new Readability(document).parse();
  if (!parsed?.content) return null;

  const wantImages = options?.includeImages ?? false;

  // Convert to valid XHTML (fixes <picture>, void elements, unescaped &);
  // remote images are stripped unless the caller asked to keep them.
  let content = sanitizeForXhtml(parsed.content, {
    stripRemoteImages: !wantImages,
  });

  const images: ProcessedImage[] = [];
  if (wantImages) {
    // Download and process each image (grayscale + dither for e-ink),
    // remembering which remote URL maps to which local file.
    const localPathByUrl = new Map<string, string>();
    let index = 0;
    for (const remoteUrl of extractImageUrls(content)) {
      const image = await processImage(remoteUrl, index);
      index += 1;
      if (!image) continue; // leave the reference untouched when processing yields nothing
      images.push(image);
      localPathByUrl.set(remoteUrl, `images/${image.filename}`);
    }

    // Swap remote URLs for local EPUB paths, but only if something was processed.
    if (localPathByUrl.size > 0) {
      content = replaceImageUrls(content, localPathByUrl);
    }
  }

  const title = parsed.title || fallbackTitle || new URL(url).hostname;
  const excerpt = parsed.excerpt || undefined;
  const byline = parsed.byline || undefined;

  return { title, content, excerpt, byline, images };
}
66
/**
 * Download the HTML source of `url`.
 *
 * @param url - Address to request.
 * @returns The response text, truncated to 512,000 UTF-16 code units, or
 *   `null` when the request throws, is aborted by the 15 s timeout, answers
 *   with a non-OK status, or does not declare an HTML content type.
 */
async function fetchArticleHtml(url: string): Promise<string | null> {
  const timeoutMs = 15_000;
  const maxChars = 512_000;

  const controller = new AbortController();
  // The abort signal stays armed until `finally`, so it also covers the body read.
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const res = await fetch(url, {
      signal: controller.signal,
      headers: {
        "User-Agent": "Mozilla/5.0 (compatible; KipclipSync/1.0; +https://kipclip.com)",
        Accept: "text/html,application/xhtml+xml",
      },
    });

    // Media types are case-insensitive, and "html" also matches
    // "application/xhtml+xml", so one lowercase substring check is enough.
    const contentType = (res.headers.get("content-type") ?? "").toLowerCase();
    if (!res.ok || !contentType.includes("html")) {
      // Discard the unread body so the underlying connection is released
      // instead of lingering until garbage collection.
      await res.body?.cancel();
      return null;
    }

    const text = await res.text();
    // Cap the amount of markup handed to the parser; slice() returns the
    // whole string when it is already within the limit.
    return text.slice(0, maxChars);
  } catch {
    // Network failures, aborts and body-read errors all mean "no article".
    return null;
  } finally {
    clearTimeout(timeout);
  }
}

Configure Feed

Configure Feed