Sync articles marked `toread` in kipclip to the Crosspoint Reader (Xteink X4).
1import { Readability } from "npm:@mozilla/readability@0.5.0";
2import { parseHTML } from "npm:linkedom@0.18.9";
3import { extractImageUrls, replaceImageUrls, sanitizeForXhtml } from "./html-sanitizer.ts";
4import { type ProcessedImage, processImage } from "./image-processor.ts";
5
/** Shape of the value `processArticle` resolves with on success. */
export interface ProcessedArticle {
  /** Article title; `processArticle` falls back to the caller's title or the URL hostname. */
  title: string;
  /** Sanitized XHTML with local image paths. */
  content: string;
  /** Short summary, when one was extracted. */
  excerpt?: string;
  /** Author line, when one was extracted. */
  byline?: string;
  /** Images that were processed for this article; empty when none were included. */
  images: ProcessedImage[];
}
13
/**
 * Fetch an article URL, extract its readable content with Mozilla
 * Readability, and sanitize the result into XHTML.
 *
 * @param url - Address of the article to download.
 * @param fallbackTitle - Title to use when Readability yields none; if this
 *   is also empty, the URL's hostname is used.
 * @param options - `includeImages` (default `false`): when true, the image
 *   URLs found in the sanitized content are passed to `processImage` and
 *   rewritten to `images/<filename>` paths; when false, the sanitizer is asked
 *   to strip remote images instead.
 * @returns The processed article, or `null` when the page could not be
 *   fetched or Readability produced no content.
 */
export async function processArticle(
  url: string,
  fallbackTitle?: string,
  options?: { includeImages?: boolean },
): Promise<ProcessedArticle | null> {
  const rawHtml = await fetchArticleHtml(url);
  if (!rawHtml) {
    return null;
  }

  const { document } = parseHTML(rawHtml);
  // Readability resolves relative URLs against documentURI, so point it at the source page.
  Object.defineProperty(document, "documentURI", { value: url });

  const parsed = new Readability(document).parse();
  if (!parsed?.content) {
    return null;
  }

  const wantImages = options?.includeImages ?? false;

  // Turn Readability's HTML into valid XHTML (fix <picture>, void elements, unescaped &).
  let xhtml = sanitizeForXhtml(parsed.content, {
    stripRemoteImages: !wantImages,
  });

  const images: ProcessedImage[] = [];
  if (wantImages) {
    // Remote image URL -> path of the processed copy inside the EPUB.
    const localPathByUrl = new Map<string, string>();

    // Images are handled one at a time, in document order; the index is
    // forwarded to processImage.
    for (const [index, remoteUrl] of extractImageUrls(xhtml).entries()) {
      const result = await processImage(remoteUrl, index);
      if (!result) {
        continue; // a falsy result means no local copy, so the URL is left as-is
      }
      images.push(result);
      localPathByUrl.set(remoteUrl, `images/${result.filename}`);
    }

    // Only rewrite the markup when at least one image was actually processed.
    if (localPathByUrl.size > 0) {
      xhtml = replaceImageUrls(xhtml, localPathByUrl);
    }
  }

  const title = parsed.title || fallbackTitle || new URL(url).hostname;

  return {
    title,
    content: xhtml,
    excerpt: parsed.excerpt || undefined,
    byline: parsed.byline || undefined,
    images,
  };
}
66
/**
 * Download the HTML source of a page.
 *
 * The request is aborted after 15 seconds (the timer also covers reading the
 * body). Any network error, abort, non-2xx status, or non-HTML Content-Type
 * results in `null` rather than an exception.
 *
 * @param url - Address of the page to download.
 * @returns The page source, truncated to at most 512,000 UTF-16 code units,
 *   or `null` on failure. The cap is applied after the body has been read, so
 *   it bounds later processing, not the amount of data transferred.
 */
async function fetchArticleHtml(url: string): Promise<string | null> {
  const timeoutMs = 15_000;
  const maxChars = 512_000;

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const res = await fetch(url, {
      signal: controller.signal,
      headers: {
        "User-Agent": "Mozilla/5.0 (compatible; KipclipSync/1.0; +https://kipclip.com)",
        Accept: "text/html,application/xhtml+xml",
      },
    });

    // Media types are case-insensitive, so normalize before matching.
    // "html" also matches "application/xhtml+xml".
    const contentType = (res.headers.get("content-type") ?? "").toLowerCase();
    if (!res.ok || !contentType.includes("html")) {
      // Discard the unread body so the runtime can release the underlying
      // connection instead of keeping it open until garbage collection.
      await res.body?.cancel();
      return null;
    }

    const text = await res.text();
    if (text.length <= maxChars) {
      return text;
    }

    // Cap the size to avoid processing enormous pages. If the cut would fall
    // between the two halves of a surrogate pair, drop the dangling high
    // surrogate so the result stays well-formed.
    const lastKept = text.charCodeAt(maxChars - 1);
    const splitsPair = lastKept >= 0xd800 && lastKept <= 0xdbff;
    return text.slice(0, splitsPair ? maxChars - 1 : maxChars);
  } catch {
    // Best-effort fetch: callers treat null as "article unavailable".
    return null;
  } finally {
    clearTimeout(timeout);
  }
}