fix(extraction): skip embeds and strip literal iframe/embed markup from text

+44 -4

1 changed file

expand all

scripts

content.js

+44 -4

scripts/content.js

···
78 78   "style",
79 79   "noscript",
80 80   "iframe",
81 +   "embed",
82 +   "object",
83 +   "frame",
81 84   "nav",
82 85   "aside",
83 86   "form",
···
85 88   "input",
86 89 ];
87 90
91 + /**
92 +  * Subtrees to omit from plain-text collection. `textContent` includes script/style/template
93 +  * bodies, which pulls in Astro/React/Vite inline bundles when walking `main div` etc.
94 +  * Also skip embed-like tags (iframe/object/embed) so news players do not dump URLs or attrs.
95 +  */
96 + const TEXT_SUBTREE_EXCLUDE_TAGS = new Set([
97 +   ...EXCLUDE_TAGS,
98 +   "template",
99 + ]);
100 +
101 + function shouldExcludeTextSubtree(tagName) {
102 +   return TEXT_SUBTREE_EXCLUDE_TAGS.has(tagName.toLowerCase());
103 + }
104 +
105 + /**
106 +  * Strip embed markup that appears as literal text in article bodies (CMS/oEmbed fallbacks).
107 +  * Complements subtree skipping — some sites still surface tags as visible copy.
108 +  */
109 + function sanitizeLiteralEmbedMarkup(text) {
110 +   if (!text || typeof text !== "string") {
111 +     return text;
112 +   }
113 +   let t = text;
114 +   t = t.replace(/<iframe\b[\s\S]{0,20000}?<\/iframe>/gi, "\n");
115 +   t = t.replace(/<iframe\b[^>]{0,8000}\/?>/gi, "\n");
116 +   t = t.replace(/<\/iframe>/gi, "");
117 +   t = t.replace(/<embed\b[^>]{0,8000}\/?>/gi, "\n");
118 +   t = t.replace(/<object\b[\s\S]{0,20000}?<\/object>/gi, "\n");
119 +   return t;
120 + }
121 +
88 122 function extractWithReadability() {
89 123   const documentClone = document.cloneNode(true);
90 124   const reader = new Readability(documentClone);
···
126 160   }
127 161
128 162   let content = article.textContent || "";
163 +   content = sanitizeLiteralEmbedMarkup(content);
129 164
130 165   content = content
131 166     .replace(/[^\S\n]+/g, " ")
···
134 169
135 170   extractedText += content;
136 171
137 -   const bodyTextLen = (article.textContent || "").trim().length;
172 +   const bodyTextLen = content.trim().length;
138 173
139 174   let wasTruncated = false;
140 175   if (extractedText.length > MAX_LENGTH) {
···
177 212   try {
178 213     const elements = document.querySelectorAll(selector);
179 214     for (const el of elements) {
180 -       const content = el.textContent.trim();
215 +       const content = getTextContent(el).trim();
181 216       if (content.length < 20 || seen.has(content.substring(0, 100))) continue;
182 217
183 218       const style = window.getComputedStyle(el);
···
200 235   if (text.length < 500 && !wasTruncated) {
201 236     const allParagraphs = document.querySelectorAll("p");
202 237     for (const p of allParagraphs) {
203 -       const content = p.textContent.trim();
238 +       const content = getTextContent(p).trim();
204 239       if (content.length > 30 && !seen.has(content.substring(0, 100))) {
205 240         const style = window.getComputedStyle(p);
206 241         if (style.display === "none" || style.visibility === "hidden") continue;
···
404 439     } else if (node.nodeType === Node.ELEMENT_NODE) {
405 440       const tagName = node.tagName.toLowerCase();
406 441
442 +       if (shouldExcludeTextSubtree(tagName)) {
443 +         continue;
444 +       }
445 +
407 446       if (["br", "p", "div", "li"].includes(tagName)) {
408 447         text += " " + getTextContent(node) + " ";
409 448       } else {
···
416 455 }
417 456
418 457 function cleanExtractedText(text, shouldTruncate = true) {
419 -   let cleaned = text
458 +   let cleaned = sanitizeLiteralEmbedMarkup(text);
459 +   cleaned = cleaned
420 460     .replace(/[^\S\n]+/g, " ")
421 461     .replace(/\n{3,}/g, "\n\n")
422 462     .replace(/^\s+|\s+$/g, "");

Configure Feed

Configure Feed