feat(extraction): inject Readability and refine legacy vs article heuristics

+23

popup/popup.js

·····················

+3 -2

scripts/background.js

··· 302 302 return { content: "", wasTruncated: false }; 303 303 } 304 304 305 - // Inject extractor only when needed to avoid running on all pages by default. 305 + // Readability must load before content.js (content.js is not bundled with it). 306 306 await chrome.scripting.executeScript({ 307 307 target: { tabId }, 308 - files: ["scripts/content.js"], 308 + files: ["scripts/Readability.js", "scripts/content.js"], 309 309 }); 310 310 311 311 const response = await chrome.tabs.sendMessage(tabId, { action: "extract" }); ··· 316 316 return { 317 317 content: response.content, 318 318 wasTruncated: Boolean(response.wasTruncated), 319 + extractionSource: response.extractionSource ?? "unknown", 319 320 }; 320 321 } 321 322

+489 -68

scripts/content.js

··· 1 - // Content script - extracts article content using Mozilla Readability 1 + // Content script: Mozilla Readability for articles, legacy DOM walk for feeds / non-articles 2 2 3 3 (function () { 4 4 "use strict"; ··· 9 9 } 10 10 window.__webaiExtractorInstalled = true; 11 11 12 - // Import config if available (in extension context) 13 12 const MAX_LENGTH = 14 13 typeof CONFIG !== "undefined" && CONFIG.EXTRACTION?.MAX_LENGTH 15 14 ? CONFIG.EXTRACTION.MAX_LENGTH 16 15 : 50000; 17 16 17 + /** 18 + * Readability `textContent` length (article body only). Above this, treat as a real article 19 + * extract. Do NOT use total formatted string length — title/metadata can exceed 500 chars alone. 20 + * Kept moderately high so homepages don't classify sponsor blocks as "articles". 21 + */ 22 + const READABILITY_STRONG_BODY_CHARS = 560; 23 + 24 + /** 25 + * When body is shorter, still trust Readability if the body is a large share of our formatted 26 + * output (not mostly "Title:/Description:" boilerplate from a feed shell). 27 + */ 28 + const READABILITY_MIN_BODY_CHARS = 120; 29 + 30 + /** Minimum body / formatted-text ratio for the "substantial body" heuristic. */ 31 + const READABILITY_MIN_CONTENT_RATIO = 0.34; 32 + 33 + /** 34 + * If Readability output is shorter than this, compare with legacy: full-page text often 35 + * dwarfs a homepage "article" grab (e.g. link aggregators). 36 + */ 37 + const READABILITY_SHORT_EXTRACT_MAX = 2200; 38 + 39 + /** 40 + * When Readability extract is "short", prefer legacy if the DOM walk yields this many times 41 + * more text (typical feed / homepage vs a thin Readability pick). 42 + */ 43 + const READABILITY_LEGACY_DOMINANCE_RATIO = 3.25; 44 + 45 + /** Prefer legacy when the grab looks like promos / feed chrome, not prose. */ 46 + const READABILITY_BIAS_LEGACY_REGEXES = [ 47 + /\bSponsor(?:ed)?\s+Posts?\b/i, 48 + /\bPromoted\s+(?:Stories?|Posts?|Content)\b/i, 49 + /\bAdvertiser\s+Content\b/i, 50 + /\bPaid\s+Partnership\b/i, 51 + ]; 52 + 53 + function readabilityTextBiasLegacy(formattedText) { 54 + return READABILITY_BIAS_LEGACY_REGEXES.some((re) => re.test(formattedText)); 55 + } 56 + 57 + /** 58 + * Short Readability extract + huge legacy output ⇒ likely listing/homepage, not an article. 59 + * Skip when body is clearly long-form anyway (avoid flipping concise real articles on noisy DOMs). 60 + */ 61 + function tryLegacyWhenShortReadOverwhelmed(readability) { 62 + const readLen = readability.text.length; 63 + if (readLen > READABILITY_SHORT_EXTRACT_MAX) { 64 + return null; 65 + } 66 + if (readability.bodyTextLen >= READABILITY_STRONG_BODY_CHARS + 400) { 67 + return null; 68 + } 69 + const legacy = extractLegacy(); 70 + if (legacy.text.length > readLen * READABILITY_LEGACY_DOMINANCE_RATIO) { 71 + return legacy; 72 + } 73 + return null; 74 + } 75 + 76 + const EXCLUDE_TAGS = [ 77 + "script", 78 + "style", 79 + "noscript", 80 + "iframe", 81 + "nav", 82 + "aside", 83 + "form", 84 + "button", 85 + "input", 86 + ]; 87 + 18 88 function extractWithReadability() { 19 - // Clone the document so we don't modify the real page 20 89 const documentClone = document.cloneNode(true); 21 - 22 - // Create Readability instance 23 90 const reader = new Readability(documentClone); 24 - 25 - // Parse the article 26 91 const article = reader.parse(); 27 92 28 93 if (!article) { 29 - return { text: "", wasTruncated: false }; 94 + return { 95 + text: "", 96 + wasTruncated: false, 97 + articleNull: true, 98 + bodyTextLen: 0, 99 + }; 30 100 } 31 101 32 - // Build the extracted text with metadata 33 102 let extractedText = ""; 34 103 35 104 if (article.title) { ··· 52 121 extractedText += `Source: ${article.siteName}\n\n`; 53 122 } 54 123 55 - // Add separator before content 56 124 if (extractedText) { 57 125 extractedText += "---\n\n"; 58 126 } 59 127 60 - // Get text content (strip HTML) 61 128 let content = article.textContent || ""; 62 129 63 - // Clean up the text 64 130 content = content 65 - .replace(/[^\S\n]+/g, " ") // Collapse spaces/tabs but preserve newlines 66 - .replace(/\n{3,}/g, "\n\n") // Collapse 3+ newlines to 2 67 - .replace(/^\s+|\s+$/g, ""); // Trim 131 + .replace(/[^\S\n]+/g, " ") 132 + .replace(/\n{3,}/g, "\n\n") 133 + .replace(/^\s+|\s+$/g, ""); 68 134 69 135 extractedText += content; 70 136 71 - // Check if we need to truncate 137 + const bodyTextLen = (article.textContent || "").trim().length; 138 + 72 139 let wasTruncated = false; 73 140 if (extractedText.length > MAX_LENGTH) { 74 141 wasTruncated = true; 75 142 extractedText = extractedText.substring(0, MAX_LENGTH); 76 143 } 77 144 78 - return { text: extractedText, wasTruncated }; 145 + return { 146 + text: extractedText, 147 + wasTruncated, 148 + articleNull: false, 149 + bodyTextLen, 150 + }; 151 + } 152 + 153 + // --- Legacy extraction (pre-Readability): structured body walk + selector fallback --- 154 + 155 + function extractLegacySelectorsFallback() { 156 + const selectors = [ 157 + "article p", 158 + "article div", 159 + ".content p", 160 + ".content div", 161 + ".post-content p", 162 + ".entry-content p", 163 + ".article-body p", 164 + "main p", 165 + "main div", 166 + '[role="main"] p', 167 + ".story p", 168 + ".story-body p", 169 + "#story p", 170 + ]; 171 + 172 + let text = ""; 173 + let wasTruncated = false; 174 + const seen = new Set(); 175 + 176 + for (const selector of selectors) { 177 + try { 178 + const elements = document.querySelectorAll(selector); 179 + for (const el of elements) { 180 + const content = el.textContent.trim(); 181 + if (content.length < 20 || seen.has(content.substring(0, 100))) continue; 182 + 183 + const style = window.getComputedStyle(el); 184 + if (style.display === "none" || style.visibility === "hidden") continue; 185 + 186 + seen.add(content.substring(0, 100)); 187 + text += content + "\n\n"; 188 + 189 + if (text.length > MAX_LENGTH) { 190 + wasTruncated = true; 191 + break; 192 + } 193 + } 194 + } catch (e) { 195 + // Ignore invalid selectors 196 + } 197 + if (wasTruncated) break; 198 + } 199 + 200 + if (text.length < 500 && !wasTruncated) { 201 + const allParagraphs = document.querySelectorAll("p"); 202 + for (const p of allParagraphs) { 203 + const content = p.textContent.trim(); 204 + if (content.length > 30 && !seen.has(content.substring(0, 100))) { 205 + const style = window.getComputedStyle(p); 206 + if (style.display === "none" || style.visibility === "hidden") continue; 207 + 208 + seen.add(content.substring(0, 100)); 209 + text += content + "\n\n"; 210 + 211 + if (text.length > MAX_LENGTH) { 212 + wasTruncated = true; 213 + break; 214 + } 215 + } 216 + } 217 + } 218 + 219 + return { text: text.substring(0, MAX_LENGTH), wasTruncated }; 220 + } 221 + 222 + function shouldSkipElement(el) { 223 + const tag = el.tagName.toLowerCase(); 224 + if (EXCLUDE_TAGS.includes(tag)) { 225 + return true; 226 + } 227 + 228 + try { 229 + const style = window.getComputedStyle(el); 230 + if ( 231 + style.display === "none" || 232 + style.visibility === "hidden" || 233 + style.opacity === "0" 234 + ) { 235 + return true; 236 + } 237 + } catch (e) { 238 + // ignore 239 + } 240 + 241 + if (isMainContent(el)) return false; 242 + 243 + const role = el.getAttribute("role"); 244 + if (role === "navigation" || role === "banner" || role === "complementary") { 245 + return true; 246 + } 247 + 248 + let className = ""; 249 + let id = ""; 250 + 251 + if (el.className) { 252 + if (typeof el.className === "string") { 253 + className = el.className; 254 + } else if (el.className.baseVal) { 255 + className = el.className.baseVal; 256 + } 257 + } 258 + 259 + if (el.id) { 260 + if (typeof el.id === "string") { 261 + id = el.id; 262 + } else if (el.id.baseVal) { 263 + id = el.id.baseVal; 264 + } 265 + } 266 + 267 + const classAndId = (className + " " + id).toLowerCase(); 268 + const strictNoisePatterns = [ 269 + /^nav$/, 270 + /-nav$/, 271 + /^nav-/, 272 + /^navigation$/, 273 + /^footer$/, 274 + /-footer$/, 275 + /^footer-/, 276 + /^header$/, 277 + /^site-header$/, 278 + /^page-header$/, 279 + /^sidebar$/, 280 + /^advertisement$/, 281 + /^ad-container$/, 282 + ]; 283 + if (strictNoisePatterns.some((p) => p.test(classAndId.trim()))) { 284 + return true; 285 + } 286 + 287 + return false; 288 + } 289 + 290 + function isMainContent(element) { 291 + const role = element.getAttribute("role"); 292 + const tagName = element.tagName.toLowerCase(); 293 + 294 + let className = ""; 295 + let id = ""; 296 + 297 + if (element.className) { 298 + if (typeof element.className === "string") { 299 + className = element.className.toLowerCase(); 300 + } else if (element.className.baseVal) { 301 + className = element.className.baseVal.toLowerCase(); 302 + } 303 + } 304 + 305 + if (element.id) { 306 + if (typeof element.id === "string") { 307 + id = element.id.toLowerCase(); 308 + } else if (element.id.baseVal) { 309 + id = element.id.baseVal.toLowerCase(); 310 + } 311 + } 312 + 313 + const contentPatterns = [ 314 + "content", 315 + "main-content", 316 + "article-content", 317 + "post-content", 318 + "entry-content", 319 + "page-content", 320 + "story-content", 321 + "body-content", 322 + "article", 323 + "post", 324 + "entry", 325 + "story", 326 + "main", 327 + ]; 328 + 329 + const isContentClass = contentPatterns.some( 330 + (p) => className.includes(p) || id.includes(p), 331 + ); 332 + 333 + return ( 334 + role === "main" || 335 + role === "article" || 336 + tagName === "main" || 337 + tagName === "article" || 338 + isContentClass 339 + ); 340 + } 341 + 342 + function extractTextFromElement(element, depth = 0) { 343 + let text = ""; 344 + const indent = " ".repeat(depth); 345 + 346 + const directText = getDirectTextContent(element).trim(); 347 + if (directText.length > 20 && depth > 0) { 348 + text += directText + "\n\n"; 349 + } 350 + 351 + for (const child of element.children) { 352 + const childTag = child.tagName.toLowerCase(); 353 + 354 + if (shouldSkipElement(child)) continue; 355 + 356 + if (/^h[1-6]$/.test(childTag)) { 357 + const headingText = getTextContent(child).trim(); 358 + if (headingText) { 359 + const prefix = "#".repeat(parseInt(childTag[1], 10)); 360 + text += `\n${prefix} ${headingText}\n\n`; 361 + } 362 + } else if (childTag === "p") { 363 + const pText = getTextContent(child).trim(); 364 + if (pText.length > 5) { 365 + text += `${pText}\n\n`; 366 + } 367 + } else if (childTag === "li") { 368 + const liText = getTextContent(child).trim(); 369 + if (liText) { 370 + text += `${indent}- ${liText}\n`; 371 + } 372 + } else if (childTag === "pre" || childTag === "code") { 373 + const codeText = getTextContent(child).trim(); 374 + if (codeText) { 375 + text += `\n\`\`\`\n${codeText}\n\`\`\`\n\n`; 376 + } 377 + } else { 378 + const childText = extractTextFromElement(child, depth + 1); 379 + if (childText.trim()) { 380 + text += childText; 381 + } 382 + } 383 + } 384 + 385 + return text; 386 + } 387 + 388 + function getDirectTextContent(element) { 389 + let text = ""; 390 + for (const node of element.childNodes) { 391 + if (node.nodeType === Node.TEXT_NODE) { 392 + text += node.textContent; 393 + } 394 + } 395 + return text.trim(); 79 396 } 80 397 81 - // Fallback extraction for pages where Readability fails 82 - function extractFallback() { 398 + function getTextContent(element) { 83 399 let text = ""; 400 + 401 + for (const node of element.childNodes) { 402 + if (node.nodeType === Node.TEXT_NODE) { 403 + text += node.textContent; 404 + } else if (node.nodeType === Node.ELEMENT_NODE) { 405 + const tagName = node.tagName.toLowerCase(); 406 + 407 + if (["br", "p", "div", "li"].includes(tagName)) { 408 + text += " " + getTextContent(node) + " "; 409 + } else { 410 + text += getTextContent(node); 411 + } 412 + } 413 + } 414 + 415 + return text; 416 + } 417 + 418 + function cleanExtractedText(text, shouldTruncate = true) { 419 + let cleaned = text 420 + .replace(/[^\S\n]+/g, " ") 421 + .replace(/\n{3,}/g, "\n\n") 422 + .replace(/^\s+|\s+$/g, ""); 423 + 424 + if (shouldTruncate && cleaned.length > MAX_LENGTH) { 425 + cleaned = cleaned.substring(0, MAX_LENGTH); 426 + } 427 + 428 + return cleaned; 429 + } 430 + 431 + /** 432 + * Full-document extraction used before Readability. Works well for feeds, homepages, 433 + * and app-like pages where Readability returns nothing or very little. 434 + */ 435 + function extractLegacy() { 436 + if (!document.body) { 437 + return { text: document.title || "", wasTruncated: false }; 438 + } 439 + 440 + let extractedText = ""; 84 441 let wasTruncated = false; 85 442 86 - // Get title 87 443 const title = document.title || ""; 88 444 if (title) { 89 - text += `Title: ${title}\n\n`; 445 + extractedText += `Title: ${title}\n\n`; 90 446 } 91 447 92 - // Get meta description 93 448 const metaDesc = document.querySelector('meta[name="description"]'); 94 449 if (metaDesc) { 95 - const content = metaDesc.getAttribute("content"); 96 - if (content) { 97 - text += `Description: ${content}\n\n---\n\n`; 450 + const desc = metaDesc.getAttribute("content"); 451 + if (desc) { 452 + extractedText += `Description: ${desc}\n\n`; 98 453 } 99 454 } 100 455 101 - // Fallback: get all paragraphs 102 - const paragraphs = document.querySelectorAll("p"); 103 - const seen = new Set(); 104 - 105 - for (const p of paragraphs) { 106 - // Skip hidden elements 107 - const style = window.getComputedStyle(p); 108 - if (style.display === "none" || style.visibility === "hidden") { 109 - continue; 110 - } 111 - 112 - const content = p.textContent.trim(); 456 + extractedText += extractTextFromElement(document.body); 113 457 114 - // Skip short or duplicate paragraphs 115 - if (content.length < 30) continue; 116 - const key = content.substring(0, 100); 117 - if (seen.has(key)) continue; 458 + extractedText = cleanExtractedText(extractedText, false); 118 459 119 - seen.add(key); 120 - text += content + "\n\n"; 460 + if (extractedText.length > MAX_LENGTH) { 461 + wasTruncated = true; 462 + extractedText = extractedText.substring(0, MAX_LENGTH); 463 + } 121 464 122 - if (text.length > MAX_LENGTH) { 123 - wasTruncated = true; 124 - break; 465 + if (extractedText.length < 1000) { 466 + const fallbackResult = extractLegacySelectorsFallback(); 467 + if (fallbackResult.text.length > extractedText.length) { 468 + extractedText = `Title: ${title}\n\n${fallbackResult.text}`; 469 + wasTruncated = fallbackResult.wasTruncated; 125 470 } 126 471 } 127 472 128 - return { 129 - text: text.substring(0, MAX_LENGTH), 130 - wasTruncated, 131 - }; 473 + return { text: extractedText, wasTruncated }; 474 + } 475 + 476 + /** TEMP: remove after testing — logs to the tab's DevTools console (page context). */ 477 + function logExtractionDebug(source, text) { 478 + console.log("[Summarizer DEBUG] extraction:", source); 479 + console.log( 480 + "[Summarizer DEBUG] raw extracted text (" + text.length + " chars):", 481 + text, 482 + ); 132 483 } 133 484 134 485 function extractContent() { 135 486 try { 136 - // Try Readability first 137 - const result = extractWithReadability(); 487 + const readability = extractWithReadability(); 488 + const rbLen = readability.text.trim().length; 489 + 490 + if (readability.articleNull || rbLen === 0) { 491 + const legacy = extractLegacy(); 492 + logExtractionDebug("legacy", legacy.text); 493 + return { ...legacy, extractionSource: "legacy" }; 494 + } 495 + 496 + const bodyLen = readability.bodyTextLen; 497 + const contentRatio = 498 + bodyLen / Math.max(readability.text.length, 1); 499 + 500 + const biasedToLegacy = readabilityTextBiasLegacy(readability.text); 501 + 502 + const strongArticleBody = 503 + bodyLen >= READABILITY_STRONG_BODY_CHARS && !biasedToLegacy; 504 + const ratioLooksLikeArticle = 505 + bodyLen >= READABILITY_MIN_BODY_CHARS && 506 + contentRatio >= READABILITY_MIN_CONTENT_RATIO && 507 + !biasedToLegacy; 508 + 509 + // Long article body — Readability clearly won (unless sponsor/promo bias sent us to legacy). 510 + if (strongArticleBody) { 511 + const flip = tryLegacyWhenShortReadOverwhelmed(readability); 512 + if (flip) { 513 + logExtractionDebug("legacy", flip.text); 514 + return { text: flip.text, wasTruncated: flip.wasTruncated, extractionSource: "legacy" }; 515 + } 516 + logExtractionDebug("readability", readability.text); 517 + return { 518 + text: readability.text, 519 + wasTruncated: readability.wasTruncated, 520 + extractionSource: "readability", 521 + }; 522 + } 138 523 139 - // If Readability got good content, use it 140 - if (result.text.length > 500) { 141 - return result; 524 + // Short pages / stubs: body is still most of what we output (not metadata padding). 525 + if (ratioLooksLikeArticle) { 526 + const flip = tryLegacyWhenShortReadOverwhelmed(readability); 527 + if (flip) { 528 + logExtractionDebug("legacy", flip.text); 529 + return { text: flip.text, wasTruncated: flip.wasTruncated, extractionSource: "legacy" }; 530 + } 531 + logExtractionDebug("readability", readability.text); 532 + return { 533 + text: readability.text, 534 + wasTruncated: readability.wasTruncated, 535 + extractionSource: "readability", 536 + }; 142 537 } 143 538 144 - // Otherwise fall back to basic extraction 145 - console.debug( 146 - "[Summarizer] Readability extracted minimal content, trying fallback...", 147 - ); 148 - const fallback = extractFallback(); 539 + // Marginal Readability (feed hero, cookie copy, tiny grab): compare with legacy. 540 + const legacy = extractLegacy(); 541 + const legLen = legacy.text.length; 542 + const readLen = readability.text.length; 543 + 544 + if (bodyLen < READABILITY_MIN_BODY_CHARS) { 545 + const useLegacy = legLen > readLen; 546 + const picked = useLegacy ? legacy : readability; 547 + logExtractionDebug(useLegacy ? "legacy" : "readability", picked.text); 548 + return { 549 + text: picked.text, 550 + wasTruncated: picked.wasTruncated, 551 + extractionSource: useLegacy ? "legacy" : "readability", 552 + }; 553 + } 149 554 150 - // Use whichever got more content 151 - if (fallback.text.length > result.text.length) { 152 - return fallback; 555 + // Body exists but is mostly not "article-like" vs full page (low ratio already failed). 556 + if ( 557 + contentRatio < READABILITY_MIN_CONTENT_RATIO && 558 + legLen > readLen * 1.4 559 + ) { 560 + logExtractionDebug("legacy", legacy.text); 561 + return { 562 + text: legacy.text, 563 + wasTruncated: legacy.wasTruncated, 564 + extractionSource: "legacy", 565 + }; 153 566 } 154 567 155 - return result; 568 + const useLegacy = legLen > readLen; 569 + const picked = useLegacy ? legacy : readability; 570 + logExtractionDebug(useLegacy ? "legacy" : "readability", picked.text); 571 + return { 572 + text: picked.text, 573 + wasTruncated: picked.wasTruncated, 574 + extractionSource: useLegacy ? "legacy" : "readability", 575 + }; 156 576 } catch (error) { 157 577 console.error("[Summarizer] Readability error:", error); 158 - // On error, try fallback 159 - return extractFallback(); 578 + const legacy = extractLegacy(); 579 + logExtractionDebug("legacy", legacy.text); 580 + return { ...legacy, extractionSource: "legacy" }; 160 581 } 161 582 } 162 583 163 - // Listen for messages from the extension 164 584 chrome.runtime.onMessage.addListener((request, sender, sendResponse) => { 165 585 if (request.action === "extract") { 166 586 const result = extractContent(); 167 587 sendResponse({ 168 588 content: result.text, 169 589 wasTruncated: result.wasTruncated, 590 + extractionSource: result.extractionSource, 170 591 }); 171 592 } 172 - return true; // Keep channel open for async 593 + return true; 173 594 }); 174 595 })();

Configure Feed

Configure Feed