refactor: migrate to Mozilla Readability and centralize config · ellioth.co/summarizer-extension@ffa94d1

+140 -33

options/options.html

··· 13 13 </div> 14 14 <h1>Settings</h1> 15 15 <button id="theme-btn" class="icon-btn" title="Toggle theme"> 16 - <svg id="theme-icon-light" width="15" height="15" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"> 17 - <circle cx="12" cy="12" r="5"/> 18 - <line x1="12" y1="1" x2="12" y2="3"/> 19 - <line x1="12" y1="21" x2="12" y2="23"/> 20 - <line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/> 21 - <line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/> 22 - <line x1="1" y1="12" x2="3" y2="12"/> 23 - <line x1="21" y1="12" x2="23" y2="12"/> 24 - <line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/> 25 - <line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/> 16 + <svg 17 + id="theme-icon-light" 18 + width="15" 19 + height="15" 20 + viewBox="0 0 24 24" 21 + fill="none" 22 + stroke="currentColor" 23 + stroke-width="2" 24 + stroke-linecap="round" 25 + stroke-linejoin="round" 26 + > 27 + <circle cx="12" cy="12" r="5" /> 28 + <line x1="12" y1="1" x2="12" y2="3" /> 29 + <line x1="12" y1="21" x2="12" y2="23" /> 30 + <line x1="4.22" y1="4.22" x2="5.64" y2="5.64" /> 31 + <line x1="18.36" y1="18.36" x2="19.78" y2="19.78" /> 32 + <line x1="1" y1="12" x2="3" y2="12" /> 33 + <line x1="21" y1="12" x2="23" y2="12" /> 34 + <line x1="4.22" y1="19.78" x2="5.64" y2="18.36" /> 35 + <line x1="18.36" y1="5.64" x2="19.78" y2="4.22" /> 26 36 </svg> 27 - <svg id="theme-icon-dark" width="15" height="15" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="hidden"> 28 - <path d="M21 12.79A9 9 0 1 1 11.21 3 7 7 0 0 0 21 12.79z"/> 37 + <svg 38 + id="theme-icon-dark" 39 + width="15" 40 + height="15" 41 + viewBox="0 0 24 24" 42 + fill="none" 43 + stroke="currentColor" 44 + stroke-width="2" 45 + stroke-linecap="round" 46 + stroke-linejoin="round" 47 + class="hidden" 48 + > 49 + <path 50 + d="M21 12.79A9 9 0 1 1 11.21 3 7 7 0 0 0 21 12.79z" 51 + /> 29 52 </svg> 30 - <svg 
id="theme-icon-system" width="15" height="15" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="hidden"> 31 - <rect x="2" y="3" width="20" height="14" rx="2" ry="2"/> 32 - <line x1="8" y1="21" x2="16" y2="21"/> 33 - <line x1="12" y1="17" x2="12" y2="21"/> 53 + <svg 54 + id="theme-icon-system" 55 + width="15" 56 + height="15" 57 + viewBox="0 0 24 24" 58 + fill="none" 59 + stroke="currentColor" 60 + stroke-width="2" 61 + stroke-linecap="round" 62 + stroke-linejoin="round" 63 + class="hidden" 64 + > 65 + <rect 66 + x="2" 67 + y="3" 68 + width="20" 69 + height="14" 70 + rx="2" 71 + ry="2" 72 + /> 73 + <line x1="8" y1="21" x2="16" y2="21" /> 74 + <line x1="12" y1="17" x2="12" y2="21" /> 34 75 </svg> 35 76 </button> 36 77 </div> ··· 79 120 <div class="form-group"> 80 121 <label>Accent Color</label> 81 122 <input type="hidden" id="accent-preset" value="orange" /> 82 - <div class="accent-swatches" role="radiogroup" aria-label="Accent Color"> 83 - <button type="button" class="accent-swatch selected" data-accent-preset="orange" title="Orange (default)" aria-label="Orange (default)"></button> 84 - <button type="button" class="accent-swatch" data-accent-preset="blue" title="Blue" aria-label="Blue"></button> 85 - <button type="button" class="accent-swatch" data-accent-preset="green" title="Green" aria-label="Green"></button> 86 - <button type="button" class="accent-swatch" data-accent-preset="purple" title="Purple" aria-label="Purple"></button> 87 - <button type="button" class="accent-swatch" data-accent-preset="teal" title="Teal" aria-label="Teal"></button> 88 - <button type="button" class="accent-swatch" data-accent-preset="pink" title="Pink" aria-label="Pink"></button> 89 - <button type="button" class="accent-swatch" data-accent-preset="indigo" title="Indigo" aria-label="Indigo"></button> 90 - <button type="button" class="accent-swatch custom-swatch" data-accent-preset="custom" title="Custom color" 
aria-label="Custom color">+</button> 123 + <div 124 + class="accent-swatches" 125 + role="radiogroup" 126 + aria-label="Accent Color" 127 + > 128 + <button 129 + type="button" 130 + class="accent-swatch selected" 131 + data-accent-preset="orange" 132 + title="Orange (default)" 133 + aria-label="Orange (default)" 134 + ></button> 135 + <button 136 + type="button" 137 + class="accent-swatch" 138 + data-accent-preset="blue" 139 + title="Blue" 140 + aria-label="Blue" 141 + ></button> 142 + <button 143 + type="button" 144 + class="accent-swatch" 145 + data-accent-preset="green" 146 + title="Green" 147 + aria-label="Green" 148 + ></button> 149 + <button 150 + type="button" 151 + class="accent-swatch" 152 + data-accent-preset="purple" 153 + title="Purple" 154 + aria-label="Purple" 155 + ></button> 156 + <button 157 + type="button" 158 + class="accent-swatch" 159 + data-accent-preset="teal" 160 + title="Teal" 161 + aria-label="Teal" 162 + ></button> 163 + <button 164 + type="button" 165 + class="accent-swatch" 166 + data-accent-preset="pink" 167 + title="Pink" 168 + aria-label="Pink" 169 + ></button> 170 + <button 171 + type="button" 172 + class="accent-swatch" 173 + data-accent-preset="indigo" 174 + title="Indigo" 175 + aria-label="Indigo" 176 + ></button> 177 + <button 178 + type="button" 179 + class="accent-swatch custom-swatch" 180 + data-accent-preset="custom" 181 + title="Custom color" 182 + aria-label="Custom color" 183 + > 184 + + 185 + </button> 91 186 </div> 92 - <div id="accent-custom-group" class="accent-custom-group hidden"> 187 + <div 188 + id="accent-custom-group" 189 + class="accent-custom-group hidden" 190 + > 93 191 <input 94 192 type="text" 95 193 id="accent-custom" ··· 98 196 /> 99 197 </div> 100 198 <p class="help"> 101 - Used for primary actions and accent highlights in the UI. 102 - Custom must be a 6-digit hex code like <code>#F15B2F</code>. 199 + Used for primary actions and accent highlights in the 200 + UI. 
Custom must be a 6-digit hex code like 201 + <code>#F15B2F</code>. 103 202 </p> 104 203 </div> 105 204 ··· 128 227 Disable thinking mode 129 228 </label> 130 229 <p class="help"> 131 - Disable the model's thinking process for faster responses. 132 - Only applies to thinking-capable models (DeepSeek R1, Qwen 3, etc.). 230 + Disable the model's thinking process for faster 231 + responses. Only applies to thinking-capable models 232 + (DeepSeek R1, Qwen 3, etc.). 133 233 <strong>Only works with Ollama native API.</strong> 134 234 </p> 135 235 </div> ··· 137 237 <div class="form-group"> 138 238 <label>Keyboard Shortcut</label> 139 239 <div class="shortcut-display"> 140 - <code>Ctrl+Shift+U</code> (Windows/Linux)  ·  <code>Cmd+Shift+U</code> (Mac) 240 + <code>Ctrl+Shift+U</code> (Windows/Linux)  ·  241 + <code>Cmd+Shift+U</code> (Mac) 141 242 </div> 142 243 <p class="help"> 143 - <a href="#" id="keyboard-shortcuts-link">Change or enable keyboard shortcut</a><br> 144 - <small style="color: var(--text-muted);">Chrome: You may need to manually enable the shortcut after install.</small> 244 + <a href="#" id="keyboard-shortcuts-link" 245 + >Change or enable keyboard shortcut</a 246 + ><br /> 247 + <small style="color: var(--text-muted)" 248 + >Chrome: You may need to manually enable the 249 + shortcut after install.</small 250 + > 145 251 </p> 146 252 </div> 147 253 ··· 190 296 </div> 191 297 </div> 192 298 299 + <script src="../scripts/config.js"></script> 193 300 <script src="options.js"></script> 194 301 </body> 195 302 </html>

+80 -57

options/options.js

··· 1 - // Options page script 1 + // Options page script - uses centralized CONFIG from config.js 2 2 3 3 const form = document.getElementById("settings-form"); 4 4 const apiModeInput = document.getElementById("api-mode"); 5 5 const apiBaseUrlInput = document.getElementById("api-base-url"); 6 6 const modelInput = document.getElementById("model"); 7 7 const accentPresetInput = document.getElementById("accent-preset"); 8 - const accentSwatchButtons = Array.from(document.querySelectorAll(".accent-swatch")); 8 + const accentSwatchButtons = Array.from( 9 + document.querySelectorAll(".accent-swatch"), 10 + ); 9 11 const accentCustomGroup = document.getElementById("accent-custom-group"); 10 12 const accentCustomInput = document.getElementById("accent-custom"); 11 13 const themeBtn = document.getElementById("theme-btn"); ··· 16 18 const testBtn = document.getElementById("test-connection"); 17 19 const resetBtn = document.getElementById("reset-defaults"); 18 20 21 + // Build defaultSettings from centralized CONFIG 19 22 const defaultSettings = { 20 - apiMode: "ollama", 21 - apiBaseUrl: "http://localhost:11434", 22 - model: "gpt-oss:20b-cloud", 23 - accentPreset: "orange", 24 - accentColor: "#F15B2F", 25 - apiKey: "", 26 - disableThinking: false, 23 + apiMode: CONFIG.API.MODE, 24 + apiBaseUrl: CONFIG.API.BASE_URL, 25 + model: CONFIG.API.MODEL, 26 + accentPreset: CONFIG.ACCENTS.DEFAULT_PRESET, 27 + accentColor: CONFIG.ACCENTS.DEFAULT_COLOR, 28 + apiKey: CONFIG.API.KEY, 29 + disableThinking: CONFIG.API.DISABLE_THINKING, 27 30 }; 28 31 29 - const ACCENT_PRESETS = { 30 - orange: "#F15B2F", 31 - blue: "#2F80ED", 32 - green: "#2FA36B", 33 - purple: "#7E57C2", 34 - teal: "#14B8A6", 35 - pink: "#EC4899", 36 - indigo: "#4F46E5", 37 - }; 32 + // Use accent presets from CONFIG 33 + const ACCENT_PRESETS = CONFIG.ACCENTS.PRESETS; 38 34 39 35 // Load settings/theme on page load 40 36 document.addEventListener("DOMContentLoaded", initializePage); 41 37 42 - const THEMES = ["light", 
"dark", "system"]; 43 - let currentTheme = "system"; 38 + // Use themes from CONFIG 39 + const THEMES = CONFIG.THEMES.OPTIONS; 40 + let currentTheme = CONFIG.THEMES.DEFAULT; 44 41 45 42 // Update URL placeholder and thinking mode visibility when mode changes 46 43 apiModeInput.addEventListener("change", () => { ··· 75 72 await chrome.storage.sync.set({ theme: currentTheme }); 76 73 }); 77 74 78 - window.matchMedia("(prefers-color-scheme: dark)").addEventListener("change", () => { 79 - if (currentTheme === "system") { 80 - applyTheme("system"); 81 - } 82 - }); 75 + window 76 + .matchMedia("(prefers-color-scheme: dark)") 77 + .addEventListener("change", () => { 78 + if (currentTheme === "system") { 79 + applyTheme("system"); 80 + } 81 + }); 83 82 84 83 // Save settings 85 84 form.addEventListener("submit", async (e) => { ··· 87 86 88 87 const settings = { 89 88 apiMode: apiModeInput.value, 90 - apiBaseUrl: apiBaseUrlInput.value.trim() || defaultSettings.apiBaseUrl, 91 - model: modelInput.value.trim() || defaultSettings.model, 89 + apiBaseUrl: apiBaseUrlInput.value.trim() || CONFIG.API.BASE_URL, 90 + model: modelInput.value.trim() || CONFIG.API.MODEL, 92 91 accentPreset: accentPresetInput.value, 93 92 accentColor: resolveAccentColor(), 94 93 apiKey: apiKeyInput.value.trim(), ··· 96 95 }; 97 96 98 97 if (!settings.accentColor) { 99 - showStatus("❌ Custom accent color must be a valid hex code (e.g. #F15B2F).", "error"); 98 + showStatus( 99 + "❌ Custom accent color must be a valid hex code (e.g. 
#F15B2F).", 100 + "error", 101 + ); 100 102 return; 101 103 } 102 104 ··· 112 114 testBtn.addEventListener("click", async () => { 113 115 const settings = { 114 116 apiMode: apiModeInput.value, 115 - apiBaseUrl: apiBaseUrlInput.value.trim() || defaultSettings.apiBaseUrl, 116 - model: modelInput.value.trim() || defaultSettings.model, 117 + apiBaseUrl: apiBaseUrlInput.value.trim() || CONFIG.API.BASE_URL, 118 + model: modelInput.value.trim() || CONFIG.API.MODEL, 117 119 apiKey: apiKeyInput.value.trim(), 118 120 disableThinking: disableThinkingInput.checked, 119 121 }; ··· 171 173 172 174 // Reset to defaults 173 175 resetBtn.addEventListener("click", () => { 174 - apiModeInput.value = defaultSettings.apiMode; 175 - apiBaseUrlInput.value = defaultSettings.apiBaseUrl; 176 - modelInput.value = defaultSettings.model; 177 - setSelectedAccentPreset(defaultSettings.accentPreset); 178 - accentCustomInput.value = defaultSettings.accentColor; 179 - applyAccentColor(defaultSettings.accentColor); 180 - apiKeyInput.value = defaultSettings.apiKey; 181 - disableThinkingInput.checked = defaultSettings.disableThinking; 176 + apiModeInput.value = CONFIG.API.MODE; 177 + apiBaseUrlInput.value = CONFIG.API.BASE_URL; 178 + modelInput.value = CONFIG.API.MODEL; 179 + setSelectedAccentPreset(CONFIG.ACCENTS.DEFAULT_PRESET); 180 + accentCustomInput.value = CONFIG.ACCENTS.DEFAULT_COLOR; 181 + applyAccentColor(CONFIG.ACCENTS.DEFAULT_COLOR); 182 + apiKeyInput.value = CONFIG.API.KEY; 183 + disableThinkingInput.checked = CONFIG.API.DISABLE_THINKING; 182 184 // Show/hide thinking mode group based on API mode 183 - thinkingModeGroup.style.display = defaultSettings.apiMode === "ollama" ? "block" : "none"; 185 + thinkingModeGroup.style.display = 186 + CONFIG.API.MODE === "ollama" ? "block" : "none"; 184 187 showStatus("Settings reset to defaults. 
Click Save to apply.", "success"); 185 188 }); 186 189 187 190 // Keyboard shortcuts link 188 - const keyboardShortcutsLink = document.getElementById("keyboard-shortcuts-link"); 191 + const keyboardShortcutsLink = document.getElementById( 192 + "keyboard-shortcuts-link", 193 + ); 189 194 keyboardShortcutsLink.addEventListener("click", (e) => { 190 195 e.preventDefault(); 191 196 // Detect Firefox vs Chrome for keyboard shortcuts URL 192 - if (typeof browser !== 'undefined') { 197 + if (typeof browser !== "undefined") { 193 198 // Firefox: Can't open about:addons programmatically, show instructions 194 - const infoDiv = document.getElementById("shortcuts-info") || createShortcutsInfo(); 195 - infoDiv.style.display = 'block'; 199 + const infoDiv = 200 + document.getElementById("shortcuts-info") || createShortcutsInfo(); 201 + infoDiv.style.display = "block"; 196 202 } else { 197 203 // Chrome/Edge: Open built-in shortcuts page 198 204 chrome.tabs.create({ url: "chrome://extensions/shortcuts" }); ··· 202 208 function createShortcutsInfo() { 203 209 const infoDiv = document.createElement("div"); 204 210 infoDiv.id = "shortcuts-info"; 205 - infoDiv.style.cssText = "margin-top:10px;padding:10px 14px;background:var(--bg-subtle);border:1px solid var(--border);border-radius:6px;font-size:12.5px;line-height:1.5;color:var(--text-secondary);"; 206 - infoDiv.innerHTML = 'ℹ️ To manage shortcuts in Firefox:<br>1. Type <code style="background:var(--bg);padding:1px 5px;border-radius:3px;font-size:11px;">about:addons</code> in the address bar<br>2. Click the gear icon (⚙️) → <strong>Manage Extension Shortcuts</strong>'; 211 + infoDiv.style.cssText = 212 + "margin-top:10px;padding:10px 14px;background:var(--bg-subtle);border:1px solid var(--border);border-radius:6px;font-size:12.5px;line-height:1.5;color:var(--text-secondary);"; 213 + infoDiv.innerHTML = 214 + 'ℹ️ To manage shortcuts in Firefox:<br>1. 
Type <code style="background:var(--bg);padding:1px 5px;border-radius:3px;font-size:11px;">about:addons</code> in the address bar<br>2. Click the gear icon (⚙️) → <strong>Manage Extension Shortcuts</strong>'; 207 215 keyboardShortcutsLink.parentNode.appendChild(infoDiv); 208 216 return infoDiv; 209 217 } ··· 223 231 apiModeInput.value = settings.apiMode; 224 232 apiBaseUrlInput.value = settings.apiBaseUrl; 225 233 modelInput.value = settings.model; 226 - let preset = settings.accentPreset || getPresetNameForColor(settings.accentColor); 234 + let preset = 235 + settings.accentPreset || getPresetNameForColor(settings.accentColor); 227 236 if (preset === "red") preset = "orange"; 228 237 setSelectedAccentPreset(preset || "custom"); 229 - accentCustomInput.value = settings.accentColor || defaultSettings.accentColor; 230 - applyAccentColor(settings.accentColor || defaultSettings.accentColor); 238 + accentCustomInput.value = 239 + settings.accentColor || CONFIG.ACCENTS.DEFAULT_COLOR; 240 + applyAccentColor(settings.accentColor || CONFIG.ACCENTS.DEFAULT_COLOR); 231 241 apiKeyInput.value = settings.apiKey; 232 242 disableThinkingInput.checked = settings.disableThinking; 233 243 // Show/hide thinking mode group based on API mode 234 - thinkingModeGroup.style.display = settings.apiMode === "ollama" ? "block" : "none"; 244 + thinkingModeGroup.style.display = 245 + settings.apiMode === "ollama" ? 
"block" : "none"; 235 246 } catch (error) { 236 247 showStatus("Error loading settings: " + error.message, "error"); 237 248 } ··· 254 265 const normalized = normalizeHexColor(color); 255 266 if (!normalized) return null; 256 267 return ( 257 - Object.entries(ACCENT_PRESETS).find(([, presetColor]) => presetColor === normalized)?.[0] || 258 - null 268 + Object.entries(ACCENT_PRESETS).find( 269 + ([, presetColor]) => presetColor === normalized, 270 + )?.[0] || null 259 271 ); 260 272 } 261 273 ··· 274 286 275 287 function darkenHexColor(hexColor, amount) { 276 288 const normalized = normalizeHexColor(hexColor); 277 - if (!normalized) return hexColor; 289 + if (!normalized) return CONFIG.ACCENTS.DEFAULT_COLOR; 278 290 const r = parseInt(normalized.slice(1, 3), 16); 279 291 const g = parseInt(normalized.slice(3, 5), 16); 280 292 const b = parseInt(normalized.slice(5, 7), 16); 281 293 const factor = 1 - amount; 282 - const toHex = (n) => Math.round(Math.max(0, Math.min(255, n * factor))).toString(16).padStart(2, "0").toUpperCase(); 294 + const toHex = (n) => 295 + Math.round(Math.max(0, Math.min(255, n * factor))) 296 + .toString(16) 297 + .padStart(2, "0") 298 + .toUpperCase(); 283 299 return `#${toHex(r)}${toHex(g)}${toHex(b)}`; 284 300 } 285 301 286 302 function applyAccentColor(color) { 287 - const normalized = normalizeHexColor(color) || defaultSettings.accentColor; 303 + const normalized = normalizeHexColor(color) || CONFIG.ACCENTS.DEFAULT_COLOR; 288 304 document.documentElement.style.setProperty("--brand", normalized); 289 - document.documentElement.style.setProperty("--brand-hover", darkenHexColor(normalized, 0.1)); 290 - document.documentElement.style.setProperty("--brand-active", darkenHexColor(normalized, 0.2)); 305 + document.documentElement.style.setProperty( 306 + "--brand-hover", 307 + darkenHexColor(normalized, 0.1), 308 + ); 309 + document.documentElement.style.setProperty( 310 + "--brand-active", 311 + darkenHexColor(normalized, 0.2), 312 + ); 291 313 
} 292 314 293 315 function applyTheme(theme) { 294 316 const root = document.documentElement; 295 317 const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches; 296 - const effectiveTheme = theme === "system" ? (prefersDark ? "dark" : "light") : theme; 318 + const effectiveTheme = 319 + theme === "system" ? (prefersDark ? "dark" : "light") : theme; 297 320 root.setAttribute("data-theme", effectiveTheme); 298 321 299 322 document

+155 -79

popup/popup.html

···

+149 -119

popup/popup.js

·························································

+2786

scripts/Readability.js

··· 1 + /* 2 + * Copyright (c) 2010 Arc90 Inc 3 + * 4 + * Licensed under the Apache License, Version 2.0 (the "License"); 5 + * you may not use this file except in compliance with the License. 6 + * You may obtain a copy of the License at 7 + * 8 + * http://www.apache.org/licenses/LICENSE-2.0 9 + * 10 + * Unless required by applicable law or agreed to in writing, software 11 + * distributed under the License is distributed on an "AS IS" BASIS, 12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 + * See the License for the specific language governing permissions and 14 + * limitations under the License. 15 + */ 16 + 17 + /* 18 + * This code is heavily based on Arc90's readability.js (1.7.1) script 19 + * available at: http://code.google.com/p/arc90labs-readability 20 + */ 21 + 22 + /** 23 + * Public constructor. 24 + * @param {HTMLDocument} doc The document to parse. 25 + * @param {Object} options The options object. 26 + */ 27 + function Readability(doc, options) { 28 + // In some older versions, people passed a URI as the first argument. Cope: 29 + if (options && options.documentElement) { 30 + doc = options; 31 + options = arguments[2]; 32 + } else if (!doc || !doc.documentElement) { 33 + throw new Error( 34 + "First argument to Readability constructor should be a document object." 
35 + ); 36 + } 37 + options = options || {}; 38 + 39 + this._doc = doc; 40 + this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__; 41 + this._articleTitle = null; 42 + this._articleByline = null; 43 + this._articleDir = null; 44 + this._articleSiteName = null; 45 + this._attempts = []; 46 + this._metadata = {}; 47 + 48 + // Configurable options 49 + this._debug = !!options.debug; 50 + this._maxElemsToParse = 51 + options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE; 52 + this._nbTopCandidates = 53 + options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; 54 + this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD; 55 + this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat( 56 + options.classesToPreserve || [] 57 + ); 58 + this._keepClasses = !!options.keepClasses; 59 + this._serializer = 60 + options.serializer || 61 + function (el) { 62 + return el.innerHTML; 63 + }; 64 + this._disableJSONLD = !!options.disableJSONLD; 65 + this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos; 66 + this._linkDensityModifier = options.linkDensityModifier || 0; 67 + 68 + // Start with all flags set 69 + this._flags = 70 + this.FLAG_STRIP_UNLIKELYS | 71 + this.FLAG_WEIGHT_CLASSES | 72 + this.FLAG_CLEAN_CONDITIONALLY; 73 + 74 + // Control whether log messages are sent to the console 75 + if (this._debug) { 76 + let logNode = function (node) { 77 + if (node.nodeType == node.TEXT_NODE) { 78 + return `${node.nodeName} ("${node.textContent}")`; 79 + } 80 + let attrPairs = Array.from(node.attributes || [], function (attr) { 81 + return `${attr.name}="${attr.value}"`; 82 + }).join(" "); 83 + return `<${node.localName} ${attrPairs}>`; 84 + }; 85 + this.log = function () { 86 + if (typeof console !== "undefined") { 87 + let args = Array.from(arguments, arg => { 88 + if (arg && arg.nodeType == this.ELEMENT_NODE) { 89 + return logNode(arg); 90 + } 91 + return arg; 92 + }); 93 + args.unshift("Reader: (Readability)"); 94 + // 
eslint-disable-next-line no-console 95 + console.log(...args); 96 + } else if (typeof dump !== "undefined") { 97 + /* global dump */ 98 + var msg = Array.prototype.map 99 + .call(arguments, function (x) { 100 + return x && x.nodeName ? logNode(x) : x; 101 + }) 102 + .join(" "); 103 + dump("Reader: (Readability) " + msg + "\n"); 104 + } 105 + }; 106 + } else { 107 + this.log = function () {}; 108 + } 109 + } 110 + 111 + Readability.prototype = { 112 + FLAG_STRIP_UNLIKELYS: 0x1, 113 + FLAG_WEIGHT_CLASSES: 0x2, 114 + FLAG_CLEAN_CONDITIONALLY: 0x4, 115 + 116 + // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType 117 + ELEMENT_NODE: 1, 118 + TEXT_NODE: 3, 119 + 120 + // Max number of nodes supported by this parser. Default: 0 (no limit) 121 + DEFAULT_MAX_ELEMS_TO_PARSE: 0, 122 + 123 + // The number of top candidates to consider when analysing how 124 + // tight the competition is among candidates. 125 + DEFAULT_N_TOP_CANDIDATES: 5, 126 + 127 + // Element tags to score by default. 128 + DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre" 129 + .toUpperCase() 130 + .split(","), 131 + 132 + // The default number of chars an article must have in order to return a result 133 + DEFAULT_CHAR_THRESHOLD: 500, 134 + 135 + // All of the regular expressions in use within readability. 136 + // Defined up here so we don't instantiate them repeatedly in loops. 137 + REGEXPS: { 138 + // NOTE: These two regular expressions are duplicated in 139 + // Readability-readerable.js. Please keep both copies in sync. 
140 + unlikelyCandidates: 141 + /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, 142 + okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, 143 + 144 + positive: 145 + /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, 146 + negative: 147 + /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget/i, 148 + extraneous: 149 + /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, 150 + byline: /byline|author|dateline|writtenby|p-author/i, 151 + replaceFonts: /<(\/?)font[^>]*>/gi, 152 + normalize: /\s{2,}/g, 153 + videos: 154 + /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i, 155 + shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i, 156 + nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, 157 + prevLink: /(prev|earl|old|new|<|«)/i, 158 + tokenize: /\W+/g, 159 + whitespace: /^\s*$/, 160 + hasContent: /\S$/, 161 + hashUrl: /^#.+/, 162 + srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g, 163 + b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i, 164 + // Commas as used in Latin, Sindhi, Chinese and various other scripts. 
165 + // see: https://en.wikipedia.org/wiki/Comma#Comma_variants 166 + commas: /\u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C/g, 167 + // See: https://schema.org/Article 168 + jsonLdArticleTypes: 169 + /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/, 170 + // used to see if a node's content matches words commonly used for ad blocks or loading indicators 171 + adWords: 172 + /^(ad(vertising|vertisement)?|pub(licité)?|werb(ung)?|广告|Реклама|Anuncio)$/iu, 173 + loadingWords: 174 + /^((loading|正在加载|Загрузка|chargement|cargando)(…|\.\.\.)?)$/iu, 175 + }, 176 + 177 + UNLIKELY_ROLES: [ 178 + "menu", 179 + "menubar", 180 + "complementary", 181 + "navigation", 182 + "alert", 183 + "alertdialog", 184 + "dialog", 185 + ], 186 + 187 + DIV_TO_P_ELEMS: new Set([ 188 + "BLOCKQUOTE", 189 + "DL", 190 + "DIV", 191 + "IMG", 192 + "OL", 193 + "P", 194 + "PRE", 195 + "TABLE", 196 + "UL", 197 + ]), 198 + 199 + ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P", "OL", "UL"], 200 + 201 + PRESENTATIONAL_ATTRIBUTES: [ 202 + "align", 203 + "background", 204 + "bgcolor", 205 + "border", 206 + "cellpadding", 207 + "cellspacing", 208 + "frame", 209 + "hspace", 210 + "rules", 211 + "style", 212 + "valign", 213 + "vspace", 214 + ], 215 + 216 + DEPRECATED_SIZE_ATTRIBUTE_ELEMS: ["TABLE", "TH", "TD", "HR", "PRE"], 217 + 218 + // The commented out elements qualify as phrasing content but tend to be 219 + // removed by readability when put into paragraphs, so we ignore them here. 
220 + PHRASING_ELEMS: [ 221 + // "CANVAS", "IFRAME", "SVG", "VIDEO", 222 + "ABBR", 223 + "AUDIO", 224 + "B", 225 + "BDO", 226 + "BR", 227 + "BUTTON", 228 + "CITE", 229 + "CODE", 230 + "DATA", 231 + "DATALIST", 232 + "DFN", 233 + "EM", 234 + "EMBED", 235 + "I", 236 + "IMG", 237 + "INPUT", 238 + "KBD", 239 + "LABEL", 240 + "MARK", 241 + "MATH", 242 + "METER", 243 + "NOSCRIPT", 244 + "OBJECT", 245 + "OUTPUT", 246 + "PROGRESS", 247 + "Q", 248 + "RUBY", 249 + "SAMP", 250 + "SCRIPT", 251 + "SELECT", 252 + "SMALL", 253 + "SPAN", 254 + "STRONG", 255 + "SUB", 256 + "SUP", 257 + "TEXTAREA", 258 + "TIME", 259 + "VAR", 260 + "WBR", 261 + ], 262 + 263 + // These are the classes that readability sets itself. 264 + CLASSES_TO_PRESERVE: ["page"], 265 + 266 + // These are the list of HTML entities that need to be escaped. 267 + HTML_ESCAPE_MAP: { 268 + lt: "<", 269 + gt: ">", 270 + amp: "&", 271 + quot: '"', 272 + apos: "'", 273 + }, 274 + 275 + /** 276 + * Run any post-process modifications to article content as necessary. 277 + * 278 + * @param Element 279 + * @return void 280 + **/ 281 + _postProcessContent(articleContent) { 282 + // Readability cannot open relative uris so we convert them to absolute uris. 283 + this._fixRelativeUris(articleContent); 284 + 285 + this._simplifyNestedElements(articleContent); 286 + 287 + if (!this._keepClasses) { 288 + // Remove classes. 289 + this._cleanClasses(articleContent); 290 + } 291 + }, 292 + 293 + /** 294 + * Iterates over a NodeList, calls `filterFn` for each node and removes node 295 + * if function returned `true`. 296 + * 297 + * If function is not passed, removes all the nodes in node list. 298 + * 299 + * @param NodeList nodeList The nodes to operate on 300 + * @param Function filterFn the function to use as a filter 301 + * @return void 302 + */ 303 + _removeNodes(nodeList, filterFn) { 304 + // Avoid ever operating on live node lists. 
305 + if (this._docJSDOMParser && nodeList._isLiveNodeList) { 306 + throw new Error("Do not pass live node lists to _removeNodes"); 307 + } 308 + for (var i = nodeList.length - 1; i >= 0; i--) { 309 + var node = nodeList[i]; 310 + var parentNode = node.parentNode; 311 + if (parentNode) { 312 + if (!filterFn || filterFn.call(this, node, i, nodeList)) { 313 + parentNode.removeChild(node); 314 + } 315 + } 316 + } 317 + }, 318 + 319 + /** 320 + * Iterates over a NodeList, and calls _setNodeTag for each node. 321 + * 322 + * @param NodeList nodeList The nodes to operate on 323 + * @param String newTagName the new tag name to use 324 + * @return void 325 + */ 326 + _replaceNodeTags(nodeList, newTagName) { 327 + // Avoid ever operating on live node lists. 328 + if (this._docJSDOMParser && nodeList._isLiveNodeList) { 329 + throw new Error("Do not pass live node lists to _replaceNodeTags"); 330 + } 331 + for (const node of nodeList) { 332 + this._setNodeTag(node, newTagName); 333 + } 334 + }, 335 + 336 + /** 337 + * Iterate over a NodeList, which doesn't natively fully implement the Array 338 + * interface. 339 + * 340 + * For convenience, the current object context is applied to the provided 341 + * iterate function. 342 + * 343 + * @param NodeList nodeList The NodeList. 344 + * @param Function fn The iterate function. 345 + * @return void 346 + */ 347 + _forEachNode(nodeList, fn) { 348 + Array.prototype.forEach.call(nodeList, fn, this); 349 + }, 350 + 351 + /** 352 + * Iterate over a NodeList, and return the first node that passes 353 + * the supplied test function 354 + * 355 + * For convenience, the current object context is applied to the provided 356 + * test function. 357 + * 358 + * @param NodeList nodeList The NodeList. 359 + * @param Function fn The test function. 
360 + * @return void 361 + */ 362 + _findNode(nodeList, fn) { 363 + return Array.prototype.find.call(nodeList, fn, this); 364 + }, 365 + 366 + /** 367 + * Iterate over a NodeList, return true if any of the provided iterate 368 + * function calls returns true, false otherwise. 369 + * 370 + * For convenience, the current object context is applied to the 371 + * provided iterate function. 372 + * 373 + * @param NodeList nodeList The NodeList. 374 + * @param Function fn The iterate function. 375 + * @return Boolean 376 + */ 377 + _someNode(nodeList, fn) { 378 + return Array.prototype.some.call(nodeList, fn, this); 379 + }, 380 + 381 + /** 382 + * Iterate over a NodeList, return true if all of the provided iterate 383 + * function calls return true, false otherwise. 384 + * 385 + * For convenience, the current object context is applied to the 386 + * provided iterate function. 387 + * 388 + * @param NodeList nodeList The NodeList. 389 + * @param Function fn The iterate function. 390 + * @return Boolean 391 + */ 392 + _everyNode(nodeList, fn) { 393 + return Array.prototype.every.call(nodeList, fn, this); 394 + }, 395 + 396 + _getAllNodesWithTag(node, tagNames) { 397 + if (node.querySelectorAll) { 398 + return node.querySelectorAll(tagNames.join(",")); 399 + } 400 + return [].concat.apply( 401 + [], 402 + tagNames.map(function (tag) { 403 + var collection = node.getElementsByTagName(tag); 404 + return Array.isArray(collection) ? collection : Array.from(collection); 405 + }) 406 + ); 407 + }, 408 + 409 + /** 410 + * Removes the class="" attribute from every element in the given 411 + * subtree, except those that match CLASSES_TO_PRESERVE and 412 + * the classesToPreserve array from the options object. 
413 + * 414 + * @param Element 415 + * @return void 416 + */ 417 + _cleanClasses(node) { 418 + var classesToPreserve = this._classesToPreserve; 419 + var className = (node.getAttribute("class") || "") 420 + .split(/\s+/) 421 + .filter(cls => classesToPreserve.includes(cls)) 422 + .join(" "); 423 + 424 + if (className) { 425 + node.setAttribute("class", className); 426 + } else { 427 + node.removeAttribute("class"); 428 + } 429 + 430 + for (node = node.firstElementChild; node; node = node.nextElementSibling) { 431 + this._cleanClasses(node); 432 + } 433 + }, 434 + 435 + /** 436 + * Tests whether a string is a URL or not. 437 + * 438 + * @param {string} str The string to test 439 + * @return {boolean} true if str is a URL, false if not 440 + */ 441 + _isUrl(str) { 442 + try { 443 + new URL(str); 444 + return true; 445 + } catch { 446 + return false; 447 + } 448 + }, 449 + /** 450 + * Converts each <a> and <img> uri in the given element to an absolute URI, 451 + * ignoring #ref URIs. 452 + * 453 + * @param Element 454 + * @return void 455 + */ 456 + _fixRelativeUris(articleContent) { 457 + var baseURI = this._doc.baseURI; 458 + var documentURI = this._doc.documentURI; 459 + function toAbsoluteURI(uri) { 460 + // Leave hash links alone if the base URI matches the document URI: 461 + if (baseURI == documentURI && uri.charAt(0) == "#") { 462 + return uri; 463 + } 464 + 465 + // Otherwise, resolve against base URI: 466 + try { 467 + return new URL(uri, baseURI).href; 468 + } catch (ex) { 469 + // Something went wrong, just return the original: 470 + } 471 + return uri; 472 + } 473 + 474 + var links = this._getAllNodesWithTag(articleContent, ["a"]); 475 + this._forEachNode(links, function (link) { 476 + var href = link.getAttribute("href"); 477 + if (href) { 478 + // Remove links with javascript: URIs, since 479 + // they won't work after scripts have been removed from the page. 
480 + if (href.indexOf("javascript:") === 0) { 481 + // if the link only contains simple text content, it can be converted to a text node 482 + if ( 483 + link.childNodes.length === 1 && 484 + link.childNodes[0].nodeType === this.TEXT_NODE 485 + ) { 486 + var text = this._doc.createTextNode(link.textContent); 487 + link.parentNode.replaceChild(text, link); 488 + } else { 489 + // if the link has multiple children, they should all be preserved 490 + var container = this._doc.createElement("span"); 491 + while (link.firstChild) { 492 + container.appendChild(link.firstChild); 493 + } 494 + link.parentNode.replaceChild(container, link); 495 + } 496 + } else { 497 + link.setAttribute("href", toAbsoluteURI(href)); 498 + } 499 + } 500 + }); 501 + 502 + var medias = this._getAllNodesWithTag(articleContent, [ 503 + "img", 504 + "picture", 505 + "figure", 506 + "video", 507 + "audio", 508 + "source", 509 + ]); 510 + 511 + this._forEachNode(medias, function (media) { 512 + var src = media.getAttribute("src"); 513 + var poster = media.getAttribute("poster"); 514 + var srcset = media.getAttribute("srcset"); 515 + 516 + if (src) { 517 + media.setAttribute("src", toAbsoluteURI(src)); 518 + } 519 + 520 + if (poster) { 521 + media.setAttribute("poster", toAbsoluteURI(poster)); 522 + } 523 + 524 + if (srcset) { 525 + var newSrcset = srcset.replace( 526 + this.REGEXPS.srcsetUrl, 527 + function (_, p1, p2, p3) { 528 + return toAbsoluteURI(p1) + (p2 || "") + p3; 529 + } 530 + ); 531 + 532 + media.setAttribute("srcset", newSrcset); 533 + } 534 + }); 535 + }, 536 + 537 + _simplifyNestedElements(articleContent) { 538 + var node = articleContent; 539 + 540 + while (node) { 541 + if ( 542 + node.parentNode && 543 + ["DIV", "SECTION"].includes(node.tagName) && 544 + !(node.id && node.id.startsWith("readability")) 545 + ) { 546 + if (this._isElementWithoutContent(node)) { 547 + node = this._removeAndGetNext(node); 548 + continue; 549 + } else if ( 550 + this._hasSingleTagInsideElement(node, 
"DIV") || 551 + this._hasSingleTagInsideElement(node, "SECTION") 552 + ) { 553 + var child = node.children[0]; 554 + for (var i = 0; i < node.attributes.length; i++) { 555 + child.setAttributeNode(node.attributes[i].cloneNode()); 556 + } 557 + node.parentNode.replaceChild(child, node); 558 + node = child; 559 + continue; 560 + } 561 + } 562 + 563 + node = this._getNextNode(node); 564 + } 565 + }, 566 + 567 + /** 568 + * Get the article title as an H1. 569 + * 570 + * @return string 571 + **/ 572 + _getArticleTitle() { 573 + var doc = this._doc; 574 + var curTitle = ""; 575 + var origTitle = ""; 576 + 577 + try { 578 + curTitle = origTitle = doc.title.trim(); 579 + 580 + // If they had an element with id "title" in their HTML 581 + if (typeof curTitle !== "string") { 582 + curTitle = origTitle = this._getInnerText( 583 + doc.getElementsByTagName("title")[0] 584 + ); 585 + } 586 + } catch (e) { 587 + /* ignore exceptions setting the title. */ 588 + } 589 + 590 + var titleHadHierarchicalSeparators = false; 591 + function wordCount(str) { 592 + return str.split(/\s+/).length; 593 + } 594 + 595 + // If there's a separator in the title, first remove the final part 596 + if (/ [\|\-\\\/>»] /.test(curTitle)) { 597 + titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle); 598 + let allSeparators = Array.from(origTitle.matchAll(/ [\|\-\\\/>»] /gi)); 599 + curTitle = origTitle.substring(0, allSeparators.pop().index); 600 + 601 + // If the resulting title is too short, remove the first part instead: 602 + if (wordCount(curTitle) < 3) { 603 + curTitle = origTitle.replace(/^[^\|\-\\\/>»]*[\|\-\\\/>»]/gi, ""); 604 + } 605 + } else if (curTitle.includes(": ")) { 606 + // Check if we have an heading containing this exact string, so we 607 + // could assume it's the full title. 
608 + var headings = this._getAllNodesWithTag(doc, ["h1", "h2"]); 609 + var trimmedTitle = curTitle.trim(); 610 + var match = this._someNode(headings, function (heading) { 611 + return heading.textContent.trim() === trimmedTitle; 612 + }); 613 + 614 + // If we don't, let's extract the title out of the original title string. 615 + if (!match) { 616 + curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1); 617 + 618 + // If the title is now too short, try the first colon instead: 619 + if (wordCount(curTitle) < 3) { 620 + curTitle = origTitle.substring(origTitle.indexOf(":") + 1); 621 + // But if we have too many words before the colon there's something weird 622 + // with the titles and the H tags so let's just use the original title instead 623 + } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) { 624 + curTitle = origTitle; 625 + } 626 + } 627 + } else if (curTitle.length > 150 || curTitle.length < 15) { 628 + var hOnes = doc.getElementsByTagName("h1"); 629 + 630 + if (hOnes.length === 1) { 631 + curTitle = this._getInnerText(hOnes[0]); 632 + } 633 + } 634 + 635 + curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " "); 636 + // If we now have 4 words or fewer as our title, and either no 637 + // 'hierarchical' separators (\, /, > or ») were found in the original 638 + // title or we decreased the number of words by more than 1 word, use 639 + // the original title. 640 + var curTitleWordCount = wordCount(curTitle); 641 + if ( 642 + curTitleWordCount <= 4 && 643 + (!titleHadHierarchicalSeparators || 644 + curTitleWordCount != 645 + wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1) 646 + ) { 647 + curTitle = origTitle; 648 + } 649 + 650 + return curTitle; 651 + }, 652 + 653 + /** 654 + * Prepare the HTML document for readability to scrape it. 655 + * This includes things like stripping javascript, CSS, and handling terrible markup. 
656 + * 657 + * @return void 658 + **/ 659 + _prepDocument() { 660 + var doc = this._doc; 661 + 662 + // Remove all style tags in head 663 + this._removeNodes(this._getAllNodesWithTag(doc, ["style"])); 664 + 665 + if (doc.body) { 666 + this._replaceBrs(doc.body); 667 + } 668 + 669 + this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN"); 670 + }, 671 + 672 + /** 673 + * Finds the next node, starting from the given node, and ignoring 674 + * whitespace in between. If the given node is an element, the same node is 675 + * returned. 676 + */ 677 + _nextNode(node) { 678 + var next = node; 679 + while ( 680 + next && 681 + next.nodeType != this.ELEMENT_NODE && 682 + this.REGEXPS.whitespace.test(next.textContent) 683 + ) { 684 + next = next.nextSibling; 685 + } 686 + return next; 687 + }, 688 + 689 + /** 690 + * Replaces 2 or more successive <br> elements with a single <p>. 691 + * Whitespace between <br> elements are ignored. For example: 692 + * <div>foo<br>bar<br> <br><br>abc</div> 693 + * will become: 694 + * <div>foo<br>bar<p>abc</p></div> 695 + */ 696 + _replaceBrs(elem) { 697 + this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function (br) { 698 + var next = br.nextSibling; 699 + 700 + // Whether 2 or more <br> elements have been found and replaced with a 701 + // <p> block. 702 + var replaced = false; 703 + 704 + // If we find a <br> chain, remove the <br>s until we hit another node 705 + // or non-whitespace. This leaves behind the first <br> in the chain 706 + // (which will be replaced with a <p> later). 707 + while ((next = this._nextNode(next)) && next.tagName == "BR") { 708 + replaced = true; 709 + var brSibling = next.nextSibling; 710 + next.remove(); 711 + next = brSibling; 712 + } 713 + 714 + // If we removed a <br> chain, replace the remaining <br> with a <p>. Add 715 + // all sibling nodes as children of the <p> until we hit another <br> 716 + // chain. 
717 + if (replaced) { 718 + var p = this._doc.createElement("p"); 719 + br.parentNode.replaceChild(p, br); 720 + 721 + next = p.nextSibling; 722 + while (next) { 723 + // If we've hit another <br><br>, we're done adding children to this <p>. 724 + if (next.tagName == "BR") { 725 + var nextElem = this._nextNode(next.nextSibling); 726 + if (nextElem && nextElem.tagName == "BR") { 727 + break; 728 + } 729 + } 730 + 731 + if (!this._isPhrasingContent(next)) { 732 + break; 733 + } 734 + 735 + // Otherwise, make this node a child of the new <p>. 736 + var sibling = next.nextSibling; 737 + p.appendChild(next); 738 + next = sibling; 739 + } 740 + 741 + while (p.lastChild && this._isWhitespace(p.lastChild)) { 742 + p.lastChild.remove(); 743 + } 744 + 745 + if (p.parentNode.tagName === "P") { 746 + this._setNodeTag(p.parentNode, "DIV"); 747 + } 748 + } 749 + }); 750 + }, 751 + 752 + _setNodeTag(node, tag) { 753 + this.log("_setNodeTag", node, tag); 754 + if (this._docJSDOMParser) { 755 + node.localName = tag.toLowerCase(); 756 + node.tagName = tag.toUpperCase(); 757 + return node; 758 + } 759 + 760 + var replacement = node.ownerDocument.createElement(tag); 761 + while (node.firstChild) { 762 + replacement.appendChild(node.firstChild); 763 + } 764 + node.parentNode.replaceChild(replacement, node); 765 + if (node.readability) { 766 + replacement.readability = node.readability; 767 + } 768 + 769 + for (var i = 0; i < node.attributes.length; i++) { 770 + replacement.setAttributeNode(node.attributes[i].cloneNode()); 771 + } 772 + return replacement; 773 + }, 774 + 775 + /** 776 + * Prepare the article node for display. Clean out any inline styles, 777 + * iframes, forms, strip extraneous <p> tags, etc. 
778 + * 779 + * @param Element 780 + * @return void 781 + **/ 782 + _prepArticle(articleContent) { 783 + this._cleanStyles(articleContent); 784 + 785 + // Check for data tables before we continue, to avoid removing items in 786 + // those tables, which will often be isolated even though they're 787 + // visually linked to other content-ful elements (text, images, etc.). 788 + this._markDataTables(articleContent); 789 + 790 + this._fixLazyImages(articleContent); 791 + 792 + // Clean out junk from the article content 793 + this._cleanConditionally(articleContent, "form"); 794 + this._cleanConditionally(articleContent, "fieldset"); 795 + this._clean(articleContent, "object"); 796 + this._clean(articleContent, "embed"); 797 + this._clean(articleContent, "footer"); 798 + this._clean(articleContent, "link"); 799 + this._clean(articleContent, "aside"); 800 + 801 + // Clean out elements with little content that have "share" in their id/class combinations from final top candidates, 802 + // which means we don't remove the top candidates even they have "share". 
803 + 804 + var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD; 805 + 806 + this._forEachNode(articleContent.children, function (topCandidate) { 807 + this._cleanMatchedNodes(topCandidate, function (node, matchString) { 808 + return ( 809 + this.REGEXPS.shareElements.test(matchString) && 810 + node.textContent.length < shareElementThreshold 811 + ); 812 + }); 813 + }); 814 + 815 + this._clean(articleContent, "iframe"); 816 + this._clean(articleContent, "input"); 817 + this._clean(articleContent, "textarea"); 818 + this._clean(articleContent, "select"); 819 + this._clean(articleContent, "button"); 820 + this._cleanHeaders(articleContent); 821 + 822 + // Do these last as the previous stuff may have removed junk 823 + // that will affect these 824 + this._cleanConditionally(articleContent, "table"); 825 + this._cleanConditionally(articleContent, "ul"); 826 + this._cleanConditionally(articleContent, "div"); 827 + 828 + // replace H1 with H2 as H1 should be only title that is displayed separately 829 + this._replaceNodeTags( 830 + this._getAllNodesWithTag(articleContent, ["h1"]), 831 + "h2" 832 + ); 833 + 834 + // Remove extra paragraphs 835 + this._removeNodes( 836 + this._getAllNodesWithTag(articleContent, ["p"]), 837 + function (paragraph) { 838 + // At this point, nasty iframes have been removed; only embedded video 839 + // ones remain. 
840 + var contentElementCount = this._getAllNodesWithTag(paragraph, [ 841 + "img", 842 + "embed", 843 + "object", 844 + "iframe", 845 + ]).length; 846 + return ( 847 + contentElementCount === 0 && !this._getInnerText(paragraph, false) 848 + ); 849 + } 850 + ); 851 + 852 + this._forEachNode( 853 + this._getAllNodesWithTag(articleContent, ["br"]), 854 + function (br) { 855 + var next = this._nextNode(br.nextSibling); 856 + if (next && next.tagName == "P") { 857 + br.remove(); 858 + } 859 + } 860 + ); 861 + 862 + // Remove single-cell tables 863 + this._forEachNode( 864 + this._getAllNodesWithTag(articleContent, ["table"]), 865 + function (table) { 866 + var tbody = this._hasSingleTagInsideElement(table, "TBODY") 867 + ? table.firstElementChild 868 + : table; 869 + if (this._hasSingleTagInsideElement(tbody, "TR")) { 870 + var row = tbody.firstElementChild; 871 + if (this._hasSingleTagInsideElement(row, "TD")) { 872 + var cell = row.firstElementChild; 873 + cell = this._setNodeTag( 874 + cell, 875 + this._everyNode(cell.childNodes, this._isPhrasingContent) 876 + ? "P" 877 + : "DIV" 878 + ); 879 + table.parentNode.replaceChild(cell, table); 880 + } 881 + } 882 + } 883 + ); 884 + }, 885 + 886 + /** 887 + * Initialize a node with the readability object. Also checks the 888 + * className/id for special names to add to its score. 
889 + * 890 + * @param Element 891 + * @return void 892 + **/ 893 + _initializeNode(node) { 894 + node.readability = { contentScore: 0 }; 895 + 896 + switch (node.tagName) { 897 + case "DIV": 898 + node.readability.contentScore += 5; 899 + break; 900 + 901 + case "PRE": 902 + case "TD": 903 + case "BLOCKQUOTE": 904 + node.readability.contentScore += 3; 905 + break; 906 + 907 + case "ADDRESS": 908 + case "OL": 909 + case "UL": 910 + case "DL": 911 + case "DD": 912 + case "DT": 913 + case "LI": 914 + case "FORM": 915 + node.readability.contentScore -= 3; 916 + break; 917 + 918 + case "H1": 919 + case "H2": 920 + case "H3": 921 + case "H4": 922 + case "H5": 923 + case "H6": 924 + case "TH": 925 + node.readability.contentScore -= 5; 926 + break; 927 + } 928 + 929 + node.readability.contentScore += this._getClassWeight(node); 930 + }, 931 + 932 + _removeAndGetNext(node) { 933 + var nextNode = this._getNextNode(node, true); 934 + node.remove(); 935 + return nextNode; 936 + }, 937 + 938 + /** 939 + * Traverse the DOM from node to node, starting at the node passed in. 940 + * Pass true for the second parameter to indicate this node itself 941 + * (and its kids) are going away, and we want the next node over. 942 + * 943 + * Calling this in a loop will traverse the DOM depth-first. 944 + * 945 + * @param {Element} node 946 + * @param {boolean} ignoreSelfAndKids 947 + * @return {Element} 948 + */ 949 + _getNextNode(node, ignoreSelfAndKids) { 950 + // First check for kids if those aren't being ignored 951 + if (!ignoreSelfAndKids && node.firstElementChild) { 952 + return node.firstElementChild; 953 + } 954 + // Then for siblings... 955 + if (node.nextElementSibling) { 956 + return node.nextElementSibling; 957 + } 958 + // And finally, move up the parent chain *and* find a sibling 959 + // (because this is depth-first traversal, we will have already 960 + // seen the parent nodes themselves). 
961 + do { 962 + node = node.parentNode; 963 + } while (node && !node.nextElementSibling); 964 + return node && node.nextElementSibling; 965 + }, 966 + 967 + // compares second text to first one 968 + // 1 = same text, 0 = completely different text 969 + // works the way that it splits both texts into words and then finds words that are unique in second text 970 + // the result is given by the lower length of unique parts 971 + _textSimilarity(textA, textB) { 972 + var tokensA = textA 973 + .toLowerCase() 974 + .split(this.REGEXPS.tokenize) 975 + .filter(Boolean); 976 + var tokensB = textB 977 + .toLowerCase() 978 + .split(this.REGEXPS.tokenize) 979 + .filter(Boolean); 980 + if (!tokensA.length || !tokensB.length) { 981 + return 0; 982 + } 983 + var uniqTokensB = tokensB.filter(token => !tokensA.includes(token)); 984 + var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length; 985 + return 1 - distanceB; 986 + }, 987 + 988 + /** 989 + * Checks whether an element node contains a valid byline 990 + * 991 + * @param node {Element} 992 + * @param matchString {string} 993 + * @return boolean 994 + */ 995 + _isValidByline(node, matchString) { 996 + var rel = node.getAttribute("rel"); 997 + var itemprop = node.getAttribute("itemprop"); 998 + var bylineLength = node.textContent.trim().length; 999 + 1000 + return ( 1001 + (rel === "author" || 1002 + (itemprop && itemprop.includes("author")) || 1003 + this.REGEXPS.byline.test(matchString)) && 1004 + !!bylineLength && 1005 + bylineLength < 100 1006 + ); 1007 + }, 1008 + 1009 + _getNodeAncestors(node, maxDepth) { 1010 + maxDepth = maxDepth || 0; 1011 + var i = 0, 1012 + ancestors = []; 1013 + while (node.parentNode) { 1014 + ancestors.push(node.parentNode); 1015 + if (maxDepth && ++i === maxDepth) { 1016 + break; 1017 + } 1018 + node = node.parentNode; 1019 + } 1020 + return ancestors; 1021 + }, 1022 + 1023 + /*** 1024 + * grabArticle - Using a variety of metrics (content score, classname, element types), find 
the content that is 1025 + * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. 1026 + * 1027 + * @param page a document to run upon. Needs to be a full document, complete with body. 1028 + * @return Element 1029 + **/ 1030 + /* eslint-disable-next-line complexity */ 1031 + _grabArticle(page) { 1032 + this.log("**** grabArticle ****"); 1033 + var doc = this._doc; 1034 + var isPaging = page !== null; 1035 + page = page ? page : this._doc.body; 1036 + 1037 + // We can't grab an article if we don't have a page! 1038 + if (!page) { 1039 + this.log("No body found in document. Abort."); 1040 + return null; 1041 + } 1042 + 1043 + var pageCacheHtml = page.innerHTML; 1044 + 1045 + while (true) { 1046 + this.log("Starting grabArticle loop"); 1047 + var stripUnlikelyCandidates = this._flagIsActive( 1048 + this.FLAG_STRIP_UNLIKELYS 1049 + ); 1050 + 1051 + // First, node prepping. Trash nodes that look cruddy (like ones with the 1052 + // class name "comment", etc), and turn divs into P tags where they have been 1053 + // used inappropriately (as in, where they contain no other block level elements.) 
1054 + var elementsToScore = []; 1055 + var node = this._doc.documentElement; 1056 + 1057 + let shouldRemoveTitleHeader = true; 1058 + 1059 + while (node) { 1060 + if (node.tagName === "HTML") { 1061 + this._articleLang = node.getAttribute("lang"); 1062 + } 1063 + 1064 + var matchString = node.className + " " + node.id; 1065 + 1066 + if (!this._isProbablyVisible(node)) { 1067 + this.log("Removing hidden node - " + matchString); 1068 + node = this._removeAndGetNext(node); 1069 + continue; 1070 + } 1071 + 1072 + // User is not able to see elements applied with both "aria-modal = true" and "role = dialog" 1073 + if ( 1074 + node.getAttribute("aria-modal") == "true" && 1075 + node.getAttribute("role") == "dialog" 1076 + ) { 1077 + node = this._removeAndGetNext(node); 1078 + continue; 1079 + } 1080 + 1081 + // If we don't have a byline yet check to see if this node is a byline; if it is store the byline and remove the node. 1082 + if ( 1083 + !this._articleByline && 1084 + !this._metadata.byline && 1085 + this._isValidByline(node, matchString) 1086 + ) { 1087 + // Find child node matching [itemprop="name"] and use that if it exists for a more accurate author name byline 1088 + var endOfSearchMarkerNode = this._getNextNode(node, true); 1089 + var next = this._getNextNode(node); 1090 + var itemPropNameNode = null; 1091 + while (next && next != endOfSearchMarkerNode) { 1092 + var itemprop = next.getAttribute("itemprop"); 1093 + if (itemprop && itemprop.includes("name")) { 1094 + itemPropNameNode = next; 1095 + break; 1096 + } else { 1097 + next = this._getNextNode(next); 1098 + } 1099 + } 1100 + this._articleByline = (itemPropNameNode ?? 
node).textContent.trim(); 1101 + node = this._removeAndGetNext(node); 1102 + continue; 1103 + } 1104 + 1105 + if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) { 1106 + this.log( 1107 + "Removing header: ", 1108 + node.textContent.trim(), 1109 + this._articleTitle.trim() 1110 + ); 1111 + shouldRemoveTitleHeader = false; 1112 + node = this._removeAndGetNext(node); 1113 + continue; 1114 + } 1115 + 1116 + // Remove unlikely candidates 1117 + if (stripUnlikelyCandidates) { 1118 + if ( 1119 + this.REGEXPS.unlikelyCandidates.test(matchString) && 1120 + !this.REGEXPS.okMaybeItsACandidate.test(matchString) && 1121 + !this._hasAncestorTag(node, "table") && 1122 + !this._hasAncestorTag(node, "code") && 1123 + node.tagName !== "BODY" && 1124 + node.tagName !== "A" 1125 + ) { 1126 + this.log("Removing unlikely candidate - " + matchString); 1127 + node = this._removeAndGetNext(node); 1128 + continue; 1129 + } 1130 + 1131 + if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) { 1132 + this.log( 1133 + "Removing content with role " + 1134 + node.getAttribute("role") + 1135 + " - " + 1136 + matchString 1137 + ); 1138 + node = this._removeAndGetNext(node); 1139 + continue; 1140 + } 1141 + } 1142 + 1143 + // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). 
1144 + if ( 1145 + (node.tagName === "DIV" || 1146 + node.tagName === "SECTION" || 1147 + node.tagName === "HEADER" || 1148 + node.tagName === "H1" || 1149 + node.tagName === "H2" || 1150 + node.tagName === "H3" || 1151 + node.tagName === "H4" || 1152 + node.tagName === "H5" || 1153 + node.tagName === "H6") && 1154 + this._isElementWithoutContent(node) 1155 + ) { 1156 + node = this._removeAndGetNext(node); 1157 + continue; 1158 + } 1159 + 1160 + if (this.DEFAULT_TAGS_TO_SCORE.includes(node.tagName)) { 1161 + elementsToScore.push(node); 1162 + } 1163 + 1164 + // Turn all divs that don't have children block level elements into p's 1165 + if (node.tagName === "DIV") { 1166 + // Put phrasing content into paragraphs. 1167 + var p = null; 1168 + var childNode = node.firstChild; 1169 + while (childNode) { 1170 + var nextSibling = childNode.nextSibling; 1171 + if (this._isPhrasingContent(childNode)) { 1172 + if (p !== null) { 1173 + p.appendChild(childNode); 1174 + } else if (!this._isWhitespace(childNode)) { 1175 + p = doc.createElement("p"); 1176 + node.replaceChild(p, childNode); 1177 + p.appendChild(childNode); 1178 + } 1179 + } else if (p !== null) { 1180 + while (p.lastChild && this._isWhitespace(p.lastChild)) { 1181 + p.lastChild.remove(); 1182 + } 1183 + p = null; 1184 + } 1185 + childNode = nextSibling; 1186 + } 1187 + 1188 + // Sites like http://mobile.slate.com encloses each paragraph with a DIV 1189 + // element. DIVs with only a P element inside and no text content can be 1190 + // safely converted into plain P elements to avoid confusing the scoring 1191 + // algorithm with DIVs with are, in practice, paragraphs. 
1192 + if ( 1193 + this._hasSingleTagInsideElement(node, "P") && 1194 + this._getLinkDensity(node) < 0.25 1195 + ) { 1196 + var newNode = node.children[0]; 1197 + node.parentNode.replaceChild(newNode, node); 1198 + node = newNode; 1199 + elementsToScore.push(node); 1200 + } else if (!this._hasChildBlockElement(node)) { 1201 + node = this._setNodeTag(node, "P"); 1202 + elementsToScore.push(node); 1203 + } 1204 + } 1205 + node = this._getNextNode(node); 1206 + } 1207 + 1208 + /** 1209 + * Loop through all paragraphs, and assign a score to them based on how content-y they look. 1210 + * Then add their score to their parent node. 1211 + * 1212 + * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. 1213 + **/ 1214 + var candidates = []; 1215 + this._forEachNode(elementsToScore, function (elementToScore) { 1216 + if ( 1217 + !elementToScore.parentNode || 1218 + typeof elementToScore.parentNode.tagName === "undefined" 1219 + ) { 1220 + return; 1221 + } 1222 + 1223 + // If this paragraph is less than 25 characters, don't even count it. 1224 + var innerText = this._getInnerText(elementToScore); 1225 + if (innerText.length < 25) { 1226 + return; 1227 + } 1228 + 1229 + // Exclude nodes with no ancestor. 1230 + var ancestors = this._getNodeAncestors(elementToScore, 5); 1231 + if (ancestors.length === 0) { 1232 + return; 1233 + } 1234 + 1235 + var contentScore = 0; 1236 + 1237 + // Add a point for the paragraph itself as a base. 1238 + contentScore += 1; 1239 + 1240 + // Add points for any commas within this paragraph. 1241 + contentScore += innerText.split(this.REGEXPS.commas).length; 1242 + 1243 + // For every 100 characters in this paragraph, add another point. Up to 3 points. 1244 + contentScore += Math.min(Math.floor(innerText.length / 100), 3); 1245 + 1246 + // Initialize and score ancestors. 
1247 + this._forEachNode(ancestors, function (ancestor, level) { 1248 + if ( 1249 + !ancestor.tagName || 1250 + !ancestor.parentNode || 1251 + typeof ancestor.parentNode.tagName === "undefined" 1252 + ) { 1253 + return; 1254 + } 1255 + 1256 + if (typeof ancestor.readability === "undefined") { 1257 + this._initializeNode(ancestor); 1258 + candidates.push(ancestor); 1259 + } 1260 + 1261 + // Node score divider: 1262 + // - parent: 1 (no division) 1263 + // - grandparent: 2 1264 + // - great grandparent+: ancestor level * 3 1265 + if (level === 0) { 1266 + var scoreDivider = 1; 1267 + } else if (level === 1) { 1268 + scoreDivider = 2; 1269 + } else { 1270 + scoreDivider = level * 3; 1271 + } 1272 + ancestor.readability.contentScore += contentScore / scoreDivider; 1273 + }); 1274 + }); 1275 + 1276 + // After we've calculated scores, loop through all of the possible 1277 + // candidate nodes we found and find the one with the highest score. 1278 + var topCandidates = []; 1279 + for (var c = 0, cl = candidates.length; c < cl; c += 1) { 1280 + var candidate = candidates[c]; 1281 + 1282 + // Scale the final candidates score based on link density. Good content 1283 + // should have a relatively small link density (5% or less) and be mostly 1284 + // unaffected by this operation. 
1285 + var candidateScore = 1286 + candidate.readability.contentScore * 1287 + (1 - this._getLinkDensity(candidate)); 1288 + candidate.readability.contentScore = candidateScore; 1289 + 1290 + this.log("Candidate:", candidate, "with score " + candidateScore); 1291 + 1292 + for (var t = 0; t < this._nbTopCandidates; t++) { 1293 + var aTopCandidate = topCandidates[t]; 1294 + 1295 + if ( 1296 + !aTopCandidate || 1297 + candidateScore > aTopCandidate.readability.contentScore 1298 + ) { 1299 + topCandidates.splice(t, 0, candidate); 1300 + if (topCandidates.length > this._nbTopCandidates) { 1301 + topCandidates.pop(); 1302 + } 1303 + break; 1304 + } 1305 + } 1306 + } 1307 + 1308 + var topCandidate = topCandidates[0] || null; 1309 + var neededToCreateTopCandidate = false; 1310 + var parentOfTopCandidate; 1311 + 1312 + // If we still have no top candidate, just use the body as a last resort. 1313 + // We also have to copy the body node so it is something we can modify. 1314 + if (topCandidate === null || topCandidate.tagName === "BODY") { 1315 + // Move all of the page's children into topCandidate 1316 + topCandidate = doc.createElement("DIV"); 1317 + neededToCreateTopCandidate = true; 1318 + // Move everything (not just elements, also text nodes etc.) into the container 1319 + // so we even include text directly in the body: 1320 + while (page.firstChild) { 1321 + this.log("Moving child out:", page.firstChild); 1322 + topCandidate.appendChild(page.firstChild); 1323 + } 1324 + 1325 + page.appendChild(topCandidate); 1326 + 1327 + this._initializeNode(topCandidate); 1328 + } else if (topCandidate) { 1329 + // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array 1330 + // and whose scores are quite closed with current `topCandidate` node. 
1331 + var alternativeCandidateAncestors = []; 1332 + for (var i = 1; i < topCandidates.length; i++) { 1333 + if ( 1334 + topCandidates[i].readability.contentScore / 1335 + topCandidate.readability.contentScore >= 1336 + 0.75 1337 + ) { 1338 + alternativeCandidateAncestors.push( 1339 + this._getNodeAncestors(topCandidates[i]) 1340 + ); 1341 + } 1342 + } 1343 + var MINIMUM_TOPCANDIDATES = 3; 1344 + if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) { 1345 + parentOfTopCandidate = topCandidate.parentNode; 1346 + while (parentOfTopCandidate.tagName !== "BODY") { 1347 + var listsContainingThisAncestor = 0; 1348 + for ( 1349 + var ancestorIndex = 0; 1350 + ancestorIndex < alternativeCandidateAncestors.length && 1351 + listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; 1352 + ancestorIndex++ 1353 + ) { 1354 + listsContainingThisAncestor += Number( 1355 + alternativeCandidateAncestors[ancestorIndex].includes( 1356 + parentOfTopCandidate 1357 + ) 1358 + ); 1359 + } 1360 + if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) { 1361 + topCandidate = parentOfTopCandidate; 1362 + break; 1363 + } 1364 + parentOfTopCandidate = parentOfTopCandidate.parentNode; 1365 + } 1366 + } 1367 + if (!topCandidate.readability) { 1368 + this._initializeNode(topCandidate); 1369 + } 1370 + 1371 + // Because of our bonus system, parents of candidates might have scores 1372 + // themselves. They get half of the node. There won't be nodes with higher 1373 + // scores than our topCandidate, but if we see the score going *up* in the first 1374 + // few steps up the tree, that's a decent sign that there might be more content 1375 + // lurking in other places that we want to unify in. The sibling stuff 1376 + // below does some of that - but only if we've looked high enough up the DOM 1377 + // tree. 1378 + parentOfTopCandidate = topCandidate.parentNode; 1379 + var lastScore = topCandidate.readability.contentScore; 1380 + // The scores shouldn't get too low. 
1381 + var scoreThreshold = lastScore / 3; 1382 + while (parentOfTopCandidate.tagName !== "BODY") { 1383 + if (!parentOfTopCandidate.readability) { 1384 + parentOfTopCandidate = parentOfTopCandidate.parentNode; 1385 + continue; 1386 + } 1387 + var parentScore = parentOfTopCandidate.readability.contentScore; 1388 + if (parentScore < scoreThreshold) { 1389 + break; 1390 + } 1391 + if (parentScore > lastScore) { 1392 + // Alright! We found a better parent to use. 1393 + topCandidate = parentOfTopCandidate; 1394 + break; 1395 + } 1396 + lastScore = parentOfTopCandidate.readability.contentScore; 1397 + parentOfTopCandidate = parentOfTopCandidate.parentNode; 1398 + } 1399 + 1400 + // If the top candidate is the only child, use parent instead. This will help sibling 1401 + // joining logic when adjacent content is actually located in parent's sibling node. 1402 + parentOfTopCandidate = topCandidate.parentNode; 1403 + while ( 1404 + parentOfTopCandidate.tagName != "BODY" && 1405 + parentOfTopCandidate.children.length == 1 1406 + ) { 1407 + topCandidate = parentOfTopCandidate; 1408 + parentOfTopCandidate = topCandidate.parentNode; 1409 + } 1410 + if (!topCandidate.readability) { 1411 + this._initializeNode(topCandidate); 1412 + } 1413 + } 1414 + 1415 + // Now that we have the top candidate, look through its siblings for content 1416 + // that might also be related. Things like preambles, content split by ads 1417 + // that we removed, etc. 1418 + var articleContent = doc.createElement("DIV"); 1419 + if (isPaging) { 1420 + articleContent.id = "readability-content"; 1421 + } 1422 + 1423 + var siblingScoreThreshold = Math.max( 1424 + 10, 1425 + topCandidate.readability.contentScore * 0.2 1426 + ); 1427 + // Keep potential top candidate's parent node to try to get text direction of it later. 
1428 + parentOfTopCandidate = topCandidate.parentNode; 1429 + var siblings = parentOfTopCandidate.children; 1430 + 1431 + for (var s = 0, sl = siblings.length; s < sl; s++) { 1432 + var sibling = siblings[s]; 1433 + var append = false; 1434 + 1435 + this.log( 1436 + "Looking at sibling node:", 1437 + sibling, 1438 + sibling.readability 1439 + ? "with score " + sibling.readability.contentScore 1440 + : "" 1441 + ); 1442 + this.log( 1443 + "Sibling has score", 1444 + sibling.readability ? sibling.readability.contentScore : "Unknown" 1445 + ); 1446 + 1447 + if (sibling === topCandidate) { 1448 + append = true; 1449 + } else { 1450 + var contentBonus = 0; 1451 + 1452 + // Give a bonus if sibling nodes and top candidates have the example same classname 1453 + if ( 1454 + sibling.className === topCandidate.className && 1455 + topCandidate.className !== "" 1456 + ) { 1457 + contentBonus += topCandidate.readability.contentScore * 0.2; 1458 + } 1459 + 1460 + if ( 1461 + sibling.readability && 1462 + sibling.readability.contentScore + contentBonus >= 1463 + siblingScoreThreshold 1464 + ) { 1465 + append = true; 1466 + } else if (sibling.nodeName === "P") { 1467 + var linkDensity = this._getLinkDensity(sibling); 1468 + var nodeContent = this._getInnerText(sibling); 1469 + var nodeLength = nodeContent.length; 1470 + 1471 + if (nodeLength > 80 && linkDensity < 0.25) { 1472 + append = true; 1473 + } else if ( 1474 + nodeLength < 80 && 1475 + nodeLength > 0 && 1476 + linkDensity === 0 && 1477 + nodeContent.search(/\.( |$)/) !== -1 1478 + ) { 1479 + append = true; 1480 + } 1481 + } 1482 + } 1483 + 1484 + if (append) { 1485 + this.log("Appending node:", sibling); 1486 + 1487 + if (!this.ALTER_TO_DIV_EXCEPTIONS.includes(sibling.nodeName)) { 1488 + // We have a node that isn't a common block level element, like a form or td tag. 1489 + // Turn it into a div so it doesn't get filtered out later by accident. 
1490 + this.log("Altering sibling:", sibling, "to div."); 1491 + 1492 + sibling = this._setNodeTag(sibling, "DIV"); 1493 + } 1494 + 1495 + articleContent.appendChild(sibling); 1496 + // Fetch children again to make it compatible 1497 + // with DOM parsers without live collection support. 1498 + siblings = parentOfTopCandidate.children; 1499 + // siblings is a reference to the children array, and 1500 + // sibling is removed from the array when we call appendChild(). 1501 + // As a result, we must revisit this index since the nodes 1502 + // have been shifted. 1503 + s -= 1; 1504 + sl -= 1; 1505 + } 1506 + } 1507 + 1508 + if (this._debug) { 1509 + this.log("Article content pre-prep: " + articleContent.innerHTML); 1510 + } 1511 + // So we have all of the content that we need. Now we clean it up for presentation. 1512 + this._prepArticle(articleContent); 1513 + if (this._debug) { 1514 + this.log("Article content post-prep: " + articleContent.innerHTML); 1515 + } 1516 + 1517 + if (neededToCreateTopCandidate) { 1518 + // We already created a fake div thing, and there wouldn't have been any siblings left 1519 + // for the previous loop, so there's no point trying to create a new div, and then 1520 + // move all the children over. Just assign IDs and class names here. No need to append 1521 + // because that already happened anyway. 1522 + topCandidate.id = "readability-page-1"; 1523 + topCandidate.className = "page"; 1524 + } else { 1525 + var div = doc.createElement("DIV"); 1526 + div.id = "readability-page-1"; 1527 + div.className = "page"; 1528 + while (articleContent.firstChild) { 1529 + div.appendChild(articleContent.firstChild); 1530 + } 1531 + articleContent.appendChild(div); 1532 + } 1533 + 1534 + if (this._debug) { 1535 + this.log("Article content after paging: " + articleContent.innerHTML); 1536 + } 1537 + 1538 + var parseSuccessful = true; 1539 + 1540 + // Now that we've gone through the full algorithm, check to see if 1541 + // we got any meaningful content. 
If we didn't, we may need to re-run 1542 + // grabArticle with different flags set. This gives us a higher likelihood of 1543 + // finding the content, and the sieve approach gives us a higher likelihood of 1544 + // finding the -right- content. 1545 + var textLength = this._getInnerText(articleContent, true).length; 1546 + if (textLength < this._charThreshold) { 1547 + parseSuccessful = false; 1548 + // eslint-disable-next-line no-unsanitized/property 1549 + page.innerHTML = pageCacheHtml; 1550 + 1551 + this._attempts.push({ 1552 + articleContent, 1553 + textLength, 1554 + }); 1555 + 1556 + if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { 1557 + this._removeFlag(this.FLAG_STRIP_UNLIKELYS); 1558 + } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { 1559 + this._removeFlag(this.FLAG_WEIGHT_CLASSES); 1560 + } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { 1561 + this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY); 1562 + } else { 1563 + // No luck after removing flags, just return the longest text we found during the different loops 1564 + this._attempts.sort(function (a, b) { 1565 + return b.textLength - a.textLength; 1566 + }); 1567 + 1568 + // But first check if we actually have something 1569 + if (!this._attempts[0].textLength) { 1570 + return null; 1571 + } 1572 + 1573 + articleContent = this._attempts[0].articleContent; 1574 + parseSuccessful = true; 1575 + } 1576 + } 1577 + 1578 + if (parseSuccessful) { 1579 + // Find out text direction from ancestors of final top candidate. 
1580 + var ancestors = [parentOfTopCandidate, topCandidate].concat( 1581 + this._getNodeAncestors(parentOfTopCandidate) 1582 + ); 1583 + this._someNode(ancestors, function (ancestor) { 1584 + if (!ancestor.tagName) { 1585 + return false; 1586 + } 1587 + var articleDir = ancestor.getAttribute("dir"); 1588 + if (articleDir) { 1589 + this._articleDir = articleDir; 1590 + return true; 1591 + } 1592 + return false; 1593 + }); 1594 + return articleContent; 1595 + } 1596 + } 1597 + }, 1598 + 1599 + /** 1600 + * Converts some of the common HTML entities in string to their corresponding characters. 1601 + * 1602 + * @param str {string} - a string to unescape. 1603 + * @return string without HTML entity. 1604 + */ 1605 + _unescapeHtmlEntities(str) { 1606 + if (!str) { 1607 + return str; 1608 + } 1609 + 1610 + var htmlEscapeMap = this.HTML_ESCAPE_MAP; 1611 + return str 1612 + .replace(/&(quot|amp|apos|lt|gt);/g, function (_, tag) { 1613 + return htmlEscapeMap[tag]; 1614 + }) 1615 + .replace(/&#(?:x([0-9a-f]+)|([0-9]+));/gi, function (_, hex, numStr) { 1616 + var num = parseInt(hex || numStr, hex ? 16 : 10); 1617 + 1618 + // these character references are replaced by a conforming HTML parser 1619 + if (num == 0 || num > 0x10ffff || (num >= 0xd800 && num <= 0xdfff)) { 1620 + num = 0xfffd; 1621 + } 1622 + 1623 + return String.fromCodePoint(num); 1624 + }); 1625 + }, 1626 + 1627 + /** 1628 + * Try to extract metadata from JSON-LD object. 1629 + * For now, only Schema.org objects of type Article or its subtypes are supported. 
1630 + * @return Object with any metadata that could be extracted (possibly none) 1631 + */ 1632 + _getJSONLD(doc) { 1633 + var scripts = this._getAllNodesWithTag(doc, ["script"]); 1634 + 1635 + var metadata; 1636 + 1637 + this._forEachNode(scripts, function (jsonLdElement) { 1638 + if ( 1639 + !metadata && 1640 + jsonLdElement.getAttribute("type") === "application/ld+json" 1641 + ) { 1642 + try { 1643 + // Strip CDATA markers if present 1644 + var content = jsonLdElement.textContent.replace( 1645 + /^\s*<!\[CDATA\[|\]\]>\s*$/g, 1646 + "" 1647 + ); 1648 + var parsed = JSON.parse(content); 1649 + 1650 + if (Array.isArray(parsed)) { 1651 + parsed = parsed.find(it => { 1652 + return ( 1653 + it["@type"] && 1654 + it["@type"].match(this.REGEXPS.jsonLdArticleTypes) 1655 + ); 1656 + }); 1657 + if (!parsed) { 1658 + return; 1659 + } 1660 + } 1661 + 1662 + var schemaDotOrgRegex = /^https?\:\/\/schema\.org\/?$/; 1663 + var matches = 1664 + (typeof parsed["@context"] === "string" && 1665 + parsed["@context"].match(schemaDotOrgRegex)) || 1666 + (typeof parsed["@context"] === "object" && 1667 + typeof parsed["@context"]["@vocab"] == "string" && 1668 + parsed["@context"]["@vocab"].match(schemaDotOrgRegex)); 1669 + 1670 + if (!matches) { 1671 + return; 1672 + } 1673 + 1674 + if (!parsed["@type"] && Array.isArray(parsed["@graph"])) { 1675 + parsed = parsed["@graph"].find(it => { 1676 + return (it["@type"] || "").match(this.REGEXPS.jsonLdArticleTypes); 1677 + }); 1678 + } 1679 + 1680 + if ( 1681 + !parsed || 1682 + !parsed["@type"] || 1683 + !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes) 1684 + ) { 1685 + return; 1686 + } 1687 + 1688 + metadata = {}; 1689 + 1690 + if ( 1691 + typeof parsed.name === "string" && 1692 + typeof parsed.headline === "string" && 1693 + parsed.name !== parsed.headline 1694 + ) { 1695 + // we have both name and headline element in the JSON-LD. 
They should both be the same but some websites like aktualne.cz 1696 + // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either 1697 + // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default. 1698 + 1699 + var title = this._getArticleTitle(); 1700 + var nameMatches = this._textSimilarity(parsed.name, title) > 0.75; 1701 + var headlineMatches = 1702 + this._textSimilarity(parsed.headline, title) > 0.75; 1703 + 1704 + if (headlineMatches && !nameMatches) { 1705 + metadata.title = parsed.headline; 1706 + } else { 1707 + metadata.title = parsed.name; 1708 + } 1709 + } else if (typeof parsed.name === "string") { 1710 + metadata.title = parsed.name.trim(); 1711 + } else if (typeof parsed.headline === "string") { 1712 + metadata.title = parsed.headline.trim(); 1713 + } 1714 + if (parsed.author) { 1715 + if (typeof parsed.author.name === "string") { 1716 + metadata.byline = parsed.author.name.trim(); 1717 + } else if ( 1718 + Array.isArray(parsed.author) && 1719 + parsed.author[0] && 1720 + typeof parsed.author[0].name === "string" 1721 + ) { 1722 + metadata.byline = parsed.author 1723 + .filter(function (author) { 1724 + return author && typeof author.name === "string"; 1725 + }) 1726 + .map(function (author) { 1727 + return author.name.trim(); 1728 + }) 1729 + .join(", "); 1730 + } 1731 + } 1732 + if (typeof parsed.description === "string") { 1733 + metadata.excerpt = parsed.description.trim(); 1734 + } 1735 + if (parsed.publisher && typeof parsed.publisher.name === "string") { 1736 + metadata.siteName = parsed.publisher.name.trim(); 1737 + } 1738 + if (typeof parsed.datePublished === "string") { 1739 + metadata.datePublished = parsed.datePublished.trim(); 1740 + } 1741 + } catch (err) { 1742 + this.log(err.message); 1743 + } 1744 + } 1745 + }); 1746 + return metadata ? 
metadata : {}; 1747 + }, 1748 + 1749 + /** 1750 + * Attempts to get excerpt and byline metadata for the article. 1751 + * 1752 + * @param {Object} jsonld — object containing any metadata that 1753 + * could be extracted from JSON-LD object. 1754 + * 1755 + * @return Object with optional "excerpt" and "byline" properties 1756 + */ 1757 + _getArticleMetadata(jsonld) { 1758 + var metadata = {}; 1759 + var values = {}; 1760 + var metaElements = this._doc.getElementsByTagName("meta"); 1761 + 1762 + // property is a space-separated list of values 1763 + var propertyPattern = 1764 + /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi; 1765 + 1766 + // name is a single value 1767 + var namePattern = 1768 + /^\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i; 1769 + 1770 + // Find description tags. 1771 + this._forEachNode(metaElements, function (element) { 1772 + var elementName = element.getAttribute("name"); 1773 + var elementProperty = element.getAttribute("property"); 1774 + var content = element.getAttribute("content"); 1775 + if (!content) { 1776 + return; 1777 + } 1778 + var matches = null; 1779 + var name = null; 1780 + 1781 + if (elementProperty) { 1782 + matches = elementProperty.match(propertyPattern); 1783 + if (matches) { 1784 + // Convert to lowercase, and remove any whitespace 1785 + // so we can match below. 1786 + name = matches[0].toLowerCase().replace(/\s/g, ""); 1787 + // multiple authors 1788 + values[name] = content.trim(); 1789 + } 1790 + } 1791 + if (!matches && elementName && namePattern.test(elementName)) { 1792 + name = elementName; 1793 + if (content) { 1794 + // Convert to lowercase, remove any whitespace, and convert dots 1795 + // to colons so we can match below. 
1796 + name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":"); 1797 + values[name] = content.trim(); 1798 + } 1799 + } 1800 + }); 1801 + 1802 + // get title 1803 + metadata.title = 1804 + jsonld.title || 1805 + values["dc:title"] || 1806 + values["dcterm:title"] || 1807 + values["og:title"] || 1808 + values["weibo:article:title"] || 1809 + values["weibo:webpage:title"] || 1810 + values.title || 1811 + values["twitter:title"] || 1812 + values["parsely-title"]; 1813 + 1814 + if (!metadata.title) { 1815 + metadata.title = this._getArticleTitle(); 1816 + } 1817 + 1818 + const articleAuthor = 1819 + typeof values["article:author"] === "string" && 1820 + !this._isUrl(values["article:author"]) 1821 + ? values["article:author"] 1822 + : undefined; 1823 + 1824 + // get author 1825 + metadata.byline = 1826 + jsonld.byline || 1827 + values["dc:creator"] || 1828 + values["dcterm:creator"] || 1829 + values.author || 1830 + values["parsely-author"] || 1831 + articleAuthor; 1832 + 1833 + // get description 1834 + metadata.excerpt = 1835 + jsonld.excerpt || 1836 + values["dc:description"] || 1837 + values["dcterm:description"] || 1838 + values["og:description"] || 1839 + values["weibo:article:description"] || 1840 + values["weibo:webpage:description"] || 1841 + values.description || 1842 + values["twitter:description"]; 1843 + 1844 + // get site name 1845 + metadata.siteName = jsonld.siteName || values["og:site_name"]; 1846 + 1847 + // get article published time 1848 + metadata.publishedTime = 1849 + jsonld.datePublished || 1850 + values["article:published_time"] || 1851 + values["parsely-pub-date"] || 1852 + null; 1853 + 1854 + // in many sites the meta value is escaped with HTML entities, 1855 + // so here we need to unescape it 1856 + metadata.title = this._unescapeHtmlEntities(metadata.title); 1857 + metadata.byline = this._unescapeHtmlEntities(metadata.byline); 1858 + metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt); 1859 + metadata.siteName = 
this._unescapeHtmlEntities(metadata.siteName); 1860 + metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime); 1861 + 1862 + return metadata; 1863 + }, 1864 + 1865 + /** 1866 + * Check if node is image, or if node contains exactly only one image 1867 + * whether as a direct child or as its descendants. 1868 + * 1869 + * @param Element 1870 + **/ 1871 + _isSingleImage(node) { 1872 + while (node) { 1873 + if (node.tagName === "IMG") { 1874 + return true; 1875 + } 1876 + if (node.children.length !== 1 || node.textContent.trim() !== "") { 1877 + return false; 1878 + } 1879 + node = node.children[0]; 1880 + } 1881 + return false; 1882 + }, 1883 + 1884 + /** 1885 + * Find all <noscript> that are located after <img> nodes, and which contain only one 1886 + * <img> element. Replace the first image with the image from inside the <noscript> tag, 1887 + * and remove the <noscript> tag. This improves the quality of the images we use on 1888 + * some sites (e.g. Medium). 1889 + * 1890 + * @param Element 1891 + **/ 1892 + _unwrapNoscriptImages(doc) { 1893 + // Find img without source or attributes that might contains image, and remove it. 1894 + // This is done to prevent a placeholder img is replaced by img from noscript in next step. 
1895 + var imgs = Array.from(doc.getElementsByTagName("img")); 1896 + this._forEachNode(imgs, function (img) { 1897 + for (var i = 0; i < img.attributes.length; i++) { 1898 + var attr = img.attributes[i]; 1899 + switch (attr.name) { 1900 + case "src": 1901 + case "srcset": 1902 + case "data-src": 1903 + case "data-srcset": 1904 + return; 1905 + } 1906 + 1907 + if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) { 1908 + return; 1909 + } 1910 + } 1911 + 1912 + img.remove(); 1913 + }); 1914 + 1915 + // Next find noscript and try to extract its image 1916 + var noscripts = Array.from(doc.getElementsByTagName("noscript")); 1917 + this._forEachNode(noscripts, function (noscript) { 1918 + // Parse content of noscript and make sure it only contains image 1919 + if (!this._isSingleImage(noscript)) { 1920 + return; 1921 + } 1922 + var tmp = doc.createElement("div"); 1923 + // We're running in the document context, and using unmodified 1924 + // document contents, so doing this should be safe. 1925 + // (Also we heavily discourage people from allowing script to 1926 + // run at all in this document...) 1927 + // eslint-disable-next-line no-unsanitized/property 1928 + tmp.innerHTML = noscript.innerHTML; 1929 + 1930 + // If noscript has previous sibling and it only contains image, 1931 + // replace it with noscript content. However we also keep old 1932 + // attributes that might contains image. 
1933 + var prevElement = noscript.previousElementSibling; 1934 + if (prevElement && this._isSingleImage(prevElement)) { 1935 + var prevImg = prevElement; 1936 + if (prevImg.tagName !== "IMG") { 1937 + prevImg = prevElement.getElementsByTagName("img")[0]; 1938 + } 1939 + 1940 + var newImg = tmp.getElementsByTagName("img")[0]; 1941 + for (var i = 0; i < prevImg.attributes.length; i++) { 1942 + var attr = prevImg.attributes[i]; 1943 + if (attr.value === "") { 1944 + continue; 1945 + } 1946 + 1947 + if ( 1948 + attr.name === "src" || 1949 + attr.name === "srcset" || 1950 + /\.(jpg|jpeg|png|webp)/i.test(attr.value) 1951 + ) { 1952 + if (newImg.getAttribute(attr.name) === attr.value) { 1953 + continue; 1954 + } 1955 + 1956 + var attrName = attr.name; 1957 + if (newImg.hasAttribute(attrName)) { 1958 + attrName = "data-old-" + attrName; 1959 + } 1960 + 1961 + newImg.setAttribute(attrName, attr.value); 1962 + } 1963 + } 1964 + 1965 + noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement); 1966 + } 1967 + }); 1968 + }, 1969 + 1970 + /** 1971 + * Removes script tags from the document. 1972 + * 1973 + * @param Element 1974 + **/ 1975 + _removeScripts(doc) { 1976 + this._removeNodes(this._getAllNodesWithTag(doc, ["script", "noscript"])); 1977 + }, 1978 + 1979 + /** 1980 + * Check if this node has only whitespace and a single element with given tag 1981 + * Returns false if the DIV node contains non-empty text nodes 1982 + * or if it contains no element with given tag or more than 1 element. 
1983 + * 1984 + * @param Element 1985 + * @param string tag of child element 1986 + **/ 1987 + _hasSingleTagInsideElement(element, tag) { 1988 + // There should be exactly 1 element child with given tag 1989 + if (element.children.length != 1 || element.children[0].tagName !== tag) { 1990 + return false; 1991 + } 1992 + 1993 + // And there should be no text nodes with real content 1994 + return !this._someNode(element.childNodes, function (node) { 1995 + return ( 1996 + node.nodeType === this.TEXT_NODE && 1997 + this.REGEXPS.hasContent.test(node.textContent) 1998 + ); 1999 + }); 2000 + }, 2001 + 2002 + _isElementWithoutContent(node) { 2003 + return ( 2004 + node.nodeType === this.ELEMENT_NODE && 2005 + !node.textContent.trim().length && 2006 + (!node.children.length || 2007 + node.children.length == 2008 + node.getElementsByTagName("br").length + 2009 + node.getElementsByTagName("hr").length) 2010 + ); 2011 + }, 2012 + 2013 + /** 2014 + * Determine whether element has any children block level elements. 2015 + * 2016 + * @param Element 2017 + */ 2018 + _hasChildBlockElement(element) { 2019 + return this._someNode(element.childNodes, function (node) { 2020 + return ( 2021 + this.DIV_TO_P_ELEMS.has(node.tagName) || 2022 + this._hasChildBlockElement(node) 2023 + ); 2024 + }); 2025 + }, 2026 + 2027 + /*** 2028 + * Determine if a node qualifies as phrasing content. 
2029 + * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content 2030 + **/ 2031 + _isPhrasingContent(node) { 2032 + return ( 2033 + node.nodeType === this.TEXT_NODE || 2034 + this.PHRASING_ELEMS.includes(node.tagName) || 2035 + ((node.tagName === "A" || 2036 + node.tagName === "DEL" || 2037 + node.tagName === "INS") && 2038 + this._everyNode(node.childNodes, this._isPhrasingContent)) 2039 + ); 2040 + }, 2041 + 2042 + _isWhitespace(node) { 2043 + return ( 2044 + (node.nodeType === this.TEXT_NODE && 2045 + node.textContent.trim().length === 0) || 2046 + (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR") 2047 + ); 2048 + }, 2049 + 2050 + /** 2051 + * Get the inner text of a node - cross browser compatibly. 2052 + * This also strips out any excess whitespace to be found. 2053 + * 2054 + * @param Element 2055 + * @param Boolean normalizeSpaces (default: true) 2056 + * @return string 2057 + **/ 2058 + _getInnerText(e, normalizeSpaces) { 2059 + normalizeSpaces = 2060 + typeof normalizeSpaces === "undefined" ? true : normalizeSpaces; 2061 + var textContent = e.textContent.trim(); 2062 + 2063 + if (normalizeSpaces) { 2064 + return textContent.replace(this.REGEXPS.normalize, " "); 2065 + } 2066 + return textContent; 2067 + }, 2068 + 2069 + /** 2070 + * Get the number of times a string s appears in the node e. 2071 + * 2072 + * @param Element 2073 + * @param string - what to split on. Default is "," 2074 + * @return number (integer) 2075 + **/ 2076 + _getCharCount(e, s) { 2077 + s = s || ","; 2078 + return this._getInnerText(e).split(s).length - 1; 2079 + }, 2080 + 2081 + /** 2082 + * Remove the style attribute on every e and under. 2083 + * TODO: Test if getElementsByTagName(*) is faster. 
2084 + * 2085 + * @param Element 2086 + * @return void 2087 + **/ 2088 + _cleanStyles(e) { 2089 + if (!e || e.tagName.toLowerCase() === "svg") { 2090 + return; 2091 + } 2092 + 2093 + // Remove `style` and deprecated presentational attributes 2094 + for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) { 2095 + e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]); 2096 + } 2097 + 2098 + if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.includes(e.tagName)) { 2099 + e.removeAttribute("width"); 2100 + e.removeAttribute("height"); 2101 + } 2102 + 2103 + var cur = e.firstElementChild; 2104 + while (cur !== null) { 2105 + this._cleanStyles(cur); 2106 + cur = cur.nextElementSibling; 2107 + } 2108 + }, 2109 + 2110 + /** 2111 + * Get the density of links as a percentage of the content 2112 + * This is the amount of text that is inside a link divided by the total text in the node. 2113 + * 2114 + * @param Element 2115 + * @return number (float) 2116 + **/ 2117 + _getLinkDensity(element) { 2118 + var textLength = this._getInnerText(element).length; 2119 + if (textLength === 0) { 2120 + return 0; 2121 + } 2122 + 2123 + var linkLength = 0; 2124 + 2125 + // XXX implement _reduceNodeList? 2126 + this._forEachNode(element.getElementsByTagName("a"), function (linkNode) { 2127 + var href = linkNode.getAttribute("href"); 2128 + var coefficient = href && this.REGEXPS.hashUrl.test(href) ? 0.3 : 1; 2129 + linkLength += this._getInnerText(linkNode).length * coefficient; 2130 + }); 2131 + 2132 + return linkLength / textLength; 2133 + }, 2134 + 2135 + /** 2136 + * Get an elements class/id weight. Uses regular expressions to tell if this 2137 + * element looks good or bad. 
2138 + * 2139 + * @param Element 2140 + * @return number (Integer) 2141 + **/ 2142 + _getClassWeight(e) { 2143 + if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { 2144 + return 0; 2145 + } 2146 + 2147 + var weight = 0; 2148 + 2149 + // Look for a special classname 2150 + if (typeof e.className === "string" && e.className !== "") { 2151 + if (this.REGEXPS.negative.test(e.className)) { 2152 + weight -= 25; 2153 + } 2154 + 2155 + if (this.REGEXPS.positive.test(e.className)) { 2156 + weight += 25; 2157 + } 2158 + } 2159 + 2160 + // Look for a special ID 2161 + if (typeof e.id === "string" && e.id !== "") { 2162 + if (this.REGEXPS.negative.test(e.id)) { 2163 + weight -= 25; 2164 + } 2165 + 2166 + if (this.REGEXPS.positive.test(e.id)) { 2167 + weight += 25; 2168 + } 2169 + } 2170 + 2171 + return weight; 2172 + }, 2173 + 2174 + /** 2175 + * Clean a node of all elements of type "tag". 2176 + * (Unless it's a youtube/vimeo video. People love movies.) 2177 + * 2178 + * @param Element 2179 + * @param string tag to clean 2180 + * @return void 2181 + **/ 2182 + _clean(e, tag) { 2183 + var isEmbed = ["object", "embed", "iframe"].includes(tag); 2184 + 2185 + this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (element) { 2186 + // Allow youtube and vimeo videos through as people usually want to see those. 2187 + if (isEmbed) { 2188 + // First, check the elements attributes to see if any of them contain youtube or vimeo 2189 + for (var i = 0; i < element.attributes.length; i++) { 2190 + if (this._allowedVideoRegex.test(element.attributes[i].value)) { 2191 + return false; 2192 + } 2193 + } 2194 + 2195 + // For embed with <object> tag, check inner HTML as well. 
2196 + if ( 2197 + element.tagName === "object" && 2198 + this._allowedVideoRegex.test(element.innerHTML) 2199 + ) { 2200 + return false; 2201 + } 2202 + } 2203 + 2204 + return true; 2205 + }); 2206 + }, 2207 + 2208 + /** 2209 + * Check if a given node has one of its ancestor tag name matching the 2210 + * provided one. 2211 + * @param HTMLElement node 2212 + * @param String tagName 2213 + * @param Number maxDepth 2214 + * @param Function filterFn a filter to invoke to determine whether this node 'counts' 2215 + * @return Boolean 2216 + */ 2217 + _hasAncestorTag(node, tagName, maxDepth, filterFn) { 2218 + maxDepth = maxDepth || 3; 2219 + tagName = tagName.toUpperCase(); 2220 + var depth = 0; 2221 + while (node.parentNode) { 2222 + if (maxDepth > 0 && depth > maxDepth) { 2223 + return false; 2224 + } 2225 + if ( 2226 + node.parentNode.tagName === tagName && 2227 + (!filterFn || filterFn(node.parentNode)) 2228 + ) { 2229 + return true; 2230 + } 2231 + node = node.parentNode; 2232 + depth++; 2233 + } 2234 + return false; 2235 + }, 2236 + 2237 + /** 2238 + * Return an object indicating how many rows and columns this table has. 
2239 + */ 2240 + _getRowAndColumnCount(table) { 2241 + var rows = 0; 2242 + var columns = 0; 2243 + var trs = table.getElementsByTagName("tr"); 2244 + for (var i = 0; i < trs.length; i++) { 2245 + var rowspan = trs[i].getAttribute("rowspan") || 0; 2246 + if (rowspan) { 2247 + rowspan = parseInt(rowspan, 10); 2248 + } 2249 + rows += rowspan || 1; 2250 + 2251 + // Now look for column-related info 2252 + var columnsInThisRow = 0; 2253 + var cells = trs[i].getElementsByTagName("td"); 2254 + for (var j = 0; j < cells.length; j++) { 2255 + var colspan = cells[j].getAttribute("colspan") || 0; 2256 + if (colspan) { 2257 + colspan = parseInt(colspan, 10); 2258 + } 2259 + columnsInThisRow += colspan || 1; 2260 + } 2261 + columns = Math.max(columns, columnsInThisRow); 2262 + } 2263 + return { rows, columns }; 2264 + }, 2265 + 2266 + /** 2267 + * Look for 'data' (as opposed to 'layout') tables, for which we use 2268 + * similar checks as 2269 + * https://searchfox.org/mozilla-central/rev/f82d5c549f046cb64ce5602bfd894b7ae807c8f8/accessible/generic/TableAccessible.cpp#19 2270 + */ 2271 + _markDataTables(root) { 2272 + var tables = root.getElementsByTagName("table"); 2273 + for (var i = 0; i < tables.length; i++) { 2274 + var table = tables[i]; 2275 + var role = table.getAttribute("role"); 2276 + if (role == "presentation") { 2277 + table._readabilityDataTable = false; 2278 + continue; 2279 + } 2280 + var datatable = table.getAttribute("datatable"); 2281 + if (datatable == "0") { 2282 + table._readabilityDataTable = false; 2283 + continue; 2284 + } 2285 + var summary = table.getAttribute("summary"); 2286 + if (summary) { 2287 + table._readabilityDataTable = true; 2288 + continue; 2289 + } 2290 + 2291 + var caption = table.getElementsByTagName("caption")[0]; 2292 + if (caption && caption.childNodes.length) { 2293 + table._readabilityDataTable = true; 2294 + continue; 2295 + } 2296 + 2297 + // If the table has a descendant with any of these tags, consider a data table: 2298 + var 
dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"]; 2299 + var descendantExists = function (tag) { 2300 + return !!table.getElementsByTagName(tag)[0]; 2301 + }; 2302 + if (dataTableDescendants.some(descendantExists)) { 2303 + this.log("Data table because found data-y descendant"); 2304 + table._readabilityDataTable = true; 2305 + continue; 2306 + } 2307 + 2308 + // Nested tables indicate a layout table: 2309 + if (table.getElementsByTagName("table")[0]) { 2310 + table._readabilityDataTable = false; 2311 + continue; 2312 + } 2313 + 2314 + var sizeInfo = this._getRowAndColumnCount(table); 2315 + 2316 + if (sizeInfo.columns == 1 || sizeInfo.rows == 1) { 2317 + // single colum/row tables are commonly used for page layout purposes. 2318 + table._readabilityDataTable = false; 2319 + continue; 2320 + } 2321 + 2322 + if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) { 2323 + table._readabilityDataTable = true; 2324 + continue; 2325 + } 2326 + // Now just go by size entirely: 2327 + table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10; 2328 + } 2329 + }, 2330 + 2331 + /* convert images and figures that have properties like data-src into images that can be loaded without JS */ 2332 + _fixLazyImages(root) { 2333 + this._forEachNode( 2334 + this._getAllNodesWithTag(root, ["img", "picture", "figure"]), 2335 + function (elem) { 2336 + // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute. 2337 + // So, here we check if the data uri is too short, just might as well remove it. 2338 + if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) { 2339 + // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes. 2340 + var parts = this.REGEXPS.b64DataUrl.exec(elem.src); 2341 + if (parts[1] === "image/svg+xml") { 2342 + return; 2343 + } 2344 + 2345 + // Make sure this element has other attributes which contains image. 
2346 + // If it doesn't, then this src is important and shouldn't be removed. 2347 + var srcCouldBeRemoved = false; 2348 + for (var i = 0; i < elem.attributes.length; i++) { 2349 + var attr = elem.attributes[i]; 2350 + if (attr.name === "src") { 2351 + continue; 2352 + } 2353 + 2354 + if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) { 2355 + srcCouldBeRemoved = true; 2356 + break; 2357 + } 2358 + } 2359 + 2360 + // Here we assume if image is less than 100 bytes (or 133 after encoded to base64) 2361 + // it will be too small, therefore it might be placeholder image. 2362 + if (srcCouldBeRemoved) { 2363 + var b64starts = parts[0].length; 2364 + var b64length = elem.src.length - b64starts; 2365 + if (b64length < 133) { 2366 + elem.removeAttribute("src"); 2367 + } 2368 + } 2369 + } 2370 + 2371 + // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580 2372 + if ( 2373 + (elem.src || (elem.srcset && elem.srcset != "null")) && 2374 + !elem.className.toLowerCase().includes("lazy") 2375 + ) { 2376 + return; 2377 + } 2378 + 2379 + for (var j = 0; j < elem.attributes.length; j++) { 2380 + attr = elem.attributes[j]; 2381 + if ( 2382 + attr.name === "src" || 2383 + attr.name === "srcset" || 2384 + attr.name === "alt" 2385 + ) { 2386 + continue; 2387 + } 2388 + var copyTo = null; 2389 + if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) { 2390 + copyTo = "srcset"; 2391 + } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) { 2392 + copyTo = "src"; 2393 + } 2394 + if (copyTo) { 2395 + //if this is an img or picture, set the attribute directly 2396 + if (elem.tagName === "IMG" || elem.tagName === "PICTURE") { 2397 + elem.setAttribute(copyTo, attr.value); 2398 + } else if ( 2399 + elem.tagName === "FIGURE" && 2400 + !this._getAllNodesWithTag(elem, ["img", "picture"]).length 2401 + ) { 2402 + //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure 2403 + //see the nytimes-3 testcase for 
an example 2404 + var img = this._doc.createElement("img"); 2405 + img.setAttribute(copyTo, attr.value); 2406 + elem.appendChild(img); 2407 + } 2408 + } 2409 + } 2410 + } 2411 + ); 2412 + }, 2413 + 2414 + _getTextDensity(e, tags) { 2415 + var textLength = this._getInnerText(e, true).length; 2416 + if (textLength === 0) { 2417 + return 0; 2418 + } 2419 + var childrenLength = 0; 2420 + var children = this._getAllNodesWithTag(e, tags); 2421 + this._forEachNode( 2422 + children, 2423 + child => (childrenLength += this._getInnerText(child, true).length) 2424 + ); 2425 + return childrenLength / textLength; 2426 + }, 2427 + 2428 + /** 2429 + * Clean an element of all tags of type "tag" if they look fishy. 2430 + * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. 2431 + * 2432 + * @return void 2433 + **/ 2434 + _cleanConditionally(e, tag) { 2435 + if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { 2436 + return; 2437 + } 2438 + 2439 + // Gather counts for other typical elements embedded within. 2440 + // Traverse backwards so we can remove nodes at the same time 2441 + // without effecting the traversal. 2442 + // 2443 + // TODO: Consider taking into account original contentScore here. 2444 + this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (node) { 2445 + // First check if this node IS data table, in which case don't remove it. 
2446 + var isDataTable = function (t) { 2447 + return t._readabilityDataTable; 2448 + }; 2449 + 2450 + var isList = tag === "ul" || tag === "ol"; 2451 + if (!isList) { 2452 + var listLength = 0; 2453 + var listNodes = this._getAllNodesWithTag(node, ["ul", "ol"]); 2454 + this._forEachNode( 2455 + listNodes, 2456 + list => (listLength += this._getInnerText(list).length) 2457 + ); 2458 + isList = listLength / this._getInnerText(node).length > 0.9; 2459 + } 2460 + 2461 + if (tag === "table" && isDataTable(node)) { 2462 + return false; 2463 + } 2464 + 2465 + // Next check if we're inside a data table, in which case don't remove it as well. 2466 + if (this._hasAncestorTag(node, "table", -1, isDataTable)) { 2467 + return false; 2468 + } 2469 + 2470 + if (this._hasAncestorTag(node, "code")) { 2471 + return false; 2472 + } 2473 + 2474 + // keep element if it has a data tables 2475 + if ( 2476 + [...node.getElementsByTagName("table")].some( 2477 + tbl => tbl._readabilityDataTable 2478 + ) 2479 + ) { 2480 + return false; 2481 + } 2482 + 2483 + var weight = this._getClassWeight(node); 2484 + 2485 + this.log("Cleaning Conditionally", node); 2486 + 2487 + var contentScore = 0; 2488 + 2489 + if (weight + contentScore < 0) { 2490 + return true; 2491 + } 2492 + 2493 + if (this._getCharCount(node, ",") < 10) { 2494 + // If there are not very many commas, and the number of 2495 + // non-paragraph elements is more than paragraphs or other 2496 + // ominous signs, remove the element. 
2497 + var p = node.getElementsByTagName("p").length; 2498 + var img = node.getElementsByTagName("img").length; 2499 + var li = node.getElementsByTagName("li").length - 100; 2500 + var input = node.getElementsByTagName("input").length; 2501 + var headingDensity = this._getTextDensity(node, [ 2502 + "h1", 2503 + "h2", 2504 + "h3", 2505 + "h4", 2506 + "h5", 2507 + "h6", 2508 + ]); 2509 + 2510 + var embedCount = 0; 2511 + var embeds = this._getAllNodesWithTag(node, [ 2512 + "object", 2513 + "embed", 2514 + "iframe", 2515 + ]); 2516 + 2517 + for (var i = 0; i < embeds.length; i++) { 2518 + // If this embed has attribute that matches video regex, don't delete it. 2519 + for (var j = 0; j < embeds[i].attributes.length; j++) { 2520 + if (this._allowedVideoRegex.test(embeds[i].attributes[j].value)) { 2521 + return false; 2522 + } 2523 + } 2524 + 2525 + // For embed with <object> tag, check inner HTML as well. 2526 + if ( 2527 + embeds[i].tagName === "object" && 2528 + this._allowedVideoRegex.test(embeds[i].innerHTML) 2529 + ) { 2530 + return false; 2531 + } 2532 + 2533 + embedCount++; 2534 + } 2535 + 2536 + var innerText = this._getInnerText(node); 2537 + 2538 + // toss any node whose inner text contains nothing but suspicious words 2539 + if ( 2540 + this.REGEXPS.adWords.test(innerText) || 2541 + this.REGEXPS.loadingWords.test(innerText) 2542 + ) { 2543 + return true; 2544 + } 2545 + 2546 + var contentLength = innerText.length; 2547 + var linkDensity = this._getLinkDensity(node); 2548 + var textishTags = ["SPAN", "LI", "TD"].concat( 2549 + Array.from(this.DIV_TO_P_ELEMS) 2550 + ); 2551 + var textDensity = this._getTextDensity(node, textishTags); 2552 + var isFigureChild = this._hasAncestorTag(node, "figure"); 2553 + 2554 + // apply shadiness checks, then check for exceptions 2555 + const shouldRemoveNode = () => { 2556 + const errs = []; 2557 + if (!isFigureChild && img > 1 && p / img < 0.5) { 2558 + errs.push(`Bad p to img ratio (img=${img}, p=${p})`); 2559 + } 2560 + if 
(!isList && li > p) { 2561 + errs.push(`Too many li's outside of a list. (li=${li} > p=${p})`); 2562 + } 2563 + if (input > Math.floor(p / 3)) { 2564 + errs.push(`Too many inputs per p. (input=${input}, p=${p})`); 2565 + } 2566 + if ( 2567 + !isList && 2568 + !isFigureChild && 2569 + headingDensity < 0.9 && 2570 + contentLength < 25 && 2571 + (img === 0 || img > 2) && 2572 + linkDensity > 0 2573 + ) { 2574 + errs.push( 2575 + `Suspiciously short. (headingDensity=${headingDensity}, img=${img}, linkDensity=${linkDensity})` 2576 + ); 2577 + } 2578 + if ( 2579 + !isList && 2580 + weight < 25 && 2581 + linkDensity > 0.2 + this._linkDensityModifier 2582 + ) { 2583 + errs.push( 2584 + `Low weight and a little linky. (linkDensity=${linkDensity})` 2585 + ); 2586 + } 2587 + if (weight >= 25 && linkDensity > 0.5 + this._linkDensityModifier) { 2588 + errs.push( 2589 + `High weight and mostly links. (linkDensity=${linkDensity})` 2590 + ); 2591 + } 2592 + if ((embedCount === 1 && contentLength < 75) || embedCount > 1) { 2593 + errs.push( 2594 + `Suspicious embed. (embedCount=${embedCount}, contentLength=${contentLength})` 2595 + ); 2596 + } 2597 + if (img === 0 && textDensity === 0) { 2598 + errs.push( 2599 + `No useful content. 
(img=${img}, textDensity=${textDensity})` 2600 + ); 2601 + } 2602 + 2603 + if (errs.length) { 2604 + this.log("Checks failed", errs); 2605 + return true; 2606 + } 2607 + 2608 + return false; 2609 + }; 2610 + 2611 + var haveToRemove = shouldRemoveNode(); 2612 + 2613 + // Allow simple lists of images to remain in pages 2614 + if (isList && haveToRemove) { 2615 + for (var x = 0; x < node.children.length; x++) { 2616 + let child = node.children[x]; 2617 + // Don't filter in lists with li's that contain more than one child 2618 + if (child.children.length > 1) { 2619 + return haveToRemove; 2620 + } 2621 + } 2622 + let li_count = node.getElementsByTagName("li").length; 2623 + // Only allow the list to remain if every li contains an image 2624 + if (img == li_count) { 2625 + return false; 2626 + } 2627 + } 2628 + return haveToRemove; 2629 + } 2630 + return false; 2631 + }); 2632 + }, 2633 + 2634 + /** 2635 + * Clean out elements that match the specified conditions 2636 + * 2637 + * @param Element 2638 + * @param Function determines whether a node should be removed 2639 + * @return void 2640 + **/ 2641 + _cleanMatchedNodes(e, filter) { 2642 + var endOfSearchMarkerNode = this._getNextNode(e, true); 2643 + var next = this._getNextNode(e); 2644 + while (next && next != endOfSearchMarkerNode) { 2645 + if (filter.call(this, next, next.className + " " + next.id)) { 2646 + next = this._removeAndGetNext(next); 2647 + } else { 2648 + next = this._getNextNode(next); 2649 + } 2650 + } 2651 + }, 2652 + 2653 + /** 2654 + * Clean out spurious headers from an Element. 
2655 + * 2656 + * @param Element 2657 + * @return void 2658 + **/ 2659 + _cleanHeaders(e) { 2660 + let headingNodes = this._getAllNodesWithTag(e, ["h1", "h2"]); 2661 + this._removeNodes(headingNodes, function (node) { 2662 + let shouldRemove = this._getClassWeight(node) < 0; 2663 + if (shouldRemove) { 2664 + this.log("Removing header with low class weight:", node); 2665 + } 2666 + return shouldRemove; 2667 + }); 2668 + }, 2669 + 2670 + /** 2671 + * Check if this node is an H1 or H2 element whose content is mostly 2672 + * the same as the article title. 2673 + * 2674 + * @param Element the node to check. 2675 + * @return boolean indicating whether this is a title-like header. 2676 + */ 2677 + _headerDuplicatesTitle(node) { 2678 + if (node.tagName != "H1" && node.tagName != "H2") { 2679 + return false; 2680 + } 2681 + var heading = this._getInnerText(node, false); 2682 + this.log("Evaluating similarity of header:", heading, this._articleTitle); 2683 + return this._textSimilarity(this._articleTitle, heading) > 0.75; 2684 + }, 2685 + 2686 + _flagIsActive(flag) { 2687 + return (this._flags & flag) > 0; 2688 + }, 2689 + 2690 + _removeFlag(flag) { 2691 + this._flags = this._flags & ~flag; 2692 + }, 2693 + 2694 + _isProbablyVisible(node) { 2695 + // Have to null-check node.style and node.className.includes to deal with SVG and MathML nodes. 2696 + return ( 2697 + (!node.style || node.style.display != "none") && 2698 + (!node.style || node.style.visibility != "hidden") && 2699 + !node.hasAttribute("hidden") && 2700 + //check for "fallback-image" so that wikimedia math images are displayed 2701 + (!node.hasAttribute("aria-hidden") || 2702 + node.getAttribute("aria-hidden") != "true" || 2703 + (node.className && 2704 + node.className.includes && 2705 + node.className.includes("fallback-image"))) 2706 + ); 2707 + }, 2708 + 2709 + /** 2710 + * Runs readability. 2711 + * 2712 + * Workflow: 2713 + * 1. Prep the document by removing script tags, css, etc. 2714 + * 2. 
Build readability's DOM tree. 2715 + * 3. Grab the article content from the current dom tree. 2716 + * 4. Replace the current DOM tree with the new one. 2717 + * 5. Read peacefully. 2718 + * 2719 + * @return void 2720 + **/ 2721 + parse() { 2722 + // Avoid parsing too large documents, as per configuration option 2723 + if (this._maxElemsToParse > 0) { 2724 + var numTags = this._doc.getElementsByTagName("*").length; 2725 + if (numTags > this._maxElemsToParse) { 2726 + throw new Error( 2727 + "Aborting parsing document; " + numTags + " elements found" 2728 + ); 2729 + } 2730 + } 2731 + 2732 + // Unwrap image from noscript 2733 + this._unwrapNoscriptImages(this._doc); 2734 + 2735 + // Extract JSON-LD metadata before removing scripts 2736 + var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc); 2737 + 2738 + // Remove script tags from the document. 2739 + this._removeScripts(this._doc); 2740 + 2741 + this._prepDocument(); 2742 + 2743 + var metadata = this._getArticleMetadata(jsonLd); 2744 + this._metadata = metadata; 2745 + this._articleTitle = metadata.title; 2746 + 2747 + var articleContent = this._grabArticle(); 2748 + if (!articleContent) { 2749 + return null; 2750 + } 2751 + 2752 + this.log("Grabbed: " + articleContent.innerHTML); 2753 + 2754 + this._postProcessContent(articleContent); 2755 + 2756 + // If we haven't found an excerpt in the article's metadata, use the article's 2757 + // first paragraph as the excerpt. This is used for displaying a preview of 2758 + // the article's content. 
2759 + if (!metadata.excerpt) { 2760 + var paragraphs = articleContent.getElementsByTagName("p"); 2761 + if (paragraphs.length) { 2762 + metadata.excerpt = paragraphs[0].textContent.trim(); 2763 + } 2764 + } 2765 + 2766 + var textContent = articleContent.textContent; 2767 + return { 2768 + title: this._articleTitle, 2769 + byline: metadata.byline || this._articleByline, 2770 + dir: this._articleDir, 2771 + lang: this._articleLang, 2772 + content: this._serializer(articleContent), 2773 + textContent, 2774 + length: textContent.length, 2775 + excerpt: metadata.excerpt, 2776 + siteName: metadata.siteName || this._articleSiteName, 2777 + publishedTime: metadata.publishedTime, 2778 + }; 2779 + }, 2780 + }; 2781 + 2782 + if (typeof module === "object") { 2783 + /* eslint-disable-next-line no-redeclare */ 2784 + /* global module */ 2785 + module.exports = Readability; 2786 + }

+120 -74

scripts/background.js

··· 1 1 // Background script - handles API communication 2 + // Uses centralized CONFIG from config.js 2 3 3 - // Cache key prefixes (must match popup.js) 4 - const QUICK_SUMMARY_CACHE_PREFIX = "quick_summary_cache_"; 5 - const DETAILED_SUMMARY_CACHE_PREFIX = "detailed_summary_cache_"; 6 - const CONTENT_CACHE_PREFIX = "content_cache_"; 7 - const CHAT_CACHE_PREFIX = "chat_cache_"; 4 + // Import shared configuration (service worker context) 5 + importScripts("config.js"); 6 + 7 + // Cache key prefixes from CONFIG 8 + const QUICK_SUMMARY_CACHE_PREFIX = CONFIG.CACHE.QUICK_SUMMARY; 9 + const DETAILED_SUMMARY_CACHE_PREFIX = CONFIG.CACHE.DETAILED_SUMMARY; 10 + const CONTENT_CACHE_PREFIX = CONFIG.CACHE.CONTENT; 11 + const CHAT_CACHE_PREFIX = CONFIG.CACHE.CHAT; 8 12 9 - // ── Prompt templates ───────────────────────────────────────── 10 - const OLLAMA_CONTEXT_TEMPLATE = "Context:\n${context}\n\nUser: ${userMessage}"; 11 - const OLLAMA_SINGLE_MESSAGE_TEMPLATE = "${userMessage}"; 13 + // ── Prompt templates from CONFIG ───────────────────────────────────────── 14 + const OLLAMA_CONTEXT_TEMPLATE = CONFIG.OLLAMA.CONTEXT_TEMPLATE; 15 + const OLLAMA_SINGLE_MESSAGE_TEMPLATE = CONFIG.OLLAMA.SINGLE_MESSAGE_TEMPLATE; 12 16 13 17 chrome.runtime.onInstalled.addListener(() => { 14 18 // Set default settings only if they don't already exist 15 19 chrome.storage.sync.get(["apiMode"]).then((result) => { 16 20 if (!result.apiMode) { 17 - // Settings don't exist yet, set defaults 21 + // Settings don't exist yet, set defaults from CONFIG 18 22 chrome.storage.sync.set({ 19 - apiMode: "ollama", 20 - apiBaseUrl: "http://localhost:11434", 21 - model: "gpt-oss:20b-cloud", 22 - apiKey: "", 23 - disableThinking: false, 23 + apiMode: CONFIG.API.MODE, 24 + apiBaseUrl: CONFIG.API.BASE_URL, 25 + model: CONFIG.API.MODEL, 26 + apiKey: CONFIG.API.KEY, 27 + disableThinking: CONFIG.API.DISABLE_THINKING, 24 28 }); 25 29 } 26 30 }); ··· 52 56 QUICK_SUMMARY_CACHE_PREFIX + tabId, 53 57 
DETAILED_SUMMARY_CACHE_PREFIX + tabId, 54 58 CONTENT_CACHE_PREFIX + tabId, 55 - CHAT_CACHE_PREFIX + tabId 59 + CHAT_CACHE_PREFIX + tabId, 60 + CONFIG.CACHE.SUGGESTIONS + tabId, 56 61 ]); 57 62 } catch (e) { 58 63 console.error("[WebAI] Error clearing cache:", e); ··· 84 89 85 90 // Firefox: Create a popup window 86 91 // Chrome: Use action.openPopup() for toolbar popup 87 - if (typeof browser !== 'undefined') { 92 + if (typeof browser !== "undefined") { 88 93 // Firefox: Create a popup window matching the UI size (extra height for browser chrome) 89 94 chrome.windows.create({ 90 - url: chrome.runtime.getURL('popup/popup.html'), 91 - type: 'popup', 95 + url: chrome.runtime.getURL("popup/popup.html"), 96 + type: "popup", 92 97 width: 400, 93 98 height: 600, 94 - focused: true 99 + focused: true, 95 100 }); 96 101 } else { 97 102 // Chrome: Programmatically open the popup ··· 134 139 135 140 if (request.action === "streamChat") { 136 141 const { tabId } = request; 137 - handleStreamChatRequest(request.data, tabId) 138 - .catch((error) => { 139 - console.error("Stream chat error:", error); 140 - chrome.tabs.sendMessage(tabId, { 141 - action: "streamDone", 142 - error: error.message, 143 - }); 142 + handleStreamChatRequest(request.data, tabId).catch((error) => { 143 + console.error("Stream chat error:", error); 144 + chrome.tabs.sendMessage(tabId, { 145 + action: "streamDone", 146 + error: error.message, 144 147 }); 148 + }); 145 149 return false; // We handle the response ourselves via sendMessage to tab 146 150 } 147 151 ··· 202 206 } 203 207 204 208 async function handleChatRequest(data) { 205 - const { apiBaseUrl, model, apiKey, messages, apiMode, disableThinking } = data; 209 + const { apiBaseUrl, model, apiKey, messages, apiMode, disableThinking } = 210 + data; 206 211 207 212 let useNativeOllama = apiMode === "ollama"; 208 213 ··· 214 219 } 215 220 216 221 async function handleStreamChatRequest(data, tabId) { 217 - const { apiBaseUrl, model, apiKey, messages, 
apiMode, disableThinking } = data; 222 + const { apiBaseUrl, model, apiKey, messages, apiMode, disableThinking } = 223 + data; 218 224 219 225 let useNativeOllama = apiMode === "ollama"; 220 226 221 227 if (useNativeOllama) { 222 - await callOllamaNativeStream(apiBaseUrl, model, messages, disableThinking, tabId); 228 + await callOllamaNativeStream( 229 + apiBaseUrl, 230 + model, 231 + messages, 232 + disableThinking, 233 + tabId, 234 + ); 223 235 } else { 224 - await callOpenAICompatibleStream(apiBaseUrl, model, apiKey, messages, tabId); 236 + await callOpenAICompatibleStream( 237 + apiBaseUrl, 238 + model, 239 + apiKey, 240 + messages, 241 + tabId, 242 + ); 225 243 } 226 244 } 227 245 ··· 239 257 .slice(0, -1) 240 258 .map((m) => `${m.role}: ${m.content}`) 241 259 .join("\n"); 242 - prompt = OLLAMA_CONTEXT_TEMPLATE 243 - .replace("${context}", context) 244 - .replace("${userMessage}", lastUserMsg?.content || ""); 260 + prompt = OLLAMA_CONTEXT_TEMPLATE.replace("${context}", context).replace( 261 + "${userMessage}", 262 + lastUserMsg?.content || "", 263 + ); 245 264 } else { 246 265 prompt = lastUserMsg?.content || ""; 247 266 } ··· 255 274 system: systemContent, 256 275 stream: false, 257 276 options: { 258 - temperature: 0.7, 259 - num_predict: 2048, 277 + temperature: CONFIG.API.TEMPERATURE, 278 + num_predict: CONFIG.API.MAX_TOKENS, 260 279 }, 261 280 }; 262 281 ··· 325 344 model: model, 326 345 messages: messages, 327 346 stream: false, 328 - max_tokens: 2048, 347 + max_tokens: CONFIG.API.MAX_TOKENS, 329 348 }), 330 349 }); 331 350 ··· 359 378 return await response.json(); 360 379 } 361 380 362 - async function callOllamaNativeStream(baseUrl, model, messages, disableThinking, tabId) { 381 + async function callOllamaNativeStream( 382 + baseUrl, 383 + model, 384 + messages, 385 + disableThinking, 386 + tabId, 387 + ) { 363 388 const systemMsgs = messages.filter((m) => m.role === "system"); 364 389 const systemContent = systemMsgs.map((m) => 
m.content).join("\n\n"); 365 390 const otherMessages = messages.filter((m) => m.role !== "system"); ··· 371 396 .slice(0, -1) 372 397 .map((m) => `${m.role}: ${m.content}`) 373 398 .join("\n"); 374 - prompt = OLLAMA_CONTEXT_TEMPLATE 375 - .replace("${context}", context) 376 - .replace("${userMessage}", lastUserMsg?.content || ""); 399 + prompt = OLLAMA_CONTEXT_TEMPLATE.replace("${context}", context).replace( 400 + "${userMessage}", 401 + lastUserMsg?.content || "", 402 + ); 377 403 } else { 378 404 prompt = lastUserMsg?.content || ""; 379 405 } ··· 386 412 system: systemContent, 387 413 stream: true, 388 414 options: { 389 - temperature: 0.7, 390 - num_predict: 2048, 415 + temperature: CONFIG.API.TEMPERATURE, 416 + num_predict: CONFIG.API.MAX_TOKENS, 391 417 }, 392 418 }; 393 419 ··· 408 434 const text = await response.text(); 409 435 let errorMsg = `HTTP ${response.status}`; 410 436 if (response.status === 403) { 411 - errorMsg = "403 Forbidden. Ollama is rejecting the request origin. Fix: restart Ollama with OLLAMA_ORIGINS=* (e.g. OLLAMA_ORIGINS=* ollama serve)."; 437 + errorMsg = 438 + "403 Forbidden. Ollama is rejecting the request origin. Fix: restart Ollama with OLLAMA_ORIGINS=* (e.g. 
OLLAMA_ORIGINS=* ollama serve)."; 412 439 } else { 413 440 try { 414 441 const err = JSON.parse(text); ··· 437 464 try { 438 465 const json = JSON.parse(line); 439 466 if (json.response) { 440 - chrome.runtime.sendMessage({ 441 - action: "streamChunk", 442 - chunk: json.response, 443 - done: false, 444 - }).catch(() => {}); 467 + chrome.runtime 468 + .sendMessage({ 469 + action: "streamChunk", 470 + chunk: json.response, 471 + done: false, 472 + }) 473 + .catch(() => {}); 445 474 } 446 475 } catch (e) { 447 476 // Skip invalid JSON lines ··· 451 480 } 452 481 453 482 // Streaming complete - send done message 454 - chrome.runtime.sendMessage({ 455 - action: "streamDone", 456 - }).catch(() => {}); 457 - 483 + chrome.runtime 484 + .sendMessage({ 485 + action: "streamDone", 486 + }) 487 + .catch(() => {}); 458 488 } catch (error) { 459 - chrome.runtime.sendMessage({ 460 - action: "streamDone", 461 - error: error.message, 462 - }).catch(() => {}); 489 + chrome.runtime 490 + .sendMessage({ 491 + action: "streamDone", 492 + error: error.message, 493 + }) 494 + .catch(() => {}); 463 495 } 464 496 } 465 497 466 - async function callOpenAICompatibleStream(baseUrl, model, apiKey, messages, tabId) { 498 + async function callOpenAICompatibleStream( 499 + baseUrl, 500 + model, 501 + apiKey, 502 + messages, 503 + tabId, 504 + ) { 467 505 let url = baseUrl.replace(/\/$/, ""); 468 506 469 507 if (!url.includes("/v1")) { ··· 483 521 model: model, 484 522 messages: messages, 485 523 stream: true, 486 - max_tokens: 2048, 524 + max_tokens: CONFIG.API.MAX_TOKENS, 487 525 }), 488 526 }); 489 527 ··· 493 531 494 532 if (response.status === 403) { 495 533 if (url.includes("/v1")) { 496 - errorMsg = "403 Forbidden. This often means: invalid API key, API key lacks permissions, or the server rejected the request origin."; 534 + errorMsg = 535 + "403 Forbidden. 
This often means: invalid API key, API key lacks permissions, or the server rejected the request origin."; 497 536 } else { 498 - errorMsg = "403 Forbidden. If using Ollama, ensure it's running with: ollama serve"; 537 + errorMsg = 538 + "403 Forbidden. If using Ollama, ensure it's running with: ollama serve"; 499 539 } 500 540 } else if (response.status === 405) { 501 - errorMsg = "405 Method not allowed. Check if the API URL is correct for your API mode (Native vs OpenAI-compatible)."; 541 + errorMsg = 542 + "405 Method not allowed. Check if the API URL is correct for your API mode (Native vs OpenAI-compatible)."; 502 543 } else { 503 544 try { 504 545 const err = JSON.parse(text); ··· 530 571 const json = JSON.parse(data); 531 572 const content = json.choices?.[0]?.delta?.content; 532 573 if (content) { 533 - chrome.runtime.sendMessage({ 534 - action: "streamChunk", 535 - chunk: content, 536 - done: false, 537 - }).catch(() => {}); 574 + chrome.runtime 575 + .sendMessage({ 576 + action: "streamChunk", 577 + chunk: content, 578 + done: false, 579 + }) 580 + .catch(() => {}); 538 581 } 539 582 } catch (e) { 540 583 // Skip invalid JSON lines ··· 544 587 } 545 588 546 589 // Streaming complete - send done message 547 - chrome.runtime.sendMessage({ 548 - action: "streamDone", 549 - }).catch(() => {}); 550 - 590 + chrome.runtime 591 + .sendMessage({ 592 + action: "streamDone", 593 + }) 594 + .catch(() => {}); 551 595 } catch (error) { 552 - chrome.runtime.sendMessage({ 553 - action: "streamDone", 554 - error: error.message, 555 - }).catch(() => {}); 596 + chrome.runtime 597 + .sendMessage({ 598 + action: "streamDone", 599 + error: error.message, 600 + }) 601 + .catch(() => {}); 556 602 } 557 603 }

+115

scripts/config.js

/**
 * Shared configuration for WebAI Summarizer.
 *
 * Centralizes all defaults, constants, and settings so that every script
 * reads the same values instead of keeping its own copy.
 *
 * How it is loaded:
 *   - The background service worker pulls it in with `importScripts("config.js")`.
 *   - Other scripts read the `CONFIG` binding when it is present in their scope
 *     (content.js guards with `typeof CONFIG !== "undefined"`).
 *   - Under CommonJS (tests) it is exported via `module.exports`.
 */

const CONFIG = {
  // Storage-key prefixes for per-tab caches. background.js builds each key
  // as `<prefix> + tabId`, so these must be identical in every file that
  // reads or clears the cache.
  CACHE: {
    QUICK_SUMMARY: "quick_summary_cache_",
    DETAILED_SUMMARY: "detailed_summary_cache_",
    CONTENT: "content_cache_",
    CHAT: "chat_cache_",
    SUGGESTIONS: "suggestions_cache_",
  },

  // Default API settings. MODE, BASE_URL, MODEL, KEY and DISABLE_THINKING are
  // written to chrome.storage.sync on install when no apiMode is stored yet.
  API: {
    MODE: "ollama", // "ollama" or "openai"
    BASE_URL: "http://localhost:11434",
    MODEL: "gpt-oss:20b-cloud",
    KEY: "",
    DISABLE_THINKING: false,
    // Sent as `num_predict` (native Ollama) and `max_tokens` (OpenAI-compatible).
    MAX_TOKENS: 2048,
    // Sent as the Ollama `options.temperature`.
    TEMPERATURE: 0.7,
    TIMEOUT_MS: 30000,
  },

  // Content extraction settings
  EXTRACTION: {
    // Upper bound, in characters, on the text the content script extracts;
    // longer text is truncated and flagged as such.
    MAX_LENGTH: 50000,
    // CSS selectors, in priority order, for locating article text.
    FALLBACK_SELECTORS: [
      "article p",
      "article div",
      ".content p",
      ".content div",
      ".post-content p",
      ".entry-content p",
      ".article-body p",
      "main p",
      "main div",
      '[role="main"] p',
      ".story p",
      ".story-body p",
      "#story p",
    ],
  },

  // UI Themes
  THEMES: {
    OPTIONS: ["light", "dark", "system"],
    DEFAULT: "system",
  },

  // Accent colors. DEFAULT_COLOR is the hex value of PRESETS[DEFAULT_PRESET];
  // keep the two in sync when changing either.
  ACCENTS: {
    DEFAULT_PRESET: "orange",
    DEFAULT_COLOR: "#F15B2F",
    PRESETS: {
      orange: "#F15B2F",
      blue: "#2F80ED",
      green: "#2FA36B",
      purple: "#7E57C2",
      teal: "#14B8A6",
      pink: "#EC4899",
      indigo: "#4F46E5",
    },
  },

  // Default suggestions. DEFAULT is also the first entry of LIST.
  SUGGESTIONS: {
    DEFAULT: "What are the main points?",
    LIST: [
      "What are the main points?",
      "Summarize in 3 bullet points",
      "Explain like I'm 5",
      "What are the key takeaways?",
    ],
  },

  // Prompt templates. These strings are sent to the model verbatim, so
  // wording and whitespace inside them are part of the program's behavior.
  PROMPTS: {
    SYSTEM_SUMMARIZER:
      "You are a helpful assistant that summarizes webpages concisely.",
    SYSTEM_CHAT:
      "You are a helpful assistant answering questions about a webpage. Use the provided page content and summary to give accurate, concise answers. You may use short sentences, sections, and bullet points to answer. Avoid long paragraphs and tables. ONLY answer based on the provided page content and summary, not any external knowledge or information.",
    QUICK_SUMMARY: `Please provide a "Quick Summary" of this webpage. Focus on the main points and key takeaways. Use markdown formatting (headings, bullet points, etc.).

The "Quick Summary" should be 3-5 **short** one-sentence bullet points. Each of these bullet points should have key points/takeaways **bolded** so people can quickly scan.`,
    SUGGESTIONS: `Based on the summary provided, generate 2 natural follow-up questions that a reader might want to ask (besides "Why would this be worth reading?"). Keep questions short (5-8 words), like these examples:
- What are some key quotes?
- Explain this simply

Return only the 2 questions, one per line, no numbering or bullet points.`,
    // Length hint reads "5-8 words", matching SUGGESTIONS above.
    CHAT_SUGGESTIONS: `Based on this chat response, generate 2 natural follow-up questions the reader might want to ask next. Keep questions short (5-8 words). Make them specific to what was just discussed. Return only the 2 questions, one per line, no numbering or bullet points.`,
  },

  // Ollama template strings. These are ordinary string literals, NOT template
  // literals: "${context}" and "${userMessage}" are literal placeholder tokens
  // that background.js substitutes with String.prototype.replace.
  OLLAMA: {
    CONTEXT_TEMPLATE: "Context:\n${context}\n\nUser: ${userMessage}",
    SINGLE_MESSAGE_TEMPLATE: "${userMessage}",
  },

  // Extension metadata
  EXTENSION: {
    NAME: "WebAI Summarizer",
    VERSION: "1.0.0",
    SHORTCUT: "Ctrl+Shift+U",
    SHORTCUT_MAC: "Command+Shift+U",
  },
};

// Make available for both Chrome extension contexts and tests.
// Extension scripts use the `CONFIG` binding directly; `module` only exists
// under CommonJS, hence the guard.
if (typeof module !== "undefined" && module.exports) {
  module.exports = CONFIG;
}

+135 -319

scripts/content.js

··· 1 - // Content script - extracts text from webpage 1 + // Content script - extracts article content using Mozilla Readability 2 2 3 - (function() { 4 - 'use strict'; 3 + (function () { 4 + "use strict"; 5 5 6 + // Prevent multiple injections 6 7 if (window.__webaiExtractorInstalled) { 7 8 return; 8 9 } 9 10 window.__webaiExtractorInstalled = true; 10 - 11 - // Tags to extract text from - be more inclusive 12 - const CONTENT_TAGS = [ 13 - 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 14 - 'article', 'section', 'main', 'div', 'span', 15 - 'li', 'td', 'th', 'blockquote', 16 - 'code', 'pre', 'figcaption', 'figure', 17 - 'strong', 'b', 'em', 'i', 'a' 18 - ]; 19 - 20 - // Tags to exclude 21 - const EXCLUDE_TAGS = [ 22 - 'script', 'style', 'noscript', 'iframe', 23 - 'nav', 'aside', 'form', 'button', 'input' 24 - ]; 25 - 26 - // Maximum characters to extract (increased to capture more content) 27 - const MAX_EXTRACTION_LENGTH = 50000; 28 - 29 - function extractText() { 30 - // Extract text with structure 31 - let extractedText = ''; 32 - let wasTruncated = false; 33 - 34 - // Get title 35 - const title = document.title || ''; 36 - if (title) { 37 - extractedText += `Title: ${title}\n\n`; 11 + 12 + // Import config if available (in extension context) 13 + const MAX_LENGTH = 14 + typeof CONFIG !== "undefined" && CONFIG.EXTRACTION?.MAX_LENGTH 15 + ? 
CONFIG.EXTRACTION.MAX_LENGTH 16 + : 50000; 17 + 18 + function extractWithReadability() { 19 + // Clone the document so we don't modify the real page 20 + const documentClone = document.cloneNode(true); 21 + 22 + // Create Readability instance 23 + const reader = new Readability(documentClone); 24 + 25 + // Parse the article 26 + const article = reader.parse(); 27 + 28 + if (!article) { 29 + return { text: "", wasTruncated: false }; 38 30 } 39 - 40 - // Get meta description 41 - const metaDesc = document.querySelector('meta[name="description"]'); 42 - if (metaDesc) { 43 - extractedText += `Description: ${metaDesc.getAttribute('content')}\n\n`; 31 + 32 + // Build the extracted text with metadata 33 + let extractedText = ""; 34 + 35 + if (article.title) { 36 + extractedText += `Title: ${article.title}\n\n`; 44 37 } 45 - 46 - // Extract content from body, skipping noise elements 47 - extractedText += extractTextFromElement(document.body); 48 - 49 - // Check if content exceeds limit before cleaning 50 - const originalLength = extractedText.length; 51 - 52 - // Clean up the text (without truncating yet) 53 - extractedText = cleanText(extractedText, false); 54 - 55 - // Track if we need to truncate 56 - if (extractedText.length > MAX_EXTRACTION_LENGTH) { 57 - wasTruncated = true; 58 - extractedText = extractedText.substring(0, MAX_EXTRACTION_LENGTH); 38 + 39 + if (article.byline) { 40 + extractedText += `Author: ${article.byline}\n\n`; 59 41 } 60 - 61 - // Fallback: if we got very little content, try brute force extraction 62 - if (extractedText.length < 1000) { 63 - const fallbackResult = extractTextFallback(); 64 - const fallbackText = fallbackResult.text; 65 - if (fallbackText.length > extractedText.length) { 66 - extractedText = `Title: ${title}\n\n${fallbackText}`; 67 - wasTruncated = fallbackResult.wasTruncated; 68 - } 42 + 43 + if (article.excerpt && article.excerpt !== article.title) { 44 + extractedText += `Description: ${article.excerpt}\n\n`; 69 45 } 70 - 71 - 
return { text: extractedText, wasTruncated }; 72 - } 73 - 74 - function extractTextFallback() { 75 - // Brute force: get all paragraphs and divs with text content 76 - const selectors = [ 77 - 'article p', 'article div', '.content p', '.content div', 78 - '.post-content p', '.entry-content p', '.article-body p', 79 - 'main p', 'main div', '[role="main"] p', 80 - '.story p', '.story-body p', '#story p' 81 - ]; 82 - 83 - let text = ''; 84 - let wasTruncated = false; 85 - const seen = new Set(); 86 - 87 - for (const selector of selectors) { 88 - try { 89 - const elements = document.querySelectorAll(selector); 90 - for (const el of elements) { 91 - const content = el.textContent.trim(); 92 - // Skip if too short or already seen 93 - if (content.length < 20 || seen.has(content.substring(0, 100))) continue; 94 - 95 - // Check if visible 96 - const style = window.getComputedStyle(el); 97 - if (style.display === 'none' || style.visibility === 'hidden') continue; 98 - 99 - seen.add(content.substring(0, 100)); 100 - text += content + '\n\n'; 101 - 102 - // Check if we're approaching the limit 103 - if (text.length > MAX_EXTRACTION_LENGTH) { 104 - wasTruncated = true; 105 - break; 106 - } 107 - } 108 - } catch (e) { 109 - // Ignore invalid selectors 110 - } 111 - if (wasTruncated) break; 46 + 47 + if (article.publishedTime) { 48 + extractedText += `Published: ${article.publishedTime}\n\n`; 112 49 } 113 - 114 - // Last resort: get all paragraphs on the page 115 - if (text.length < 500 && !wasTruncated) { 116 - const allParagraphs = document.querySelectorAll('p'); 117 - for (const p of allParagraphs) { 118 - const content = p.textContent.trim(); 119 - if (content.length > 30 && !seen.has(content.substring(0, 100))) { 120 - const style = window.getComputedStyle(p); 121 - if (style.display === 'none' || style.visibility === 'hidden') continue; 122 - 123 - seen.add(content.substring(0, 100)); 124 - text += content + '\n\n'; 125 - 126 - if (text.length > MAX_EXTRACTION_LENGTH) { 
127 - wasTruncated = true; 128 - break; 129 - } 130 - } 131 - } 50 + 51 + if (article.siteName) { 52 + extractedText += `Source: ${article.siteName}\n\n`; 132 53 } 133 - 134 - return { text: text.substring(0, MAX_EXTRACTION_LENGTH), wasTruncated }; 135 - } 136 54 137 - function shouldSkipElement(el) { 138 - const tag = el.tagName.toLowerCase(); 139 - if (EXCLUDE_TAGS.includes(tag)) { 140 - return true; 55 + // Add separator before content 56 + if (extractedText) { 57 + extractedText += "---\n\n"; 141 58 } 142 59 143 - // Skip hidden elements using live computed style 144 - try { 145 - const style = window.getComputedStyle(el); 146 - if (style.display === 'none' || style.visibility === 'hidden' || style.opacity === '0') { 147 - return true; 148 - } 149 - } catch (e) { 150 - // Computed style might fail for some elements 151 - } 60 + // Get text content (strip HTML) 61 + let content = article.textContent || ""; 152 62 153 - // Skip common noise elements by role/class/id (but protect main content) 154 - if (isMainContent(el)) return false; 63 + // Clean up the text 64 + content = content 65 + .replace(/[^\S\n]+/g, " ") // Collapse spaces/tabs but preserve newlines 66 + .replace(/\n{3,}/g, "\n\n") // Collapse 3+ newlines to 2 67 + .replace(/^\s+|\s+$/g, ""); // Trim 155 68 156 - const role = el.getAttribute('role'); 157 - if (role === 'navigation' || role === 'banner' || role === 'complementary') { 158 - return true; 159 - } 69 + extractedText += content; 160 70 161 - // Only skip if element is clearly a nav/footer/header, not if it just contains the word 162 - // Safely get className and id (they can be objects for SVG elements) 163 - let className = ''; 164 - let id = ''; 165 - 166 - if (el.className) { 167 - if (typeof el.className === 'string') { 168 - className = el.className; 169 - } else if (el.className.baseVal) { 170 - className = el.className.baseVal; 171 - } 172 - } 173 - 174 - if (el.id) { 175 - if (typeof el.id === 'string') { 176 - id = el.id; 177 - } 
else if (el.id.baseVal) { 178 - id = el.id.baseVal; 179 - } 180 - } 181 - 182 - const classAndId = (className + ' ' + id).toLowerCase(); 183 - const strictNoisePatterns = [ 184 - /^nav$/, /-nav$/, /^nav-/, /^navigation$/, 185 - /^footer$/, /-footer$/, /^footer-/, 186 - /^header$/, /^site-header$/, /^page-header$/, 187 - /^sidebar$/, /^advertisement$/, /^ad-container$/ 188 - ]; 189 - if (strictNoisePatterns.some(p => p.test(classAndId.trim()))) { 190 - return true; 71 + // Check if we need to truncate 72 + let wasTruncated = false; 73 + if (extractedText.length > MAX_LENGTH) { 74 + wasTruncated = true; 75 + extractedText = extractedText.substring(0, MAX_LENGTH); 191 76 } 192 77 193 - return false; 78 + return { text: extractedText, wasTruncated }; 194 79 } 195 - 196 - function isMainContent(element) { 197 - // Check if element is likely main content 198 - const role = element.getAttribute('role'); 199 - const tagName = element.tagName.toLowerCase(); 200 - 201 - // Safely get className and id (they can be objects for SVG elements) 202 - let className = ''; 203 - let id = ''; 204 - 205 - if (element.className) { 206 - if (typeof element.className === 'string') { 207 - className = element.className.toLowerCase(); 208 - } else if (element.className.baseVal) { 209 - // SVGAnimatedString case 210 - className = element.className.baseVal.toLowerCase(); 211 - } 80 + 81 + // Fallback extraction for pages where Readability fails 82 + function extractFallback() { 83 + let text = ""; 84 + let wasTruncated = false; 85 + 86 + // Get title 87 + const title = document.title || ""; 88 + if (title) { 89 + text += `Title: ${title}\n\n`; 212 90 } 213 - 214 - if (element.id) { 215 - if (typeof element.id === 'string') { 216 - id = element.id.toLowerCase(); 217 - } else if (element.id.baseVal) { 218 - id = element.id.baseVal.toLowerCase(); 91 + 92 + // Get meta description 93 + const metaDesc = document.querySelector('meta[name="description"]'); 94 + if (metaDesc) { 95 + const content = 
metaDesc.getAttribute("content"); 96 + if (content) { 97 + text += `Description: ${content}\n\n---\n\n`; 219 98 } 220 99 } 221 - 222 - // Common content container patterns 223 - const contentPatterns = [ 224 - 'content', 'main-content', 'article-content', 'post-content', 225 - 'entry-content', 'page-content', 'story-content', 'body-content', 226 - 'article', 'post', 'entry', 'story', 'main' 227 - ]; 228 - 229 - const isContentClass = contentPatterns.some(p => 230 - className.includes(p) || id.includes(p) 231 - ); 232 - 233 - return role === 'main' || 234 - role === 'article' || 235 - tagName === 'main' || 236 - tagName === 'article' || 237 - isContentClass; 238 - } 239 - 240 - function extractTextFromElement(element, depth = 0) { 241 - let text = ''; 242 - const indent = ' '.repeat(depth); 243 - const elementTag = element.tagName.toLowerCase(); 244 100 245 - // Get direct text content of this element (if any) 246 - const directText = getDirectTextContent(element).trim(); 247 - if (directText.length > 20 && depth > 0) { 248 - // This element has meaningful direct text 249 - text += directText + '\n\n'; 250 - } 251 - 252 - for (const child of element.children) { 253 - const childTag = child.tagName.toLowerCase(); 254 - 255 - // Skip unwanted elements 256 - if (shouldSkipElement(child)) continue; 101 + // Fallback: get all paragraphs 102 + const paragraphs = document.querySelectorAll("p"); 103 + const seen = new Set(); 257 104 258 - // Handle headings with emphasis 259 - if (/^h[1-6]$/.test(childTag)) { 260 - const headingText = getTextContent(child).trim(); 261 - if (headingText) { 262 - const prefix = '#'.repeat(parseInt(childTag[1])); 263 - text += `\n${prefix} ${headingText}\n\n`; 264 - } 105 + for (const p of paragraphs) { 106 + // Skip hidden elements 107 + const style = window.getComputedStyle(p); 108 + if (style.display === "none" || style.visibility === "hidden") { 109 + continue; 265 110 } 266 - // Handle paragraphs 267 - else if (childTag === 'p') { 268 - 
const pText = getTextContent(child).trim(); 269 - if (pText.length > 5) { 270 - text += `${pText}\n\n`; 271 - } 272 - } 273 - // Handle lists 274 - else if (childTag === 'li') { 275 - const liText = getTextContent(child).trim(); 276 - if (liText) { 277 - text += `${indent}- ${liText}\n`; 278 - } 279 - } 280 - // Handle code blocks 281 - else if (childTag === 'pre' || childTag === 'code') { 282 - const codeText = getTextContent(child).trim(); 283 - if (codeText) { 284 - text += `\n\`\`\`\n${codeText}\n\`\`\`\n\n`; 285 - } 286 - } 287 - // Recursively process ALL other elements that might contain text 288 - else { 289 - const childText = extractTextFromElement(child, depth + 1); 290 - if (childText.trim()) { 291 - text += childText; 292 - } 111 + 112 + const content = p.textContent.trim(); 113 + 114 + // Skip short or duplicate paragraphs 115 + if (content.length < 30) continue; 116 + const key = content.substring(0, 100); 117 + if (seen.has(key)) continue; 118 + 119 + seen.add(key); 120 + text += content + "\n\n"; 121 + 122 + if (text.length > MAX_LENGTH) { 123 + wasTruncated = true; 124 + break; 293 125 } 294 126 } 295 - 296 - return text; 127 + 128 + return { 129 + text: text.substring(0, MAX_LENGTH), 130 + wasTruncated, 131 + }; 297 132 } 298 - 299 - function getDirectTextContent(element) { 300 - // Get only the direct text nodes of this element (not children) 301 - let text = ''; 302 - for (const node of element.childNodes) { 303 - if (node.nodeType === Node.TEXT_NODE) { 304 - text += node.textContent; 133 + 134 + function extractContent() { 135 + try { 136 + // Try Readability first 137 + const result = extractWithReadability(); 138 + 139 + // If Readability got good content, use it 140 + if (result.text.length > 500) { 141 + return result; 305 142 } 306 - } 307 - return text.trim(); 308 - } 309 - 310 - function getTextContent(element) { 311 - // Get text content but preserve some structure 312 - let text = ''; 313 - 314 - for (const node of element.childNodes) 
{ 315 - if (node.nodeType === Node.TEXT_NODE) { 316 - text += node.textContent; 317 - } else if (node.nodeType === Node.ELEMENT_NODE) { 318 - const tagName = node.tagName.toLowerCase(); 319 - 320 - // Add newlines for block elements 321 - if (['br', 'p', 'div', 'li'].includes(tagName)) { 322 - text += ' ' + getTextContent(node) + ' '; 323 - } else { 324 - text += getTextContent(node); 325 - } 143 + 144 + // Otherwise fall back to basic extraction 145 + console.log( 146 + "[WebAI] Readability extracted minimal content, trying fallback...", 147 + ); 148 + const fallback = extractFallback(); 149 + 150 + // Use whichever got more content 151 + if (fallback.text.length > result.text.length) { 152 + return fallback; 326 153 } 327 - } 328 - 329 - return text; 330 - } 331 - 332 - function cleanText(text, shouldTruncate = true) { 333 - let cleaned = text 334 - .replace(/[^\S\n]+/g, ' ') // Collapse spaces/tabs but preserve newlines 335 - .replace(/\n{3,}/g, '\n\n') // Collapse 3+ newlines to 2 336 - .replace(/^\s+|\s+$/g, ''); // Trim 337 - 338 - // Only truncate if explicitly requested (used for final output) 339 - if (shouldTruncate && cleaned.length > MAX_EXTRACTION_LENGTH) { 340 - cleaned = cleaned.substring(0, MAX_EXTRACTION_LENGTH); 154 + 155 + return result; 156 + } catch (error) { 157 + console.error("[WebAI] Readability error:", error); 158 + // On error, try fallback 159 + return extractFallback(); 341 160 } 342 - 343 - return cleaned; 344 161 } 345 - 346 - // Listen for messages from popup 162 + 163 + // Listen for messages from the extension 347 164 chrome.runtime.onMessage.addListener((request, sender, sendResponse) => { 348 - if (request.action === 'extract') { 349 - const result = extractText(); 350 - sendResponse({ 351 - content: result.text, 352 - wasTruncated: result.wasTruncated 165 + if (request.action === "extract") { 166 + const result = extractContent(); 167 + sendResponse({ 168 + content: result.text, 169 + wasTruncated: result.wasTruncated, 353 170 
}); 354 171 } 355 - return true; 172 + return true; // Keep channel open for async 356 173 }); 357 - 358 174 })();

Configure Feed

Configure Feed