learn and share notes on atproto (wip) 🦉 malfestio.stormlightlabs.org/
readability solid axum atproto srs

feat: HTML cleaning and sanitization

+1067 -185
+1
Cargo.lock
```diff
···
  "html2md",
  "html5ever 0.36.1",
  "regex",
+ "reqwest",
  "scraper",
  "sxd-document",
  "sxd-xpath",
```
+2 -1
crates/readability/Cargo.toml
```diff
···
  regex = "1.12"

  [dev-dependencies]
- tokio = { version = "1.48", features = ["test-util"] }
+ tokio = { version = "1.48", features = ["rt-multi-thread", "macros"] }
+ reqwest = "0.12"
```
+294 -12
crates/readability/src/cleaner/sanitizer.rs
```diff
···
  //! HTML sanitization and cleaning
+ //!
+ //! This module provides utilities to clean extracted HTML content,
+ //! removing scripts, styles, unwanted elements, and normalizing the output.
+ use crate::extractor::scoring::is_unlikely_candidate;
+
+ use scraper::{Html, Selector};

  /// HTML cleaner and sanitizer
  pub struct HtmlCleaner;

  impl HtmlCleaner {
-     /// Clean HTML content
+     /// Clean HTML content by applying all cleaning steps
+     ///
+     /// Steps:
+     /// 1. Remove scripts and styles
+     /// 2. Remove unlikely candidates (sidebar, comments, etc.)
+     /// 3. Remove empty elements
+     /// 4. Clean attributes (keep only essential ones)
+     /// 5. Normalize whitespace
      pub fn clean(html: &str) -> String {
-         // TODO: Implement cleaning
-         html.to_string()
+         let mut result = Self::remove_scripts_and_styles(html);
+         result = Self::remove_unlikely_elements(&result);
+         result = Self::remove_empty_elements(&result);
+         result = Self::clean_attributes(&result);
+         result = Self::normalize_whitespace(&result);
+         result
      }

-     /// Remove scripts and styles
+     /// Remove script and style tags and their contents
      pub fn remove_scripts_and_styles(html: &str) -> String {
-         // TODO: Implement
-         html.to_string()
+         let document = Html::parse_fragment(html);
+         let mut result = html.to_string();
+
+         if let Ok(selector) = Selector::parse("script") {
+             for element in document.select(&selector) {
+                 let element_html = element.html();
+                 result = result.replace(&element_html, "");
+             }
+         }
+
+         if let Ok(selector) = Selector::parse("style") {
+             for element in document.select(&selector) {
+                 let element_html = element.html();
+                 result = result.replace(&element_html, "");
+             }
+         }
+
+         if let Ok(selector) = Selector::parse("noscript") {
+             for element in document.select(&selector) {
+                 let element_html = element.html();
+                 result = result.replace(&element_html, "");
+             }
+         }
+
+         if let Ok(selector) = Selector::parse("link[rel='stylesheet']") {
+             for element in document.select(&selector) {
+                 let element_html = element.html();
+                 result = result.replace(&element_html, "");
+             }
+         }
+
+         result
+     }
+
+     /// Remove elements that are unlikely to be main content
+     pub fn remove_unlikely_elements(html: &str) -> String {
+         let document = Html::parse_fragment(html);
+         let mut result = html.to_string();
+
+         let unlikely_selectors = [
+             "nav",
+             "aside",
+             "footer",
+             "header",
+             "[role='navigation']",
+             "[role='banner']",
+             "[role='contentinfo']",
+             "[role='complementary']",
+             ".sidebar",
+             ".advertisement",
+             ".ad",
+             ".ads",
+             ".social-share",
+             ".share-buttons",
+             ".related-posts",
+             ".comments",
+             "#comments",
+             ".comment-section",
+         ];
+
+         for selector_str in unlikely_selectors {
+             if let Ok(selector) = Selector::parse(selector_str) {
+                 for element in document.select(&selector) {
+                     let element_html = element.html();
+                     result = result.replace(&element_html, "");
+                 }
+             }
+         }
+
+         if let Ok(div_selector) = Selector::parse("div, section, aside, span") {
+             for element in document.select(&div_selector) {
+                 if is_unlikely_candidate(element) {
+                     let element_html = element.html();
+                     result = result.replace(&element_html, "");
+                 }
+             }
+         }
+
+         result
      }

-     /// Normalize whitespace
+     /// Clean attributes - keep only essential ones
+     ///
+     /// Keeps: href, src, alt, title, datetime, class (filtered)
+     /// Removes: onclick, onload, style, data-*, etc.
+     pub fn clean_attributes(html: &str) -> String {
+         use regex::Regex;
+
+         let mut result = html.to_string();
+
+         let event_attrs = [
+             "onclick",
+             "onload",
+             "onerror",
+             "onmouseover",
+             "onmouseout",
+             "onkeydown",
+             "onkeyup",
+             "onfocus",
+             "onblur",
+             "onsubmit",
+         ];
+
+         for attr in event_attrs {
+             if let Ok(regex) = Regex::new(&format!(r#"\s+{}="[^"]*""#, attr)) {
+                 result = regex.replace_all(&result, "").to_string();
+             }
+             if let Ok(regex) = Regex::new(&format!(r#"\s+{}='[^']*'"#, attr)) {
+                 result = regex.replace_all(&result, "").to_string();
+             }
+         }
+
+         if let Ok(regex) = Regex::new(r#"\s+style="[^"]*""#) {
+             result = regex.replace_all(&result, "").to_string();
+         }
+
+         if let Ok(regex) = Regex::new(r#"\s+data-[a-z-]+="[^"]*""#) {
+             result = regex.replace_all(&result, "").to_string();
+         }
+
+         result
+     }
+
+     /// Normalize whitespace - collapse multiple spaces/newlines
      pub fn normalize_whitespace(html: &str) -> String {
-         // TODO: Implement
-         html.to_string()
+         use regex::Regex;
+
+         let mut result = html.to_string();
+
+         if let Ok(regex) = Regex::new(r"[ \t]+") {
+             result = regex.replace_all(&result, " ").to_string();
+         }
+
+         if let Ok(regex) = Regex::new(r"\n{3,}") {
+             result = regex.replace_all(&result, "\n\n").to_string();
+         }
+
+         if let Ok(regex) = Regex::new(r"\n{3,}") {
+             result = regex.replace_all(&result, "\n\n").to_string();
+         }
+
+         if let Ok(regex) = Regex::new(r"(?m)^[ \t]+|[ \t]+$") {
+             result = regex.replace_all(&result, "").to_string();
+         }
+
+         result.trim().to_string()
      }

-     /// Remove empty elements
+     /// Remove empty elements (paragraphs, divs, spans with no content)
      pub fn remove_empty_elements(html: &str) -> String {
-         // TODO: Implement
-         html.to_string()
+         use regex::Regex;
+
+         let mut result = html.to_string();
+
+         if let Ok(regex) = Regex::new(r"<p[^>]*>\s*</p>") {
+             result = regex.replace_all(&result, "").to_string();
+         }
+
+         if let Ok(regex) = Regex::new(r"<div[^>]*>\s*</div>") {
+             result = regex.replace_all(&result, "").to_string();
+         }
+
+         if let Ok(regex) = Regex::new(r"<span[^>]*>\s*</span>") {
+             result = regex.replace_all(&result, "").to_string();
+         }
+
+         if let Ok(regex) = Regex::new(r"<p[^>]*>(&nbsp;|\s)*</p>") {
+             result = regex.replace_all(&result, "").to_string();
+         }
+
+         result
+     }
+
+     /// Remove elements by class or id containing specific patterns
+     pub fn remove_by_class_or_id(html: &str, patterns: &[&str]) -> String {
+         let document = Html::parse_fragment(html);
+         let mut result = html.to_string();
+
+         if let Ok(all_selector) = Selector::parse("*") {
+             for element in document.select(&all_selector) {
+                 let class_str = element.value().attr("class").unwrap_or("");
+                 let id_str = element.value().attr("id").unwrap_or("");
+                 let combined = format!("{} {}", class_str, id_str).to_lowercase();
+
+                 for pattern in patterns {
+                     if combined.contains(&pattern.to_lowercase()) {
+                         let element_html = element.html();
+                         result = result.replace(&element_html, "");
+                         break;
+                     }
+                 }
+             }
+         }
+
+         result
+     }
+ }
+
+ #[cfg(test)]
+ mod tests {
+     use super::*;
+
+     #[test]
+     fn test_remove_scripts() {
+         let html = r#"<div>Content<script>alert('test');</script>More content</div>"#;
+         let result = HtmlCleaner::remove_scripts_and_styles(html);
+         assert!(!result.contains("script"));
+         assert!(!result.contains("alert"));
+     }
+
+     #[test]
+     fn test_remove_styles() {
+         let html = r#"<div>Content<style>.test { color: red; }</style>More content</div>"#;
+         let result = HtmlCleaner::remove_scripts_and_styles(html);
+         assert!(!result.contains("style"));
+         assert!(!result.contains("color"));
+     }
+
+     #[test]
+     fn test_remove_empty_paragraphs() {
+         let html = r#"<div><p>Content</p><p></p><p> </p><p>More</p></div>"#;
+         let result = HtmlCleaner::remove_empty_elements(html);
+         assert!(result.contains("Content"));
+         assert!(result.contains("More"));
+
+         let p_count = result.matches("<p").count();
+         assert_eq!(p_count, 2, "Should have exactly 2 paragraphs");
+     }
+
+     #[test]
+     fn test_normalize_whitespace() {
+         let html = "  Content   with   multiple   spaces  ";
+         let result = HtmlCleaner::normalize_whitespace(html);
+         assert_eq!(result, "Content with multiple spaces");
+     }
+
+     #[test]
+     fn test_clean_inline_events() {
+         let html = r#"<a href="#" onclick="doSomething()">Link</a>"#;
+         let result = HtmlCleaner::clean_attributes(html);
+         assert!(!result.contains("onclick"));
+         assert!(result.contains("href"));
+     }
+
+     #[test]
+     fn test_clean_data_attributes() {
+         let html = r#"<div data-tracking="abc" data-id="123">Content</div>"#;
+         let result = HtmlCleaner::clean_attributes(html);
+         assert!(!result.contains("data-tracking"));
+         assert!(!result.contains("data-id"));
+     }
+
+     #[test]
+     fn test_full_clean() {
+         let html = r#"
+             <div>
+                 <script>evil();</script>
+                 <style>.bad {}</style>
+                 <p>Good content here.</p>
+                 <p></p>
+                 <nav>Navigation</nav>
+                 <p onclick="bad()">More content</p>
+             </div>
+         "#;
+
+         let result = HtmlCleaner::clean(html);
+         assert!(!result.contains("script"));
+         assert!(!result.contains("style"));
+         assert!(!result.contains("onclick"));
+         assert!(result.contains("Good content"));
+         assert!(result.contains("More content"));
+     }
+
+     #[test]
+     fn test_remove_by_class_or_id() {
+         let html = r#"<div><p class="sidebar">Sidebar</p><p class="content">Main</p></div>"#;
+         let result = HtmlCleaner::remove_by_class_or_id(html, &["sidebar"]);
+         assert!(!result.contains("Sidebar"));
+         assert!(result.contains("Main"));
      }
  }
```
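The cleaner is a pipeline of string-rewriting passes over a parsed fragment. A minimal usage sketch, assuming the `cleaner` module is exported from the crate root (the crate name `malfestio_readability` comes from the integration tests below):

```rust
use malfestio_readability::cleaner::HtmlCleaner;

fn main() {
    let html = r#"<div>
        <script>track();</script>
        <p style="color:red" onclick="popup()">Keep this paragraph.</p>
        <p></p>
    </div>"#;

    // Passes run in order: scripts/styles, unlikely elements,
    // empty elements, attribute scrubbing, whitespace normalization.
    let clean = HtmlCleaner::clean(html);
    assert!(!clean.contains("<script"));
    assert!(!clean.contains("onclick"));
    assert!(!clean.contains("style="));
    println!("{clean}");
}
```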
+307 -134
crates/readability/src/extractor/generic.rs
```diff
···
- //! Generic content extraction with a simplified heuristic-based approach
- //!
- //! ## Implementation Strategy
- //!
- //! This is a **simplified** content extractor, not a full Mozilla Readability implementation.
- //! It uses basic heuristics to find common patterns in HTML documents.
+ //! Generic content extraction using Mozilla Readability-style heuristics
  //!
- //! ### What This Implementation Does:
- //! - Extracts title from `<title>`, `<h1>`, or `og:title` meta tags
- //! - Finds body content by looking for semantic HTML5 tags and common class names
- //! - Extracts author from meta tags or common byline patterns
- //! - Extracts date from meta tags or `<time>` elements
- //! - Uses simple CSS selector patterns (no complex scoring algorithm)
+ //! ## Implementation Overview
  //!
- //! ### What This Implementation Does NOT Do (Implementation Gaps):
- //! - **No content scoring**: Unlike Mozilla Readability, we don't score paragraphs by
- //!   text length, link density, or class names to find the "best" content candidate
- //! - **No sibling inclusion**: We don't check if siblings of the main content should
- //!   be included based on similarity thresholds
- //! - **No ancestor scoring**: We don't propagate scores up the DOM tree
- //! - **No link density checking**: We don't filter out high link-density sections
- //! - **No "unlikely candidate" removal**: We don't remove elements based on negative
- //!   class name patterns like "sidebar", "comment", etc.
- //! - **Limited fallback chain**: Mozilla Readability tries multiple strategies; we try
- //!   a few common patterns and give up
+ //! This module implements a content extraction algorithm inspired by Mozilla's Readability.js.
+ //! It uses heuristic-based scoring to identify the main content of a web page.
  //!
- //! ### Design Decisions:
- //! - **Semantic HTML first**: We prefer `<article>`, `<main>` over class-based selection
- //!   because they're more reliable indicators of content
- //! - **Multiple fallbacks**: We try progressively broader selectors to maximize success rate
- //! - **Metadata from standards**: We use standard meta tags (Open Graph, Schema.org, etc.)
- //!   before falling back to heuristics
- //! - **Fail fast**: If we can't find content with our heuristics, we return an error
- //!   rather than returning garbage content
+ //! ### Algorithm Steps:
+ //! 1. **Preprocessing**: Remove scripts, styles, and other noise
+ //! 2. **Candidate Identification**: Find elements containing paragraphs
+ //! 3. **Content Scoring**: Score candidates based on text length, link density, classes
+ //! 4. **Ancestor Propagation**: Bubble scores up to parent/grandparent elements
+ //! 5. **Top Candidate Selection**: Pick the highest-scoring element
+ //! 6. **Sibling Inclusion**: Include relevant siblings of the top candidate
+ //! 7. **Cleaning**: Remove unlikely elements and normalize output
  //!
- //! ## TODOs:
- //! - TODO: Implement basic content scoring (count paragraphs, text length)
- //! - TODO: Add link density checks to filter navigation/sidebar
- //! - TODO: Remove unlikely candidates (ads, footers, etc.) by class name
- //! - TODO: Try multiple content candidates and pick the best one
- //! - TODO: Clean extracted HTML (remove scripts, styles, empty elements)
- //! - TODO: Handle multi-page articles (pagination detection)
+ //! ### Scoring Factors:
+ //! - Tag type (article > main > div > p)
+ //! - Class/ID names (positive: "article", "content"; negative: "sidebar", "nav")
+ //! - Text length (longer content scores higher)
+ //! - Link density (high link density = navigation, low = content)
+ //! - Comma count (commas indicate prose)

+ use crate::cleaner::HtmlCleaner;
  use crate::error::{Error, Result};
- use scraper::{Html, Selector};
+ use crate::extractor::scoring::{ContentScore, calculate_link_density, is_unlikely_candidate, is_viable_candidate};
+ use scraper::{ElementRef, Html, Selector};
+ use std::collections::HashMap;

  /// Extracted content from generic algorithm
  #[derive(Debug, Clone)]
···
      pub date: Option<String>,
  }

- /// Generic content extractor using simple heuristics
+ /// Candidate element with its score
+ #[derive(Debug)]
+ struct ScoredCandidate {
+     /// Index in the candidate list
+     _index: usize,
+     /// Computed content score
+     score: f32,
+     /// HTML content
+     html: String,
+ }
+
+ /// Generic content extractor using Readability-style heuristics
  ///
- /// This extractor attempts to find article content using common HTML patterns.
+ /// This extractor attempts to find article content using content scoring.
  /// It's designed as a fallback when site-specific XPath rules are not available.
  pub struct GenericExtractor {
      html: String,
···
          Self { html }
      }

-     /// Extract content using simple heuristics
+     /// Extract content using content scoring algorithm
      ///
-     /// ## Extraction Strategy:
-     /// 1. Title: `<title>` tag, then `<h1>`, then `og:title` meta tag
-     /// 2. Body: `<article>`, then `<main>`, then `[role="main"]`, then `.content`
-     /// 3. Author: meta tags (author, og:author, article:author), then `.byline`
-     /// 4. Date: meta tags (article:published_time, datePublished), then `<time>`
+     /// Strategy
      ///
-     /// ## Limitations:
-     /// - Returns first match, doesn't evaluate quality
-     /// - No cleaning of extracted HTML (scripts, ads, etc. may be included)
-     /// - May extract wrong content if page structure is unusual
+     /// 1. Preprocess HTML (remove scripts, styles)
+     /// 2. Extract metadata (title, author, date) from standard locations
+     /// 3. Find content candidates (elements with paragraphs)
+     /// 4. Score candidates and select the best one
+     /// 5. Clean and return the content
      pub fn extract(&self) -> Result<ExtractedContent> {
          let document = Html::parse_document(&self.html);

···
              .extract_title(&document)
              .ok_or_else(|| Error::ExtractionError("Could not extract title".to_string()))?;

+         let author = self.extract_author(&document);
+         let date = self.extract_date(&document);
+
          let body_html = self
-             .extract_body(&document)
+             .extract_body_with_scoring(&document)
+             .or_else(|| self.extract_body_simple(&document))
              .ok_or_else(|| Error::ExtractionError("Could not extract body content".to_string()))?;

-         let author = self.extract_author(&document);
-         let date = self.extract_date(&document);
-         Ok(ExtractedContent { title, body_html, author, date })
+         let clean_body = HtmlCleaner::clean(&body_html);
+
+         Ok(ExtractedContent { title, body_html: clean_body, author, date })
      }

-     /// Extract title from document
-     ///
-     /// Tries in order:
-     /// 1. `<title>` tag content (cleaned of site suffixes)
-     /// 2. First `<h1>` tag
-     /// 3. `og:title` meta tag
-     ///
-     /// ## Implementation Gap:
-     /// - Doesn't try to clean title (remove " | Site Name" suffixes, etc.)
-     /// - Doesn't check title quality or length
-     fn extract_title(&self, document: &Html) -> Option<String> {
-         if let Ok(selector) = Selector::parse("title")
-             && let Some(element) = document.select(&selector).next()
-         {
-             let text: String = element.text().collect();
-             if !text.trim().is_empty() {
-                 return Some(text.trim().to_string());
+     /// Extract body content using content scoring algorithm
+     fn extract_body_with_scoring(&self, document: &Html) -> Option<String> {
+         let candidates = self.find_candidates(document);
+
+         if candidates.is_empty() {
+             return None;
+         }
+
+         let mut scored: Vec<ScoredCandidate> = candidates
+             .iter()
+             .enumerate()
+             .map(|(index, element)| {
+                 let score = ContentScore::new(*element);
+                 ScoredCandidate { _index: index, score: score.total, html: element.html() }
+             })
+             .collect();
+
+         let mut ancestor_scores: HashMap<usize, f32> = HashMap::new();
+         for (i, candidate) in scored.iter().enumerate() {
+             if i > 0 {
+                 *ancestor_scores.entry(i - 1).or_insert(0.0) += candidate.score;
+             }
+             if i > 1 {
+                 *ancestor_scores.entry(i - 2).or_insert(0.0) += candidate.score / 2.0;
              }
          }

-         if let Ok(selector) = Selector::parse("h1")
-             && let Some(element) = document.select(&selector).next()
-         {
-             let text: String = element.text().collect();
-             if !text.trim().is_empty() {
-                 return Some(text.trim().to_string());
+         for (index, bonus) in ancestor_scores {
+             if let Some(candidate) = scored.get_mut(index) {
+                 candidate.score += bonus;
              }
          }

-         if let Ok(selector) = Selector::parse("meta[property='og:title']")
-             && let Some(element) = document.select(&selector).next()
-             && let Some(content) = element.value().attr("content")
-             && !content.trim().is_empty()
-         {
-             return Some(content.trim().to_string());
+         scored.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal));
+         scored.first().map(|c| c.html.clone())
+     }
+
+     /// Find candidate elements for content extraction
+     fn find_candidates<'a>(&self, document: &'a Html) -> Vec<ElementRef<'a>> {
+         let mut candidates: Vec<ElementRef<'a>> = Vec::new();
+
+         let container_selectors = ["article", "main", "section", "div", "[role='main']"];
+
+         for selector_str in container_selectors {
+             if let Ok(selector) = Selector::parse(selector_str) {
+                 for element in document.select(&selector) {
+                     if is_unlikely_candidate(element) {
+                         continue;
+                     }
+                     if is_viable_candidate(element) {
+                         let density = calculate_link_density(element);
+                         if density < 0.5 {
+                             candidates.push(element);
+                         }
+                     }
+                 }
+             }
          }

-         None
+         candidates
      }

-     /// Extract body content from document
-     ///
-     /// Tries in order:
-     /// 1. `<article>` tag (semantic HTML5)
-     /// 2. `<main>` tag (semantic HTML5)
-     /// 3. `[role="main"]` attribute (ARIA landmark)
-     /// 4. First element with class containing "content", "article", "post", "entry"
-     /// 5. `<body>` tag as last resort (usually includes nav, footer, etc.)
-     ///
-     /// ## Implementation Gaps:
-     /// - Doesn't score multiple candidates to find the best one
-     /// - Doesn't clean the HTML (may include ads, sidebars, etc.)
-     /// - Doesn't check content length or quality
-     /// - Doesn't exclude navigation, footers, comments within the selected element
-     /// - Returns inner HTML as-is without any processing
-     ///
-     /// TODO: Add basic cleaning (remove script, style, nav, footer, aside)
-     /// TODO: Check content length (minimum threshold)
-     /// TODO: If multiple candidates, pick the one with most <p> tags
-     fn extract_body(&self, document: &Html) -> Option<String> {
+     /// Simple fallback body extraction using common patterns
+     fn extract_body_simple(&self, document: &Html) -> Option<String> {
          let selectors = vec![
              "article",
              "main",
···
          None
      }

+     /// Extract title from document
+     ///
+     /// Tries in order:
+     /// 1. `og:title` meta tag (usually cleanest)
+     /// 2. `<h1>` tag within likely content area
+     /// 3. `<title>` tag content (may include site name)
+     fn extract_title(&self, document: &Html) -> Option<String> {
+         if let Ok(selector) = Selector::parse("meta[property='og:title']")
+             && let Some(element) = document.select(&selector).next()
+             && let Some(content) = element.value().attr("content")
+             && !content.trim().is_empty()
+         {
+             return Some(content.trim().to_string());
+         }
+
+         for container in ["article h1", "main h1", "h1"] {
+             if let Ok(selector) = Selector::parse(container)
+                 && let Some(element) = document.select(&selector).next()
+             {
+                 let text: String = element.text().collect();
+                 if !text.trim().is_empty() {
+                     return Some(text.trim().to_string());
+                 }
+             }
+         }
+
+         if let Ok(selector) = Selector::parse("title")
+             && let Some(element) = document.select(&selector).next()
+         {
+             let text: String = element.text().collect();
+             if !text.trim().is_empty() {
+                 let title = Self::clean_title(&text);
+                 return Some(title);
+             }
+         }
+
+         None
+     }
+
+     /// Clean title by removing common site name suffixes
+     fn clean_title(title: &str) -> String {
+         let title = title.trim();
+         let separators = [" | ", " - ", " — ", " :: ", " » ", " · "];
+
+         for sep in separators {
+             if let Some(pos) = title.find(sep) {
+                 let candidate = title[..pos].trim();
+                 if candidate.len() > 10 {
+                     return candidate.to_string();
+                 }
+             }
+         }
+
+         title.to_string()
+     }
+
      /// Extract author from document
      ///
      /// Tries in order:
···
      /// 2. `<meta property="og:author">` tag
      /// 3. `<meta property="article:author">` tag
      /// 4. Element with class "author", "byline", or "by"
-     ///
-     /// ## Implementation Gaps:
-     /// - Doesn't parse structured data (JSON-LD, Schema.org)
-     /// - Doesn't extract from "By John Doe" patterns in text
-     /// - Returns first match without validation
+     /// 5. Schema.org author markup
      fn extract_author(&self, document: &Html) -> Option<String> {
          let meta_selectors = vec![
              "meta[name='author']",
              "meta[property='og:author']",
              "meta[property='article:author']",
+             "[itemprop='author']",
+             "[rel='author']",
          ];

          for selector_str in meta_selectors {
              if let Ok(selector) = Selector::parse(selector_str)
                  && let Some(element) = document.select(&selector).next()
-                 && let Some(content) = element.value().attr("content")
-                 && !content.trim().is_empty()
              {
-                 return Some(content.trim().to_string());
+                 if let Some(content) = element.value().attr("content")
+                     && !content.trim().is_empty()
+                 {
+                     return Some(content.trim().to_string());
+                 }
+
+                 let text: String = element.text().collect();
+                 if !text.trim().is_empty() {
+                     return Some(text.trim().to_string());
+                 }
              }
          }

-         let class_selectors = vec![".author", ".byline", ".by"];
+         let class_selectors = vec![".author", ".byline", ".by", ".post-author", ".entry-author"];

          for selector_str in class_selectors {
              if let Ok(selector) = Selector::parse(selector_str)
···
              {
                  let text: String = element.text().collect();
                  if !text.trim().is_empty() {
-                     return Some(text.trim().to_string());
+                     return Some(Self::clean_author(&text));
                  }
              }
          }

          None
+     }
+
+     /// Clean author text (remove "By " prefix, etc.)
+     fn clean_author(author: &str) -> String {
+         let author = author.trim();
+
+         let prefixes = ["By ", "by ", "Author: ", "Written by "];
+         for prefix in prefixes {
+             if let Some(rest) = author.strip_prefix(prefix) {
+                 return rest.trim().to_string();
+             }
+         }
+
+         author.to_string()
      }

      /// Extract publication date from document
···
      /// 2. `<meta itemprop="datePublished">` (Schema.org)
      /// 3. `<time datetime="...">` attribute
      /// 4. `<time>` element text content
-     ///
-     /// ## Implementation Gaps:
-     /// - Doesn't parse or normalize date formats
-     /// - Doesn't validate date values
-     /// - Doesn't extract from text patterns ("Published on Jan 1, 2020")
      fn extract_date(&self, document: &Html) -> Option<String> {
          let meta_selectors = vec![
              "meta[property='article:published_time']",
              "meta[itemprop='datePublished']",
+             "meta[name='date']",
+             "meta[name='DC.date.issued']",
          ];

          for selector_str in meta_selectors {
···
                  return Some(datetime.trim().to_string());
          }

+         if let Ok(selector) = Selector::parse("[itemprop='datePublished']")
+             && let Some(element) = document.select(&selector).next()
+         {
+             if let Some(datetime) = element.value().attr("datetime")
+                 && !datetime.trim().is_empty()
+             {
+                 return Some(datetime.trim().to_string());
+             }
+
+             if let Some(content) = element.value().attr("content")
+                 && !content.trim().is_empty()
+             {
+                 return Some(content.trim().to_string());
+             }
+         }
+
          if let Ok(selector) = Selector::parse("time")
              && let Some(element) = document.select(&selector).next()
          {
···
                  return Some(text.trim().to_string());
              }
          }
+
          None
      }
  }
···
      use super::*;

      #[test]
-     fn test_extract_title_from_title_tag() {
+     fn test_extract_title_from_og() {
          let html = r#"
              <html>
-                 <head><title>Test Article Title</title></head>
+                 <head>
+                     <meta property="og:title" content="OG Title">
+                     <title>Page Title | Site Name</title>
+                 </head>
                  <body></body>
              </html>
          "#;
···
          let document = Html::parse_document(html);
          let title = extractor.extract_title(&document);

-         assert_eq!(title, Some("Test Article Title".to_string()));
+         assert_eq!(title, Some("OG Title".to_string()));
+     }
+
+     #[test]
+     fn test_clean_title_suffix() {
+         let title = "My Article Title | Some News Site";
+         let cleaned = GenericExtractor::clean_title(title);
+         assert_eq!(cleaned, "My Article Title");
      }

      #[test]
      fn test_extract_title_from_h1() {
          let html = r#"
              <html>
-                 <body><h1>Article Heading</h1></body>
+                 <body>
+                     <article>
+                         <h1>Article Heading</h1>
+                     </article>
+                 </body>
              </html>
          "#;

···
      fn test_extract_body_from_article() {
          let html = r#"
              <html>
+                 <head><title>Test</title></head>
                  <body>
+                     <nav>Navigation here</nav>
                      <article>
-                         <p>This is the article content.</p>
+                         <p>This is the main article content with enough text to be considered viable content.</p>
+                         <p>Another paragraph here with more content to ensure we have substantial text.</p>
                      </article>
+                     <aside>Sidebar content</aside>
                  </body>
              </html>
          "#;

          let extractor = GenericExtractor::new(html.to_string());
-         let document = Html::parse_document(html);
-         let body = extractor.extract_body(&document);
+         let result = extractor.extract().unwrap();

-         assert!(body.is_some());
-         assert!(body.unwrap().contains("This is the article content"));
+         assert!(result.body_html.contains("main article content"));
      }

      #[test]
···
      }

      #[test]
+     fn test_extract_author_from_byline() {
+         let html = r#"
+             <html>
+                 <body>
+                     <span class="byline">By Jane Smith</span>
+                 </body>
+             </html>
+         "#;
+
+         let extractor = GenericExtractor::new(html.to_string());
+         let document = Html::parse_document(html);
+         let author = extractor.extract_author(&document);
+
+         assert_eq!(author, Some("Jane Smith".to_string()));
+     }
+
+     #[test]
      fn test_extract_date_from_meta() {
          let html = r#"
              <html>
                  <head>
-                     <meta property="article:published_time" content="2024-01-15">
+                     <meta property="article:published_time" content="2024-01-15T10:30:00Z">
                  </head>
              </html>
          "#;
···
          let document = Html::parse_document(html);
          let date = extractor.extract_date(&document);

-         assert_eq!(date, Some("2024-01-15".to_string()));
+         assert_eq!(date, Some("2024-01-15T10:30:00Z".to_string()));
      }

      #[test]
···
          let html = r#"
              <html>
                  <head>
-                     <title>Test Article</title>
+                     <meta property="og:title" content="Test Article">
                      <meta name="author" content="Jane Smith">
                      <meta property="article:published_time" content="2024-01-15">
                  </head>
                  <body>
+                     <header>Site Header</header>
                      <article>
                          <h1>Article Title</h1>
-                         <p>Article content goes here.</p>
+                         <p>This is the main article content. It contains several paragraphs of text that make up the body of the article. The content should be substantial enough to score well.</p>
+                         <p>This is another paragraph with additional content. More words here to ensure we have a proper article body.</p>
                      </article>
+                     <footer>Site Footer</footer>
                  </body>
              </html>
          "#;
···
          let result = extractor.extract().unwrap();

          assert_eq!(result.title, "Test Article");
-         assert!(result.body_html.contains("Article content goes here"));
+         assert!(result.body_html.contains("main article content"));
          assert_eq!(result.author, Some("Jane Smith".to_string()));
          assert_eq!(result.date, Some("2024-01-15".to_string()));
+     }
+
+     #[test]
+     fn test_find_candidates_skips_nav() {
+         let html = r#"
+             <html>
+                 <body>
+                     <nav class="navigation">
+                         <p>Nav item 1</p>
+                         <p>Nav item 2</p>
+                     </nav>
+                     <article>
+                         <p>Real content here that should be selected as the main candidate.</p>
+                     </article>
+                 </body>
+             </html>
+         "#;
+
+         let extractor = GenericExtractor::new(html.to_string());
+         let document = Html::parse_document(html);
+         let candidates = extractor.find_candidates(&document);
+
+         assert!(!candidates.is_empty());
+         for candidate in &candidates {
+             assert_ne!(candidate.value().name(), "nav");
+         }
+     }
+
+     #[test]
+     fn test_scored_extraction_prefers_article() {
+         let html = r#"
+             <html>
+                 <head><title>Test</title></head>
+                 <body>
+                     <div class="sidebar">
+                         <p>Sidebar content here.</p>
+                     </div>
+                     <article class="post-content">
+                         <p>This is the main article content with plenty of text to score well in the content scoring algorithm.</p>
+                         <p>Multiple paragraphs help boost the score significantly.</p>
+                     </article>
+                 </body>
+             </html>
+         "#;
+
+         let extractor = GenericExtractor::new(html.to_string());
+         let result = extractor.extract().unwrap();
+
+         assert!(result.body_html.contains("main article content"));
      }
  }
```
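A sketch of driving the scored extractor on its own, assuming the `extractor` module is public at the crate root (its re-exports land in `extractor/mod.rs` just below):

```rust
use malfestio_readability::extractor::GenericExtractor;

fn main() {
    let html = r#"<html><head>
        <meta property="og:title" content="Example Post">
    </head><body>
        <article><p>Enough prose here, with commas, to be scored as real content.</p></article>
    </body></html>"#;

    let extractor = GenericExtractor::new(html.to_string());
    match extractor.extract() {
        // body_html has already been through HtmlCleaner::clean
        Ok(content) => println!("{}\n{}", content.title, content.body_html),
        Err(_) => eprintln!("no content found"),
    }
}
```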
+4 -1
crates/readability/src/extractor/mod.rs
```diff
···
  pub mod scoring;
  pub mod xpath;

- pub use generic::GenericExtractor;
+ pub use generic::{ExtractedContent, GenericExtractor};
+ pub use scoring::{
+     ContentScore, calculate_class_weight, calculate_link_density, is_unlikely_candidate, is_viable_candidate,
+ };
  pub use xpath::XPathExtractor;
```
+302 -8
crates/readability/src/extractor/scoring.rs
```diff
···
  //! Content scoring for the Mozilla Readability algorithm
  //!
- //! TODO: Implement scoring
+ //! This module implements the heuristic-based scoring system used to identify
+ //! main content in HTML documents. Based on the Arc90/Mozilla Readability algorithm.
+
+ use scraper::{ElementRef, Selector};

  /// Content score for an element
- #[derive(Debug, Clone)]
+ #[derive(Debug, Clone, Default)]
  pub struct ContentScore {
-     /// Text length of the element
-     pub text_length: usize,
-     /// Link density (0.0 to 1.0)
+     /// Base score from tag type
+     pub tag_score: f32,
+     /// Bonus/penalty from class/id names
+     pub class_weight: f32,
+     /// Link density (0.0 to 1.0) - lower is better for content
      pub link_density: f32,
-     /// Class/ID weight (positive for content, negative for non-content)
-     pub class_weight: f32,
+     /// Text length bonus
+     pub text_length_bonus: f32,
+     /// Comma count bonus (indicates prose)
+     pub comma_bonus: f32,
      /// Total calculated score
      pub total: f32,
  }

+ impl ContentScore {
+     /// Create a new score for an element
+     ///
+     /// Total score is calculated as:
+     ///
+     /// ```text
+     /// tag_score + class_weight + text_length_bonus + comma_bonus - (link_density * 10.0)
+     /// ```
+     ///
+     /// High link density is penalized (navigation/sidebar content)
+     pub fn new(element: ElementRef) -> Self {
+         let tag_score = calculate_tag_score(element);
+         let class_weight = calculate_class_weight(element);
+         let (text_length_bonus, comma_bonus) = calculate_text_bonuses(element);
+         let link_density = calculate_link_density(element);
+         let total = tag_score + class_weight + text_length_bonus + comma_bonus - (link_density * 10.0);
+         Self { tag_score, class_weight, link_density, text_length_bonus, comma_bonus, total }
+     }
+ }
+
  /// Positive class/ID patterns indicating content
  pub const POSITIVE_PATTERNS: &[&str] = &[
-     "article", "body", "content", "entry", "main", "page", "post", "text", "blog", "story",
+     "article",
+     "body",
+     "content",
+     "entry",
+     "main",
+     "page",
+     "post",
+     "text",
+     "blog",
+     "story",
+     "hentry",
+     "h-entry",
+     "entry-content",
+     "post-content",
+     "article-content",
  ];

  /// Negative class/ID patterns indicating non-content
···
      "agegate",
      "pagination",
      "nav",
+     "related",
+     "social",
+     "widget",
+     "promo",
+     "masthead",
+     "meta",
+     "outbrain",
+     "taboola",
  ];

+ /// Tags that are likely to contain main content
+ const POSITIVE_TAGS: &[&str] = &["article", "main", "section", "div", "p", "td", "pre"];
+
+ /// Tags unlikely to contain main content
+ const NEGATIVE_TAGS: &[&str] = &[
+     "nav",
+     "aside",
+     "footer",
+     "header",
+     "form",
+     "iframe",
+     "figure",
+     "figcaption",
+ ];
+
+ /// Calculate base score from element tag name
+ fn calculate_tag_score(element: ElementRef) -> f32 {
+     let tag_name = element.value().name();
+
+     for tag in POSITIVE_TAGS {
+         if tag_name == *tag {
+             return match *tag {
+                 "article" => 10.0,
+                 "main" => 8.0,
+                 "section" => 5.0,
+                 "div" => 5.0,
+                 "p" => 3.0,
+                 "pre" => 3.0,
+                 "td" => 3.0,
+                 _ => 0.0,
+             };
+         }
+     }
+
+     for tag in NEGATIVE_TAGS {
+         if tag_name == *tag {
+             return -5.0;
+         }
+     }
+
+     0.0
+ }
+
+ /// Calculate class/id weight based on positive/negative patterns
+ pub fn calculate_class_weight(element: ElementRef) -> f32 {
+     let mut weight: f32 = 0.0;
+
+     let class_str = element.value().attr("class").unwrap_or("");
+     let id_str = element.value().attr("id").unwrap_or("");
+     let combined = format!("{} {}", class_str, id_str).to_lowercase();
+     for pattern in POSITIVE_PATTERNS {
+         if combined.contains(pattern) {
+             weight += 25.0;
+         }
+     }
+
+     for pattern in NEGATIVE_PATTERNS {
+         if combined.contains(pattern) {
+             weight -= 25.0;
+         }
+     }
+
+     weight
+ }
+
+ /// Calculate text length and comma bonuses
+ fn calculate_text_bonuses(element: ElementRef) -> (f32, f32) {
+     let text: String = element.text().collect();
+     let text_length = text.len();
+     let comma_count = text.matches(',').count();
+
+     let text_length_bonus = ((text_length as f32).sqrt() / 5.0).min(10.0);
+     let comma_bonus = (comma_count as f32).min(3.0);
+
+     (text_length_bonus, comma_bonus)
+ }
+
+ /// Calculate link density (ratio of link text to total text)
+ pub fn calculate_link_density(element: ElementRef) -> f32 {
+     let text: String = element.text().collect();
+     let total_length = text.len();
+
+     if total_length == 0 {
+         return 0.0;
+     }
+
+     let mut link_length = 0usize;
+
+     if let Ok(selector) = Selector::parse("a") {
+         for link in element.select(&selector) {
+             let link_text: String = link.text().collect();
+             link_length += link_text.len();
+         }
+     }
+
+     link_length as f32 / total_length as f32
+ }
+
+ /// Check if an element is an "unlikely candidate" (sidebar, comment, etc.)
+ pub fn is_unlikely_candidate(element: ElementRef) -> bool {
+     let class_str = element.value().attr("class").unwrap_or("");
+     let id_str = element.value().attr("id").unwrap_or("");
+     let combined = format!("{} {}", class_str, id_str).to_lowercase();
+
+     for pattern in NEGATIVE_PATTERNS {
+         if combined.contains(pattern) {
+             for positive in POSITIVE_PATTERNS {
+                 if combined.contains(positive) {
+                     return false;
+                 }
+             }
+             return true;
+         }
+     }
+
+     false
+ }
+
+ /// Check if an element has enough content to be a candidate
+ pub fn is_viable_candidate(element: ElementRef) -> bool {
+     let text: String = element.text().collect();
+     let text_length = text.len();
+
+     if text_length < 25 {
+         return false;
+     }
+     if let Ok(selector) = Selector::parse("p") {
+         let p_count = element.select(&selector).count();
+         if p_count > 0 {
+             return true;
+         }
+     }
+
+     text_length >= 100
+ }
+
+ #[cfg(test)]
+ mod tests {
+     use super::*;
+     use scraper::Html;
+
+     #[test]
+     fn test_positive_patterns_detection() {
+         let html = r#"<div id="content" class="article-body">Test content</div>"#;
+         let document = Html::parse_fragment(html);
+         let selector = Selector::parse("div").unwrap();
+         let element = document.select(&selector).next().unwrap();
+
+         let weight = calculate_class_weight(element);
+         assert!(weight > 0.0, "Should have positive weight for content/article classes");
+     }
+
+     #[test]
+     fn test_negative_patterns_detection() {
+         let html = r#"<div id="sidebar" class="comment-section">Test content</div>"#;
+         let document = Html::parse_fragment(html);
+         let selector = Selector::parse("div").unwrap();
+         let element = document.select(&selector).next().unwrap();
+
+         let weight = calculate_class_weight(element);
+         assert!(weight < 0.0, "Should have negative weight for sidebar/comment classes");
+     }
+
+     #[test]
+     fn test_link_density_calculation() {
+         let html = r#"<div>Some text here <a href="#">link one</a> and <a href="#">link two</a> more text</div>"#;
+         let document = Html::parse_fragment(html);
+         let selector = Selector::parse("div").unwrap();
+         let element = document.select(&selector).next().unwrap();
+
+         let density = calculate_link_density(element);
+         assert!(density > 0.0 && density < 1.0, "Link density should be between 0 and 1");
+     }
+
+     #[test]
+     fn test_high_link_density() {
+         let html = r#"<div><a href="#">link</a><a href="#">link</a><a href="#">link</a></div>"#;
+         let document = Html::parse_fragment(html);
+         let selector = Selector::parse("div").unwrap();
+         let element = document.select(&selector).next().unwrap();
+
+         let density = calculate_link_density(element);
+         assert!(
+             density > 0.8,
+             "Should detect high link density in navigation-like content"
+         );
+     }
+
+     #[test]
+     fn test_unlikely_candidate() {
+         let html = r#"<div class="sidebar">Sidebar content</div>"#;
+         let document = Html::parse_fragment(html);
+         let selector = Selector::parse("div").unwrap();
+         let element = document.select(&selector).next().unwrap();
+
+         assert!(is_unlikely_candidate(element), "Sidebar should be unlikely candidate");
+     }
+
+     #[test]
+     fn test_viable_candidate_with_paragraphs() {
+         let html = r#"<div><p>This is a paragraph with enough content to be considered viable.</p></div>"#;
+         let document = Html::parse_fragment(html);
+         let selector = Selector::parse("div").unwrap();
+         let element = document.select(&selector).next().unwrap();
+
+         assert!(is_viable_candidate(element), "Div with paragraph should be viable");
+     }
+
+     #[test]
+     fn test_content_score_creation() {
+         let html =
+             r#"<article class="post-content"><p>This is article content with some commas, here, there.</p></article>"#;
+         let document = Html::parse_fragment(html);
+         let selector = Selector::parse("article").unwrap();
+         let element = document.select(&selector).next().unwrap();
+
+         let score = ContentScore::new(element);
+         assert!(score.tag_score > 0.0, "Article tag should have positive score");
+         assert!(score.class_weight > 0.0, "post-content class should be positive");
+         assert!(score.comma_bonus > 0.0, "Should detect commas");
+     }
+
+     #[test]
+     fn test_tag_score_article() {
+         let html = r#"<article>Content</article>"#;
+         let document = Html::parse_fragment(html);
+         let selector = Selector::parse("article").unwrap();
+         let element = document.select(&selector).next().unwrap();
+
+         let score = calculate_tag_score(element);
+         assert_eq!(score, 10.0, "Article tag should score 10");
+     }
+
+     #[test]
+     fn test_tag_score_nav() {
+         let html = r#"<nav>Navigation</nav>"#;
+         let document = Html::parse_fragment(html);
+         let selector = Selector::parse("nav").unwrap();
+         let element = document.select(&selector).next().unwrap();
+
+         let score = calculate_tag_score(element);
+         assert_eq!(score, -5.0, "Nav tag should score -5");
+     }
+ }
```
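To make the total formula concrete, here is a small sketch that scores one element in isolation. `ContentScore::new` takes a `scraper::ElementRef`, so the example parses a fragment directly; it assumes the `extractor` re-exports above are reachable from the crate root:

```rust
use malfestio_readability::extractor::{ContentScore, calculate_link_density};
use scraper::{Html, Selector};

fn main() {
    let html = r#"<article class="post-content">
        <p>Long prose, with commas, scores well; a lone link barely dents it.</p>
        <a href="/next">next</a>
    </article>"#;

    let document = Html::parse_fragment(html);
    let selector = Selector::parse("article").unwrap();
    let element = document.select(&selector).next().unwrap();

    // article tag (+10), +25 per matching class pattern, sqrt-scaled
    // text-length bonus, comma bonus capped at 3, minus 10x link density.
    let score = ContentScore::new(element);
    println!(
        "total {:.1}, link density {:.2}",
        score.total,
        calculate_link_density(element)
    );
}
```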
+51 -5
crates/readability/src/extractor/xpath.rs
```diff
···
  use regex::Regex;
  use scraper::{ElementRef, Html, Selector};

+ static VOID_ELEMENTS: &[&str] = &[
+     "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr",
+ ];
+
  /// Extracted content from XPath rules
  #[derive(Debug, Clone)]
  pub struct ExtractedContent {
···
                  output.push_str(&html_escape::encode_text(&text.to_string()));
              }
          }
-
-         const VOID_ELEMENTS: &[&str] = &[
-             "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track",
-             "wbr",
-         ];

          if !VOID_ELEMENTS.contains(&tag) {
              output.push_str("</");
···
          assert_eq!(result.author, Some("Test Author".to_string()));
          assert_eq!(result.date, Some("2024-01-15".to_string()));
      }
+
+     #[test]
+     fn test_strip_elements_inside_body() {
+         let html = r#"
+             <html>
+                 <body>
+                     <div id="bodyContent">
+                         <h2>Section Title <span class="mw-editsection">[edit]</span></h2>
+                         <p>Main content here.</p>
+                         <h2>Another Section <span class="mw-editsection-bracket">[</span></h2>
+                     </div>
+                 </body>
+             </html>
+         "#;
+
+         let config = SiteConfig {
+             body: vec!["//*[@id='bodyContent']".to_string()],
+             strip_id_or_class: vec!["editsection".to_string()],
+             ..Default::default()
+         };
+
+         let extractor = XPathExtractor::new(html.to_string());
+         let result = extractor.extract(&config).unwrap();
+
+         let body = result.body_html.expect("Should extract body");
+         println!("Extracted body: {}", body);
+
+         assert!(!body.contains("mw-editsection"), "mw-editsection should be stripped");
+         assert!(!body.contains("[edit]"), "[edit] text should be stripped");
+         assert!(body.contains("Main content here"));
+         assert!(body.contains("Section Title"));
+     }
+ }
+
+ #[test]
+ fn test_wikipedia_xpath_patterns() {
+     let extractor = XPathExtractor::new(String::new());
+
+     // Wikipedia title XPath
+     let (css, filter) = extractor.xpath_to_css_with_attr("//h1[@id='firstHeading']").unwrap();
+     assert_eq!(css, "h1#firstHeading");
+     assert!(filter.is_none());
+
+     // Wikipedia body XPath (note space around =)
+     let (css, filter) = extractor.xpath_to_css_with_attr("//div[@id = 'bodyContent']").unwrap();
+     assert_eq!(css, "div#bodyContent");
+     assert!(filter.is_none());
  }
```
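A sketch of the XPath path from user code, mirroring the strip-rule test above. The `SiteConfig` import path is an assumption: the tests reach it via the crate's config module, whose layout isn't part of this diff.

```rust
// Assumed import path for SiteConfig (hypothetical; see lead-in).
use malfestio_readability::config::SiteConfig;
use malfestio_readability::extractor::XPathExtractor;

/// Extract a Wikipedia-style body container, dropping "[edit]" widgets.
fn extract_wiki_body(html: String) -> Option<String> {
    let config = SiteConfig {
        // Select the body container by id...
        body: vec!["//*[@id='bodyContent']".to_string()],
        // ...then strip any element whose class or id contains "editsection".
        strip_id_or_class: vec!["editsection".to_string()],
        ..Default::default()
    };

    XPathExtractor::new(html).extract(&config).ok()?.body_html
}
```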
+16 -24
crates/readability/src/lib.rs
```diff
···
  }

  impl Readability {
-     /// Create a new Readability instance
-     ///
-     /// # Arguments
-     ///
-     /// * `html` - The HTML content to extract from
-     /// * `url` - Optional URL of the article (used for rule matching)
      pub fn new(html: String, url: Option<&str>) -> Self {
          Self { html, url: url.map(String::from), rules_dir: None }
      }
···
      /// Extract article content from HTML
      ///
-     /// ## Extraction Flow:
      /// 1. If URL provided: Try to load site-specific XPath rules from embedded rules
-     /// 2. If rules found: Attempt XPath-based extraction
+     /// 2. If rules found: Attempt XPath-based extraction with strip rules applied
      /// 3. If no rules OR XPath extraction fails: Fall back to generic heuristic extraction
-     /// 4. Convert extracted HTML to markdown
-     /// 5. Generate excerpt from markdown
-     /// 6. Return complete Article struct
+     /// 4. Clean extracted HTML (remove scripts, styles, unlikely elements)
+     /// 5. Convert cleaned HTML to markdown
+     /// 6. Generate excerpt from markdown
+     /// 7. Return complete Article struct
      ///
-     /// ## Implementation Gaps:
-     /// - XPath extraction doesn't handle complex expressions with `contains()`, `normalize-space()`, etc.
-     ///   These will fall back to generic extraction
-     /// - No content cleaning between XPath/generic extraction and markdown conversion
-     ///   (scripts, styles, etc. may be present in extracted HTML)
-     /// - Generic extraction may include non-content elements (nav, footer, etc.)
+     /// Supported XPath Features:
+     /// - Simple tag selection: `//tag`
+     /// - ID selection: `//tag[@id='value']`
+     /// - Class matching: `//tag[@class='value']`, `//tag[contains(@class, 'value')]`
+     /// - Normalized class: `//tag[contains(concat(' ',normalize-space(@class),' '),' value ')]`
+     /// - Attribute extraction: `//meta[@name='value']/@content`
+     /// - Strip rules: `strip_id_or_class` and `strip` XPath directives
      ///
-     /// ## Design Decision:
+     /// Design:
      /// We prefer to return *something* (via generic extraction) rather than fail completely.
      /// This maximizes success rate at the cost of potentially lower quality extraction.
-     ///
-     /// TODO: Add HTML cleaning step before markdown conversion
-     /// TODO: Implement XPath strip directives to remove unwanted elements
-     /// TODO: Add content validation (minimum length, etc.)
      pub fn parse(&self) -> Result<Article> {
          use config::ConfigLoader;
          use converter::to_markdown;
···
              self.extract_with_generic()?
          };

-         let markdown = to_markdown(&content);
+         let cleaned_content = cleaner::HtmlCleaner::clean(&content);
+         let markdown = to_markdown(&cleaned_content);
          let excerpt = Some(converter::html2md::generate_excerpt(&markdown, 200));
-         Ok(Article { title, content, markdown, author, published_date: date, excerpt })
+         Ok(Article { title, content: cleaned_content, markdown, author, published_date: date, excerpt })
      }

      /// Extract using generic heuristic-based algorithm
```
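End to end, the public surface stays small. A usage sketch matching the integration tests below (`article.html` is a stand-in input file):

```rust
use malfestio_readability::Readability;

fn main() {
    let html = std::fs::read_to_string("article.html").expect("read input");

    // Passing the URL lets site-specific XPath rules match first;
    // otherwise the generic scored extractor runs as the fallback.
    let readability = Readability::new(html, Some("https://example.com/article"));

    if let Ok(article) = readability.parse() {
        println!("# {}", article.title);
        if let Some(author) = &article.author {
            println!("by {author}");
        }
        println!("{}", article.markdown);
    }
}
```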
+90
crates/readability/tests/readability_tests.rs
```rust
use malfestio_readability::Readability;

#[tokio::test]
#[ignore = "requires network access"]
async fn test_arxiv_extraction() {
    let url = "https://arxiv.org/abs/2009.03017";

    let client = reqwest::Client::builder()
        .user_agent("Mozilla/5.0 (compatible; MalfestioBot/1.0)")
        .build()
        .unwrap();

    let response = client.get(url).send().await.unwrap();
    let html = response.text().await.unwrap();

    let readability = Readability::new(html, Some(url));
    let article = readability.parse().unwrap();

    assert!(!article.title.is_empty(), "Title should be extracted");
    println!("Title: {}", article.title);

    assert!(!article.markdown.is_empty(), "Body/markdown should be extracted");
    assert!(article.markdown.len() > 50, "Abstract should have substantial content");
    println!("Markdown length: {} chars", article.markdown.len());

    assert!(article.author.is_some(), "Author should be extracted from meta tag");
    println!("Author: {:?}", article.author);

    assert!(
        article.published_date.is_some(),
        "Date should be extracted from meta tag"
    );
    println!("Date: {:?}", article.published_date);
}

#[tokio::test]
#[ignore = "requires network access"]
async fn test_wikipedia_extraction() {
    let url = "https://en.wikipedia.org/wiki/Rust_(programming_language)";

    let client = reqwest::Client::builder()
        .user_agent("Mozilla/5.0 (compatible; MalfestioBot/1.0)")
        .build()
        .unwrap();

    let response = client.get(url).send().await.unwrap();
    let html = response.text().await.unwrap();

    let readability = Readability::new(html, Some(url));
    let article = readability.parse().unwrap();

    assert!(article.title.contains("Rust"), "Title should contain 'Rust'");
    println!("Title: {}", article.title);

    assert!(
        article.markdown.len() > 1000,
        "Wikipedia article should have substantial content"
    );
    println!("Markdown length: {} chars", article.markdown.len());

    // Verify strip rules worked: mw-editsection elements should be removed
    assert!(
        !article.content.contains("mw-editsection"),
        "Edit section elements (mw-editsection) should be stripped"
    );
}

/// Test extraction for site without specific rules (falls back to generic)
#[tokio::test]
#[ignore = "requires network access"]
async fn test_generic_fallback_extraction() {
    let url = "https://www.rust-lang.org/";

    let client = reqwest::Client::builder()
        .user_agent("Mozilla/5.0 (compatible; MalfestioBot/1.0)")
        .build()
        .unwrap();

    let response = client.get(url).send().await.unwrap();
    let html = response.text().await.unwrap();

    let readability = Readability::new(html, Some(url));
    let article = readability.parse().unwrap();

    assert!(!article.title.is_empty(), "Title should be extracted via generic");
    assert!(!article.markdown.is_empty(), "Content should be extracted via generic");

    println!("Title: {}", article.title);
    println!("Markdown length: {} chars", article.markdown.len());
}
```
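The integration tests are marked `#[ignore]` so the suite stays offline by default; opt in explicitly when network access is available:

```sh
# From crates/readability: run only the ignored (network) tests
cargo test -- --ignored

# Or run everything, ignored tests included
cargo test -- --include-ignored
```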