···
regex = "1.12"

[dev-dependencies]
-tokio = { version = "1.48", features = ["test-util"] }
+tokio = { version = "1.48", features = ["rt-multi-thread", "macros"] }
+reqwest = "0.12"

+294 -12
crates/readability/src/cleaner/sanitizer.rs
···
//! HTML sanitization and cleaning
+//!
+//! This module provides utilities to clean extracted HTML content,
+//! removing scripts, styles, unwanted elements, and normalizing the output.
+
+use crate::extractor::scoring::is_unlikely_candidate;
+use scraper::{Html, Selector};

/// HTML cleaner and sanitizer
pub struct HtmlCleaner;

impl HtmlCleaner {
-    /// Clean HTML content
+    /// Clean HTML content by applying all cleaning steps
+    ///
+    /// Steps:
+    /// 1. Remove scripts and styles
+    /// 2. Remove unlikely candidates (sidebar, comments, etc.)
+    /// 3. Remove empty elements
+    /// 4. Clean attributes (keep only essential ones)
+    /// 5. Normalize whitespace
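+    ///
+    /// A minimal usage sketch (the input HTML here is illustrative, not from a test):
+    ///
+    /// ```ignore
+    /// let cleaned = HtmlCleaner::clean("<div><script>x()</script><p>Hi</p></div>");
+    /// assert!(!cleaned.contains("<script>"));
+    /// ```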
    pub fn clean(html: &str) -> String {
-        // TODO: Implement cleaning
-        html.to_string()
+        let mut result = Self::remove_scripts_and_styles(html);
+        result = Self::remove_unlikely_elements(&result);
+        result = Self::remove_empty_elements(&result);
+        result = Self::clean_attributes(&result);
+        result = Self::normalize_whitespace(&result);
+        result
    }

-    /// Remove scripts and styles
+    /// Remove script and style tags and their contents
    pub fn remove_scripts_and_styles(html: &str) -> String {
-        // TODO: Implement
-        html.to_string()
+        let document = Html::parse_fragment(html);
+        let mut result = html.to_string();
+
+        // String-based removal: this relies on `element.html()` re-serializing
+        // to the same markup that appears in the source string.
+        for selector_str in ["script", "style", "noscript", "link[rel='stylesheet']"] {
+            if let Ok(selector) = Selector::parse(selector_str) {
+                for element in document.select(&selector) {
+                    let element_html = element.html();
+                    result = result.replace(&element_html, "");
+                }
+            }
+        }
+
+        result
+    }
+
+    /// Remove elements that are unlikely to be main content
+    pub fn remove_unlikely_elements(html: &str) -> String {
+        let document = Html::parse_fragment(html);
+        let mut result = html.to_string();
+
+        let unlikely_selectors = [
+            "nav",
+            "aside",
+            "footer",
+            "header",
+            "[role='navigation']",
+            "[role='banner']",
+            "[role='contentinfo']",
+            "[role='complementary']",
+            ".sidebar",
+            ".advertisement",
+            ".ad",
+            ".ads",
+            ".social-share",
+            ".share-buttons",
+            ".related-posts",
+            ".comments",
+            "#comments",
+            ".comment-section",
+        ];
+
+        for selector_str in unlikely_selectors {
+            if let Ok(selector) = Selector::parse(selector_str) {
+                for element in document.select(&selector) {
+                    let element_html = element.html();
+                    result = result.replace(&element_html, "");
+                }
+            }
+        }
+
+        if let Ok(div_selector) = Selector::parse("div, section, aside, span") {
+            for element in document.select(&div_selector) {
+                if is_unlikely_candidate(element) {
+                    let element_html = element.html();
+                    result = result.replace(&element_html, "");
+                }
+            }
+        }
+
+        result
    }

-    /// Normalize whitespace
+    /// Clean attributes - keep only essential ones
+    ///
+    /// Keeps: href, src, alt, title, datetime, class
+    /// Removes: inline event handlers (onclick, onload, ...), style, data-*
+    pub fn clean_attributes(html: &str) -> String {
+        use regex::Regex;
+
+        let mut result = html.to_string();
+
+        let event_attrs = [
+            "onclick",
+            "onload",
+            "onerror",
+            "onmouseover",
+            "onmouseout",
+            "onkeydown",
+            "onkeyup",
+            "onfocus",
+            "onblur",
+            "onsubmit",
+        ];
+
+        for attr in event_attrs {
+            if let Ok(regex) = Regex::new(&format!(r#"\s+{}="[^"]*""#, attr)) {
+                result = regex.replace_all(&result, "").to_string();
+            }
+            if let Ok(regex) = Regex::new(&format!(r#"\s+{}='[^']*'"#, attr)) {
+                result = regex.replace_all(&result, "").to_string();
+            }
+        }
+
+        if let Ok(regex) = Regex::new(r#"\s+style="[^"]*""#) {
+            result = regex.replace_all(&result, "").to_string();
+        }
+
+        if let Ok(regex) = Regex::new(r#"\s+data-[a-z-]+="[^"]*""#) {
+            result = regex.replace_all(&result, "").to_string();
+        }
+
+        result
+    }
+
+    /// Normalize whitespace - collapse multiple spaces/newlines
    pub fn normalize_whitespace(html: &str) -> String {
-        // TODO: Implement
-        html.to_string()
+        use regex::Regex;
+
+        let mut result = html.to_string();
+
+        if let Ok(regex) = Regex::new(r"[ \t]+") {
+            result = regex.replace_all(&result, " ").to_string();
+        }
+
+        if let Ok(regex) = Regex::new(r"\n{3,}") {
+            result = regex.replace_all(&result, "\n\n").to_string();
+        }
+
+        if let Ok(regex) = Regex::new(r"(?m)^[ \t]+|[ \t]+$") {
+            result = regex.replace_all(&result, "").to_string();
+        }
+
+        result.trim().to_string()
    }

-    /// Remove empty elements
+    /// Remove empty elements (paragraphs, divs, spans with no content)
    pub fn remove_empty_elements(html: &str) -> String {
-        // TODO: Implement
-        html.to_string()
+        use regex::Regex;
+
+        let mut result = html.to_string();
+
+        if let Ok(regex) = Regex::new(r"<p[^>]*>\s*</p>") {
+            result = regex.replace_all(&result, "").to_string();
+        }
+
+        if let Ok(regex) = Regex::new(r"<div[^>]*>\s*</div>") {
+            result = regex.replace_all(&result, "").to_string();
+        }
+
+        if let Ok(regex) = Regex::new(r"<span[^>]*>\s*</span>") {
+            result = regex.replace_all(&result, "").to_string();
+        }
+
+        if let Ok(regex) = Regex::new(r"<p[^>]*>(&nbsp;|\s)*</p>") {
+            result = regex.replace_all(&result, "").to_string();
+        }
+
+        result
+    }
+
+    /// Remove elements by class or id containing specific patterns
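+    ///
+    /// For example, `remove_by_class_or_id(html, &["sidebar"])` drops any element
+    /// whose class or id contains "sidebar" (matched case-insensitively).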
+    pub fn remove_by_class_or_id(html: &str, patterns: &[&str]) -> String {
+        let document = Html::parse_fragment(html);
+        let mut result = html.to_string();
+
+        if let Ok(all_selector) = Selector::parse("*") {
+            for element in document.select(&all_selector) {
+                let class_str = element.value().attr("class").unwrap_or("");
+                let id_str = element.value().attr("id").unwrap_or("");
+                let combined = format!("{} {}", class_str, id_str).to_lowercase();
+
+                for pattern in patterns {
+                    if combined.contains(&pattern.to_lowercase()) {
+                        let element_html = element.html();
+                        result = result.replace(&element_html, "");
+                        break;
+                    }
+                }
+            }
+        }
+
+        result
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_remove_scripts() {
+        let html = r#"<div>Content<script>alert('test');</script>More content</div>"#;
+        let result = HtmlCleaner::remove_scripts_and_styles(html);
+        assert!(!result.contains("script"));
+        assert!(!result.contains("alert"));
+    }
+
+    #[test]
+    fn test_remove_styles() {
+        let html = r#"<div>Content<style>.test { color: red; }</style>More content</div>"#;
+        let result = HtmlCleaner::remove_scripts_and_styles(html);
+        assert!(!result.contains("style"));
+        assert!(!result.contains("color"));
+    }
+
+    #[test]
+    fn test_remove_empty_paragraphs() {
+        let html = r#"<div><p>Content</p><p></p><p>&nbsp;</p><p>More</p></div>"#;
+        let result = HtmlCleaner::remove_empty_elements(html);
+        assert!(result.contains("Content"));
+        assert!(result.contains("More"));
+
+        let p_count = result.matches("<p").count();
+        assert_eq!(p_count, 2, "Should have exactly 2 paragraphs");
+    }
+
+    #[test]
+    fn test_normalize_whitespace() {
+        let html = "  Content   with    multiple     spaces  ";
+        let result = HtmlCleaner::normalize_whitespace(html);
+        assert_eq!(result, "Content with multiple spaces");
+    }
+
+    #[test]
+    fn test_clean_inline_events() {
+        let html = r#"<a href="#" onclick="doSomething()">Link</a>"#;
+        let result = HtmlCleaner::clean_attributes(html);
+        assert!(!result.contains("onclick"));
+        assert!(result.contains("href"));
+    }
+
+    #[test]
+    fn test_clean_data_attributes() {
+        let html = r#"<div data-tracking="abc" data-id="123">Content</div>"#;
+        let result = HtmlCleaner::clean_attributes(html);
+        assert!(!result.contains("data-tracking"));
+        assert!(!result.contains("data-id"));
+    }
+
+    #[test]
+    fn test_full_clean() {
+        let html = r#"
+            <div>
+                <script>evil();</script>
+                <style>.bad {}</style>
+                <p>Good content here.</p>
+                <p></p>
+                <nav>Navigation</nav>
+                <p onclick="bad()">More content</p>
+            </div>
+        "#;
+
+        let result = HtmlCleaner::clean(html);
+        assert!(!result.contains("script"));
+        assert!(!result.contains("style"));
+        assert!(!result.contains("onclick"));
+        assert!(result.contains("Good content"));
+        assert!(result.contains("More content"));
+    }
+
+    #[test]
+    fn test_remove_by_class_or_id() {
+        let html = r#"<div><p class="sidebar">Sidebar</p><p class="content">Main</p></div>"#;
+        let result = HtmlCleaner::remove_by_class_or_id(html, &["sidebar"]);
+        assert!(!result.contains("Sidebar"));
+        assert!(result.contains("Main"));
    }
}
+307-134
crates/readability/src/extractor/generic.rs
···11-//! Generic content extraction with a simplified heuristic-based approach
22-//!
33-//! ## Implementation Strategy
44-//!
55-//! This is a **simplified** content extractor, not a full Mozilla Readability implementation.
66-//! It uses basic heuristics to find common patterns in HTML documents.
11+//! Generic content extraction using Mozilla Readability-style heuristics
72//!
88-//! ### What This Implementation Does:
99-//! - Extracts title from `<title>`, `<h1>`, or `og:title` meta tags
1010-//! - Finds body content by looking for semantic HTML5 tags and common class names
1111-//! - Extracts author from meta tags or common byline patterns
1212-//! - Extracts date from meta tags or `<time>` elements
1313-//! - Uses simple CSS selector patterns (no complex scoring algorithm)
33+//! ## Implementation Overview
144//!
1515-//! ### What This Implementation Does NOT Do (Implementation Gaps):
1616-//! - **No content scoring**: Unlike Mozilla Readability, we don't score paragraphs by
1717-//! text length, link density, or class names to find the "best" content candidate
1818-//! - **No sibling inclusion**: We don't check if siblings of the main content should
1919-//! be included based on similarity thresholds
2020-//! - **No ancestor scoring**: We don't propagate scores up the DOM tree
2121-//! - **No link density checking**: We don't filter out high link-density sections
2222-//! - **No "unlikely candidate" removal**: We don't remove elements based on negative
2323-//! class name patterns like "sidebar", "comment", etc.
2424-//! - **Limited fallback chain**: Mozilla Readability tries multiple strategies; we try
2525-//! a few common patterns and give up
55+//! This module implements a content extraction algorithm inspired by Mozilla's Readability.js.
66+//! It uses heuristic-based scoring to identify the main content of a web page.
267//!
2727-//! ### Design Decisions:
2828-//! - **Semantic HTML first**: We prefer `<article>`, `<main>` over class-based selection
2929-//! because they're more reliable indicators of content
3030-//! - **Multiple fallbacks**: We try progressively broader selectors to maximize success rate
3131-//! - **Metadata from standards**: We use standard meta tags (Open Graph, Schema.org, etc.)
3232-//! before falling back to heuristics
3333-//! - **Fail fast**: If we can't find content with our heuristics, we return an error
3434-//! rather than returning garbage content
88+//! ### Algorithm Steps:
99+//! 1. **Preprocessing**: Remove scripts, styles, and other noise
1010+//! 2. **Candidate Identification**: Find elements containing paragraphs
1111+//! 3. **Content Scoring**: Score candidates based on text length, link density, classes
1212+//! 4. **Ancestor Propagation**: Bubble scores up to parent/grandparent elements
1313+//! 5. **Top Candidate Selection**: Pick the highest-scoring element
1414+//! 6. **Sibling Inclusion**: Include relevant siblings of the top candidate
1515+//! 7. **Cleaning**: Remove unlikely elements and normalize output
3516//!
3636-//! ## TODOs:
3737-//! - TODO: Implement basic content scoring (count paragraphs, text length)
3838-//! - TODO: Add link density checks to filter navigation/sidebar
3939-//! - TODO: Remove unlikely candidates (ads, footers, etc.) by class name
4040-//! - TODO: Try multiple content candidates and pick the best one
4141-//! - TODO: Clean extracted HTML (remove scripts, styles, empty elements)
4242-//! - TODO: Handle multi-page articles (pagination detection)
1717+//! ### Scoring Factors:
1818+//! - Tag type (article > main > div > p)
1919+//! - Class/ID names (positive: "article", "content"; negative: "sidebar", "nav")
2020+//! - Text length (longer content scores higher)
2121+//! - Link density (high link density = navigation, low = content)
2222+//! - Comma count (commas indicate prose)
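+//!
+//! A quick usage sketch (the HTML string is illustrative):
+//!
+//! ```ignore
+//! let extractor = GenericExtractor::new(html_string);
+//! let content = extractor.extract()?;
+//! println!("{} ({} bytes of body)", content.title, content.body_html.len());
+//! ```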

+use crate::cleaner::HtmlCleaner;
use crate::error::{Error, Result};
-use scraper::{Html, Selector};
+use crate::extractor::scoring::{
+    ContentScore, calculate_link_density, is_unlikely_candidate, is_viable_candidate,
+};
+use scraper::{ElementRef, Html, Selector};
+use std::collections::HashMap;

/// Extracted content from generic algorithm
#[derive(Debug, Clone)]
···
    pub date: Option<String>,
}

-/// Generic content extractor using simple heuristics
+/// Candidate element with its score
+#[derive(Debug)]
+struct ScoredCandidate {
+    /// Index in the candidate list
+    _index: usize,
+    /// Computed content score
+    score: f32,
+    /// HTML content
+    html: String,
+}
+
+/// Generic content extractor using Readability-style heuristics
///
-/// This extractor attempts to find article content using common HTML patterns.
+/// This extractor attempts to find article content using content scoring.
/// It's designed as a fallback when site-specific XPath rules are not available.
pub struct GenericExtractor {
    html: String,
···
        Self { html }
    }

-    /// Extract content using simple heuristics
+    /// Extract content using content scoring algorithm
    ///
-    /// ## Extraction Strategy:
-    /// 1. Title: `<title>` tag, then `<h1>`, then `og:title` meta tag
-    /// 2. Body: `<article>`, then `<main>`, then `[role="main"]`, then `.content`
-    /// 3. Author: meta tags (author, og:author, article:author), then `.byline`
-    /// 4. Date: meta tags (article:published_time, datePublished), then `<time>`
+    /// Strategy:
    ///
-    /// ## Limitations:
-    /// - Returns first match, doesn't evaluate quality
-    /// - No cleaning of extracted HTML (scripts, ads, etc. may be included)
-    /// - May extract wrong content if page structure is unusual
+    /// 1. Preprocess HTML (remove scripts, styles)
+    /// 2. Extract metadata (title, author, date) from standard locations
+    /// 3. Find content candidates (elements with paragraphs)
+    /// 4. Score candidates and select the best one
+    /// 5. Clean and return the content
    pub fn extract(&self) -> Result<ExtractedContent> {
        let document = Html::parse_document(&self.html);

···
            .extract_title(&document)
            .ok_or_else(|| Error::ExtractionError("Could not extract title".to_string()))?;

+        let author = self.extract_author(&document);
+        let date = self.extract_date(&document);
+
        let body_html = self
-            .extract_body(&document)
+            .extract_body_with_scoring(&document)
+            .or_else(|| self.extract_body_simple(&document))
            .ok_or_else(|| Error::ExtractionError("Could not extract body content".to_string()))?;

-        let author = self.extract_author(&document);
-        let date = self.extract_date(&document);
-        Ok(ExtractedContent { title, body_html, author, date })
+        let clean_body = HtmlCleaner::clean(&body_html);
+
+        Ok(ExtractedContent { title, body_html: clean_body, author, date })
    }

-    /// Extract title from document
-    ///
-    /// Tries in order:
-    /// 1. `<title>` tag content (cleaned of site suffixes)
-    /// 2. First `<h1>` tag
-    /// 3. `og:title` meta tag
-    ///
-    /// ## Implementation Gap:
-    /// - Doesn't try to clean title (remove " | Site Name" suffixes, etc.)
-    /// - Doesn't check title quality or length
-    fn extract_title(&self, document: &Html) -> Option<String> {
-        if let Ok(selector) = Selector::parse("title")
-            && let Some(element) = document.select(&selector).next()
-        {
-            let text: String = element.text().collect();
-            if !text.trim().is_empty() {
-                return Some(text.trim().to_string());
+    /// Extract body content using content scoring algorithm
+    fn extract_body_with_scoring(&self, document: &Html) -> Option<String> {
+        let candidates = self.find_candidates(document);
+
+        if candidates.is_empty() {
+            return None;
+        }
+
+        let mut scored: Vec<ScoredCandidate> = candidates
+            .iter()
+            .enumerate()
+            .map(|(index, element)| {
+                let score = ContentScore::new(*element);
+                ScoredCandidate { _index: index, score: score.total, html: element.html() }
+            })
+            .collect();
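+
+        // Rough stand-in for Readability's parent/grandparent score propagation:
+        // each candidate credits the previous list entry with its full score and
+        // the entry before that with half. Candidate order comes from the selector
+        // passes in `find_candidates`, so this is a heuristic rather than true
+        // DOM-tree ancestor propagation.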
+        let mut ancestor_scores: HashMap<usize, f32> = HashMap::new();
+        for (i, candidate) in scored.iter().enumerate() {
+            if i > 0 {
+                *ancestor_scores.entry(i - 1).or_insert(0.0) += candidate.score;
+            }
+            if i > 1 {
+                *ancestor_scores.entry(i - 2).or_insert(0.0) += candidate.score / 2.0;
            }
        }

-        if let Ok(selector) = Selector::parse("h1")
-            && let Some(element) = document.select(&selector).next()
-        {
-            let text: String = element.text().collect();
-            if !text.trim().is_empty() {
-                return Some(text.trim().to_string());
+        for (index, bonus) in ancestor_scores {
+            if let Some(candidate) = scored.get_mut(index) {
+                candidate.score += bonus;
            }
        }

-        if let Ok(selector) = Selector::parse("meta[property='og:title']")
-            && let Some(element) = document.select(&selector).next()
-            && let Some(content) = element.value().attr("content")
-            && !content.trim().is_empty()
-        {
-            return Some(content.trim().to_string());
+        scored.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal));
+        scored.first().map(|c| c.html.clone())
+    }

+    /// Find candidate elements for content extraction
+    fn find_candidates<'a>(&self, document: &'a Html) -> Vec<ElementRef<'a>> {
+        let mut candidates: Vec<ElementRef<'a>> = Vec::new();
+
+        let container_selectors = ["article", "main", "section", "div", "[role='main']"];
+
+        for selector_str in container_selectors {
+            if let Ok(selector) = Selector::parse(selector_str) {
+                for element in document.select(&selector) {
+                    if is_unlikely_candidate(element) {
+                        continue;
+                    }
+                    if is_viable_candidate(element) {
+                        let density = calculate_link_density(element);
+                        if density < 0.5 {
+                            candidates.push(element);
+                        }
+                    }
+                }
+            }
        }

-        None
+        candidates
    }

-    /// Extract body content from document
-    ///
-    /// Tries in order:
-    /// 1. `<article>` tag (semantic HTML5)
-    /// 2. `<main>` tag (semantic HTML5)
-    /// 3. `[role="main"]` attribute (ARIA landmark)
-    /// 4. First element with class containing "content", "article", "post", "entry"
-    /// 5. `<body>` tag as last resort (usually includes nav, footer, etc.)
-    ///
-    /// ## Implementation Gaps:
-    /// - Doesn't score multiple candidates to find the best one
-    /// - Doesn't clean the HTML (may include ads, sidebars, etc.)
-    /// - Doesn't check content length or quality
-    /// - Doesn't exclude navigation, footers, comments within the selected element
-    /// - Returns inner HTML as-is without any processing
-    ///
-    /// TODO: Add basic cleaning (remove script, style, nav, footer, aside)
-    /// TODO: Check content length (minimum threshold)
-    /// TODO: If multiple candidates, pick the one with most <p> tags
-    fn extract_body(&self, document: &Html) -> Option<String> {
+    /// Simple fallback body extraction using common patterns
+    fn extract_body_simple(&self, document: &Html) -> Option<String> {
        let selectors = vec![
            "article",
            "main",
···
        None
    }
183183+ ///
184184+ /// Tries in order:
185185+ /// 1. `og:title` meta tag (usually cleanest)
186186+ /// 2. `<h1>` tag within likely content area
187187+ /// 3. `<title>` tag content (may include site name)
188188+ fn extract_title(&self, document: &Html) -> Option<String> {
189189+ if let Ok(selector) = Selector::parse("meta[property='og:title']")
190190+ && let Some(element) = document.select(&selector).next()
191191+ && let Some(content) = element.value().attr("content")
192192+ && !content.trim().is_empty()
193193+ {
194194+ return Some(content.trim().to_string());
195195+ }
196196+197197+ for container in ["article h1", "main h1", "h1"] {
198198+ if let Ok(selector) = Selector::parse(container)
199199+ && let Some(element) = document.select(&selector).next()
200200+ {
201201+ let text: String = element.text().collect();
202202+ if !text.trim().is_empty() {
203203+ return Some(text.trim().to_string());
204204+ }
205205+ }
206206+ }
207207+208208+ if let Ok(selector) = Selector::parse("title")
209209+ && let Some(element) = document.select(&selector).next()
210210+ {
211211+ let text: String = element.text().collect();
212212+ if !text.trim().is_empty() {
213213+ let title = Self::clean_title(&text);
214214+ return Some(title);
215215+ }
216216+ }
217217+218218+ None
219219+ }
220220+221221+ /// Clean title by removing common site name suffixes
222222+ fn clean_title(title: &str) -> String {
223223+ let title = title.trim();
224224+ let separators = [" | ", " - ", " — ", " :: ", " » ", " · "];
225225+226226+ for sep in separators {
227227+ if let Some(pos) = title.find(sep) {
228228+ let candidate = title[..pos].trim();
229229+ if candidate.len() > 10 {
230230+ return candidate.to_string();
231231+ }
232232+ }
233233+ }
234234+235235+ title.to_string()
236236+ }
237237+183238 /// Extract author from document
184239 ///
185240 /// Tries in order:
···187242 /// 2. `<meta property="og:author">` tag
188243 /// 3. `<meta property="article:author">` tag
189244 /// 4. Element with class "author", "byline", or "by"
190190- ///
191191- /// ## Implementation Gaps:
192192- /// - Doesn't parse structured data (JSON-LD, Schema.org)
193193- /// - Doesn't extract from "By John Doe" patterns in text
194194- /// - Returns first match without validation
245245+ /// 5. Schema.org author markup
195246 fn extract_author(&self, document: &Html) -> Option<String> {
196247 let meta_selectors = vec![
197248 "meta[name='author']",
198249 "meta[property='og:author']",
199250 "meta[property='article:author']",
251251+ "[itemprop='author']",
252252+ "[rel='author']",
200253 ];
201254202255 for selector_str in meta_selectors {
203256 if let Ok(selector) = Selector::parse(selector_str)
204257 && let Some(element) = document.select(&selector).next()
205205- && let Some(content) = element.value().attr("content")
206206- && !content.trim().is_empty()
207258 {
208208- return Some(content.trim().to_string());
259259+ if let Some(content) = element.value().attr("content")
260260+ && !content.trim().is_empty()
261261+ {
262262+ return Some(content.trim().to_string());
263263+ }
264264+265265+ let text: String = element.text().collect();
266266+ if !text.trim().is_empty() {
267267+ return Some(text.trim().to_string());
268268+ }
209269 }
210270 }
211271212212- let class_selectors = vec![".author", ".byline", ".by"];
272272+ let class_selectors = vec![".author", ".byline", ".by", ".post-author", ".entry-author"];
213273214274 for selector_str in class_selectors {
215275 if let Ok(selector) = Selector::parse(selector_str)
···217277 {
218278 let text: String = element.text().collect();
219279 if !text.trim().is_empty() {
220220- return Some(text.trim().to_string());
280280+ return Some(Self::clean_author(&text));
221281 }
222282 }
223283 }
224284225285 None
286286+ }
287287+288288+ /// Clean author text (remove "By " prefix, etc.)
289289+ fn clean_author(author: &str) -> String {
290290+ let author = author.trim();
291291+292292+ let prefixes = ["By ", "by ", "Author: ", "Written by "];
293293+ for prefix in prefixes {
294294+ if let Some(rest) = author.strip_prefix(prefix) {
295295+ return rest.trim().to_string();
296296+ }
297297+ }
298298+299299+ author.to_string()
226300 }
227301228302 /// Extract publication date from document
···232306 /// 2. `<meta itemprop="datePublished">` (Schema.org)
233307 /// 3. `<time datetime="...">` attribute
234308 /// 4. `<time>` element text content
235235- ///
236236- /// ## Implementation Gaps:
237237- /// - Doesn't parse or normalize date formats
238238- /// - Doesn't validate date values
239239- /// - Doesn't extract from text patterns ("Published on Jan 1, 2020")
240309 fn extract_date(&self, document: &Html) -> Option<String> {
241310 let meta_selectors = vec![
242311 "meta[property='article:published_time']",
243312 "meta[itemprop='datePublished']",
313313+ "meta[name='date']",
314314+ "meta[name='DC.date.issued']",
244315 ];
245316246317 for selector_str in meta_selectors {
···261332 return Some(datetime.trim().to_string());
262333 }
263334335335+ if let Ok(selector) = Selector::parse("[itemprop='datePublished']")
336336+ && let Some(element) = document.select(&selector).next()
337337+ {
338338+ if let Some(datetime) = element.value().attr("datetime")
339339+ && !datetime.trim().is_empty()
340340+ {
341341+ return Some(datetime.trim().to_string());
342342+ }
343343+344344+ if let Some(content) = element.value().attr("content")
345345+ && !content.trim().is_empty()
346346+ {
347347+ return Some(content.trim().to_string());
348348+ }
349349+ }
350350+264351 if let Ok(selector) = Selector::parse("time")
265352 && let Some(element) = document.select(&selector).next()
266353 {
···269356 return Some(text.trim().to_string());
270357 }
271358 }
359359+272360 None
273361 }
274362}
···
    use super::*;

    #[test]
-    fn test_extract_title_from_title_tag() {
+    fn test_extract_title_from_og() {
        let html = r#"
            <html>
-                <head><title>Test Article Title</title></head>
+                <head>
+                    <meta property="og:title" content="OG Title">
+                    <title>Page Title | Site Name</title>
+                </head>
                <body></body>
            </html>
        "#;
···
        let document = Html::parse_document(html);
        let title = extractor.extract_title(&document);

-        assert_eq!(title, Some("Test Article Title".to_string()));
+        assert_eq!(title, Some("OG Title".to_string()));
+    }
+
+    #[test]
+    fn test_clean_title_suffix() {
+        let title = "My Article Title | Some News Site";
+        let cleaned = GenericExtractor::clean_title(title);
+        assert_eq!(cleaned, "My Article Title");
    }

    #[test]
    fn test_extract_title_from_h1() {
        let html = r#"
            <html>
-                <body><h1>Article Heading</h1></body>
+                <body>
+                    <article>
+                        <h1>Article Heading</h1>
+                    </article>
+                </body>
            </html>
        "#;

···
    fn test_extract_body_from_article() {
        let html = r#"
            <html>
+                <head><title>Test</title></head>
                <body>
+                    <nav>Navigation here</nav>
                    <article>
-                        <p>This is the article content.</p>
+                        <p>This is the main article content with enough text to be considered viable content.</p>
+                        <p>Another paragraph here with more content to ensure we have substantial text.</p>
                    </article>
+                    <aside>Sidebar content</aside>
                </body>
            </html>
        "#;

        let extractor = GenericExtractor::new(html.to_string());
-        let document = Html::parse_document(html);
-        let body = extractor.extract_body(&document);
+        let result = extractor.extract().unwrap();

-        assert!(body.is_some());
-        assert!(body.unwrap().contains("This is the article content"));
+        assert!(result.body_html.contains("main article content"));
    }

    #[test]
···
    }

    #[test]
+    fn test_extract_author_from_byline() {
+        let html = r#"
+            <html>
+                <body>
+                    <span class="byline">By Jane Smith</span>
+                </body>
+            </html>
+        "#;
+
+        let extractor = GenericExtractor::new(html.to_string());
+        let document = Html::parse_document(html);
+        let author = extractor.extract_author(&document);
+
+        assert_eq!(author, Some("Jane Smith".to_string()));
+    }
+
+    #[test]
    fn test_extract_date_from_meta() {
        let html = r#"
            <html>
                <head>
-                    <meta property="article:published_time" content="2024-01-15">
+                    <meta property="article:published_time" content="2024-01-15T10:30:00Z">
                </head>
            </html>
        "#;
···
        let document = Html::parse_document(html);
        let date = extractor.extract_date(&document);

-        assert_eq!(date, Some("2024-01-15".to_string()));
+        assert_eq!(date, Some("2024-01-15T10:30:00Z".to_string()));
    }

    #[test]
···
        let html = r#"
            <html>
                <head>
-                    <title>Test Article</title>
+                    <meta property="og:title" content="Test Article">
                    <meta name="author" content="Jane Smith">
                    <meta property="article:published_time" content="2024-01-15">
                </head>
                <body>
+                    <header>Site Header</header>
                    <article>
                        <h1>Article Title</h1>
-                        <p>Article content goes here.</p>
+                        <p>This is the main article content. It contains several paragraphs of text that make up the body of the article. The content should be substantial enough to score well.</p>
+                        <p>This is another paragraph with additional content. More words here to ensure we have a proper article body.</p>
                    </article>
+                    <footer>Site Footer</footer>
                </body>
            </html>
        "#;
···
        let result = extractor.extract().unwrap();

        assert_eq!(result.title, "Test Article");
-        assert!(result.body_html.contains("Article content goes here"));
+        assert!(result.body_html.contains("main article content"));
        assert_eq!(result.author, Some("Jane Smith".to_string()));
        assert_eq!(result.date, Some("2024-01-15".to_string()));
+    }
+
+    #[test]
+    fn test_find_candidates_skips_nav() {
+        let html = r#"
+            <html>
+                <body>
+                    <nav class="navigation">
+                        <p>Nav item 1</p>
+                        <p>Nav item 2</p>
+                    </nav>
+                    <article>
+                        <p>Real content here that should be selected as the main candidate.</p>
+                    </article>
+                </body>
+            </html>
+        "#;
+
+        let extractor = GenericExtractor::new(html.to_string());
+        let document = Html::parse_document(html);
+        let candidates = extractor.find_candidates(&document);
+
+        assert!(!candidates.is_empty());
+        for candidate in &candidates {
+            assert_ne!(candidate.value().name(), "nav");
+        }
+    }
+
+    #[test]
+    fn test_scored_extraction_prefers_article() {
+        let html = r#"
+            <html>
+                <head><title>Test</title></head>
+                <body>
+                    <div class="sidebar">
+                        <p>Sidebar content here.</p>
+                    </div>
+                    <article class="post-content">
+                        <p>This is the main article content with plenty of text to score well in the content scoring algorithm.</p>
+                        <p>Multiple paragraphs help boost the score significantly.</p>
+                    </article>
+                </body>
+            </html>
+        "#;
+
+        let extractor = GenericExtractor::new(html.to_string());
+        let result = extractor.extract().unwrap();
+
+        assert!(result.body_html.contains("main article content"));
    }
}

+4 -1
crates/readability/src/extractor/mod.rs
···
pub mod scoring;
pub mod xpath;

-pub use generic::GenericExtractor;
+pub use generic::{ExtractedContent, GenericExtractor};
+pub use scoring::{
+    ContentScore, calculate_class_weight, calculate_link_density, is_unlikely_candidate, is_viable_candidate,
+};
pub use xpath::XPathExtractor;

+302 -8
crates/readability/src/extractor/scoring.rs
···
//! Content scoring for the Mozilla Readability algorithm
//!
-//! TODO: Implement scoring
+//! This module implements the heuristic-based scoring system used to identify
+//! main content in HTML documents. Based on the Arc90/Mozilla Readability algorithm.
+
+use scraper::{ElementRef, Selector};

/// Content score for an element
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Default)]
pub struct ContentScore {
-    /// Text length of the element
-    pub text_length: usize,
-    /// Link density (0.0 to 1.0)
+    /// Base score from tag type
+    pub tag_score: f32,
+    /// Bonus/penalty from class/id names
+    pub class_weight: f32,
+    /// Link density (0.0 to 1.0) - lower is better for content
    pub link_density: f32,
-    /// Class/ID weight (positive for content, negative for non-content)
-    pub class_weight: f32,
+    /// Text length bonus
+    pub text_length_bonus: f32,
+    /// Comma count bonus (indicates prose)
+    pub comma_bonus: f32,
    /// Total calculated score
    pub total: f32,
}

+impl ContentScore {
+    /// Create a new score for an element
+    ///
+    /// Total score is calculated as:
+    ///
+    /// ```text
+    /// tag_score + class_weight + text_length_bonus + comma_bonus - (link_density * 10.0)
+    /// ```
+    ///
+    /// High link density is penalized (navigation/sidebar content)
+    pub fn new(element: ElementRef) -> Self {
+        let tag_score = calculate_tag_score(element);
+        let class_weight = calculate_class_weight(element);
+        let (text_length_bonus, comma_bonus) = calculate_text_bonuses(element);
+        let link_density = calculate_link_density(element);
+        let total = tag_score + class_weight + text_length_bonus + comma_bonus - (link_density * 10.0);
+        Self { tag_score, class_weight, link_density, text_length_bonus, comma_bonus, total }
+    }
+}
+
/// Positive class/ID patterns indicating content
pub const POSITIVE_PATTERNS: &[&str] = &[
-    "article", "body", "content", "entry", "main", "page", "post", "text", "blog", "story",
+    "article",
+    "body",
+    "content",
+    "entry",
+    "main",
+    "page",
+    "post",
+    "text",
+    "blog",
+    "story",
+    "hentry",
+    "h-entry",
+    "entry-content",
+    "post-content",
+    "article-content",
];

/// Negative class/ID patterns indicating non-content
···
    "agegate",
    "pagination",
    "nav",
+    "related",
+    "social",
+    "widget",
+    "promo",
+    "masthead",
+    "meta",
+    "outbrain",
+    "taboola",
];
+
+/// Tags that are likely to contain main content
+const POSITIVE_TAGS: &[&str] = &["article", "main", "section", "div", "p", "td", "pre"];
+
+/// Tags unlikely to contain main content
+const NEGATIVE_TAGS: &[&str] = &[
+    "nav",
+    "aside",
+    "footer",
+    "header",
+    "form",
+    "iframe",
+    "figure",
+    "figcaption",
+];
+
+/// Calculate base score from element tag name
+fn calculate_tag_score(element: ElementRef) -> f32 {
+    let tag_name = element.value().name();
+
+    for tag in POSITIVE_TAGS {
+        if tag_name == *tag {
+            return match *tag {
+                "article" => 10.0,
+                "main" => 8.0,
+                "section" => 5.0,
+                "div" => 5.0,
+                "p" => 3.0,
+                "pre" => 3.0,
+                "td" => 3.0,
+                _ => 0.0,
+            };
+        }
+    }
+
+    for tag in NEGATIVE_TAGS {
+        if tag_name == *tag {
+            return -5.0;
+        }
+    }
+
+    0.0
+}
+
+/// Calculate class/id weight based on positive/negative patterns
+pub fn calculate_class_weight(element: ElementRef) -> f32 {
+    let mut weight: f32 = 0.0;
+
+    let class_str = element.value().attr("class").unwrap_or("");
+    let id_str = element.value().attr("id").unwrap_or("");
+    let combined = format!("{} {}", class_str, id_str).to_lowercase();
+
+    for pattern in POSITIVE_PATTERNS {
+        if combined.contains(pattern) {
+            weight += 25.0;
+        }
+    }
+
+    for pattern in NEGATIVE_PATTERNS {
+        if combined.contains(pattern) {
+            weight -= 25.0;
+        }
+    }
+
+    weight
+}
+
+/// Calculate text length and comma bonuses
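+///
+/// A worked example: 2500 characters of text earns the full length bonus, since
+/// sqrt(2500) / 5 = 10.0 (the cap); three or more commas earn the maximum
+/// comma bonus of 3.0.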
+fn calculate_text_bonuses(element: ElementRef) -> (f32, f32) {
+    let text: String = element.text().collect();
+    let text_length = text.len();
+    let comma_count = text.matches(',').count();
+
+    let text_length_bonus = ((text_length as f32).sqrt() / 5.0).min(10.0);
+    let comma_bonus = (comma_count as f32).min(3.0);
+
+    (text_length_bonus, comma_bonus)
+}
+
+/// Calculate link density (ratio of link text to total text)
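+///
+/// For example, `<div>five <a href="#">chars</a></div>` has 10 characters of
+/// text, 5 of them inside the link, so its density is 0.5.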
+pub fn calculate_link_density(element: ElementRef) -> f32 {
+    let text: String = element.text().collect();
+    let total_length = text.len();
+
+    if total_length == 0 {
+        return 0.0;
+    }
+
+    let mut link_length = 0usize;
+
+    if let Ok(selector) = Selector::parse("a") {
+        for link in element.select(&selector) {
+            let link_text: String = link.text().collect();
+            link_length += link_text.len();
+        }
+    }
+
+    link_length as f32 / total_length as f32
+}
+
+/// Check if an element is an "unlikely candidate" (sidebar, comment, etc.)
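+///
+/// A negative pattern only wins when no positive pattern is present: an element
+/// with `class="sidebar"` is unlikely, but `class="content sidebar"` is kept.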
+pub fn is_unlikely_candidate(element: ElementRef) -> bool {
+    let class_str = element.value().attr("class").unwrap_or("");
+    let id_str = element.value().attr("id").unwrap_or("");
+    let combined = format!("{} {}", class_str, id_str).to_lowercase();
+
+    for pattern in NEGATIVE_PATTERNS {
+        if combined.contains(pattern) {
+            for positive in POSITIVE_PATTERNS {
+                if combined.contains(positive) {
+                    return false;
+                }
+            }
+            return true;
+        }
+    }
+
+    false
+}
+
+/// Check if an element has enough content to be a candidate
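+///
+/// An element needs at least 25 characters of text; with a `<p>` child that is
+/// enough, otherwise 100 or more characters are required.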
+pub fn is_viable_candidate(element: ElementRef) -> bool {
+    let text: String = element.text().collect();
+    let text_length = text.len();
+
+    if text_length < 25 {
+        return false;
+    }
+
+    if let Ok(selector) = Selector::parse("p") {
+        let p_count = element.select(&selector).count();
+        if p_count > 0 {
+            return true;
+        }
+    }
+
+    text_length >= 100
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use scraper::Html;
+
+    #[test]
+    fn test_positive_patterns_detection() {
+        let html = r#"<div id="content" class="article-body">Test content</div>"#;
+        let document = Html::parse_fragment(html);
+        let selector = Selector::parse("div").unwrap();
+        let element = document.select(&selector).next().unwrap();
+
+        let weight = calculate_class_weight(element);
+        assert!(weight > 0.0, "Should have positive weight for content/article classes");
+    }
+
+    #[test]
+    fn test_negative_patterns_detection() {
+        let html = r#"<div id="sidebar" class="comment-section">Test content</div>"#;
+        let document = Html::parse_fragment(html);
+        let selector = Selector::parse("div").unwrap();
+        let element = document.select(&selector).next().unwrap();
+
+        let weight = calculate_class_weight(element);
+        assert!(weight < 0.0, "Should have negative weight for sidebar/comment classes");
+    }
+
+    #[test]
+    fn test_link_density_calculation() {
+        let html = r#"<div>Some text here <a href="#">link one</a> and <a href="#">link two</a> more text</div>"#;
+        let document = Html::parse_fragment(html);
+        let selector = Selector::parse("div").unwrap();
+        let element = document.select(&selector).next().unwrap();
+
+        let density = calculate_link_density(element);
+        assert!(density > 0.0 && density < 1.0, "Link density should be between 0 and 1");
+    }
+
+    #[test]
+    fn test_high_link_density() {
+        let html = r#"<div><a href="#">link</a><a href="#">link</a><a href="#">link</a></div>"#;
+        let document = Html::parse_fragment(html);
+        let selector = Selector::parse("div").unwrap();
+        let element = document.select(&selector).next().unwrap();
+
+        let density = calculate_link_density(element);
+        assert!(
+            density > 0.8,
+            "Should detect high link density in navigation-like content"
+        );
+    }
+
+    #[test]
+    fn test_unlikely_candidate() {
+        let html = r#"<div class="sidebar">Sidebar content</div>"#;
+        let document = Html::parse_fragment(html);
+        let selector = Selector::parse("div").unwrap();
+        let element = document.select(&selector).next().unwrap();
+
+        assert!(is_unlikely_candidate(element), "Sidebar should be unlikely candidate");
+    }
+
+    #[test]
+    fn test_viable_candidate_with_paragraphs() {
+        let html = r#"<div><p>This is a paragraph with enough content to be considered viable.</p></div>"#;
+        let document = Html::parse_fragment(html);
+        let selector = Selector::parse("div").unwrap();
+        let element = document.select(&selector).next().unwrap();
+
+        assert!(is_viable_candidate(element), "Div with paragraph should be viable");
+    }
+
+    #[test]
+    fn test_content_score_creation() {
+        let html =
+            r#"<article class="post-content"><p>This is article content with some commas, here, there.</p></article>"#;
+        let document = Html::parse_fragment(html);
+        let selector = Selector::parse("article").unwrap();
+        let element = document.select(&selector).next().unwrap();
+
+        let score = ContentScore::new(element);
+        assert!(score.tag_score > 0.0, "Article tag should have positive score");
+        assert!(score.class_weight > 0.0, "post-content class should be positive");
+        assert!(score.comma_bonus > 0.0, "Should detect commas");
+    }
+
+    #[test]
+    fn test_tag_score_article() {
+        let html = r#"<article>Content</article>"#;
+        let document = Html::parse_fragment(html);
+        let selector = Selector::parse("article").unwrap();
+        let element = document.select(&selector).next().unwrap();
+
+        let score = calculate_tag_score(element);
+        assert_eq!(score, 10.0, "Article tag should score 10");
+    }
+
+    #[test]
+    fn test_tag_score_nav() {
+        let html = r#"<nav>Navigation</nav>"#;
+        let document = Html::parse_fragment(html);
+        let selector = Selector::parse("nav").unwrap();
+        let element = document.select(&selector).next().unwrap();
+
+        let score = calculate_tag_score(element);
+        assert_eq!(score, -5.0, "Nav tag should score -5");
+    }
+}

+51 -5
crates/readability/src/extractor/xpath.rs
···
use regex::Regex;
use scraper::{ElementRef, Html, Selector};

+static VOID_ELEMENTS: &[&str] = &[
+    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr",
+];
+
/// Extracted content from XPath rules
#[derive(Debug, Clone)]
pub struct ExtractedContent {
···
                output.push_str(&html_escape::encode_text(&text.to_string()));
            }
        }
-
-        const VOID_ELEMENTS: &[&str] = &[
-            "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track",
-            "wbr",
-        ];

        if !VOID_ELEMENTS.contains(&tag) {
            output.push_str("</");
···
        assert_eq!(result.author, Some("Test Author".to_string()));
        assert_eq!(result.date, Some("2024-01-15".to_string()));
    }
+
+    #[test]
+    fn test_strip_elements_inside_body() {
+        let html = r#"
+            <html>
+                <body>
+                    <div id="bodyContent">
+                        <h2>Section Title <span class="mw-editsection">[edit]</span></h2>
+                        <p>Main content here.</p>
+                        <h2>Another Section <span class="mw-editsection-bracket">[</span></h2>
+                    </div>
+                </body>
+            </html>
+        "#;
+
+        let config = SiteConfig {
+            body: vec!["//*[@id='bodyContent']".to_string()],
+            strip_id_or_class: vec!["editsection".to_string()],
+            ..Default::default()
+        };
+
+        let extractor = XPathExtractor::new(html.to_string());
+        let result = extractor.extract(&config).unwrap();
+
+        let body = result.body_html.expect("Should extract body");
+        println!("Extracted body: {}", body);
+
+        assert!(!body.contains("mw-editsection"), "mw-editsection should be stripped");
+        assert!(!body.contains("[edit]"), "[edit] text should be stripped");
+        assert!(body.contains("Main content here"));
+        assert!(body.contains("Section Title"));
+    }
+
+    #[test]
+    fn test_wikipedia_xpath_patterns() {
+        let extractor = XPathExtractor::new(String::new());
+
+        // Wikipedia title XPath
+        let (css, filter) = extractor.xpath_to_css_with_attr("//h1[@id='firstHeading']").unwrap();
+        assert_eq!(css, "h1#firstHeading");
+        assert!(filter.is_none());
+
+        // Wikipedia body XPath (note the spaces around '=')
+        let (css, filter) = extractor.xpath_to_css_with_attr("//div[@id = 'bodyContent']").unwrap();
+        assert_eq!(css, "div#bodyContent");
+        assert!(filter.is_none());
+    }
}

+16 -24
crates/readability/src/lib.rs
···
}

impl Readability {
-    /// Create a new Readability instance
-    ///
-    /// # Arguments
-    ///
-    /// * `html` - The HTML content to extract from
-    /// * `url` - Optional URL of the article (used for rule matching)
    pub fn new(html: String, url: Option<&str>) -> Self {
        Self { html, url: url.map(String::from), rules_dir: None }
    }
···
    /// Extract article content from HTML
    ///
-    /// ## Extraction Flow:
    /// 1. If URL provided: Try to load site-specific XPath rules from embedded rules
-    /// 2. If rules found: Attempt XPath-based extraction
+    /// 2. If rules found: Attempt XPath-based extraction with strip rules applied
    /// 3. If no rules OR XPath extraction fails: Fall back to generic heuristic extraction
-    /// 4. Convert extracted HTML to markdown
-    /// 5. Generate excerpt from markdown
-    /// 6. Return complete Article struct
+    /// 4. Clean extracted HTML (remove scripts, styles, unlikely elements)
+    /// 5. Convert cleaned HTML to markdown
+    /// 6. Generate excerpt from markdown
+    /// 7. Return complete Article struct
    ///
-    /// ## Implementation Gaps:
-    /// - XPath extraction doesn't handle complex expressions with `contains()`, `normalize-space()`, etc.
-    ///   These will fall back to generic extraction
-    /// - No content cleaning between XPath/generic extraction and markdown conversion
-    ///   (scripts, styles, etc. may be present in extracted HTML)
-    /// - Generic extraction may include non-content elements (nav, footer, etc.)
+    /// Supported XPath Features:
+    /// - Simple tag selection: `//tag`
+    /// - ID selection: `//tag[@id='value']`
+    /// - Class matching: `//tag[@class='value']`, `//tag[contains(@class, 'value')]`
+    /// - Normalized class: `//tag[contains(concat(' ',normalize-space(@class),' '),' value ')]`
+    /// - Attribute extraction: `//meta[@name='value']/@content`
+    /// - Strip rules: `strip_id_or_class` and `strip` XPath directives
    ///
-    /// ## Design Decision:
+    /// Design:
    /// We prefer to return *something* (via generic extraction) rather than fail completely.
    /// This maximizes success rate at the cost of potentially lower quality extraction.
-    ///
-    /// TODO: Add HTML cleaning step before markdown conversion
-    /// TODO: Implement XPath strip directives to remove unwanted elements
-    /// TODO: Add content validation (minimum length, etc.)
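+    ///
+    /// A usage sketch (the URL and HTML values are illustrative):
+    ///
+    /// ```ignore
+    /// let article = Readability::new(html, Some("https://example.com/post")).parse()?;
+    /// println!("{}", article.markdown);
+    /// ```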
    pub fn parse(&self) -> Result<Article> {
        use config::ConfigLoader;
        use converter::to_markdown;
···
            self.extract_with_generic()?
        };

-        let markdown = to_markdown(&content);
+        let cleaned_content = cleaner::HtmlCleaner::clean(&content);
+        let markdown = to_markdown(&cleaned_content);
        let excerpt = Some(converter::html2md::generate_excerpt(&markdown, 200));
-        Ok(Article { title, content, markdown, author, published_date: date, excerpt })
+        Ok(Article { title, content: cleaned_content, markdown, author, published_date: date, excerpt })
    }

    /// Extract using generic heuristic-based algorithm

+90
crates/readability/tests/readability_tests.rs
+use malfestio_readability::Readability;
+
+#[tokio::test]
+#[ignore = "requires network access"]
+async fn test_arxiv_extraction() {
+    let url = "https://arxiv.org/abs/2009.03017";
+
+    let client = reqwest::Client::builder()
+        .user_agent("Mozilla/5.0 (compatible; MalfestioBot/1.0)")
+        .build()
+        .unwrap();
+
+    let response = client.get(url).send().await.unwrap();
+    let html = response.text().await.unwrap();
+
+    let readability = Readability::new(html, Some(url));
+    let article = readability.parse().unwrap();
+
+    assert!(!article.title.is_empty(), "Title should be extracted");
+    println!("Title: {}", article.title);
+
+    assert!(!article.markdown.is_empty(), "Body/markdown should be extracted");
+    assert!(article.markdown.len() > 50, "Abstract should have substantial content");
+    println!("Markdown length: {} chars", article.markdown.len());
+
+    assert!(article.author.is_some(), "Author should be extracted from meta tag");
+    println!("Author: {:?}", article.author);
+
+    assert!(
+        article.published_date.is_some(),
+        "Date should be extracted from meta tag"
+    );
+    println!("Date: {:?}", article.published_date);
+}
+
+#[tokio::test]
+#[ignore = "requires network access"]
+async fn test_wikipedia_extraction() {
+    let url = "https://en.wikipedia.org/wiki/Rust_(programming_language)";
+
+    let client = reqwest::Client::builder()
+        .user_agent("Mozilla/5.0 (compatible; MalfestioBot/1.0)")
+        .build()
+        .unwrap();
+
+    let response = client.get(url).send().await.unwrap();
+    let html = response.text().await.unwrap();
+
+    let readability = Readability::new(html, Some(url));
+    let article = readability.parse().unwrap();
+
+    assert!(article.title.contains("Rust"), "Title should contain 'Rust'");
+    println!("Title: {}", article.title);
+
+    assert!(
+        article.markdown.len() > 1000,
+        "Wikipedia article should have substantial content"
+    );
+    println!("Markdown length: {} chars", article.markdown.len());
+
+    // Verify strip rules worked: mw-editsection elements should be removed
+    assert!(
+        !article.content.contains("mw-editsection"),
+        "Edit section elements (mw-editsection) should be stripped"
+    );
+}
+
+/// Test extraction for site without specific rules (falls back to generic)
+#[tokio::test]
+#[ignore = "requires network access"]
+async fn test_generic_fallback_extraction() {
+    let url = "https://www.rust-lang.org/";
+
+    let client = reqwest::Client::builder()
+        .user_agent("Mozilla/5.0 (compatible; MalfestioBot/1.0)")
+        .build()
+        .unwrap();
+
+    let response = client.get(url).send().await.unwrap();
+    let html = response.text().await.unwrap();
+
+    let readability = Readability::new(html, Some(url));
+    let article = readability.parse().unwrap();
+
+    assert!(!article.title.is_empty(), "Title should be extracted via generic");
+    assert!(!article.markdown.is_empty(), "Content should be extracted via generic");
+
+    println!("Title: {}", article.title);
+    println!("Markdown length: {} chars", article.markdown.len());
+}