//! Embedded site-specific extraction rules
//!
//! Rules are compiled into the binary at build time for fast access without filesystem dependencies.

use std::collections::HashMap;

/// Embedded rule files indexed by domain
///
/// Supported domains:
/// - arxiv.org
/// - .wikipedia.org (subdomain wildcard)
pub fn get_embedded_rules() -> HashMap<&'static str, &'static str> {
    let mut rules = HashMap::new();
    rules.insert("arxiv.org", include_str!("../../rules/arxiv.org.txt"));
    rules.insert(".wikipedia.org", include_str!("../../rules/.wikipedia.org.txt"));
    rules
}

/// Get embedded rule content for a domain
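///
/// Exact domain matches are tried first, then a `.parent-domain` wildcard key.
/// A minimal usage sketch (relying on the embedded rule files registered above):
///
/// ```ignore
/// assert!(get_rule_for_domain("arxiv.org").is_some());          // exact match
/// assert!(get_rule_for_domain("en.wikipedia.org").is_some());   // ".wikipedia.org" wildcard
/// assert!(get_rule_for_domain("unknown.example").is_none());
/// ```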
pub fn get_rule_for_domain(domain: &str) -> Option<&'static str> {
    let rules = get_embedded_rules();

    if let Some(rule) = rules.get(domain) {
        return Some(rule);
    }

    let parts: Vec<&str> = domain.split('.').collect();
    if parts.len() > 2 {
        let parent_domain = parts[1..].join(".");
        let wildcard_key = format!(".{}", parent_domain);
        if let Some(rule) = rules.get(wildcard_key.as_str()) {
            return Some(rule);
        }
    }

    None
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_embedded_rules_loaded() {
        let rules = get_embedded_rules();
        assert!(rules.contains_key("arxiv.org"));
        assert!(rules.contains_key(".wikipedia.org"));
    }

    #[test]
    fn test_get_arxiv_rule() {
        let rule = get_rule_for_domain("arxiv.org");
        assert!(rule.is_some());
        assert!(rule.unwrap().contains("title:"));
    }

    #[test]
    fn test_get_wikipedia_rule_subdomain() {
        let rule = get_rule_for_domain("en.wikipedia.org");
        assert!(rule.is_some());
        assert!(rule.unwrap().contains("firstHeading"));
    }

    #[test]
    fn test_unknown_domain() {
        let rule = get_rule_for_domain("unknown.com");
        assert!(rule.is_none());
    }
}
+132
crates/readability/src/config/loader.rs
//! Load site-specific configuration files based on URL

use crate::config::embedded_rules;
use crate::config::parser::{SiteConfig, parse_config};
use crate::error::Result;
use std::path::{Path, PathBuf};
use url::Url;

/// Loads site-specific configuration files
///
/// Checks the external rules_dir first (if provided), then falls back to embedded rules.
#[derive(Default)]
pub struct ConfigLoader {
    rules_dir: Option<PathBuf>,
}

impl ConfigLoader {
    /// Create a new config loader with embedded rules only
    pub fn new() -> Self {
        Self::default()
    }

    /// Create a config loader with an external rules directory
    ///
    /// External rules take precedence over embedded rules.
    pub fn with_rules_dir(rules_dir: PathBuf) -> Self {
        Self { rules_dir: Some(rules_dir) }
    }

    /// Load configuration for a given URL
    ///
    /// Priority:
    /// 1. External rules (if rules_dir provided)
    /// 2. Embedded rules
    /// 3. None (if no match found)
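    ///
    /// A minimal usage sketch (the URL is illustrative; it resolves against the embedded arxiv rules):
    ///
    /// ```ignore
    /// let loader = ConfigLoader::new();
    /// let config = loader.load_for_url("https://arxiv.org/abs/2009.03017")?;
    /// assert!(config.is_some());
    /// ```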
    pub fn load_for_url(&self, url: &str) -> Result<Option<SiteConfig>> {
        let Some(domain) = Self::extract_domain(url) else {
            return Ok(None);
        };

        if let Some(ref rules_dir) = self.rules_dir
            && let Some(config) = self.try_load_from_dir(rules_dir, &domain)?
        {
            return Ok(Some(config));
        }

        if let Some(rule_content) = embedded_rules::get_rule_for_domain(&domain) {
            return Ok(Some(parse_config(rule_content)?));
        }

        Ok(None)
    }

    /// Try to load config from external directory
    fn try_load_from_dir(&self, rules_dir: &Path, domain: &str) -> Result<Option<SiteConfig>> {
        let exact_path = rules_dir.join(format!("{}.txt", domain));
        if exact_path.exists() {
            let content = std::fs::read_to_string(&exact_path)?;
            return Ok(Some(parse_config(&content)?));
        }

        let wildcard_path = rules_dir.join(format!(".{}.txt", domain));
        if wildcard_path.exists() {
            let content = std::fs::read_to_string(&wildcard_path)?;
            return Ok(Some(parse_config(&content)?));
        }

        if let Some(parent_domain) = Self::extract_parent_domain(domain) {
            let parent_wildcard = rules_dir.join(format!(".{}.txt", parent_domain));
            if parent_wildcard.exists() {
                let content = std::fs::read_to_string(&parent_wildcard)?;
                return Ok(Some(parse_config(&content)?));
            }
        }

        Ok(None)
    }

    /// Extract domain from URL
    fn extract_domain(url: &str) -> Option<String> {
        Url::parse(url).ok().and_then(|u| u.host_str().map(String::from))
    }

    /// Extract parent domain (e.g., "en.wikipedia.org" -> "wikipedia.org")
    fn extract_parent_domain(domain: &str) -> Option<String> {
        let parts: Vec<&str> = domain.split('.').collect();
        if parts.len() > 2 { Some(parts[1..].join(".")) } else { None }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extract_domain() {
        assert_eq!(
            ConfigLoader::extract_domain("https://arxiv.org/abs/123"),
            Some("arxiv.org".to_string())
        );
        assert_eq!(
            ConfigLoader::extract_domain("https://en.wikipedia.org/wiki/Article"),
            Some("en.wikipedia.org".to_string())
        );
        assert_eq!(ConfigLoader::extract_domain("invalid"), None);
    }

    #[test]
    fn test_load_embedded_arxiv() {
        let loader = ConfigLoader::new();
        let config = loader
            .load_for_url("https://arxiv.org/abs/2009.03017")
            .unwrap()
            .expect("Should find embedded arxiv config");

        assert_eq!(config.title.len(), 1);
        assert_eq!(config.body.len(), 1);
    }

    #[test]
    fn test_load_embedded_wikipedia() {
        let loader = ConfigLoader::new();
        let config = loader
            .load_for_url("https://en.wikipedia.org/wiki/Article")
            .unwrap()
            .expect("Should find embedded wikipedia config");

        assert_eq!(config.title.len(), 1);
        assert_eq!(config.body.len(), 1);
        assert!(!config.prune);
    }
}
+8
crates/readability/src/config/mod.rs
//! Configuration file parsing and loading for site-specific extraction rules

pub mod embedded_rules;
pub mod loader;
pub mod parser;

pub use loader::ConfigLoader;
pub use parser::{SiteConfig, parse_config};
+156
crates/readability/src/config/parser.rs
//! Parser for ftr-site-config format extraction rules

use crate::error::{Error, Result};

/// Site-specific extraction configuration
#[derive(Debug, Clone, Default)]
pub struct SiteConfig {
    /// XPath expressions for title extraction (evaluated in order)
    pub title: Vec<String>,
    /// XPath expressions for body extraction
    pub body: Vec<String>,
    /// XPath expressions for author extraction
    pub author: Vec<String>,
    /// XPath expressions for date extraction
    pub date: Vec<String>,
    /// XPath expressions for elements to strip
    pub strip: Vec<String>,
    /// Substrings to match in @id or @class for stripping
    pub strip_id_or_class: Vec<String>,
    /// Whether to prune non-content elements (default: true)
    pub prune: bool,
    /// Whether to run HTML Tidy preprocessor (default: true)
    pub tidy: bool,
    /// Whether to fall back to generic extraction on failure (default: true)
    pub autodetect_on_failure: bool,
    /// Test URLs for validation
    pub test_urls: Vec<String>,
}

/// Parse a site configuration file in ftr-site-config format
///
/// Format:
/// ```text
/// # Comments start with hash
/// directive: value
/// directive: another value
///
/// # Boolean directives
/// prune: yes
/// tidy: no
/// ```
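///
/// A minimal sketch of calling the parser (repeated directives accumulate in order):
///
/// ```ignore
/// let config = parse_config("title: //h1\nbody: //article\nprune: no")?;
/// assert_eq!(config.title, vec!["//h1".to_string()]);
/// assert!(!config.prune);
/// ```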
pub fn parse_config(content: &str) -> Result<SiteConfig> {
    let mut config = SiteConfig { prune: true, tidy: true, autodetect_on_failure: true, ..Default::default() };

    for line in content.lines() {
        let line = line.trim();

        if line.is_empty() || line.starts_with('#') {
            continue;
        }

        if let Some((directive, value)) = line.split_once(':') {
            let directive = directive.trim();
            let value = value.trim();

            match directive {
                "title" => config.title.push(value.to_string()),
                "body" => config.body.push(value.to_string()),
                "author" => config.author.push(value.to_string()),
                "date" => config.date.push(value.to_string()),
                "strip" => config.strip.push(value.to_string()),
                "strip_id_or_class" => config.strip_id_or_class.push(value.to_string()),
                "test_url" => config.test_urls.push(value.to_string()),
                "prune" => config.prune = parse_bool(value)?,
                "tidy" => config.tidy = parse_bool(value)?,
                "autodetect_on_failure" => config.autodetect_on_failure = parse_bool(value)?,
                // TODO: Implement other directives (like http_header)
                _ => {}
            }
        }
    }

    Ok(config)
}

/// Parse a boolean value (yes/no, true/false, 1/0)
fn parse_bool(value: &str) -> Result<bool> {
    match value.to_lowercase().as_str() {
        "yes" | "true" | "1" => Ok(true),
        "no" | "false" | "0" => Ok(false),
        _ => Err(Error::ConfigError(format!("Invalid boolean value: {}", value))),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_empty_config() {
        let config = parse_config("").unwrap();
        assert!(config.title.is_empty());
        assert!(config.body.is_empty());
    }

    #[test]
    fn test_parse_arxiv_config() {
        let content = r#"
title: //h1[contains(concat(' ',normalize-space(@class),' '),' title ')]
body: //blockquote[contains(concat(' ',normalize-space(@class),' '),' abstract ')]
date: //meta[@name='citation_date']/@content
author: //meta[@name='citation_author']/@content
test_url: https://arxiv.org/abs/2009.03017
test_url: https://arxiv.org/abs/2012.03780
        "#;

        let config = parse_config(content).unwrap();
        assert_eq!(config.title.len(), 1);
        assert_eq!(config.body.len(), 1);
        assert_eq!(config.author.len(), 1);
        assert_eq!(config.date.len(), 1);
        assert_eq!(config.test_urls.len(), 2);
    }

    #[test]
    fn test_parse_with_comments() {
        let content = r#"
# This is a comment
title: //h1
# Another comment
body: //article
        "#;

        let config = parse_config(content).unwrap();
        assert_eq!(config.title.len(), 1);
        assert_eq!(config.body.len(), 1);
    }

    #[test]
    fn test_parse_boolean_directives() {
        let content = r#"
prune: no
tidy: yes
autodetect_on_failure: no
        "#;

        let config = parse_config(content).unwrap();
        assert!(!config.prune);
        assert!(config.tidy);
        assert!(!config.autodetect_on_failure);
    }

    #[test]
    fn test_parse_strip_directives() {
        let content = r#"
strip: //div[@class='sidebar']
strip: //div[@id='footer']
strip_id_or_class: advertisement
strip_id_or_class: nav
        "#;

        let config = parse_config(content).unwrap();
        assert_eq!(config.strip.len(), 2);
        assert_eq!(config.strip_id_or_class.len(), 2);
    }
}
+39
crates/readability/src/converter/html2md.rs
//! Markdown conversion using html2md crate

/// Convert HTML to Markdown
pub fn to_markdown(html: &str) -> String {
    html2md::parse_html(html)
}

/// Generate an excerpt from markdown (first ~200 chars)
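///
/// A usage sketch (truncation keeps at most `max_length` characters plus "..."):
///
/// ```ignore
/// let excerpt = generate_excerpt("A fairly long markdown string that needs cutting", 10);
/// assert!(excerpt.ends_with("..."));
/// ```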
pub fn generate_excerpt(markdown: &str, max_length: usize) -> String {
    let cleaned: String = markdown.chars().filter(|c| !c.is_control() || *c == '\n').collect();

    if cleaned.chars().count() <= max_length {
        cleaned
    } else {
        // Truncate on a character boundary so multi-byte UTF-8 input cannot panic.
        let truncated: String = cleaned.chars().take(max_length).collect();
        format!("{}...", truncated.trim_end())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_generate_excerpt() {
        let markdown =
            "This is a long piece of markdown text that should be truncated to approximately 200 characters or so.";
        let excerpt = generate_excerpt(markdown, 50);
        assert!(excerpt.len() <= 53);
        assert!(excerpt.ends_with("..."));
    }

    #[test]
    fn test_generate_excerpt_short() {
        let markdown = "Short text";
        let excerpt = generate_excerpt(markdown, 50);
        assert_eq!(excerpt, "Short text");
    }
}
+5
crates/readability/src/converter/mod.rs
//! HTML to Markdown conversion

pub mod html2md;

pub use self::html2md::to_markdown;
+23
crates/readability/src/error.rs
use thiserror::Error;

/// Errors that can occur during article extraction
#[derive(Error, Debug)]
pub enum Error {
    #[error("HTML parsing failed: {0}")]
    ParseError(String),

    #[error("XPath evaluation failed: {0}")]
    XPathError(String),

    #[error("Config parse error: {0}")]
    ConfigError(String),

    #[error("Extraction failed: {0}")]
    ExtractionError(String),

    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
}

/// Result type for readability operations
pub type Result<T> = std::result::Result<T, Error>;
+391
crates/readability/src/extractor/generic.rs
//! Generic content extraction with a simplified heuristic-based approach
//!
//! ## Implementation Strategy
//!
//! This is a **simplified** content extractor, not a full Mozilla Readability implementation.
//! It uses basic heuristics to find common patterns in HTML documents.
//!
//! ### What This Implementation Does:
//! - Extracts title from `<title>`, `<h1>`, or `og:title` meta tags
//! - Finds body content by looking for semantic HTML5 tags and common class names
//! - Extracts author from meta tags or common byline patterns
//! - Extracts date from meta tags or `<time>` elements
//! - Uses simple CSS selector patterns (no complex scoring algorithm)
//!
//! ### What This Implementation Does NOT Do (Implementation Gaps):
//! - **No content scoring**: Unlike Mozilla Readability, we don't score paragraphs by
//!   text length, link density, or class names to find the "best" content candidate
//! - **No sibling inclusion**: We don't check if siblings of the main content should
//!   be included based on similarity thresholds
//! - **No ancestor scoring**: We don't propagate scores up the DOM tree
//! - **No link density checking**: We don't filter out high link-density sections
//! - **No "unlikely candidate" removal**: We don't remove elements based on negative
//!   class name patterns like "sidebar", "comment", etc.
//! - **Limited fallback chain**: Mozilla Readability tries multiple strategies; we try
//!   a few common patterns and give up
//!
//! ### Design Decisions:
//! - **Semantic HTML first**: We prefer `<article>` and `<main>` over class-based selection
//!   because they're more reliable indicators of content
//! - **Multiple fallbacks**: We try progressively broader selectors to maximize success rate
//! - **Metadata from standards**: We use standard meta tags (Open Graph, Schema.org, etc.)
//!   before falling back to heuristics
//! - **Fail fast**: If we can't find content with our heuristics, we return an error
//!   rather than returning garbage content
//!
//! ## TODOs:
//! - TODO: Implement basic content scoring (count paragraphs, text length)
//! - TODO: Add link density checks to filter navigation/sidebar
//! - TODO: Remove unlikely candidates (ads, footers, etc.) by class name
//! - TODO: Try multiple content candidates and pick the best one
//! - TODO: Clean extracted HTML (remove scripts, styles, empty elements)
//! - TODO: Handle multi-page articles (pagination detection)

use crate::error::{Error, Result};
use scraper::{Html, Selector};

/// Extracted content from generic algorithm
#[derive(Debug, Clone)]
pub struct ExtractedContent {
    pub title: String,
    pub body_html: String,
    pub author: Option<String>,
    pub date: Option<String>,
}

/// Generic content extractor using simple heuristics
///
/// This extractor attempts to find article content using common HTML patterns.
/// It's designed as a fallback when site-specific XPath rules are not available.
pub struct GenericExtractor {
    html: String,
}

impl GenericExtractor {
    /// Create a new generic extractor
    pub fn new(html: String) -> Self {
        Self { html }
    }

    /// Extract content using simple heuristics
    ///
    /// ## Extraction Strategy:
    /// 1. Title: `<title>` tag, then `<h1>`, then `og:title` meta tag
    /// 2. Body: `<article>`, then `<main>`, then `[role="main"]`, then `.content`
    /// 3. Author: meta tags (author, og:author, article:author), then `.byline`
    /// 4. Date: meta tags (article:published_time, datePublished), then `<time>`
    ///
    /// ## Limitations:
    /// - Returns first match, doesn't evaluate quality
    /// - No cleaning of extracted HTML (scripts, ads, etc. may be included)
    /// - May extract wrong content if page structure is unusual
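    ///
    /// A minimal sketch of the call (the HTML string is illustrative):
    ///
    /// ```ignore
    /// let extractor = GenericExtractor::new("<html>...</html>".to_string());
    /// let content = extractor.extract()?;
    /// println!("{}", content.title);
    /// ```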
    pub fn extract(&self) -> Result<ExtractedContent> {
        let document = Html::parse_document(&self.html);

        let title = self
            .extract_title(&document)
            .ok_or_else(|| Error::ExtractionError("Could not extract title".to_string()))?;

        let body_html = self
            .extract_body(&document)
            .ok_or_else(|| Error::ExtractionError("Could not extract body content".to_string()))?;

        let author = self.extract_author(&document);
        let date = self.extract_date(&document);
        Ok(ExtractedContent { title, body_html, author, date })
    }

    /// Extract title from document
    ///
    /// Tries in order:
    /// 1. `<title>` tag content
    /// 2. First `<h1>` tag
    /// 3. `og:title` meta tag
    ///
    /// ## Implementation Gap:
    /// - Doesn't try to clean the title (remove " | Site Name" suffixes, etc.)
    /// - Doesn't check title quality or length
    fn extract_title(&self, document: &Html) -> Option<String> {
        if let Ok(selector) = Selector::parse("title")
            && let Some(element) = document.select(&selector).next()
        {
            let text: String = element.text().collect();
            if !text.trim().is_empty() {
                return Some(text.trim().to_string());
            }
        }

        if let Ok(selector) = Selector::parse("h1")
            && let Some(element) = document.select(&selector).next()
        {
            let text: String = element.text().collect();
            if !text.trim().is_empty() {
                return Some(text.trim().to_string());
            }
        }

        if let Ok(selector) = Selector::parse("meta[property='og:title']")
            && let Some(element) = document.select(&selector).next()
            && let Some(content) = element.value().attr("content")
            && !content.trim().is_empty()
        {
            return Some(content.trim().to_string());
        }

        None
    }

    /// Extract body content from document
    ///
    /// Tries in order:
    /// 1. `<article>` tag (semantic HTML5)
    /// 2. `<main>` tag (semantic HTML5)
    /// 3. `[role="main"]` attribute (ARIA landmark)
    /// 4. First element with class containing "content", "article", "post", "entry"
    /// 5. `<body>` tag as last resort (usually includes nav, footer, etc.)
    ///
    /// ## Implementation Gaps:
    /// - Doesn't score multiple candidates to find the best one
    /// - Doesn't clean the HTML (may include ads, sidebars, etc.)
    /// - Doesn't check content length or quality
    /// - Doesn't exclude navigation, footers, comments within the selected element
    /// - Returns the matched element's outer HTML as-is without any processing
    ///
    /// TODO: Add basic cleaning (remove script, style, nav, footer, aside)
    /// TODO: Check content length (minimum threshold)
    /// TODO: If multiple candidates, pick the one with the most <p> tags
    fn extract_body(&self, document: &Html) -> Option<String> {
        let selectors = vec![
            "article",
            "main",
            "[role='main']",
            "[class*='content']",
            "[class*='article']",
            "[class*='post']",
            "[class*='entry']",
            "body",
        ];

        for selector_str in selectors {
            if let Ok(selector) = Selector::parse(selector_str)
                && let Some(element) = document.select(&selector).next()
            {
                let html = element.html();
                if !html.trim().is_empty() {
                    return Some(html);
                }
            }
        }

        None
    }

    /// Extract author from document
    ///
    /// Tries in order:
    /// 1. `<meta name="author">` tag
    /// 2. `<meta property="og:author">` tag
    /// 3. `<meta property="article:author">` tag
    /// 4. Element with class "author", "byline", or "by"
    ///
    /// ## Implementation Gaps:
    /// - Doesn't parse structured data (JSON-LD, Schema.org)
    /// - Doesn't extract from "By John Doe" patterns in text
    /// - Returns first match without validation
    fn extract_author(&self, document: &Html) -> Option<String> {
        let meta_selectors = vec![
            "meta[name='author']",
            "meta[property='og:author']",
            "meta[property='article:author']",
        ];

        for selector_str in meta_selectors {
            if let Ok(selector) = Selector::parse(selector_str)
                && let Some(element) = document.select(&selector).next()
                && let Some(content) = element.value().attr("content")
                && !content.trim().is_empty()
            {
                return Some(content.trim().to_string());
            }
        }

        let class_selectors = vec![".author", ".byline", ".by"];

        for selector_str in class_selectors {
            if let Ok(selector) = Selector::parse(selector_str)
                && let Some(element) = document.select(&selector).next()
            {
                let text: String = element.text().collect();
                if !text.trim().is_empty() {
                    return Some(text.trim().to_string());
                }
            }
        }

        None
    }

    /// Extract publication date from document
    ///
    /// Tries in order:
    /// 1. `<meta property="article:published_time">` (Open Graph)
    /// 2. `<meta itemprop="datePublished">` (Schema.org)
    /// 3. `<time datetime="...">` attribute
    /// 4. `<time>` element text content
    ///
    /// ## Implementation Gaps:
    /// - Doesn't parse or normalize date formats
    /// - Doesn't validate date values
    /// - Doesn't extract from text patterns ("Published on Jan 1, 2020")
    fn extract_date(&self, document: &Html) -> Option<String> {
        let meta_selectors = vec![
            "meta[property='article:published_time']",
            "meta[itemprop='datePublished']",
        ];

        for selector_str in meta_selectors {
            if let Ok(selector) = Selector::parse(selector_str)
                && let Some(element) = document.select(&selector).next()
                && let Some(content) = element.value().attr("content")
                && !content.trim().is_empty()
            {
                return Some(content.trim().to_string());
            }
        }

        if let Ok(selector) = Selector::parse("time[datetime]")
            && let Some(element) = document.select(&selector).next()
            && let Some(datetime) = element.value().attr("datetime")
            && !datetime.trim().is_empty()
        {
            return Some(datetime.trim().to_string());
        }

        if let Ok(selector) = Selector::parse("time")
            && let Some(element) = document.select(&selector).next()
        {
            let text: String = element.text().collect();
            if !text.trim().is_empty() {
                return Some(text.trim().to_string());
            }
        }
        None
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extract_title_from_title_tag() {
        let html = r#"
            <html>
            <head><title>Test Article Title</title></head>
            <body></body>
            </html>
        "#;

        let extractor = GenericExtractor::new(html.to_string());
        let document = Html::parse_document(html);
        let title = extractor.extract_title(&document);

        assert_eq!(title, Some("Test Article Title".to_string()));
    }

    #[test]
    fn test_extract_title_from_h1() {
        let html = r#"
            <html>
            <body><h1>Article Heading</h1></body>
            </html>
        "#;

        let extractor = GenericExtractor::new(html.to_string());
        let document = Html::parse_document(html);
        let title = extractor.extract_title(&document);

        assert_eq!(title, Some("Article Heading".to_string()));
    }

    #[test]
    fn test_extract_body_from_article() {
        let html = r#"
            <html>
            <body>
                <article>
                    <p>This is the article content.</p>
                </article>
            </body>
            </html>
        "#;

        let extractor = GenericExtractor::new(html.to_string());
        let document = Html::parse_document(html);
        let body = extractor.extract_body(&document);

        assert!(body.is_some());
        assert!(body.unwrap().contains("This is the article content"));
    }

    #[test]
    fn test_extract_author_from_meta() {
        let html = r#"
            <html>
            <head>
                <meta name="author" content="John Doe">
            </head>
            </html>
        "#;

        let extractor = GenericExtractor::new(html.to_string());
        let document = Html::parse_document(html);
        let author = extractor.extract_author(&document);

        assert_eq!(author, Some("John Doe".to_string()));
    }

    #[test]
    fn test_extract_date_from_meta() {
        let html = r#"
            <html>
            <head>
                <meta property="article:published_time" content="2024-01-15">
            </head>
            </html>
        "#;

        let extractor = GenericExtractor::new(html.to_string());
        let document = Html::parse_document(html);
        let date = extractor.extract_date(&document);

        assert_eq!(date, Some("2024-01-15".to_string()));
    }

    #[test]
    fn test_full_extraction() {
        let html = r#"
            <html>
            <head>
                <title>Test Article</title>
                <meta name="author" content="Jane Smith">
                <meta property="article:published_time" content="2024-01-15">
            </head>
            <body>
                <article>
                    <h1>Article Title</h1>
                    <p>Article content goes here.</p>
                </article>
            </body>
            </html>
        "#;

        let extractor = GenericExtractor::new(html.to_string());
        let result = extractor.extract().unwrap();

        assert_eq!(result.title, "Test Article");
        assert!(result.body_html.contains("Article content goes here"));
        assert_eq!(result.author, Some("Jane Smith".to_string()));
        assert_eq!(result.date, Some("2024-01-15".to_string()));
    }
}
+8
crates/readability/src/extractor/mod.rs
//! Content extraction using XPath rules and generic algorithms

pub mod generic;
pub mod scoring;
pub mod xpath;

pub use generic::GenericExtractor;
pub use xpath::XPathExtractor;
+42
crates/readability/src/extractor/scoring.rs
//! Content scoring for the Mozilla Readability algorithm
//!
//! TODO: Implement scoring

/// Content score for an element
#[derive(Debug, Clone)]
pub struct ContentScore {
    /// Text length of the element
    pub text_length: usize,
    /// Link density (0.0 to 1.0)
    pub link_density: f32,
    /// Class/ID weight (positive for content, negative for non-content)
    pub class_weight: f32,
    /// Total calculated score
    pub total: f32,
}

/// Positive class/ID patterns indicating content
pub const POSITIVE_PATTERNS: &[&str] = &[
    "article", "body", "content", "entry", "main", "page", "post", "text", "blog", "story",
];

/// Negative class/ID patterns indicating non-content
pub const NEGATIVE_PATTERNS: &[&str] = &[
    "combx",
    "comment",
    "community",
    "disqus",
    "extra",
    "footer",
    "header",
    "menu",
    "remark",
    "rss",
    "share",
    "sidebar",
    "sponsor",
    "ad-",
    "agegate",
    "pagination",
    "nav",
];
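
// The scoring itself is still TODO. As a hedged sketch only, one plausible shape
// for the eventual calculation (the coefficients below are illustrative
// assumptions, not the Readability algorithm's actual weights):
impl ContentScore {
    /// Combine the raw signals into a single score: longer text helps,
    /// high link density hurts, and class/ID patterns bias the result.
    pub fn compute(text_length: usize, link_density: f32, class_weight: f32) -> Self {
        let total = (text_length as f32).sqrt() * (1.0 - link_density) + class_weight;
        Self { text_length, link_density, class_weight, total }
    }
}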
+494
crates/readability/src/extractor/xpath.rs
//! XPath-based content extraction using site-specific rules
//!
//! This module provides content extraction from HTML documents using XPath-like expressions.
//!
//! ## Strategy
//!
//! Since Rust doesn't have a robust HTML-compatible XPath library, we use a hybrid approach:
//! 1. Convert simple XPath expressions to CSS selectors (scraper handles these well)
//! 2. Handle complex patterns (contains(), normalize-space()) with custom matchers
//! 3. Use regex parsing for XPath syntax to extract selector components
//!
//! ## Supported XPath Patterns
//!
//! - `//tag` - Simple tag selection
//! - `//tag[@id='value']` - ID selection
//! - `//tag[@class='value']` - Exact class match
//! - `//tag[contains(@class, 'value')]` - Class contains match
//! - `//tag[contains(concat(' ',normalize-space(@class),' '),' value ')]` - Normalized class match
//! - `//meta[@name='value']/@content` - Attribute extraction from meta tags

use crate::config::SiteConfig;
use crate::error::{Error, Result};
use regex::Regex;
use scraper::{ElementRef, Html, Selector};

/// Extracted content from XPath rules
#[derive(Debug, Clone)]
pub struct ExtractedContent {
    pub title: Option<String>,
    pub body_html: Option<String>,
    pub author: Option<String>,
    pub date: Option<String>,
}

/// XPath-based extractor
pub struct XPathExtractor {
    html: String,
}

impl XPathExtractor {
    /// Create a new XPath extractor
    pub fn new(html: String) -> Self {
        Self { html }
    }

    /// Extract content using site-specific rules
    pub fn extract(&self, config: &SiteConfig) -> Result<ExtractedContent> {
        let cleaned_html = self.apply_strip_rules(&self.html, config)?;
        let document = Html::parse_document(&cleaned_html);

        let title = self.extract_field(&document, &config.title, false)?;
        let body_html = self.extract_field(&document, &config.body, true)?;
        let author = self.extract_field(&document, &config.author, false)?;
        let date = self.extract_field(&document, &config.date, false)?;

        Ok(ExtractedContent { title, body_html, author, date })
    }

    /// Apply strip rules to remove unwanted elements
    ///
    /// Processes both `strip` (XPath) and `strip_id_or_class` (substring match) directives.
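    ///
    /// For example, `strip_id_or_class: sidebar` removes `<div class="sidebar-widget">`,
    /// since matching is a case-insensitive substring test against @id and @class values.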
    fn apply_strip_rules(&self, html: &str, config: &SiteConfig) -> Result<String> {
        let document = Html::parse_document(html);
        let mut elements_to_remove: Vec<String> = Vec::new();

        for substring in &config.strip_id_or_class {
            let substring_lower = substring.to_lowercase();
            for element in document.tree.nodes() {
                if let Some(el) = ElementRef::wrap(element) {
                    let should_remove = el
                        .value()
                        .id()
                        .is_some_and(|id| id.to_lowercase().contains(&substring_lower))
                        || el
                            .value()
                            .classes()
                            .any(|class| class.to_lowercase().contains(&substring_lower));

                    if should_remove {
                        elements_to_remove.push(self.element_signature(&el));
                    }
                }
            }
        }

        for xpath in &config.strip {
            if let Some((css, _)) = self.xpath_to_css_with_attr(xpath)
                && let Ok(selector) = Selector::parse(&css)
            {
                for el in document.select(&selector) {
                    elements_to_remove.push(self.element_signature(&el));
                }
            }
        }

        self.rebuild_html_without_elements(&document, &elements_to_remove)
    }

    /// Generate a signature for an element to identify it during rebuild
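    ///
    /// Note: the signature is tag + id + class list, so identically-attributed
    /// elements share one signature and are removed together during the rebuild.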
    fn element_signature(&self, el: &ElementRef) -> String {
        let tag = el.value().name();
        let id = el.value().id().unwrap_or("");
        let classes: Vec<&str> = el.value().classes().collect();
        format!("{}#{}#{}", tag, id, classes.join(","))
    }

    /// Rebuild HTML without specified elements
    fn rebuild_html_without_elements(&self, document: &Html, to_remove: &[String]) -> Result<String> {
        if to_remove.is_empty() {
            return Ok(self.html.clone());
        }

        let mut result = String::new();
        self.rebuild_node(&document.root_element(), to_remove, &mut result);
        Ok(result)
    }

    /// Recursively rebuild a node and its children, skipping removed elements
    fn rebuild_node(&self, element: &ElementRef, to_remove: &[String], output: &mut String) {
        let sig = self.element_signature(element);
        if to_remove.contains(&sig) {
            return;
        }

        let tag = element.value().name();
        output.push('<');
        output.push_str(tag);

        for (name, value) in element.value().attrs() {
            output.push(' ');
            output.push_str(name);
            output.push_str("=\"");
            output.push_str(&html_escape::encode_double_quoted_attribute(value));
            output.push('"');
        }
        output.push('>');

        for child in element.children() {
            if let Some(el) = ElementRef::wrap(child) {
                self.rebuild_node(&el, to_remove, output);
            } else if let Some(text) = child.value().as_text() {
                output.push_str(&html_escape::encode_text(&text.to_string()));
            }
        }

        const VOID_ELEMENTS: &[&str] = &[
            "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track",
            "wbr",
        ];

        if !VOID_ELEMENTS.contains(&tag) {
            output.push_str("</");
            output.push_str(tag);
            output.push('>');
        }
    }

    /// Extract a field using XPath expressions (tries each in order)
    fn extract_field(&self, document: &Html, xpaths: &[String], extract_html: bool) -> Result<Option<String>> {
        for xpath_expr in xpaths {
            if let Some(result) = self.evaluate_xpath(document, xpath_expr, extract_html)? {
                return Ok(Some(result));
            }
        }
        Ok(None)
    }

    /// Evaluate an XPath expression against the document
    fn evaluate_xpath(&self, document: &Html, xpath: &str, extract_html: bool) -> Result<Option<String>> {
        let (xpath_part, attr_to_extract) = if let Some(pos) = xpath.rfind("/@") {
            (&xpath[..pos], Some(&xpath[pos + 2..]))
        } else {
            (xpath, None)
        };

        let (css, class_filter) = match self.xpath_to_css_with_attr(xpath_part) {
            Some(result) => result,
            None => return Ok(None),
        };

        let selector =
            Selector::parse(&css).map_err(|e| Error::XPathError(format!("Invalid CSS selector '{}': {:?}", css, e)))?;

        for element in document.select(&selector) {
            if let Some(ref filter) = class_filter
                && !self.element_has_class_containing(&element, filter)
            {
                continue;
            }

            if let Some(attr) = attr_to_extract {
                if let Some(value) = element.value().attr(attr) {
                    return Ok(Some(value.to_string()));
                }
                continue;
            }

            let content =
                if extract_html { element.inner_html() } else { element.text().collect::<Vec<_>>().join(" ") };

            let content = content.trim().to_string();
            if !content.is_empty() {
                return Ok(Some(content));
            }
        }

        Ok(None)
    }

    /// Convert XPath to CSS selector with optional class filter
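    ///
    /// For instance, `//div[contains(@class, 'content')]` becomes the CSS selector
    /// `div` plus a `content` class-substring filter applied while matching.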
    fn xpath_to_css_with_attr(&self, xpath: &str) -> Option<(String, Option<String>)> {
        let xpath = xpath.trim();

        if xpath.starts_with("//") && !xpath.contains('[') && !xpath.contains('@') {
            let tag = xpath.trim_start_matches("//");
            return Some((tag.to_string(), None));
        }

        if let Some(css) = self.parse_id_selector(xpath) {
            return Some((css, None));
        }

        if let Some((css, class_filter)) = self.parse_contains_class_normalized(xpath) {
            return Some((css, Some(class_filter)));
        }

        if let Some((css, class_filter)) = self.parse_contains_class_simple(xpath) {
            return Some((css, Some(class_filter)));
        }

        if let Some(css) = self.parse_exact_class(xpath) {
            return Some((css, None));
        }

        if let Some(css) = self.parse_any_tag_with_id(xpath) {
            return Some((css, None));
        }

        if let Some(css) = self.parse_meta_selector(xpath) {
            return Some((css, None));
        }

        None
    }

    /// Parse //tag[@id='value'] pattern
    fn parse_id_selector(&self, xpath: &str) -> Option<String> {
        let re = Regex::new(r#"//(\w+)\[@id\s*=\s*['"]([^'"]+)['"]\]"#).ok()?;
        let caps = re.captures(xpath)?;
        let tag = caps.get(1)?.as_str();
        let id = caps.get(2)?.as_str();
        Some(format!("{}#{}", tag, id))
    }

    /// Parse //*[@id='value'] pattern
    fn parse_any_tag_with_id(&self, xpath: &str) -> Option<String> {
        let re = Regex::new(r#"//\*\[@id\s*=\s*['"]([^'"]+)['"]\]"#).ok()?;
        let caps = re.captures(xpath)?;
        let id = caps.get(1)?.as_str();
        Some(format!("#{}", id))
    }

    /// Parse //tag[@class='value'] pattern (exact class match)
    fn parse_exact_class(&self, xpath: &str) -> Option<String> {
        if xpath.contains("contains") {
            return None;
        }
        let re = Regex::new(r#"//(\w+)\[@class\s*=\s*['"]([^'"]+)['"]\]"#).ok()?;
        let caps = re.captures(xpath)?;
        let tag = caps.get(1)?.as_str();
        let class = caps.get(2)?.as_str();
        Some(format!("{}[class=\"{}\"]", tag, class))
    }

    /// Parse //tag[contains(@class, 'value')] pattern
    fn parse_contains_class_simple(&self, xpath: &str) -> Option<(String, String)> {
        let re = Regex::new(r#"//(\w+)\[contains\s*\(\s*@class\s*,\s*['"]([^'"]+)['"]\s*\)\]"#).ok()?;
        let caps = re.captures(xpath)?;
        let tag = caps.get(1)?.as_str();
        let class_substr = caps.get(2)?.as_str();
        Some((tag.to_string(), class_substr.to_string()))
    }

    /// Parse //tag[contains(concat(' ',normalize-space(@class),' '),' value ')] pattern
    fn parse_contains_class_normalized(&self, xpath: &str) -> Option<(String, String)> {
        let re = Regex::new(r#"//(\w+)\[contains\s*\(\s*concat\s*\(.+\)\s*,\s*['"]([^'"]+)['"]\s*\)\]"#).ok()?;
        let caps = re.captures(xpath)?;
        let tag = caps.get(1)?.as_str();
        let class_name = caps.get(2)?.as_str().trim();
        Some((tag.to_string(), class_name.to_string()))
    }

    /// Parse //meta[@name='value'] pattern
    fn parse_meta_selector(&self, xpath: &str) -> Option<String> {
        let re = Regex::new(r#"//meta\[@(\w+)\s*=\s*['"]([^'"]+)['"]\]"#).ok()?;
        let caps = re.captures(xpath)?;
        let attr_name = caps.get(1)?.as_str();
        let attr_value = caps.get(2)?.as_str();
        Some(format!("meta[{}=\"{}\"]", attr_name, attr_value))
    }

    /// Check if element has a class containing the given substring
    fn element_has_class_containing(&self, element: &ElementRef, class_filter: &str) -> bool {
        element.value().classes().any(|class| class.contains(class_filter))
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::parser::SiteConfig;

    #[test]
    fn test_xpath_to_css_simple_tag() {
        let extractor = XPathExtractor::new(String::new());
        let (css, filter) = extractor.xpath_to_css_with_attr("//h1").unwrap();
        assert_eq!(css, "h1");
        assert!(filter.is_none());
    }

    #[test]
    fn test_xpath_to_css_id_selector() {
        let extractor = XPathExtractor::new(String::new());
        let (css, filter) = extractor.xpath_to_css_with_attr("//h1[@id='firstHeading']").unwrap();
        assert_eq!(css, "h1#firstHeading");
        assert!(filter.is_none());
    }

    #[test]
    fn test_xpath_to_css_any_tag_with_id() {
        let extractor = XPathExtractor::new(String::new());
        let (css, filter) = extractor.xpath_to_css_with_attr("//*[@id='bodyContent']").unwrap();
        assert_eq!(css, "#bodyContent");
        assert!(filter.is_none());
    }

    #[test]
    fn test_xpath_contains_class_simple() {
        let extractor = XPathExtractor::new(String::new());
        let (css, filter) = extractor
            .xpath_to_css_with_attr("//div[contains(@class, 'content')]")
            .unwrap();
        assert_eq!(css, "div");
        assert_eq!(filter, Some("content".to_string()));
    }

    #[test]
    fn test_xpath_contains_class_normalized() {
        let extractor = XPathExtractor::new(String::new());
        let xpath = "//h1[contains(concat(' ',normalize-space(@class),' '),' title ')]";
        let (css, filter) = extractor.xpath_to_css_with_attr(xpath).unwrap();
        assert_eq!(css, "h1");
        assert_eq!(filter, Some("title".to_string()));
    }

    #[test]
    fn test_extract_meta_attribute() {
        let html = r#"
            <html>
            <head>
                <meta name="citation_date" content="2020-09-07">
                <meta name="citation_author" content="John Doe">
            </head>
            </html>
        "#;

        let extractor = XPathExtractor::new(html.to_string());
        let document = Html::parse_document(html);

        let date = extractor
            .evaluate_xpath(&document, "//meta[@name='citation_date']/@content", false)
            .unwrap();
        assert_eq!(date, Some("2020-09-07".to_string()));

        let author = extractor
            .evaluate_xpath(&document, "//meta[@name='citation_author']/@content", false)
            .unwrap();
        assert_eq!(author, Some("John Doe".to_string()));
    }

    #[test]
    fn test_extract_with_contains_class() {
        let html = r#"
            <html>
            <body>
                <h1 class="page-title title main">Article Title</h1>
                <div class="article-content">Content here</div>
            </body>
            </html>
        "#;

        let extractor = XPathExtractor::new(html.to_string());
        let document = Html::parse_document(html);

        let title = extractor
            .evaluate_xpath(&document, "//h1[contains(@class, 'title')]", false)
            .unwrap();
        assert_eq!(title, Some("Article Title".to_string()));
    }

    #[test]
    fn test_strip_id_or_class() {
        let html = r#"
            <html>
            <body>
                <div id="main-content">Main content</div>
                <div class="sidebar-widget">Sidebar</div>
                <div class="advertisement-banner">Ad</div>
            </body>
            </html>
        "#;

        let config = SiteConfig {
            strip_id_or_class: vec!["sidebar".to_string(), "advertisement".to_string()],
            ..Default::default()
        };

        let extractor = XPathExtractor::new(html.to_string());
        let cleaned = extractor.apply_strip_rules(html, &config).unwrap();

        assert!(cleaned.contains("Main content"));
        assert!(!cleaned.contains("Sidebar"));
        assert!(!cleaned.contains("Ad"));
    }

    #[test]
    fn test_strip_xpath() {
        let html = r#"
            <html>
            <body>
                <div id="content">Main content</div>
                <div id="toc">Table of contents</div>
                <div id="footer">Footer</div>
            </body>
            </html>
        "#;

        let config = SiteConfig {
            strip: vec!["//*[@id='toc']".to_string(), "//div[@id='footer']".to_string()],
            ..Default::default()
        };

        let extractor = XPathExtractor::new(html.to_string());
        let cleaned = extractor.apply_strip_rules(html, &config).unwrap();

        assert!(cleaned.contains("Main content"));
        assert!(!cleaned.contains("Table of contents"));
        assert!(!cleaned.contains("Footer"));
    }

    #[test]
    fn test_full_extraction() {
        let html = r#"
            <html>
            <head>
                <meta name="author" content="Test Author">
                <meta name="date" content="2024-01-15">
            </head>
            <body>
                <h1 id="title">Test Title</h1>
                <article class="content">
                    <p>Article content here.</p>
                </article>
                <div class="sidebar">Sidebar content</div>
            </body>
            </html>
        "#;

        let config = SiteConfig {
            title: vec!["//h1[@id='title']".to_string()],
            body: vec!["//article".to_string()],
            author: vec!["//meta[@name='author']/@content".to_string()],
            date: vec!["//meta[@name='date']/@content".to_string()],
            strip_id_or_class: vec!["sidebar".to_string()],
            ..Default::default()
        };

        let extractor = XPathExtractor::new(html.to_string());
        let result = extractor.extract(&config).unwrap();

        assert_eq!(result.title, Some("Test Title".to_string()));
        assert!(result.body_html.unwrap().contains("Article content here"));
        assert_eq!(result.author, Some("Test Author".to_string()));
        assert_eq!(result.date, Some("2024-01-15".to_string()));
    }
}
+135
crates/readability/src/lib.rs
//! Article extraction library with support for site-specific XPath rules and generic content extraction.
//!
//! This crate provides functionality to extract clean article content from HTML pages using:
//! - Site-specific XPath rules (ftr-site-config format)
//! - Generic content extraction (a simplified, Readability-style heuristic fallback)
//! - Automatic markdown conversion
//!
//! # Example
//!
//! ```no_run
//! use malfestio_readability::Readability;
//! use std::path::PathBuf;
//!
//! let html = r#"<html><head><title>Article</title></head><body>...</body></html>"#;
//! let readability = Readability::new(html.to_string(), Some("https://example.com/article"))
//!     .with_rules_dir(PathBuf::from("rules"));
//!
//! let article = readability.parse().unwrap();
//! println!("Title: {}", article.title);
//! println!("Markdown: {}", article.markdown);
//! ```

pub mod cleaner;
pub mod config;
pub mod converter;
pub mod error;
pub mod extractor;

use std::path::PathBuf;

pub use error::{Error, Result};

/// Extracted article content
#[derive(Debug, Clone)]
pub struct Article {
    /// Article title
    pub title: String,
    /// Clean HTML content
    pub content: String,
    /// Markdown formatted content
    pub markdown: String,
    /// Article author (if found)
    pub author: Option<String>,
    /// Publication date (if found)
    pub published_date: Option<String>,
    /// Excerpt (first ~200 chars of content)
    pub excerpt: Option<String>,
}

/// Main entry point for article extraction
pub struct Readability {
    html: String,
    url: Option<String>,
    rules_dir: Option<PathBuf>,
}

impl Readability {
    /// Create a new Readability instance
    ///
    /// # Arguments
    ///
    /// * `html` - The HTML content to extract from
    /// * `url` - Optional URL of the article (used for rule matching)
    pub fn new(html: String, url: Option<&str>) -> Self {
        Self { html, url: url.map(String::from), rules_dir: None }
    }

    /// Set the directory containing extraction rules
    ///
    /// Rules files should be named `domain.com.txt` or `.domain.com.txt` for subdomain matching.
    pub fn with_rules_dir(mut self, path: PathBuf) -> Self {
        self.rules_dir = Some(path);
        self
    }

    /// Extract article content from HTML
    ///
    /// ## Extraction Flow:
    /// 1. If a URL was provided: try to load site-specific XPath rules (external rules_dir first, then embedded rules)
    /// 2. If rules found: Attempt XPath-based extraction
    /// 3. If no rules OR XPath extraction fails: Fall back to generic heuristic extraction
    /// 4. Convert extracted HTML to markdown
    /// 5. Generate excerpt from markdown
    /// 6. Return complete Article struct
    ///
    /// ## Implementation Gaps:
    /// - XPath support is limited to the patterns `XPathExtractor` recognizes; unsupported
    ///   expressions fall back to generic extraction
    /// - No content cleaning between XPath/generic extraction and markdown conversion
    ///   (scripts, styles, etc. may be present in extracted HTML)
    /// - Generic extraction may include non-content elements (nav, footer, etc.)
    ///
    /// ## Design Decision:
    /// We prefer to return *something* (via generic extraction) rather than fail completely.
    /// This maximizes success rate at the cost of potentially lower quality extraction.
    ///
    /// TODO: Add HTML cleaning step before markdown conversion
    /// TODO: Add content validation (minimum length, etc.)
    pub fn parse(&self) -> Result<Article> {
        use config::ConfigLoader;
        use converter::to_markdown;
        use extractor::XPathExtractor;

        let (title, content, author, date) = if let Some(ref url) = self.url {
            // Honor an external rules directory when one was configured.
            let loader = match self.rules_dir.clone() {
                Some(dir) => ConfigLoader::with_rules_dir(dir),
                None => ConfigLoader::new(),
            };

            if let Some(config) = loader.load_for_url(url)? {
                let xpath_extractor = XPathExtractor::new(self.html.clone());

                // Fall back to generic extraction if XPath extraction fails or is incomplete.
                match xpath_extractor.extract(&config) {
                    Ok(result) => {
                        if let (Some(title), Some(body)) = (result.title, result.body_html) {
                            (title, body, result.author, result.date)
                        } else {
                            self.extract_with_generic()?
                        }
                    }
                    Err(_) => self.extract_with_generic()?,
                }
            } else {
                self.extract_with_generic()?
            }
        } else {
            self.extract_with_generic()?
        };

        let markdown = to_markdown(&content);
        let excerpt = Some(converter::html2md::generate_excerpt(&markdown, 200));
        Ok(Article { title, content, markdown, author, published_date: date, excerpt })
    }

    /// Extract using generic heuristic-based algorithm
    fn extract_with_generic(&self) -> Result<(String, String, Option<String>, Option<String>)> {
        let generic_extractor = extractor::GenericExtractor::new(self.html.clone());
        let result = generic_extractor.extract()?;
        Ok((result.title, result.body_html, result.author, result.date))
    }
}
+1-2
crates/server/Cargo.toml
···
 deadpool-postgres = "0.14.0"
 dotenvy = "0.15.7"
 ed25519-dalek = { version = "2.2.0", features = ["serde"] }
-dom_smoothie = "0.4"
 getrandom = { version = "0.3", features = ["std"] }
-html2md = "0.2.15"
 malfestio-core = { version = "0.1.0", path = "../core" }
+malfestio-readability = { version = "0.1.0", path = "../readability" }
 regex = "1.12.2"
 reqwest = { version = "0.12", features = ["json"] }
 serde = "1.0.228"
+20-40
crates/server/src/api/importer.rs
 use crate::middleware::auth::UserContext;
 use crate::state::SharedState;
 use axum::{Json, extract::Extension, http::StatusCode, response::IntoResponse};
-use dom_smoothie::Readability;
 use malfestio_core::model::Visibility;
+use malfestio_readability::Readability;
 use serde::{Deserialize, Serialize};
 use serde_json::json;

···
     }

     let url = payload.url.clone();
-
-    // Fetch HTML content
     let html_result = reqwest::get(&url).await;
     let html_content = match html_result {
         Ok(response) => match response.text().await {
···
         }
     };

-    // Extract article using dom_smoothie
     let url_for_task = url.clone();
-    let result = tokio::task::spawn_blocking(
-        move || -> Result<(String, String, Option<String>, Option<String>), String> {
-            let mut readability = Readability::new(html_content, Some(&url_for_task), None)
-                .map_err(|e| format!("Readability error: {}", e))?;
-            let article = readability.parse().map_err(|e| format!("Parse error: {}", e))?;
-            Ok((
-                article.title,
-                article.content.to_string(),
-                article.byline,
-                article.published_time,
-            ))
-        },
-    )
+    let result = tokio::task::spawn_blocking(move || -> Result<malfestio_readability::Article, String> {
+        let readability = Readability::new(html_content, Some(&url_for_task));
+        readability.parse().map_err(|e| format!("Parse error: {}", e))
+    })
     .await;

     match result {
-        Ok(Ok((title, content, author, publish_date))) => {
-            // Convert HTML content to markdown
-            let markdown = html2md::parse_html(&content);
+        Ok(Ok(article)) => {
+            let markdown = article.markdown;

             let response = ImportArticleResponse {
-                title,
+                title: article.title,
                 markdown,
-                metadata: ArticleMetadata { author, publish_date, source_url: payload.url },
+                metadata: ArticleMetadata {
+                    author: article.author,
+                    publish_date: article.published_date,
+                    source_url: payload.url,
+                },
             };

             Json(response).into_response()
···
     }

     let url = payload.url.clone();
-
-    // Fetch HTML content
     let html_result = reqwest::get(&url).await;
     let html_content = match html_result {
         Ok(response) => match response.text().await {
···
         }
     };

-    // Extract article using dom_smoothie
     let url_for_task = url.clone();
-    let result = tokio::task::spawn_blocking(move || -> Result<(String, String), String> {
-        let mut readability = Readability::new(html_content, Some(&url_for_task), None)
-            .map_err(|e| format!("Readability error: {}", e))?;
-        let article = readability.parse().map_err(|e| format!("Parse error: {}", e))?;
-        Ok((article.title, article.content.to_string()))
+    let result = tokio::task::spawn_blocking(move || -> Result<malfestio_readability::Article, String> {
+        let readability = Readability::new(html_content, Some(&url_for_task));
+        readability.parse().map_err(|e| format!("Parse error: {}", e))
     })
     .await;

     match result {
-        Ok(Ok((title, content))) => {
-            // Convert HTML content to markdown
-            let markdown = html2md::parse_html(&content);
-
-            // Merge auto-tags with user-provided tags
+        Ok(Ok(article)) => {
+            let title = article.title;
+            let markdown = article.markdown;
             let mut tags = payload.tags.clone();
             if !tags.contains(&"imported".to_string()) {
                 tags.push("imported".to_string());
···
                 tags.push("article".to_string());
             }

-            // Store source URL as first link
             let links = vec![payload.url.clone()];

-            // Create note
             match state
                 .note_repo
                 .create(&user_ctx.did, &title, &markdown, tags, payload.visibility, links)
···
     let body_json: serde_json::Value = serde_json::from_slice(&body_bytes).unwrap();
     let title = body_json["title"].as_str().unwrap();
     assert!(title.contains("Rust"));
-    // Verify markdown field exists and is non-empty
+
     let markdown = body_json["markdown"].as_str().unwrap();
     assert!(markdown.len() > 100);
-    // Verify no HTML tags leak through
     assert!(!markdown.contains("<div"));
     assert!(!markdown.contains("<p>"));
-    // Verify metadata structure exists
     assert!(body_json["metadata"].is_object());
     assert_eq!(
         body_json["metadata"]["source_url"].as_str().unwrap(),