cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists
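A rough usage sketch for the parser source below. The import path, URL, and output directory are illustrative assumptions (the package is internal to the noteleaf module), and CreateArticleFromURL at the end of the file bundles the same steps into one call:

    package main

    import (
        "fmt"
        "log"
        "net/http"

        "github.com/stormlightlabs/noteleaf/internal/articles" // assumed import path
    )

    func main() {
        // Build a parser backed by the default HTTP client and the embedded rules.
        parser, err := articles.NewArticleParser(http.DefaultClient)
        if err != nil {
            log.Fatal(err)
        }

        // Fetch and extract an article, then persist it as markdown + HTML.
        content, err := parser.ParseURL("https://example.com/some-post")
        if err != nil {
            log.Fatal(err)
        }

        mdPath, htmlPath, err := parser.SaveArticle(content, "./saved-articles")
        if err != nil {
            log.Fatal(err)
        }
        fmt.Println(mdPath, htmlPath)
    }
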
package articles

import (
    "bufio"
    "embed"
    "fmt"
    "io"
    "net/http"
    "net/url"
    "os"
    "path/filepath"
    "regexp"
    "strings"
    "time"

    "github.com/antchfx/htmlquery"
    "github.com/gomarkdown/markdown"
    "github.com/gomarkdown/markdown/html"
    "github.com/gomarkdown/markdown/parser"
    "github.com/stormlightlabs/noteleaf/internal/models"
    exhtml "golang.org/x/net/html"
)

//go:embed rules/*.txt
var rulesFS embed.FS

// ParsedContent represents the extracted content from a web page
type ParsedContent struct {
    Title            string
    Author           string
    Date             string
    Content          string
    URL              string
    Confidence       float64 // 0-1 scale, confidence in extraction quality
    ExtractionMethod string  // "xpath", "heuristic", "dual-validated", etc.
}

// ParsingRule represents XPath rules for extracting content from a specific domain
type ParsingRule struct {
    Domain            string
    Title             string
    Author            string
    Date              string
    Body              string
    Strip             []string // XPath selectors for elements to remove
    StripIDsOrClasses []string
    TestURLs          []string
    Headers           map[string]string
    Prune             bool
    Tidy              bool
}

// Parser interface defines methods for parsing articles from URLs
type Parser interface {
    // ParseURL extracts article content from a given URL
    ParseURL(url string) (*ParsedContent, error)
    // Convert HTML content directly to markdown using domain-specific rules
    Convert(htmlContent, domain, sourceURL string) (string, error)
    // GetSupportedDomains returns a list of domains that have parsing rules
    GetSupportedDomains() []string
    // SaveArticle saves the parsed content to filesystem and returns file paths
    SaveArticle(content *ParsedContent, storageDir string) (markdownPath, htmlPath string, err error)
}

// ArticleParser implements the Parser interface
type ArticleParser struct {
    rules             map[string]*ParsingRule
    client            *http.Client
    heuristicExtract  *HeuristicExtractor
    metadataExtractor *MetadataExtractor
}

// NewArticleParser creates a new ArticleParser with the specified HTTP client and loaded rules
func NewArticleParser(client *http.Client) (*ArticleParser, error) {
    parser := &ArticleParser{
        rules:             make(map[string]*ParsingRule),
        client:            client,
        heuristicExtract:  NewHeuristicExtractor(),
        metadataExtractor: NewMetadataExtractor(),
    }

    if err := parser.loadRules(); err != nil {
        return nil, fmt.Errorf("failed to load parsing rules: %w", err)
    }

    return parser, nil
}

// AddRule adds or replaces a parsing rule for a specific domain
func (p *ArticleParser) AddRule(domain string, rule *ParsingRule) {
    p.rules[domain] = rule
}

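// Illustrative sketch (not part of the original file): registering a custom
// rule at runtime via AddRule. The domain and XPath expressions below are
// hypothetical, not shipped rules.
//
//	parser.AddRule("example.com", &ParsingRule{
//		Domain: "example.com",
//		Title:  "//h1[@class='headline']",
//		Author: "//span[@class='byline']",
//		Body:   "//article",
//		Strip:  []string{"//aside", "//div[@class='related-links']"},
//	})
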
// SetHTTPClient overrides the HTTP client used for fetching article content.
func (p *ArticleParser) SetHTTPClient(client *http.Client) {
    p.client = client
}

func (p *ArticleParser) loadRules() error {
    entries, err := rulesFS.ReadDir("rules")
    if err != nil {
        return fmt.Errorf("failed to read rules directory: %w", err)
    }

    for _, entry := range entries {
        if !strings.HasSuffix(entry.Name(), ".txt") {
            continue
        }

        domain := strings.TrimSuffix(entry.Name(), ".txt")

        content, err := rulesFS.ReadFile(filepath.Join("rules", entry.Name()))
        if err != nil {
            return fmt.Errorf("failed to read rule file %s: %w", entry.Name(), err)
        }

        rule, err := p.parseRules(domain, string(content))
        if err != nil {
            return fmt.Errorf("failed to parse rule file %s: %w", entry.Name(), err)
        }

        p.rules[domain] = rule
    }

    return nil
}

func (p *ArticleParser) parseRules(domain, content string) (*ParsingRule, error) {
    rule := &ParsingRule{Domain: domain, Strip: []string{}}
    scanner := bufio.NewScanner(strings.NewReader(content))
    for scanner.Scan() {
        line := strings.TrimSpace(scanner.Text())

        if line == "" || strings.HasPrefix(line, "#") {
            continue
        }

        parts := strings.SplitN(line, ":", 2)
        if len(parts) != 2 {
            continue
        }

        key := strings.TrimSpace(parts[0])
        value := strings.TrimSpace(parts[1])

        switch key {
        case "title":
            rule.Title = value
        case "author":
            rule.Author = value
        case "date":
            rule.Date = value
        case "body":
            rule.Body = value
        case "strip":
            rule.Strip = append(rule.Strip, value)
        case "strip_id_or_class":
            rule.StripIDsOrClasses = append(rule.StripIDsOrClasses, value)
        case "prune":
            rule.Prune = parseBool(value)
        case "tidy":
            rule.Tidy = parseBool(value)
        case "test_url":
            rule.TestURLs = append(rule.TestURLs, value)
        default:
            if strings.HasPrefix(key, "http_header(") && strings.HasSuffix(key, ")") {
                headerName := strings.TrimSuffix(strings.TrimPrefix(key, "http_header("), ")")
                if headerName != "" {
                    if rule.Headers == nil {
                        rule.Headers = make(map[string]string)
                    }
                    rule.Headers[http.CanonicalHeaderKey(headerName)] = value
                }
            }
        }
    }

    if err := scanner.Err(); err != nil {
        return nil, fmt.Errorf("error reading rule file: %w", err)
    }

    return rule, nil
}

func parseBool(value string) bool {
    switch strings.ToLower(strings.TrimSpace(value)) {
    case "1", "true", "yes", "on":
        return true
    default:
        return false
    }
}

func (p *ArticleParser) findRule(domain string) *ParsingRule {
    for ruleDomain, rule := range p.rules {
        if domain == ruleDomain || strings.HasSuffix(domain, ruleDomain) {
            return rule
        }
    }
    return nil
}

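// Illustrative sketch (not part of the original file): a rule file in the form
// parseRules expects, one "key: value" pair per line. Such a file would live at
// rules/example.com.txt so loadRules keys it by domain; the XPath values below
// are hypothetical.
//
//	# comments and blank lines are ignored
//	title: //h1[@class='headline']
//	author: //span[@class='byline']
//	date: //time[@datetime]
//	body: //article
//	strip: //aside
//	strip_id_or_class: newsletter-signup
//	prune: true
//	tidy: false
//	http_header(User-Agent): Mozilla/5.0
//	test_url: https://example.com/sample-article
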
// ParseURL extracts article content from a given URL
func (p *ArticleParser) ParseURL(s string) (*ParsedContent, error) {
    parsedURL, err := url.Parse(s)
    if err != nil {
        return nil, fmt.Errorf("invalid URL: %w", err)
    }

    domain := parsedURL.Hostname()
    rule := p.findRule(domain)

    req, err := http.NewRequest(http.MethodGet, s, nil)
    if err != nil {
        return nil, fmt.Errorf("failed to create request: %w", err)
    }

    if rule != nil {
        for header, value := range rule.Headers {
            if value == "" {
                continue
            }
            if req.Header.Get(header) == "" {
                req.Header.Set(header, value)
            }
        }
    }

    resp, err := p.client.Do(req)
    if err != nil {
        return nil, fmt.Errorf("failed to fetch URL: %w", err)
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("HTTP error: %d", resp.StatusCode)
    }

    htmlBytes, err := io.ReadAll(resp.Body)
    if err != nil {
        return nil, fmt.Errorf("failed to read response body: %w", err)
    }

    return p.Parse(string(htmlBytes), domain, s)
}

// Parse extracts article content from an HTML string using domain-specific rules with a heuristic fallback.
// Implements dual validation: compares XPath results with heuristic extraction when rules exist.
func (p *ArticleParser) Parse(htmlContent, domain, sourceURL string) (*ParsedContent, error) {
    doc, err := htmlquery.Parse(strings.NewReader(htmlContent))
    if err != nil {
        return nil, fmt.Errorf("failed to parse HTML: %w", err)
    }

    rule := p.findRule(domain)

    if rule == nil {
        return p.parseWithHeuristics(doc, sourceURL)
    }

    content := &ParsedContent{
        URL:              sourceURL,
        ExtractionMethod: "xpath",
        Confidence:       0.85,
    }

    if rule.Title != "" {
        if titleNode := htmlquery.FindOne(doc, rule.Title); titleNode != nil {
            content.Title = strings.TrimSpace(htmlquery.InnerText(titleNode))
        }
    }
    if content.Title == "" {
        content.Title = p.metadataExtractor.ExtractTitle(doc)
    }

    if rule.Author != "" {
        if authorNode := htmlquery.FindOne(doc, rule.Author); authorNode != nil {
            content.Author = strings.TrimSpace(htmlquery.InnerText(authorNode))
        }
    }
    if content.Author == "" {
        content.Author = p.metadataExtractor.ExtractAuthor(doc)
    }

    if rule.Date != "" {
        if dateNode := htmlquery.FindOne(doc, rule.Date); dateNode != nil {
            content.Date = strings.TrimSpace(htmlquery.InnerText(dateNode))
        }
    }
    if content.Date == "" {
        content.Date = p.metadataExtractor.ExtractPublishedDate(doc)
    }

    if rule.Body != "" {
        bodyNode := htmlquery.FindOne(doc, rule.Body)
        if bodyNode == nil {
            return p.parseWithHeuristics(doc, sourceURL)
        }

        for _, stripXPath := range rule.Strip {
            removeNodesByXPath(bodyNode, stripXPath)
        }

        for _, identifier := range rule.StripIDsOrClasses {
            removeNodesByIdentifier(bodyNode, identifier)
        }

        removeDefaultNonContentNodes(bodyNode)

        xpathContent := normalizeWhitespace(htmlquery.InnerText(bodyNode))

        heuristicResult := p.heuristicExtract.CompareWithXPath(doc, bodyNode)
        if heuristicResult != nil {
            content.Content = heuristicResult.Content
            if content.Content == "" {
                content.Content = xpathContent
            }
            content.Confidence = heuristicResult.Confidence
            content.ExtractionMethod = heuristicResult.ExtractionMethod
        } else {
            content.Content = xpathContent
        }
    }

    if content.Title == "" {
        return nil, fmt.Errorf("could not extract title from HTML")
    }

    return content, nil
}

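// Illustrative sketch (not part of the original file): calling Parse directly
// with already-fetched HTML instead of going through ParseURL. The HTML,
// domain, and URL are hypothetical; with no rule registered for the domain,
// extraction falls through to parseWithHeuristics.
//
//	raw := "<html><body><article><h1>Hello</h1><p>Body text…</p></article></body></html>"
//	content, err := parser.Parse(raw, "example.com", "https://example.com/post")
//	if err == nil {
//		fmt.Println(content.Title, content.ExtractionMethod, content.Confidence)
//	}
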
// parseWithHeuristics performs heuristic-only extraction when no XPath rule exists.
func (p *ArticleParser) parseWithHeuristics(doc *exhtml.Node, sourceURL string) (*ParsedContent, error) {
    result := p.heuristicExtract.ExtractWithSemanticHTML(doc)
    if result == nil {
        result = &ExtractionResult{
            ExtractionMethod: "heuristic-failed",
            Confidence:       0.0,
        }
    }

    metadata := p.metadataExtractor.ExtractMetadata(doc)
    if metadata != nil {
        if result.Title == "" {
            result.Title = metadata.Title
        }
        if result.Author == "" {
            result.Author = metadata.Author
        }
        if result.PublishedDate == "" {
            result.PublishedDate = metadata.PublishedDate
        }
    }

    content := &ParsedContent{
        Title:            result.Title,
        Author:           result.Author,
        Date:             result.PublishedDate,
        Content:          result.Content,
        URL:              sourceURL,
        Confidence:       result.Confidence,
        ExtractionMethod: result.ExtractionMethod,
    }

    if content.Title == "" {
        return nil, fmt.Errorf("could not extract title from HTML using heuristics")
    }

    if content.Confidence < 0.3 {
        return nil, fmt.Errorf("heuristic extraction confidence too low (%.2f)", content.Confidence)
    }

    return content, nil
}

func removeNodesByXPath(root *exhtml.Node, xpath string) {
    if root == nil {
        return
    }

    xpath = strings.TrimSpace(xpath)
    if xpath == "" {
        return
    }

    nodes := htmlquery.Find(root, xpath)
    for _, node := range nodes {
        if node != nil && node.Parent != nil {
            node.Parent.RemoveChild(node)
        }
    }
}

func removeNodesByIdentifier(root *exhtml.Node, identifier string) {
    identifier = strings.TrimSpace(identifier)
    if root == nil || identifier == "" {
        return
    }

    idLiteral := buildXPathLiteral(identifier)
    removeNodesByXPath(root, fmt.Sprintf(".//*[@id=%s]", idLiteral))

    classLiteral := buildXPathLiteral(" " + identifier + " ")
    removeNodesByXPath(root, fmt.Sprintf(".//*[contains(concat(' ', normalize-space(@class), ' '), %s)]", classLiteral))
}

func removeDefaultNonContentNodes(root *exhtml.Node) {
    for _, xp := range []string{
        ".//script",
        ".//style",
        ".//noscript",
    } {
        removeNodesByXPath(root, xp)
    }
}

func normalizeWhitespace(value string) string {
    value = strings.ReplaceAll(value, "\u00a0", " ")
    return strings.TrimSpace(value)
}

func buildXPathLiteral(value string) string {
    if !strings.Contains(value, "'") {
        return "'" + value + "'"
    }

    if !strings.Contains(value, "\"") {
        return `"` + value + `"`
    }

    segments := strings.Split(value, "'")
    var builder strings.Builder
    builder.WriteString("concat(")

    for i, segment := range segments {
        if i > 0 {
            builder.WriteString(", \"'\", ")
        }
        if segment == "" {
            builder.WriteString("''")
            continue
        }
        builder.WriteString("'")
        builder.WriteString(segment)
        builder.WriteString("'")
    }

    builder.WriteString(")")
    return builder.String()
}

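// Illustrative sketch (not part of the original file): what buildXPathLiteral
// returns for inputs with different quote characters, so identifiers can be
// embedded safely in the XPath expressions above. The input strings are
// made-up examples.
//
//	buildXPathLiteral("promo")             // 'promo'
//	buildXPathLiteral("editor's-picks")    // "editor's-picks"
//	buildXPathLiteral(`ad "slot" o'clock`) // concat('ad "slot" o', "'", 'clock')
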
// Convert HTML content directly to markdown using domain-specific rules
func (p *ArticleParser) Convert(htmlContent, domain, sourceURL string) (string, error) {
    content, err := p.Parse(htmlContent, domain, sourceURL)
    if err != nil {
        return "", err
    }

    return p.createMarkdown(content), nil
}

// GetSupportedDomains returns a list of domains that have parsing rules
func (p *ArticleParser) GetSupportedDomains() []string {
    var domains []string
    for domain := range p.rules {
        domains = append(domains, domain)
    }
    return domains
}

// SaveArticle saves the parsed content to filesystem and returns file paths
func (p *ArticleParser) SaveArticle(content *ParsedContent, dir string) (markdownPath, htmlPath string, err error) {
    if err := os.MkdirAll(dir, 0755); err != nil {
        return "", "", fmt.Errorf("failed to create storage directory: %w", err)
    }

    slug := p.slugify(content.Title)
    if slug == "" {
        slug = "article"
    }

    baseMarkdownPath := filepath.Join(dir, slug+".md")
    baseHTMLPath := filepath.Join(dir, slug+".html")
    markdownPath = baseMarkdownPath
    htmlPath = baseHTMLPath

    counter := 1
    for {
        if _, err := os.Stat(markdownPath); os.IsNotExist(err) {
            if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
                break
            }
        }
        markdownPath = filepath.Join(dir, fmt.Sprintf("%s_%d.md", slug, counter))
        htmlPath = filepath.Join(dir, fmt.Sprintf("%s_%d.html", slug, counter))
        counter++
    }

    markdownContent := p.createMarkdown(content)

    if err := os.WriteFile(markdownPath, []byte(markdownContent), 0644); err != nil {
        return "", "", fmt.Errorf("failed to write markdown file: %w", err)
    }

    htmlContent := p.createHTML(content, markdownContent)

    if err := os.WriteFile(htmlPath, []byte(htmlContent), 0644); err != nil {
        os.Remove(markdownPath)
        return "", "", fmt.Errorf("failed to write HTML file: %w", err)
    }

    return markdownPath, htmlPath, nil
}

func (p *ArticleParser) slugify(title string) string {
    slug := strings.ToLower(title)

    reg := regexp.MustCompile(`[^a-z0-9]+`)
    slug = reg.ReplaceAllString(slug, "-")

    slug = strings.Trim(slug, "-")

    if len(slug) > 100 {
        slug = slug[:100]
        slug = strings.Trim(slug, "-")
    }

    return slug
}

func (p *ArticleParser) createMarkdown(content *ParsedContent) string {
    var builder strings.Builder

    builder.WriteString(fmt.Sprintf("# %s\n\n", content.Title))

    if content.Author != "" {
        builder.WriteString(fmt.Sprintf("**Author:** %s\n\n", content.Author))
    }

    if content.Date != "" {
        builder.WriteString(fmt.Sprintf("**Date:** %s\n\n", content.Date))
    }

    builder.WriteString(fmt.Sprintf("**Source:** %s\n\n", content.URL))
    builder.WriteString(fmt.Sprintf("**Saved:** %s\n\n", time.Now().Format("2006-01-02 15:04:05")))

    builder.WriteString("---\n\n")
    builder.WriteString(content.Content)

    return builder.String()
}

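// Illustrative sketch (not part of the original file): for a ParsedContent
// titled "Hello, World! A First Post", slugify yields "hello-world-a-first-post",
// and createMarkdown emits a header block shaped like the following (author,
// URL, and timestamp are placeholder values):
//
//	# Hello, World! A First Post
//
//	**Author:** Jane Doe
//
//	**Source:** https://example.com/hello
//
//	**Saved:** 2006-01-02 15:04:05
//
//	---
//
//	...extracted article body...
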
func (p *ArticleParser) createHTML(content *ParsedContent, markdownContent string) string {
    extensions := parser.CommonExtensions | parser.AutoHeadingIDs | parser.NoEmptyLineBeforeBlock
    mdParser := parser.NewWithExtensions(extensions)
    doc := mdParser.Parse([]byte(markdownContent))

    htmlFlags := html.CommonFlags | html.HrefTargetBlank
    opts := html.RendererOptions{Flags: htmlFlags}
    renderer := html.NewRenderer(opts)

    htmlBody := markdown.Render(doc, renderer)

    var builder strings.Builder
    builder.WriteString("<!DOCTYPE html>\n")
    builder.WriteString("<html>\n<head>\n")
    builder.WriteString(fmt.Sprintf("    <title>%s</title>\n", content.Title))
    builder.WriteString("    <meta charset=\"UTF-8\">\n")
    builder.WriteString("    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n")
    builder.WriteString("    <style>\n")
    builder.WriteString("        body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }\n")
    builder.WriteString("        pre { background-color: #f4f4f4; padding: 10px; border-radius: 4px; overflow-x: auto; }\n")
    builder.WriteString("        blockquote { border-left: 4px solid #ccc; padding-left: 16px; margin-left: 0; }\n")
    builder.WriteString("    </style>\n")
    builder.WriteString("</head>\n<body>\n")
    builder.Write(htmlBody)
    builder.WriteString("\n</body>\n</html>")

    return builder.String()
}

// CreateArticleFromURL is a convenience function that parses a URL and creates an instance of [models.Article]
func CreateArticleFromURL(url, dir string) (*models.Article, error) {
    parser, err := NewArticleParser(http.DefaultClient)
    if err != nil {
        return nil, fmt.Errorf("failed to create parser: %w", err)
    }

    content, err := parser.ParseURL(url)
    if err != nil {
        return nil, fmt.Errorf("failed to parse URL: %w", err)
    }

    mdPath, htmlPath, err := parser.SaveArticle(content, dir)
    if err != nil {
        return nil, fmt.Errorf("failed to save article: %w", err)
    }

    return &models.Article{
        URL:          url,
        Title:        content.Title,
        Author:       content.Author,
        Date:         content.Date,
        MarkdownPath: mdPath,
        HTMLPath:     htmlPath,
        Created:      time.Now(),
        Modified:     time.Now(),
    }, nil
}