cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists
charm leaflet readability golang

at d631904bbdd17c4c85e83012b4eae79459a04a9c · 400 lines · 11 kB
package articles

import (
	"bufio"
	"embed"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"time"

	"github.com/antchfx/htmlquery"
	"github.com/gomarkdown/markdown"
	"github.com/gomarkdown/markdown/html"
	"github.com/gomarkdown/markdown/parser"
	"github.com/stormlightlabs/noteleaf/internal/models"
)

//go:embed rules/*.txt
var rulesFS embed.FS

// ParsedContent represents the extracted content from a web page
type ParsedContent struct {
	Title   string
	Author  string
	Date    string
	Content string
	URL     string
}

// ParsingRule represents XPath rules for extracting content from a specific domain
type ParsingRule struct {
	Domain string
	Title  string
	Author string
	Date   string
	Body   string
	// XPath selectors for elements to remove
	Strip    []string
	TestURLs []string
}

// Parser interface defines methods for parsing articles from URLs
type Parser interface {
	// ParseURL extracts article content from a given URL
	ParseURL(url string) (*ParsedContent, error)
	// Convert HTML content directly to markdown using domain-specific rules
	Convert(htmlContent, domain, sourceURL string) (string, error)
	// GetSupportedDomains returns a list of domains that have parsing rules
	GetSupportedDomains() []string
	// SaveArticle saves the parsed content to filesystem and returns file paths
	SaveArticle(content *ParsedContent, storageDir string) (markdownPath, htmlPath string, err error)
}

// ArticleParser implements the Parser interface
type ArticleParser struct {
	rules  map[string]*ParsingRule
	client *http.Client
}

// NewArticleParser creates a new ArticleParser with the specified HTTP client and loaded rules
func NewArticleParser(client *http.Client) (*ArticleParser, error) {
	parser := &ArticleParser{
		rules:  make(map[string]*ParsingRule),
		client: client,
	}

	if err := parser.loadRules(); err != nil {
		return nil, fmt.Errorf("failed to load parsing rules: %w", err)
	}

	return parser, nil
}

// AddRule adds or replaces a parsing rule for a specific domain
func (p *ArticleParser) AddRule(domain string, rule *ParsingRule) {
	p.rules[domain] = rule
}

func (p *ArticleParser) loadRules() error {
	entries, err := rulesFS.ReadDir("rules")
	if err != nil {
		return fmt.Errorf("failed to read rules directory: %w", err)
	}

	for _, entry := range entries {
		if !strings.HasSuffix(entry.Name(), ".txt") {
			continue
		}

		domain := strings.TrimSuffix(entry.Name(), ".txt")

		content, err := rulesFS.ReadFile(filepath.Join("rules", entry.Name()))
		if err != nil {
			return fmt.Errorf("failed to read rule file %s: %w", entry.Name(), err)
		}

		rule, err := p.parseRules(domain, string(content))
		if err != nil {
			return fmt.Errorf("failed to parse rule file %s: %w", entry.Name(), err)
		}

		p.rules[domain] = rule
	}

	return nil
}

func (p *ArticleParser) parseRules(domain, content string) (*ParsingRule, error) {
	rule := &ParsingRule{Domain: domain, Strip: []string{}}
	scanner := bufio.NewScanner(strings.NewReader(content))
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())

		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}

		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}

		key := strings.TrimSpace(parts[0])
		value := strings.TrimSpace(parts[1])

		switch key {
		case "title":
			rule.Title = value
		case "author":
			rule.Author = value
		case "date":
			rule.Date = value
		case "body":
			rule.Body = value
		case "strip":
			rule.Strip = append(rule.Strip, value)
		case "test_url":
			rule.TestURLs = append(rule.TestURLs, value)
		}
	}

	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading rule file: %w", err)
	}

	return rule, nil
}

// ParseURL extracts article content from a given URL
func (p *ArticleParser) ParseURL(urlStr string) (*ParsedContent, error) {
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return nil, fmt.Errorf("invalid URL: %w", err)
	}

	domain := parsedURL.Hostname()

	resp, err := p.client.Get(urlStr)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch URL: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP error: %d", resp.StatusCode)
	}

	htmlBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}

	return p.Parse(string(htmlBytes), domain, urlStr)
}

// Parse extracts article content from an HTML string using domain-specific rules
func (p *ArticleParser) Parse(htmlContent, domain, sourceURL string) (*ParsedContent, error) {
	var rule *ParsingRule
	for ruleDomain, r := range p.rules {
		if strings.Contains(domain, ruleDomain) {
			rule = r
			break
		}
	}

	if rule == nil {
		return nil, fmt.Errorf("no parsing rule found for domain: %s", domain)
	}

	doc, err := htmlquery.Parse(strings.NewReader(htmlContent))
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %w", err)
	}

	content := &ParsedContent{URL: sourceURL}

	if rule.Title != "" {
		if titleNode := htmlquery.FindOne(doc, rule.Title); titleNode != nil {
			content.Title = strings.TrimSpace(htmlquery.InnerText(titleNode))
		}
	}

	if rule.Author != "" {
		if authorNode := htmlquery.FindOne(doc, rule.Author); authorNode != nil {
			content.Author = strings.TrimSpace(htmlquery.InnerText(authorNode))
		}
	}

	if rule.Date != "" {
		if dateNode := htmlquery.FindOne(doc, rule.Date); dateNode != nil {
			content.Date = strings.TrimSpace(htmlquery.InnerText(dateNode))
		}
	}

	if rule.Body != "" {
		if bodyNode := htmlquery.FindOne(doc, rule.Body); bodyNode != nil {
			for _, stripXPath := range rule.Strip {
				stripNodes := htmlquery.Find(bodyNode, stripXPath)
				for _, node := range stripNodes {
					node.Parent.RemoveChild(node)
				}
			}

			content.Content = strings.TrimSpace(htmlquery.InnerText(bodyNode))
		}
	}

	if content.Title == "" {
		return nil, fmt.Errorf("could not extract title from HTML")
	}

	return content, nil
}

// Convert HTML content directly to markdown using domain-specific rules
func (p *ArticleParser) Convert(htmlContent, domain, sourceURL string) (string, error) {
	content, err := p.Parse(htmlContent, domain, sourceURL)
	if err != nil {
		return "", err
	}

	return p.createMarkdown(content), nil
}

// GetSupportedDomains returns a list of domains that have parsing rules
func (p *ArticleParser) GetSupportedDomains() []string {
	var domains []string
	for domain := range p.rules {
		domains = append(domains, domain)
	}
	return domains
}

// SaveArticle saves the parsed content to filesystem and returns file paths
func (p *ArticleParser) SaveArticle(content *ParsedContent, storageDir string) (markdownPath, htmlPath string, err error) {
	if err := os.MkdirAll(storageDir, 0755); err != nil {
		return "", "", fmt.Errorf("failed to create storage directory: %w", err)
	}

	slug := p.slugify(content.Title)
	if slug == "" {
		slug = "article"
	}

	baseMarkdownPath := filepath.Join(storageDir, slug+".md")
	baseHTMLPath := filepath.Join(storageDir, slug+".html")

	markdownPath = baseMarkdownPath
	htmlPath = baseHTMLPath

	counter := 1
	for {
		if _, err := os.Stat(markdownPath); os.IsNotExist(err) {
			if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
				break
			}
		}
		markdownPath = filepath.Join(storageDir, fmt.Sprintf("%s_%d.md", slug, counter))
		htmlPath = filepath.Join(storageDir, fmt.Sprintf("%s_%d.html", slug, counter))
		counter++
	}

	markdownContent := p.createMarkdown(content)

	if err := os.WriteFile(markdownPath, []byte(markdownContent), 0644); err != nil {
		return "", "", fmt.Errorf("failed to write markdown file: %w", err)
	}

	htmlContent := p.createHTML(content, markdownContent)

	if err := os.WriteFile(htmlPath, []byte(htmlContent), 0644); err != nil {
		os.Remove(markdownPath)
		return "", "", fmt.Errorf("failed to write HTML file: %w", err)
	}

	return markdownPath, htmlPath, nil
}

func (p *ArticleParser) slugify(title string) string {
	slug := strings.ToLower(title)

	reg := regexp.MustCompile(`[^a-z0-9]+`)
	slug = reg.ReplaceAllString(slug, "-")

	slug = strings.Trim(slug, "-")

	if len(slug) > 100 {
		slug = slug[:100]
		slug = strings.Trim(slug, "-")
	}

	return slug
}

func (p *ArticleParser) createMarkdown(content *ParsedContent) string {
	var builder strings.Builder

	builder.WriteString(fmt.Sprintf("# %s\n\n", content.Title))

	if content.Author != "" {
		builder.WriteString(fmt.Sprintf("**Author:** %s\n\n", content.Author))
	}

	if content.Date != "" {
		builder.WriteString(fmt.Sprintf("**Date:** %s\n\n", content.Date))
	}

	builder.WriteString(fmt.Sprintf("**Source:** %s\n\n", content.URL))
	builder.WriteString(fmt.Sprintf("**Saved:** %s\n\n", time.Now().Format("2006-01-02 15:04:05")))

	builder.WriteString("---\n\n")
	builder.WriteString(content.Content)

	return builder.String()
}

func (p *ArticleParser) createHTML(content *ParsedContent, markdownContent string) string {
	extensions := parser.CommonExtensions | parser.AutoHeadingIDs | parser.NoEmptyLineBeforeBlock
	mdParser := parser.NewWithExtensions(extensions)
	doc := mdParser.Parse([]byte(markdownContent))

	htmlFlags := html.CommonFlags | html.HrefTargetBlank
	opts := html.RendererOptions{Flags: htmlFlags}
	renderer := html.NewRenderer(opts)

	htmlBody := markdown.Render(doc, renderer)

	var builder strings.Builder
	builder.WriteString("<!DOCTYPE html>\n")
	builder.WriteString("<html>\n<head>\n")
	builder.WriteString(fmt.Sprintf(" <title>%s</title>\n", content.Title))
	builder.WriteString(" <meta charset=\"UTF-8\">\n")
	builder.WriteString(" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n")
	builder.WriteString(" <style>\n")
	builder.WriteString(" body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }\n")
	builder.WriteString(" pre { background-color: #f4f4f4; padding: 10px; border-radius: 4px; overflow-x: auto; }\n")
	builder.WriteString(" blockquote { border-left: 4px solid #ccc; padding-left: 16px; margin-left: 0; }\n")
	builder.WriteString(" </style>\n")
	builder.WriteString("</head>\n<body>\n")
	builder.Write(htmlBody)
	builder.WriteString("\n</body>\n</html>")

	return builder.String()
}

// CreateArticleFromURL is a convenience function that parses a URL and creates an instance of [models.Article]
func CreateArticleFromURL(url, dir string) (*models.Article, error) {
	parser, err := NewArticleParser(http.DefaultClient)
	if err != nil {
		return nil, fmt.Errorf("failed to create parser: %w", err)
	}

	content, err := parser.ParseURL(url)
	if err != nil {
		return nil, fmt.Errorf("failed to parse URL: %w", err)
	}

	mdPath, htmlPath, err := parser.SaveArticle(content, dir)
	if err != nil {
		return nil, fmt.Errorf("failed to save article: %w", err)
	}

	article := &models.Article{
		URL:          url,
		Title:        content.Title,
		Author:       content.Author,
		Date:         content.Date,
		MarkdownPath: mdPath,
		HTMLPath:     htmlPath,
		Created:      time.Now(),
		Modified:     time.Now(),
	}

	return article, nil
}
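
As parseRules implies, the embedded rules/*.txt files are plain "key: value" lines (title, author, date, body, strip, test_url, with # starting a comment), each value an XPath expression. Below is a minimal usage sketch of the CreateArticleFromURL convenience function, assuming a caller inside the same module; the internal/articles import path is inferred from the internal/models import above and may differ in the actual repository.

package main

import (
	"fmt"
	"log"

	// Assumed import path; internal packages are only importable from within the noteleaf module.
	"github.com/stormlightlabs/noteleaf/internal/articles"
)

func main() {
	// Fetch a page whose domain matches an embedded rule file, then write
	// slugified .md and .html copies under ./saved-articles.
	article, err := articles.CreateArticleFromURL("https://example.com/some-post", "./saved-articles")
	if err != nil {
		log.Fatal(err) // fails if no rule matches the domain, or the fetch/save errors
	}

	fmt.Println(article.Title)
	fmt.Println(article.MarkdownPath, article.HTMLPath)
}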