cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists
charm
leaflet
readability
golang
package articles

import (
	"bufio"
	"embed"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"time"

	"github.com/antchfx/htmlquery"
	"github.com/gomarkdown/markdown"
	"github.com/gomarkdown/markdown/html"
	"github.com/gomarkdown/markdown/parser"
	"github.com/stormlightlabs/noteleaf/internal/models"
)

//go:embed rules/*.txt
var rulesFS embed.FS

// ParsedContent represents the extracted content from a web page
type ParsedContent struct {
	Title   string
	Author  string
	Date    string
	Content string
	URL     string
}

// ParsingRule represents XPath rules for extracting content from a specific domain
type ParsingRule struct {
	Domain string
	Title  string
	Author string
	Date   string
	Body   string
	// XPath selectors for elements to remove
	Strip    []string
	TestURLs []string
}

// Parser interface defines methods for parsing articles from URLs
type Parser interface {
	// ParseURL extracts article content from a given URL
	ParseURL(url string) (*ParsedContent, error)
	// Convert HTML content directly to markdown using domain-specific rules
	Convert(htmlContent, domain, sourceURL string) (string, error)
	// GetSupportedDomains returns a list of domains that have parsing rules
	GetSupportedDomains() []string
	// SaveArticle saves the parsed content to filesystem and returns file paths
	SaveArticle(content *ParsedContent, storageDir string) (markdownPath, htmlPath string, err error)
}

// ArticleParser implements the Parser interface
type ArticleParser struct {
	rules  map[string]*ParsingRule
	client *http.Client
}

// NewArticleParser creates a new ArticleParser with the specified HTTP client and loaded rules
func NewArticleParser(client *http.Client) (*ArticleParser, error) {
	parser := &ArticleParser{
		rules:  make(map[string]*ParsingRule),
		client: client,
	}

	if err := parser.loadRules(); err != nil {
		return nil, fmt.Errorf("failed to load parsing rules: %w", err)
	}

	return parser, nil
}

// AddRule adds or replaces a parsing rule for a specific domain
func (p *ArticleParser) AddRule(domain string, rule *ParsingRule) {
	p.rules[domain] = rule
}

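// loadRules reads every embedded rules/*.txt file and registers it in the
// rule map, keyed by the file name minus its .txt extension (the domain).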
func (p *ArticleParser) loadRules() error {
	entries, err := rulesFS.ReadDir("rules")
	if err != nil {
		return fmt.Errorf("failed to read rules directory: %w", err)
	}

	for _, entry := range entries {
		if !strings.HasSuffix(entry.Name(), ".txt") {
			continue
		}

		domain := strings.TrimSuffix(entry.Name(), ".txt")

		content, err := rulesFS.ReadFile(filepath.Join("rules", entry.Name()))
		if err != nil {
			return fmt.Errorf("failed to read rule file %s: %w", entry.Name(), err)
		}

		rule, err := p.parseRules(domain, string(content))
		if err != nil {
			return fmt.Errorf("failed to parse rule file %s: %w", entry.Name(), err)
		}

		p.rules[domain] = rule
	}

	return nil
}

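// parseRules parses a rule file written as simple "key: value" lines. Blank
// lines and lines beginning with "#" are skipped. Recognized keys are title,
// author, date, and body (XPath selectors), plus the repeatable keys strip
// (XPath of elements to remove from the body) and test_url. The XPath values
// below are hypothetical; a rule file for example.com might look like:
//
//	# rules/example.com.txt
//	title: //h1[@class="headline"]
//	author: //span[@class="byline"]
//	date: //time
//	body: //article
//	strip: //aside
//	test_url: https://example.com/some-article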
func (p *ArticleParser) parseRules(domain, content string) (*ParsingRule, error) {
	rule := &ParsingRule{Domain: domain, Strip: []string{}}
	scanner := bufio.NewScanner(strings.NewReader(content))
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())

		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}

		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}

		key := strings.TrimSpace(parts[0])
		value := strings.TrimSpace(parts[1])

		switch key {
		case "title":
			rule.Title = value
		case "author":
			rule.Author = value
		case "date":
			rule.Date = value
		case "body":
			rule.Body = value
		case "strip":
			rule.Strip = append(rule.Strip, value)
		case "test_url":
			rule.TestURLs = append(rule.TestURLs, value)
		}
	}

	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading rule file: %w", err)
	}

	return rule, nil
}

// ParseURL extracts article content from a given URL
func (p *ArticleParser) ParseURL(urlStr string) (*ParsedContent, error) {
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return nil, fmt.Errorf("invalid URL: %w", err)
	}

	domain := parsedURL.Hostname()

	resp, err := p.client.Get(urlStr)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch URL: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP error: %d", resp.StatusCode)
	}

	htmlBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}

	return p.Parse(string(htmlBytes), domain, urlStr)
}

// Parse extracts article content from an HTML string using domain-specific rules
func (p *ArticleParser) Parse(htmlContent, domain, sourceURL string) (*ParsedContent, error) {
	var rule *ParsingRule
	for ruleDomain, r := range p.rules {
		if strings.Contains(domain, ruleDomain) {
			rule = r
			break
		}
	}

	if rule == nil {
		return nil, fmt.Errorf("no parsing rule found for domain: %s", domain)
	}

	doc, err := htmlquery.Parse(strings.NewReader(htmlContent))
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %w", err)
	}

	content := &ParsedContent{URL: sourceURL}

	if rule.Title != "" {
		if titleNode := htmlquery.FindOne(doc, rule.Title); titleNode != nil {
			content.Title = strings.TrimSpace(htmlquery.InnerText(titleNode))
		}
	}

	if rule.Author != "" {
		if authorNode := htmlquery.FindOne(doc, rule.Author); authorNode != nil {
			content.Author = strings.TrimSpace(htmlquery.InnerText(authorNode))
		}
	}

	if rule.Date != "" {
		if dateNode := htmlquery.FindOne(doc, rule.Date); dateNode != nil {
			content.Date = strings.TrimSpace(htmlquery.InnerText(dateNode))
		}
	}

	if rule.Body != "" {
		if bodyNode := htmlquery.FindOne(doc, rule.Body); bodyNode != nil {
			for _, stripXPath := range rule.Strip {
				stripNodes := htmlquery.Find(bodyNode, stripXPath)
				for _, node := range stripNodes {
					node.Parent.RemoveChild(node)
				}
			}

			content.Content = strings.TrimSpace(htmlquery.InnerText(bodyNode))
		}
	}

	if content.Title == "" {
		return nil, fmt.Errorf("could not extract title from HTML")
	}

	return content, nil
}

// Convert HTML content directly to markdown using domain-specific rules
func (p *ArticleParser) Convert(htmlContent, domain, sourceURL string) (string, error) {
	content, err := p.Parse(htmlContent, domain, sourceURL)
	if err != nil {
		return "", err
	}

	return p.createMarkdown(content), nil
}

// GetSupportedDomains returns a list of domains that have parsing rules
func (p *ArticleParser) GetSupportedDomains() []string {
	var domains []string
	for domain := range p.rules {
		domains = append(domains, domain)
	}
	return domains
}

// SaveArticle saves the parsed content to filesystem and returns file paths
func (p *ArticleParser) SaveArticle(content *ParsedContent, storageDir string) (markdownPath, htmlPath string, err error) {
	if err := os.MkdirAll(storageDir, 0755); err != nil {
		return "", "", fmt.Errorf("failed to create storage directory: %w", err)
	}

	slug := p.slugify(content.Title)
	if slug == "" {
		slug = "article"
	}

	baseMarkdownPath := filepath.Join(storageDir, slug+".md")
	baseHTMLPath := filepath.Join(storageDir, slug+".html")

	markdownPath = baseMarkdownPath
	htmlPath = baseHTMLPath

	counter := 1
	for {
		if _, err := os.Stat(markdownPath); os.IsNotExist(err) {
			if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
				break
			}
		}
		markdownPath = filepath.Join(storageDir, fmt.Sprintf("%s_%d.md", slug, counter))
		htmlPath = filepath.Join(storageDir, fmt.Sprintf("%s_%d.html", slug, counter))
		counter++
	}

	markdownContent := p.createMarkdown(content)

	if err := os.WriteFile(markdownPath, []byte(markdownContent), 0644); err != nil {
		return "", "", fmt.Errorf("failed to write markdown file: %w", err)
	}

	htmlContent := p.createHTML(content, markdownContent)

	if err := os.WriteFile(htmlPath, []byte(htmlContent), 0644); err != nil {
		os.Remove(markdownPath)
		return "", "", fmt.Errorf("failed to write HTML file: %w", err)
	}

	return markdownPath, htmlPath, nil
}

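// slugify lowercases the title, collapses every run of characters outside
// [a-z0-9] into a single hyphen, trims leading and trailing hyphens, and caps
// the result at 100 characters.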
func (p *ArticleParser) slugify(title string) string {
	slug := strings.ToLower(title)

	reg := regexp.MustCompile(`[^a-z0-9]+`)
	slug = reg.ReplaceAllString(slug, "-")

	slug = strings.Trim(slug, "-")

	if len(slug) > 100 {
		slug = slug[:100]
		slug = strings.Trim(slug, "-")
	}

	return slug
}

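// createMarkdown renders the parsed article as a Markdown document: an H1
// title, optional **Author:** and **Date:** lines, the **Source:** URL, a
// **Saved:** timestamp, and a horizontal rule followed by the extracted body
// text.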
func (p *ArticleParser) createMarkdown(content *ParsedContent) string {
	var builder strings.Builder

	builder.WriteString(fmt.Sprintf("# %s\n\n", content.Title))

	if content.Author != "" {
		builder.WriteString(fmt.Sprintf("**Author:** %s\n\n", content.Author))
	}

	if content.Date != "" {
		builder.WriteString(fmt.Sprintf("**Date:** %s\n\n", content.Date))
	}

	builder.WriteString(fmt.Sprintf("**Source:** %s\n\n", content.URL))
	builder.WriteString(fmt.Sprintf("**Saved:** %s\n\n", time.Now().Format("2006-01-02 15:04:05")))

	builder.WriteString("---\n\n")
	builder.WriteString(content.Content)

	return builder.String()
}

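// createHTML converts the generated Markdown to HTML with gomarkdown and wraps
// it in a minimal standalone page with basic styling for body, pre, and
// blockquote elements.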
func (p *ArticleParser) createHTML(content *ParsedContent, markdownContent string) string {
	extensions := parser.CommonExtensions | parser.AutoHeadingIDs | parser.NoEmptyLineBeforeBlock
	mdParser := parser.NewWithExtensions(extensions)
	doc := mdParser.Parse([]byte(markdownContent))

	htmlFlags := html.CommonFlags | html.HrefTargetBlank
	opts := html.RendererOptions{Flags: htmlFlags}
	renderer := html.NewRenderer(opts)

	htmlBody := markdown.Render(doc, renderer)

	var builder strings.Builder
	builder.WriteString("<!DOCTYPE html>\n")
	builder.WriteString("<html>\n<head>\n")
	builder.WriteString(fmt.Sprintf(" <title>%s</title>\n", content.Title))
	builder.WriteString(" <meta charset=\"UTF-8\">\n")
	builder.WriteString(" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n")
	builder.WriteString(" <style>\n")
	builder.WriteString(" body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }\n")
	builder.WriteString(" pre { background-color: #f4f4f4; padding: 10px; border-radius: 4px; overflow-x: auto; }\n")
	builder.WriteString(" blockquote { border-left: 4px solid #ccc; padding-left: 16px; margin-left: 0; }\n")
	builder.WriteString(" </style>\n")
	builder.WriteString("</head>\n<body>\n")
	builder.Write(htmlBody)
	builder.WriteString("\n</body>\n</html>")

	return builder.String()
}

// CreateArticleFromURL is a convenience function that parses a URL and creates an instance of [models.Article]
func CreateArticleFromURL(url, dir string) (*models.Article, error) {
	parser, err := NewArticleParser(http.DefaultClient)
	if err != nil {
		return nil, fmt.Errorf("failed to create parser: %w", err)
	}

	content, err := parser.ParseURL(url)
	if err != nil {
		return nil, fmt.Errorf("failed to parse URL: %w", err)
	}

	mdPath, htmlPath, err := parser.SaveArticle(content, dir)
	if err != nil {
		return nil, fmt.Errorf("failed to save article: %w", err)
	}

	article := &models.Article{
		URL:          url,
		Title:        content.Title,
		Author:       content.Author,
		Date:         content.Date,
		MarkdownPath: mdPath,
		HTMLPath:     htmlPath,
		Created:      time.Now(),
		Modified:     time.Now(),
	}

	return article, nil
}
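
// The sketch below is not part of the original file; it is a minimal,
// hypothetical illustration of how the package's entry point might be called.
// The URL and storage directory are placeholders, and saveArticleExample is an
// illustrative name, not an existing function.
func saveArticleExample() {
	// Fetch, parse, and persist an article; the returned model records where
	// the Markdown and HTML copies were written.
	article, err := CreateArticleFromURL("https://example.com/some-article", "articles")
	if err != nil {
		fmt.Println("failed to save article:", err)
		return
	}
	fmt.Printf("saved %q to %s and %s\n", article.Title, article.MarkdownPath, article.HTMLPath)
}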