cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists
charm
leaflet
readability
golang
package articles

import (
	"bufio"
	"embed"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"time"

	"github.com/antchfx/htmlquery"
	"github.com/gomarkdown/markdown"
	"github.com/gomarkdown/markdown/html"
	"github.com/gomarkdown/markdown/parser"
	"github.com/stormlightlabs/noteleaf/internal/models"
)

//go:embed rules/*.txt
var rulesFS embed.FS

// ParsedContent represents the extracted content from a web page
type ParsedContent struct {
	Title   string
	Author  string
	Date    string
	Content string
	URL     string
}

// ParsingRule represents XPath rules for extracting content from a specific domain
type ParsingRule struct {
	Domain string
	Title  string
	Author string
	Date   string
	Body   string
	// XPath selectors for elements to remove
	Strip    []string
	TestURLs []string
}

// Parser interface defines methods for parsing articles from URLs
type Parser interface {
	// ParseURL extracts article content from a given URL
	ParseURL(url string) (*ParsedContent, error)
	// Convert HTML content directly to markdown using domain-specific rules
	Convert(htmlContent, domain, sourceURL string) (string, error)
	// GetSupportedDomains returns a list of domains that have parsing rules
	GetSupportedDomains() []string
	// SaveArticle saves the parsed content to filesystem and returns file paths
	SaveArticle(content *ParsedContent, storageDir string) (markdownPath, htmlPath string, err error)
}

// ArticleParser implements the Parser interface
type ArticleParser struct {
	rules  map[string]*ParsingRule
	client *http.Client
}

// NewArticleParser creates a new ArticleParser with the specified HTTP client and loaded rules
func NewArticleParser(client *http.Client) (*ArticleParser, error) {
	parser := &ArticleParser{
		rules:  make(map[string]*ParsingRule),
		client: client,
	}

	if err := parser.loadRules(); err != nil {
		return nil, fmt.Errorf("failed to load parsing rules: %w", err)
	}

	return parser, nil
}

// AddRule adds or replaces a parsing rule for a specific domain
func (p *ArticleParser) AddRule(domain string, rule *ParsingRule) {
	p.rules[domain] = rule
}

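// loadRules reads every embedded rules/*.txt file and registers it in the
// rule map, keyed by the file name minus its .txt extension (the domain).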
func (p *ArticleParser) loadRules() error {
	entries, err := rulesFS.ReadDir("rules")
	if err != nil {
		return fmt.Errorf("failed to read rules directory: %w", err)
	}

	for _, entry := range entries {
		if !strings.HasSuffix(entry.Name(), ".txt") {
			continue
		}

		domain := strings.TrimSuffix(entry.Name(), ".txt")

		content, err := rulesFS.ReadFile(filepath.Join("rules", entry.Name()))
		if err != nil {
			return fmt.Errorf("failed to read rule file %s: %w", entry.Name(), err)
		}

		rule, err := p.parseRules(domain, string(content))
		if err != nil {
			return fmt.Errorf("failed to parse rule file %s: %w", entry.Name(), err)
		}

		p.rules[domain] = rule
	}

	return nil
}

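// parseRules parses a rule file written as simple "key: value" lines. Blank
// lines and lines beginning with "#" are skipped. Recognized keys are title,
// author, date, and body (XPath selectors), plus the repeatable keys strip
// (XPath of elements to remove from the body) and test_url. The XPath values
// below are hypothetical; a rule file for example.com might look like:
//
//	# rules/example.com.txt
//	title: //h1[@class="headline"]
//	author: //span[@class="byline"]
//	date: //time
//	body: //article
//	strip: //aside
//	test_url: https://example.com/some-article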
func (p *ArticleParser) parseRules(domain, content string) (*ParsingRule, error) {
	rule := &ParsingRule{Domain: domain, Strip: []string{}}
	scanner := bufio.NewScanner(strings.NewReader(content))
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())

		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}

		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}

		key := strings.TrimSpace(parts[0])
		value := strings.TrimSpace(parts[1])

		switch key {
		case "title":
			rule.Title = value
		case "author":
			rule.Author = value
		case "date":
			rule.Date = value
		case "body":
			rule.Body = value
		case "strip":
			rule.Strip = append(rule.Strip, value)
		case "test_url":
			rule.TestURLs = append(rule.TestURLs, value)
		}
	}

	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading rule file: %w", err)
	}

	return rule, nil
}

// ParseURL extracts article content from a given URL
func (p *ArticleParser) ParseURL(urlStr string) (*ParsedContent, error) {
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return nil, fmt.Errorf("invalid URL: %w", err)
	}

	domain := parsedURL.Hostname()

	resp, err := p.client.Get(urlStr)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch URL: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP error: %d", resp.StatusCode)
	}

	htmlBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}

	return p.Parse(string(htmlBytes), domain, urlStr)
}

// Parse extracts article content from an HTML string using domain-specific rules
func (p *ArticleParser) Parse(htmlContent, domain, sourceURL string) (*ParsedContent, error) {
	var rule *ParsingRule
	for ruleDomain, r := range p.rules {
		if strings.Contains(domain, ruleDomain) {
			rule = r
			break
		}
	}

	if rule == nil {
		return nil, fmt.Errorf("no parsing rule found for domain: %s", domain)
	}

	doc, err := htmlquery.Parse(strings.NewReader(htmlContent))
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %w", err)
	}

	content := &ParsedContent{URL: sourceURL}

	if rule.Title != "" {
		if titleNode := htmlquery.FindOne(doc, rule.Title); titleNode != nil {
			content.Title = strings.TrimSpace(htmlquery.InnerText(titleNode))
		}
	}

	if rule.Author != "" {
		if authorNode := htmlquery.FindOne(doc, rule.Author); authorNode != nil {
			content.Author = strings.TrimSpace(htmlquery.InnerText(authorNode))
		}
	}

	if rule.Date != "" {
		if dateNode := htmlquery.FindOne(doc, rule.Date); dateNode != nil {
			content.Date = strings.TrimSpace(htmlquery.InnerText(dateNode))
		}
	}

	if rule.Body != "" {
		if bodyNode := htmlquery.FindOne(doc, rule.Body); bodyNode != nil {
			for _, stripXPath := range rule.Strip {
				stripNodes := htmlquery.Find(bodyNode, stripXPath)
				for _, node := range stripNodes {
					node.Parent.RemoveChild(node)
				}
			}

			content.Content = strings.TrimSpace(htmlquery.InnerText(bodyNode))
		}
	}

	if content.Title == "" {
		return nil, fmt.Errorf("could not extract title from HTML")
	}

	return content, nil
}

// Convert HTML content directly to markdown using domain-specific rules
func (p *ArticleParser) Convert(htmlContent, domain, sourceURL string) (string, error) {
	content, err := p.Parse(htmlContent, domain, sourceURL)
	if err != nil {
		return "", err
	}

	return p.createMarkdown(content), nil
}

// GetSupportedDomains returns a list of domains that have parsing rules
func (p *ArticleParser) GetSupportedDomains() []string {
	var domains []string
	for domain := range p.rules {
		domains = append(domains, domain)
	}
	return domains
}

// SaveArticle saves the parsed content to filesystem and returns file paths
func (p *ArticleParser) SaveArticle(content *ParsedContent, storageDir string) (markdownPath, htmlPath string, err error) {
	if err := os.MkdirAll(storageDir, 0755); err != nil {
		return "", "", fmt.Errorf("failed to create storage directory: %w", err)
	}

	slug := p.slugify(content.Title)
	if slug == "" {
		slug = "article"
	}

	baseMarkdownPath := filepath.Join(storageDir, slug+".md")
	baseHTMLPath := filepath.Join(storageDir, slug+".html")

	markdownPath = baseMarkdownPath
	htmlPath = baseHTMLPath

	counter := 1
	for {
		if _, err := os.Stat(markdownPath); os.IsNotExist(err) {
			if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
				break
			}
		}
		markdownPath = filepath.Join(storageDir, fmt.Sprintf("%s_%d.md", slug, counter))
		htmlPath = filepath.Join(storageDir, fmt.Sprintf("%s_%d.html", slug, counter))
		counter++
	}

	markdownContent := p.createMarkdown(content)

	if err := os.WriteFile(markdownPath, []byte(markdownContent), 0644); err != nil {
		return "", "", fmt.Errorf("failed to write markdown file: %w", err)
	}

	htmlContent := p.createHTML(content, markdownContent)

	if err := os.WriteFile(htmlPath, []byte(htmlContent), 0644); err != nil {
		os.Remove(markdownPath)
		return "", "", fmt.Errorf("failed to write HTML file: %w", err)
	}

	return markdownPath, htmlPath, nil
}

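// slugify lowercases the title, collapses every run of characters outside
// [a-z0-9] into a single hyphen, trims leading and trailing hyphens, and caps
// the result at 100 characters.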
func (p *ArticleParser) slugify(title string) string {
	slug := strings.ToLower(title)

	reg := regexp.MustCompile(`[^a-z0-9]+`)
	slug = reg.ReplaceAllString(slug, "-")

	slug = strings.Trim(slug, "-")

	if len(slug) > 100 {
		slug = slug[:100]
		slug = strings.Trim(slug, "-")
	}

	return slug
}

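// createMarkdown renders the parsed article as a Markdown document: an H1
// title, optional **Author:** and **Date:** lines, the **Source:** URL, a
// **Saved:** timestamp, and a horizontal rule followed by the extracted body
// text.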
func (p *ArticleParser) createMarkdown(content *ParsedContent) string {
	var builder strings.Builder

	builder.WriteString(fmt.Sprintf("# %s\n\n", content.Title))

	if content.Author != "" {
		builder.WriteString(fmt.Sprintf("**Author:** %s\n\n", content.Author))
	}

	if content.Date != "" {
		builder.WriteString(fmt.Sprintf("**Date:** %s\n\n", content.Date))
	}

	builder.WriteString(fmt.Sprintf("**Source:** %s\n\n", content.URL))
	builder.WriteString(fmt.Sprintf("**Saved:** %s\n\n", time.Now().Format("2006-01-02 15:04:05")))

	builder.WriteString("---\n\n")
	builder.WriteString(content.Content)

	return builder.String()
}

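// createHTML converts the generated Markdown to HTML with gomarkdown and wraps
// it in a minimal standalone page with basic styling for body, pre, and
// blockquote elements.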
func (p *ArticleParser) createHTML(content *ParsedContent, markdownContent string) string {
	extensions := parser.CommonExtensions | parser.AutoHeadingIDs | parser.NoEmptyLineBeforeBlock
	mdParser := parser.NewWithExtensions(extensions)
	doc := mdParser.Parse([]byte(markdownContent))

	htmlFlags := html.CommonFlags | html.HrefTargetBlank
	opts := html.RendererOptions{Flags: htmlFlags}
	renderer := html.NewRenderer(opts)

	htmlBody := markdown.Render(doc, renderer)

	var builder strings.Builder
	builder.WriteString("<!DOCTYPE html>\n")
	builder.WriteString("<html>\n<head>\n")
	builder.WriteString(fmt.Sprintf(" <title>%s</title>\n", content.Title))
	builder.WriteString(" <meta charset=\"UTF-8\">\n")
	builder.WriteString(" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n")
	builder.WriteString(" <style>\n")
	builder.WriteString(" body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }\n")
	builder.WriteString(" pre { background-color: #f4f4f4; padding: 10px; border-radius: 4px; overflow-x: auto; }\n")
	builder.WriteString(" blockquote { border-left: 4px solid #ccc; padding-left: 16px; margin-left: 0; }\n")
	builder.WriteString(" </style>\n")
	builder.WriteString("</head>\n<body>\n")
	builder.Write(htmlBody)
	builder.WriteString("\n</body>\n</html>")

	return builder.String()
}

// CreateArticleFromURL is a convenience function that parses a URL and creates an instance of [models.Article]
func CreateArticleFromURL(url, dir string) (*models.Article, error) {
	parser, err := NewArticleParser(http.DefaultClient)
	if err != nil {
		return nil, fmt.Errorf("failed to create parser: %w", err)
	}

	content, err := parser.ParseURL(url)
	if err != nil {
		return nil, fmt.Errorf("failed to parse URL: %w", err)
	}

	mdPath, htmlPath, err := parser.SaveArticle(content, dir)
	if err != nil {
		return nil, fmt.Errorf("failed to save article: %w", err)
	}

	article := &models.Article{
		URL:          url,
		Title:        content.Title,
		Author:       content.Author,
		Date:         content.Date,
		MarkdownPath: mdPath,
		HTMLPath:     htmlPath,
		Created:      time.Now(),
		Modified:     time.Now(),
	}

	return article, nil
}
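
// The sketch below is not part of the original file; it is a minimal,
// hypothetical illustration of how the package's entry point might be called.
// The URL and storage directory are placeholders, and saveArticleExample is an
// illustrative name, not an existing function.
func saveArticleExample() {
	// Fetch, parse, and persist an article; the returned model records where
	// the Markdown and HTML copies were written.
	article, err := CreateArticleFromURL("https://example.com/some-article", "articles")
	if err != nil {
		fmt.Println("failed to save article:", err)
		return
	}
	fmt.Printf("saved %q to %s and %s\n", article.Title, article.MarkdownPath, article.HTMLPath)
}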