cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍈
charm
leaflet
readability
golang
package articles

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"os"
	"strings"
	"testing"
)

// ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules.
func ExampleParser_Convert() {
	parser, err := NewArticleParser(http.DefaultClient)
	if err != nil {
		fmt.Printf("Failed to create parser: %v\n", err)
		return
	}

	htmlPath := "examples/christopher-lloyd.html"
	htmlContent, err := os.ReadFile(htmlPath)
	if err != nil {
		fmt.Printf("Local HTML file not found: %v\n", err)
		return
	}

	markdown, err := parser.Convert(string(htmlContent), ".wikipedia.org", "https://en.wikipedia.org/wiki/Christopher_Lloyd")
	if err != nil {
		fmt.Printf("Failed to convert HTML: %v\n", err)
		return
	}

	parts := strings.Split(markdown, "\n---\n")
	if len(parts) > 0 {
		frontmatter := strings.TrimSpace(parts[0])
		lines := strings.Split(frontmatter, "\n")

		for i, line := range lines {
			if i >= 4 {
				break
			}

			if !strings.Contains(line, "**Saved:**") {
				fmt.Println(line)
			}
		}
	}

	// Output: # Christopher Lloyd
	//
	// **Source:** https://en.wikipedia.org/wiki/Christopher_Lloyd
}

func TestArticleParser(t *testing.T) {
	t.Run("New", func(t *testing.T) {
		t.Run("successfully creates parser", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}
			if parser == nil {
				t.Fatal("Expected parser to be created, got nil")
			}
			if len(parser.rules) == 0 {
				t.Error("Expected rules to be loaded")
			}
		})

		t.Run("loads expected domains", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Failed to create parser: %v", err)
			}

			domains := parser.GetSupportedDomains()
			expectedDomains := []string{".wikipedia.org", "arxiv.org", "baseballprospectus.com"}

			if len(domains) != len(expectedDomains) {
				t.Errorf("Expected %d domains, got %d", len(expectedDomains), len(domains))
			}

			domainMap := make(map[string]bool)
			for _, domain := range domains {
				domainMap[domain] = true
			}

			for _, expected := range expectedDomains {
				if !domainMap[expected] {
					t.Errorf("Expected domain %s not found in supported domains", expected)
				}
			}
		})
	})

	t.Run("parseRules", func(t *testing.T) {
		parser := &ArticleParser{rules: make(map[string]*ParsingRule)}

		t.Run("parses valid rule file", func(t *testing.T) {
			content := `title: //h1
author: //span[@class='author']
date: //time
body: //article
strip: //nav
strip: //footer
test_url: https://example.com/article`

			rule, err := parser.parseRules("example.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Domain != "example.com" {
				t.Errorf("Expected domain 'example.com', got %s", rule.Domain)
			}
			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Author != "//span[@class='author']" {
				t.Errorf("Expected author '//span[@class='author']', got %s", rule.Author)
			}
			if len(rule.Strip) != 2 {
				t.Errorf("Expected 2 strip rules, got %d", len(rule.Strip))
			}
			if len(rule.TestURLs) != 1 {
				t.Errorf("Expected 1 test URL, got %d", len(rule.TestURLs))
			}
		})

		t.Run("handles empty lines and comments", func(t *testing.T) {
			content := `# This is a comment
title: //h1

# Another comment
body: //article
`

			rule, err := parser.parseRules("test.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Body != "//article" {
				t.Errorf("Expected body '//article', got %s", rule.Body)
			}
		})
	})

	t.Run("slugify", func(t *testing.T) {
		parser := &ArticleParser{}

		testCases := []struct {
			input    string
			expected string
		}{
			{"Simple Title", "simple-title"},
			{"Title with Numbers 123", "title-with-numbers-123"},
			{"Title-with-Hyphens", "title-with-hyphens"},
			{"Title with Spaces and Multiple Spaces", "title-with-spaces-and-multiple-spaces"},
			{"Title!@#$%^&*()with Special Characters", "title-with-special-characters"},
			{"", ""}, // empty input yields an empty slug
			{strings.Repeat("a", 150), strings.Repeat("a", 100)}, // long slugs are truncated to 100 characters
		}

		for _, tc := range testCases {
			t.Run(fmt.Sprintf("slugify '%s'", tc.input), func(t *testing.T) {
				result := parser.slugify(tc.input)
				if result != tc.expected {
					t.Errorf("Expected '%s', got '%s'", tc.expected, result)
				}
			})
		}
	})

	t.Run("Convert", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		t.Run("fails with unsupported domain", func(t *testing.T) {
			htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>"
			_, err := parser.Convert(htmlContent, "unsupported.com", "https://unsupported.com/article")

			if err == nil {
				// Fatal, not Error: the next check dereferences err.
				t.Fatal("Expected error for unsupported domain")
			}
			if !strings.Contains(err.Error(), "no parsing rule found") {
				t.Errorf("Expected 'no parsing rule found' error, got %v", err)
			}
		})

		t.Run("fails with invalid HTML", func(t *testing.T) {
			invalidHTML := "<html><head><title>Test</head></body>"
			_, err := parser.Convert(invalidHTML, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")

			if err == nil {
				t.Error("Expected error for invalid HTML")
			}
		})

		t.Run("fails when no title extracted", func(t *testing.T) {
			htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>"
			_, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")

			if err == nil {
				// Fatal, not Error: the next check dereferences err.
				t.Fatal("Expected error when no title can be extracted")
			}
			if !strings.Contains(err.Error(), "could not extract title") {
				t.Errorf("Expected 'could not extract title' error, got %v", err)
			}
		})

		t.Run("successfully converts valid Wikipedia HTML", func(t *testing.T) {
			htmlContent := `<html>
				<head><title>Test Article</title></head>
				<body>
					<h1 id="firstHeading">Test Article Title</h1>
					<div id="bodyContent">
						<p>This is the main content of the article.</p>
						<div class="noprint">This should be stripped</div>
						<p>More content here.</p>
					</div>
				</body>
			</html>`

			markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if !strings.Contains(markdown, "# Test Article Title") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Source:** https://en.wikipedia.org/wiki/Test") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "This is the main content") {
				t.Error("Expected markdown to contain article content")
			}
			if strings.Contains(markdown, "This should be stripped") {
				t.Error("Expected stripped content to be removed from markdown")
			}
		})
	})

	t.Run("ParseURL", func(t *testing.T) {
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			switch {
			case strings.Contains(r.URL.Path, "404"):
				w.WriteHeader(http.StatusNotFound)
			case strings.Contains(r.URL.Path, "unsupported"):
				w.WriteHeader(http.StatusOK)
				w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
			default:
				// Return a Wikipedia-like structure for the localhost rule.
				w.WriteHeader(http.StatusOK)
				w.Write([]byte(`<html>
					<head><title>Test Article</title></head>
					<body>
						<h1 id="firstHeading">Test Wikipedia Article</h1>
						<div id="bodyContent">
							<p>This is the article content.</p>
							<div class="noprint">This gets stripped</div>
						</div>
					</body>
				</html>`))
			}
		}))
		defer server.Close()

		parser, err := NewArticleParser(server.Client())
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "127.0.0.1",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("127.0.0.1", localhostRule)

		t.Run("fails with invalid URL", func(t *testing.T) {
			_, err := parser.ParseURL("not-a-url")
			if err == nil {
				// Fatal, not Error: the next check dereferences err.
				t.Fatal("Expected error for invalid URL")
			}
			if !strings.Contains(err.Error(), "unsupported protocol scheme") {
				t.Errorf("Expected 'unsupported protocol scheme' error, got %v", err)
			}
		})

		t.Run("fails with unsupported domain", func(t *testing.T) {
			_, err := parser.ParseURL(server.URL + "/unsupported.com")
			if err == nil {
				t.Error("Expected error for unsupported domain")
			}
		})

		t.Run("fails with HTTP error", func(t *testing.T) {
			_, err := parser.ParseURL(server.URL + "/404/en.wikipedia.org/wiki/test")
			if err == nil {
				t.Error("Expected error for HTTP 404")
			}
		})
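
		t.Run("successfully parses supported URL", func(t *testing.T) {
			// Success-path sketch using the localhost rule registered above.
			// It assumes ParseURL exposes the extracted Title and Content
			// fields of ParsedContent, as the integration test at the bottom
			// of this file also relies on.
			content, err := parser.ParseURL(server.URL)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}
			if content.Title != "Test Wikipedia Article" {
				t.Errorf("Expected title 'Test Wikipedia Article', got %s", content.Title)
			}
			if strings.Contains(content.Content, "This gets stripped") {
				t.Error("Expected noprint content to be stripped")
			}
		})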
	})

	t.Run("SaveArticle", func(t *testing.T) {
		parser := &ArticleParser{}
		tempDir := t.TempDir()

		content := &ParsedContent{
			Title:   "Test Article",
			Author:  "Test Author",
			Date:    "2023-01-01",
			Content: "This is test content.",
			URL:     "https://example.com/test",
		}

		t.Run("successfully saves article", func(t *testing.T) {
			mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if _, err := os.Stat(mdPath); os.IsNotExist(err) {
				t.Error("Expected markdown file to exist")
			}
			if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
				t.Error("Expected HTML file to exist")
			}

			mdContent, err := os.ReadFile(mdPath)
			if err != nil {
				t.Fatalf("Failed to read markdown file: %v", err)
			}
			if !strings.Contains(string(mdContent), "# Test Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(string(mdContent), "**Author:** Test Author") {
				t.Error("Expected markdown to contain author")
			}

			htmlContentBytes, err := os.ReadFile(htmlPath)
			if err != nil {
				t.Fatalf("Failed to read HTML file: %v", err)
			}
			if !strings.Contains(string(htmlContentBytes), "<title>Test Article</title>") {
				t.Error("Expected HTML to contain title")
			}
		})

		t.Run("handles duplicate filenames", func(t *testing.T) {
			mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for first save, got %v", err)
			}

			mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for second save, got %v", err)
			}

			if mdPath1 == mdPath2 {
				t.Error("Expected different markdown paths for duplicate saves")
			}
			if htmlPath1 == htmlPath2 {
				t.Error("Expected different HTML paths for duplicate saves")
			}

			if _, err := os.Stat(mdPath1); os.IsNotExist(err) {
				t.Error("Expected first markdown file to exist")
			}
			if _, err := os.Stat(mdPath2); os.IsNotExist(err) {
				t.Error("Expected second markdown file to exist")
			}
		})

		t.Run("fails with invalid directory", func(t *testing.T) {
			invalidDir := "/nonexistent/directory"
			_, _, err := parser.SaveArticle(content, invalidDir)
			if err == nil {
				t.Error("Expected error for invalid directory")
			}
		})
	})

	t.Run("createHTML", func(t *testing.T) {
		parser := &ArticleParser{}
		content := &ParsedContent{
			Title:   "Test HTML Article",
			Author:  "HTML Author",
			Date:    "2023-12-25",
			Content: "This is **bold** content with *emphasis*.",
			URL:     "https://example.com/html-test",
		}

		t.Run("creates valid HTML", func(t *testing.T) {
			markdown := parser.createMarkdown(content)
			html := parser.createHTML(content, markdown)

			if !strings.Contains(html, "<!DOCTYPE html>") {
				t.Error("Expected HTML to contain DOCTYPE")
			}
			if !strings.Contains(html, "<title>Test HTML Article</title>") {
				t.Error("Expected HTML to contain title")
			}
			if !strings.Contains(html, "<h1") || !strings.Contains(html, "Test HTML Article") {
				t.Error("Expected HTML to contain h1 heading with title")
			}
			if !strings.Contains(html, "<strong>bold</strong>") {
				t.Error("Expected HTML to contain bold formatting")
			}
			if !strings.Contains(html, "<em>emphasis</em>") {
				t.Error("Expected HTML to contain emphasis formatting")
			}
		})
	})

	t.Run("createMarkdown", func(t *testing.T) {
		parser := &ArticleParser{}

		t.Run("creates markdown with all fields", func(t *testing.T) {
			content := &ParsedContent{
				Title:   "Full Content Article",
				Author:  "Complete Author",
				Date:    "2023-01-15",
				Content: "Complete article content here.",
				URL:     "https://example.com/full",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Full Content Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Author:** Complete Author") {
				t.Error("Expected markdown to contain author")
			}
			if !strings.Contains(markdown, "**Date:** 2023-01-15") {
				t.Error("Expected markdown to contain date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/full") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "**Saved:**") {
				t.Error("Expected markdown to contain saved timestamp")
			}
			if !strings.Contains(markdown, "---") {
				t.Error("Expected markdown to contain separator")
			}
			if !strings.Contains(markdown, "Complete article content here.") {
				t.Error("Expected markdown to contain article content")
			}
		})

		t.Run("creates markdown with minimal fields", func(t *testing.T) {
			content := &ParsedContent{
				Title:   "Minimal Article",
				Content: "Just content.",
				URL:     "https://example.com/minimal",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Minimal Article") {
				t.Error("Expected markdown to contain title")
			}
			if strings.Contains(markdown, "**Author:**") {
				t.Error("Expected no author field for empty author")
			}
			if strings.Contains(markdown, "**Date:**") {
				t.Error("Expected no date field for empty date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/minimal") {
				t.Error("Expected markdown to contain source URL")
			}
		})
	})
}
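
// referenceSlugify is an illustrative sketch, not the code under test: one
// plausible slugify consistent with the cases in the "slugify" subtest above
// (lowercase; runs of spaces and special characters collapse to a single
// hyphen; no leading or trailing hyphen; results truncated to 100
// characters). The actual parser.slugify implementation may differ in detail.
func referenceSlugify(s string) string {
	var b strings.Builder
	hyphen := false
	for _, r := range strings.ToLower(s) {
		switch {
		case (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9'):
			b.WriteRune(r)
			hyphen = false
		case b.Len() > 0 && !hyphen:
			// Any other rune becomes a single separating hyphen.
			b.WriteRune('-')
			hyphen = true
		}
	}
	slug := b.String()
	if len(slug) > 100 {
		slug = slug[:100]
	}
	return strings.TrimSuffix(slug, "-")
}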

func TestCreateArticleFromURL(t *testing.T) {
	tempDir := t.TempDir()

	t.Run("fails with invalid URL", func(t *testing.T) {
		_, err := CreateArticleFromURL("not-a-url", tempDir)
		if err == nil {
			t.Error("Expected error for invalid URL")
		}
	})

	t.Run("fails with unsupported domain", func(t *testing.T) {
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.WriteHeader(http.StatusOK)
			w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
		}))
		defer server.Close()

		_, err := CreateArticleFromURL(server.URL, tempDir)
		if err == nil {
			t.Error("Expected error for unsupported domain")
		}
	})

	t.Run("successfully creates article from Wikipedia-like URL", func(t *testing.T) {
		wikipediaHTML := `<html>
			<head><title>Integration Test Article</title></head>
			<body>
				<h1 id="firstHeading">Integration Test Article</h1>
				<div id="bodyContent">
					<p>This is integration test content.</p>
				</div>
			</body>
		</html>`

		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.WriteHeader(http.StatusOK)
			w.Write([]byte(wikipediaHTML))
		}))
		defer server.Close()

		// CreateArticleFromURL constructs its own parser via NewArticleParser,
		// so we cannot inject the test client or a localhost rule into it.
		// Instead, exercise the components it composes individually.
		parser, err := NewArticleParser(server.Client())
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		// Add a localhost rule for testing.
		localhostRule := &ParsingRule{
			Domain: "127.0.0.1",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("127.0.0.1", localhostRule)

		content, err := parser.ParseURL(server.URL)
		if err != nil {
			t.Fatalf("Expected no error, got %v", err)
		}

		mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
		if err != nil {
			t.Fatalf("Failed to save article: %v", err)
		}

		if content.Title != "Integration Test Article" {
			t.Errorf("Expected title 'Integration Test Article', got %s", content.Title)
		}
		if mdPath == "" {
			t.Error("Expected non-empty markdown path")
		}
		if htmlPath == "" {
			t.Error("Expected non-empty HTML path")
		}

		// Check that both files exist on disk.
		if _, err := os.Stat(mdPath); os.IsNotExist(err) {
			t.Error("Expected markdown file to exist")
		}
		if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
			t.Error("Expected HTML file to exist")
		}
	})
}