cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm
leaflet
readability
golang
1package articles
2
3import (
4 "fmt"
5 "net/http"
6 "net/http/httptest"
7 "os"
8 "strings"
9 "testing"
10 "time"
11
12 "github.com/stormlightlabs/noteleaf/internal/models"
13)
14
15func newServerWithHtml(h string) *httptest.Server {
16 return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
17 w.WriteHeader(http.StatusOK)
18 w.Write([]byte(h))
19 }))
20}
21
22// ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules.
23func ExampleParser_Convert() {
24 parser, err := NewArticleParser(http.DefaultClient)
25 if err != nil {
26 fmt.Printf("Failed to create parser: %v\n", err)
27 return
28 }
29
30 htmlPath := "examples/christopher-lloyd.html"
31 htmlContent, err := os.ReadFile(htmlPath)
32 if err != nil {
33 fmt.Printf("Local HTML file not found: %v\n", err)
34 return
35 }
36
37 markdown, err := parser.Convert(string(htmlContent), ".wikipedia.org", "https://en.wikipedia.org/wiki/Christopher_Lloyd")
38 if err != nil {
39 fmt.Printf("Failed to convert HTML: %v\n", err)
40 return
41 }
42
43 parts := strings.Split(markdown, "\n---\n")
44 if len(parts) > 0 {
45 frontmatter := strings.TrimSpace(parts[0])
46 lines := strings.Split(frontmatter, "\n")
47
48 for i, line := range lines {
49 if i >= 4 {
50 break
51 }
52
53 if !strings.Contains(line, "**Saved:**") {
54 fmt.Println(line)
55 }
56 }
57 }
58
59 // Output: # Christopher Lloyd
60 //
61 // **Source:** https://en.wikipedia.org/wiki/Christopher_Lloyd
62}
63
// TestArticleParser exercises the ArticleParser type: construction and bundled
// rule loading, site-rule file parsing, slug generation, HTML-to-markdown
// conversion, URL fetching against an httptest server, and persistence of
// parsed articles to disk as markdown/HTML pairs.
func TestArticleParser(t *testing.T) {
	t.Run("New", func(t *testing.T) {
		t.Run("successfully creates parser", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}
			if parser == nil {
				t.Fatal("Expected parser to be created, got nil")
			}
			// A freshly constructed parser should come pre-loaded with the
			// bundled site rules.
			if len(parser.rules) == 0 {
				t.Error("Expected rules to be loaded")
			}
		})

		t.Run("loads expected domains", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Failed to create parser: %v", err)
			}

			domains := parser.GetSupportedDomains()
			// NOTE(review): this list must be kept in sync with the bundled
			// rule files — adding or removing a rule file breaks the exact
			// length check below.
			expectedDomains := []string{".wikipedia.org", "arxiv.org", "baseballprospectus.com"}

			if len(domains) != len(expectedDomains) {
				t.Errorf("Expected %d domains, got %d", len(expectedDomains), len(domains))
			}

			// Membership check is order-independent since map/slice ordering
			// of GetSupportedDomains is not guaranteed.
			domainMap := make(map[string]bool)
			for _, domain := range domains {
				domainMap[domain] = true
			}

			for _, expected := range expectedDomains {
				if !domainMap[expected] {
					t.Errorf("Expected domain %s not found in supported domains", expected)
				}
			}
		})
	})

	t.Run("parseRules", func(t *testing.T) {
		// parseRules is tested on a bare parser; it needs neither the bundled
		// rules nor an HTTP client.
		parser := &ArticleParser{rules: make(map[string]*ParsingRule)}

		t.Run("parses valid rule file", func(t *testing.T) {
			// Repeated "strip:" entries accumulate into rule.Strip;
			// "test_url:" entries are collected into rule.TestURLs.
			content := `title: //h1
author: //span[@class='author']
date: //time
body: //article
strip: //nav
strip: //footer
test_url: https://example.com/article`

			rule, err := parser.parseRules("example.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Domain != "example.com" {
				t.Errorf("Expected domain 'example.com', got %s", rule.Domain)
			}
			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Author != "//span[@class='author']" {
				t.Errorf("Expected author '//span[@class='author']', got %s", rule.Author)
			}
			if len(rule.Strip) != 2 {
				t.Errorf("Expected 2 strip rules, got %d", len(rule.Strip))
			}
			if len(rule.TestURLs) != 1 {
				t.Errorf("Expected 1 test URL, got %d", len(rule.TestURLs))
			}
		})

		t.Run("handles empty lines and comments", func(t *testing.T) {
			// Lines beginning with '#' and blank lines must be ignored.
			content := `# This is a comment
title: //h1

# Another comment
body: //article
`

			rule, err := parser.parseRules("test.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Body != "//article" {
				t.Errorf("Expected body '//article', got %s", rule.Body)
			}
		})
	})

	t.Run("slugify", func(t *testing.T) {
		parser := &ArticleParser{}

		tc := []struct {
			input    string
			expected string
		}{
			{"Simple Title", "simple-title"},
			{"Title with Numbers 123", "title-with-numbers-123"},
			{"Title-with-Hyphens", "title-with-hyphens"},
			{"Title with Spaces and Multiple Spaces", "title-with-spaces-and-multiple-spaces"},
			{"Title!@#$%^&*()with Special Characters", "title-with-special-characters"},
			{"", ""},
			// Slugs are truncated to at most 100 characters.
			{strings.Repeat("a", 150), strings.Repeat("a", 100)},
		}

		for _, tt := range tc {
			t.Run(fmt.Sprintf("slugify '%s'", tt.input), func(t *testing.T) {
				result := parser.slugify(tt.input)
				if result != tt.expected {
					t.Errorf("Expected '%s', got '%s'", tt.expected, result)
				}
			})
		}
	})

	t.Run("Convert", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		t.Run("fails with unsupported domain", func(t *testing.T) {
			htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>"
			_, err := parser.Convert(htmlContent, "unsupported.com", "https://unsupported.com/article")

			if err == nil {
				t.Error("Expected error for unsupported domain")
			}
			if !strings.Contains(err.Error(), "no parsing rule found") {
				t.Errorf("Expected 'no parsing rule found' error, got %v", err)
			}
		})

		t.Run("fails with invalid HTML", func(t *testing.T) {
			// Mismatched tags: the title is never closed and body never opened.
			invalidHTML := "<html><head><title>Test</head></body>"
			_, err := parser.Convert(invalidHTML, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")

			if err == nil {
				t.Error("Expected error for invalid HTML")
			}
		})

		t.Run("fails when no title extracted", func(t *testing.T) {
			// Valid HTML, but without the #firstHeading element the Wikipedia
			// rule's title XPath matches nothing.
			htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>"
			_, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")

			if err == nil {
				t.Error("Expected error when no title can be extracted")
			}
			if !strings.Contains(err.Error(), "could not extract title") &&
				!strings.Contains(err.Error(), "could not extract body content") {
				t.Errorf("Expected title or body extraction error, got %v", err)
			}
		})

		t.Run("successfully converts valid Wikipedia HTML", func(t *testing.T) {
			htmlContent := `<html>
			<head><title>Test Article</title></head>
			<body>
				<h1 id="firstHeading">Test Article Title</h1>
				<div id="bodyContent">
					<style>.mw-parser-output .hatnote{font-style:italic;}</style>
					<p>This is the main content of the article.</p>
					<div class="noprint">This should be stripped</div>
					<div class="editsection">Edit this section</div>
					<p>More content here.</p>
				</div>
			</body>
			</html>`

			markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if !strings.Contains(markdown, "# Test Article Title") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Source:** https://en.wikipedia.org/wiki/Test") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "This is the main content") {
				t.Error("Expected markdown to contain article content")
			}
			// Elements matched by the rule's strip XPaths, <style> content,
			// and edit-section markers must not survive conversion.
			if strings.Contains(markdown, "This should be stripped") {
				t.Error("Expected stripped content to be removed from markdown")
			}
			if strings.Contains(markdown, ".mw-parser-output") {
				t.Error("Expected style content to be removed from markdown")
			}
			if strings.Contains(markdown, "Edit this section") {
				t.Error("Expected edit section markers to be removed from markdown")
			}
		})

		t.Run("strips Wikipedia navigation boxes and metadata", func(t *testing.T) {
			// Exercises the full set of Wikipedia chrome that must be
			// stripped: navbox tables/divs, vertical navboxes, edit-section
			// spans, role=navigation containers, category links, and footer.
			htmlContent := `<html>
			<head><title>Test Navigation Article</title></head>
			<body>
				<h1 id="firstHeading">Test Navigation Article</h1>
				<div id="bodyContent">
					<p>Main article content goes here.</p>
					<h2>Section One<span class="mw-editsection">[edit]</span></h2>
					<p>Section content.</p>
					<table class="navbox" role="navigation">
						<tr><td>Navigation item 1</td></tr>
						<tr><td>Navigation item 2</td></tr>
					</table>
					<div class="navbox">
						<p>Another navigation box</p>
					</div>
					<table class="vertical-navbox">
						<tr><td>Vertical nav item</td></tr>
					</table>
					<p>More article content.</p>
					<div role="navigation">
						<p>Navigation content</p>
					</div>
					<div id="catlinks">
						<p>Categories: Test Category</p>
					</div>
					<div id="footer">
						<p>Retrieved from Wikipedia</p>
					</div>
				</div>
			</body>
			</html>`

			markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test_Navigation")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if !strings.Contains(markdown, "Main article content") {
				t.Error("Expected markdown to contain main article content")
			}
			if !strings.Contains(markdown, "Section content") {
				t.Error("Expected markdown to contain section content")
			}
			if !strings.Contains(markdown, "More article content") {
				t.Error("Expected markdown to contain additional content")
			}

			if strings.Contains(markdown, "Navigation item") {
				t.Error("Expected navbox table content to be stripped")
			}
			if strings.Contains(markdown, "Another navigation box") {
				t.Error("Expected navbox div content to be stripped")
			}
			if strings.Contains(markdown, "Vertical nav item") {
				t.Error("Expected vertical-navbox content to be stripped")
			}
			if strings.Contains(markdown, "[edit]") {
				t.Error("Expected edit section markers to be stripped")
			}
			if strings.Contains(markdown, "Navigation content") {
				t.Error("Expected role=navigation content to be stripped")
			}
			if strings.Contains(markdown, "Categories:") {
				t.Error("Expected category links to be stripped")
			}
			if strings.Contains(markdown, "Retrieved from") {
				t.Error("Expected footer content to be stripped")
			}
		})
	})

	t.Run("ParseURL", func(t *testing.T) {
		// One server serves three behaviors keyed off the request path:
		// a 404, an unsupported-domain page, and a well-formed article.
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			switch {
			case strings.Contains(r.URL.Path, "404"):
				w.WriteHeader(http.StatusNotFound)
			case strings.Contains(r.URL.Path, "unsupported"):
				w.WriteHeader(http.StatusOK)
				w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
			default:
				w.WriteHeader(http.StatusOK)
				w.Write([]byte(`<html>
				<head><title>Test Article</title></head>
				<body>
					<h1 id="firstHeading">Test Wikipedia Article</h1>
					<div id="bodyContent">
						<p>This is the article content.</p>
						<div class="noprint">This gets stripped</div>
					</div>
				</body>
				</html>`))
			}
		}))
		defer server.Close()

		parser, err := NewArticleParser(server.Client())
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		// Register a rule for 127.0.0.1 so URLs pointing at the httptest
		// server resolve to a supported domain.
		localhostRule := &ParsingRule{
			Domain: "127.0.0.1",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("127.0.0.1", localhostRule)

		t.Run("fails with invalid URL", func(t *testing.T) {
			// A scheme-less URL makes http.Client reject the request.
			_, err := parser.ParseURL("not-a-url")
			if err == nil {
				t.Error("Expected error for invalid URL")
			}
			if !strings.Contains(err.Error(), "unsupported protocol scheme") {
				t.Errorf("Expected 'unsupported protocol scheme' error, got %v", err)
			}
		})

		t.Run("fails with unsupported domain", func(t *testing.T) {
			_, err := parser.ParseURL(server.URL + "/unsupported.com")
			if err == nil {
				t.Error("Expected error for unsupported domain")
			}
		})

		t.Run("fails with HTTP error", func(t *testing.T) {
			_, err := parser.ParseURL(server.URL + "/404/en.wikipedia.org/wiki/test")
			if err == nil {
				t.Error("Expected error for HTTP 404")
			}
		})

	})

	t.Run("SaveArticle", func(t *testing.T) {
		parser := &ArticleParser{}
		tempDir := t.TempDir()

		content := &ParsedContent{
			Title:   "Test Article",
			Author:  "Test Author",
			Date:    "2023-01-01",
			Content: "This is test content.",
			URL:     "https://example.com/test",
		}

		t.Run("successfully saves article", func(t *testing.T) {
			mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if _, err := os.Stat(mdPath); os.IsNotExist(err) {
				t.Error("Expected markdown file to exist")
			}
			if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
				t.Error("Expected HTML file to exist")
			}

			mdContent, err := os.ReadFile(mdPath)
			if err != nil {
				t.Fatalf("Failed to read markdown file: %v", err)
			}
			if !strings.Contains(string(mdContent), "# Test Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(string(mdContent), "**Author:** Test Author") {
				t.Error("Expected markdown to contain author")
			}

			htmlContentBytes, err := os.ReadFile(htmlPath)
			if err != nil {
				t.Fatalf("Failed to read HTML file: %v", err)
			}
			if !strings.Contains(string(htmlContentBytes), "<title>Test Article</title>") {
				t.Error("Expected HTML to contain title")
			}
		})

		t.Run("handles duplicate filenames", func(t *testing.T) {
			// Saving the same content twice must yield distinct paths
			// (the saver is expected to de-duplicate filenames) while
			// leaving both sets of files on disk.
			mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for first save, got %v", err)
			}

			mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for second save, got %v", err)
			}

			if mdPath1 == mdPath2 {
				t.Error("Expected different markdown paths for duplicate saves")
			}
			if htmlPath1 == htmlPath2 {
				t.Error("Expected different HTML paths for duplicate saves")
			}

			if _, err := os.Stat(mdPath1); os.IsNotExist(err) {
				t.Error("Expected first markdown file to exist")
			}
			if _, err := os.Stat(mdPath2); os.IsNotExist(err) {
				t.Error("Expected second markdown file to exist")
			}
		})

		t.Run("fails with invalid directory", func(t *testing.T) {
			invalidDir := "/nonexistent/directory"
			_, _, err := parser.SaveArticle(content, invalidDir)
			if err == nil {
				t.Error("Expected error for invalid directory")
			}
		})
	})

	t.Run("createHTML", func(t *testing.T) {
		parser := &ArticleParser{}
		content := &ParsedContent{
			Title:   "Test HTML Article",
			Author:  "HTML Author",
			Date:    "2023-12-25",
			Content: "This is **bold** content with *emphasis*.",
			URL:     "https://example.com/html-test",
		}

		t.Run("creates valid HTML", func(t *testing.T) {
			// createHTML renders the markdown produced by createMarkdown,
			// so markdown emphasis must round-trip to <strong>/<em>.
			markdown := parser.createMarkdown(content)
			html := parser.createHTML(content, markdown)

			if !strings.Contains(html, "<!DOCTYPE html>") {
				t.Error("Expected HTML to contain DOCTYPE")
			}
			if !strings.Contains(html, "<title>Test HTML Article</title>") {
				t.Error("Expected HTML to contain title")
			}
			if !strings.Contains(html, "<h1") || !strings.Contains(html, "Test HTML Article") {
				t.Error("Expected HTML to contain h1 heading with title")
			}
			if !strings.Contains(html, "<strong>bold</strong>") {
				t.Error("Expected HTML to contain bold formatting")
			}
			if !strings.Contains(html, "<em>emphasis</em>") {
				t.Error("Expected HTML to contain emphasis formatting")
			}
		})
	})

	t.Run("createMarkdown", func(t *testing.T) {
		parser := &ArticleParser{}

		t.Run("creates markdown with all fields", func(t *testing.T) {
			content := &ParsedContent{
				Title:   "Full Content Article",
				Author:  "Complete Author",
				Date:    "2023-01-15",
				Content: "Complete article content here.",
				URL:     "https://example.com/full",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Full Content Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Author:** Complete Author") {
				t.Error("Expected markdown to contain author")
			}
			if !strings.Contains(markdown, "**Date:** 2023-01-15") {
				t.Error("Expected markdown to contain date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/full") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "**Saved:**") {
				t.Error("Expected markdown to contain saved timestamp")
			}
			if !strings.Contains(markdown, "---") {
				t.Error("Expected markdown to contain separator")
			}
			if !strings.Contains(markdown, "Complete article content here.") {
				t.Error("Expected markdown to contain article content")
			}
		})

		t.Run("creates markdown with minimal fields", func(t *testing.T) {
			// Optional frontmatter fields (author, date) must be omitted
			// entirely when empty rather than rendered blank.
			content := &ParsedContent{
				Title:   "Minimal Article",
				Content: "Just content.",
				URL:     "https://example.com/minimal",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Minimal Article") {
				t.Error("Expected markdown to contain title")
			}
			if strings.Contains(markdown, "**Author:**") {
				t.Error("Expected no author field for empty author")
			}
			if strings.Contains(markdown, "**Date:**") {
				t.Error("Expected no date field for empty date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/minimal") {
				t.Error("Expected markdown to contain source URL")
			}
		})
	})
}
575
// TestCreateArticleFromURL covers the package-level CreateArticleFromURL
// entry point's failure modes, then drives parser.ParseURL + SaveArticle
// end to end against httptest servers and checks the resulting
// models.Article fields and on-disk files.
func TestCreateArticleFromURL(t *testing.T) {
	tempDir := t.TempDir()

	t.Run("fails with invalid URL", func(t *testing.T) {
		_, err := CreateArticleFromURL("not-a-url", tempDir)
		if err == nil {
			t.Error("Expected error for invalid URL")
		}
		if !strings.Contains(err.Error(), "invalid URL") && !strings.Contains(err.Error(), "failed to parse URL") {
			t.Errorf("Expected URL parsing error, got %v", err)
		}
	})

	t.Run("fails with empty URL", func(t *testing.T) {
		_, err := CreateArticleFromURL("", tempDir)
		if err == nil {
			t.Error("Expected error for empty URL")
		}
	})

	t.Run("fails with unsupported domain", func(t *testing.T) {
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.WriteHeader(http.StatusOK)
			w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
		}))
		defer server.Close()

		// server.URL's host is 127.0.0.1, which has no bundled parsing rule.
		_, err := CreateArticleFromURL(server.URL, tempDir)
		if err == nil {
			t.Error("Expected error for unsupported domain")
		}
		if !strings.Contains(err.Error(), "no parsing rule found") {
			t.Errorf("Expected 'no parsing rule found' error, got %v", err)
		}
	})

	t.Run("fails with HTTP error", func(t *testing.T) {
		// NOTE(review): this local 404 server is never actually hit — the
		// request below goes to the live en.wikipedia.org, so this subtest
		// depends on network access and on Wikipedia returning 404 for the
		// made-up page. Consider routing through server.URL instead.
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.WriteHeader(http.StatusNotFound)
		}))
		defer server.Close()

		_, err := CreateArticleFromURL("https://en.wikipedia.org/wiki/NonExistentPage12345", tempDir)
		if err == nil {
			t.Error("Expected error for HTTP 404")
		}
		if !strings.Contains(err.Error(), "HTTP error") && !strings.Contains(err.Error(), "404") {
			t.Errorf("Expected HTTP error, got %v", err)
		}
	})

	t.Run("fails with network error", func(t *testing.T) {
		// NOTE(review): port 99999 exceeds 65535, so the dial fails with an
		// "invalid port" address error rather than a connection refusal; the
		// assertion passes only because the error is wrapped with
		// "failed to fetch URL" — confirm that wrapping is intended.
		_, err := CreateArticleFromURL("http://localhost:99999/test", tempDir)
		if err == nil {
			t.Error("Expected error for network failure")
		}
		if !strings.Contains(err.Error(), "failed to fetch URL") && !strings.Contains(err.Error(), "connection refused") {
			t.Errorf("Expected network error, got %v", err)
		}
	})

	t.Run("fails with malformed HTML", func(t *testing.T) {
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.WriteHeader(http.StatusOK)
			// Mismatched tags: title never closed, body never opened.
			w.Write([]byte("<html><head><title>Test</head></body>"))
		}))
		defer server.Close()

		parser, err := NewArticleParser(server.Client())
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		// Register a rule for 127.0.0.1 so the httptest server's host is
		// treated as a supported domain.
		localhostRule := &ParsingRule{
			Domain: "127.0.0.1",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("127.0.0.1", localhostRule)

		_, err = parser.ParseURL(server.URL)
		if err == nil {
			t.Error("Expected error for malformed HTML")
		}
		if !strings.Contains(err.Error(), "failed to parse HTML") &&
			!strings.Contains(err.Error(), "could not extract title") &&
			!strings.Contains(err.Error(), "could not extract body content") {
			t.Errorf("Expected HTML parsing or extraction error, got %v", err)
		}
	})

	t.Run("fails when no title can be extracted", func(t *testing.T) {
		// Well-formed page, but without #firstHeading the rule's title
		// XPath has no match.
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.WriteHeader(http.StatusOK)
			w.Write([]byte(`<html>
			<head><title>Test</title></head>
			<body>
				<div id="bodyContent">
					<p>Content without proper title</p>
				</div>
			</body>
			</html>`))
		}))
		defer server.Close()

		parser, err := NewArticleParser(server.Client())
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "127.0.0.1",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("127.0.0.1", localhostRule)

		_, err = parser.ParseURL(server.URL)
		if err == nil {
			t.Error("Expected error when no title can be extracted")
		}
		if !strings.Contains(err.Error(), "could not extract title") {
			t.Errorf("Expected 'could not extract title' error, got %v", err)
		}
	})

	t.Run("successfully creates article structure from parsed content", func(t *testing.T) {
		wikipediaHTML := `<html>
		<head><title>Integration Test Article</title></head>
		<body>
			<h1 id="firstHeading">Integration Test Article</h1>
			<div id="bodyContent">
				<p>This is integration test content.</p>
				<div class="noprint">This should be stripped</div>
				<p>More content here.</p>
			</div>
		</body>
		</html>`

		server := newServerWithHtml(wikipediaHTML)
		defer server.Close()

		parser, err := NewArticleParser(server.Client())
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "127.0.0.1",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("127.0.0.1", localhostRule)

		content, err := parser.ParseURL(server.URL)
		if err != nil {
			t.Fatalf("Expected no error, got %v", err)
		}

		mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
		if err != nil {
			t.Fatalf("Failed to save article: %v", err)
		}

		// Assemble the model the CLI would persist and verify every field
		// is populated from the parse/save results.
		article := &models.Article{
			URL:          server.URL,
			Title:        content.Title,
			MarkdownPath: mdPath,
			HTMLPath:     htmlPath,
			Created:      time.Now(),
			Modified:     time.Now(),
		}

		if article.Title != "Integration Test Article" {
			t.Errorf("Expected title 'Integration Test Article', got %s", article.Title)
		}
		if article.URL != server.URL {
			t.Errorf("Expected URL %s, got %s", server.URL, article.URL)
		}
		if article.MarkdownPath == "" {
			t.Error("Expected non-empty markdown path")
		}
		if article.HTMLPath == "" {
			t.Error("Expected non-empty HTML path")
		}
		if article.Created.IsZero() {
			t.Error("Expected Created timestamp to be set")
		}
		if article.Modified.IsZero() {
			t.Error("Expected Modified timestamp to be set")
		}

		if _, err := os.Stat(article.MarkdownPath); os.IsNotExist(err) {
			t.Error("Expected markdown file to exist")
		}
		if _, err := os.Stat(article.HTMLPath); os.IsNotExist(err) {
			t.Error("Expected HTML file to exist")
		}

		mdContent, err := os.ReadFile(article.MarkdownPath)
		if err != nil {
			t.Fatalf("Failed to read markdown file: %v", err)
		}
		if !strings.Contains(string(mdContent), "# Integration Test Article") {
			t.Error("Expected markdown to contain title")
		}
		if !strings.Contains(string(mdContent), "This is integration test content") {
			t.Error("Expected markdown to contain article content")
		}
		if strings.Contains(string(mdContent), "This should be stripped") {
			t.Error("Expected stripped content to be removed from markdown")
		}

		htmlContent, err := os.ReadFile(article.HTMLPath)
		if err != nil {
			t.Fatalf("Failed to read HTML file: %v", err)
		}
		if !strings.Contains(string(htmlContent), "<title>Integration Test Article</title>") {
			t.Error("Expected HTML to contain title")
		}
		if !strings.Contains(string(htmlContent), "<!DOCTYPE html>") {
			t.Error("Expected HTML to contain DOCTYPE")
		}
	})

	t.Run("successfully handles article with metadata", func(t *testing.T) {
		// Exercises author/date extraction from <meta> tags via XPath
		// attribute selectors (.../@content).
		contentHTML := `<html>
		<head>
			<title>Test Paper</title>
			<meta name="citation_author" content="Dr. Test Author">
			<meta name="citation_date" content="2024-01-01">
		</head>
		<body>
			<h1 class="title">Test Research Paper</h1>
			<blockquote class="abstract">
				<p>This is the abstract of the research paper.</p>
				<p>It contains important research findings.</p>
			</blockquote>
		</body>
		</html>`

		server := newServerWithHtml(contentHTML)
		defer server.Close()

		parser, err := NewArticleParser(server.Client())
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		// The contains(concat(...)) idiom matches a class token even when
		// the attribute holds multiple space-separated classes.
		localhostRule := &ParsingRule{
			Domain: "127.0.0.1",
			Title:  "//h1[contains(concat(' ',normalize-space(@class),' '),' title ')]",
			Body:   "//blockquote[contains(concat(' ',normalize-space(@class),' '),' abstract ')]",
			Date:   "//meta[@name='citation_date']/@content",
			Author: "//meta[@name='citation_author']/@content",
		}
		parser.AddRule("127.0.0.1", localhostRule)

		content, err := parser.ParseURL(server.URL)
		if err != nil {
			t.Fatalf("Expected no error, got %v", err)
		}

		if content.Title != "Test Research Paper" {
			t.Errorf("Expected title 'Test Research Paper', got %s", content.Title)
		}
		if content.Author != "Dr. Test Author" {
			t.Errorf("Expected author 'Dr. Test Author', got %s", content.Author)
		}
		if content.Date != "2024-01-01" {
			t.Errorf("Expected date '2024-01-01', got %s", content.Date)
		}

		mdPath, _, err := parser.SaveArticle(content, tempDir)
		if err != nil {
			t.Fatalf("Failed to save article: %v", err)
		}

		mdContent, err := os.ReadFile(mdPath)
		if err != nil {
			t.Fatalf("Failed to read markdown file: %v", err)
		}
		if !strings.Contains(string(mdContent), "**Author:** Dr. Test Author") {
			t.Error("Expected markdown to contain author")
		}
		if !strings.Contains(string(mdContent), "**Date:** 2024-01-01") {
			t.Error("Expected markdown to contain date")
		}

		article := &models.Article{
			Author: content.Author,
			Date:   content.Date,
		}

		if article.Author != "Dr. Test Author" {
			t.Errorf("Expected article author 'Dr. Test Author', got %s", article.Author)
		}
		if article.Date != "2024-01-01" {
			t.Errorf("Expected article date '2024-01-01', got %s", article.Date)
		}
	})
}