CLI + TUI to publish to Leaflet (WIP) & manage tasks, notes & watch/read lists 🍃
charm
leaflet
readability
golang
1package articles
2
3import (
4 "fmt"
5 "net/http"
6 "net/http/httptest"
7 "os"
8 "strings"
9 "testing"
10 "time"
11
12 "github.com/stormlightlabs/noteleaf/internal/models"
13)
14
15// ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules.
16func ExampleParser_Convert() {
17 parser, err := NewArticleParser(http.DefaultClient)
18 if err != nil {
19 fmt.Printf("Failed to create parser: %v\n", err)
20 return
21 }
22
23 htmlPath := "examples/christopher-lloyd.html"
24 htmlContent, err := os.ReadFile(htmlPath)
25 if err != nil {
26 fmt.Printf("Local HTML file not found: %v\n", err)
27 return
28 }
29
30 markdown, err := parser.Convert(string(htmlContent), ".wikipedia.org", "https://en.wikipedia.org/wiki/Christopher_Lloyd")
31 if err != nil {
32 fmt.Printf("Failed to convert HTML: %v\n", err)
33 return
34 }
35
36 parts := strings.Split(markdown, "\n---\n")
37 if len(parts) > 0 {
38 frontmatter := strings.TrimSpace(parts[0])
39 lines := strings.Split(frontmatter, "\n")
40
41 for i, line := range lines {
42 if i >= 4 {
43 break
44 }
45
46 if !strings.Contains(line, "**Saved:**") {
47 fmt.Println(line)
48 }
49 }
50 }
51
52 // Output: # Christopher Lloyd
53 //
54 // **Source:** https://en.wikipedia.org/wiki/Christopher_Lloyd
55}
56
// TestArticleParser exercises the ArticleParser unit surface: construction,
// rule-file parsing, slug generation, HTML-to-markdown conversion, URL
// fetching via a local test server, and article persistence to disk.
func TestArticleParser(t *testing.T) {
	t.Run("New", func(t *testing.T) {
		t.Run("successfully creates parser", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}
			if parser == nil {
				t.Fatal("Expected parser to be created, got nil")
			}
			// A freshly constructed parser must come with bundled rules.
			if len(parser.rules) == 0 {
				t.Error("Expected rules to be loaded")
			}
		})

		t.Run("loads expected domains", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Failed to create parser: %v", err)
			}

			domains := parser.GetSupportedDomains()
			// The rule sets bundled with the parser; update this list when
			// new rule files are added.
			expectedDomains := []string{".wikipedia.org", "arxiv.org", "baseballprospectus.com"}

			if len(domains) != len(expectedDomains) {
				t.Errorf("Expected %d domains, got %d", len(expectedDomains), len(domains))
			}

			// Membership check via a set so the comparison is
			// order-independent.
			domainMap := make(map[string]bool)
			for _, domain := range domains {
				domainMap[domain] = true
			}

			for _, expected := range expectedDomains {
				if !domainMap[expected] {
					t.Errorf("Expected domain %s not found in supported domains", expected)
				}
			}
		})
	})

	t.Run("parseRules", func(t *testing.T) {
		parser := &ArticleParser{rules: make(map[string]*ParsingRule)}

		t.Run("parses valid rule file", func(t *testing.T) {
			// Rule files are "key: value" lines; "strip" and "test_url"
			// may repeat and accumulate into slices.
			content := `title: //h1
author: //span[@class='author']
date: //time
body: //article
strip: //nav
strip: //footer
test_url: https://example.com/article`

			rule, err := parser.parseRules("example.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Domain != "example.com" {
				t.Errorf("Expected domain 'example.com', got %s", rule.Domain)
			}
			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Author != "//span[@class='author']" {
				t.Errorf("Expected author '//span[@class='author']', got %s", rule.Author)
			}
			if len(rule.Strip) != 2 {
				t.Errorf("Expected 2 strip rules, got %d", len(rule.Strip))
			}
			if len(rule.TestURLs) != 1 {
				t.Errorf("Expected 1 test URL, got %d", len(rule.TestURLs))
			}
		})

		t.Run("handles empty lines and comments", func(t *testing.T) {
			// Blank lines and '#' comments must be ignored by the parser.
			content := `# This is a comment
title: //h1

# Another comment
body: //article
`

			rule, err := parser.parseRules("test.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Body != "//article" {
				t.Errorf("Expected body '//article', got %s", rule.Body)
			}
		})
	})

	t.Run("slugify", func(t *testing.T) {
		parser := &ArticleParser{}

		// Table of input titles and their expected filesystem-safe slugs,
		// including edge cases: empty string and truncation at 100 chars.
		testCases := []struct {
			input    string
			expected string
		}{
			{"Simple Title", "simple-title"},
			{"Title with Numbers 123", "title-with-numbers-123"},
			{"Title-with-Hyphens", "title-with-hyphens"},
			{"Title with Spaces and Multiple Spaces", "title-with-spaces-and-multiple-spaces"},
			{"Title!@#$%^&*()with Special Characters", "title-with-special-characters"},
			{"", ""},
			{strings.Repeat("a", 150), strings.Repeat("a", 100)},
		}

		for _, tc := range testCases {
			t.Run(fmt.Sprintf("slugify '%s'", tc.input), func(t *testing.T) {
				result := parser.slugify(tc.input)
				if result != tc.expected {
					t.Errorf("Expected '%s', got '%s'", tc.expected, result)
				}
			})
		}
	})

	t.Run("Convert", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		t.Run("fails with unsupported domain", func(t *testing.T) {
			htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>"
			_, err := parser.Convert(htmlContent, "unsupported.com", "https://unsupported.com/article")

			if err == nil {
				t.Error("Expected error for unsupported domain")
			}
			if !strings.Contains(err.Error(), "no parsing rule found") {
				t.Errorf("Expected 'no parsing rule found' error, got %v", err)
			}
		})

		t.Run("fails with invalid HTML", func(t *testing.T) {
			// Unbalanced tags: closing body without ever opening it.
			invalidHTML := "<html><head><title>Test</head></body>"
			_, err := parser.Convert(invalidHTML, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")

			if err == nil {
				t.Error("Expected error for invalid HTML")
			}
		})

		t.Run("fails when no title extracted", func(t *testing.T) {
			// Valid HTML, but nothing matching the Wikipedia title XPath.
			htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>"
			_, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")

			if err == nil {
				t.Error("Expected error when no title can be extracted")
			}
			if !strings.Contains(err.Error(), "could not extract title") {
				t.Errorf("Expected 'could not extract title' error, got %v", err)
			}
		})

		t.Run("successfully converts valid Wikipedia HTML", func(t *testing.T) {
			// Minimal page matching the Wikipedia rule: firstHeading title,
			// bodyContent body, and a .noprint div that must be stripped.
			htmlContent := `<html>
<head><title>Test Article</title></head>
<body>
	<h1 id="firstHeading">Test Article Title</h1>
	<div id="bodyContent">
		<p>This is the main content of the article.</p>
		<div class="noprint">This should be stripped</div>
		<p>More content here.</p>
	</div>
</body>
</html>`

			markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if !strings.Contains(markdown, "# Test Article Title") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Source:** https://en.wikipedia.org/wiki/Test") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "This is the main content") {
				t.Error("Expected markdown to contain article content")
			}
			if strings.Contains(markdown, "This should be stripped") {
				t.Error("Expected stripped content to be removed from markdown")
			}
		})
	})

	t.Run("ParseURL", func(t *testing.T) {
		// Local server simulating three cases by path: 404 responses,
		// pages with no matching rule content, and a Wikipedia-like page.
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			switch {
			case strings.Contains(r.URL.Path, "404"):
				w.WriteHeader(http.StatusNotFound)
			case strings.Contains(r.URL.Path, "unsupported"):
				w.WriteHeader(http.StatusOK)
				w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
			default:
				// Return Wikipedia-like structure for localhost rule
				w.WriteHeader(http.StatusOK)
				w.Write([]byte(`<html>
<head><title>Test Article</title></head>
<body>
	<h1 id="firstHeading">Test Wikipedia Article</h1>
	<div id="bodyContent">
		<p>This is the article content.</p>
		<div class="noprint">This gets stripped</div>
	</div>
</body>
</html>`))
			}
		}))
		defer server.Close()

		parser, err := NewArticleParser(server.Client())
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		// Register a rule for the test server's host so ParseURL accepts it.
		localhostRule := &ParsingRule{
			Domain: "127.0.0.1",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("127.0.0.1", localhostRule)

		t.Run("fails with invalid URL", func(t *testing.T) {
			_, err := parser.ParseURL("not-a-url")
			if err == nil {
				t.Error("Expected error for invalid URL")
			}
			if !strings.Contains(err.Error(), "unsupported protocol scheme") {
				t.Errorf("Expected 'unsupported protocol scheme' error, got %v", err)
			}
		})

		t.Run("fails with unsupported domain", func(t *testing.T) {
			_, err := parser.ParseURL(server.URL + "/unsupported.com")
			if err == nil {
				t.Error("Expected error for unsupported domain")
			}
		})

		t.Run("fails with HTTP error", func(t *testing.T) {
			_, err := parser.ParseURL(server.URL + "/404/en.wikipedia.org/wiki/test")
			if err == nil {
				t.Error("Expected error for HTTP 404")
			}
		})

	})

	t.Run("SaveArticle", func(t *testing.T) {
		parser := &ArticleParser{}
		tempDir := t.TempDir()

		// Shared fixture reused across the subtests below.
		content := &ParsedContent{
			Title:   "Test Article",
			Author:  "Test Author",
			Date:    "2023-01-01",
			Content: "This is test content.",
			URL:     "https://example.com/test",
		}

		t.Run("successfully saves article", func(t *testing.T) {
			mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if _, err := os.Stat(mdPath); os.IsNotExist(err) {
				t.Error("Expected markdown file to exist")
			}
			if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
				t.Error("Expected HTML file to exist")
			}

			mdContent, err := os.ReadFile(mdPath)
			if err != nil {
				t.Fatalf("Failed to read markdown file: %v", err)
			}
			if !strings.Contains(string(mdContent), "# Test Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(string(mdContent), "**Author:** Test Author") {
				t.Error("Expected markdown to contain author")
			}

			htmlContentBytes, err := os.ReadFile(htmlPath)
			if err != nil {
				t.Fatalf("Failed to read HTML file: %v", err)
			}
			if !strings.Contains(string(htmlContentBytes), "<title>Test Article</title>") {
				t.Error("Expected HTML to contain title")
			}
		})

		t.Run("handles duplicate filenames", func(t *testing.T) {
			// Saving the same content twice must produce distinct paths
			// rather than overwriting the first save.
			mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for first save, got %v", err)
			}

			mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for second save, got %v", err)
			}

			if mdPath1 == mdPath2 {
				t.Error("Expected different markdown paths for duplicate saves")
			}
			if htmlPath1 == htmlPath2 {
				t.Error("Expected different HTML paths for duplicate saves")
			}

			if _, err := os.Stat(mdPath1); os.IsNotExist(err) {
				t.Error("Expected first markdown file to exist")
			}
			if _, err := os.Stat(mdPath2); os.IsNotExist(err) {
				t.Error("Expected second markdown file to exist")
			}
		})

		t.Run("fails with invalid directory", func(t *testing.T) {
			invalidDir := "/nonexistent/directory"
			_, _, err := parser.SaveArticle(content, invalidDir)
			if err == nil {
				t.Error("Expected error for invalid directory")
			}
		})
	})

	t.Run("createHTML", func(t *testing.T) {
		parser := &ArticleParser{}
		content := &ParsedContent{
			Title:   "Test HTML Article",
			Author:  "HTML Author",
			Date:    "2023-12-25",
			Content: "This is **bold** content with *emphasis*.",
			URL:     "https://example.com/html-test",
		}

		t.Run("creates valid HTML", func(t *testing.T) {
			// createHTML renders the markdown produced by createMarkdown,
			// so both are exercised together here.
			markdown := parser.createMarkdown(content)
			html := parser.createHTML(content, markdown)

			if !strings.Contains(html, "<!DOCTYPE html>") {
				t.Error("Expected HTML to contain DOCTYPE")
			}
			if !strings.Contains(html, "<title>Test HTML Article</title>") {
				t.Error("Expected HTML to contain title")
			}
			if !strings.Contains(html, "<h1") || !strings.Contains(html, "Test HTML Article") {
				t.Error("Expected HTML to contain h1 heading with title")
			}
			if !strings.Contains(html, "<strong>bold</strong>") {
				t.Error("Expected HTML to contain bold formatting")
			}
			if !strings.Contains(html, "<em>emphasis</em>") {
				t.Error("Expected HTML to contain emphasis formatting")
			}
		})
	})

	t.Run("createMarkdown", func(t *testing.T) {
		parser := &ArticleParser{}

		t.Run("creates markdown with all fields", func(t *testing.T) {
			content := &ParsedContent{
				Title:   "Full Content Article",
				Author:  "Complete Author",
				Date:    "2023-01-15",
				Content: "Complete article content here.",
				URL:     "https://example.com/full",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Full Content Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Author:** Complete Author") {
				t.Error("Expected markdown to contain author")
			}
			if !strings.Contains(markdown, "**Date:** 2023-01-15") {
				t.Error("Expected markdown to contain date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/full") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "**Saved:**") {
				t.Error("Expected markdown to contain saved timestamp")
			}
			if !strings.Contains(markdown, "---") {
				t.Error("Expected markdown to contain separator")
			}
			if !strings.Contains(markdown, "Complete article content here.") {
				t.Error("Expected markdown to contain article content")
			}
		})

		t.Run("creates markdown with minimal fields", func(t *testing.T) {
			// Empty Author/Date fields must be omitted from the
			// frontmatter entirely, not rendered blank.
			content := &ParsedContent{
				Title:   "Minimal Article",
				Content: "Just content.",
				URL:     "https://example.com/minimal",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Minimal Article") {
				t.Error("Expected markdown to contain title")
			}
			if strings.Contains(markdown, "**Author:**") {
				t.Error("Expected no author field for empty author")
			}
			if strings.Contains(markdown, "**Date:**") {
				t.Error("Expected no date field for empty date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/minimal") {
				t.Error("Expected markdown to contain source URL")
			}
		})
	})
}
489
490func TestCreateArticleFromURL(t *testing.T) {
491 tempDir := t.TempDir()
492
493 t.Run("fails with invalid URL", func(t *testing.T) {
494 _, err := CreateArticleFromURL("not-a-url", tempDir)
495 if err == nil {
496 t.Error("Expected error for invalid URL")
497 }
498 if !strings.Contains(err.Error(), "invalid URL") && !strings.Contains(err.Error(), "failed to parse URL") {
499 t.Errorf("Expected URL parsing error, got %v", err)
500 }
501 })
502
503 t.Run("fails with empty URL", func(t *testing.T) {
504 _, err := CreateArticleFromURL("", tempDir)
505 if err == nil {
506 t.Error("Expected error for empty URL")
507 }
508 })
509
510 t.Run("fails with unsupported domain", func(t *testing.T) {
511 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
512 w.WriteHeader(http.StatusOK)
513 w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
514 }))
515 defer server.Close()
516
517 _, err := CreateArticleFromURL(server.URL, tempDir)
518 if err == nil {
519 t.Error("Expected error for unsupported domain")
520 }
521 if !strings.Contains(err.Error(), "no parsing rule found") {
522 t.Errorf("Expected 'no parsing rule found' error, got %v", err)
523 }
524 })
525
526 t.Run("fails with HTTP error", func(t *testing.T) {
527 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
528 w.WriteHeader(http.StatusNotFound)
529 }))
530 defer server.Close()
531
532 // Use a direct Wikipedia URL that would be processed by the real function
533 _, err := CreateArticleFromURL("https://en.wikipedia.org/wiki/NonExistentPage12345", tempDir)
534 if err == nil {
535 t.Error("Expected error for HTTP 404")
536 }
537 if !strings.Contains(err.Error(), "HTTP error") && !strings.Contains(err.Error(), "404") {
538 t.Errorf("Expected HTTP error, got %v", err)
539 }
540 })
541
542 t.Run("fails with network error", func(t *testing.T) {
543 // Use a non-existent server to trigger network error
544 _, err := CreateArticleFromURL("http://localhost:99999/test", tempDir)
545 if err == nil {
546 t.Error("Expected error for network failure")
547 }
548 if !strings.Contains(err.Error(), "failed to fetch URL") && !strings.Contains(err.Error(), "connection refused") {
549 t.Errorf("Expected network error, got %v", err)
550 }
551 })
552
553 t.Run("fails with invalid directory", func(t *testing.T) {
554 // Skip this test as it would require network access to test with real URLs
555 t.Skip("Skipping invalid directory test - requires network access")
556 })
557
558 t.Run("fails with malformed HTML", func(t *testing.T) {
559 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
560 w.WriteHeader(http.StatusOK)
561 w.Write([]byte("<html><head><title>Test</head></body>")) // Malformed HTML
562 }))
563 defer server.Close()
564
565 // Create a custom parser with localhost rule for testing
566 parser, err := NewArticleParser(server.Client())
567 if err != nil {
568 t.Fatalf("Failed to create parser: %v", err)
569 }
570
571 localhostRule := &ParsingRule{
572 Domain: "127.0.0.1",
573 Title: "//h1[@id='firstHeading']",
574 Body: "//div[@id='bodyContent']",
575 Strip: []string{"//div[@class='noprint']"},
576 }
577 parser.AddRule("127.0.0.1", localhostRule)
578
579 _, err = parser.ParseURL(server.URL)
580 if err == nil {
581 t.Error("Expected error for malformed HTML")
582 }
583 // Malformed HTML may either fail to parse or fail to extract title
584 if !strings.Contains(err.Error(), "failed to parse HTML") && !strings.Contains(err.Error(), "could not extract title") {
585 t.Errorf("Expected HTML parsing or title extraction error, got %v", err)
586 }
587 })
588
589 t.Run("fails when no title can be extracted", func(t *testing.T) {
590 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
591 w.WriteHeader(http.StatusOK)
592 w.Write([]byte(`<html>
593 <head><title>Test</title></head>
594 <body>
595 <div id="bodyContent">
596 <p>Content without proper title</p>
597 </div>
598 </body>
599 </html>`)) // No h1 with id="firstHeading"
600 }))
601 defer server.Close()
602
603 // Create a custom parser with localhost rule for testing
604 parser, err := NewArticleParser(server.Client())
605 if err != nil {
606 t.Fatalf("Failed to create parser: %v", err)
607 }
608
609 localhostRule := &ParsingRule{
610 Domain: "127.0.0.1",
611 Title: "//h1[@id='firstHeading']",
612 Body: "//div[@id='bodyContent']",
613 Strip: []string{"//div[@class='noprint']"},
614 }
615 parser.AddRule("127.0.0.1", localhostRule)
616
617 _, err = parser.ParseURL(server.URL)
618 if err == nil {
619 t.Error("Expected error when no title can be extracted")
620 }
621 if !strings.Contains(err.Error(), "could not extract title") {
622 t.Errorf("Expected 'could not extract title' error, got %v", err)
623 }
624 })
625
626 t.Run("successfully creates article structure from parsed content", func(t *testing.T) {
627 wikipediaHTML := `<html>
628 <head><title>Integration Test Article</title></head>
629 <body>
630 <h1 id="firstHeading">Integration Test Article</h1>
631 <div id="bodyContent">
632 <p>This is integration test content.</p>
633 <div class="noprint">This should be stripped</div>
634 <p>More content here.</p>
635 </div>
636 </body>
637 </html>`
638
639 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
640 w.WriteHeader(http.StatusOK)
641 w.Write([]byte(wikipediaHTML))
642 }))
643 defer server.Close()
644
645 // Create a custom parser with localhost rule for testing
646 parser, err := NewArticleParser(server.Client())
647 if err != nil {
648 t.Fatalf("Failed to create parser: %v", err)
649 }
650
651 localhostRule := &ParsingRule{
652 Domain: "127.0.0.1",
653 Title: "//h1[@id='firstHeading']",
654 Body: "//div[@id='bodyContent']",
655 Strip: []string{"//div[@class='noprint']"},
656 }
657 parser.AddRule("127.0.0.1", localhostRule)
658
659 content, err := parser.ParseURL(server.URL)
660 if err != nil {
661 t.Fatalf("Expected no error, got %v", err)
662 }
663
664 mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
665 if err != nil {
666 t.Fatalf("Failed to save article: %v", err)
667 }
668
669 // Test that it creates a proper models.Article structure (simulating CreateArticleFromURL)
670 article := &models.Article{
671 URL: server.URL,
672 Title: content.Title,
673 MarkdownPath: mdPath,
674 HTMLPath: htmlPath,
675 Created: time.Now(),
676 Modified: time.Now(),
677 }
678
679 if article.Title != "Integration Test Article" {
680 t.Errorf("Expected title 'Integration Test Article', got %s", article.Title)
681 }
682 if article.URL != server.URL {
683 t.Errorf("Expected URL %s, got %s", server.URL, article.URL)
684 }
685 if article.MarkdownPath == "" {
686 t.Error("Expected non-empty markdown path")
687 }
688 if article.HTMLPath == "" {
689 t.Error("Expected non-empty HTML path")
690 }
691 if article.Created.IsZero() {
692 t.Error("Expected Created timestamp to be set")
693 }
694 if article.Modified.IsZero() {
695 t.Error("Expected Modified timestamp to be set")
696 }
697
698 // Check files exist
699 if _, err := os.Stat(article.MarkdownPath); os.IsNotExist(err) {
700 t.Error("Expected markdown file to exist")
701 }
702 if _, err := os.Stat(article.HTMLPath); os.IsNotExist(err) {
703 t.Error("Expected HTML file to exist")
704 }
705
706 // Verify file contents
707 mdContent, err := os.ReadFile(article.MarkdownPath)
708 if err != nil {
709 t.Fatalf("Failed to read markdown file: %v", err)
710 }
711 if !strings.Contains(string(mdContent), "# Integration Test Article") {
712 t.Error("Expected markdown to contain title")
713 }
714 if !strings.Contains(string(mdContent), "This is integration test content") {
715 t.Error("Expected markdown to contain article content")
716 }
717 if strings.Contains(string(mdContent), "This should be stripped") {
718 t.Error("Expected stripped content to be removed from markdown")
719 }
720
721 htmlContent, err := os.ReadFile(article.HTMLPath)
722 if err != nil {
723 t.Fatalf("Failed to read HTML file: %v", err)
724 }
725 if !strings.Contains(string(htmlContent), "<title>Integration Test Article</title>") {
726 t.Error("Expected HTML to contain title")
727 }
728 if !strings.Contains(string(htmlContent), "<!DOCTYPE html>") {
729 t.Error("Expected HTML to contain DOCTYPE")
730 }
731 })
732
733 t.Run("successfully handles article with metadata", func(t *testing.T) {
734 contentHTML := `<html>
735 <head>
736 <title>Test Paper</title>
737 <meta name="citation_author" content="Dr. Test Author">
738 <meta name="citation_date" content="2024-01-01">
739 </head>
740 <body>
741 <h1 class="title">Test Research Paper</h1>
742 <blockquote class="abstract">
743 <p>This is the abstract of the research paper.</p>
744 <p>It contains important research findings.</p>
745 </blockquote>
746 </body>
747 </html>`
748
749 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
750 w.WriteHeader(http.StatusOK)
751 w.Write([]byte(contentHTML))
752 }))
753 defer server.Close()
754
755 // Create a custom parser with arXiv-like rule for testing
756 parser, err := NewArticleParser(server.Client())
757 if err != nil {
758 t.Fatalf("Failed to create parser: %v", err)
759 }
760
761 localhostRule := &ParsingRule{
762 Domain: "127.0.0.1",
763 Title: "//h1[contains(concat(' ',normalize-space(@class),' '),' title ')]",
764 Body: "//blockquote[contains(concat(' ',normalize-space(@class),' '),' abstract ')]",
765 Date: "//meta[@name='citation_date']/@content",
766 Author: "//meta[@name='citation_author']/@content",
767 }
768 parser.AddRule("127.0.0.1", localhostRule)
769
770 content, err := parser.ParseURL(server.URL)
771 if err != nil {
772 t.Fatalf("Expected no error, got %v", err)
773 }
774
775 if content.Title != "Test Research Paper" {
776 t.Errorf("Expected title 'Test Research Paper', got %s", content.Title)
777 }
778 if content.Author != "Dr. Test Author" {
779 t.Errorf("Expected author 'Dr. Test Author', got %s", content.Author)
780 }
781 if content.Date != "2024-01-01" {
782 t.Errorf("Expected date '2024-01-01', got %s", content.Date)
783 }
784
785 mdPath, _, err := parser.SaveArticle(content, tempDir)
786 if err != nil {
787 t.Fatalf("Failed to save article: %v", err)
788 }
789
790 // Verify markdown contains all metadata
791 mdContent, err := os.ReadFile(mdPath)
792 if err != nil {
793 t.Fatalf("Failed to read markdown file: %v", err)
794 }
795 if !strings.Contains(string(mdContent), "**Author:** Dr. Test Author") {
796 t.Error("Expected markdown to contain author")
797 }
798 if !strings.Contains(string(mdContent), "**Date:** 2024-01-01") {
799 t.Error("Expected markdown to contain date")
800 }
801
802 article := &models.Article{
803 Author: content.Author,
804 Date: content.Date,
805 }
806
807 if article.Author != "Dr. Test Author" {
808 t.Errorf("Expected article author 'Dr. Test Author', got %s", article.Author)
809 }
810 if article.Date != "2024-01-01" {
811 t.Errorf("Expected article date '2024-01-01', got %s", article.Date)
812 }
813 })
814}