CLI + TUI to publish to Leaflet (WIP) & manage tasks, notes & watch/read lists 🍃
charm
leaflet
readability
golang
1package articles
2
3import (
4 "fmt"
5 "net/http"
6 "net/http/httptest"
7 "os"
8 "strings"
9 "testing"
10 "time"
11
12 "github.com/stormlightlabs/noteleaf/internal/models"
13)
14
15// ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules.
16func ExampleParser_Convert() {
17 parser, err := NewArticleParser(http.DefaultClient)
18 if err != nil {
19 fmt.Printf("Failed to create parser: %v\n", err)
20 return
21 }
22
23 htmlPath := "examples/christopher-lloyd.html"
24 htmlContent, err := os.ReadFile(htmlPath)
25 if err != nil {
26 fmt.Printf("Local HTML file not found: %v\n", err)
27 return
28 }
29
30 markdown, err := parser.Convert(string(htmlContent), ".wikipedia.org", "https://en.wikipedia.org/wiki/Christopher_Lloyd")
31 if err != nil {
32 fmt.Printf("Failed to convert HTML: %v\n", err)
33 return
34 }
35
36 parts := strings.Split(markdown, "\n---\n")
37 if len(parts) > 0 {
38 frontmatter := strings.TrimSpace(parts[0])
39 lines := strings.Split(frontmatter, "\n")
40
41 for i, line := range lines {
42 if i >= 4 {
43 break
44 }
45
46 if !strings.Contains(line, "**Saved:**") {
47 fmt.Println(line)
48 }
49 }
50 }
51
52 // Output: # Christopher Lloyd
53 //
54 // **Source:** https://en.wikipedia.org/wiki/Christopher_Lloyd
55}
56
// TestArticleParser exercises the ArticleParser unit surface: construction,
// rule-file parsing, slug generation, HTML-to-markdown conversion, URL
// fetching via a local test server, and article persistence to disk.
func TestArticleParser(t *testing.T) {
	t.Run("New", func(t *testing.T) {
		t.Run("successfully creates parser", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}
			if parser == nil {
				t.Fatal("Expected parser to be created, got nil")
			}
			// A freshly constructed parser must come with bundled rules.
			if len(parser.rules) == 0 {
				t.Error("Expected rules to be loaded")
			}
		})

		t.Run("loads expected domains", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Failed to create parser: %v", err)
			}

			domains := parser.GetSupportedDomains()
			// The rule sets bundled with the parser; update this list when
			// new rule files are added.
			expectedDomains := []string{".wikipedia.org", "arxiv.org", "baseballprospectus.com"}

			if len(domains) != len(expectedDomains) {
				t.Errorf("Expected %d domains, got %d", len(expectedDomains), len(domains))
			}

			// Membership check via a set so the comparison is
			// order-independent.
			domainMap := make(map[string]bool)
			for _, domain := range domains {
				domainMap[domain] = true
			}

			for _, expected := range expectedDomains {
				if !domainMap[expected] {
					t.Errorf("Expected domain %s not found in supported domains", expected)
				}
			}
		})
	})

	t.Run("parseRules", func(t *testing.T) {
		parser := &ArticleParser{rules: make(map[string]*ParsingRule)}

		t.Run("parses valid rule file", func(t *testing.T) {
			// Rule files are "key: value" lines; "strip" and "test_url"
			// may repeat and accumulate into slices.
			content := `title: //h1
author: //span[@class='author']
date: //time
body: //article
strip: //nav
strip: //footer
test_url: https://example.com/article`

			rule, err := parser.parseRules("example.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Domain != "example.com" {
				t.Errorf("Expected domain 'example.com', got %s", rule.Domain)
			}
			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Author != "//span[@class='author']" {
				t.Errorf("Expected author '//span[@class='author']', got %s", rule.Author)
			}
			if len(rule.Strip) != 2 {
				t.Errorf("Expected 2 strip rules, got %d", len(rule.Strip))
			}
			if len(rule.TestURLs) != 1 {
				t.Errorf("Expected 1 test URL, got %d", len(rule.TestURLs))
			}
		})

		t.Run("handles empty lines and comments", func(t *testing.T) {
			// Blank lines and '#' comments must be ignored by the parser.
			content := `# This is a comment
title: //h1

# Another comment
body: //article
`

			rule, err := parser.parseRules("test.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Body != "//article" {
				t.Errorf("Expected body '//article', got %s", rule.Body)
			}
		})
	})

	t.Run("slugify", func(t *testing.T) {
		parser := &ArticleParser{}

		// Table of input titles and their expected filesystem-safe slugs,
		// including edge cases: empty string and truncation at 100 chars.
		testCases := []struct {
			input    string
			expected string
		}{
			{"Simple Title", "simple-title"},
			{"Title with Numbers 123", "title-with-numbers-123"},
			{"Title-with-Hyphens", "title-with-hyphens"},
			{"Title with Spaces and Multiple Spaces", "title-with-spaces-and-multiple-spaces"},
			{"Title!@#$%^&*()with Special Characters", "title-with-special-characters"},
			{"", ""},
			{strings.Repeat("a", 150), strings.Repeat("a", 100)},
		}

		for _, tc := range testCases {
			t.Run(fmt.Sprintf("slugify '%s'", tc.input), func(t *testing.T) {
				result := parser.slugify(tc.input)
				if result != tc.expected {
					t.Errorf("Expected '%s', got '%s'", tc.expected, result)
				}
			})
		}
	})

	t.Run("Convert", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		t.Run("fails with unsupported domain", func(t *testing.T) {
			htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>"
			_, err := parser.Convert(htmlContent, "unsupported.com", "https://unsupported.com/article")

			if err == nil {
				t.Error("Expected error for unsupported domain")
			}
			if !strings.Contains(err.Error(), "no parsing rule found") {
				t.Errorf("Expected 'no parsing rule found' error, got %v", err)
			}
		})

		t.Run("fails with invalid HTML", func(t *testing.T) {
			// Unbalanced tags: closing body without ever opening it.
			invalidHTML := "<html><head><title>Test</head></body>"
			_, err := parser.Convert(invalidHTML, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")

			if err == nil {
				t.Error("Expected error for invalid HTML")
			}
		})

		t.Run("fails when no title extracted", func(t *testing.T) {
			// Valid HTML, but nothing matching the Wikipedia title XPath.
			htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>"
			_, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")

			if err == nil {
				t.Error("Expected error when no title can be extracted")
			}
			if !strings.Contains(err.Error(), "could not extract title") {
				t.Errorf("Expected 'could not extract title' error, got %v", err)
			}
		})

		t.Run("successfully converts valid Wikipedia HTML", func(t *testing.T) {
			// Minimal page matching the Wikipedia rule: firstHeading title,
			// bodyContent body, and a .noprint div that must be stripped.
			htmlContent := `<html>
<head><title>Test Article</title></head>
<body>
	<h1 id="firstHeading">Test Article Title</h1>
	<div id="bodyContent">
		<p>This is the main content of the article.</p>
		<div class="noprint">This should be stripped</div>
		<p>More content here.</p>
	</div>
</body>
</html>`

			markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if !strings.Contains(markdown, "# Test Article Title") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Source:** https://en.wikipedia.org/wiki/Test") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "This is the main content") {
				t.Error("Expected markdown to contain article content")
			}
			if strings.Contains(markdown, "This should be stripped") {
				t.Error("Expected stripped content to be removed from markdown")
			}
		})
	})

	t.Run("ParseURL", func(t *testing.T) {
		// Local server simulating three cases by path: 404 responses,
		// pages with no matching rule content, and a Wikipedia-like page.
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			switch {
			case strings.Contains(r.URL.Path, "404"):
				w.WriteHeader(http.StatusNotFound)
			case strings.Contains(r.URL.Path, "unsupported"):
				w.WriteHeader(http.StatusOK)
				w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
			default:
				// Return Wikipedia-like structure for localhost rule
				w.WriteHeader(http.StatusOK)
				w.Write([]byte(`<html>
<head><title>Test Article</title></head>
<body>
	<h1 id="firstHeading">Test Wikipedia Article</h1>
	<div id="bodyContent">
		<p>This is the article content.</p>
		<div class="noprint">This gets stripped</div>
	</div>
</body>
</html>`))
			}
		}))
		defer server.Close()

		parser, err := NewArticleParser(server.Client())
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		// Register a rule for the test server's host so ParseURL accepts it.
		localhostRule := &ParsingRule{
			Domain: "127.0.0.1",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("127.0.0.1", localhostRule)

		t.Run("fails with invalid URL", func(t *testing.T) {
			_, err := parser.ParseURL("not-a-url")
			if err == nil {
				t.Error("Expected error for invalid URL")
			}
			if !strings.Contains(err.Error(), "unsupported protocol scheme") {
				t.Errorf("Expected 'unsupported protocol scheme' error, got %v", err)
			}
		})

		t.Run("fails with unsupported domain", func(t *testing.T) {
			_, err := parser.ParseURL(server.URL + "/unsupported.com")
			if err == nil {
				t.Error("Expected error for unsupported domain")
			}
		})

		t.Run("fails with HTTP error", func(t *testing.T) {
			_, err := parser.ParseURL(server.URL + "/404/en.wikipedia.org/wiki/test")
			if err == nil {
				t.Error("Expected error for HTTP 404")
			}
		})

	})

	t.Run("SaveArticle", func(t *testing.T) {
		parser := &ArticleParser{}
		tempDir := t.TempDir()

		// Shared fixture reused across the subtests below.
		content := &ParsedContent{
			Title:   "Test Article",
			Author:  "Test Author",
			Date:    "2023-01-01",
			Content: "This is test content.",
			URL:     "https://example.com/test",
		}

		t.Run("successfully saves article", func(t *testing.T) {
			mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if _, err := os.Stat(mdPath); os.IsNotExist(err) {
				t.Error("Expected markdown file to exist")
			}
			if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
				t.Error("Expected HTML file to exist")
			}

			mdContent, err := os.ReadFile(mdPath)
			if err != nil {
				t.Fatalf("Failed to read markdown file: %v", err)
			}
			if !strings.Contains(string(mdContent), "# Test Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(string(mdContent), "**Author:** Test Author") {
				t.Error("Expected markdown to contain author")
			}

			htmlContentBytes, err := os.ReadFile(htmlPath)
			if err != nil {
				t.Fatalf("Failed to read HTML file: %v", err)
			}
			if !strings.Contains(string(htmlContentBytes), "<title>Test Article</title>") {
				t.Error("Expected HTML to contain title")
			}
		})

		t.Run("handles duplicate filenames", func(t *testing.T) {
			// Saving the same content twice must produce distinct paths
			// rather than overwriting the first save.
			mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for first save, got %v", err)
			}

			mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for second save, got %v", err)
			}

			if mdPath1 == mdPath2 {
				t.Error("Expected different markdown paths for duplicate saves")
			}
			if htmlPath1 == htmlPath2 {
				t.Error("Expected different HTML paths for duplicate saves")
			}

			if _, err := os.Stat(mdPath1); os.IsNotExist(err) {
				t.Error("Expected first markdown file to exist")
			}
			if _, err := os.Stat(mdPath2); os.IsNotExist(err) {
				t.Error("Expected second markdown file to exist")
			}
		})

		t.Run("fails with invalid directory", func(t *testing.T) {
			invalidDir := "/nonexistent/directory"
			_, _, err := parser.SaveArticle(content, invalidDir)
			if err == nil {
				t.Error("Expected error for invalid directory")
			}
		})
	})

	t.Run("createHTML", func(t *testing.T) {
		parser := &ArticleParser{}
		content := &ParsedContent{
			Title:   "Test HTML Article",
			Author:  "HTML Author",
			Date:    "2023-12-25",
			Content: "This is **bold** content with *emphasis*.",
			URL:     "https://example.com/html-test",
		}

		t.Run("creates valid HTML", func(t *testing.T) {
			// createHTML renders the markdown produced by createMarkdown,
			// so both are exercised together here.
			markdown := parser.createMarkdown(content)
			html := parser.createHTML(content, markdown)

			if !strings.Contains(html, "<!DOCTYPE html>") {
				t.Error("Expected HTML to contain DOCTYPE")
			}
			if !strings.Contains(html, "<title>Test HTML Article</title>") {
				t.Error("Expected HTML to contain title")
			}
			if !strings.Contains(html, "<h1") || !strings.Contains(html, "Test HTML Article") {
				t.Error("Expected HTML to contain h1 heading with title")
			}
			if !strings.Contains(html, "<strong>bold</strong>") {
				t.Error("Expected HTML to contain bold formatting")
			}
			if !strings.Contains(html, "<em>emphasis</em>") {
				t.Error("Expected HTML to contain emphasis formatting")
			}
		})
	})

	t.Run("createMarkdown", func(t *testing.T) {
		parser := &ArticleParser{}

		t.Run("creates markdown with all fields", func(t *testing.T) {
			content := &ParsedContent{
				Title:   "Full Content Article",
				Author:  "Complete Author",
				Date:    "2023-01-15",
				Content: "Complete article content here.",
				URL:     "https://example.com/full",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Full Content Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Author:** Complete Author") {
				t.Error("Expected markdown to contain author")
			}
			if !strings.Contains(markdown, "**Date:** 2023-01-15") {
				t.Error("Expected markdown to contain date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/full") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "**Saved:**") {
				t.Error("Expected markdown to contain saved timestamp")
			}
			if !strings.Contains(markdown, "---") {
				t.Error("Expected markdown to contain separator")
			}
			if !strings.Contains(markdown, "Complete article content here.") {
				t.Error("Expected markdown to contain article content")
			}
		})

		t.Run("creates markdown with minimal fields", func(t *testing.T) {
			// Empty Author/Date fields must be omitted from the
			// frontmatter entirely, not rendered blank.
			content := &ParsedContent{
				Title:   "Minimal Article",
				Content: "Just content.",
				URL:     "https://example.com/minimal",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Minimal Article") {
				t.Error("Expected markdown to contain title")
			}
			if strings.Contains(markdown, "**Author:**") {
				t.Error("Expected no author field for empty author")
			}
			if strings.Contains(markdown, "**Date:**") {
				t.Error("Expected no date field for empty date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/minimal") {
				t.Error("Expected markdown to contain source URL")
			}
		})
	})
}
489
490func TestCreateArticleFromURL(t *testing.T) {
491 tempDir := t.TempDir()
492
493 t.Run("fails with invalid URL", func(t *testing.T) {
494 _, err := CreateArticleFromURL("not-a-url", tempDir)
495 if err == nil {
496 t.Error("Expected error for invalid URL")
497 }
498 if !strings.Contains(err.Error(), "invalid URL") && !strings.Contains(err.Error(), "failed to parse URL") {
499 t.Errorf("Expected URL parsing error, got %v", err)
500 }
501 })
502
503 t.Run("fails with empty URL", func(t *testing.T) {
504 _, err := CreateArticleFromURL("", tempDir)
505 if err == nil {
506 t.Error("Expected error for empty URL")
507 }
508 })
509
510 t.Run("fails with unsupported domain", func(t *testing.T) {
511 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
512 w.WriteHeader(http.StatusOK)
513 w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
514 }))
515 defer server.Close()
516
517 _, err := CreateArticleFromURL(server.URL, tempDir)
518 if err == nil {
519 t.Error("Expected error for unsupported domain")
520 }
521 if !strings.Contains(err.Error(), "no parsing rule found") {
522 t.Errorf("Expected 'no parsing rule found' error, got %v", err)
523 }
524 })
525
526 t.Run("fails with HTTP error", func(t *testing.T) {
527 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
528 w.WriteHeader(http.StatusNotFound)
529 }))
530 defer server.Close()
531
532 // Use a direct Wikipedia URL that would be processed by the real function
533 _, err := CreateArticleFromURL("https://en.wikipedia.org/wiki/NonExistentPage12345", tempDir)
534 if err == nil {
535 t.Error("Expected error for HTTP 404")
536 }
537 if !strings.Contains(err.Error(), "HTTP error") && !strings.Contains(err.Error(), "404") {
538 t.Errorf("Expected HTTP error, got %v", err)
539 }
540 })
541
542 t.Run("fails with network error", func(t *testing.T) {
543 // Use a non-existent server to trigger network error
544 _, err := CreateArticleFromURL("http://localhost:99999/test", tempDir)
545 if err == nil {
546 t.Error("Expected error for network failure")
547 }
548 if !strings.Contains(err.Error(), "failed to fetch URL") && !strings.Contains(err.Error(), "connection refused") {
549 t.Errorf("Expected network error, got %v", err)
550 }
551 })
552
553 t.Run("fails with invalid directory", func(t *testing.T) {
554 // Skip this test as it would require network access to test with real URLs
555 t.Skip("Skipping invalid directory test - requires network access")
556 })
557
558 t.Run("fails with malformed HTML", func(t *testing.T) {
559 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
560 w.WriteHeader(http.StatusOK)
561 w.Write([]byte("<html><head><title>Test</head></body>")) // Malformed HTML
562 }))
563 defer server.Close()
564
565 // Create a custom parser with localhost rule for testing
566 parser, err := NewArticleParser(server.Client())
567 if err != nil {
568 t.Fatalf("Failed to create parser: %v", err)
569 }
570
571 localhostRule := &ParsingRule{
572 Domain: "127.0.0.1",
573 Title: "//h1[@id='firstHeading']",
574 Body: "//div[@id='bodyContent']",
575 Strip: []string{"//div[@class='noprint']"},
576 }
577 parser.AddRule("127.0.0.1", localhostRule)
578
579 _, err = parser.ParseURL(server.URL)
580 if err == nil {
581 t.Error("Expected error for malformed HTML")
582 }
583 // Malformed HTML may either fail to parse or fail to extract title
584 if !strings.Contains(err.Error(), "failed to parse HTML") && !strings.Contains(err.Error(), "could not extract title") {
585 t.Errorf("Expected HTML parsing or title extraction error, got %v", err)
586 }
587 })
588
589 t.Run("fails when no title can be extracted", func(t *testing.T) {
590 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
591 w.WriteHeader(http.StatusOK)
592 w.Write([]byte(`<html>
593 <head><title>Test</title></head>
594 <body>
595 <div id="bodyContent">
596 <p>Content without proper title</p>
597 </div>
598 </body>
599 </html>`)) // No h1 with id="firstHeading"
600 }))
601 defer server.Close()
602
603 // Create a custom parser with localhost rule for testing
604 parser, err := NewArticleParser(server.Client())
605 if err != nil {
606 t.Fatalf("Failed to create parser: %v", err)
607 }
608
609 localhostRule := &ParsingRule{
610 Domain: "127.0.0.1",
611 Title: "//h1[@id='firstHeading']",
612 Body: "//div[@id='bodyContent']",
613 Strip: []string{"//div[@class='noprint']"},
614 }
615 parser.AddRule("127.0.0.1", localhostRule)
616
617 _, err = parser.ParseURL(server.URL)
618 if err == nil {
619 t.Error("Expected error when no title can be extracted")
620 }
621 if !strings.Contains(err.Error(), "could not extract title") {
622 t.Errorf("Expected 'could not extract title' error, got %v", err)
623 }
624 })
625
626 t.Run("successfully creates article structure from parsed content", func(t *testing.T) {
627 wikipediaHTML := `<html>
628 <head><title>Integration Test Article</title></head>
629 <body>
630 <h1 id="firstHeading">Integration Test Article</h1>
631 <div id="bodyContent">
632 <p>This is integration test content.</p>
633 <div class="noprint">This should be stripped</div>
634 <p>More content here.</p>
635 </div>
636 </body>
637 </html>`
638
639 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
640 w.WriteHeader(http.StatusOK)
641 w.Write([]byte(wikipediaHTML))
642 }))
643 defer server.Close()
644
645 // Create a custom parser with localhost rule for testing
646 parser, err := NewArticleParser(server.Client())
647 if err != nil {
648 t.Fatalf("Failed to create parser: %v", err)
649 }
650
651 localhostRule := &ParsingRule{
652 Domain: "127.0.0.1",
653 Title: "//h1[@id='firstHeading']",
654 Body: "//div[@id='bodyContent']",
655 Strip: []string{"//div[@class='noprint']"},
656 }
657 parser.AddRule("127.0.0.1", localhostRule)
658
659 content, err := parser.ParseURL(server.URL)
660 if err != nil {
661 t.Fatalf("Expected no error, got %v", err)
662 }
663
664 mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
665 if err != nil {
666 t.Fatalf("Failed to save article: %v", err)
667 }
668
669 // Test that it creates a proper models.Article structure (simulating CreateArticleFromURL)
670 article := &models.Article{
671 URL: server.URL,
672 Title: content.Title,
673 MarkdownPath: mdPath,
674 HTMLPath: htmlPath,
675 Created: time.Now(),
676 Modified: time.Now(),
677 }
678
679 if article.Title != "Integration Test Article" {
680 t.Errorf("Expected title 'Integration Test Article', got %s", article.Title)
681 }
682 if article.URL != server.URL {
683 t.Errorf("Expected URL %s, got %s", server.URL, article.URL)
684 }
685 if article.MarkdownPath == "" {
686 t.Error("Expected non-empty markdown path")
687 }
688 if article.HTMLPath == "" {
689 t.Error("Expected non-empty HTML path")
690 }
691 if article.Created.IsZero() {
692 t.Error("Expected Created timestamp to be set")
693 }
694 if article.Modified.IsZero() {
695 t.Error("Expected Modified timestamp to be set")
696 }
697
698 // Check files exist
699 if _, err := os.Stat(article.MarkdownPath); os.IsNotExist(err) {
700 t.Error("Expected markdown file to exist")
701 }
702 if _, err := os.Stat(article.HTMLPath); os.IsNotExist(err) {
703 t.Error("Expected HTML file to exist")
704 }
705
706 // Verify file contents
707 mdContent, err := os.ReadFile(article.MarkdownPath)
708 if err != nil {
709 t.Fatalf("Failed to read markdown file: %v", err)
710 }
711 if !strings.Contains(string(mdContent), "# Integration Test Article") {
712 t.Error("Expected markdown to contain title")
713 }
714 if !strings.Contains(string(mdContent), "This is integration test content") {
715 t.Error("Expected markdown to contain article content")
716 }
717 if strings.Contains(string(mdContent), "This should be stripped") {
718 t.Error("Expected stripped content to be removed from markdown")
719 }
720
721 htmlContent, err := os.ReadFile(article.HTMLPath)
722 if err != nil {
723 t.Fatalf("Failed to read HTML file: %v", err)
724 }
725 if !strings.Contains(string(htmlContent), "<title>Integration Test Article</title>") {
726 t.Error("Expected HTML to contain title")
727 }
728 if !strings.Contains(string(htmlContent), "<!DOCTYPE html>") {
729 t.Error("Expected HTML to contain DOCTYPE")
730 }
731 })
732
733 t.Run("successfully handles article with metadata", func(t *testing.T) {
734 contentHTML := `<html>
735 <head>
736 <title>Test Paper</title>
737 <meta name="citation_author" content="Dr. Test Author">
738 <meta name="citation_date" content="2024-01-01">
739 </head>
740 <body>
741 <h1 class="title">Test Research Paper</h1>
742 <blockquote class="abstract">
743 <p>This is the abstract of the research paper.</p>
744 <p>It contains important research findings.</p>
745 </blockquote>
746 </body>
747 </html>`
748
749 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
750 w.WriteHeader(http.StatusOK)
751 w.Write([]byte(contentHTML))
752 }))
753 defer server.Close()
754
755 // Create a custom parser with arXiv-like rule for testing
756 parser, err := NewArticleParser(server.Client())
757 if err != nil {
758 t.Fatalf("Failed to create parser: %v", err)
759 }
760
761 localhostRule := &ParsingRule{
762 Domain: "127.0.0.1",
763 Title: "//h1[contains(concat(' ',normalize-space(@class),' '),' title ')]",
764 Body: "//blockquote[contains(concat(' ',normalize-space(@class),' '),' abstract ')]",
765 Date: "//meta[@name='citation_date']/@content",
766 Author: "//meta[@name='citation_author']/@content",
767 }
768 parser.AddRule("127.0.0.1", localhostRule)
769
770 content, err := parser.ParseURL(server.URL)
771 if err != nil {
772 t.Fatalf("Expected no error, got %v", err)
773 }
774
775 if content.Title != "Test Research Paper" {
776 t.Errorf("Expected title 'Test Research Paper', got %s", content.Title)
777 }
778 if content.Author != "Dr. Test Author" {
779 t.Errorf("Expected author 'Dr. Test Author', got %s", content.Author)
780 }
781 if content.Date != "2024-01-01" {
782 t.Errorf("Expected date '2024-01-01', got %s", content.Date)
783 }
784
785 mdPath, _, err := parser.SaveArticle(content, tempDir)
786 if err != nil {
787 t.Fatalf("Failed to save article: %v", err)
788 }
789
790 // Verify markdown contains all metadata
791 mdContent, err := os.ReadFile(mdPath)
792 if err != nil {
793 t.Fatalf("Failed to read markdown file: %v", err)
794 }
795 if !strings.Contains(string(mdContent), "**Author:** Dr. Test Author") {
796 t.Error("Expected markdown to contain author")
797 }
798 if !strings.Contains(string(mdContent), "**Date:** 2024-01-01") {
799 t.Error("Expected markdown to contain date")
800 }
801
802 article := &models.Article{
803 Author: content.Author,
804 Date: content.Date,
805 }
806
807 if article.Author != "Dr. Test Author" {
808 t.Errorf("Expected article author 'Dr. Test Author', got %s", article.Author)
809 }
810 if article.Date != "2024-01-01" {
811 t.Errorf("Expected article date '2024-01-01', got %s", article.Date)
812 }
813 })
814}