cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm
leaflet
readability
golang
1package articles
2
3import (
4 "fmt"
5 "net/http"
6 "net/http/httptest"
7 "os"
8 "strings"
9 "testing"
10 "time"
11
12 "github.com/stormlightlabs/noteleaf/internal/models"
13)
14
15func newServerWithHtml(h string) *httptest.Server {
16 return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
17 w.WriteHeader(http.StatusOK)
18 w.Write([]byte(h))
19 }))
20}
21
22// ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules.
23func ExampleParser_Convert() {
24 parser, err := NewArticleParser(http.DefaultClient)
25 if err != nil {
26 fmt.Printf("Failed to create parser: %v\n", err)
27 return
28 }
29
30 htmlPath := "examples/christopher-lloyd.html"
31 htmlContent, err := os.ReadFile(htmlPath)
32 if err != nil {
33 fmt.Printf("Local HTML file not found: %v\n", err)
34 return
35 }
36
37 markdown, err := parser.Convert(string(htmlContent), ".wikipedia.org", "https://en.wikipedia.org/wiki/Christopher_Lloyd")
38 if err != nil {
39 fmt.Printf("Failed to convert HTML: %v\n", err)
40 return
41 }
42
43 parts := strings.Split(markdown, "\n---\n")
44 if len(parts) > 0 {
45 frontmatter := strings.TrimSpace(parts[0])
46 lines := strings.Split(frontmatter, "\n")
47
48 for i, line := range lines {
49 if i >= 4 {
50 break
51 }
52
53 if !strings.Contains(line, "**Saved:**") {
54 fmt.Println(line)
55 }
56 }
57 }
58
59 // Output: # Christopher Lloyd
60 //
61 // **Source:** https://en.wikipedia.org/wiki/Christopher_Lloyd
62}
63
// TestArticleParser exercises the ArticleParser type: construction and bundled
// rule loading, site-rule file parsing, slug generation, HTML-to-markdown
// conversion, URL fetching against an httptest server, and persistence of
// parsed articles to disk as markdown/HTML pairs.
func TestArticleParser(t *testing.T) {
	t.Run("New", func(t *testing.T) {
		t.Run("successfully creates parser", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}
			if parser == nil {
				t.Fatal("Expected parser to be created, got nil")
			}
			// A freshly constructed parser should come pre-loaded with the
			// bundled site rules.
			if len(parser.rules) == 0 {
				t.Error("Expected rules to be loaded")
			}
		})

		t.Run("loads expected domains", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Failed to create parser: %v", err)
			}

			domains := parser.GetSupportedDomains()
			// NOTE(review): this list must be kept in sync with the bundled
			// rule files — adding or removing a rule file breaks the exact
			// length check below.
			expectedDomains := []string{".wikipedia.org", "arxiv.org", "baseballprospectus.com"}

			if len(domains) != len(expectedDomains) {
				t.Errorf("Expected %d domains, got %d", len(expectedDomains), len(domains))
			}

			// Membership check is order-independent since map/slice ordering
			// of GetSupportedDomains is not guaranteed.
			domainMap := make(map[string]bool)
			for _, domain := range domains {
				domainMap[domain] = true
			}

			for _, expected := range expectedDomains {
				if !domainMap[expected] {
					t.Errorf("Expected domain %s not found in supported domains", expected)
				}
			}
		})
	})

	t.Run("parseRules", func(t *testing.T) {
		// parseRules is tested on a bare parser; it needs neither the bundled
		// rules nor an HTTP client.
		parser := &ArticleParser{rules: make(map[string]*ParsingRule)}

		t.Run("parses valid rule file", func(t *testing.T) {
			// Repeated "strip:" entries accumulate into rule.Strip;
			// "test_url:" entries are collected into rule.TestURLs.
			content := `title: //h1
author: //span[@class='author']
date: //time
body: //article
strip: //nav
strip: //footer
test_url: https://example.com/article`

			rule, err := parser.parseRules("example.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Domain != "example.com" {
				t.Errorf("Expected domain 'example.com', got %s", rule.Domain)
			}
			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Author != "//span[@class='author']" {
				t.Errorf("Expected author '//span[@class='author']', got %s", rule.Author)
			}
			if len(rule.Strip) != 2 {
				t.Errorf("Expected 2 strip rules, got %d", len(rule.Strip))
			}
			if len(rule.TestURLs) != 1 {
				t.Errorf("Expected 1 test URL, got %d", len(rule.TestURLs))
			}
		})

		t.Run("handles empty lines and comments", func(t *testing.T) {
			// Lines beginning with '#' and blank lines must be ignored.
			content := `# This is a comment
title: //h1

# Another comment
body: //article
`

			rule, err := parser.parseRules("test.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Body != "//article" {
				t.Errorf("Expected body '//article', got %s", rule.Body)
			}
		})
	})

	t.Run("slugify", func(t *testing.T) {
		parser := &ArticleParser{}

		tc := []struct {
			input    string
			expected string
		}{
			{"Simple Title", "simple-title"},
			{"Title with Numbers 123", "title-with-numbers-123"},
			{"Title-with-Hyphens", "title-with-hyphens"},
			{"Title with Spaces and Multiple Spaces", "title-with-spaces-and-multiple-spaces"},
			{"Title!@#$%^&*()with Special Characters", "title-with-special-characters"},
			{"", ""},
			// Slugs are truncated to at most 100 characters.
			{strings.Repeat("a", 150), strings.Repeat("a", 100)},
		}

		for _, tt := range tc {
			t.Run(fmt.Sprintf("slugify '%s'", tt.input), func(t *testing.T) {
				result := parser.slugify(tt.input)
				if result != tt.expected {
					t.Errorf("Expected '%s', got '%s'", tt.expected, result)
				}
			})
		}
	})

	t.Run("Convert", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		t.Run("fails with unsupported domain", func(t *testing.T) {
			htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>"
			_, err := parser.Convert(htmlContent, "unsupported.com", "https://unsupported.com/article")

			if err == nil {
				t.Error("Expected error for unsupported domain")
			}
			if !strings.Contains(err.Error(), "no parsing rule found") {
				t.Errorf("Expected 'no parsing rule found' error, got %v", err)
			}
		})

		t.Run("fails with invalid HTML", func(t *testing.T) {
			// Mismatched tags: the title is never closed and body never opened.
			invalidHTML := "<html><head><title>Test</head></body>"
			_, err := parser.Convert(invalidHTML, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")

			if err == nil {
				t.Error("Expected error for invalid HTML")
			}
		})

		t.Run("fails when no title extracted", func(t *testing.T) {
			// Valid HTML, but without the #firstHeading element the Wikipedia
			// rule's title XPath matches nothing.
			htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>"
			_, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")

			if err == nil {
				t.Error("Expected error when no title can be extracted")
			}
			if !strings.Contains(err.Error(), "could not extract title") &&
				!strings.Contains(err.Error(), "could not extract body content") {
				t.Errorf("Expected title or body extraction error, got %v", err)
			}
		})

		t.Run("successfully converts valid Wikipedia HTML", func(t *testing.T) {
			htmlContent := `<html>
			<head><title>Test Article</title></head>
			<body>
				<h1 id="firstHeading">Test Article Title</h1>
				<div id="bodyContent">
					<style>.mw-parser-output .hatnote{font-style:italic;}</style>
					<p>This is the main content of the article.</p>
					<div class="noprint">This should be stripped</div>
					<div class="editsection">Edit this section</div>
					<p>More content here.</p>
				</div>
			</body>
			</html>`

			markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if !strings.Contains(markdown, "# Test Article Title") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Source:** https://en.wikipedia.org/wiki/Test") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "This is the main content") {
				t.Error("Expected markdown to contain article content")
			}
			// Elements matched by the rule's strip XPaths, <style> content,
			// and edit-section markers must not survive conversion.
			if strings.Contains(markdown, "This should be stripped") {
				t.Error("Expected stripped content to be removed from markdown")
			}
			if strings.Contains(markdown, ".mw-parser-output") {
				t.Error("Expected style content to be removed from markdown")
			}
			if strings.Contains(markdown, "Edit this section") {
				t.Error("Expected edit section markers to be removed from markdown")
			}
		})

		t.Run("strips Wikipedia navigation boxes and metadata", func(t *testing.T) {
			// Exercises the full set of Wikipedia chrome that must be
			// stripped: navbox tables/divs, vertical navboxes, edit-section
			// spans, role=navigation containers, category links, and footer.
			htmlContent := `<html>
			<head><title>Test Navigation Article</title></head>
			<body>
				<h1 id="firstHeading">Test Navigation Article</h1>
				<div id="bodyContent">
					<p>Main article content goes here.</p>
					<h2>Section One<span class="mw-editsection">[edit]</span></h2>
					<p>Section content.</p>
					<table class="navbox" role="navigation">
						<tr><td>Navigation item 1</td></tr>
						<tr><td>Navigation item 2</td></tr>
					</table>
					<div class="navbox">
						<p>Another navigation box</p>
					</div>
					<table class="vertical-navbox">
						<tr><td>Vertical nav item</td></tr>
					</table>
					<p>More article content.</p>
					<div role="navigation">
						<p>Navigation content</p>
					</div>
					<div id="catlinks">
						<p>Categories: Test Category</p>
					</div>
					<div id="footer">
						<p>Retrieved from Wikipedia</p>
					</div>
				</div>
			</body>
			</html>`

			markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test_Navigation")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if !strings.Contains(markdown, "Main article content") {
				t.Error("Expected markdown to contain main article content")
			}
			if !strings.Contains(markdown, "Section content") {
				t.Error("Expected markdown to contain section content")
			}
			if !strings.Contains(markdown, "More article content") {
				t.Error("Expected markdown to contain additional content")
			}

			if strings.Contains(markdown, "Navigation item") {
				t.Error("Expected navbox table content to be stripped")
			}
			if strings.Contains(markdown, "Another navigation box") {
				t.Error("Expected navbox div content to be stripped")
			}
			if strings.Contains(markdown, "Vertical nav item") {
				t.Error("Expected vertical-navbox content to be stripped")
			}
			if strings.Contains(markdown, "[edit]") {
				t.Error("Expected edit section markers to be stripped")
			}
			if strings.Contains(markdown, "Navigation content") {
				t.Error("Expected role=navigation content to be stripped")
			}
			if strings.Contains(markdown, "Categories:") {
				t.Error("Expected category links to be stripped")
			}
			if strings.Contains(markdown, "Retrieved from") {
				t.Error("Expected footer content to be stripped")
			}
		})
	})

	t.Run("ParseURL", func(t *testing.T) {
		// One server serves three behaviors keyed off the request path:
		// a 404, an unsupported-domain page, and a well-formed article.
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			switch {
			case strings.Contains(r.URL.Path, "404"):
				w.WriteHeader(http.StatusNotFound)
			case strings.Contains(r.URL.Path, "unsupported"):
				w.WriteHeader(http.StatusOK)
				w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
			default:
				w.WriteHeader(http.StatusOK)
				w.Write([]byte(`<html>
				<head><title>Test Article</title></head>
				<body>
					<h1 id="firstHeading">Test Wikipedia Article</h1>
					<div id="bodyContent">
						<p>This is the article content.</p>
						<div class="noprint">This gets stripped</div>
					</div>
				</body>
				</html>`))
			}
		}))
		defer server.Close()

		parser, err := NewArticleParser(server.Client())
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		// Register a rule for 127.0.0.1 so URLs pointing at the httptest
		// server resolve to a supported domain.
		localhostRule := &ParsingRule{
			Domain: "127.0.0.1",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("127.0.0.1", localhostRule)

		t.Run("fails with invalid URL", func(t *testing.T) {
			// A scheme-less URL makes http.Client reject the request.
			_, err := parser.ParseURL("not-a-url")
			if err == nil {
				t.Error("Expected error for invalid URL")
			}
			if !strings.Contains(err.Error(), "unsupported protocol scheme") {
				t.Errorf("Expected 'unsupported protocol scheme' error, got %v", err)
			}
		})

		t.Run("fails with unsupported domain", func(t *testing.T) {
			_, err := parser.ParseURL(server.URL + "/unsupported.com")
			if err == nil {
				t.Error("Expected error for unsupported domain")
			}
		})

		t.Run("fails with HTTP error", func(t *testing.T) {
			_, err := parser.ParseURL(server.URL + "/404/en.wikipedia.org/wiki/test")
			if err == nil {
				t.Error("Expected error for HTTP 404")
			}
		})

	})

	t.Run("SaveArticle", func(t *testing.T) {
		parser := &ArticleParser{}
		tempDir := t.TempDir()

		content := &ParsedContent{
			Title:   "Test Article",
			Author:  "Test Author",
			Date:    "2023-01-01",
			Content: "This is test content.",
			URL:     "https://example.com/test",
		}

		t.Run("successfully saves article", func(t *testing.T) {
			mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if _, err := os.Stat(mdPath); os.IsNotExist(err) {
				t.Error("Expected markdown file to exist")
			}
			if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
				t.Error("Expected HTML file to exist")
			}

			mdContent, err := os.ReadFile(mdPath)
			if err != nil {
				t.Fatalf("Failed to read markdown file: %v", err)
			}
			if !strings.Contains(string(mdContent), "# Test Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(string(mdContent), "**Author:** Test Author") {
				t.Error("Expected markdown to contain author")
			}

			htmlContentBytes, err := os.ReadFile(htmlPath)
			if err != nil {
				t.Fatalf("Failed to read HTML file: %v", err)
			}
			if !strings.Contains(string(htmlContentBytes), "<title>Test Article</title>") {
				t.Error("Expected HTML to contain title")
			}
		})

		t.Run("handles duplicate filenames", func(t *testing.T) {
			// Saving the same content twice must yield distinct paths
			// (the saver is expected to de-duplicate filenames) while
			// leaving both sets of files on disk.
			mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for first save, got %v", err)
			}

			mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for second save, got %v", err)
			}

			if mdPath1 == mdPath2 {
				t.Error("Expected different markdown paths for duplicate saves")
			}
			if htmlPath1 == htmlPath2 {
				t.Error("Expected different HTML paths for duplicate saves")
			}

			if _, err := os.Stat(mdPath1); os.IsNotExist(err) {
				t.Error("Expected first markdown file to exist")
			}
			if _, err := os.Stat(mdPath2); os.IsNotExist(err) {
				t.Error("Expected second markdown file to exist")
			}
		})

		t.Run("fails with invalid directory", func(t *testing.T) {
			invalidDir := "/nonexistent/directory"
			_, _, err := parser.SaveArticle(content, invalidDir)
			if err == nil {
				t.Error("Expected error for invalid directory")
			}
		})
	})

	t.Run("createHTML", func(t *testing.T) {
		parser := &ArticleParser{}
		content := &ParsedContent{
			Title:   "Test HTML Article",
			Author:  "HTML Author",
			Date:    "2023-12-25",
			Content: "This is **bold** content with *emphasis*.",
			URL:     "https://example.com/html-test",
		}

		t.Run("creates valid HTML", func(t *testing.T) {
			// createHTML renders the markdown produced by createMarkdown,
			// so markdown emphasis must round-trip to <strong>/<em>.
			markdown := parser.createMarkdown(content)
			html := parser.createHTML(content, markdown)

			if !strings.Contains(html, "<!DOCTYPE html>") {
				t.Error("Expected HTML to contain DOCTYPE")
			}
			if !strings.Contains(html, "<title>Test HTML Article</title>") {
				t.Error("Expected HTML to contain title")
			}
			if !strings.Contains(html, "<h1") || !strings.Contains(html, "Test HTML Article") {
				t.Error("Expected HTML to contain h1 heading with title")
			}
			if !strings.Contains(html, "<strong>bold</strong>") {
				t.Error("Expected HTML to contain bold formatting")
			}
			if !strings.Contains(html, "<em>emphasis</em>") {
				t.Error("Expected HTML to contain emphasis formatting")
			}
		})
	})

	t.Run("createMarkdown", func(t *testing.T) {
		parser := &ArticleParser{}

		t.Run("creates markdown with all fields", func(t *testing.T) {
			content := &ParsedContent{
				Title:   "Full Content Article",
				Author:  "Complete Author",
				Date:    "2023-01-15",
				Content: "Complete article content here.",
				URL:     "https://example.com/full",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Full Content Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Author:** Complete Author") {
				t.Error("Expected markdown to contain author")
			}
			if !strings.Contains(markdown, "**Date:** 2023-01-15") {
				t.Error("Expected markdown to contain date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/full") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "**Saved:**") {
				t.Error("Expected markdown to contain saved timestamp")
			}
			if !strings.Contains(markdown, "---") {
				t.Error("Expected markdown to contain separator")
			}
			if !strings.Contains(markdown, "Complete article content here.") {
				t.Error("Expected markdown to contain article content")
			}
		})

		t.Run("creates markdown with minimal fields", func(t *testing.T) {
			// Optional frontmatter fields (author, date) must be omitted
			// entirely when empty rather than rendered blank.
			content := &ParsedContent{
				Title:   "Minimal Article",
				Content: "Just content.",
				URL:     "https://example.com/minimal",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Minimal Article") {
				t.Error("Expected markdown to contain title")
			}
			if strings.Contains(markdown, "**Author:**") {
				t.Error("Expected no author field for empty author")
			}
			if strings.Contains(markdown, "**Date:**") {
				t.Error("Expected no date field for empty date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/minimal") {
				t.Error("Expected markdown to contain source URL")
			}
		})
	})
}
575
// TestCreateArticleFromURL covers the package-level CreateArticleFromURL
// entry point's failure modes, then drives parser.ParseURL + SaveArticle
// end to end against httptest servers and checks the resulting
// models.Article fields and on-disk files.
func TestCreateArticleFromURL(t *testing.T) {
	tempDir := t.TempDir()

	t.Run("fails with invalid URL", func(t *testing.T) {
		_, err := CreateArticleFromURL("not-a-url", tempDir)
		if err == nil {
			t.Error("Expected error for invalid URL")
		}
		if !strings.Contains(err.Error(), "invalid URL") && !strings.Contains(err.Error(), "failed to parse URL") {
			t.Errorf("Expected URL parsing error, got %v", err)
		}
	})

	t.Run("fails with empty URL", func(t *testing.T) {
		_, err := CreateArticleFromURL("", tempDir)
		if err == nil {
			t.Error("Expected error for empty URL")
		}
	})

	t.Run("fails with unsupported domain", func(t *testing.T) {
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.WriteHeader(http.StatusOK)
			w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
		}))
		defer server.Close()

		// server.URL's host is 127.0.0.1, which has no bundled parsing rule.
		_, err := CreateArticleFromURL(server.URL, tempDir)
		if err == nil {
			t.Error("Expected error for unsupported domain")
		}
		if !strings.Contains(err.Error(), "no parsing rule found") {
			t.Errorf("Expected 'no parsing rule found' error, got %v", err)
		}
	})

	t.Run("fails with HTTP error", func(t *testing.T) {
		// NOTE(review): this local 404 server is never actually hit — the
		// request below goes to the live en.wikipedia.org, so this subtest
		// depends on network access and on Wikipedia returning 404 for the
		// made-up page. Consider routing through server.URL instead.
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.WriteHeader(http.StatusNotFound)
		}))
		defer server.Close()

		_, err := CreateArticleFromURL("https://en.wikipedia.org/wiki/NonExistentPage12345", tempDir)
		if err == nil {
			t.Error("Expected error for HTTP 404")
		}
		if !strings.Contains(err.Error(), "HTTP error") && !strings.Contains(err.Error(), "404") {
			t.Errorf("Expected HTTP error, got %v", err)
		}
	})

	t.Run("fails with network error", func(t *testing.T) {
		// NOTE(review): port 99999 exceeds 65535, so the dial fails with an
		// "invalid port" address error rather than a connection refusal; the
		// assertion passes only because the error is wrapped with
		// "failed to fetch URL" — confirm that wrapping is intended.
		_, err := CreateArticleFromURL("http://localhost:99999/test", tempDir)
		if err == nil {
			t.Error("Expected error for network failure")
		}
		if !strings.Contains(err.Error(), "failed to fetch URL") && !strings.Contains(err.Error(), "connection refused") {
			t.Errorf("Expected network error, got %v", err)
		}
	})

	t.Run("fails with malformed HTML", func(t *testing.T) {
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.WriteHeader(http.StatusOK)
			// Mismatched tags: title never closed, body never opened.
			w.Write([]byte("<html><head><title>Test</head></body>"))
		}))
		defer server.Close()

		parser, err := NewArticleParser(server.Client())
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		// Register a rule for 127.0.0.1 so the httptest server's host is
		// treated as a supported domain.
		localhostRule := &ParsingRule{
			Domain: "127.0.0.1",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("127.0.0.1", localhostRule)

		_, err = parser.ParseURL(server.URL)
		if err == nil {
			t.Error("Expected error for malformed HTML")
		}
		if !strings.Contains(err.Error(), "failed to parse HTML") &&
			!strings.Contains(err.Error(), "could not extract title") &&
			!strings.Contains(err.Error(), "could not extract body content") {
			t.Errorf("Expected HTML parsing or extraction error, got %v", err)
		}
	})

	t.Run("fails when no title can be extracted", func(t *testing.T) {
		// Well-formed page, but without #firstHeading the rule's title
		// XPath has no match.
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.WriteHeader(http.StatusOK)
			w.Write([]byte(`<html>
			<head><title>Test</title></head>
			<body>
				<div id="bodyContent">
					<p>Content without proper title</p>
				</div>
			</body>
			</html>`))
		}))
		defer server.Close()

		parser, err := NewArticleParser(server.Client())
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "127.0.0.1",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("127.0.0.1", localhostRule)

		_, err = parser.ParseURL(server.URL)
		if err == nil {
			t.Error("Expected error when no title can be extracted")
		}
		if !strings.Contains(err.Error(), "could not extract title") {
			t.Errorf("Expected 'could not extract title' error, got %v", err)
		}
	})

	t.Run("successfully creates article structure from parsed content", func(t *testing.T) {
		wikipediaHTML := `<html>
		<head><title>Integration Test Article</title></head>
		<body>
			<h1 id="firstHeading">Integration Test Article</h1>
			<div id="bodyContent">
				<p>This is integration test content.</p>
				<div class="noprint">This should be stripped</div>
				<p>More content here.</p>
			</div>
		</body>
		</html>`

		server := newServerWithHtml(wikipediaHTML)
		defer server.Close()

		parser, err := NewArticleParser(server.Client())
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "127.0.0.1",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("127.0.0.1", localhostRule)

		content, err := parser.ParseURL(server.URL)
		if err != nil {
			t.Fatalf("Expected no error, got %v", err)
		}

		mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
		if err != nil {
			t.Fatalf("Failed to save article: %v", err)
		}

		// Assemble the model the CLI would persist and verify every field
		// is populated from the parse/save results.
		article := &models.Article{
			URL:          server.URL,
			Title:        content.Title,
			MarkdownPath: mdPath,
			HTMLPath:     htmlPath,
			Created:      time.Now(),
			Modified:     time.Now(),
		}

		if article.Title != "Integration Test Article" {
			t.Errorf("Expected title 'Integration Test Article', got %s", article.Title)
		}
		if article.URL != server.URL {
			t.Errorf("Expected URL %s, got %s", server.URL, article.URL)
		}
		if article.MarkdownPath == "" {
			t.Error("Expected non-empty markdown path")
		}
		if article.HTMLPath == "" {
			t.Error("Expected non-empty HTML path")
		}
		if article.Created.IsZero() {
			t.Error("Expected Created timestamp to be set")
		}
		if article.Modified.IsZero() {
			t.Error("Expected Modified timestamp to be set")
		}

		if _, err := os.Stat(article.MarkdownPath); os.IsNotExist(err) {
			t.Error("Expected markdown file to exist")
		}
		if _, err := os.Stat(article.HTMLPath); os.IsNotExist(err) {
			t.Error("Expected HTML file to exist")
		}

		mdContent, err := os.ReadFile(article.MarkdownPath)
		if err != nil {
			t.Fatalf("Failed to read markdown file: %v", err)
		}
		if !strings.Contains(string(mdContent), "# Integration Test Article") {
			t.Error("Expected markdown to contain title")
		}
		if !strings.Contains(string(mdContent), "This is integration test content") {
			t.Error("Expected markdown to contain article content")
		}
		if strings.Contains(string(mdContent), "This should be stripped") {
			t.Error("Expected stripped content to be removed from markdown")
		}

		htmlContent, err := os.ReadFile(article.HTMLPath)
		if err != nil {
			t.Fatalf("Failed to read HTML file: %v", err)
		}
		if !strings.Contains(string(htmlContent), "<title>Integration Test Article</title>") {
			t.Error("Expected HTML to contain title")
		}
		if !strings.Contains(string(htmlContent), "<!DOCTYPE html>") {
			t.Error("Expected HTML to contain DOCTYPE")
		}
	})

	t.Run("successfully handles article with metadata", func(t *testing.T) {
		// Exercises author/date extraction from <meta> tags via XPath
		// attribute selectors (.../@content).
		contentHTML := `<html>
		<head>
			<title>Test Paper</title>
			<meta name="citation_author" content="Dr. Test Author">
			<meta name="citation_date" content="2024-01-01">
		</head>
		<body>
			<h1 class="title">Test Research Paper</h1>
			<blockquote class="abstract">
				<p>This is the abstract of the research paper.</p>
				<p>It contains important research findings.</p>
			</blockquote>
		</body>
		</html>`

		server := newServerWithHtml(contentHTML)
		defer server.Close()

		parser, err := NewArticleParser(server.Client())
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		// The contains(concat(...)) idiom matches a class token even when
		// the attribute holds multiple space-separated classes.
		localhostRule := &ParsingRule{
			Domain: "127.0.0.1",
			Title:  "//h1[contains(concat(' ',normalize-space(@class),' '),' title ')]",
			Body:   "//blockquote[contains(concat(' ',normalize-space(@class),' '),' abstract ')]",
			Date:   "//meta[@name='citation_date']/@content",
			Author: "//meta[@name='citation_author']/@content",
		}
		parser.AddRule("127.0.0.1", localhostRule)

		content, err := parser.ParseURL(server.URL)
		if err != nil {
			t.Fatalf("Expected no error, got %v", err)
		}

		if content.Title != "Test Research Paper" {
			t.Errorf("Expected title 'Test Research Paper', got %s", content.Title)
		}
		if content.Author != "Dr. Test Author" {
			t.Errorf("Expected author 'Dr. Test Author', got %s", content.Author)
		}
		if content.Date != "2024-01-01" {
			t.Errorf("Expected date '2024-01-01', got %s", content.Date)
		}

		mdPath, _, err := parser.SaveArticle(content, tempDir)
		if err != nil {
			t.Fatalf("Failed to save article: %v", err)
		}

		mdContent, err := os.ReadFile(mdPath)
		if err != nil {
			t.Fatalf("Failed to read markdown file: %v", err)
		}
		if !strings.Contains(string(mdContent), "**Author:** Dr. Test Author") {
			t.Error("Expected markdown to contain author")
		}
		if !strings.Contains(string(mdContent), "**Date:** 2024-01-01") {
			t.Error("Expected markdown to contain date")
		}

		article := &models.Article{
			Author: content.Author,
			Date:   content.Date,
		}

		if article.Author != "Dr. Test Author" {
			t.Errorf("Expected article author 'Dr. Test Author', got %s", article.Author)
		}
		if article.Date != "2024-01-01" {
			t.Errorf("Expected article date '2024-01-01', got %s", article.Date)
		}
	})
}