cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍈
charm
leaflet
readability
golang
package articles

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"os"
	"strings"
	"testing"
)

// ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules.
func ExampleParser_Convert() {
	parser, err := NewArticleParser(http.DefaultClient)
	if err != nil {
		fmt.Printf("Failed to create parser: %v\n", err)
		return
	}

	htmlPath := "examples/christopher-lloyd.html"
	htmlContent, err := os.ReadFile(htmlPath)
	if err != nil {
		fmt.Printf("Local HTML file not found: %v\n", err)
		return
	}

	markdown, err := parser.Convert(string(htmlContent), ".wikipedia.org", "https://en.wikipedia.org/wiki/Christopher_Lloyd")
	if err != nil {
		fmt.Printf("Failed to convert HTML: %v\n", err)
		return
	}

	parts := strings.Split(markdown, "\n---\n")
	if len(parts) > 0 {
		frontmatter := strings.TrimSpace(parts[0])
		lines := strings.Split(frontmatter, "\n")

		for i, line := range lines {
			if i >= 4 {
				break
			}

			if !strings.Contains(line, "**Saved:**") {
				fmt.Println(line)
			}
		}
	}

	// Output: # Christopher Lloyd
	//
	// **Source:** https://en.wikipedia.org/wiki/Christopher_Lloyd
}

func TestArticleParser(t *testing.T) {
	t.Run("New", func(t *testing.T) {
		t.Run("successfully creates parser", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}
			if parser == nil {
				t.Fatal("Expected parser to be created, got nil")
			}
			if len(parser.rules) == 0 {
				t.Error("Expected rules to be loaded")
			}
		})

		t.Run("loads expected domains", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Failed to create parser: %v", err)
			}

			domains := parser.GetSupportedDomains()
			expectedDomains := []string{".wikipedia.org", "arxiv.org", "baseballprospectus.com"}

			if len(domains) != len(expectedDomains) {
				t.Errorf("Expected %d domains, got %d", len(expectedDomains), len(domains))
			}

			domainMap := make(map[string]bool)
			for _, domain := range domains {
				domainMap[domain] = true
			}

			for _, expected := range expectedDomains {
				if !domainMap[expected] {
					t.Errorf("Expected domain %s not found in supported domains", expected)
				}
			}
		})
	})

	t.Run("parseRules", func(t *testing.T) {
		parser := &ArticleParser{rules: make(map[string]*ParsingRule)}

		t.Run("parses valid rule file", func(t *testing.T) {
			content := `title: //h1
author: //span[@class='author']
date: //time
body: //article
strip: //nav
strip: //footer
test_url: https://example.com/article`

			rule, err := parser.parseRules("example.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Domain != "example.com" {
				t.Errorf("Expected domain 'example.com', got %s", rule.Domain)
			}
			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Author != "//span[@class='author']" {
				t.Errorf("Expected author '//span[@class='author']', got %s", rule.Author)
			}
			if len(rule.Strip) != 2 {
				t.Errorf("Expected 2 strip rules, got %d", len(rule.Strip))
			}
			if len(rule.TestURLs) != 1 {
				t.Errorf("Expected 1 test URL, got %d", len(rule.TestURLs))
			}
		})

		t.Run("handles empty lines and comments", func(t *testing.T) {
			content := `# This is a comment
title: //h1

# Another comment
body: //article
`

			rule, err := parser.parseRules("test.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Body != "//article" {
				t.Errorf("Expected body '//article', got %s", rule.Body)
			}
		})
	})

	t.Run("slugify", func(t *testing.T) {
		parser := &ArticleParser{}

		testCases := []struct {
			input    string
			expected string
		}{
			{"Simple Title", "simple-title"},
			{"Title with Numbers 123", "title-with-numbers-123"},
			{"Title-with-Hyphens", "title-with-hyphens"},
			{"Title with Spaces and Multiple Spaces", "title-with-spaces-and-multiple-spaces"},
			{"Title!@#$%^&*()with Special Characters", "title-with-special-characters"},
			{"", ""}, // empty input yields an empty slug
			{strings.Repeat("a", 150), strings.Repeat("a", 100)}, // long slugs are truncated to 100 characters
		}

		for _, tc := range testCases {
			t.Run(fmt.Sprintf("slugify '%s'", tc.input), func(t *testing.T) {
				result := parser.slugify(tc.input)
				if result != tc.expected {
					t.Errorf("Expected '%s', got '%s'", tc.expected, result)
				}
			})
		}
	})

	t.Run("Convert", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		t.Run("fails with unsupported domain", func(t *testing.T) {
			htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>"
			_, err := parser.Convert(htmlContent, "unsupported.com", "https://unsupported.com/article")

			if err == nil {
				// Fatal, not Error: the next check dereferences err.
				t.Fatal("Expected error for unsupported domain")
			}
			if !strings.Contains(err.Error(), "no parsing rule found") {
				t.Errorf("Expected 'no parsing rule found' error, got %v", err)
			}
		})

		t.Run("fails with invalid HTML", func(t *testing.T) {
			invalidHTML := "<html><head><title>Test</head></body>"
			_, err := parser.Convert(invalidHTML, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")

			if err == nil {
				t.Error("Expected error for invalid HTML")
			}
		})

		t.Run("fails when no title extracted", func(t *testing.T) {
			htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>"
			_, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")

			if err == nil {
				// Fatal, not Error: the next check dereferences err.
				t.Fatal("Expected error when no title can be extracted")
			}
			if !strings.Contains(err.Error(), "could not extract title") {
				t.Errorf("Expected 'could not extract title' error, got %v", err)
			}
		})

		t.Run("successfully converts valid Wikipedia HTML", func(t *testing.T) {
			htmlContent := `<html>
				<head><title>Test Article</title></head>
				<body>
					<h1 id="firstHeading">Test Article Title</h1>
					<div id="bodyContent">
						<p>This is the main content of the article.</p>
						<div class="noprint">This should be stripped</div>
						<p>More content here.</p>
					</div>
				</body>
			</html>`

			markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if !strings.Contains(markdown, "# Test Article Title") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Source:** https://en.wikipedia.org/wiki/Test") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "This is the main content") {
				t.Error("Expected markdown to contain article content")
			}
			if strings.Contains(markdown, "This should be stripped") {
				t.Error("Expected stripped content to be removed from markdown")
			}
		})
	})

	t.Run("ParseURL", func(t *testing.T) {
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			switch {
			case strings.Contains(r.URL.Path, "404"):
				w.WriteHeader(http.StatusNotFound)
			case strings.Contains(r.URL.Path, "unsupported"):
				w.WriteHeader(http.StatusOK)
				w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
			default:
				// Return a Wikipedia-like structure for the localhost rule.
				w.WriteHeader(http.StatusOK)
				w.Write([]byte(`<html>
					<head><title>Test Article</title></head>
					<body>
						<h1 id="firstHeading">Test Wikipedia Article</h1>
						<div id="bodyContent">
							<p>This is the article content.</p>
							<div class="noprint">This gets stripped</div>
						</div>
					</body>
				</html>`))
			}
		}))
		defer server.Close()

		parser, err := NewArticleParser(server.Client())
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "127.0.0.1",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("127.0.0.1", localhostRule)

		t.Run("fails with invalid URL", func(t *testing.T) {
			_, err := parser.ParseURL("not-a-url")
			if err == nil {
				// Fatal, not Error: the next check dereferences err.
				t.Fatal("Expected error for invalid URL")
			}
			if !strings.Contains(err.Error(), "unsupported protocol scheme") {
				t.Errorf("Expected 'unsupported protocol scheme' error, got %v", err)
			}
		})

		t.Run("fails with unsupported domain", func(t *testing.T) {
			_, err := parser.ParseURL(server.URL + "/unsupported.com")
			if err == nil {
				t.Error("Expected error for unsupported domain")
			}
		})

		t.Run("fails with HTTP error", func(t *testing.T) {
			_, err := parser.ParseURL(server.URL + "/404/en.wikipedia.org/wiki/test")
			if err == nil {
				t.Error("Expected error for HTTP 404")
			}
		})
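
		t.Run("successfully parses supported URL", func(t *testing.T) {
			// Success-path sketch using the localhost rule registered above.
			// It assumes ParseURL exposes the extracted Title and Content
			// fields of ParsedContent, as the integration test at the bottom
			// of this file also relies on.
			content, err := parser.ParseURL(server.URL)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}
			if content.Title != "Test Wikipedia Article" {
				t.Errorf("Expected title 'Test Wikipedia Article', got %s", content.Title)
			}
			if strings.Contains(content.Content, "This gets stripped") {
				t.Error("Expected noprint content to be stripped")
			}
		})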
	})

	t.Run("SaveArticle", func(t *testing.T) {
		parser := &ArticleParser{}
		tempDir := t.TempDir()

		content := &ParsedContent{
			Title:   "Test Article",
			Author:  "Test Author",
			Date:    "2023-01-01",
			Content: "This is test content.",
			URL:     "https://example.com/test",
		}

		t.Run("successfully saves article", func(t *testing.T) {
			mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if _, err := os.Stat(mdPath); os.IsNotExist(err) {
				t.Error("Expected markdown file to exist")
			}
			if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
				t.Error("Expected HTML file to exist")
			}

			mdContent, err := os.ReadFile(mdPath)
			if err != nil {
				t.Fatalf("Failed to read markdown file: %v", err)
			}
			if !strings.Contains(string(mdContent), "# Test Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(string(mdContent), "**Author:** Test Author") {
				t.Error("Expected markdown to contain author")
			}

			htmlContentBytes, err := os.ReadFile(htmlPath)
			if err != nil {
				t.Fatalf("Failed to read HTML file: %v", err)
			}
			if !strings.Contains(string(htmlContentBytes), "<title>Test Article</title>") {
				t.Error("Expected HTML to contain title")
			}
		})

		t.Run("handles duplicate filenames", func(t *testing.T) {
			mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for first save, got %v", err)
			}

			mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for second save, got %v", err)
			}

			if mdPath1 == mdPath2 {
				t.Error("Expected different markdown paths for duplicate saves")
			}
			if htmlPath1 == htmlPath2 {
				t.Error("Expected different HTML paths for duplicate saves")
			}

			if _, err := os.Stat(mdPath1); os.IsNotExist(err) {
				t.Error("Expected first markdown file to exist")
			}
			if _, err := os.Stat(mdPath2); os.IsNotExist(err) {
				t.Error("Expected second markdown file to exist")
			}
		})

		t.Run("fails with invalid directory", func(t *testing.T) {
			invalidDir := "/nonexistent/directory"
			_, _, err := parser.SaveArticle(content, invalidDir)
			if err == nil {
				t.Error("Expected error for invalid directory")
			}
		})
	})

	t.Run("createHTML", func(t *testing.T) {
		parser := &ArticleParser{}
		content := &ParsedContent{
			Title:   "Test HTML Article",
			Author:  "HTML Author",
			Date:    "2023-12-25",
			Content: "This is **bold** content with *emphasis*.",
			URL:     "https://example.com/html-test",
		}

		t.Run("creates valid HTML", func(t *testing.T) {
			markdown := parser.createMarkdown(content)
			html := parser.createHTML(content, markdown)

			if !strings.Contains(html, "<!DOCTYPE html>") {
				t.Error("Expected HTML to contain DOCTYPE")
			}
			if !strings.Contains(html, "<title>Test HTML Article</title>") {
				t.Error("Expected HTML to contain title")
			}
			if !strings.Contains(html, "<h1") || !strings.Contains(html, "Test HTML Article") {
				t.Error("Expected HTML to contain h1 heading with title")
			}
			if !strings.Contains(html, "<strong>bold</strong>") {
				t.Error("Expected HTML to contain bold formatting")
			}
			if !strings.Contains(html, "<em>emphasis</em>") {
				t.Error("Expected HTML to contain emphasis formatting")
			}
		})
	})

	t.Run("createMarkdown", func(t *testing.T) {
		parser := &ArticleParser{}

		t.Run("creates markdown with all fields", func(t *testing.T) {
			content := &ParsedContent{
				Title:   "Full Content Article",
				Author:  "Complete Author",
				Date:    "2023-01-15",
				Content: "Complete article content here.",
				URL:     "https://example.com/full",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Full Content Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Author:** Complete Author") {
				t.Error("Expected markdown to contain author")
			}
			if !strings.Contains(markdown, "**Date:** 2023-01-15") {
				t.Error("Expected markdown to contain date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/full") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "**Saved:**") {
				t.Error("Expected markdown to contain saved timestamp")
			}
			if !strings.Contains(markdown, "---") {
				t.Error("Expected markdown to contain separator")
			}
			if !strings.Contains(markdown, "Complete article content here.") {
				t.Error("Expected markdown to contain article content")
			}
		})

		t.Run("creates markdown with minimal fields", func(t *testing.T) {
			content := &ParsedContent{
				Title:   "Minimal Article",
				Content: "Just content.",
				URL:     "https://example.com/minimal",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Minimal Article") {
				t.Error("Expected markdown to contain title")
			}
			if strings.Contains(markdown, "**Author:**") {
				t.Error("Expected no author field for empty author")
			}
			if strings.Contains(markdown, "**Date:**") {
				t.Error("Expected no date field for empty date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/minimal") {
				t.Error("Expected markdown to contain source URL")
			}
		})
	})
}
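
// referenceSlugify is an illustrative sketch, not the code under test: one
// plausible slugify consistent with the cases in the "slugify" subtest above
// (lowercase; runs of spaces and special characters collapse to a single
// hyphen; no leading or trailing hyphen; results truncated to 100
// characters). The actual parser.slugify implementation may differ in detail.
func referenceSlugify(s string) string {
	var b strings.Builder
	hyphen := false
	for _, r := range strings.ToLower(s) {
		switch {
		case (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9'):
			b.WriteRune(r)
			hyphen = false
		case b.Len() > 0 && !hyphen:
			// Any other rune becomes a single separating hyphen.
			b.WriteRune('-')
			hyphen = true
		}
	}
	slug := b.String()
	if len(slug) > 100 {
		slug = slug[:100]
	}
	return strings.TrimSuffix(slug, "-")
}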

func TestCreateArticleFromURL(t *testing.T) {
	tempDir := t.TempDir()

	t.Run("fails with invalid URL", func(t *testing.T) {
		_, err := CreateArticleFromURL("not-a-url", tempDir)
		if err == nil {
			t.Error("Expected error for invalid URL")
		}
	})

	t.Run("fails with unsupported domain", func(t *testing.T) {
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.WriteHeader(http.StatusOK)
			w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
		}))
		defer server.Close()

		_, err := CreateArticleFromURL(server.URL, tempDir)
		if err == nil {
			t.Error("Expected error for unsupported domain")
		}
	})

	t.Run("successfully creates article from Wikipedia-like URL", func(t *testing.T) {
		wikipediaHTML := `<html>
			<head><title>Integration Test Article</title></head>
			<body>
				<h1 id="firstHeading">Integration Test Article</h1>
				<div id="bodyContent">
					<p>This is integration test content.</p>
				</div>
			</body>
		</html>`

		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.WriteHeader(http.StatusOK)
			w.Write([]byte(wikipediaHTML))
		}))
		defer server.Close()

		// CreateArticleFromURL constructs its own parser via NewArticleParser,
		// so we cannot inject the test client or a localhost rule into it.
		// Instead, exercise the components it composes individually.
		parser, err := NewArticleParser(server.Client())
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		// Add a localhost rule for testing.
		localhostRule := &ParsingRule{
			Domain: "127.0.0.1",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("127.0.0.1", localhostRule)

		content, err := parser.ParseURL(server.URL)
		if err != nil {
			t.Fatalf("Expected no error, got %v", err)
		}

		mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
		if err != nil {
			t.Fatalf("Failed to save article: %v", err)
		}

		if content.Title != "Integration Test Article" {
			t.Errorf("Expected title 'Integration Test Article', got %s", content.Title)
		}
		if mdPath == "" {
			t.Error("Expected non-empty markdown path")
		}
		if htmlPath == "" {
			t.Error("Expected non-empty HTML path")
		}

		// Check that both files exist on disk.
		if _, err := os.Stat(mdPath); os.IsNotExist(err) {
			t.Error("Expected markdown file to exist")
		}
		if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
			t.Error("Expected HTML file to exist")
		}
	})
}