package articles

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"os"
	"strings"
	"testing"
)

// ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules.
//
// It reads examples/christopher-lloyd.html, converts it with the
// ".wikipedia.org" rule and prints up to the first four front-matter lines,
// skipping the "**Saved:**" line.
func ExampleParser_Convert() {
	parser, err := NewArticleParser(http.DefaultClient)
	if err != nil {
		fmt.Printf("Failed to create parser: %v\n", err)
		return
	}

	htmlPath := "examples/christopher-lloyd.html"
	htmlContent, err := os.ReadFile(htmlPath)
	if err != nil {
		fmt.Printf("Local HTML file not found: %v\n", err)
		return
	}

	markdown, err := parser.Convert(string(htmlContent), ".wikipedia.org", "https://en.wikipedia.org/wiki/Christopher_Lloyd")
	if err != nil {
		fmt.Printf("Failed to convert HTML: %v\n", err)
		return
	}

	// Everything before the first "---" separator is the front matter.
	parts := strings.Split(markdown, "\n---\n")
	if len(parts) > 0 {
		frontmatter := strings.TrimSpace(parts[0])
		lines := strings.Split(frontmatter, "\n")
		for i, line := range lines {
			if i >= 4 {
				break
			}
			// The saved timestamp changes on every run, so it is left out of
			// the compared output.
			if !strings.Contains(line, "**Saved:**") {
				fmt.Println(line)
			}
		}
	}

	// Output: # Christopher Lloyd
	//
	// **Source:** https://en.wikipedia.org/wiki/Christopher_Lloyd
}

// TestArticleParser exercises ArticleParser: construction, rule parsing,
// slugify, Convert, ParseURL, SaveArticle, createHTML and createMarkdown.
func TestArticleParser(t *testing.T) {
	t.Run("New", func(t *testing.T) {
		t.Run("successfully creates parser", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if parser == nil {
				t.Fatal("Expected parser to be created, got nil")
			}

			if len(parser.rules) == 0 {
				t.Error("Expected rules to be loaded")
			}
		})

		t.Run("loads expected domains", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Failed to create parser: %v", err)
			}

			domains := parser.GetSupportedDomains()
			expectedDomains := []string{".wikipedia.org", "arxiv.org", "baseballprospectus.com"}

			if len(domains) != len(expectedDomains) {
				t.Errorf("Expected %d domains, got %d", len(expectedDomains), len(domains))
			}

			// Compare as a set; the order of domains is not asserted.
			domainMap := make(map[string]bool)
			for _, domain := range domains {
				domainMap[domain] = true
			}

			for _, expected := range expectedDomains {
				if !domainMap[expected] {
					t.Errorf("Expected domain %s not found in supported domains", expected)
				}
			}
		})
	})

	t.Run("parseRules", func(t *testing.T) {
		parser := &ArticleParser{rules: make(map[string]*ParsingRule)}

		t.Run("parses valid rule file", func(t *testing.T) {
			// One directive per line; the assertions below (title "//h1",
			// two strip rules, one test URL) depend on that layout.
			content := `title: //h1
author: //span[@class='author']
date: //time
body: //article
strip: //nav
strip: //footer
test_url: https://example.com/article`

			rule, err := parser.parseRules("example.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Domain != "example.com" {
				t.Errorf("Expected domain 'example.com', got %s", rule.Domain)
			}
			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Author != "//span[@class='author']" {
				t.Errorf("Expected author '//span[@class='author']', got %s", rule.Author)
			}
			if len(rule.Strip) != 2 {
				t.Errorf("Expected 2 strip rules, got %d", len(rule.Strip))
			}
			if len(rule.TestURLs) != 1 {
				t.Errorf("Expected 1 test URL, got %d", len(rule.TestURLs))
			}
		})

		t.Run("handles empty lines and comments", func(t *testing.T) {
			content := `# This is a comment
title: //h1

# Another comment
body: //article
`

			rule, err := parser.parseRules("test.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Body != "//article" {
				t.Errorf("Expected body '//article', got %s", rule.Body)
			}
		})
	})

	t.Run("slugify", func(t *testing.T) {
		parser := &ArticleParser{}

		testCases := []struct {
			input    string
			expected string
		}{
			{"Simple Title", "simple-title"},
			{"Title with Numbers 123", "title-with-numbers-123"},
			{"Title-with-Hyphens", "title-with-hyphens"},
			{"Title with Spaces and Multiple Spaces", "title-with-spaces-and-multiple-spaces"},
			{"Title!@#$%^&*()with Special Characters", "title-with-special-characters"},
			{"", ""},
			// 150 input characters are expected to come back as 100.
			{strings.Repeat("a", 150), strings.Repeat("a", 100)},
		}

		for _, tc := range testCases {
			t.Run(fmt.Sprintf("slugify '%s'", tc.input), func(t *testing.T) {
				result := parser.slugify(tc.input)
				if result != tc.expected {
					t.Errorf("Expected '%s', got '%s'", tc.expected, result)
				}
			})
		}
	})

	t.Run("Convert", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		t.Run("fails with unsupported domain", func(t *testing.T) {
			htmlContent := "Test\n\nContent\n\n"

			_, err := parser.Convert(htmlContent, "unsupported.com", "https://unsupported.com/article")
			// Fatal, not Error: err.Error() below would panic on a nil error.
			if err == nil {
				t.Fatal("Expected error for unsupported domain")
			}

			if !strings.Contains(err.Error(), "no parsing rule found") {
				t.Errorf("Expected 'no parsing rule found' error, got %v", err)
			}
		})

		t.Run("fails with invalid HTML", func(t *testing.T) {
			invalidHTML := "Test</head></body>"

			_, err := parser.Convert(invalidHTML, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")
			if err == nil {
				t.Error("Expected error for invalid HTML")
			}
		})

		t.Run("fails when no title extracted", func(t *testing.T) {
			htmlContent := "<html><head><title>Test\n\nContent\n\n"

			_, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")
			// Fatal, not Error: err.Error() below would panic on a nil error.
			if err == nil {
				t.Fatal("Expected error when no title can be extracted")
			}

			if !strings.Contains(err.Error(), "could not extract title") {
				t.Errorf("Expected 'could not extract title' error, got %v", err)
			}
		})

		t.Run("successfully converts valid Wikipedia HTML", func(t *testing.T) {
			// NOTE(review): the element ids/classes mirror the localhost rule
			// used in the ParseURL subtest (firstHeading, bodyContent,
			// noprint); confirm they match the bundled .wikipedia.org rule.
			htmlContent := `<html>
<head><title>Test Article</title></head>
<body>
<h1 id="firstHeading">Test Article Title</h1>
<div id="bodyContent">
<p>This is the main content of the article.</p>
<div class="noprint">This should be stripped</div>
<p>More content here.</p>
</div>
</body>
</html>`

			markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if !strings.Contains(markdown, "# Test Article Title") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Source:** https://en.wikipedia.org/wiki/Test") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "This is the main content") {
				t.Error("Expected markdown to contain article content")
			}
			if strings.Contains(markdown, "This should be stripped") {
				t.Error("Expected stripped content to be removed from markdown")
			}
		})
	})

	t.Run("ParseURL", func(t *testing.T) {
		// The handler picks its response from the request path so one server
		// can serve every subtest.
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			switch {
			case strings.Contains(r.URL.Path, "404"):
				w.WriteHeader(http.StatusNotFound)
			case strings.Contains(r.URL.Path, "unsupported"):
				w.WriteHeader(http.StatusOK)
				w.Write([]byte("Test\n\nContent\n\n"))
			default:
				// Return Wikipedia-like structure for localhost rule
				w.WriteHeader(http.StatusOK)
				w.Write([]byte(`<html>
<head><title>Test Article</title></head>
<body>
<h1 id="firstHeading">Test Wikipedia Article</h1>
<div id="bodyContent">
<p>This is the article content.</p>
<div class="noprint">This gets stripped</div>
</div>
</body>
</html>
`))
			}
		}))
		defer server.Close()

		parser, err := NewArticleParser(server.Client())
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		// Register a rule for the test server's host.
		localhostRule := &ParsingRule{
			Domain: "127.0.0.1",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("127.0.0.1", localhostRule)

		t.Run("fails with invalid URL", func(t *testing.T) {
			_, err := parser.ParseURL("not-a-url")
			// Fatal, not Error: err.Error() below would panic on a nil error.
			if err == nil {
				t.Fatal("Expected error for invalid URL")
			}

			if !strings.Contains(err.Error(), "unsupported protocol scheme") {
				t.Errorf("Expected 'unsupported protocol scheme' error, got %v", err)
			}
		})

		t.Run("fails with unsupported domain", func(t *testing.T) {
			_, err := parser.ParseURL(server.URL + "/unsupported.com")
			if err == nil {
				t.Error("Expected error for unsupported domain")
			}
		})

		t.Run("fails with HTTP error", func(t *testing.T) {
			_, err := parser.ParseURL(server.URL + "/404/en.wikipedia.org/wiki/test")
			if err == nil {
				t.Error("Expected error for HTTP 404")
			}
		})
	})

	t.Run("SaveArticle", func(t *testing.T) {
		parser := &ArticleParser{}
		tempDir := t.TempDir()

		content := &ParsedContent{
			Title:   "Test Article",
			Author:  "Test Author",
			Date:    "2023-01-01",
			Content: "This is test content.",
			URL:     "https://example.com/test",
		}

		t.Run("successfully saves article", func(t *testing.T) {
			mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			// Any Stat failure, not only "does not exist", means the file is
			// unusable.
			if _, err := os.Stat(mdPath); err != nil {
				t.Errorf("Expected markdown file to exist: %v", err)
			}
			if _, err := os.Stat(htmlPath); err != nil {
				t.Errorf("Expected HTML file to exist: %v", err)
			}

			mdContent, err := os.ReadFile(mdPath)
			if err != nil {
				t.Fatalf("Failed to read markdown file: %v", err)
			}

			if !strings.Contains(string(mdContent), "# Test Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(string(mdContent), "**Author:** Test Author") {
				t.Error("Expected markdown to contain author")
			}

			htmlContentBytes, err := os.ReadFile(htmlPath)
			if err != nil {
				t.Fatalf("Failed to read HTML file: %v", err)
			}

			if !strings.Contains(string(htmlContentBytes), "Test Article") {
				t.Error("Expected HTML to contain title")
			}
		})

		t.Run("handles duplicate filenames", func(t *testing.T) {
			mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for first save, got %v", err)
			}

			mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for second save, got %v", err)
			}

			if mdPath1 == mdPath2 {
				t.Error("Expected different markdown paths for duplicate saves")
			}
			if htmlPath1 == htmlPath2 {
				t.Error("Expected different HTML paths for duplicate saves")
			}

			if _, err := os.Stat(mdPath1); err != nil {
				t.Errorf("Expected first markdown file to exist: %v", err)
			}
			if _, err := os.Stat(mdPath2); err != nil {
				t.Errorf("Expected second markdown file to exist: %v", err)
			}
		})

		t.Run("fails with invalid directory", func(t *testing.T) {
			invalidDir := "/nonexistent/directory"

			_, _, err := parser.SaveArticle(content, invalidDir)
			if err == nil {
				t.Error("Expected error for invalid directory")
			}
		})
	})

	t.Run("createHTML", func(t *testing.T) {
		parser := &ArticleParser{}
		content := &ParsedContent{
			Title:   "Test HTML Article",
			Author:  "HTML Author",
			Date:    "2023-12-25",
			Content: "This is **bold** content with *emphasis*.",
			URL:     "https://example.com/html-test",
		}

		t.Run("creates valid HTML", func(t *testing.T) {
			markdown := parser.createMarkdown(content)
			html := parser.createHTML(content, markdown)

			// Checking for "" would always pass; look for the declaration
			// itself, ignoring case.
			if !strings.Contains(strings.ToLower(html), "<!doctype") {
				t.Error("Expected HTML to contain DOCTYPE")
			}
			if !strings.Contains(html, "Test HTML Article") {
				t.Error("Expected HTML to contain title")
			}
			// NOTE(review): assumes the renderer emits <strong>/<em> for
			// **bold**/*emphasis* — confirm against createHTML.
			if !strings.Contains(html, "<strong>bold</strong>") {
				t.Error("Expected HTML to contain bold formatting")
			}
			if !strings.Contains(html, "<em>emphasis</em>") {
				t.Error("Expected HTML to contain emphasis formatting")
			}
		})
	})

	t.Run("createMarkdown", func(t *testing.T) {
		parser := &ArticleParser{}

		t.Run("creates markdown with all fields", func(t *testing.T) {
			content := &ParsedContent{
				Title:   "Full Content Article",
				Author:  "Complete Author",
				Date:    "2023-01-15",
				Content: "Complete article content here.",
				URL:     "https://example.com/full",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Full Content Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Author:** Complete Author") {
				t.Error("Expected markdown to contain author")
			}
			if !strings.Contains(markdown, "**Date:** 2023-01-15") {
				t.Error("Expected markdown to contain date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/full") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "**Saved:**") {
				t.Error("Expected markdown to contain saved timestamp")
			}
			if !strings.Contains(markdown, "---") {
				t.Error("Expected markdown to contain separator")
			}
			if !strings.Contains(markdown, "Complete article content here.") {
				t.Error("Expected markdown to contain article content")
			}
		})

		t.Run("creates markdown with minimal fields", func(t *testing.T) {
			content := &ParsedContent{
				Title:   "Minimal Article",
				Content: "Just content.",
				URL:     "https://example.com/minimal",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Minimal Article") {
				t.Error("Expected markdown to contain title")
			}
			if strings.Contains(markdown, "**Author:**") {
				t.Error("Expected no author field for empty author")
			}
			if strings.Contains(markdown, "**Date:**") {
				t.Error("Expected no date field for empty date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/minimal") {
				t.Error("Expected markdown to contain source URL")
			}
		})
	})
}

// TestCreateArticleFromURL covers the failure paths of CreateArticleFromURL
// and, via an explicitly configured parser, the fetch-parse-save flow against
// a local test server.
func TestCreateArticleFromURL(t *testing.T) {
	tempDir := t.TempDir()

	t.Run("fails with invalid URL", func(t *testing.T) {
		_, err := CreateArticleFromURL("not-a-url", tempDir)
		if err == nil {
			t.Error("Expected error for invalid URL")
		}
	})

	t.Run("fails with unsupported domain", func(t *testing.T) {
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.WriteHeader(http.StatusOK)
			w.Write([]byte("Test\n\nContent\n\n"))
		}))
		defer server.Close()

		_, err := CreateArticleFromURL(server.URL, tempDir)
		if err == nil {
			t.Error("Expected error for unsupported domain")
		}
	})

	t.Run("successfully creates article from Wikipedia-like URL", func(t *testing.T) {
		// Markup matches the XPath expressions of localhostRule below.
		wikipediaHTML := `<html>
<head><title>Integration Test Article</title></head>
<body>
<h1 id="firstHeading">Integration Test Article</h1>
<div id="bodyContent">
<p>This is integration test content.</p>
</div>
</body>
</html>
`

		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.WriteHeader(http.StatusOK)
			w.Write([]byte(wikipediaHTML))
		}))
		defer server.Close()

		// We need to patch the CreateArticleFromURL function to use our test client and rules
		// For now, let's test the components individually since CreateArticleFromURL uses NewArticleParser internally
		parser, err := NewArticleParser(server.Client())
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		// Add localhost rule for testing
		localhostRule := &ParsingRule{
			Domain: "127.0.0.1",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("127.0.0.1", localhostRule)

		content, err := parser.ParseURL(server.URL)
		if err != nil {
			t.Fatalf("Expected no error, got %v", err)
		}

		mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
		if err != nil {
			t.Fatalf("Failed to save article: %v", err)
		}

		if content.Title != "Integration Test Article" {
			t.Errorf("Expected title 'Integration Test Article', got %s", content.Title)
		}
		if mdPath == "" {
			t.Error("Expected non-empty markdown path")
		}
		if htmlPath == "" {
			t.Error("Expected non-empty HTML path")
		}

		// Check files exist
		if _, err := os.Stat(mdPath); err != nil {
			t.Errorf("Expected markdown file to exist: %v", err)
		}
		if _, err := os.Stat(htmlPath); err != nil {
			t.Errorf("Expected HTML file to exist: %v", err)
		}
	})
}