package articles
import (
"fmt"
"net/http"
"net/http/httptest"
"os"
"strings"
"testing"
)
// ExampleParser_Convert shows how a saved Wikipedia page on disk is turned
// into markdown with the ".wikipedia.org" rule set. Only the header that
// precedes the first "---" separator is printed, limited to its first four
// lines; any line carrying the "**Saved:**" marker is left out so the
// expected output below stays fixed (presumably that line holds a timestamp).
func ExampleParser_Convert() {
	p, err := NewArticleParser(http.DefaultClient)
	if err != nil {
		fmt.Printf("Failed to create parser: %v\n", err)
		return
	}

	raw, err := os.ReadFile("examples/christopher-lloyd.html")
	if err != nil {
		fmt.Printf("Local HTML file not found: %v\n", err)
		return
	}

	md, err := p.Convert(string(raw), ".wikipedia.org", "https://en.wikipedia.org/wiki/Christopher_Lloyd")
	if err != nil {
		fmt.Printf("Failed to convert HTML: %v\n", err)
		return
	}

	// Everything before the first separator is the header section.
	header := strings.SplitN(md, "\n---\n", 2)[0]
	headerLines := strings.Split(strings.TrimSpace(header), "\n")
	if len(headerLines) > 4 {
		headerLines = headerLines[:4]
	}
	for _, l := range headerLines {
		if strings.Contains(l, "**Saved:**") {
			continue
		}
		fmt.Println(l)
	}
	// Output: # Christopher Lloyd
	//
	// **Source:** https://en.wikipedia.org/wiki/Christopher_Lloyd
}
func TestArticleParser(t *testing.T) {
t.Run("New", func(t *testing.T) {
t.Run("successfully creates parser", func(t *testing.T) {
parser, err := NewArticleParser(http.DefaultClient)
if err != nil {
t.Fatalf("Expected no error, got %v", err)
}
if parser == nil {
t.Fatal("Expected parser to be created, got nil")
}
if len(parser.rules) == 0 {
t.Error("Expected rules to be loaded")
}
})
t.Run("loads expected domains", func(t *testing.T) {
parser, err := NewArticleParser(http.DefaultClient)
if err != nil {
t.Fatalf("Failed to create parser: %v", err)
}
domains := parser.GetSupportedDomains()
expectedDomains := []string{".wikipedia.org", "arxiv.org", "baseballprospectus.com"}
if len(domains) != len(expectedDomains) {
t.Errorf("Expected %d domains, got %d", len(expectedDomains), len(domains))
}
domainMap := make(map[string]bool)
for _, domain := range domains {
domainMap[domain] = true
}
for _, expected := range expectedDomains {
if !domainMap[expected] {
t.Errorf("Expected domain %s not found in supported domains", expected)
}
}
})
})
t.Run("parseRules", func(t *testing.T) {
parser := &ArticleParser{rules: make(map[string]*ParsingRule)}
t.Run("parses valid rule file", func(t *testing.T) {
content := `title: //h1
author: //span[@class='author']
date: //time
body: //article
strip: //nav
strip: //footer
test_url: https://example.com/article`
rule, err := parser.parseRules("example.com", content)
if err != nil {
t.Fatalf("Expected no error, got %v", err)
}
if rule.Domain != "example.com" {
t.Errorf("Expected domain 'example.com', got %s", rule.Domain)
}
if rule.Title != "//h1" {
t.Errorf("Expected title '//h1', got %s", rule.Title)
}
if rule.Author != "//span[@class='author']" {
t.Errorf("Expected author '//span[@class='author']', got %s", rule.Author)
}
if len(rule.Strip) != 2 {
t.Errorf("Expected 2 strip rules, got %d", len(rule.Strip))
}
if len(rule.TestURLs) != 1 {
t.Errorf("Expected 1 test URL, got %d", len(rule.TestURLs))
}
})
t.Run("handles empty lines and comments", func(t *testing.T) {
content := `# This is a comment
title: //h1
# Another comment
body: //article
`
rule, err := parser.parseRules("test.com", content)
if err != nil {
t.Fatalf("Expected no error, got %v", err)
}
if rule.Title != "//h1" {
t.Errorf("Expected title '//h1', got %s", rule.Title)
}
if rule.Body != "//article" {
t.Errorf("Expected body '//article', got %s", rule.Body)
}
})
})
t.Run("slugify", func(t *testing.T) {
parser := &ArticleParser{}
testCases := []struct {
input string
expected string
}{
{"Simple Title", "simple-title"},
{"Title with Numbers 123", "title-with-numbers-123"},
{"Title-with-Hyphens", "title-with-hyphens"},
{"Title with Spaces and Multiple Spaces", "title-with-spaces-and-multiple-spaces"},
{"Title!@#$%^&*()with Special Characters", "title-with-special-characters"},
{"", ""},
{strings.Repeat("a", 150), strings.Repeat("a", 100)},
}
for _, tc := range testCases {
t.Run(fmt.Sprintf("slugify '%s'", tc.input), func(t *testing.T) {
result := parser.slugify(tc.input)
if result != tc.expected {
t.Errorf("Expected '%s', got '%s'", tc.expected, result)
}
})
}
})
t.Run("Convert", func(t *testing.T) {
parser, err := NewArticleParser(http.DefaultClient)
if err != nil {
t.Fatalf("Failed to create parser: %v", err)
}
t.Run("fails with unsupported domain", func(t *testing.T) {
htmlContent := "
TestContent
"
_, err := parser.Convert(htmlContent, "unsupported.com", "https://unsupported.com/article")
if err == nil {
t.Error("Expected error for unsupported domain")
}
if !strings.Contains(err.Error(), "no parsing rule found") {
t.Errorf("Expected 'no parsing rule found' error, got %v", err)
}
})
t.Run("fails with invalid HTML", func(t *testing.T) {
invalidHTML := "Test"
_, err := parser.Convert(invalidHTML, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")
if err == nil {
t.Error("Expected error for invalid HTML")
}
})
t.Run("fails when no title extracted", func(t *testing.T) {
htmlContent := "TestContent
"
_, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")
if err == nil {
t.Error("Expected error when no title can be extracted")
}
if !strings.Contains(err.Error(), "could not extract title") {
t.Errorf("Expected 'could not extract title' error, got %v", err)
}
})
t.Run("successfully converts valid Wikipedia HTML", func(t *testing.T) {
htmlContent := `
Test Article
Test Article Title
This is the main content of the article.
This should be stripped
More content here.
`
markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")
if err != nil {
t.Fatalf("Expected no error, got %v", err)
}
if !strings.Contains(markdown, "# Test Article Title") {
t.Error("Expected markdown to contain title")
}
if !strings.Contains(markdown, "**Source:** https://en.wikipedia.org/wiki/Test") {
t.Error("Expected markdown to contain source URL")
}
if !strings.Contains(markdown, "This is the main content") {
t.Error("Expected markdown to contain article content")
}
if strings.Contains(markdown, "This should be stripped") {
t.Error("Expected stripped content to be removed from markdown")
}
})
})
t.Run("ParseURL", func(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch {
case strings.Contains(r.URL.Path, "404"):
w.WriteHeader(http.StatusNotFound)
case strings.Contains(r.URL.Path, "unsupported"):
w.WriteHeader(http.StatusOK)
w.Write([]byte("TestContent
"))
default:
// Return Wikipedia-like structure for localhost rule
w.WriteHeader(http.StatusOK)
w.Write([]byte(`
Test Article
Test Wikipedia Article
This is the article content.
This gets stripped
`))
}
}))
defer server.Close()
parser, err := NewArticleParser(server.Client())
if err != nil {
t.Fatalf("Failed to create parser: %v", err)
}
localhostRule := &ParsingRule{
Domain: "127.0.0.1",
Title: "//h1[@id='firstHeading']",
Body: "//div[@id='bodyContent']",
Strip: []string{"//div[@class='noprint']"},
}
parser.AddRule("127.0.0.1", localhostRule)
t.Run("fails with invalid URL", func(t *testing.T) {
_, err := parser.ParseURL("not-a-url")
if err == nil {
t.Error("Expected error for invalid URL")
}
if !strings.Contains(err.Error(), "unsupported protocol scheme") {
t.Errorf("Expected 'unsupported protocol scheme' error, got %v", err)
}
})
t.Run("fails with unsupported domain", func(t *testing.T) {
_, err := parser.ParseURL(server.URL + "/unsupported.com")
if err == nil {
t.Error("Expected error for unsupported domain")
}
})
t.Run("fails with HTTP error", func(t *testing.T) {
_, err := parser.ParseURL(server.URL + "/404/en.wikipedia.org/wiki/test")
if err == nil {
t.Error("Expected error for HTTP 404")
}
})
})
t.Run("SaveArticle", func(t *testing.T) {
parser := &ArticleParser{}
tempDir := t.TempDir()
content := &ParsedContent{
Title: "Test Article",
Author: "Test Author",
Date: "2023-01-01",
Content: "This is test content.",
URL: "https://example.com/test",
}
t.Run("successfully saves article", func(t *testing.T) {
mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
if err != nil {
t.Fatalf("Expected no error, got %v", err)
}
if _, err := os.Stat(mdPath); os.IsNotExist(err) {
t.Error("Expected markdown file to exist")
}
if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
t.Error("Expected HTML file to exist")
}
mdContent, err := os.ReadFile(mdPath)
if err != nil {
t.Fatalf("Failed to read markdown file: %v", err)
}
if !strings.Contains(string(mdContent), "# Test Article") {
t.Error("Expected markdown to contain title")
}
if !strings.Contains(string(mdContent), "**Author:** Test Author") {
t.Error("Expected markdown to contain author")
}
htmlContentBytes, err := os.ReadFile(htmlPath)
if err != nil {
t.Fatalf("Failed to read HTML file: %v", err)
}
if !strings.Contains(string(htmlContentBytes), "Test Article") {
t.Error("Expected HTML to contain title")
}
})
t.Run("handles duplicate filenames", func(t *testing.T) {
mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir)
if err != nil {
t.Fatalf("Expected no error for first save, got %v", err)
}
mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir)
if err != nil {
t.Fatalf("Expected no error for second save, got %v", err)
}
if mdPath1 == mdPath2 {
t.Error("Expected different markdown paths for duplicate saves")
}
if htmlPath1 == htmlPath2 {
t.Error("Expected different HTML paths for duplicate saves")
}
if _, err := os.Stat(mdPath1); os.IsNotExist(err) {
t.Error("Expected first markdown file to exist")
}
if _, err := os.Stat(mdPath2); os.IsNotExist(err) {
t.Error("Expected second markdown file to exist")
}
})
t.Run("fails with invalid directory", func(t *testing.T) {
invalidDir := "/nonexistent/directory"
_, _, err := parser.SaveArticle(content, invalidDir)
if err == nil {
t.Error("Expected error for invalid directory")
}
})
})
t.Run("createHTML", func(t *testing.T) {
parser := &ArticleParser{}
content := &ParsedContent{
Title: "Test HTML Article",
Author: "HTML Author",
Date: "2023-12-25",
Content: "This is **bold** content with *emphasis*.",
URL: "https://example.com/html-test",
}
t.Run("creates valid HTML", func(t *testing.T) {
markdown := parser.createMarkdown(content)
html := parser.createHTML(content, markdown)
if !strings.Contains(html, "") {
t.Error("Expected HTML to contain DOCTYPE")
}
if !strings.Contains(html, "Test HTML Article") {
t.Error("Expected HTML to contain title")
}
if !strings.Contains(html, "bold") {
t.Error("Expected HTML to contain bold formatting")
}
if !strings.Contains(html, "emphasis") {
t.Error("Expected HTML to contain emphasis formatting")
}
})
})
t.Run("createMarkdown", func(t *testing.T) {
parser := &ArticleParser{}
t.Run("creates markdown with all fields", func(t *testing.T) {
content := &ParsedContent{
Title: "Full Content Article",
Author: "Complete Author",
Date: "2023-01-15",
Content: "Complete article content here.",
URL: "https://example.com/full",
}
markdown := parser.createMarkdown(content)
if !strings.Contains(markdown, "# Full Content Article") {
t.Error("Expected markdown to contain title")
}
if !strings.Contains(markdown, "**Author:** Complete Author") {
t.Error("Expected markdown to contain author")
}
if !strings.Contains(markdown, "**Date:** 2023-01-15") {
t.Error("Expected markdown to contain date")
}
if !strings.Contains(markdown, "**Source:** https://example.com/full") {
t.Error("Expected markdown to contain source URL")
}
if !strings.Contains(markdown, "**Saved:**") {
t.Error("Expected markdown to contain saved timestamp")
}
if !strings.Contains(markdown, "---") {
t.Error("Expected markdown to contain separator")
}
if !strings.Contains(markdown, "Complete article content here.") {
t.Error("Expected markdown to contain article content")
}
})
t.Run("creates markdown with minimal fields", func(t *testing.T) {
content := &ParsedContent{
Title: "Minimal Article",
Content: "Just content.",
URL: "https://example.com/minimal",
}
markdown := parser.createMarkdown(content)
if !strings.Contains(markdown, "# Minimal Article") {
t.Error("Expected markdown to contain title")
}
if strings.Contains(markdown, "**Author:**") {
t.Error("Expected no author field for empty author")
}
if strings.Contains(markdown, "**Date:**") {
t.Error("Expected no date field for empty date")
}
if !strings.Contains(markdown, "**Source:** https://example.com/minimal") {
t.Error("Expected markdown to contain source URL")
}
})
})
}
func TestCreateArticleFromURL(t *testing.T) {
tempDir := t.TempDir()
t.Run("fails with invalid URL", func(t *testing.T) {
_, err := CreateArticleFromURL("not-a-url", tempDir)
if err == nil {
t.Error("Expected error for invalid URL")
}
})
t.Run("fails with unsupported domain", func(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
w.Write([]byte("Test
Content
"))
}))
defer server.Close()
_, err := CreateArticleFromURL(server.URL, tempDir)
if err == nil {
t.Error("Expected error for unsupported domain")
}
})
t.Run("successfully creates article from Wikipedia-like URL", func(t *testing.T) {
wikipediaHTML := `
Integration Test Article
Integration Test Article
This is integration test content.
`
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
w.Write([]byte(wikipediaHTML))
}))
defer server.Close()
// We need to patch the CreateArticleFromURL function to use our test client and rules
// For now, let's test the components individually since CreateArticleFromURL uses NewArticleParser internally
parser, err := NewArticleParser(server.Client())
if err != nil {
t.Fatalf("Failed to create parser: %v", err)
}
// Add localhost rule for testing
localhostRule := &ParsingRule{
Domain: "127.0.0.1",
Title: "//h1[@id='firstHeading']",
Body: "//div[@id='bodyContent']",
Strip: []string{"//div[@class='noprint']"},
}
parser.AddRule("127.0.0.1", localhostRule)
content, err := parser.ParseURL(server.URL)
if err != nil {
t.Fatalf("Expected no error, got %v", err)
}
mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
if err != nil {
t.Fatalf("Failed to save article: %v", err)
}
if content.Title != "Integration Test Article" {
t.Errorf("Expected title 'Integration Test Article', got %s", content.Title)
}
if mdPath == "" {
t.Error("Expected non-empty markdown path")
}
if htmlPath == "" {
t.Error("Expected non-empty HTML path")
}
// Check files exist
if _, err := os.Stat(mdPath); os.IsNotExist(err) {
t.Error("Expected markdown file to exist")
}
if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
t.Error("Expected HTML file to exist")
}
})
}