cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍈
charm leaflet readability golang
29
fork

Configure Feed

Select the types of activity you want to include in your feed.

at d2c4ecc217c2c1655026bba3b364c124b3a95a8a 571 lines 17 kB view raw
1package articles 2 3import ( 4 "fmt" 5 "net/http" 6 "net/http/httptest" 7 "os" 8 "strings" 9 "testing" 10) 11 12// ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules. 13func ExampleParser_Convert() { 14 parser, err := NewArticleParser(http.DefaultClient) 15 if err != nil { 16 fmt.Printf("Failed to create parser: %v\n", err) 17 return 18 } 19 20 htmlPath := "examples/christopher-lloyd.html" 21 htmlContent, err := os.ReadFile(htmlPath) 22 if err != nil { 23 fmt.Printf("Local HTML file not found: %v\n", err) 24 return 25 } 26 27 markdown, err := parser.Convert(string(htmlContent), ".wikipedia.org", "https://en.wikipedia.org/wiki/Christopher_Lloyd") 28 if err != nil { 29 fmt.Printf("Failed to convert HTML: %v\n", err) 30 return 31 } 32 33 parts := strings.Split(markdown, "\n---\n") 34 if len(parts) > 0 { 35 frontmatter := strings.TrimSpace(parts[0]) 36 lines := strings.Split(frontmatter, "\n") 37 38 for i, line := range lines { 39 if i >= 4 { 40 break 41 } 42 43 if !strings.Contains(line, "**Saved:**") { 44 fmt.Println(line) 45 } 46 } 47 } 48 49 // Output: # Christopher Lloyd 50 // 51 // **Source:** https://en.wikipedia.org/wiki/Christopher_Lloyd 52} 53 54func TestArticleParser(t *testing.T) { 55 t.Run("New", func(t *testing.T) { 56 t.Run("successfully creates parser", func(t *testing.T) { 57 parser, err := NewArticleParser(http.DefaultClient) 58 if err != nil { 59 t.Fatalf("Expected no error, got %v", err) 60 } 61 if parser == nil { 62 t.Fatal("Expected parser to be created, got nil") 63 } 64 if len(parser.rules) == 0 { 65 t.Error("Expected rules to be loaded") 66 } 67 }) 68 69 t.Run("loads expected domains", func(t *testing.T) { 70 parser, err := NewArticleParser(http.DefaultClient) 71 if err != nil { 72 t.Fatalf("Failed to create parser: %v", err) 73 } 74 75 domains := parser.GetSupportedDomains() 76 expectedDomains := []string{".wikipedia.org", "arxiv.org", "baseballprospectus.com"} 77 78 if len(domains) != len(expectedDomains) { 79 
t.Errorf("Expected %d domains, got %d", len(expectedDomains), len(domains)) 80 } 81 82 domainMap := make(map[string]bool) 83 for _, domain := range domains { 84 domainMap[domain] = true 85 } 86 87 for _, expected := range expectedDomains { 88 if !domainMap[expected] { 89 t.Errorf("Expected domain %s not found in supported domains", expected) 90 } 91 } 92 }) 93 }) 94 95 t.Run("parseRules", func(t *testing.T) { 96 parser := &ArticleParser{rules: make(map[string]*ParsingRule)} 97 98 t.Run("parses valid rule file", func(t *testing.T) { 99 content := `title: //h1 100author: //span[@class='author'] 101date: //time 102body: //article 103strip: //nav 104strip: //footer 105test_url: https://example.com/article` 106 107 rule, err := parser.parseRules("example.com", content) 108 if err != nil { 109 t.Fatalf("Expected no error, got %v", err) 110 } 111 112 if rule.Domain != "example.com" { 113 t.Errorf("Expected domain 'example.com', got %s", rule.Domain) 114 } 115 if rule.Title != "//h1" { 116 t.Errorf("Expected title '//h1', got %s", rule.Title) 117 } 118 if rule.Author != "//span[@class='author']" { 119 t.Errorf("Expected author '//span[@class='author']', got %s", rule.Author) 120 } 121 if len(rule.Strip) != 2 { 122 t.Errorf("Expected 2 strip rules, got %d", len(rule.Strip)) 123 } 124 if len(rule.TestURLs) != 1 { 125 t.Errorf("Expected 1 test URL, got %d", len(rule.TestURLs)) 126 } 127 }) 128 129 t.Run("handles empty lines and comments", func(t *testing.T) { 130 content := `# This is a comment 131title: //h1 132 133# Another comment 134body: //article 135` 136 137 rule, err := parser.parseRules("test.com", content) 138 if err != nil { 139 t.Fatalf("Expected no error, got %v", err) 140 } 141 142 if rule.Title != "//h1" { 143 t.Errorf("Expected title '//h1', got %s", rule.Title) 144 } 145 if rule.Body != "//article" { 146 t.Errorf("Expected body '//article', got %s", rule.Body) 147 } 148 }) 149 }) 150 151 t.Run("slugify", func(t *testing.T) { 152 parser := &ArticleParser{} 153 
154 testCases := []struct { 155 input string 156 expected string 157 }{ 158 {"Simple Title", "simple-title"}, 159 {"Title with Numbers 123", "title-with-numbers-123"}, 160 {"Title-with-Hyphens", "title-with-hyphens"}, 161 {"Title with Spaces and Multiple Spaces", "title-with-spaces-and-multiple-spaces"}, 162 {"Title!@#$%^&*()with Special Characters", "title-with-special-characters"}, 163 {"", ""}, 164 {strings.Repeat("a", 150), strings.Repeat("a", 100)}, 165 } 166 167 for _, tc := range testCases { 168 t.Run(fmt.Sprintf("slugify '%s'", tc.input), func(t *testing.T) { 169 result := parser.slugify(tc.input) 170 if result != tc.expected { 171 t.Errorf("Expected '%s', got '%s'", tc.expected, result) 172 } 173 }) 174 } 175 }) 176 177 t.Run("Convert", func(t *testing.T) { 178 parser, err := NewArticleParser(http.DefaultClient) 179 if err != nil { 180 t.Fatalf("Failed to create parser: %v", err) 181 } 182 183 t.Run("fails with unsupported domain", func(t *testing.T) { 184 htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>" 185 _, err := parser.Convert(htmlContent, "unsupported.com", "https://unsupported.com/article") 186 187 if err == nil { 188 t.Error("Expected error for unsupported domain") 189 } 190 if !strings.Contains(err.Error(), "no parsing rule found") { 191 t.Errorf("Expected 'no parsing rule found' error, got %v", err) 192 } 193 }) 194 195 t.Run("fails with invalid HTML", func(t *testing.T) { 196 invalidHTML := "<html><head><title>Test</head></body>" 197 _, err := parser.Convert(invalidHTML, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test") 198 199 if err == nil { 200 t.Error("Expected error for invalid HTML") 201 } 202 }) 203 204 t.Run("fails when no title extracted", func(t *testing.T) { 205 htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>" 206 _, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test") 207 208 if err == nil { 209 
t.Error("Expected error when no title can be extracted") 210 } 211 if !strings.Contains(err.Error(), "could not extract title") { 212 t.Errorf("Expected 'could not extract title' error, got %v", err) 213 } 214 }) 215 216 t.Run("successfully converts valid Wikipedia HTML", func(t *testing.T) { 217 htmlContent := `<html> 218 <head><title>Test Article</title></head> 219 <body> 220 <h1 id="firstHeading">Test Article Title</h1> 221 <div id="bodyContent"> 222 <p>This is the main content of the article.</p> 223 <div class="noprint">This should be stripped</div> 224 <p>More content here.</p> 225 </div> 226 </body> 227 </html>` 228 229 markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test") 230 if err != nil { 231 t.Fatalf("Expected no error, got %v", err) 232 } 233 234 if !strings.Contains(markdown, "# Test Article Title") { 235 t.Error("Expected markdown to contain title") 236 } 237 if !strings.Contains(markdown, "**Source:** https://en.wikipedia.org/wiki/Test") { 238 t.Error("Expected markdown to contain source URL") 239 } 240 if !strings.Contains(markdown, "This is the main content") { 241 t.Error("Expected markdown to contain article content") 242 } 243 if strings.Contains(markdown, "This should be stripped") { 244 t.Error("Expected stripped content to be removed from markdown") 245 } 246 }) 247 }) 248 249 t.Run("ParseURL", func(t *testing.T) { 250 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 251 switch { 252 case strings.Contains(r.URL.Path, "404"): 253 w.WriteHeader(http.StatusNotFound) 254 case strings.Contains(r.URL.Path, "unsupported"): 255 w.WriteHeader(http.StatusOK) 256 w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>")) 257 default: 258 // Return Wikipedia-like structure for localhost rule 259 w.WriteHeader(http.StatusOK) 260 w.Write([]byte(`<html> 261 <head><title>Test Article</title></head> 262 <body> 263 <h1 
id="firstHeading">Test Wikipedia Article</h1> 264 <div id="bodyContent"> 265 <p>This is the article content.</p> 266 <div class="noprint">This gets stripped</div> 267 </div> 268 </body> 269 </html>`)) 270 } 271 })) 272 defer server.Close() 273 274 parser, err := NewArticleParser(server.Client()) 275 if err != nil { 276 t.Fatalf("Failed to create parser: %v", err) 277 } 278 279 localhostRule := &ParsingRule{ 280 Domain: "127.0.0.1", 281 Title: "//h1[@id='firstHeading']", 282 Body: "//div[@id='bodyContent']", 283 Strip: []string{"//div[@class='noprint']"}, 284 } 285 parser.AddRule("127.0.0.1", localhostRule) 286 287 t.Run("fails with invalid URL", func(t *testing.T) { 288 _, err := parser.ParseURL("not-a-url") 289 if err == nil { 290 t.Error("Expected error for invalid URL") 291 } 292 if !strings.Contains(err.Error(), "unsupported protocol scheme") { 293 t.Errorf("Expected 'unsupported protocol scheme' error, got %v", err) 294 } 295 }) 296 297 t.Run("fails with unsupported domain", func(t *testing.T) { 298 _, err := parser.ParseURL(server.URL + "/unsupported.com") 299 if err == nil { 300 t.Error("Expected error for unsupported domain") 301 } 302 }) 303 304 t.Run("fails with HTTP error", func(t *testing.T) { 305 _, err := parser.ParseURL(server.URL + "/404/en.wikipedia.org/wiki/test") 306 if err == nil { 307 t.Error("Expected error for HTTP 404") 308 } 309 }) 310 311 }) 312 313 t.Run("SaveArticle", func(t *testing.T) { 314 parser := &ArticleParser{} 315 tempDir := t.TempDir() 316 317 content := &ParsedContent{ 318 Title: "Test Article", 319 Author: "Test Author", 320 Date: "2023-01-01", 321 Content: "This is test content.", 322 URL: "https://example.com/test", 323 } 324 325 t.Run("successfully saves article", func(t *testing.T) { 326 mdPath, htmlPath, err := parser.SaveArticle(content, tempDir) 327 if err != nil { 328 t.Fatalf("Expected no error, got %v", err) 329 } 330 331 if _, err := os.Stat(mdPath); os.IsNotExist(err) { 332 t.Error("Expected markdown file to 
exist") 333 } 334 if _, err := os.Stat(htmlPath); os.IsNotExist(err) { 335 t.Error("Expected HTML file to exist") 336 } 337 338 mdContent, err := os.ReadFile(mdPath) 339 if err != nil { 340 t.Fatalf("Failed to read markdown file: %v", err) 341 } 342 if !strings.Contains(string(mdContent), "# Test Article") { 343 t.Error("Expected markdown to contain title") 344 } 345 if !strings.Contains(string(mdContent), "**Author:** Test Author") { 346 t.Error("Expected markdown to contain author") 347 } 348 349 htmlContentBytes, err := os.ReadFile(htmlPath) 350 if err != nil { 351 t.Fatalf("Failed to read HTML file: %v", err) 352 } 353 if !strings.Contains(string(htmlContentBytes), "<title>Test Article</title>") { 354 t.Error("Expected HTML to contain title") 355 } 356 }) 357 358 t.Run("handles duplicate filenames", func(t *testing.T) { 359 mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir) 360 if err != nil { 361 t.Fatalf("Expected no error for first save, got %v", err) 362 } 363 364 mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir) 365 if err != nil { 366 t.Fatalf("Expected no error for second save, got %v", err) 367 } 368 369 if mdPath1 == mdPath2 { 370 t.Error("Expected different markdown paths for duplicate saves") 371 } 372 if htmlPath1 == htmlPath2 { 373 t.Error("Expected different HTML paths for duplicate saves") 374 } 375 376 if _, err := os.Stat(mdPath1); os.IsNotExist(err) { 377 t.Error("Expected first markdown file to exist") 378 } 379 if _, err := os.Stat(mdPath2); os.IsNotExist(err) { 380 t.Error("Expected second markdown file to exist") 381 } 382 }) 383 384 t.Run("fails with invalid directory", func(t *testing.T) { 385 invalidDir := "/nonexistent/directory" 386 _, _, err := parser.SaveArticle(content, invalidDir) 387 if err == nil { 388 t.Error("Expected error for invalid directory") 389 } 390 }) 391 }) 392 393 t.Run("createHTML", func(t *testing.T) { 394 parser := &ArticleParser{} 395 content := &ParsedContent{ 396 Title: "Test HTML 
Article", 397 Author: "HTML Author", 398 Date: "2023-12-25", 399 Content: "This is **bold** content with *emphasis*.", 400 URL: "https://example.com/html-test", 401 } 402 403 t.Run("creates valid HTML", func(t *testing.T) { 404 markdown := parser.createMarkdown(content) 405 html := parser.createHTML(content, markdown) 406 407 if !strings.Contains(html, "<!DOCTYPE html>") { 408 t.Error("Expected HTML to contain DOCTYPE") 409 } 410 if !strings.Contains(html, "<title>Test HTML Article</title>") { 411 t.Error("Expected HTML to contain title") 412 } 413 if !strings.Contains(html, "<h1") || !strings.Contains(html, "Test HTML Article") { 414 t.Error("Expected HTML to contain h1 heading with title") 415 } 416 if !strings.Contains(html, "<strong>bold</strong>") { 417 t.Error("Expected HTML to contain bold formatting") 418 } 419 if !strings.Contains(html, "<em>emphasis</em>") { 420 t.Error("Expected HTML to contain emphasis formatting") 421 } 422 }) 423 }) 424 425 t.Run("createMarkdown", func(t *testing.T) { 426 parser := &ArticleParser{} 427 428 t.Run("creates markdown with all fields", func(t *testing.T) { 429 content := &ParsedContent{ 430 Title: "Full Content Article", 431 Author: "Complete Author", 432 Date: "2023-01-15", 433 Content: "Complete article content here.", 434 URL: "https://example.com/full", 435 } 436 437 markdown := parser.createMarkdown(content) 438 439 if !strings.Contains(markdown, "# Full Content Article") { 440 t.Error("Expected markdown to contain title") 441 } 442 if !strings.Contains(markdown, "**Author:** Complete Author") { 443 t.Error("Expected markdown to contain author") 444 } 445 if !strings.Contains(markdown, "**Date:** 2023-01-15") { 446 t.Error("Expected markdown to contain date") 447 } 448 if !strings.Contains(markdown, "**Source:** https://example.com/full") { 449 t.Error("Expected markdown to contain source URL") 450 } 451 if !strings.Contains(markdown, "**Saved:**") { 452 t.Error("Expected markdown to contain saved timestamp") 453 } 
454 if !strings.Contains(markdown, "---") { 455 t.Error("Expected markdown to contain separator") 456 } 457 if !strings.Contains(markdown, "Complete article content here.") { 458 t.Error("Expected markdown to contain article content") 459 } 460 }) 461 462 t.Run("creates markdown with minimal fields", func(t *testing.T) { 463 content := &ParsedContent{ 464 Title: "Minimal Article", 465 Content: "Just content.", 466 URL: "https://example.com/minimal", 467 } 468 469 markdown := parser.createMarkdown(content) 470 471 if !strings.Contains(markdown, "# Minimal Article") { 472 t.Error("Expected markdown to contain title") 473 } 474 if strings.Contains(markdown, "**Author:**") { 475 t.Error("Expected no author field for empty author") 476 } 477 if strings.Contains(markdown, "**Date:**") { 478 t.Error("Expected no date field for empty date") 479 } 480 if !strings.Contains(markdown, "**Source:** https://example.com/minimal") { 481 t.Error("Expected markdown to contain source URL") 482 } 483 }) 484 }) 485} 486 487func TestCreateArticleFromURL(t *testing.T) { 488 tempDir := t.TempDir() 489 490 t.Run("fails with invalid URL", func(t *testing.T) { 491 _, err := CreateArticleFromURL("not-a-url", tempDir) 492 if err == nil { 493 t.Error("Expected error for invalid URL") 494 } 495 }) 496 497 t.Run("fails with unsupported domain", func(t *testing.T) { 498 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 499 w.WriteHeader(http.StatusOK) 500 w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>")) 501 })) 502 defer server.Close() 503 504 _, err := CreateArticleFromURL(server.URL, tempDir) 505 if err == nil { 506 t.Error("Expected error for unsupported domain") 507 } 508 }) 509 510 t.Run("successfully creates article from Wikipedia-like URL", func(t *testing.T) { 511 wikipediaHTML := `<html> 512 <head><title>Integration Test Article</title></head> 513 <body> 514 <h1 id="firstHeading">Integration Test 
Article</h1> 515 <div id="bodyContent"> 516 <p>This is integration test content.</p> 517 </div> 518 </body> 519 </html>` 520 521 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 522 w.WriteHeader(http.StatusOK) 523 w.Write([]byte(wikipediaHTML)) 524 })) 525 defer server.Close() 526 527 // We need to patch the CreateArticleFromURL function to use our test client and rules 528 // For now, let's test the components individually since CreateArticleFromURL uses NewArticleParser internally 529 parser, err := NewArticleParser(server.Client()) 530 if err != nil { 531 t.Fatalf("Failed to create parser: %v", err) 532 } 533 534 // Add localhost rule for testing 535 localhostRule := &ParsingRule{ 536 Domain: "127.0.0.1", 537 Title: "//h1[@id='firstHeading']", 538 Body: "//div[@id='bodyContent']", 539 Strip: []string{"//div[@class='noprint']"}, 540 } 541 parser.AddRule("127.0.0.1", localhostRule) 542 543 content, err := parser.ParseURL(server.URL) 544 if err != nil { 545 t.Fatalf("Expected no error, got %v", err) 546 } 547 548 mdPath, htmlPath, err := parser.SaveArticle(content, tempDir) 549 if err != nil { 550 t.Fatalf("Failed to save article: %v", err) 551 } 552 553 if content.Title != "Integration Test Article" { 554 t.Errorf("Expected title 'Integration Test Article', got %s", content.Title) 555 } 556 if mdPath == "" { 557 t.Error("Expected non-empty markdown path") 558 } 559 if htmlPath == "" { 560 t.Error("Expected non-empty HTML path") 561 } 562 563 // Check files exist 564 if _, err := os.Stat(mdPath); os.IsNotExist(err) { 565 t.Error("Expected markdown file to exist") 566 } 567 if _, err := os.Stat(htmlPath); os.IsNotExist(err) { 568 t.Error("Expected HTML file to exist") 569 } 570 }) 571}