CLI + TUI to publish to Leaflet (WIP) & manage tasks, notes & watch/read lists 🍃
charm leaflet readability golang
29
fork

Configure Feed

Select the types of activity you want to include in your feed.

at b5d8851190fc9c722d4e5dcc977e58042fbdec81 880 lines 27 kB view raw
1package articles 2 3import ( 4 "fmt" 5 "net/http" 6 "net/http/httptest" 7 "os" 8 "strings" 9 "testing" 10 "time" 11 12 "github.com/stormlightlabs/noteleaf/internal/models" 13) 14 15func newServerWithHtml(h string) *httptest.Server { 16 return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 17 w.WriteHeader(http.StatusOK) 18 w.Write([]byte(h)) 19 })) 20} 21 22// ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules. 23func ExampleParser_Convert() { 24 parser, err := NewArticleParser(http.DefaultClient) 25 if err != nil { 26 fmt.Printf("Failed to create parser: %v\n", err) 27 return 28 } 29 30 htmlPath := "examples/christopher-lloyd.html" 31 htmlContent, err := os.ReadFile(htmlPath) 32 if err != nil { 33 fmt.Printf("Local HTML file not found: %v\n", err) 34 return 35 } 36 37 markdown, err := parser.Convert(string(htmlContent), ".wikipedia.org", "https://en.wikipedia.org/wiki/Christopher_Lloyd") 38 if err != nil { 39 fmt.Printf("Failed to convert HTML: %v\n", err) 40 return 41 } 42 43 parts := strings.Split(markdown, "\n---\n") 44 if len(parts) > 0 { 45 frontmatter := strings.TrimSpace(parts[0]) 46 lines := strings.Split(frontmatter, "\n") 47 48 for i, line := range lines { 49 if i >= 4 { 50 break 51 } 52 53 if !strings.Contains(line, "**Saved:**") { 54 fmt.Println(line) 55 } 56 } 57 } 58 59 // Output: # Christopher Lloyd 60 // 61 // **Source:** https://en.wikipedia.org/wiki/Christopher_Lloyd 62} 63 64func TestArticleParser(t *testing.T) { 65 t.Run("New", func(t *testing.T) { 66 t.Run("successfully creates parser", func(t *testing.T) { 67 parser, err := NewArticleParser(http.DefaultClient) 68 if err != nil { 69 t.Fatalf("Expected no error, got %v", err) 70 } 71 if parser == nil { 72 t.Fatal("Expected parser to be created, got nil") 73 } 74 if len(parser.rules) == 0 { 75 t.Error("Expected rules to be loaded") 76 } 77 }) 78 79 t.Run("loads expected domains", func(t *testing.T) { 80 parser, err := 
NewArticleParser(http.DefaultClient) 81 if err != nil { 82 t.Fatalf("Failed to create parser: %v", err) 83 } 84 85 domains := parser.GetSupportedDomains() 86 expectedDomains := []string{".wikipedia.org", "arxiv.org", "baseballprospectus.com"} 87 88 if len(domains) != len(expectedDomains) { 89 t.Errorf("Expected %d domains, got %d", len(expectedDomains), len(domains)) 90 } 91 92 domainMap := make(map[string]bool) 93 for _, domain := range domains { 94 domainMap[domain] = true 95 } 96 97 for _, expected := range expectedDomains { 98 if !domainMap[expected] { 99 t.Errorf("Expected domain %s not found in supported domains", expected) 100 } 101 } 102 }) 103 }) 104 105 t.Run("parseRules", func(t *testing.T) { 106 parser := &ArticleParser{rules: make(map[string]*ParsingRule)} 107 108 t.Run("parses valid rule file", func(t *testing.T) { 109 content := `title: //h1 110author: //span[@class='author'] 111date: //time 112body: //article 113strip: //nav 114strip: //footer 115test_url: https://example.com/article` 116 117 rule, err := parser.parseRules("example.com", content) 118 if err != nil { 119 t.Fatalf("Expected no error, got %v", err) 120 } 121 122 if rule.Domain != "example.com" { 123 t.Errorf("Expected domain 'example.com', got %s", rule.Domain) 124 } 125 if rule.Title != "//h1" { 126 t.Errorf("Expected title '//h1', got %s", rule.Title) 127 } 128 if rule.Author != "//span[@class='author']" { 129 t.Errorf("Expected author '//span[@class='author']', got %s", rule.Author) 130 } 131 if len(rule.Strip) != 2 { 132 t.Errorf("Expected 2 strip rules, got %d", len(rule.Strip)) 133 } 134 if len(rule.TestURLs) != 1 { 135 t.Errorf("Expected 1 test URL, got %d", len(rule.TestURLs)) 136 } 137 }) 138 139 t.Run("handles empty lines and comments", func(t *testing.T) { 140 content := `# This is a comment 141title: //h1 142 143# Another comment 144body: //article 145` 146 147 rule, err := parser.parseRules("test.com", content) 148 if err != nil { 149 t.Fatalf("Expected no error, got %v", 
err) 150 } 151 152 if rule.Title != "//h1" { 153 t.Errorf("Expected title '//h1', got %s", rule.Title) 154 } 155 if rule.Body != "//article" { 156 t.Errorf("Expected body '//article', got %s", rule.Body) 157 } 158 }) 159 }) 160 161 t.Run("slugify", func(t *testing.T) { 162 parser := &ArticleParser{} 163 164 tc := []struct { 165 input string 166 expected string 167 }{ 168 {"Simple Title", "simple-title"}, 169 {"Title with Numbers 123", "title-with-numbers-123"}, 170 {"Title-with-Hyphens", "title-with-hyphens"}, 171 {"Title with Spaces and Multiple Spaces", "title-with-spaces-and-multiple-spaces"}, 172 {"Title!@#$%^&*()with Special Characters", "title-with-special-characters"}, 173 {"", ""}, 174 {strings.Repeat("a", 150), strings.Repeat("a", 100)}, 175 } 176 177 for _, tt := range tc { 178 t.Run(fmt.Sprintf("slugify '%s'", tt.input), func(t *testing.T) { 179 result := parser.slugify(tt.input) 180 if result != tt.expected { 181 t.Errorf("Expected '%s', got '%s'", tt.expected, result) 182 } 183 }) 184 } 185 }) 186 187 t.Run("Convert", func(t *testing.T) { 188 parser, err := NewArticleParser(http.DefaultClient) 189 if err != nil { 190 t.Fatalf("Failed to create parser: %v", err) 191 } 192 193 t.Run("fails with unsupported domain", func(t *testing.T) { 194 htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>" 195 _, err := parser.Convert(htmlContent, "unsupported.com", "https://unsupported.com/article") 196 197 if err == nil { 198 t.Error("Expected error for unsupported domain") 199 } 200 if !strings.Contains(err.Error(), "no parsing rule found") { 201 t.Errorf("Expected 'no parsing rule found' error, got %v", err) 202 } 203 }) 204 205 t.Run("fails with invalid HTML", func(t *testing.T) { 206 invalidHTML := "<html><head><title>Test</head></body>" 207 _, err := parser.Convert(invalidHTML, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test") 208 209 if err == nil { 210 t.Error("Expected error for invalid HTML") 211 } 212 }) 213 214 
t.Run("fails when no title extracted", func(t *testing.T) { 215 htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>" 216 _, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test") 217 218 if err == nil { 219 t.Error("Expected error when no title can be extracted") 220 } 221 if !strings.Contains(err.Error(), "could not extract title") && 222 !strings.Contains(err.Error(), "could not extract body content") { 223 t.Errorf("Expected title or body extraction error, got %v", err) 224 } 225 }) 226 227 t.Run("successfully converts valid Wikipedia HTML", func(t *testing.T) { 228 htmlContent := `<html> 229 <head><title>Test Article</title></head> 230 <body> 231 <h1 id="firstHeading">Test Article Title</h1> 232 <div id="bodyContent"> 233 <style>.mw-parser-output .hatnote{font-style:italic;}</style> 234 <p>This is the main content of the article.</p> 235 <div class="noprint">This should be stripped</div> 236 <div class="editsection">Edit this section</div> 237 <p>More content here.</p> 238 </div> 239 </body> 240 </html>` 241 242 markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test") 243 if err != nil { 244 t.Fatalf("Expected no error, got %v", err) 245 } 246 247 if !strings.Contains(markdown, "# Test Article Title") { 248 t.Error("Expected markdown to contain title") 249 } 250 if !strings.Contains(markdown, "**Source:** https://en.wikipedia.org/wiki/Test") { 251 t.Error("Expected markdown to contain source URL") 252 } 253 if !strings.Contains(markdown, "This is the main content") { 254 t.Error("Expected markdown to contain article content") 255 } 256 if strings.Contains(markdown, "This should be stripped") { 257 t.Error("Expected stripped content to be removed from markdown") 258 } 259 if strings.Contains(markdown, ".mw-parser-output") { 260 t.Error("Expected style content to be removed from markdown") 261 } 262 if strings.Contains(markdown, "Edit this 
section") { 263 t.Error("Expected edit section markers to be removed from markdown") 264 } 265 }) 266 267 t.Run("strips Wikipedia navigation boxes and metadata", func(t *testing.T) { 268 htmlContent := `<html> 269 <head><title>Test Navigation Article</title></head> 270 <body> 271 <h1 id="firstHeading">Test Navigation Article</h1> 272 <div id="bodyContent"> 273 <p>Main article content goes here.</p> 274 <h2>Section One<span class="mw-editsection">[edit]</span></h2> 275 <p>Section content.</p> 276 <table class="navbox" role="navigation"> 277 <tr><td>Navigation item 1</td></tr> 278 <tr><td>Navigation item 2</td></tr> 279 </table> 280 <div class="navbox"> 281 <p>Another navigation box</p> 282 </div> 283 <table class="vertical-navbox"> 284 <tr><td>Vertical nav item</td></tr> 285 </table> 286 <p>More article content.</p> 287 <div role="navigation"> 288 <p>Navigation content</p> 289 </div> 290 <div id="catlinks"> 291 <p>Categories: Test Category</p> 292 </div> 293 <div id="footer"> 294 <p>Retrieved from Wikipedia</p> 295 </div> 296 </div> 297 </body> 298 </html>` 299 300 markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test_Navigation") 301 if err != nil { 302 t.Fatalf("Expected no error, got %v", err) 303 } 304 305 if !strings.Contains(markdown, "Main article content") { 306 t.Error("Expected markdown to contain main article content") 307 } 308 if !strings.Contains(markdown, "Section content") { 309 t.Error("Expected markdown to contain section content") 310 } 311 if !strings.Contains(markdown, "More article content") { 312 t.Error("Expected markdown to contain additional content") 313 } 314 315 if strings.Contains(markdown, "Navigation item") { 316 t.Error("Expected navbox table content to be stripped") 317 } 318 if strings.Contains(markdown, "Another navigation box") { 319 t.Error("Expected navbox div content to be stripped") 320 } 321 if strings.Contains(markdown, "Vertical nav item") { 322 t.Error("Expected vertical-navbox 
content to be stripped") 323 } 324 if strings.Contains(markdown, "[edit]") { 325 t.Error("Expected edit section markers to be stripped") 326 } 327 if strings.Contains(markdown, "Navigation content") { 328 t.Error("Expected role=navigation content to be stripped") 329 } 330 if strings.Contains(markdown, "Categories:") { 331 t.Error("Expected category links to be stripped") 332 } 333 if strings.Contains(markdown, "Retrieved from") { 334 t.Error("Expected footer content to be stripped") 335 } 336 }) 337 }) 338 339 t.Run("ParseURL", func(t *testing.T) { 340 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 341 switch { 342 case strings.Contains(r.URL.Path, "404"): 343 w.WriteHeader(http.StatusNotFound) 344 case strings.Contains(r.URL.Path, "unsupported"): 345 w.WriteHeader(http.StatusOK) 346 w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>")) 347 default: 348 w.WriteHeader(http.StatusOK) 349 w.Write([]byte(`<html> 350 <head><title>Test Article</title></head> 351 <body> 352 <h1 id="firstHeading">Test Wikipedia Article</h1> 353 <div id="bodyContent"> 354 <p>This is the article content.</p> 355 <div class="noprint">This gets stripped</div> 356 </div> 357 </body> 358 </html>`)) 359 } 360 })) 361 defer server.Close() 362 363 parser, err := NewArticleParser(server.Client()) 364 if err != nil { 365 t.Fatalf("Failed to create parser: %v", err) 366 } 367 368 localhostRule := &ParsingRule{ 369 Domain: "127.0.0.1", 370 Title: "//h1[@id='firstHeading']", 371 Body: "//div[@id='bodyContent']", 372 Strip: []string{"//div[@class='noprint']"}, 373 } 374 parser.AddRule("127.0.0.1", localhostRule) 375 376 t.Run("fails with invalid URL", func(t *testing.T) { 377 _, err := parser.ParseURL("not-a-url") 378 if err == nil { 379 t.Error("Expected error for invalid URL") 380 } 381 if !strings.Contains(err.Error(), "unsupported protocol scheme") { 382 t.Errorf("Expected 'unsupported protocol scheme' error, got 
%v", err) 383 } 384 }) 385 386 t.Run("fails with unsupported domain", func(t *testing.T) { 387 _, err := parser.ParseURL(server.URL + "/unsupported.com") 388 if err == nil { 389 t.Error("Expected error for unsupported domain") 390 } 391 }) 392 393 t.Run("fails with HTTP error", func(t *testing.T) { 394 _, err := parser.ParseURL(server.URL + "/404/en.wikipedia.org/wiki/test") 395 if err == nil { 396 t.Error("Expected error for HTTP 404") 397 } 398 }) 399 400 }) 401 402 t.Run("SaveArticle", func(t *testing.T) { 403 parser := &ArticleParser{} 404 tempDir := t.TempDir() 405 406 content := &ParsedContent{ 407 Title: "Test Article", 408 Author: "Test Author", 409 Date: "2023-01-01", 410 Content: "This is test content.", 411 URL: "https://example.com/test", 412 } 413 414 t.Run("successfully saves article", func(t *testing.T) { 415 mdPath, htmlPath, err := parser.SaveArticle(content, tempDir) 416 if err != nil { 417 t.Fatalf("Expected no error, got %v", err) 418 } 419 420 if _, err := os.Stat(mdPath); os.IsNotExist(err) { 421 t.Error("Expected markdown file to exist") 422 } 423 if _, err := os.Stat(htmlPath); os.IsNotExist(err) { 424 t.Error("Expected HTML file to exist") 425 } 426 427 mdContent, err := os.ReadFile(mdPath) 428 if err != nil { 429 t.Fatalf("Failed to read markdown file: %v", err) 430 } 431 if !strings.Contains(string(mdContent), "# Test Article") { 432 t.Error("Expected markdown to contain title") 433 } 434 if !strings.Contains(string(mdContent), "**Author:** Test Author") { 435 t.Error("Expected markdown to contain author") 436 } 437 438 htmlContentBytes, err := os.ReadFile(htmlPath) 439 if err != nil { 440 t.Fatalf("Failed to read HTML file: %v", err) 441 } 442 if !strings.Contains(string(htmlContentBytes), "<title>Test Article</title>") { 443 t.Error("Expected HTML to contain title") 444 } 445 }) 446 447 t.Run("handles duplicate filenames", func(t *testing.T) { 448 mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir) 449 if err != nil { 450 
t.Fatalf("Expected no error for first save, got %v", err) 451 } 452 453 mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir) 454 if err != nil { 455 t.Fatalf("Expected no error for second save, got %v", err) 456 } 457 458 if mdPath1 == mdPath2 { 459 t.Error("Expected different markdown paths for duplicate saves") 460 } 461 if htmlPath1 == htmlPath2 { 462 t.Error("Expected different HTML paths for duplicate saves") 463 } 464 465 if _, err := os.Stat(mdPath1); os.IsNotExist(err) { 466 t.Error("Expected first markdown file to exist") 467 } 468 if _, err := os.Stat(mdPath2); os.IsNotExist(err) { 469 t.Error("Expected second markdown file to exist") 470 } 471 }) 472 473 t.Run("fails with invalid directory", func(t *testing.T) { 474 invalidDir := "/nonexistent/directory" 475 _, _, err := parser.SaveArticle(content, invalidDir) 476 if err == nil { 477 t.Error("Expected error for invalid directory") 478 } 479 }) 480 }) 481 482 t.Run("createHTML", func(t *testing.T) { 483 parser := &ArticleParser{} 484 content := &ParsedContent{ 485 Title: "Test HTML Article", 486 Author: "HTML Author", 487 Date: "2023-12-25", 488 Content: "This is **bold** content with *emphasis*.", 489 URL: "https://example.com/html-test", 490 } 491 492 t.Run("creates valid HTML", func(t *testing.T) { 493 markdown := parser.createMarkdown(content) 494 html := parser.createHTML(content, markdown) 495 496 if !strings.Contains(html, "<!DOCTYPE html>") { 497 t.Error("Expected HTML to contain DOCTYPE") 498 } 499 if !strings.Contains(html, "<title>Test HTML Article</title>") { 500 t.Error("Expected HTML to contain title") 501 } 502 if !strings.Contains(html, "<h1") || !strings.Contains(html, "Test HTML Article") { 503 t.Error("Expected HTML to contain h1 heading with title") 504 } 505 if !strings.Contains(html, "<strong>bold</strong>") { 506 t.Error("Expected HTML to contain bold formatting") 507 } 508 if !strings.Contains(html, "<em>emphasis</em>") { 509 t.Error("Expected HTML to contain emphasis 
formatting") 510 } 511 }) 512 }) 513 514 t.Run("createMarkdown", func(t *testing.T) { 515 parser := &ArticleParser{} 516 517 t.Run("creates markdown with all fields", func(t *testing.T) { 518 content := &ParsedContent{ 519 Title: "Full Content Article", 520 Author: "Complete Author", 521 Date: "2023-01-15", 522 Content: "Complete article content here.", 523 URL: "https://example.com/full", 524 } 525 526 markdown := parser.createMarkdown(content) 527 528 if !strings.Contains(markdown, "# Full Content Article") { 529 t.Error("Expected markdown to contain title") 530 } 531 if !strings.Contains(markdown, "**Author:** Complete Author") { 532 t.Error("Expected markdown to contain author") 533 } 534 if !strings.Contains(markdown, "**Date:** 2023-01-15") { 535 t.Error("Expected markdown to contain date") 536 } 537 if !strings.Contains(markdown, "**Source:** https://example.com/full") { 538 t.Error("Expected markdown to contain source URL") 539 } 540 if !strings.Contains(markdown, "**Saved:**") { 541 t.Error("Expected markdown to contain saved timestamp") 542 } 543 if !strings.Contains(markdown, "---") { 544 t.Error("Expected markdown to contain separator") 545 } 546 if !strings.Contains(markdown, "Complete article content here.") { 547 t.Error("Expected markdown to contain article content") 548 } 549 }) 550 551 t.Run("creates markdown with minimal fields", func(t *testing.T) { 552 content := &ParsedContent{ 553 Title: "Minimal Article", 554 Content: "Just content.", 555 URL: "https://example.com/minimal", 556 } 557 558 markdown := parser.createMarkdown(content) 559 560 if !strings.Contains(markdown, "# Minimal Article") { 561 t.Error("Expected markdown to contain title") 562 } 563 if strings.Contains(markdown, "**Author:**") { 564 t.Error("Expected no author field for empty author") 565 } 566 if strings.Contains(markdown, "**Date:**") { 567 t.Error("Expected no date field for empty date") 568 } 569 if !strings.Contains(markdown, "**Source:** https://example.com/minimal") { 
570 t.Error("Expected markdown to contain source URL") 571 } 572 }) 573 }) 574} 575 576func TestCreateArticleFromURL(t *testing.T) { 577 tempDir := t.TempDir() 578 579 t.Run("fails with invalid URL", func(t *testing.T) { 580 _, err := CreateArticleFromURL("not-a-url", tempDir) 581 if err == nil { 582 t.Error("Expected error for invalid URL") 583 } 584 if !strings.Contains(err.Error(), "invalid URL") && !strings.Contains(err.Error(), "failed to parse URL") { 585 t.Errorf("Expected URL parsing error, got %v", err) 586 } 587 }) 588 589 t.Run("fails with empty URL", func(t *testing.T) { 590 _, err := CreateArticleFromURL("", tempDir) 591 if err == nil { 592 t.Error("Expected error for empty URL") 593 } 594 }) 595 596 t.Run("fails with unsupported domain", func(t *testing.T) { 597 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 598 w.WriteHeader(http.StatusOK) 599 w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>")) 600 })) 601 defer server.Close() 602 603 _, err := CreateArticleFromURL(server.URL, tempDir) 604 if err == nil { 605 t.Error("Expected error for unsupported domain") 606 } 607 if !strings.Contains(err.Error(), "no parsing rule found") { 608 t.Errorf("Expected 'no parsing rule found' error, got %v", err) 609 } 610 }) 611 612 t.Run("fails with HTTP error", func(t *testing.T) { 613 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 614 w.WriteHeader(http.StatusNotFound) 615 })) 616 defer server.Close() 617 618 _, err := CreateArticleFromURL("https://en.wikipedia.org/wiki/NonExistentPage12345", tempDir) 619 if err == nil { 620 t.Error("Expected error for HTTP 404") 621 } 622 if !strings.Contains(err.Error(), "HTTP error") && !strings.Contains(err.Error(), "404") { 623 t.Errorf("Expected HTTP error, got %v", err) 624 } 625 }) 626 627 t.Run("fails with network error", func(t *testing.T) { 628 _, err := 
CreateArticleFromURL("http://localhost:99999/test", tempDir) 629 if err == nil { 630 t.Error("Expected error for network failure") 631 } 632 if !strings.Contains(err.Error(), "failed to fetch URL") && !strings.Contains(err.Error(), "connection refused") { 633 t.Errorf("Expected network error, got %v", err) 634 } 635 }) 636 637 t.Run("fails with malformed HTML", func(t *testing.T) { 638 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 639 w.WriteHeader(http.StatusOK) 640 w.Write([]byte("<html><head><title>Test</head></body>")) 641 })) 642 defer server.Close() 643 644 parser, err := NewArticleParser(server.Client()) 645 if err != nil { 646 t.Fatalf("Failed to create parser: %v", err) 647 } 648 649 localhostRule := &ParsingRule{ 650 Domain: "127.0.0.1", 651 Title: "//h1[@id='firstHeading']", 652 Body: "//div[@id='bodyContent']", 653 Strip: []string{"//div[@class='noprint']"}, 654 } 655 parser.AddRule("127.0.0.1", localhostRule) 656 657 _, err = parser.ParseURL(server.URL) 658 if err == nil { 659 t.Error("Expected error for malformed HTML") 660 } 661 if !strings.Contains(err.Error(), "failed to parse HTML") && 662 !strings.Contains(err.Error(), "could not extract title") && 663 !strings.Contains(err.Error(), "could not extract body content") { 664 t.Errorf("Expected HTML parsing or extraction error, got %v", err) 665 } 666 }) 667 668 t.Run("fails when no title can be extracted", func(t *testing.T) { 669 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 670 w.WriteHeader(http.StatusOK) 671 w.Write([]byte(`<html> 672 <head><title>Test</title></head> 673 <body> 674 <div id="bodyContent"> 675 <p>Content without proper title</p> 676 </div> 677 </body> 678 </html>`)) 679 })) 680 defer server.Close() 681 682 parser, err := NewArticleParser(server.Client()) 683 if err != nil { 684 t.Fatalf("Failed to create parser: %v", err) 685 } 686 687 localhostRule := &ParsingRule{ 688 Domain: 
"127.0.0.1", 689 Title: "//h1[@id='firstHeading']", 690 Body: "//div[@id='bodyContent']", 691 Strip: []string{"//div[@class='noprint']"}, 692 } 693 parser.AddRule("127.0.0.1", localhostRule) 694 695 _, err = parser.ParseURL(server.URL) 696 if err == nil { 697 t.Error("Expected error when no title can be extracted") 698 } 699 if !strings.Contains(err.Error(), "could not extract title") { 700 t.Errorf("Expected 'could not extract title' error, got %v", err) 701 } 702 }) 703 704 t.Run("successfully creates article structure from parsed content", func(t *testing.T) { 705 wikipediaHTML := `<html> 706 <head><title>Integration Test Article</title></head> 707 <body> 708 <h1 id="firstHeading">Integration Test Article</h1> 709 <div id="bodyContent"> 710 <p>This is integration test content.</p> 711 <div class="noprint">This should be stripped</div> 712 <p>More content here.</p> 713 </div> 714 </body> 715 </html>` 716 717 server := newServerWithHtml(wikipediaHTML) 718 defer server.Close() 719 720 parser, err := NewArticleParser(server.Client()) 721 if err != nil { 722 t.Fatalf("Failed to create parser: %v", err) 723 } 724 725 localhostRule := &ParsingRule{ 726 Domain: "127.0.0.1", 727 Title: "//h1[@id='firstHeading']", 728 Body: "//div[@id='bodyContent']", 729 Strip: []string{"//div[@class='noprint']"}, 730 } 731 parser.AddRule("127.0.0.1", localhostRule) 732 733 content, err := parser.ParseURL(server.URL) 734 if err != nil { 735 t.Fatalf("Expected no error, got %v", err) 736 } 737 738 mdPath, htmlPath, err := parser.SaveArticle(content, tempDir) 739 if err != nil { 740 t.Fatalf("Failed to save article: %v", err) 741 } 742 743 article := &models.Article{ 744 URL: server.URL, 745 Title: content.Title, 746 MarkdownPath: mdPath, 747 HTMLPath: htmlPath, 748 Created: time.Now(), 749 Modified: time.Now(), 750 } 751 752 if article.Title != "Integration Test Article" { 753 t.Errorf("Expected title 'Integration Test Article', got %s", article.Title) 754 } 755 if article.URL != 
server.URL { 756 t.Errorf("Expected URL %s, got %s", server.URL, article.URL) 757 } 758 if article.MarkdownPath == "" { 759 t.Error("Expected non-empty markdown path") 760 } 761 if article.HTMLPath == "" { 762 t.Error("Expected non-empty HTML path") 763 } 764 if article.Created.IsZero() { 765 t.Error("Expected Created timestamp to be set") 766 } 767 if article.Modified.IsZero() { 768 t.Error("Expected Modified timestamp to be set") 769 } 770 771 if _, err := os.Stat(article.MarkdownPath); os.IsNotExist(err) { 772 t.Error("Expected markdown file to exist") 773 } 774 if _, err := os.Stat(article.HTMLPath); os.IsNotExist(err) { 775 t.Error("Expected HTML file to exist") 776 } 777 778 mdContent, err := os.ReadFile(article.MarkdownPath) 779 if err != nil { 780 t.Fatalf("Failed to read markdown file: %v", err) 781 } 782 if !strings.Contains(string(mdContent), "# Integration Test Article") { 783 t.Error("Expected markdown to contain title") 784 } 785 if !strings.Contains(string(mdContent), "This is integration test content") { 786 t.Error("Expected markdown to contain article content") 787 } 788 if strings.Contains(string(mdContent), "This should be stripped") { 789 t.Error("Expected stripped content to be removed from markdown") 790 } 791 792 htmlContent, err := os.ReadFile(article.HTMLPath) 793 if err != nil { 794 t.Fatalf("Failed to read HTML file: %v", err) 795 } 796 if !strings.Contains(string(htmlContent), "<title>Integration Test Article</title>") { 797 t.Error("Expected HTML to contain title") 798 } 799 if !strings.Contains(string(htmlContent), "<!DOCTYPE html>") { 800 t.Error("Expected HTML to contain DOCTYPE") 801 } 802 }) 803 804 t.Run("successfully handles article with metadata", func(t *testing.T) { 805 contentHTML := `<html> 806 <head> 807 <title>Test Paper</title> 808 <meta name="citation_author" content="Dr. 
Test Author"> 809 <meta name="citation_date" content="2024-01-01"> 810 </head> 811 <body> 812 <h1 class="title">Test Research Paper</h1> 813 <blockquote class="abstract"> 814 <p>This is the abstract of the research paper.</p> 815 <p>It contains important research findings.</p> 816 </blockquote> 817 </body> 818 </html>` 819 820 server := newServerWithHtml(contentHTML) 821 defer server.Close() 822 823 parser, err := NewArticleParser(server.Client()) 824 if err != nil { 825 t.Fatalf("Failed to create parser: %v", err) 826 } 827 828 localhostRule := &ParsingRule{ 829 Domain: "127.0.0.1", 830 Title: "//h1[contains(concat(' ',normalize-space(@class),' '),' title ')]", 831 Body: "//blockquote[contains(concat(' ',normalize-space(@class),' '),' abstract ')]", 832 Date: "//meta[@name='citation_date']/@content", 833 Author: "//meta[@name='citation_author']/@content", 834 } 835 parser.AddRule("127.0.0.1", localhostRule) 836 837 content, err := parser.ParseURL(server.URL) 838 if err != nil { 839 t.Fatalf("Expected no error, got %v", err) 840 } 841 842 if content.Title != "Test Research Paper" { 843 t.Errorf("Expected title 'Test Research Paper', got %s", content.Title) 844 } 845 if content.Author != "Dr. Test Author" { 846 t.Errorf("Expected author 'Dr. Test Author', got %s", content.Author) 847 } 848 if content.Date != "2024-01-01" { 849 t.Errorf("Expected date '2024-01-01', got %s", content.Date) 850 } 851 852 mdPath, _, err := parser.SaveArticle(content, tempDir) 853 if err != nil { 854 t.Fatalf("Failed to save article: %v", err) 855 } 856 857 mdContent, err := os.ReadFile(mdPath) 858 if err != nil { 859 t.Fatalf("Failed to read markdown file: %v", err) 860 } 861 if !strings.Contains(string(mdContent), "**Author:** Dr. 
Test Author") { 862 t.Error("Expected markdown to contain author") 863 } 864 if !strings.Contains(string(mdContent), "**Date:** 2024-01-01") { 865 t.Error("Expected markdown to contain date") 866 } 867 868 article := &models.Article{ 869 Author: content.Author, 870 Date: content.Date, 871 } 872 873 if article.Author != "Dr. Test Author" { 874 t.Errorf("Expected article author 'Dr. Test Author', got %s", article.Author) 875 } 876 if article.Date != "2024-01-01" { 877 t.Errorf("Expected article date '2024-01-01', got %s", article.Date) 878 } 879 }) 880}