cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍈
charm leaflet readability golang
29
fork

Configure Feed

Select the types of activity you want to include in your feed.

at 2ddf61b354c2cd3b6697f74c0c27e06f4b511fba 1048 lines 33 kB view raw
1package articles 2 3import ( 4 "errors" 5 "fmt" 6 "io" 7 "net/http" 8 "os" 9 "strings" 10 "testing" 11 "time" 12 13 "github.com/stormlightlabs/noteleaf/internal/models" 14) 15 16// ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules. 17func ExampleParser_Convert() { 18 parser, err := NewArticleParser(http.DefaultClient) 19 if err != nil { 20 fmt.Printf("Failed to create parser: %v\n", err) 21 return 22 } 23 24 htmlPath := "examples/christopher-lloyd.html" 25 htmlContent, err := os.ReadFile(htmlPath) 26 if err != nil { 27 fmt.Printf("Local HTML file not found: %v\n", err) 28 return 29 } 30 31 markdown, err := parser.Convert(string(htmlContent), ".wikipedia.org", "https://en.wikipedia.org/wiki/Christopher_Lloyd") 32 if err != nil { 33 fmt.Printf("Failed to convert HTML: %v\n", err) 34 return 35 } 36 37 parts := strings.Split(markdown, "\n---\n") 38 if len(parts) > 0 { 39 frontmatter := strings.TrimSpace(parts[0]) 40 lines := strings.Split(frontmatter, "\n") 41 42 for i, line := range lines { 43 if i >= 4 { 44 break 45 } 46 47 if !strings.Contains(line, "**Saved:**") { 48 fmt.Println(line) 49 } 50 } 51 } 52 53 // Output: # Christopher Lloyd 54 // 55 // **Author:** Contributors to Wikimedia projects 56} 57 58func TestArticleParser(t *testing.T) { 59 t.Run("New", func(t *testing.T) { 60 t.Run("successfully creates parser", func(t *testing.T) { 61 parser, err := NewArticleParser(http.DefaultClient) 62 if err != nil { 63 t.Fatalf("Expected no error, got %v", err) 64 } 65 if parser == nil { 66 t.Fatal("Expected parser to be created, got nil") 67 } 68 if len(parser.rules) == 0 { 69 t.Error("Expected rules to be loaded") 70 } 71 }) 72 73 t.Run("loads expected domains", func(t *testing.T) { 74 parser, err := NewArticleParser(http.DefaultClient) 75 if err != nil { 76 t.Fatalf("Failed to create parser: %v", err) 77 } 78 79 domains := parser.GetSupportedDomains() 80 expectedDomains := []string{".wikipedia.org", "arxiv.org", 
"baseballprospectus.com"} 81 82 if len(domains) != len(expectedDomains) { 83 t.Errorf("Expected %d domains, got %d", len(expectedDomains), len(domains)) 84 } 85 86 domainMap := make(map[string]bool) 87 for _, domain := range domains { 88 domainMap[domain] = true 89 } 90 91 for _, expected := range expectedDomains { 92 if !domainMap[expected] { 93 t.Errorf("Expected domain %s not found in supported domains", expected) 94 } 95 } 96 }) 97 }) 98 99 t.Run("parseRules", func(t *testing.T) { 100 parser := &ArticleParser{rules: make(map[string]*ParsingRule)} 101 102 t.Run("parses valid rule file", func(t *testing.T) { 103 content := `title: //h1 104author: //span[@class='author'] 105date: //time 106body: //article 107strip: //nav 108strip: //footer 109test_url: https://example.com/article` 110 111 rule, err := parser.parseRules("example.com", content) 112 if err != nil { 113 t.Fatalf("Expected no error, got %v", err) 114 } 115 116 if rule.Domain != "example.com" { 117 t.Errorf("Expected domain 'example.com', got %s", rule.Domain) 118 } 119 if rule.Title != "//h1" { 120 t.Errorf("Expected title '//h1', got %s", rule.Title) 121 } 122 if rule.Author != "//span[@class='author']" { 123 t.Errorf("Expected author '//span[@class='author']', got %s", rule.Author) 124 } 125 if len(rule.Strip) != 2 { 126 t.Errorf("Expected 2 strip rules, got %d", len(rule.Strip)) 127 } 128 if len(rule.TestURLs) != 1 { 129 t.Errorf("Expected 1 test URL, got %d", len(rule.TestURLs)) 130 } 131 }) 132 133 t.Run("handles empty lines and comments", func(t *testing.T) { 134 content := `# This is a comment 135title: //h1 136 137# Another comment 138body: //article 139` 140 141 rule, err := parser.parseRules("test.com", content) 142 if err != nil { 143 t.Fatalf("Expected no error, got %v", err) 144 } 145 146 if rule.Title != "//h1" { 147 t.Errorf("Expected title '//h1', got %s", rule.Title) 148 } 149 if rule.Body != "//article" { 150 t.Errorf("Expected body '//article', got %s", rule.Body) 151 } 152 }) 153 }) 
154 155 t.Run("slugify", func(t *testing.T) { 156 parser := &ArticleParser{} 157 158 tc := []struct { 159 input string 160 expected string 161 }{ 162 {"Simple Title", "simple-title"}, 163 {"Title with Numbers 123", "title-with-numbers-123"}, 164 {"Title-with-Hyphens", "title-with-hyphens"}, 165 {"Title with Spaces and Multiple Spaces", "title-with-spaces-and-multiple-spaces"}, 166 {"Title!@#$%^&*()with Special Characters", "title-with-special-characters"}, 167 {"", ""}, 168 {strings.Repeat("a", 150), strings.Repeat("a", 100)}, 169 } 170 171 for _, tt := range tc { 172 t.Run(fmt.Sprintf("slugify '%s'", tt.input), func(t *testing.T) { 173 result := parser.slugify(tt.input) 174 if result != tt.expected { 175 t.Errorf("Expected '%s', got '%s'", tt.expected, result) 176 } 177 }) 178 } 179 }) 180 181 t.Run("Convert", func(t *testing.T) { 182 parser, err := NewArticleParser(http.DefaultClient) 183 if err != nil { 184 t.Fatalf("Failed to create parser: %v", err) 185 } 186 187 t.Run("fails with unsupported domain", func(t *testing.T) { 188 htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>" 189 _, err := parser.Convert(htmlContent, "unsupported.com", "https://unsupported.com/article") 190 191 if err == nil { 192 t.Error("Expected error for unsupported domain") 193 } 194 195 if !strings.Contains(err.Error(), "confidence too low") && 196 !strings.Contains(err.Error(), "could not extract title") { 197 t.Errorf("Expected heuristic extraction error, got %v", err) 198 } 199 }) 200 201 t.Run("fails with invalid HTML", func(t *testing.T) { 202 invalidHTML := "<html><head><title>Test</head></body>" 203 _, err := parser.Convert(invalidHTML, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test") 204 205 if err == nil { 206 t.Error("Expected error for invalid HTML") 207 } 208 }) 209 210 t.Run("fails when no title extracted", func(t *testing.T) { 211 htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>" 212 _, 
err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test") 213 214 if err == nil { 215 t.Error("Expected error when no title can be extracted") 216 } 217 218 if !strings.Contains(err.Error(), "could not extract title") && 219 !strings.Contains(err.Error(), "could not extract body content") && 220 !strings.Contains(err.Error(), "confidence too low") { 221 t.Errorf("Expected title, body, or confidence error, got %v", err) 222 } 223 }) 224 225 t.Run("successfully converts valid Wikipedia HTML", func(t *testing.T) { 226 htmlContent := `<html> 227 <head><title>Test Article</title></head> 228 <body> 229 <h1 id="firstHeading">Test Article Title</h1> 230 <div id="bodyContent"> 231 <style>.mw-parser-output .hatnote{font-style:italic;}</style> 232 <p>This is the main content of the article.</p> 233 <div class="noprint">This should be stripped</div> 234 <div class="editsection">Edit this section</div> 235 <p>More content here.</p> 236 </div> 237 </body> 238 </html>` 239 240 markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test") 241 if err != nil { 242 t.Fatalf("Expected no error, got %v", err) 243 } 244 245 if !strings.Contains(markdown, "# Test Article Title") { 246 t.Error("Expected markdown to contain title") 247 } 248 if !strings.Contains(markdown, "**Source:** https://en.wikipedia.org/wiki/Test") { 249 t.Error("Expected markdown to contain source URL") 250 } 251 if !strings.Contains(markdown, "This is the main content") { 252 t.Error("Expected markdown to contain article content") 253 } 254 if strings.Contains(markdown, "This should be stripped") { 255 t.Error("Expected stripped content to be removed from markdown") 256 } 257 if strings.Contains(markdown, ".mw-parser-output") { 258 t.Error("Expected style content to be removed from markdown") 259 } 260 if strings.Contains(markdown, "Edit this section") { 261 t.Error("Expected edit section markers to be removed from markdown") 262 } 263 }) 264 
265 t.Run("strips Wikipedia navigation boxes and metadata", func(t *testing.T) { 266 htmlContent := `<html> 267 <head><title>Test Navigation Article</title></head> 268 <body> 269 <h1 id="firstHeading">Test Navigation Article</h1> 270 <div id="bodyContent"> 271 <p>Main article content goes here.</p> 272 <h2>Section One<span class="mw-editsection">[edit]</span></h2> 273 <p>Section content.</p> 274 <table class="navbox" role="navigation"> 275 <tr><td>Navigation item 1</td></tr> 276 <tr><td>Navigation item 2</td></tr> 277 </table> 278 <div class="navbox"> 279 <p>Another navigation box</p> 280 </div> 281 <table class="vertical-navbox"> 282 <tr><td>Vertical nav item</td></tr> 283 </table> 284 <p>More article content.</p> 285 <div role="navigation"> 286 <p>Navigation content</p> 287 </div> 288 <div id="catlinks"> 289 <p>Categories: Test Category</p> 290 </div> 291 <div id="footer"> 292 <p>Retrieved from Wikipedia</p> 293 </div> 294 </div> 295 </body> 296 </html>` 297 298 markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test_Navigation") 299 if err != nil { 300 t.Fatalf("Expected no error, got %v", err) 301 } 302 303 if !strings.Contains(markdown, "Main article content") { 304 t.Error("Expected markdown to contain main article content") 305 } 306 if !strings.Contains(markdown, "Section content") { 307 t.Error("Expected markdown to contain section content") 308 } 309 if !strings.Contains(markdown, "More article content") { 310 t.Error("Expected markdown to contain additional content") 311 } 312 313 if strings.Contains(markdown, "Navigation item") { 314 t.Error("Expected navbox table content to be stripped") 315 } 316 if strings.Contains(markdown, "Another navigation box") { 317 t.Error("Expected navbox div content to be stripped") 318 } 319 if strings.Contains(markdown, "Vertical nav item") { 320 t.Error("Expected vertical-navbox content to be stripped") 321 } 322 if strings.Contains(markdown, "[edit]") { 323 t.Error("Expected 
edit section markers to be stripped") 324 } 325 if strings.Contains(markdown, "Navigation content") { 326 t.Error("Expected role=navigation content to be stripped") 327 } 328 if strings.Contains(markdown, "Categories:") { 329 t.Error("Expected category links to be stripped") 330 } 331 if strings.Contains(markdown, "Retrieved from") { 332 t.Error("Expected footer content to be stripped") 333 } 334 }) 335 336 t.Run("uses heuristic extraction for unsupported domain with semantic HTML", func(t *testing.T) { 337 htmlContent := `<html><head> 338 <title>Heuristic Test Article</title> 339 <meta property="og:author" content="Heuristic Author"> 340 <meta property="article:published_time" content="2025-01-15"> 341 </head><body> 342 <article> 343 <p>This is a substantial article that should be extracted using heuristic methods.</p> 344 <p>It contains multiple paragraphs with sufficient content for the readability algorithm.</p> 345 <p>The heuristic extractor should successfully identify this as main content.</p> 346 </article> 347 </body></html>` 348 349 markdown, err := parser.Convert(htmlContent, "unsupported-domain.com", "https://unsupported-domain.com/article") 350 351 if err == nil { 352 if !strings.Contains(markdown, "substantial article") { 353 t.Error("Expected markdown to contain extracted content") 354 } 355 } 356 }) 357 358 t.Run("includes confidence score in parsed content", func(t *testing.T) { 359 htmlContent := `<html> 360 <head><title>Confidence Test</title></head> 361 <body> 362 <h1 id="firstHeading">Confidence Test Article</h1> 363 <div id="bodyContent"> 364 <p>Article content for confidence testing.</p> 365 </div> 366 </body> 367 </html>` 368 369 content, err := parser.Parse(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Confidence") 370 if err != nil { 371 t.Fatalf("Expected no error, got %v", err) 372 } 373 374 if content.Confidence == 0.0 { 375 t.Error("Expected non-zero confidence score") 376 } 377 378 if content.ExtractionMethod == "" { 
379 t.Error("Expected extraction method to be set") 380 } 381 }) 382 383 t.Run("falls back to metadata extractor when XPath fails", func(t *testing.T) { 384 htmlContent := `<html><head> 385 <title>Metadata Fallback Test</title> 386 <meta property="og:author" content="Metadata Author"> 387 <meta property="article:published_time" content="2025-01-20"> 388 </head><body> 389 <h1 id="firstHeading">Fallback Test</h1> 390 <div id="bodyContent"> 391 <p>Content without author or date in XPath locations.</p> 392 </div> 393 </body></html>` 394 395 content, err := parser.Parse(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Metadata_Test") 396 if err != nil { 397 t.Fatalf("Expected no error, got %v", err) 398 } 399 400 if content.Author != "Metadata Author" { 401 t.Errorf("Expected metadata fallback for author, got %q", content.Author) 402 } 403 404 if content.Date != "2025-01-20" { 405 t.Errorf("Expected metadata fallback for date, got %q", content.Date) 406 } 407 }) 408 }) 409 410 t.Run("ParseURL", func(t *testing.T) { 411 parser, err := NewArticleParser(http.DefaultClient) 412 if err != nil { 413 t.Fatalf("Failed to create parser: %v", err) 414 } 415 416 localhostRule := &ParsingRule{ 417 Domain: "example.com", 418 Title: "//h1[@id='firstHeading']", 419 Body: "//div[@id='bodyContent']", 420 Strip: []string{"//div[@class='noprint']"}, 421 } 422 parser.AddRule("example.com", localhostRule) 423 424 const ( 425 validURL = "https://example.com/wiki/test" 426 httpErrorURL = "https://example.com/wiki/404" 427 unsupportedURL = "https://unsupported-domain.test/article" 428 ) 429 430 parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) { 431 switch req.URL.String() { 432 case validURL: 433 return htmlResponse(http.StatusOK, `<html> 434 <head><title>Test Article</title></head> 435 <body> 436 <h1 id="firstHeading">Test Wikipedia Article</h1> 437 <div id="bodyContent"> 438 <p>This is the article content.</p> 439 <div 
class="noprint">This gets stripped</div> 440 </div> 441 </body> 442 </html>`), nil 443 case httpErrorURL: 444 return &http.Response{ 445 StatusCode: http.StatusNotFound, 446 Header: make(http.Header), 447 Body: io.NopCloser(strings.NewReader("")), 448 }, nil 449 case unsupportedURL: 450 return htmlResponse(http.StatusOK, `<html><head><title>Unsupported</title></head><body><p>Content</p></body></html>`), nil 451 default: 452 return nil, fmt.Errorf("unexpected request: %s", req.URL.String()) 453 } 454 })) 455 456 t.Run("fails with invalid URL", func(t *testing.T) { 457 _, err := parser.ParseURL("not-a-url") 458 if err == nil { 459 t.Error("Expected error for invalid URL") 460 } 461 if !strings.Contains(err.Error(), "unsupported protocol scheme") && 462 !strings.Contains(err.Error(), "failed to fetch URL") && 463 !strings.Contains(err.Error(), "invalid URL") { 464 t.Errorf("Expected URL scheme error, got %v", err) 465 } 466 }) 467 468 t.Run("fails with unsupported domain", func(t *testing.T) { 469 _, err := parser.ParseURL(unsupportedURL) 470 if err == nil { 471 t.Error("Expected error for unsupported domain") 472 } 473 }) 474 475 t.Run("fails with HTTP error", func(t *testing.T) { 476 _, err := parser.ParseURL(httpErrorURL) 477 if err == nil { 478 t.Error("Expected error for HTTP 404") 479 } 480 }) 481 482 t.Run("successfully parses supported domain", func(t *testing.T) { 483 content, err := parser.ParseURL(validURL) 484 if err != nil { 485 t.Fatalf("Expected no error, got %v", err) 486 } 487 if content == nil { 488 t.Fatal("Expected parsed content, got nil") 489 } 490 if content.Title != "Test Wikipedia Article" { 491 t.Errorf("Expected title to be extracted, got %q", content.Title) 492 } 493 if !strings.Contains(content.Content, "This is the article content.") { 494 t.Errorf("Expected content to include article text, got %q", content.Content) 495 } 496 if strings.Contains(content.Content, "This gets stripped") { 497 t.Error("Expected strip rules to remove 
non-content nodes") 498 } 499 }) 500 501 }) 502 503 t.Run("SaveArticle", func(t *testing.T) { 504 parser := &ArticleParser{} 505 tempDir := t.TempDir() 506 507 content := &ParsedContent{ 508 Title: "Test Article", 509 Author: "Test Author", 510 Date: "2023-01-01", 511 Content: "This is test content.", 512 URL: "https://example.com/test", 513 } 514 515 t.Run("successfully saves article", func(t *testing.T) { 516 mdPath, htmlPath, err := parser.SaveArticle(content, tempDir) 517 if err != nil { 518 t.Fatalf("Expected no error, got %v", err) 519 } 520 521 if _, err := os.Stat(mdPath); os.IsNotExist(err) { 522 t.Error("Expected markdown file to exist") 523 } 524 if _, err := os.Stat(htmlPath); os.IsNotExist(err) { 525 t.Error("Expected HTML file to exist") 526 } 527 528 mdContent, err := os.ReadFile(mdPath) 529 if err != nil { 530 t.Fatalf("Failed to read markdown file: %v", err) 531 } 532 if !strings.Contains(string(mdContent), "# Test Article") { 533 t.Error("Expected markdown to contain title") 534 } 535 if !strings.Contains(string(mdContent), "**Author:** Test Author") { 536 t.Error("Expected markdown to contain author") 537 } 538 539 htmlContentBytes, err := os.ReadFile(htmlPath) 540 if err != nil { 541 t.Fatalf("Failed to read HTML file: %v", err) 542 } 543 if !strings.Contains(string(htmlContentBytes), "<title>Test Article</title>") { 544 t.Error("Expected HTML to contain title") 545 } 546 }) 547 548 t.Run("handles duplicate filenames", func(t *testing.T) { 549 mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir) 550 if err != nil { 551 t.Fatalf("Expected no error for first save, got %v", err) 552 } 553 554 mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir) 555 if err != nil { 556 t.Fatalf("Expected no error for second save, got %v", err) 557 } 558 559 if mdPath1 == mdPath2 { 560 t.Error("Expected different markdown paths for duplicate saves") 561 } 562 if htmlPath1 == htmlPath2 { 563 t.Error("Expected different HTML paths for duplicate 
saves") 564 } 565 566 if _, err := os.Stat(mdPath1); os.IsNotExist(err) { 567 t.Error("Expected first markdown file to exist") 568 } 569 if _, err := os.Stat(mdPath2); os.IsNotExist(err) { 570 t.Error("Expected second markdown file to exist") 571 } 572 }) 573 574 t.Run("fails with invalid directory", func(t *testing.T) { 575 invalidDir := "/nonexistent/directory" 576 _, _, err := parser.SaveArticle(content, invalidDir) 577 if err == nil { 578 t.Error("Expected error for invalid directory") 579 } 580 }) 581 }) 582 583 t.Run("createHTML", func(t *testing.T) { 584 parser := &ArticleParser{} 585 content := &ParsedContent{ 586 Title: "Test HTML Article", 587 Author: "HTML Author", 588 Date: "2023-12-25", 589 Content: "This is **bold** content with *emphasis*.", 590 URL: "https://example.com/html-test", 591 } 592 593 t.Run("creates valid HTML", func(t *testing.T) { 594 markdown := parser.createMarkdown(content) 595 html := parser.createHTML(content, markdown) 596 597 if !strings.Contains(html, "<!DOCTYPE html>") { 598 t.Error("Expected HTML to contain DOCTYPE") 599 } 600 if !strings.Contains(html, "<title>Test HTML Article</title>") { 601 t.Error("Expected HTML to contain title") 602 } 603 if !strings.Contains(html, "<h1") || !strings.Contains(html, "Test HTML Article") { 604 t.Error("Expected HTML to contain h1 heading with title") 605 } 606 if !strings.Contains(html, "<strong>bold</strong>") { 607 t.Error("Expected HTML to contain bold formatting") 608 } 609 if !strings.Contains(html, "<em>emphasis</em>") { 610 t.Error("Expected HTML to contain emphasis formatting") 611 } 612 }) 613 }) 614 615 t.Run("createMarkdown", func(t *testing.T) { 616 parser := &ArticleParser{} 617 618 t.Run("creates markdown with all fields", func(t *testing.T) { 619 content := &ParsedContent{ 620 Title: "Full Content Article", 621 Author: "Complete Author", 622 Date: "2023-01-15", 623 Content: "Complete article content here.", 624 URL: "https://example.com/full", 625 } 626 627 markdown := 
parser.createMarkdown(content) 628 629 if !strings.Contains(markdown, "# Full Content Article") { 630 t.Error("Expected markdown to contain title") 631 } 632 if !strings.Contains(markdown, "**Author:** Complete Author") { 633 t.Error("Expected markdown to contain author") 634 } 635 if !strings.Contains(markdown, "**Date:** 2023-01-15") { 636 t.Error("Expected markdown to contain date") 637 } 638 if !strings.Contains(markdown, "**Source:** https://example.com/full") { 639 t.Error("Expected markdown to contain source URL") 640 } 641 if !strings.Contains(markdown, "**Saved:**") { 642 t.Error("Expected markdown to contain saved timestamp") 643 } 644 if !strings.Contains(markdown, "---") { 645 t.Error("Expected markdown to contain separator") 646 } 647 if !strings.Contains(markdown, "Complete article content here.") { 648 t.Error("Expected markdown to contain article content") 649 } 650 }) 651 652 t.Run("creates markdown with minimal fields", func(t *testing.T) { 653 content := &ParsedContent{ 654 Title: "Minimal Article", 655 Content: "Just content.", 656 URL: "https://example.com/minimal", 657 } 658 659 markdown := parser.createMarkdown(content) 660 661 if !strings.Contains(markdown, "# Minimal Article") { 662 t.Error("Expected markdown to contain title") 663 } 664 if strings.Contains(markdown, "**Author:**") { 665 t.Error("Expected no author field for empty author") 666 } 667 if strings.Contains(markdown, "**Date:**") { 668 t.Error("Expected no date field for empty date") 669 } 670 if !strings.Contains(markdown, "**Source:** https://example.com/minimal") { 671 t.Error("Expected markdown to contain source URL") 672 } 673 }) 674 }) 675} 676 677func TestCreateArticleFromURL(t *testing.T) { 678 tempDir := t.TempDir() 679 680 t.Run("fails with invalid URL", func(t *testing.T) { 681 _, err := CreateArticleFromURL("not-a-url", tempDir) 682 if err == nil { 683 t.Error("Expected error for invalid URL") 684 } 685 if !strings.Contains(err.Error(), "invalid URL") && 
!strings.Contains(err.Error(), "failed to parse URL") { 686 t.Errorf("Expected URL parsing error, got %v", err) 687 } 688 }) 689 690 t.Run("fails with empty URL", func(t *testing.T) { 691 _, err := CreateArticleFromURL("", tempDir) 692 if err == nil { 693 t.Error("Expected error for empty URL") 694 } 695 }) 696 697 t.Run("fails with unsupported domain", func(t *testing.T) { 698 unsupportedURL := "https://unsupported-domain.test/article" 699 withDefaultHTTPClient(t, func(req *http.Request) (*http.Response, error) { 700 if req.URL.String() == unsupportedURL { 701 return htmlResponse(http.StatusOK, "<html><body><div>Too little content</div></body></html>"), nil 702 } 703 return nil, fmt.Errorf("unexpected request: %s", req.URL.String()) 704 }) 705 706 _, err := CreateArticleFromURL(unsupportedURL, tempDir) 707 if err == nil { 708 t.Error("Expected error for unsupported domain") 709 } 710 if !strings.Contains(err.Error(), "confidence too low") && 711 !strings.Contains(err.Error(), "could not extract title") { 712 t.Errorf("Expected heuristic extraction error, got %v", err) 713 } 714 }) 715 716 t.Run("fails with HTTP error", func(t *testing.T) { 717 errorURL := "https://example.com/missing" 718 withDefaultHTTPClient(t, func(req *http.Request) (*http.Response, error) { 719 if req.URL.String() == errorURL { 720 return &http.Response{ 721 StatusCode: http.StatusNotFound, 722 Header: make(http.Header), 723 Body: io.NopCloser(strings.NewReader("")), 724 }, nil 725 } 726 return nil, fmt.Errorf("unexpected request: %s", req.URL.String()) 727 }) 728 729 _, err := CreateArticleFromURL(errorURL, tempDir) 730 if err == nil { 731 t.Error("Expected error for HTTP 404") 732 } 733 if !strings.Contains(err.Error(), "HTTP error") && !strings.Contains(err.Error(), "404") { 734 t.Errorf("Expected HTTP error, got %v", err) 735 } 736 }) 737 738 t.Run("fails with network error", func(t *testing.T) { 739 networkURL := "https://example.com/network" 740 withDefaultHTTPClient(t, func(req 
*http.Request) (*http.Response, error) { 741 if req.URL.String() == networkURL { 742 return nil, errors.New("dial error") 743 } 744 return nil, fmt.Errorf("unexpected request: %s", req.URL.String()) 745 }) 746 747 _, err := CreateArticleFromURL(networkURL, tempDir) 748 if err == nil { 749 t.Error("Expected error for network failure") 750 } 751 if !strings.Contains(err.Error(), "failed to fetch URL") && !strings.Contains(err.Error(), "connection refused") { 752 t.Errorf("Expected network error, got %v", err) 753 } 754 }) 755 756 t.Run("fails with malformed HTML", func(t *testing.T) { 757 parser, err := NewArticleParser(http.DefaultClient) 758 if err != nil { 759 t.Fatalf("Failed to create parser: %v", err) 760 } 761 762 localhostRule := &ParsingRule{ 763 Domain: "example.com", 764 Title: "//h1[@id='firstHeading']", 765 Body: "//div[@id='bodyContent']", 766 Strip: []string{"//div[@class='noprint']"}, 767 } 768 parser.AddRule("example.com", localhostRule) 769 770 malformedURL := "https://example.com/malformed" 771 parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) { 772 if req.URL.String() == malformedURL { 773 return htmlResponse(http.StatusOK, "<html><head><title>Test</head></body>"), nil 774 } 775 return nil, fmt.Errorf("unexpected request: %s", req.URL.String()) 776 })) 777 778 _, err = parser.ParseURL(malformedURL) 779 if err == nil { 780 t.Error("Expected error for malformed HTML") 781 } 782 783 if !strings.Contains(err.Error(), "failed to parse HTML") && 784 !strings.Contains(err.Error(), "could not extract title") && 785 !strings.Contains(err.Error(), "could not extract body content") && 786 !strings.Contains(err.Error(), "confidence too low") { 787 t.Errorf("Expected HTML parsing or extraction error, got %v", err) 788 } 789 }) 790 791 t.Run("fails when no title can be extracted", func(t *testing.T) { 792 parser, err := NewArticleParser(http.DefaultClient) 793 if err != nil { 794 t.Fatalf("Failed to create parser: %v", 
err) 795 } 796 797 localhostRule := &ParsingRule{ 798 Domain: "example.com", 799 Title: "//h1[@id='firstHeading']", 800 Body: "//div[@id='bodyContent']", 801 Strip: []string{"//div[@class='noprint']"}, 802 } 803 parser.AddRule("example.com", localhostRule) 804 805 noTitleURL := "https://example.com/notitle" 806 parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) { 807 if req.URL.String() == noTitleURL { 808 return htmlResponse(http.StatusOK, `<html> 809 <head><title>Test</title></head> 810 <body> 811 <div id="bodyContent"> 812 <p>Content without proper title</p> 813 </div> 814 </body> 815 </html>`), nil 816 } 817 return nil, fmt.Errorf("unexpected request: %s", req.URL.String()) 818 })) 819 820 result, err := parser.ParseURL(noTitleURL) 821 822 if err != nil { 823 if !strings.Contains(err.Error(), "could not extract title") && 824 !strings.Contains(err.Error(), "confidence too low") { 825 t.Errorf("Expected title extraction error, got %v", err) 826 } 827 } else if result != nil { 828 if result.Title == "" { 829 t.Error("Expected title to be extracted via metadata fallback") 830 } 831 } 832 }) 833 834 t.Run("successfully creates article structure from parsed content", func(t *testing.T) { 835 wikipediaHTML := `<html> 836 <head><title>Integration Test Article</title></head> 837 <body> 838 <h1 id="firstHeading">Integration Test Article</h1> 839 <div id="bodyContent"> 840 <p>This is integration test content.</p> 841 <div class="noprint">This should be stripped</div> 842 <p>More content here.</p> 843 </div> 844 </body> 845 </html>` 846 847 parser, err := NewArticleParser(http.DefaultClient) 848 if err != nil { 849 t.Fatalf("Failed to create parser: %v", err) 850 } 851 852 localhostRule := &ParsingRule{ 853 Domain: "example.com", 854 Title: "//h1[@id='firstHeading']", 855 Body: "//div[@id='bodyContent']", 856 Strip: []string{"//div[@class='noprint']"}, 857 } 858 parser.AddRule("example.com", localhostRule) 859 860 contentURL := 
"https://example.com/integration" 861 parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) { 862 if req.URL.String() == contentURL { 863 return htmlResponse(http.StatusOK, wikipediaHTML), nil 864 } 865 return nil, fmt.Errorf("unexpected request: %s", req.URL.String()) 866 })) 867 868 content, err := parser.ParseURL(contentURL) 869 if err != nil { 870 t.Fatalf("Expected no error, got %v", err) 871 } 872 873 mdPath, htmlPath, err := parser.SaveArticle(content, tempDir) 874 if err != nil { 875 t.Fatalf("Failed to save article: %v", err) 876 } 877 878 article := &models.Article{ 879 URL: contentURL, 880 Title: content.Title, 881 MarkdownPath: mdPath, 882 HTMLPath: htmlPath, 883 Created: time.Now(), 884 Modified: time.Now(), 885 } 886 887 if article.Title != "Integration Test Article" { 888 t.Errorf("Expected title 'Integration Test Article', got %s", article.Title) 889 } 890 if article.URL != contentURL { 891 t.Errorf("Expected URL %s, got %s", contentURL, article.URL) 892 } 893 if article.MarkdownPath == "" { 894 t.Error("Expected non-empty markdown path") 895 } 896 if article.HTMLPath == "" { 897 t.Error("Expected non-empty HTML path") 898 } 899 if article.Created.IsZero() { 900 t.Error("Expected Created timestamp to be set") 901 } 902 if article.Modified.IsZero() { 903 t.Error("Expected Modified timestamp to be set") 904 } 905 906 if _, err := os.Stat(article.MarkdownPath); os.IsNotExist(err) { 907 t.Error("Expected markdown file to exist") 908 } 909 if _, err := os.Stat(article.HTMLPath); os.IsNotExist(err) { 910 t.Error("Expected HTML file to exist") 911 } 912 913 mdContent, err := os.ReadFile(article.MarkdownPath) 914 if err != nil { 915 t.Fatalf("Failed to read markdown file: %v", err) 916 } 917 if !strings.Contains(string(mdContent), "# Integration Test Article") { 918 t.Error("Expected markdown to contain title") 919 } 920 if !strings.Contains(string(mdContent), "This is integration test content") { 921 t.Error("Expected 
markdown to contain article content") 922 } 923 if strings.Contains(string(mdContent), "This should be stripped") { 924 t.Error("Expected stripped content to be removed from markdown") 925 } 926 927 htmlContent, err := os.ReadFile(article.HTMLPath) 928 if err != nil { 929 t.Fatalf("Failed to read HTML file: %v", err) 930 } 931 if !strings.Contains(string(htmlContent), "<title>Integration Test Article</title>") { 932 t.Error("Expected HTML to contain title") 933 } 934 if !strings.Contains(string(htmlContent), "<!DOCTYPE html>") { 935 t.Error("Expected HTML to contain DOCTYPE") 936 } 937 }) 938 939 t.Run("successfully handles article with metadata", func(t *testing.T) { 940 contentHTML := `<html> 941 <head> 942 <title>Test Paper</title> 943 <meta name="citation_author" content="Dr. Test Author"> 944 <meta name="citation_date" content="2024-01-01"> 945 </head> 946 <body> 947 <h1 class="title">Test Research Paper</h1> 948 <blockquote class="abstract"> 949 <p>This is the abstract of the research paper.</p> 950 <p>It contains important research findings.</p> 951 </blockquote> 952 </body> 953 </html>` 954 955 parser, err := NewArticleParser(http.DefaultClient) 956 if err != nil { 957 t.Fatalf("Failed to create parser: %v", err) 958 } 959 960 localhostRule := &ParsingRule{ 961 Domain: "example.com", 962 Title: "//h1[contains(concat(' ',normalize-space(@class),' '),' title ')]", 963 Body: "//blockquote[contains(concat(' ',normalize-space(@class),' '),' abstract ')]", 964 Date: "//meta[@name='citation_date']/@content", 965 Author: "//meta[@name='citation_author']/@content", 966 } 967 parser.AddRule("example.com", localhostRule) 968 969 contentURL := "https://example.com/metadata" 970 parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) { 971 if req.URL.String() == contentURL { 972 return htmlResponse(http.StatusOK, contentHTML), nil 973 } 974 return nil, fmt.Errorf("unexpected request: %s", req.URL.String()) 975 })) 976 977 content, err 
:= parser.ParseURL(contentURL) 978 if err != nil { 979 t.Fatalf("Expected no error, got %v", err) 980 } 981 982 if content.Title != "Test Research Paper" { 983 t.Errorf("Expected title 'Test Research Paper', got %s", content.Title) 984 } 985 if content.Author != "Dr. Test Author" { 986 t.Errorf("Expected author 'Dr. Test Author', got %s", content.Author) 987 } 988 if content.Date != "2024-01-01" { 989 t.Errorf("Expected date '2024-01-01', got %s", content.Date) 990 } 991 992 mdPath, _, err := parser.SaveArticle(content, tempDir) 993 if err != nil { 994 t.Fatalf("Failed to save article: %v", err) 995 } 996 997 mdContent, err := os.ReadFile(mdPath) 998 if err != nil { 999 t.Fatalf("Failed to read markdown file: %v", err) 1000 } 1001 if !strings.Contains(string(mdContent), "**Author:** Dr. Test Author") { 1002 t.Error("Expected markdown to contain author") 1003 } 1004 if !strings.Contains(string(mdContent), "**Date:** 2024-01-01") { 1005 t.Error("Expected markdown to contain date") 1006 } 1007 1008 article := &models.Article{ 1009 Author: content.Author, 1010 Date: content.Date, 1011 } 1012 1013 if article.Author != "Dr. Test Author" { 1014 t.Errorf("Expected article author 'Dr. 
Test Author', got %s", article.Author) 1015 } 1016 if article.Date != "2024-01-01" { 1017 t.Errorf("Expected article date '2024-01-01', got %s", article.Date) 1018 } 1019 }) 1020} 1021 1022type roundTripFunc func(*http.Request) (*http.Response, error) 1023 1024func (f roundTripFunc) RoundTrip(req *http.Request) (*http.Response, error) { 1025 return f(req) 1026} 1027 1028func newMockHTTPClient(t *testing.T, fn roundTripFunc) *http.Client { 1029 t.Helper() 1030 return &http.Client{Transport: fn} 1031} 1032 1033func htmlResponse(status int, body string) *http.Response { 1034 return &http.Response{ 1035 StatusCode: status, 1036 Header: http.Header{"Content-Type": []string{"text/html; charset=utf-8"}}, 1037 Body: io.NopCloser(strings.NewReader(body)), 1038 } 1039} 1040 1041func withDefaultHTTPClient(t *testing.T, fn roundTripFunc) { 1042 t.Helper() 1043 original := http.DefaultClient.Transport 1044 http.DefaultClient.Transport = fn 1045 t.Cleanup(func() { 1046 http.DefaultClient.Transport = original 1047 }) 1048}