cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm leaflet readability golang
29
fork

Configure Feed

Select the types of activity you want to include in your feed.

at 7452aaa4e1d32eaa8a95e2413dcd589b4c807007 814 lines 25 kB view raw
1package articles 2 3import ( 4 "fmt" 5 "net/http" 6 "net/http/httptest" 7 "os" 8 "strings" 9 "testing" 10 "time" 11 12 "github.com/stormlightlabs/noteleaf/internal/models" 13) 14 15// ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules. 16func ExampleParser_Convert() { 17 parser, err := NewArticleParser(http.DefaultClient) 18 if err != nil { 19 fmt.Printf("Failed to create parser: %v\n", err) 20 return 21 } 22 23 htmlPath := "examples/christopher-lloyd.html" 24 htmlContent, err := os.ReadFile(htmlPath) 25 if err != nil { 26 fmt.Printf("Local HTML file not found: %v\n", err) 27 return 28 } 29 30 markdown, err := parser.Convert(string(htmlContent), ".wikipedia.org", "https://en.wikipedia.org/wiki/Christopher_Lloyd") 31 if err != nil { 32 fmt.Printf("Failed to convert HTML: %v\n", err) 33 return 34 } 35 36 parts := strings.Split(markdown, "\n---\n") 37 if len(parts) > 0 { 38 frontmatter := strings.TrimSpace(parts[0]) 39 lines := strings.Split(frontmatter, "\n") 40 41 for i, line := range lines { 42 if i >= 4 { 43 break 44 } 45 46 if !strings.Contains(line, "**Saved:**") { 47 fmt.Println(line) 48 } 49 } 50 } 51 52 // Output: # Christopher Lloyd 53 // 54 // **Source:** https://en.wikipedia.org/wiki/Christopher_Lloyd 55} 56 57func TestArticleParser(t *testing.T) { 58 t.Run("New", func(t *testing.T) { 59 t.Run("successfully creates parser", func(t *testing.T) { 60 parser, err := NewArticleParser(http.DefaultClient) 61 if err != nil { 62 t.Fatalf("Expected no error, got %v", err) 63 } 64 if parser == nil { 65 t.Fatal("Expected parser to be created, got nil") 66 } 67 if len(parser.rules) == 0 { 68 t.Error("Expected rules to be loaded") 69 } 70 }) 71 72 t.Run("loads expected domains", func(t *testing.T) { 73 parser, err := NewArticleParser(http.DefaultClient) 74 if err != nil { 75 t.Fatalf("Failed to create parser: %v", err) 76 } 77 78 domains := parser.GetSupportedDomains() 79 expectedDomains := []string{".wikipedia.org", "arxiv.org", 
"baseballprospectus.com"} 80 81 if len(domains) != len(expectedDomains) { 82 t.Errorf("Expected %d domains, got %d", len(expectedDomains), len(domains)) 83 } 84 85 domainMap := make(map[string]bool) 86 for _, domain := range domains { 87 domainMap[domain] = true 88 } 89 90 for _, expected := range expectedDomains { 91 if !domainMap[expected] { 92 t.Errorf("Expected domain %s not found in supported domains", expected) 93 } 94 } 95 }) 96 }) 97 98 t.Run("parseRules", func(t *testing.T) { 99 parser := &ArticleParser{rules: make(map[string]*ParsingRule)} 100 101 t.Run("parses valid rule file", func(t *testing.T) { 102 content := `title: //h1 103author: //span[@class='author'] 104date: //time 105body: //article 106strip: //nav 107strip: //footer 108test_url: https://example.com/article` 109 110 rule, err := parser.parseRules("example.com", content) 111 if err != nil { 112 t.Fatalf("Expected no error, got %v", err) 113 } 114 115 if rule.Domain != "example.com" { 116 t.Errorf("Expected domain 'example.com', got %s", rule.Domain) 117 } 118 if rule.Title != "//h1" { 119 t.Errorf("Expected title '//h1', got %s", rule.Title) 120 } 121 if rule.Author != "//span[@class='author']" { 122 t.Errorf("Expected author '//span[@class='author']', got %s", rule.Author) 123 } 124 if len(rule.Strip) != 2 { 125 t.Errorf("Expected 2 strip rules, got %d", len(rule.Strip)) 126 } 127 if len(rule.TestURLs) != 1 { 128 t.Errorf("Expected 1 test URL, got %d", len(rule.TestURLs)) 129 } 130 }) 131 132 t.Run("handles empty lines and comments", func(t *testing.T) { 133 content := `# This is a comment 134title: //h1 135 136# Another comment 137body: //article 138` 139 140 rule, err := parser.parseRules("test.com", content) 141 if err != nil { 142 t.Fatalf("Expected no error, got %v", err) 143 } 144 145 if rule.Title != "//h1" { 146 t.Errorf("Expected title '//h1', got %s", rule.Title) 147 } 148 if rule.Body != "//article" { 149 t.Errorf("Expected body '//article', got %s", rule.Body) 150 } 151 }) 152 }) 
153 154 t.Run("slugify", func(t *testing.T) { 155 parser := &ArticleParser{} 156 157 testCases := []struct { 158 input string 159 expected string 160 }{ 161 {"Simple Title", "simple-title"}, 162 {"Title with Numbers 123", "title-with-numbers-123"}, 163 {"Title-with-Hyphens", "title-with-hyphens"}, 164 {"Title with Spaces and Multiple Spaces", "title-with-spaces-and-multiple-spaces"}, 165 {"Title!@#$%^&*()with Special Characters", "title-with-special-characters"}, 166 {"", ""}, 167 {strings.Repeat("a", 150), strings.Repeat("a", 100)}, 168 } 169 170 for _, tc := range testCases { 171 t.Run(fmt.Sprintf("slugify '%s'", tc.input), func(t *testing.T) { 172 result := parser.slugify(tc.input) 173 if result != tc.expected { 174 t.Errorf("Expected '%s', got '%s'", tc.expected, result) 175 } 176 }) 177 } 178 }) 179 180 t.Run("Convert", func(t *testing.T) { 181 parser, err := NewArticleParser(http.DefaultClient) 182 if err != nil { 183 t.Fatalf("Failed to create parser: %v", err) 184 } 185 186 t.Run("fails with unsupported domain", func(t *testing.T) { 187 htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>" 188 _, err := parser.Convert(htmlContent, "unsupported.com", "https://unsupported.com/article") 189 190 if err == nil { 191 t.Error("Expected error for unsupported domain") 192 } 193 if !strings.Contains(err.Error(), "no parsing rule found") { 194 t.Errorf("Expected 'no parsing rule found' error, got %v", err) 195 } 196 }) 197 198 t.Run("fails with invalid HTML", func(t *testing.T) { 199 invalidHTML := "<html><head><title>Test</head></body>" 200 _, err := parser.Convert(invalidHTML, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test") 201 202 if err == nil { 203 t.Error("Expected error for invalid HTML") 204 } 205 }) 206 207 t.Run("fails when no title extracted", func(t *testing.T) { 208 htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>" 209 _, err := parser.Convert(htmlContent, 
".wikipedia.org", "https://en.wikipedia.org/wiki/Test") 210 211 if err == nil { 212 t.Error("Expected error when no title can be extracted") 213 } 214 if !strings.Contains(err.Error(), "could not extract title") { 215 t.Errorf("Expected 'could not extract title' error, got %v", err) 216 } 217 }) 218 219 t.Run("successfully converts valid Wikipedia HTML", func(t *testing.T) { 220 htmlContent := `<html> 221 <head><title>Test Article</title></head> 222 <body> 223 <h1 id="firstHeading">Test Article Title</h1> 224 <div id="bodyContent"> 225 <p>This is the main content of the article.</p> 226 <div class="noprint">This should be stripped</div> 227 <p>More content here.</p> 228 </div> 229 </body> 230 </html>` 231 232 markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test") 233 if err != nil { 234 t.Fatalf("Expected no error, got %v", err) 235 } 236 237 if !strings.Contains(markdown, "# Test Article Title") { 238 t.Error("Expected markdown to contain title") 239 } 240 if !strings.Contains(markdown, "**Source:** https://en.wikipedia.org/wiki/Test") { 241 t.Error("Expected markdown to contain source URL") 242 } 243 if !strings.Contains(markdown, "This is the main content") { 244 t.Error("Expected markdown to contain article content") 245 } 246 if strings.Contains(markdown, "This should be stripped") { 247 t.Error("Expected stripped content to be removed from markdown") 248 } 249 }) 250 }) 251 252 t.Run("ParseURL", func(t *testing.T) { 253 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 254 switch { 255 case strings.Contains(r.URL.Path, "404"): 256 w.WriteHeader(http.StatusNotFound) 257 case strings.Contains(r.URL.Path, "unsupported"): 258 w.WriteHeader(http.StatusOK) 259 w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>")) 260 default: 261 // Return Wikipedia-like structure for localhost rule 262 w.WriteHeader(http.StatusOK) 263 
w.Write([]byte(`<html> 264 <head><title>Test Article</title></head> 265 <body> 266 <h1 id="firstHeading">Test Wikipedia Article</h1> 267 <div id="bodyContent"> 268 <p>This is the article content.</p> 269 <div class="noprint">This gets stripped</div> 270 </div> 271 </body> 272 </html>`)) 273 } 274 })) 275 defer server.Close() 276 277 parser, err := NewArticleParser(server.Client()) 278 if err != nil { 279 t.Fatalf("Failed to create parser: %v", err) 280 } 281 282 localhostRule := &ParsingRule{ 283 Domain: "127.0.0.1", 284 Title: "//h1[@id='firstHeading']", 285 Body: "//div[@id='bodyContent']", 286 Strip: []string{"//div[@class='noprint']"}, 287 } 288 parser.AddRule("127.0.0.1", localhostRule) 289 290 t.Run("fails with invalid URL", func(t *testing.T) { 291 _, err := parser.ParseURL("not-a-url") 292 if err == nil { 293 t.Error("Expected error for invalid URL") 294 } 295 if !strings.Contains(err.Error(), "unsupported protocol scheme") { 296 t.Errorf("Expected 'unsupported protocol scheme' error, got %v", err) 297 } 298 }) 299 300 t.Run("fails with unsupported domain", func(t *testing.T) { 301 _, err := parser.ParseURL(server.URL + "/unsupported.com") 302 if err == nil { 303 t.Error("Expected error for unsupported domain") 304 } 305 }) 306 307 t.Run("fails with HTTP error", func(t *testing.T) { 308 _, err := parser.ParseURL(server.URL + "/404/en.wikipedia.org/wiki/test") 309 if err == nil { 310 t.Error("Expected error for HTTP 404") 311 } 312 }) 313 314 }) 315 316 t.Run("SaveArticle", func(t *testing.T) { 317 parser := &ArticleParser{} 318 tempDir := t.TempDir() 319 320 content := &ParsedContent{ 321 Title: "Test Article", 322 Author: "Test Author", 323 Date: "2023-01-01", 324 Content: "This is test content.", 325 URL: "https://example.com/test", 326 } 327 328 t.Run("successfully saves article", func(t *testing.T) { 329 mdPath, htmlPath, err := parser.SaveArticle(content, tempDir) 330 if err != nil { 331 t.Fatalf("Expected no error, got %v", err) 332 } 333 334 if _, 
err := os.Stat(mdPath); os.IsNotExist(err) { 335 t.Error("Expected markdown file to exist") 336 } 337 if _, err := os.Stat(htmlPath); os.IsNotExist(err) { 338 t.Error("Expected HTML file to exist") 339 } 340 341 mdContent, err := os.ReadFile(mdPath) 342 if err != nil { 343 t.Fatalf("Failed to read markdown file: %v", err) 344 } 345 if !strings.Contains(string(mdContent), "# Test Article") { 346 t.Error("Expected markdown to contain title") 347 } 348 if !strings.Contains(string(mdContent), "**Author:** Test Author") { 349 t.Error("Expected markdown to contain author") 350 } 351 352 htmlContentBytes, err := os.ReadFile(htmlPath) 353 if err != nil { 354 t.Fatalf("Failed to read HTML file: %v", err) 355 } 356 if !strings.Contains(string(htmlContentBytes), "<title>Test Article</title>") { 357 t.Error("Expected HTML to contain title") 358 } 359 }) 360 361 t.Run("handles duplicate filenames", func(t *testing.T) { 362 mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir) 363 if err != nil { 364 t.Fatalf("Expected no error for first save, got %v", err) 365 } 366 367 mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir) 368 if err != nil { 369 t.Fatalf("Expected no error for second save, got %v", err) 370 } 371 372 if mdPath1 == mdPath2 { 373 t.Error("Expected different markdown paths for duplicate saves") 374 } 375 if htmlPath1 == htmlPath2 { 376 t.Error("Expected different HTML paths for duplicate saves") 377 } 378 379 if _, err := os.Stat(mdPath1); os.IsNotExist(err) { 380 t.Error("Expected first markdown file to exist") 381 } 382 if _, err := os.Stat(mdPath2); os.IsNotExist(err) { 383 t.Error("Expected second markdown file to exist") 384 } 385 }) 386 387 t.Run("fails with invalid directory", func(t *testing.T) { 388 invalidDir := "/nonexistent/directory" 389 _, _, err := parser.SaveArticle(content, invalidDir) 390 if err == nil { 391 t.Error("Expected error for invalid directory") 392 } 393 }) 394 }) 395 396 t.Run("createHTML", func(t *testing.T) { 
397 parser := &ArticleParser{} 398 content := &ParsedContent{ 399 Title: "Test HTML Article", 400 Author: "HTML Author", 401 Date: "2023-12-25", 402 Content: "This is **bold** content with *emphasis*.", 403 URL: "https://example.com/html-test", 404 } 405 406 t.Run("creates valid HTML", func(t *testing.T) { 407 markdown := parser.createMarkdown(content) 408 html := parser.createHTML(content, markdown) 409 410 if !strings.Contains(html, "<!DOCTYPE html>") { 411 t.Error("Expected HTML to contain DOCTYPE") 412 } 413 if !strings.Contains(html, "<title>Test HTML Article</title>") { 414 t.Error("Expected HTML to contain title") 415 } 416 if !strings.Contains(html, "<h1") || !strings.Contains(html, "Test HTML Article") { 417 t.Error("Expected HTML to contain h1 heading with title") 418 } 419 if !strings.Contains(html, "<strong>bold</strong>") { 420 t.Error("Expected HTML to contain bold formatting") 421 } 422 if !strings.Contains(html, "<em>emphasis</em>") { 423 t.Error("Expected HTML to contain emphasis formatting") 424 } 425 }) 426 }) 427 428 t.Run("createMarkdown", func(t *testing.T) { 429 parser := &ArticleParser{} 430 431 t.Run("creates markdown with all fields", func(t *testing.T) { 432 content := &ParsedContent{ 433 Title: "Full Content Article", 434 Author: "Complete Author", 435 Date: "2023-01-15", 436 Content: "Complete article content here.", 437 URL: "https://example.com/full", 438 } 439 440 markdown := parser.createMarkdown(content) 441 442 if !strings.Contains(markdown, "# Full Content Article") { 443 t.Error("Expected markdown to contain title") 444 } 445 if !strings.Contains(markdown, "**Author:** Complete Author") { 446 t.Error("Expected markdown to contain author") 447 } 448 if !strings.Contains(markdown, "**Date:** 2023-01-15") { 449 t.Error("Expected markdown to contain date") 450 } 451 if !strings.Contains(markdown, "**Source:** https://example.com/full") { 452 t.Error("Expected markdown to contain source URL") 453 } 454 if !strings.Contains(markdown, 
"**Saved:**") { 455 t.Error("Expected markdown to contain saved timestamp") 456 } 457 if !strings.Contains(markdown, "---") { 458 t.Error("Expected markdown to contain separator") 459 } 460 if !strings.Contains(markdown, "Complete article content here.") { 461 t.Error("Expected markdown to contain article content") 462 } 463 }) 464 465 t.Run("creates markdown with minimal fields", func(t *testing.T) { 466 content := &ParsedContent{ 467 Title: "Minimal Article", 468 Content: "Just content.", 469 URL: "https://example.com/minimal", 470 } 471 472 markdown := parser.createMarkdown(content) 473 474 if !strings.Contains(markdown, "# Minimal Article") { 475 t.Error("Expected markdown to contain title") 476 } 477 if strings.Contains(markdown, "**Author:**") { 478 t.Error("Expected no author field for empty author") 479 } 480 if strings.Contains(markdown, "**Date:**") { 481 t.Error("Expected no date field for empty date") 482 } 483 if !strings.Contains(markdown, "**Source:** https://example.com/minimal") { 484 t.Error("Expected markdown to contain source URL") 485 } 486 }) 487 }) 488} 489 490func TestCreateArticleFromURL(t *testing.T) { 491 tempDir := t.TempDir() 492 493 t.Run("fails with invalid URL", func(t *testing.T) { 494 _, err := CreateArticleFromURL("not-a-url", tempDir) 495 if err == nil { 496 t.Error("Expected error for invalid URL") 497 } 498 if !strings.Contains(err.Error(), "invalid URL") && !strings.Contains(err.Error(), "failed to parse URL") { 499 t.Errorf("Expected URL parsing error, got %v", err) 500 } 501 }) 502 503 t.Run("fails with empty URL", func(t *testing.T) { 504 _, err := CreateArticleFromURL("", tempDir) 505 if err == nil { 506 t.Error("Expected error for empty URL") 507 } 508 }) 509 510 t.Run("fails with unsupported domain", func(t *testing.T) { 511 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 512 w.WriteHeader(http.StatusOK) 513 
w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>")) 514 })) 515 defer server.Close() 516 517 _, err := CreateArticleFromURL(server.URL, tempDir) 518 if err == nil { 519 t.Error("Expected error for unsupported domain") 520 } 521 if !strings.Contains(err.Error(), "no parsing rule found") { 522 t.Errorf("Expected 'no parsing rule found' error, got %v", err) 523 } 524 }) 525 526 t.Run("fails with HTTP error", func(t *testing.T) { 527 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 528 w.WriteHeader(http.StatusNotFound) 529 })) 530 defer server.Close() 531 532 // Use a direct Wikipedia URL that would be processed by the real function 533 _, err := CreateArticleFromURL("https://en.wikipedia.org/wiki/NonExistentPage12345", tempDir) 534 if err == nil { 535 t.Error("Expected error for HTTP 404") 536 } 537 if !strings.Contains(err.Error(), "HTTP error") && !strings.Contains(err.Error(), "404") { 538 t.Errorf("Expected HTTP error, got %v", err) 539 } 540 }) 541 542 t.Run("fails with network error", func(t *testing.T) { 543 // Use a non-existent server to trigger network error 544 _, err := CreateArticleFromURL("http://localhost:99999/test", tempDir) 545 if err == nil { 546 t.Error("Expected error for network failure") 547 } 548 if !strings.Contains(err.Error(), "failed to fetch URL") && !strings.Contains(err.Error(), "connection refused") { 549 t.Errorf("Expected network error, got %v", err) 550 } 551 }) 552 553 t.Run("fails with invalid directory", func(t *testing.T) { 554 // Skip this test as it would require network access to test with real URLs 555 t.Skip("Skipping invalid directory test - requires network access") 556 }) 557 558 t.Run("fails with malformed HTML", func(t *testing.T) { 559 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 560 w.WriteHeader(http.StatusOK) 561 w.Write([]byte("<html><head><title>Test</head></body>")) // Malformed 
HTML 562 })) 563 defer server.Close() 564 565 // Create a custom parser with localhost rule for testing 566 parser, err := NewArticleParser(server.Client()) 567 if err != nil { 568 t.Fatalf("Failed to create parser: %v", err) 569 } 570 571 localhostRule := &ParsingRule{ 572 Domain: "127.0.0.1", 573 Title: "//h1[@id='firstHeading']", 574 Body: "//div[@id='bodyContent']", 575 Strip: []string{"//div[@class='noprint']"}, 576 } 577 parser.AddRule("127.0.0.1", localhostRule) 578 579 _, err = parser.ParseURL(server.URL) 580 if err == nil { 581 t.Error("Expected error for malformed HTML") 582 } 583 // Malformed HTML may either fail to parse or fail to extract title 584 if !strings.Contains(err.Error(), "failed to parse HTML") && !strings.Contains(err.Error(), "could not extract title") { 585 t.Errorf("Expected HTML parsing or title extraction error, got %v", err) 586 } 587 }) 588 589 t.Run("fails when no title can be extracted", func(t *testing.T) { 590 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 591 w.WriteHeader(http.StatusOK) 592 w.Write([]byte(`<html> 593 <head><title>Test</title></head> 594 <body> 595 <div id="bodyContent"> 596 <p>Content without proper title</p> 597 </div> 598 </body> 599 </html>`)) // No h1 with id="firstHeading" 600 })) 601 defer server.Close() 602 603 // Create a custom parser with localhost rule for testing 604 parser, err := NewArticleParser(server.Client()) 605 if err != nil { 606 t.Fatalf("Failed to create parser: %v", err) 607 } 608 609 localhostRule := &ParsingRule{ 610 Domain: "127.0.0.1", 611 Title: "//h1[@id='firstHeading']", 612 Body: "//div[@id='bodyContent']", 613 Strip: []string{"//div[@class='noprint']"}, 614 } 615 parser.AddRule("127.0.0.1", localhostRule) 616 617 _, err = parser.ParseURL(server.URL) 618 if err == nil { 619 t.Error("Expected error when no title can be extracted") 620 } 621 if !strings.Contains(err.Error(), "could not extract title") { 622 t.Errorf("Expected 'could not 
extract title' error, got %v", err) 623 } 624 }) 625 626 t.Run("successfully creates article structure from parsed content", func(t *testing.T) { 627 wikipediaHTML := `<html> 628 <head><title>Integration Test Article</title></head> 629 <body> 630 <h1 id="firstHeading">Integration Test Article</h1> 631 <div id="bodyContent"> 632 <p>This is integration test content.</p> 633 <div class="noprint">This should be stripped</div> 634 <p>More content here.</p> 635 </div> 636 </body> 637 </html>` 638 639 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 640 w.WriteHeader(http.StatusOK) 641 w.Write([]byte(wikipediaHTML)) 642 })) 643 defer server.Close() 644 645 // Create a custom parser with localhost rule for testing 646 parser, err := NewArticleParser(server.Client()) 647 if err != nil { 648 t.Fatalf("Failed to create parser: %v", err) 649 } 650 651 localhostRule := &ParsingRule{ 652 Domain: "127.0.0.1", 653 Title: "//h1[@id='firstHeading']", 654 Body: "//div[@id='bodyContent']", 655 Strip: []string{"//div[@class='noprint']"}, 656 } 657 parser.AddRule("127.0.0.1", localhostRule) 658 659 content, err := parser.ParseURL(server.URL) 660 if err != nil { 661 t.Fatalf("Expected no error, got %v", err) 662 } 663 664 mdPath, htmlPath, err := parser.SaveArticle(content, tempDir) 665 if err != nil { 666 t.Fatalf("Failed to save article: %v", err) 667 } 668 669 // Test that it creates a proper models.Article structure (simulating CreateArticleFromURL) 670 article := &models.Article{ 671 URL: server.URL, 672 Title: content.Title, 673 MarkdownPath: mdPath, 674 HTMLPath: htmlPath, 675 Created: time.Now(), 676 Modified: time.Now(), 677 } 678 679 if article.Title != "Integration Test Article" { 680 t.Errorf("Expected title 'Integration Test Article', got %s", article.Title) 681 } 682 if article.URL != server.URL { 683 t.Errorf("Expected URL %s, got %s", server.URL, article.URL) 684 } 685 if article.MarkdownPath == "" { 686 t.Error("Expected 
non-empty markdown path") 687 } 688 if article.HTMLPath == "" { 689 t.Error("Expected non-empty HTML path") 690 } 691 if article.Created.IsZero() { 692 t.Error("Expected Created timestamp to be set") 693 } 694 if article.Modified.IsZero() { 695 t.Error("Expected Modified timestamp to be set") 696 } 697 698 // Check files exist 699 if _, err := os.Stat(article.MarkdownPath); os.IsNotExist(err) { 700 t.Error("Expected markdown file to exist") 701 } 702 if _, err := os.Stat(article.HTMLPath); os.IsNotExist(err) { 703 t.Error("Expected HTML file to exist") 704 } 705 706 // Verify file contents 707 mdContent, err := os.ReadFile(article.MarkdownPath) 708 if err != nil { 709 t.Fatalf("Failed to read markdown file: %v", err) 710 } 711 if !strings.Contains(string(mdContent), "# Integration Test Article") { 712 t.Error("Expected markdown to contain title") 713 } 714 if !strings.Contains(string(mdContent), "This is integration test content") { 715 t.Error("Expected markdown to contain article content") 716 } 717 if strings.Contains(string(mdContent), "This should be stripped") { 718 t.Error("Expected stripped content to be removed from markdown") 719 } 720 721 htmlContent, err := os.ReadFile(article.HTMLPath) 722 if err != nil { 723 t.Fatalf("Failed to read HTML file: %v", err) 724 } 725 if !strings.Contains(string(htmlContent), "<title>Integration Test Article</title>") { 726 t.Error("Expected HTML to contain title") 727 } 728 if !strings.Contains(string(htmlContent), "<!DOCTYPE html>") { 729 t.Error("Expected HTML to contain DOCTYPE") 730 } 731 }) 732 733 t.Run("successfully handles article with metadata", func(t *testing.T) { 734 contentHTML := `<html> 735 <head> 736 <title>Test Paper</title> 737 <meta name="citation_author" content="Dr. 
Test Author"> 738 <meta name="citation_date" content="2024-01-01"> 739 </head> 740 <body> 741 <h1 class="title">Test Research Paper</h1> 742 <blockquote class="abstract"> 743 <p>This is the abstract of the research paper.</p> 744 <p>It contains important research findings.</p> 745 </blockquote> 746 </body> 747 </html>` 748 749 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 750 w.WriteHeader(http.StatusOK) 751 w.Write([]byte(contentHTML)) 752 })) 753 defer server.Close() 754 755 // Create a custom parser with arXiv-like rule for testing 756 parser, err := NewArticleParser(server.Client()) 757 if err != nil { 758 t.Fatalf("Failed to create parser: %v", err) 759 } 760 761 localhostRule := &ParsingRule{ 762 Domain: "127.0.0.1", 763 Title: "//h1[contains(concat(' ',normalize-space(@class),' '),' title ')]", 764 Body: "//blockquote[contains(concat(' ',normalize-space(@class),' '),' abstract ')]", 765 Date: "//meta[@name='citation_date']/@content", 766 Author: "//meta[@name='citation_author']/@content", 767 } 768 parser.AddRule("127.0.0.1", localhostRule) 769 770 content, err := parser.ParseURL(server.URL) 771 if err != nil { 772 t.Fatalf("Expected no error, got %v", err) 773 } 774 775 if content.Title != "Test Research Paper" { 776 t.Errorf("Expected title 'Test Research Paper', got %s", content.Title) 777 } 778 if content.Author != "Dr. Test Author" { 779 t.Errorf("Expected author 'Dr. Test Author', got %s", content.Author) 780 } 781 if content.Date != "2024-01-01" { 782 t.Errorf("Expected date '2024-01-01', got %s", content.Date) 783 } 784 785 mdPath, _, err := parser.SaveArticle(content, tempDir) 786 if err != nil { 787 t.Fatalf("Failed to save article: %v", err) 788 } 789 790 // Verify markdown contains all metadata 791 mdContent, err := os.ReadFile(mdPath) 792 if err != nil { 793 t.Fatalf("Failed to read markdown file: %v", err) 794 } 795 if !strings.Contains(string(mdContent), "**Author:** Dr. 
Test Author") { 796 t.Error("Expected markdown to contain author") 797 } 798 if !strings.Contains(string(mdContent), "**Date:** 2024-01-01") { 799 t.Error("Expected markdown to contain date") 800 } 801 802 article := &models.Article{ 803 Author: content.Author, 804 Date: content.Date, 805 } 806 807 if article.Author != "Dr. Test Author" { 808 t.Errorf("Expected article author 'Dr. Test Author', got %s", article.Author) 809 } 810 if article.Date != "2024-01-01" { 811 t.Errorf("Expected article date '2024-01-01', got %s", article.Date) 812 } 813 }) 814}