cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm leaflet readability golang
29
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 430 lines 11 kB view raw
1package articles 2 3import ( 4 "strings" 5 "testing" 6) 7 8func TestMetadataExtractor(t *testing.T) { 9 t.Run("NewMetadataExtractor", func(t *testing.T) { 10 t.Run("creates extractor", func(t *testing.T) { 11 extractor := NewMetadataExtractor() 12 13 if extractor == nil { 14 t.Fatal("Expected extractor to be created, got nil") 15 } 16 }) 17 }) 18 19 t.Run("ExtractTitle", func(t *testing.T) { 20 extractor := NewMetadataExtractor() 21 22 t.Run("extracts from OpenGraph", func(t *testing.T) { 23 htmlStr := `<html><head> 24 <meta property="og:title" content="Article Title from OpenGraph"> 25 </head><body></body></html>` 26 doc := parseHTML(htmlStr) 27 28 title := extractor.ExtractTitle(doc) 29 30 if title != "Article Title from OpenGraph" { 31 t.Errorf("Expected OpenGraph title, got %q", title) 32 } 33 }) 34 35 t.Run("extracts from title tag", func(t *testing.T) { 36 htmlStr := `<html><head> 37 <title>Page Title from Title Tag</title> 38 </head><body></body></html>` 39 doc := parseHTML(htmlStr) 40 41 title := extractor.ExtractTitle(doc) 42 43 if title != "Page Title from Title Tag" { 44 t.Errorf("Expected title tag content, got %q", title) 45 } 46 }) 47 48 t.Run("extracts from h1", func(t *testing.T) { 49 htmlStr := `<html><body> 50 <h1>Heading Title</h1> 51 </body></html>` 52 doc := parseHTML(htmlStr) 53 54 title := extractor.ExtractTitle(doc) 55 56 if title != "Heading Title" { 57 t.Errorf("Expected h1 content, got %q", title) 58 } 59 }) 60 61 t.Run("returns empty for nil document", func(t *testing.T) { 62 title := extractor.ExtractTitle(nil) 63 64 if title != "" { 65 t.Errorf("Expected empty string for nil document, got %q", title) 66 } 67 }) 68 69 t.Run("prioritizes OpenGraph over title tag", func(t *testing.T) { 70 htmlStr := `<html><head> 71 <meta property="og:title" content="OpenGraph Title"> 72 <title>HTML Title</title> 73 </head><body></body></html>` 74 doc := parseHTML(htmlStr) 75 76 title := extractor.ExtractTitle(doc) 77 78 if title != "OpenGraph Title" { 
79 t.Errorf("Expected OpenGraph title to have priority, got %q", title) 80 } 81 }) 82 }) 83 84 t.Run("ExtractAuthor", func(t *testing.T) { 85 extractor := NewMetadataExtractor() 86 87 t.Run("extracts from OpenGraph", func(t *testing.T) { 88 htmlStr := `<html><head> 89 <meta property="og:author" content="John Doe"> 90 </head><body></body></html>` 91 doc := parseHTML(htmlStr) 92 93 author := extractor.ExtractAuthor(doc) 94 95 if author != "John Doe" { 96 t.Errorf("Expected OpenGraph author, got %q", author) 97 } 98 }) 99 100 t.Run("extracts from meta tag", func(t *testing.T) { 101 htmlStr := `<html><head> 102 <meta name="author" content="Jane Smith"> 103 </head><body></body></html>` 104 doc := parseHTML(htmlStr) 105 106 author := extractor.ExtractAuthor(doc) 107 108 if author != "Jane Smith" { 109 t.Errorf("Expected meta tag author, got %q", author) 110 } 111 }) 112 113 t.Run("extracts from rel=author link", func(t *testing.T) { 114 htmlStr := `<html><body> 115 <a rel="author" href="/author/bob">Bob Johnson</a> 116 </body></html>` 117 doc := parseHTML(htmlStr) 118 119 author := extractor.ExtractAuthor(doc) 120 121 if author != "Bob Johnson" { 122 t.Errorf("Expected rel=author link text, got %q", author) 123 } 124 }) 125 126 t.Run("extracts from byline class", func(t *testing.T) { 127 htmlStr := `<html><body> 128 <span class="author-name">Alice Brown</span> 129 </body></html>` 130 doc := parseHTML(htmlStr) 131 132 author := extractor.ExtractAuthor(doc) 133 134 if author != "Alice Brown" { 135 t.Errorf("Expected byline class text, got %q", author) 136 } 137 }) 138 139 t.Run("returns empty for nil document", func(t *testing.T) { 140 author := extractor.ExtractAuthor(nil) 141 142 if author != "" { 143 t.Errorf("Expected empty string for nil document, got %q", author) 144 } 145 }) 146 }) 147 148 t.Run("ExtractPublishedDate", func(t *testing.T) { 149 extractor := NewMetadataExtractor() 150 151 t.Run("extracts from OpenGraph", func(t *testing.T) { 152 htmlStr := 
`<html><head> 153 <meta property="og:published_time" content="2025-01-15T10:00:00Z"> 154 </head><body></body></html>` 155 doc := parseHTML(htmlStr) 156 157 date := extractor.ExtractPublishedDate(doc) 158 159 if date != "2025-01-15T10:00:00Z" { 160 t.Errorf("Expected OpenGraph date, got %q", date) 161 } 162 }) 163 164 t.Run("extracts from article:published_time", func(t *testing.T) { 165 htmlStr := `<html><head> 166 <meta property="article:published_time" content="2025-02-20"> 167 </head><body></body></html>` 168 doc := parseHTML(htmlStr) 169 170 date := extractor.ExtractPublishedDate(doc) 171 172 if date != "2025-02-20" { 173 t.Errorf("Expected article:published_time, got %q", date) 174 } 175 }) 176 177 t.Run("extracts from time element", func(t *testing.T) { 178 htmlStr := `<html><body> 179 <time datetime="2025-03-25T14:30:00">March 25, 2025</time> 180 </body></html>` 181 doc := parseHTML(htmlStr) 182 183 date := extractor.ExtractPublishedDate(doc) 184 185 if date != "2025-03-25T14:30:00" { 186 t.Errorf("Expected time element datetime, got %q", date) 187 } 188 }) 189 190 t.Run("returns empty for nil document", func(t *testing.T) { 191 date := extractor.ExtractPublishedDate(nil) 192 193 if date != "" { 194 t.Errorf("Expected empty string for nil document, got %q", date) 195 } 196 }) 197 }) 198 199 t.Run("ExtractSiteName", func(t *testing.T) { 200 extractor := NewMetadataExtractor() 201 202 t.Run("extracts from OpenGraph", func(t *testing.T) { 203 htmlStr := `<html><head> 204 <meta property="og:site_name" content="Example News"> 205 </head><body></body></html>` 206 doc := parseHTML(htmlStr) 207 208 siteName := extractor.ExtractSiteName(doc) 209 210 if siteName != "Example News" { 211 t.Errorf("Expected OpenGraph site_name, got %q", siteName) 212 } 213 }) 214 215 t.Run("extracts from application-name", func(t *testing.T) { 216 htmlStr := `<html><head> 217 <meta name="application-name" content="Tech Blog"> 218 </head><body></body></html>` 219 doc := parseHTML(htmlStr) 
220 221 siteName := extractor.ExtractSiteName(doc) 222 223 if siteName != "Tech Blog" { 224 t.Errorf("Expected application-name, got %q", siteName) 225 } 226 }) 227 228 t.Run("returns empty for nil document", func(t *testing.T) { 229 siteName := extractor.ExtractSiteName(nil) 230 231 if siteName != "" { 232 t.Errorf("Expected empty string for nil document, got %q", siteName) 233 } 234 }) 235 }) 236 237 t.Run("ExtractLanguage", func(t *testing.T) { 238 extractor := NewMetadataExtractor() 239 240 t.Run("extracts from html lang attribute", func(t *testing.T) { 241 htmlStr := `<html lang="en-US"><body></body></html>` 242 doc := parseHTML(htmlStr) 243 244 lang := extractor.ExtractLanguage(doc) 245 246 if lang != "en-US" { 247 t.Errorf("Expected html lang attribute, got %q", lang) 248 } 249 }) 250 251 t.Run("extracts from OpenGraph locale", func(t *testing.T) { 252 htmlStr := `<html><head> 253 <meta property="og:locale" content="fr-FR"> 254 </head><body></body></html>` 255 doc := parseHTML(htmlStr) 256 257 lang := extractor.ExtractLanguage(doc) 258 259 if lang != "fr-FR" { 260 t.Errorf("Expected OpenGraph locale, got %q", lang) 261 } 262 }) 263 264 t.Run("returns empty for nil document", func(t *testing.T) { 265 lang := extractor.ExtractLanguage(nil) 266 267 if lang != "" { 268 t.Errorf("Expected empty string for nil document, got %q", lang) 269 } 270 }) 271 }) 272 273 t.Run("getSchemaOrgField", func(t *testing.T) { 274 extractor := NewMetadataExtractor() 275 276 t.Run("extracts from JSON-LD Article", func(t *testing.T) { 277 htmlStr := `<html><head> 278 <script type="application/ld+json"> 279 { 280 "@context": "https://schema.org", 281 "@type": "Article", 282 "headline": "Test Article", 283 "author": "Test Author", 284 "datePublished": "2025-01-15" 285 } 286 </script> 287 </head><body></body></html>` 288 doc := parseHTML(htmlStr) 289 290 headline := extractor.getSchemaOrgField(doc, "headline") 291 author := extractor.getSchemaOrgField(doc, "author") 292 date := 
extractor.getSchemaOrgField(doc, "datePublished") 293 294 if headline != "Test Article" { 295 t.Errorf("Expected headline from JSON-LD, got %q", headline) 296 } 297 298 if author != "Test Author" { 299 t.Errorf("Expected author from JSON-LD, got %q", author) 300 } 301 302 if date != "2025-01-15" { 303 t.Errorf("Expected datePublished from JSON-LD, got %q", date) 304 } 305 }) 306 307 t.Run("extracts from NewsArticle type", func(t *testing.T) { 308 htmlStr := `<html><head> 309 <script type="application/ld+json"> 310 { 311 "@context": "https://schema.org", 312 "@type": "NewsArticle", 313 "headline": "Breaking News" 314 } 315 </script> 316 </head><body></body></html>` 317 doc := parseHTML(htmlStr) 318 319 headline := extractor.getSchemaOrgField(doc, "headline") 320 321 if headline != "Breaking News" { 322 t.Errorf("Expected headline from NewsArticle, got %q", headline) 323 } 324 }) 325 326 t.Run("handles nested author object", func(t *testing.T) { 327 htmlStr := `<html><head> 328 <script type="application/ld+json"> 329 { 330 "@context": "https://schema.org", 331 "@type": "Article", 332 "author": { 333 "@type": "Person", 334 "name": "Nested Author" 335 } 336 } 337 </script> 338 </head><body></body></html>` 339 doc := parseHTML(htmlStr) 340 341 author := extractor.getSchemaOrgField(doc, "author") 342 343 if author != "Nested Author" { 344 t.Errorf("Expected nested author name, got %q", author) 345 } 346 }) 347 348 t.Run("returns empty for invalid JSON", func(t *testing.T) { 349 htmlStr := `<html><head> 350 <script type="application/ld+json"> 351 { invalid json } 352 </script> 353 </head><body></body></html>` 354 doc := parseHTML(htmlStr) 355 356 result := extractor.getSchemaOrgField(doc, "headline") 357 358 if result != "" { 359 t.Errorf("Expected empty for invalid JSON, got %q", result) 360 } 361 }) 362 363 t.Run("returns empty for non-Article types", func(t *testing.T) { 364 htmlStr := `<html><head> 365 <script type="application/ld+json"> 366 { 367 "@context": 
"https://schema.org", 368 "@type": "WebPage", 369 "headline": "Not an article" 370 } 371 </script> 372 </head><body></body></html>` 373 doc := parseHTML(htmlStr) 374 375 result := extractor.getSchemaOrgField(doc, "headline") 376 377 if result != "" { 378 t.Errorf("Expected empty for WebPage type, got %q", result) 379 } 380 }) 381 }) 382 383 t.Run("ExtractMetadata", func(t *testing.T) { 384 extractor := NewMetadataExtractor() 385 386 t.Run("extracts all metadata fields", func(t *testing.T) { 387 htmlStr := `<html lang="en"><head> 388 <title>Full Article Title</title> 389 <meta property="og:author" content="Full Name"> 390 <meta property="article:published_time" content="2025-01-20"> 391 <meta property="og:site_name" content="News Site"> 392 </head><body></body></html>` 393 doc := parseHTML(htmlStr) 394 395 result := extractor.ExtractMetadata(doc) 396 397 if result == nil { 398 t.Fatal("Expected result, got nil") 399 } 400 401 if !strings.Contains(result.Title, "Full Article Title") { 402 t.Errorf("Expected title to be extracted, got %q", result.Title) 403 } 404 405 if result.Author != "Full Name" { 406 t.Errorf("Expected author to be extracted, got %q", result.Author) 407 } 408 409 if result.PublishedDate != "2025-01-20" { 410 t.Errorf("Expected date to be extracted, got %q", result.PublishedDate) 411 } 412 413 if result.SiteName != "News Site" { 414 t.Errorf("Expected site name to be extracted, got %q", result.SiteName) 415 } 416 417 if result.Language != "en" { 418 t.Errorf("Expected language to be extracted, got %q", result.Language) 419 } 420 }) 421 422 t.Run("returns empty result for nil document", func(t *testing.T) { 423 result := extractor.ExtractMetadata(nil) 424 425 if result == nil { 426 t.Error("Expected empty result, got nil") 427 } 428 }) 429 }) 430}