experiments in a post-browser web
10
fork

Configure Feed

Select the types of activity you want to include in your feed.

test(entities): add unit tests for name-validator module

+545
+545
tests/unit/name-validator.test.js
/**
 * Unit tests for the entities name-validator module.
 *
 * Covers the three functions exercised below:
 *   - classifyNamePattern(name)            -> { likelyType, confidence, reason }
 *   - isLikelyPersonName(name)             -> { likely, confidence, reason }
 *   - validateEntityType(name, claimedType) -> { valid, reason, suggestedType }
 *
 * Only the result fields asserted in this file are listed above; the module
 * may return more.
 */
import { describe, it, before } from 'node:test';
import { strict as assert } from 'node:assert';
import { readFileSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath, pathToFileURL } from 'url';

const __dirname = dirname(fileURLToPath(import.meta.url));
const entitiesDir = join(__dirname, '..', '..', 'extensions', 'entities');
const modulePath = join(entitiesDir, 'name-validator.js');

// Dynamic import since the module uses ESM exports.
// The absolute path is converted to a file:// URL first: a bare absolute path
// such as "C:\repo\..." is not a valid ESM specifier on Windows.
const {
  classifyNamePattern,
  isLikelyPersonName,
  validateEntityType,
  loadNameDatabaseSync,
} = await import(pathToFileURL(modulePath).href);

// Load the name database synchronously before any test runs.
const namesPath = join(entitiesDir, 'data', 'names.json');
const namesData = JSON.parse(readFileSync(namesPath, 'utf-8'));
loadNameDatabaseSync(namesData);

// ---------------------------------------------------------------------------
// Assertion helpers
// ---------------------------------------------------------------------------

/**
 * Runs classifyNamePattern and asserts on its likelyType (and confidence).
 *
 * @param {*} name - Input passed straight to classifyNamePattern.
 * @param {string} likelyType - Expected `likelyType`.
 * @param {number} [confidence] - Expected exact `confidence`; skipped when omitted.
 * @returns {object} The raw result, for any extra assertions.
 */
function expectPattern(name, likelyType, confidence) {
  const result = classifyNamePattern(name);
  assert.equal(result.likelyType, likelyType);
  if (confidence !== undefined) {
    assert.equal(result.confidence, confidence);
  }
  return result;
}

/**
 * Runs isLikelyPersonName and asserts on its likely flag (and confidence).
 *
 * @param {*} name - Input passed straight to isLikelyPersonName.
 * @param {boolean} likely - Expected `likely`.
 * @param {number} [confidence] - Expected exact `confidence`; skipped when omitted.
 * @returns {object} The raw result, for any extra assertions.
 */
function expectPerson(name, likely, confidence) {
  const result = isLikelyPersonName(name);
  assert.equal(result.likely, likely);
  if (confidence !== undefined) {
    assert.equal(result.confidence, confidence);
  }
  return result;
}

/**
 * Runs validateEntityType and asserts on its valid flag (and suggestedType).
 *
 * @param {*} name - Entity name passed to validateEntityType.
 * @param {*} claimedType - Claimed entity type passed to validateEntityType.
 * @param {boolean} valid - Expected `valid`.
 * @param {string} [suggestedType] - Expected `suggestedType`; skipped when omitted.
 * @returns {object} The raw result, for any extra assertions.
 */
function expectValidation(name, claimedType, valid, suggestedType) {
  const result = validateEntityType(name, claimedType);
  assert.equal(result.valid, valid);
  if (suggestedType !== undefined) {
    assert.equal(result.suggestedType, suggestedType);
  }
  return result;
}

// ---------------------------------------------------------------------------
// classifyNamePattern
// ---------------------------------------------------------------------------
describe('classifyNamePattern', () => {
  describe('edge cases', () => {
    it('returns unknown for empty string', () => {
      expectPattern('', 'unknown', 0);
    });

    it('returns unknown for null', () => {
      expectPattern(null, 'unknown', 0);
    });

    it('returns unknown for undefined', () => {
      expectPattern(undefined, 'unknown', 0);
    });

    it('returns unknown for non-string input', () => {
      expectPattern(42, 'unknown');
    });
  });

  describe('URL and path-like names', () => {
    it('detects https URL', () => {
      const r = expectPattern('https://example.com', 'not_entity');
      assert.ok(r.confidence >= 0.9);
    });

    it('detects http URL', () => {
      expectPattern('http://example.com/page', 'not_entity');
    });

    it('detects www prefix', () => {
      expectPattern('www.example.com', 'not_entity');
    });

    it('detects file path with slashes', () => {
      expectPattern('path/to/file.html', 'not_entity');
    });

    it('detects .com domain', () => {
      expectPattern('example.com', 'not_entity');
    });

    it('detects .org domain', () => {
      expectPattern('mozilla.org', 'not_entity');
    });
  });

  describe('organization suffixes', () => {
    it('detects "Inc" suffix', () => {
      expectPattern('Apple Inc', 'organization', 0.9);
    });

    it('detects "LLC" suffix', () => {
      expectPattern('Google LLC', 'organization', 0.9);
    });

    it('detects "Foundation" suffix', () => {
      expectPattern('Mozilla Foundation', 'organization', 0.9);
    });

    it('detects "Ltd" suffix', () => {
      expectPattern('Acme Ltd', 'organization');
    });

    it('detects "University" anywhere', () => {
      expectPattern('Stanford University Press', 'organization');
    });

    it('detects "Corp" suffix', () => {
      expectPattern('Mega Corp', 'organization');
    });
  });

  describe('honorifics', () => {
    it('detects "Dr." prefix', () => {
      expectPattern('Dr. Jane Doe', 'person', 0.85);
    });

    it('detects "Prof." prefix', () => {
      expectPattern('Prof. Smith', 'person', 0.85);
    });

    it('detects "Mr" prefix without dot', () => {
      expectPattern('Mr Johnson', 'person');
    });

    it('detects "Mrs." prefix', () => {
      expectPattern('Mrs. Williams', 'person');
    });

    it('detects "Sir" prefix', () => {
      expectPattern('Sir Isaac Newton', 'person');
    });

    // The input is the abbreviation "Sen.", so the title names that form.
    it('detects "Sen." prefix (Senator abbreviation)', () => {
      expectPattern('Sen. Warren', 'person');
    });
  });

  describe('"The X" pattern', () => {
    it('detects "The New York Times"', () => {
      expectPattern('The New York Times', 'organization', 0.7);
    });

    it('detects "The Guardian"', () => {
      expectPattern('The Guardian', 'organization');
    });

    // "The" alone (single word) should not match — needs at least 2 words
    it('does not match bare "The"', () => {
      const r = classifyNamePattern('The');
      assert.notEqual(r.likelyType, 'organization');
    });
  });

  describe('CamelCase brands (single word)', () => {
    it('detects "IndieWeb"', () => {
      const r = expectPattern('IndieWeb', 'organization', 0.8);
      assert.ok(r.reason.includes('CamelCase'));
    });

    it('detects "YouTube"', () => {
      expectPattern('YouTube', 'organization');
    });

    it('detects "GitHub"', () => {
      expectPattern('GitHub', 'organization');
    });

    it('detects "WordPress"', () => {
      expectPattern('WordPress', 'organization');
    });

    it('detects "TikTok"', () => {
      expectPattern('TikTok', 'organization');
    });

    it('detects "LinkedIn"', () => {
      expectPattern('LinkedIn', 'organization');
    });
  });

  describe('ALL_CAPS abbreviations', () => {
    it('detects "NASA"', () => {
      expectPattern('NASA', 'organization', 0.6);
    });

    it('detects "FBI"', () => {
      expectPattern('FBI', 'organization');
    });

    it('detects "UNESCO"', () => {
      expectPattern('UNESCO', 'organization');
    });
  });

  describe('leading CamelCase in multi-word', () => {
    it('detects "IndieWeb Movement"', () => {
      expectPattern('IndieWeb Movement', 'organization', 0.7);
    });
  });

  describe('two-word title case (ambiguous)', () => {
    it('returns ambiguous for "John Smith"', () => {
      expectPattern('John Smith', 'ambiguous', 0.5);
    });

    it('returns ambiguous for "Jane Doe"', () => {
      expectPattern('Jane Doe', 'ambiguous');
    });
  });

  describe('single word ambiguous', () => {
    it('returns ambiguous for single title-case word', () => {
      expectPattern('Apple', 'ambiguous', 0.4);
    });
  });

  describe('"&" and "and" patterns', () => {
    // Note: \b&\b in the regex does not match because & is not a word character,
    // so "Ben & Jerry" falls through to "unknown". This is a quirk of the regex;
    // the test pins the current behavior rather than endorsing it.
    it('does not detect bare ampersand (regex limitation)', () => {
      expectPattern('Ben & Jerry', 'unknown');
    });

    it('detects "and" conjunction', () => {
      expectPattern('Simon and Garfunkel', 'organization');
    });
  });

  describe('no strong pattern', () => {
    it('returns unknown for multi-word non-matching name', () => {
      expectPattern('something random here', 'unknown', 0.3);
    });
  });
});

// ---------------------------------------------------------------------------
// isLikelyPersonName
// ---------------------------------------------------------------------------
describe('isLikelyPersonName', () => {
  describe('edge cases', () => {
    it('returns not-likely for empty string', () => {
      expectPerson('', false, 0);
    });

    it('returns not-likely for null', () => {
      expectPerson(null, false);
    });

    it('returns not-likely for undefined', () => {
      expectPerson(undefined, false);
    });
  });

  describe('known first + surname', () => {
    it('recognizes "John Smith" as likely person (both in DB)', () => {
      const r = expectPerson('John Smith', true, 0.9);
      assert.ok(r.reason.includes('first name'));
      assert.ok(r.reason.includes('surname'));
    });
  });

  describe('known first name only', () => {
    it('recognizes "James Xyzzyplugh" (first name known, surname not)', () => {
      expectPerson('James Xyzzyplugh', true, 0.7);
    });
  });

  describe('known surname only', () => {
    it('recognizes "Zxqwkj Smith" (surname known, first not)', () => {
      expectPerson('Zxqwkj Smith', true, 0.5);
    });
  });

  describe('mononyms (single known first name)', () => {
    it('recognizes single known first name', () => {
      const r = expectPerson('Mary', true, 0.5);
      assert.ok(r.reason.includes('single word'));
    });
  });

  describe('unknown single word', () => {
    it('rejects unknown single word', () => {
      expectPerson('Xyzzy', false);
    });

    it('rejects brand name', () => {
      expectPerson('IndieWeb', false);
    });
  });

  describe('honorific handling', () => {
    it('recognizes honorific + known first name', () => {
      // "Mary" is expected to be in the names DB, so this takes the
      // honorific + known-first-name path (0.9).
      const known = expectPerson('Dr. Mary Johnson', true, 0.9);
      assert.ok(known.reason.includes('honorific'));
      assert.ok(known.reason.includes('known first name'));

      // "Jane" is expected NOT to be in the names DB, so "Dr. Jane Doe" is
      // still recognized as a person, but via the honorific alone (0.75).
      const honorificOnly = expectPerson('Dr. Jane Doe', true, 0.75);
      assert.ok(honorificOnly.reason.includes('honorific'));
    });

    it('recognizes honorific + unknown name', () => {
      const r = expectPerson('Dr. Xyzzy', true, 0.75);
      assert.ok(r.reason.includes('honorific'));
    });
  });

  describe('two-word title case without DB match', () => {
    it('returns not-likely with moderate confidence', () => {
      expectPerson('Zxqwkj Plugh', false, 0.4);
    });
  });

  describe('multi-word no match', () => {
    it('returns not-likely for completely unknown multi-word', () => {
      expectPerson('something random here', false, 0.2);
    });
  });
});

// ---------------------------------------------------------------------------
// validateEntityType
// ---------------------------------------------------------------------------
describe('validateEntityType', () => {
  describe('edge cases', () => {
    it('returns invalid for empty name', () => {
      expectValidation('', 'person', false);
    });

    it('returns invalid for null name', () => {
      expectValidation(null, 'person', false);
    });

    it('returns invalid for missing type', () => {
      expectValidation('John Smith', '', false);
    });

    it('returns invalid for null type', () => {
      expectValidation('John Smith', null, false);
    });

    it('returns invalid for undefined type', () => {
      expectValidation('John Smith', undefined, false);
    });
  });

  describe('URL/path rejection', () => {
    it('rejects URL as person', () => {
      const r = expectValidation('https://example.com', 'person', false);
      assert.ok(r.reason.includes('URL'));
    });

    it('rejects URL as organization', () => {
      expectValidation('https://example.com', 'organization', false);
    });

    it('rejects path as person', () => {
      expectValidation('path/to/file.html', 'person', false);
    });
  });

  describe('person claims — invalid', () => {
    it('rejects CamelCase brand as person', () => {
      expectValidation('IndieWeb', 'person', false, 'organization');
    });

    it('rejects org suffix as person', () => {
      expectValidation('Apple Inc', 'person', false, 'organization');
    });

    it('rejects "The X" pattern as person', () => {
      expectValidation('The New York Times', 'person', false, 'organization');
    });

    it('rejects ALL_CAPS as person', () => {
      expectValidation('NASA', 'person', false, 'organization');
    });

    it('rejects leading CamelCase multi-word as person', () => {
      expectValidation('IndieWeb Movement', 'person', false, 'organization');
    });

    it('rejects unknown single-word as person', () => {
      const r = expectValidation('Xyzzy', 'person', false);
      assert.ok(r.reason.includes('single-word'));
    });
  });

  describe('person claims — valid', () => {
    it('accepts "John Smith" as person', () => {
      expectValidation('John Smith', 'person', true);
    });

    it('accepts "Dr. Jane Doe" as person', () => {
      expectValidation('Dr. Jane Doe', 'person', true);
    });

    it('accepts known mononym as person', () => {
      expectValidation('Mary', 'person', true);
    });

    it('accepts multi-word with known first name as person', () => {
      expectValidation('James Unknownlast', 'person', true);
    });
  });

  describe('organization claims — valid', () => {
    it('accepts "Google LLC" as organization', () => {
      expectValidation('Google LLC', 'organization', true);
    });

    it('accepts "The New York Times" as organization', () => {
      expectValidation('The New York Times', 'organization', true);
    });

    it('accepts "NASA" as organization', () => {
      expectValidation('NASA', 'organization', true);
    });

    it('accepts "IndieWeb" as organization', () => {
      expectValidation('IndieWeb', 'organization', true);
    });
  });

  describe('organization claims — invalid', () => {
    it('rejects honorific name as organization', () => {
      expectValidation('Dr. Jane Doe', 'organization', false, 'person');
    });

    it('rejects "Prof. Smith" as organization', () => {
      expectValidation('Prof. Smith', 'organization', false, 'person');
    });
  });

  describe('other claimed types', () => {
    it('accepts event type with minimal validation', () => {
      expectValidation('Summer Conference', 'event', true);
    });

    it('accepts place type with minimal validation', () => {
      expectValidation('Central Park', 'place', true);
    });

    it('still rejects URL for other types', () => {
      expectValidation('https://example.com', 'event', false);
    });
  });
});