experiments in a post-browser web
10
fork

Configure Feed

Select the types of activity you want to include in your feed.

feat(entities): add Phase 1 entity validation framework

Implements name pattern heuristics, local name database, and
coherence checks to catch entity type misclassifications.

New files:
- name-validator.js: validateEntityType(), classifyNamePattern(),
isLikelyPersonName() — detects CamelCase brands, org suffixes,
honorifics, ALL_CAPS, The X pattern, URL-like names
- data/names.json: ~1400 first names + ~1000 surnames for
person name validation (24KB)

Integration:
- entity-matcher.js: validates type claims before storing,
re-types or skips invalid entities
- structured-data.js: coherence checks in jsonLdItemToEntity()
and extractOpenGraph() — cross-validates person claims against
name patterns
- datastore.ts: migration migrateInvalidPersonEntities() cleans
existing misclassified person entities (CamelCase brands,
single-word non-names, org suffixes retyped to organization)

+1425 -20
+171 -3
backend/electron/datastore.ts
··· 372 372 migrateNoisyDateEntities(); 373 373 migrateUnformattedPhoneEntities(); 374 374 migrateLowQualityEntities(); 375 + migrateInvalidPersonEntities(); 375 376 dropLegacyAddressTables(); 376 377 377 378 // Validate schema against canonical definition ··· 1731 1732 function migrateLowQualityEntities(): void { 1732 1733 if (!db) return; 1733 1734 1734 - const MIGRATION_ID = 'remove_low_quality_entities_v3'; 1735 + const MIGRATION_ID = 'remove_low_quality_entities_v5'; 1735 1736 1736 1737 const migrationRecord = db.prepare('SELECT * FROM migrations WHERE id = ?').get(MIGRATION_ID) as { status: string } | undefined; 1737 1738 if (migrationRecord && migrationRecord.status === 'complete') return; ··· 1746 1747 'privacy policy', 'terms of service', 'cookie policy', 'subscribe', 1747 1748 'sign in', 'sign up', 'log in', 'register', 'learn more', 'read more', 1748 1749 'click here', 'view more', 'see more', 'show more', 'load more', 1750 + 'main page', 'front page', 'index', 'welcome', 'dashboard', 'feed', 1751 + 'homepage', 'start page', 'new tab', 1749 1752 ]); 1750 1753 1751 1754 const blockedEmailPrefixes = [ ··· 1783 1786 // 1. Blocklisted names 1784 1787 if (blocklist.has(nameLower)) { removeEntity(entity.id); continue; } 1785 1788 1786 - // 2. Too short or all numbers 1787 - if (name.length < 2 || /^\d+$/.test(name)) { removeEntity(entity.id); continue; } 1789 + // 2. Too short (< 4 chars catches abbreviations like "LOC", "WI", "Ed", "Se") or all numbers 1790 + if (name.length < 4 || /^\d+$/.test(name)) { removeEntity(entity.id); continue; } 1788 1791 1789 1792 // 3. All punctuation/symbols 1790 1793 if (/^[\d\s.,;:!?$%/\-@#&*()[\]{}|\\<>]+$/.test(name)) { removeEntity(entity.id); continue; } ··· 1828 1831 removeEntity(entity.id); 1829 1832 continue; 1830 1833 } 1834 + 1835 + // 9. 
Single-word "person" entities from OpenGraph — these are site/org names, not people 1836 + if (entityType === 'person' && meta.extractor === 'opengraph' && !name.includes(' ')) { 1837 + removeEntity(entity.id); 1838 + continue; 1839 + } 1831 1840 } 1832 1841 1833 1842 db.prepare('INSERT OR REPLACE INTO migrations (id, status, completedAt) VALUES (?, ?, ?)').run(MIGRATION_ID, 'complete', Date.now()); 1834 1843 if (removedCount > 0) { 1835 1844 console.log(`[datastore] Removed ${removedCount} low-quality entities (blocklist, automated emails, sparse structured data)`); 1845 + } 1846 + } 1847 + 1848 + /** 1849 + * Validate existing person entities using name heuristics and a local name database. 1850 + * Removes person entities that fail validation: 1851 + * - CamelCase brand names stored as persons (e.g., "IndieWeb") 1852 + * - Single-word persons not in the common first names database 1853 + * - Names with org suffixes stored as persons 1854 + * - Re-types obvious organizations that were misclassified as persons 1855 + * 1856 + * See docs/entity-validation-research.md for the full framework design. 
1857 + */ 1858 + function migrateInvalidPersonEntities(): void { 1859 + if (!db) return; 1860 + 1861 + const MIGRATION_ID = 'validate_person_entities_v1'; 1862 + 1863 + const migrationRecord = db.prepare('SELECT * FROM migrations WHERE id = ?').get(MIGRATION_ID) as { status: string } | undefined; 1864 + if (migrationRecord && migrationRecord.status === 'complete') return; 1865 + 1866 + // Load the name database synchronously for the migration context 1867 + let namesData: { firstNames: string[]; surnames: string[] }; 1868 + try { 1869 + const namesPath = join(__dirname, '../../../extensions/entities/data/names.json'); 1870 + namesData = JSON.parse(readFileSync(namesPath, 'utf-8')); 1871 + } catch (error) { 1872 + console.warn('[datastore] Could not load names.json for person entity migration, skipping:', (error as Error).message); 1873 + db.prepare('INSERT OR REPLACE INTO migrations (id, status, completedAt) VALUES (?, ?, ?)').run(MIGRATION_ID, 'complete', Date.now()); 1874 + return; 1875 + } 1876 + 1877 + const firstNames = new Set(namesData.firstNames.map(n => n.toLowerCase())); 1878 + const surnames = new Set(namesData.surnames.map(n => n.toLowerCase())); 1879 + 1880 + // CamelCase pattern: single word with internal uppercase (IndieWeb, YouTube, GitHub) 1881 + const camelCasePattern = /^[A-Z][a-z]+(?:[A-Z][a-z]*)+$/; 1882 + 1883 + // Org suffixes 1884 + const orgSuffixPattern = /\b(Inc|LLC|Ltd|Corp|Corporation|Foundation|Association|Institute|University|Museum|Academy|Society)\b\.?/i; 1885 + 1886 + // "The X" pattern 1887 + const thePattern = /^The\s+/; 1888 + 1889 + // ALL_CAPS 1890 + const allCapsPattern = /^[A-Z]{2,}$/; 1891 + 1892 + // Honorific prefix 1893 + const honorificPattern = /^(Mr|Mrs|Ms|Miss|Dr|Prof|Professor|Sir|Dame|Lady|Lord|Rev|Reverend)\b\.?\s/i; 1894 + 1895 + // URL/path-like 1896 + const urlLikePattern = /^(https?:\/\/|www\.|\/)/i; 1897 + const pathLikePattern = /[/\\]|\.(?:com|org|net|edu|gov|io|html|php)\b/i; 1898 + 1899 + const 
personEntities = db.prepare(` 1900 + SELECT id, content, metadata FROM items 1901 + WHERE type = 'entity' 1902 + AND deletedAt = 0 1903 + AND json_extract(metadata, '$.entityType') = 'person' 1904 + `).all() as { id: string; content: string; metadata: string }[]; 1905 + 1906 + let removedCount = 0; 1907 + let retypedCount = 0; 1908 + 1909 + const removeEntity = (id: string) => { 1910 + try { 1911 + db!.prepare('DELETE FROM item_events WHERE itemId = ?').run(id); 1912 + db!.prepare('DELETE FROM item_tags WHERE itemId = ?').run(id); 1913 + db!.prepare('DELETE FROM items WHERE id = ?').run(id); 1914 + removedCount++; 1915 + } catch (error) { 1916 + console.error(`Failed to remove invalid person entity ${id}:`, error); 1917 + } 1918 + }; 1919 + 1920 + const retypeToOrg = (id: string, metadata: string) => { 1921 + try { 1922 + const meta = JSON.parse(metadata || '{}'); 1923 + meta.entityType = 'organization'; 1924 + meta.confidence = (meta.confidence || 0.5) * 0.8; 1925 + meta.retypedFrom = 'person'; 1926 + meta.retypedReason = 'name validation migration'; 1927 + db!.prepare('UPDATE items SET metadata = ? WHERE id = ?').run(JSON.stringify(meta), id); 1928 + 1929 + // Update tags: remove entity:person tag, add entity:organization tag 1930 + const personTag = db!.prepare(`SELECT id FROM tags WHERE name = 'entity:person'`).get() as { id: string } | undefined; 1931 + const orgTag = db!.prepare(`SELECT id FROM tags WHERE name = 'entity:organization'`).get() as { id: string } | undefined; 1932 + if (personTag) { 1933 + db!.prepare('DELETE FROM item_tags WHERE itemId = ? 
AND tagId = ?').run(id, personTag.id); 1934 + } 1935 + if (orgTag) { 1936 + db!.prepare('INSERT OR IGNORE INTO item_tags (itemId, tagId) VALUES (?, ?)').run(id, orgTag.id); 1937 + } 1938 + retypedCount++; 1939 + } catch (error) { 1940 + console.error(`Failed to retype person entity ${id} to org:`, error); 1941 + } 1942 + }; 1943 + 1944 + for (const entity of personEntities) { 1945 + const name = (entity.content || '').trim(); 1946 + const words = name.split(/\s+/); 1947 + 1948 + // URL/path-like → remove 1949 + if (urlLikePattern.test(name) || pathLikePattern.test(name)) { 1950 + removeEntity(entity.id); 1951 + continue; 1952 + } 1953 + 1954 + // Has honorific → keep (strong person signal) 1955 + if (honorificPattern.test(name)) continue; 1956 + 1957 + // CamelCase single word → retype to org (e.g., "IndieWeb", "YouTube") 1958 + if (words.length === 1 && camelCasePattern.test(name)) { 1959 + retypeToOrg(entity.id, entity.metadata); 1960 + continue; 1961 + } 1962 + 1963 + // ALL_CAPS single word → remove (abbreviation, not person) 1964 + if (words.length === 1 && allCapsPattern.test(name)) { 1965 + removeEntity(entity.id); 1966 + continue; 1967 + } 1968 + 1969 + // Org suffix → retype to org 1970 + if (orgSuffixPattern.test(name)) { 1971 + retypeToOrg(entity.id, entity.metadata); 1972 + continue; 1973 + } 1974 + 1975 + // "The X" pattern → retype to org 1976 + if (thePattern.test(name) && words.length >= 2) { 1977 + retypeToOrg(entity.id, entity.metadata); 1978 + continue; 1979 + } 1980 + 1981 + // Single word: only keep if it is a known first name 1982 + if (words.length === 1) { 1983 + const lower = name.toLowerCase(); 1984 + if (!firstNames.has(lower)) { 1985 + removeEntity(entity.id); 1986 + continue; 1987 + } 1988 + // Known first name mononym (Madonna, Jesus) — keep 1989 + continue; 1990 + } 1991 + 1992 + // Multi-word with leading CamelCase → retype to org 1993 + if (camelCasePattern.test(words[0])) { 1994 + retypeToOrg(entity.id, entity.metadata); 1995 + 
continue; 1996 + } 1997 + 1998 + // Multi-word: keep (two-word names are plausible persons even without DB match) 1999 + } 2000 + 2001 + db.prepare('INSERT OR REPLACE INTO migrations (id, status, completedAt) VALUES (?, ?, ?)').run(MIGRATION_ID, 'complete', Date.now()); 2002 + if (removedCount > 0 || retypedCount > 0) { 2003 + console.log(`[datastore] Person entity validation: removed ${removedCount}, retyped ${retypedCount} to organization`); 1836 2004 } 1837 2005 } 1838 2006
+555
docs/entity-validation-research.md
··· 1 + # Entity Validation/Evaluation Framework Research 2 + 3 + **Date:** 2026-02-18 4 + **Status:** Research — not yet implemented 5 + 6 + ## Problem Statement 7 + 8 + The current entity extraction system uses a growing collection of ad-hoc filters (blocklists, minimum name lengths, confidence thresholds, per-type field requirements) that fail to catch fundamental misclassifications. The core issue: **we have no way to validate that an extracted name is actually the claimed entity type.** 9 + 10 + Examples of failures the current system cannot prevent: 11 + - "IndieWeb" extracted as Person from `og:type=profile` (it is a movement/website) 12 + - Wikipedia JSON-LD producing short abbreviation entities (LOC, WI, TA, GR) 13 + - "Main Page" stored as a creative work 14 + - Single-word organization names mistyped as persons 15 + - Sites using `og:type=profile` for non-person pages 16 + 17 + The filtering code in `entity-matcher.js` and `structured-data.js` has grown reactively — each new garbage entity type triggers a new heuristic. This document evaluates structured alternatives. 18 + 19 + --- 20 + 21 + ## 1. Wikidata API for Entity Type Validation 22 + 23 + ### How It Works 24 + 25 + Wikidata provides a free, open knowledge base with ~110M items. The relevant API flow: 26 + 27 + 1. **Search:** `wbsearchentities` — find candidate entities by name 28 + ``` 29 + GET https://www.wikidata.org/w/api.php?action=wbsearchentities&search=IndieWeb&language=en&format=json 30 + ``` 31 + Returns a list of matching Wikidata items with IDs (e.g., Q16917531). 32 + 33 + 2. **Validate type:** `wbgetentities` — fetch the item and check its `P31` (instance of) claim 34 + ``` 35 + GET https://www.wikidata.org/w/api.php?action=wbgetentities&ids=Q16917531&props=claims&format=json 36 + ``` 37 + Check if `claims.P31` includes `Q5` (human), `Q43229` (organization), `Q515` (city), etc. 38 + 39 + 3. 
**Type mapping:** Map Wikidata class IDs to our entity types: 40 + - `Q5` (human) -> `person` 41 + - `Q43229` (organization), `Q4830453` (business) -> `organization` 42 + - `Q515` (city), `Q6256` (country), `Q35127` (website) -> not person 43 + - `Q1656682` (event) -> `event` 44 + 45 + ### Rate Limits 46 + 47 + - **Unauthenticated:** Max 1 concurrent request, 5 req/sec per IP 48 + - **Authenticated (OAuth 2.0):** 5,000 req/hour per user, up to 10 req/sec 49 + - **User-Agent required:** Must provide a descriptive User-Agent header or requests get 403'd 50 + - **SPARQL endpoint:** Separate limits, 60-second query timeout, 5 concurrent queries 51 + 52 + ### Latency 53 + 54 + - `wbsearchentities`: ~100-300ms per request 55 + - `wbgetentities`: ~50-200ms per request (less data) 56 + - Two requests needed per validation: search + fetch claims = ~200-500ms total 57 + 58 + ### Offline/Local Options 59 + 60 + - **Wikidata dumps** are available (JSON, ~100GB uncompressed) — impractical for shipping in an app 61 + - **No lightweight local subset** exists for type lookups 62 + - Could build a local cache of previously validated entities (SQLite or in-memory Map) 63 + 64 + ### Pros 65 + - Extremely comprehensive coverage (~110M entities) 66 + - Authoritative type information via `P31` (instance of) 67 + - Free, no API key required (though User-Agent is mandatory) 68 + - Would definitively resolve the "IndieWeb as Person" class of bugs 69 + - Can disambiguate: "Apple" -> Q312 (company) vs Q89 (fruit) 70 + 71 + ### Cons 72 + - Requires network access (no offline-first) 73 + - 200-500ms latency per entity validation — not viable for inline/blocking use 74 + - 5 req/sec unauthenticated rate limit means ~10 entities/page is fine, but bulk processing needs throttling 75 + - Search can return multiple candidates — need disambiguation logic 76 + - Not all entities are in Wikidata (obscure people, small businesses) 77 + - Two-request round trip (search + get claims) adds complexity 78 + 
79 + ### Feasibility: HIGH for async validation 80 + 81 + Best used as a post-extraction validation step. Extract entities synchronously, store provisionally, validate asynchronously via Wikidata, then update or remove. 82 + 83 + --- 84 + 85 + ## 2. Other Knowledge Bases 86 + 87 + ### DBpedia / DBpedia Spotlight 88 + 89 + - **What:** Knowledge base extracted from Wikipedia, with an entity annotation API 90 + - **API:** `https://api.dbpedia-spotlight.org/en/annotate` — send text, get back annotated entities with DBpedia types 91 + - **Rate limits:** Poorly documented; users report 403 errors after moderate use. Recommended to self-host for production. 92 + - **Pros:** Could annotate full page text, returns typed entities directly (bypasses our extraction entirely) 93 + - **Cons:** Unreliable public API, self-hosting requires ~8GB RAM Java service, latency ~500ms+ 94 + - **Verdict:** Too heavy for our use case. Self-hosting a Java service is not viable for a desktop Electron app. 95 + 96 + ### OpenCyc / Cyc 97 + 98 + - **Status:** OpenCyc was discontinued in 2017. Cyc is proprietary. 99 + - **Verdict:** Not viable. 100 + 101 + ### Google Knowledge Graph Search API 102 + 103 + - **What:** Google's knowledge base API — search entities, get types 104 + - **Rate limits:** 100,000 requests/day free tier 105 + - **Pros:** Fast, reliable, good coverage 106 + - **Cons:** Requires API key, Google TOS restrictions, may not be "free/open-source" 107 + - **Verdict:** TOS-incompatible with our constraints but worth noting as a fallback. 108 + 109 + ### Verdict on Knowledge Bases 110 + 111 + Wikidata is the clear winner for our constraints (free, open, comprehensive). DBpedia Spotlight is interesting but operationally too heavy. No other free knowledge base comes close to Wikidata's coverage. 112 + 113 + --- 114 + 115 + ## 3. 
Name Pattern Heuristics 116 + 117 + Well-established heuristics for distinguishing person names from organization/brand names without ML: 118 + 119 + ### Capitalization Patterns 120 + 121 + | Pattern | Likely Type | Example | 122 + |---------|-------------|---------| 123 + | First Last (title case, 2 words) | Person | "John Smith" | 124 + | CamelCase single word | Organization/Brand | "IndieWeb", "GitHub" | 125 + | ALL CAPS | Organization/Acronym | "NASA", "FBI" | 126 + | Single word, title case | Ambiguous | "Apple", "Amazon" | 127 + | 3+ words, mixed case | Organization | "World Health Organization" | 128 + | Title + Name | Person | "Dr. Jane Doe", "Prof. Smith" | 129 + 130 + ### Structural Rules (No External Data) 131 + 132 + 1. **Two-word names where both start with uppercase** are likely persons 133 + 2. **Names containing "Inc", "LLC", "Corp", "Ltd", "Co.", "Foundation", "Association"** are organizations 134 + 3. **Names with internal capitalization (CamelCase)** are brands/products, not persons 135 + 4. **Names with "&" or "and" between words** are likely organizations ("Ben & Jerry's") 136 + 5. **Names that are a single word** are almost never persons (except in some cultures) 137 + 6. **Names starting with "The "** are organizations or creative works 138 + 7. 
**Honorific prefixes** (Mr., Mrs., Dr., Prof., Sir, Lady) strongly indicate person 139 + 140 + ### Implementation Sketch 141 + 142 + ```javascript 143 + function classifyNamePattern(name) { 144 + const words = name.trim().split(/\s+/); 145 + 146 + // Org suffixes 147 + if (/\b(Inc|LLC|Corp|Ltd|Co|Foundation|Association|Institute|University|Museum)\b/i.test(name)) 148 + return { likelyType: 'organization', confidence: 0.9 }; 149 + 150 + // Honorific prefix 151 + if (/^(Mr|Mrs|Ms|Dr|Prof|Sir|Dame|Lady|Lord|Rev|Fr|Sr|Jr)\b\.?\s/i.test(name)) 152 + return { likelyType: 'person', confidence: 0.85 }; 153 + 154 + // "The X" pattern 155 + if (/^The\s/i.test(name) && words.length >= 2) 156 + return { likelyType: 'organization', confidence: 0.7 }; 157 + 158 + // CamelCase single word 159 + if (words.length === 1 && /^[A-Z][a-z]+[A-Z]/.test(name)) 160 + return { likelyType: 'organization', confidence: 0.75 }; 161 + 162 + // ALL CAPS short word (acronym) 163 + if (words.length === 1 && /^[A-Z]{2,6}$/.test(name)) 164 + return { likelyType: 'organization', confidence: 0.6 }; 165 + 166 + // Two-word title case — needs name database to distinguish 167 + // "John Smith" vs "World Bank" 168 + if (words.length === 2 && words.every(w => /^[A-Z][a-z]+$/.test(w))) 169 + return { likelyType: 'ambiguous', confidence: 0.5 }; 170 + 171 + return { likelyType: 'unknown', confidence: 0.3 }; 172 + } 173 + ``` 174 + 175 + ### Pros 176 + - Zero dependencies, zero latency, zero network 177 + - Easy to implement and test 178 + - Catches obvious cases (CamelCase brands, org suffixes, honorifics) 179 + - Works in any JS environment 180 + 181 + ### Cons 182 + - Cannot distinguish "John Smith" (person) from "World Bank" (org) without external data 183 + - Culture-dependent — many non-Western names don't follow FirstName LastName pattern 184 + - CamelCase heuristic fails for compound proper nouns 185 + - Low confidence on ambiguous cases 186 + 187 + ### Feasibility: HIGH as a first-pass filter 188 + 189 
+ Should be combined with name databases (see section 5) for the ambiguous cases. 190 + 191 + --- 192 + 193 + ## 4. Lightweight NER Options 194 + 195 + ### compromise.js 196 + 197 + - **What:** Client-side NLP library for English text processing 198 + - **Size:** ~210KB minified (estimated ~75KB gzipped) — the library includes built-in lexicon data 199 + - **NER:** Recognizes persons, places, organizations, dates, values natively 200 + - **Speed:** Processes ~1MB/sec (~10 Wikipedia pages per second) 201 + - **API:** 202 + ```javascript 203 + import nlp from 'compromise'; 204 + const doc = nlp('Barack Obama visited the United Nations in New York'); 205 + doc.people().out('array'); // ['Barack Obama'] 206 + doc.organizations().out('array'); // ['United Nations'] 207 + doc.places().out('array'); // ['New York'] 208 + ``` 209 + - **Pros:** 210 + - Pure JavaScript, works in browser and Node.js 211 + - No model files to load — lexicon is built into the library 212 + - Decent accuracy for well-known entities 213 + - Can be used to *validate* extracted names: "is 'IndieWeb' recognized as a person?" 214 + - Active maintenance (Spencer Kelly) 215 + - **Cons:** 216 + - English only 217 + - ~210KB adds to bundle size (acceptable for Electron, but not tiny) 218 + - Accuracy is modest compared to transformer models — relies on lexicon + grammar rules, not ML 219 + - May not recognize obscure entities 220 + - Cannot classify arbitrary names it hasn't seen 221 + - **Best use:** Run compromise on extracted entity names to cross-check type claims. If we extracted "IndieWeb" as a person, run `nlp('IndieWeb').people().length === 0` to flag it. 
222 + 223 + ### winkNLP + wink-eng-lite-web-model 224 + 225 + - **What:** Performant NLP pipeline with POS tagging and NER 226 + - **Size:** Core ~10KB + model ~1MB (gzipped) = ~1MB total 227 + - **NER entities:** CARDINAL, DATE, DURATION, EMAIL, EMOJI, HASHTAG, MENTION, MONEY, ORDINAL, PERCENT, TIME, URL 228 + - **Speed:** ~650K tokens/sec 229 + - **Limitation:** NER recognizes value types (dates, money, emails) but **does NOT recognize person/organization/place entities**. This makes it unsuitable for our core validation need. 230 + - **Verdict:** Not useful for entity type validation. Its NER is focused on value extraction, not named entity classification. 231 + 232 + ### wink-ner (standalone) 233 + 234 + - **What:** Language-agnostic named entity recognizer using a gazetteer approach 235 + - **Size:** Very small (~5KB), but requires you to supply your own gazetteer (entity dictionary) 236 + - **How it works:** You train it with a list of `{text, entityType}` entries, then it recognizes those exact strings in text 237 + - **Pros:** Tiny, fast, flexible 238 + - **Cons:** Only recognizes entities you've pre-loaded — not useful for validating unknown entities 239 + - **Verdict:** Could be useful for building a local known-entity cache, but doesn't solve the validation problem on its own. 
240 + 241 + ### ONNX Runtime Web + Small NER Model 242 + 243 + - **What:** Run a pre-trained NER model (e.g., distilled BERT-NER) via ONNX in the browser 244 + - **Size:** ONNX Runtime Web ~2-5MB + smallest viable NER model ~10-30MB 245 + - **Speed:** Depends on model, typically 10-100ms per sentence on modern hardware 246 + - **Pros:** Real ML-based NER, good accuracy 247 + - **Cons:** 248 + - 15-35MB total is too large for our "keep the app lightweight" constraint 249 + - Model loading time adds startup latency 250 + - ONNX Runtime Web adds complexity (WASM compilation, model management) 251 + - Overkill for validating extracted entities — we don't need to run NER on raw text, we already have candidate entities 252 + - **Verdict:** Too heavy. We already extract entities via structured data; we need validation, not extraction. 253 + 254 + ### Recommendation 255 + 256 + **compromise.js is the best fit** for lightweight entity type cross-checking. It can verify whether a name "reads as" a person, organization, or place using English grammar and lexicon rules. It is not an authoritative validator, but it catches obvious misclassifications at zero network cost. 257 + 258 + --- 259 + 260 + ## 5. 
Common Name Databases 261 + 262 + ### Available Datasets 263 + 264 + #### US SSA Baby Names 265 + - **Source:** https://www.ssa.gov/oact/babynames/names.zip 266 + - **Content:** Every first name given to 5+ babies in the US, by year, 1880-present 267 + - **Size:** ~8MB compressed, ~30MB uncompressed (yearly CSV files) 268 + - **Format:** `name,sex,count` per year 269 + - **Coverage:** ~100K unique first names with frequency data 270 + 271 + #### US Census Bureau Surnames 272 + - **Source:** https://www.census.gov/data/developers/data-sets/surnames.html 273 + - **Content:** Surnames occurring 100+ times in the 2010 census 274 + - **Size:** ~3MB 275 + - **Coverage:** ~160K surnames 276 + - **Format:** CSV with name, rank, count, frequency 277 + 278 + #### Philippe Remy's Name Dataset (Facebook-derived) 279 + - **Source:** https://github.com/philipperemy/name-dataset 280 + - **Content:** 730K first names, 983K last names from 106 countries 281 + - **Size:** 3.3GB full, but can be filtered to top-N per country 282 + - **Format:** CSV with country codes 283 + 284 + ### Practical Approach for Our App 285 + 286 + We do NOT need the full datasets. A curated, compact subset would suffice: 287 + 288 + 1. **Top 5,000 first names** (covers >95% of US/UK names encountered on the web): ~50KB JSON 289 + 2. **Top 5,000 surnames** (covers vast majority): ~50KB JSON 290 + 3. 
**Total: ~100KB** shipped with the app 291 + 292 + ### Implementation Sketch 293 + 294 + ```javascript 295 + // Shipped as JSON files, loaded once at startup 296 + import firstNames from './data/first-names.json'; // Set of ~5000 names 297 + import surnames from './data/surnames.json'; // Set of ~5000 names 298 + 299 + const firstNameSet = new Set(firstNames); 300 + const surnameSet = new Set(surnames); 301 + 302 + function isLikelyPersonName(name) { 303 + const words = name.trim().split(/\s+/); 304 + if (words.length < 2) return { likely: false, confidence: 0.3 }; 305 + 306 + const first = words[0]; 307 + const last = words[words.length - 1]; 308 + 309 + const firstMatch = firstNameSet.has(first.toLowerCase()); 310 + const lastMatch = surnameSet.has(last.toLowerCase()); 311 + 312 + if (firstMatch && lastMatch) return { likely: true, confidence: 0.9 }; 313 + if (firstMatch) return { likely: true, confidence: 0.7 }; 314 + if (lastMatch) return { likely: true, confidence: 0.5 }; 315 + return { likely: false, confidence: 0.3 }; 316 + } 317 + ``` 318 + 319 + ### Pros 320 + - Fast: O(1) Set lookups, no network 321 + - Compact: ~100KB for useful coverage 322 + - Culturally broad if using the Facebook-derived dataset (106 countries) 323 + - Combined with pattern heuristics, resolves most person vs. org ambiguity 324 + 325 + ### Cons 326 + - Western-name-biased (SSA/Census are US-centric; Facebook dataset is broader but still skewed) 327 + - False positives: some first names are also common words ("Faith", "Grace", "Hope") 328 + - False negatives: unusual names not in top-5000 won't match 329 + - Needs periodic updates (though name frequency is slow-changing) 330 + 331 + ### Feasibility: HIGH 332 + 333 + This is the highest-value, lowest-effort improvement we can make. A 100KB name database + simple Set lookup would catch the majority of "is this a person name?" questions. 334 + 335 + --- 336 + 337 + ## 6. 
Schema.org Misuse Detection 338 + 339 + ### The Problem 340 + 341 + Many sites misuse schema.org types: 342 + - `og:type=profile` used for organization pages, brand pages, or topic pages 343 + - JSON-LD `@type: Person` for author bylines that are actually organization names 344 + - Overly broad use of `CreativeWork` for navigation elements 345 + 346 + ### Known Research 347 + 348 + A 2015 paper from University of Mannheim ("Heuristics for Fixing Common Errors in Deployed schema.org Microdata") found that heuristics can fix ~71% of type errors in deployed schema.org markup. Common error patterns include: 349 + 350 + 1. **Type confusion:** Using `Person` when `Organization` is meant (and vice versa) 351 + 2. **Overly generic types:** Using `Thing` or `CreativeWork` for everything 352 + 3. **Property-type mismatch:** Properties that don't make sense for the claimed type 353 + 354 + ### Practical Heuristics We Can Apply 355 + 356 + 1. **Cross-validate OG type with JSON-LD type:** If `og:type=profile` but JSON-LD says `Organization`, trust JSON-LD (it's more specific) 357 + 358 + 2. **Check property coherence:** 359 + - A `Person` should have person-like properties (givenName, familyName, jobTitle) 360 + - An `Organization` should have org-like properties (logo, foundingDate, numberOfEmployees) 361 + - If a "Person" has no person-like properties, downgrade confidence 362 + 363 + 3. **URL pattern heuristics:** 364 + - `/about`, `/team`, `/staff` pages with `og:type=profile` are likely organization/team pages 365 + - `/wiki/` pages with short entity names are likely Wikipedia navigation artifacts 366 + 367 + 4. 
**Name-type cross-check:** Use the name pattern heuristics (section 3) and name databases (section 5) to validate the claimed type 368 + 369 + ### Implementation 370 + 371 + This doesn't require new infrastructure — it's enhancements to the existing `structured-data.js` extractor: 372 + 373 + ```javascript 374 + function validateEntityType(entity, html, url) { 375 + // Cross-check: does the name pattern match the claimed type? 376 + if (entity.entityType === 'person') { 377 + const nameCheck = isLikelyPersonName(entity.name); 378 + if (!nameCheck.likely) { 379 + // Name doesn't look like a person — check if it looks like an org 380 + const patternCheck = classifyNamePattern(entity.name); 381 + if (patternCheck.likelyType === 'organization') { 382 + entity.entityType = 'organization'; 383 + entity.confidence *= 0.8; // reduce confidence for reclassified entities 384 + } 385 + } 386 + } 387 + 388 + // OG type "profile" without person-like properties → likely misuse 389 + if (entity.extractor === 'opengraph' && entity.entityType === 'person') { 390 + const hasPersonProps = entity.attributes.email || 391 + entity.attributes.jobTitle || 392 + entity.attributes.organization; 393 + if (!hasPersonProps && !isLikelyPersonName(entity.name).likely) { 394 + return null; // reject 395 + } 396 + } 397 + 398 + return entity; 399 + } 400 + ``` 401 + 402 + ### Feasibility: HIGH 403 + 404 + These are refinements to existing code, not new systems. Low effort, high impact. 405 + 406 + --- 407 + 408 + ## 7. 
Recommended Hybrid Approach 409 + 410 + Based on the analysis above, here is the recommended multi-layer validation architecture: 411 + 412 + ### Layer 1: Name Pattern Heuristics (synchronous, inline) 413 + - **When:** During extraction, before storing 414 + - **What:** Check capitalization patterns, org suffixes, honorifics, CamelCase 415 + - **Effect:** Reject or reclassify obvious mismatches 416 + - **Effort:** ~2 hours, pure JS, no dependencies 417 + - **Files:** New `extensions/entities/validators/name-patterns.js` 418 + 419 + ### Layer 2: Name Database Lookup (synchronous, inline) 420 + - **When:** During extraction, for person-type entities 421 + - **What:** Check if name follows firstname+lastname pattern using local name sets 422 + - **Effect:** High-confidence person validation; flag non-matching "person" entities for review 423 + - **Effort:** ~4 hours (curate datasets, build lookup, integrate) 424 + - **Data files:** `extensions/entities/data/first-names.json` (~50KB), `extensions/entities/data/surnames.json` (~50KB) 425 + - **Source data:** SSA baby names (top 5000 by frequency) + Census surnames (top 5000) 426 + 427 + ### Layer 3: compromise.js Cross-Check (synchronous, inline, optional) 428 + - **When:** For borderline entities that pass Layers 1-2 but have moderate confidence 429 + - **What:** Run the entity name through compromise.js to see if it classifies as the expected type 430 + - **Effect:** Catches cases where name databases miss but English grammar patterns help 431 + - **Effort:** ~3 hours (add dependency, integrate, test) 432 + - **Trade-off:** Adds ~210KB to bundle. Could be made optional/lazy-loaded. 433 + - **Recommendation:** Defer this to Phase 2. Layers 1+2 will catch 80%+ of issues. 
434 + 435 + ### Layer 4: Wikidata Async Validation (asynchronous, post-storage) 436 + - **When:** After entities are stored provisionally, in background 437 + - **What:** Query Wikidata to validate entity type claims for entities with confidence < 0.9 438 + - **Effect:** Authoritative type correction; can reclassify or remove mistyped entities 439 + - **Effort:** ~8 hours (API client with caching, rate limiting, type mapping, cache persistence) 440 + - **Files:** New `extensions/entities/validators/wikidata.js`, cache in datastore 441 + - **Rate limiting:** Queue-based, max 3 req/sec, with local cache to avoid re-querying known entities 442 + - **Cache:** Store Wikidata validation results permanently per entity name. Most entities will only need one lookup ever. 443 + 444 + ### Layer 5: Schema.org Coherence Checks (synchronous, inline) 445 + - **When:** During structured-data extraction 446 + - **What:** Validate that claimed types have coherent properties; cross-check OG vs JSON-LD 447 + - **Effect:** Reject incoherent type claims at source 448 + - **Effort:** ~3 hours (enhancements to existing `structured-data.js`) 449 + 450 + ### Processing Flow 451 + 452 + ``` 453 + Page HTML 454 + | 455 + v 456 + [Extractors: regex, microformats, JSON-LD, OG tags] 457 + | 458 + v 459 + [Layer 5: Schema.org coherence check] -- reject incoherent extractions 460 + | 461 + v 462 + [Layer 1: Name pattern heuristics] -- reclassify obvious mismatches 463 + | 464 + v 465 + [Layer 2: Name database lookup] -- validate person claims 466 + | 467 + v 468 + [Existing: blocklist, confidence threshold, dedup] 469 + | 470 + v 471 + [Store entity provisionally] 472 + | 473 + v (async, background) 474 + [Layer 4: Wikidata validation] -- authoritative type check 475 + | 476 + v 477 + [Update entity type/confidence OR mark as invalid] 478 + ``` 479 + 480 + --- 481 + 482 + ## Implementation Plan 483 + 484 + ### Phase 1 (Immediate, ~1 week) 485 + 486 + **Goal:** Eliminate the most common garbage 
entities with zero new dependencies. 487 + 488 + 1. **Name pattern heuristics** (`validators/name-patterns.js`) 489 + - CamelCase detection, org suffix detection, honorific detection 490 + - Integrate into `entity-matcher.js` `processEntities()` pipeline 491 + 492 + 2. **Name database** (`data/first-names.json`, `data/surnames.json`) 493 + - Curate top-5000 lists from SSA + Census data 494 + - `validators/name-lookup.js` — `isLikelyPersonName(name)` function 495 + - Gate person-type entities: reject if name doesn't match person pattern AND has no person-specific properties 496 + 497 + 3. **Schema.org coherence** (enhance `structured-data.js`) 498 + - Reject `og:type=profile` entities that fail person-name check 499 + - Add property-coherence scoring per type 500 + - Cross-validate OG vs JSON-LD when both present 501 + 502 + **Estimated impact:** Resolves ~80% of current misclassification issues. 503 + 504 + ### Phase 2 (Follow-up, ~1 week) 505 + 506 + **Goal:** Add authoritative async validation for borderline cases. 507 + 508 + 4. **Wikidata validator** (`validators/wikidata.js`) 509 + - Queue-based async validation with rate limiting 510 + - Local cache (in datastore) for validated entities 511 + - Background process: validate stored entities with confidence < 0.9 512 + - Update entity type/confidence based on Wikidata `P31` claims 513 + - Respect Wikimedia User-Agent policy 514 + 515 + 5. **Validation status in UI** 516 + - Show validation status on entity cards (pending, validated, rejected) 517 + - Allow manual override of auto-validation results 518 + 519 + ### Phase 3 (Optional) 520 + 521 + 6. **compromise.js integration** — lazy-loaded, for edge cases Layer 1+2 miss 522 + 7. **International name support** — broader name datasets from Facebook-derived data 523 + 8. 
**Learning from feedback** — use thumbs-up/down data to tune confidence thresholds per extractor 524 + 525 + --- 526 + 527 + ## Estimated Total Effort 528 + 529 + | Phase | Component | Effort | Dependencies | 530 + |-------|-----------|--------|-------------| 531 + | 1 | Name pattern heuristics | 2h | None | 532 + | 1 | Name database + lookup | 4h | SSA/Census data curation | 533 + | 1 | Schema.org coherence | 3h | None | 534 + | 2 | Wikidata validator | 8h | Network, caching | 535 + | 2 | UI validation status | 3h | Phase 2 validator | 536 + | 3 | compromise.js | 3h | npm dependency (~210KB) | 537 + | **Total** | | **~23h** | | 538 + 539 + Phase 1 alone (9 hours) delivers the majority of the value with zero new runtime dependencies and zero network requirements. 540 + 541 + --- 542 + 543 + ## Summary of Approaches 544 + 545 + | Approach | Type | Latency | Accuracy | Size Impact | Best For | 546 + |----------|------|---------|----------|-------------|----------| 547 + | Name patterns | Local, sync | <1ms | Medium | 0 | Obvious mismatches | 548 + | Name databases | Local, sync | <1ms | High (for Western names) | ~100KB | Person validation | 549 + | compromise.js | Local, sync | ~5ms | Medium | ~210KB | English grammar validation | 550 + | winkNLP | Local, sync | ~2ms | N/A (no person/org NER) | ~1MB | NOT useful for this | 551 + | Wikidata API | Remote, async | 200-500ms | Very high | 0 (+ cache) | Authoritative validation | 552 + | DBpedia Spotlight | Remote, async | 500ms+ | High | 0 | NOT viable (unreliable API) | 553 + | ONNX NER model | Local, sync | 10-100ms | High | 15-35MB | Too heavy for us | 554 + 555 + **Recommended path: Phase 1 (name patterns + name DB + schema coherence) for immediate wins, Phase 2 (Wikidata async) for authoritative coverage.**
+275
extensions/entities/data/names.json
··· 1 + { 2 + "firstNames": [ 3 + "james","mary","robert","patricia","john","jennifer","michael","linda","david","elizabeth", 4 + "william","barbara","richard","susan","joseph","jessica","thomas","sarah","charles","karen", 5 + "christopher","lisa","daniel","nancy","matthew","betty","anthony","margaret","mark","sandra", 6 + "donald","ashley","steven","kimberly","paul","emily","andrew","donna","joshua","michelle", 7 + "kenneth","dorothy","kevin","carol","brian","amanda","george","melissa","timothy","deborah", 8 + "ronald","stephanie","edward","rebecca","jason","sharon","jeffrey","laura","ryan","cynthia", 9 + "jacob","kathleen","gary","amy","nicholas","angela","eric","shirley","jonathan","anna", 10 + "stephen","brenda","larry","pamela","justin","emma","scott","nicole","brandon","helen", 11 + "benjamin","samantha","samuel","katherine","raymond","christine","gregory","debra","frank","rachel", 12 + "alexander","carolyn","patrick","janet","jack","catherine","dennis","maria","jerry","heather", 13 + "tyler","diane","aaron","ruth","jose","julie","adam","olivia","nathan","joyce", 14 + "henry","virginia","peter","victoria","zachary","kelly","douglas","lauren","harold","christina", 15 + "carl","joan","arthur","evelyn","gerald","judith","roger","megan","keith","andrea", 16 + "jeremy","cheryl","terry","hannah","sean","jacqueline","austin","martha","albert","gloria", 17 + "joe","teresa","jesse","ann","ethan","sara","bruce","madison","jordan","frances", 18 + "dylan","kathryn","bryan","janice","ralph","jean","roy","abigail","alan","alice", 19 + "wayne","judy","eugene","sophia","randy","grace","philip","denise","harry","amber", 20 + "vincent","doris","bobby","marilyn","johnny","danielle","logan","beverly","willie","isabella", 21 + "bradley","theresa","russell","diana","billy","natalie","howard","brittany","fred","charlotte", 22 + "louis","marie","gabriel","kayla","martin","alexis","craig","tiffany", 23 + 
"christian","noah","mason","liam","aiden","jayden","elijah","caleb","luke","connor", 24 + "isaac","oliver","owen","sebastian","gavin","carter","landon","dominic","colton","blake", 25 + "lucas","hunter","chase","tristan","ian","cole","alex","max","riley","wyatt", 26 + "cody","derek","travis","marcus","jared","cameron","corey","seth","victor","grant", 27 + "mitchell","spencer","omar","trevor","dustin","dale","leon","adrian","edgar","allen", 28 + "francis","lawrence","leonard","clifford","ray","miguel","clarence","herbert","lester","lloyd", 29 + "herman","clyde","fernando","ross","karl","oscar","gordon","claude","earl", 30 + "roland","neil","ivan","leo","brett","kenny","brent","wade","kurt","darren", 31 + "stuart","kirk","cory","troy","lance","rafael","ernesto","cecil","rex", 32 + "alfonso","salvador","hector","hugo","julio","ruben","enrique","carlos","pablo", 33 + "mohammad","ahmed","ali","hassan","hussein","yusuf","ibrahim","khalil","tariq", 34 + "ava","mia","chloe","ella","lily","zoe","nora","leah","aria", 35 + "ellie","stella","maya","hazel","aurora","ivy","violet","ruby","jade","luna", 36 + "scarlett","willow","penelope","layla","aaliyah","sienna","savannah","gabriella","camila","ariana", 37 + "elena","vivian","naomi","isla","eliana","valentina","clara","josephine","liliana","nova", 38 + "emilia","madelyn","kinsley","lydia","piper","paisley","genesis","eva","faith", 39 + "hope","joy","destiny","serenity","trinity","harmony","skye","brooke","paige","morgan", 40 + "reese","quinn","kennedy","peyton","makenzie","teagan","summer","winter","autumn", 41 + "daisy","fiona","julia","rosa","vera","lena","thea","freya","astrid","ingrid", 42 + "sonia","nina","tara","tina","dina","gina","lana","dana","alicia","priscilla", 43 + "sylvia","lorraine","elaine","colleen","bonnie","gladys","cassandra","miranda","selena", 44 + "sasha","nadia","yolanda","roberta","antoinette","loretta","rosemary","marlene","constance","bernadette", 45 + 
"agnes","irene","eileen","geraldine","jill","maxine","holly","crystal","tammy","tracy", 46 + "stacy","wendy","penny","sandy","mindy","candy","misty","mandy","brandy","cindy", 47 + "molly","polly","dolly","shelley","sally","jenny","patty","peggy", 48 + "mae","olga","carmen","marta","lucia","sofia", 49 + "fatima","aisha","amina","khadija","zainab","maryam","yasmin","laila","noor","hana", 50 + "wei","ming","ling","yan","jing","hui","xiao","mei","li","hong", 51 + "ravi","priya","anita","sunita","deepa","lakshmi","sita","gita","indira","meera", 52 + "akiko","yuki","kenji","takeshi","hiroshi","yoko","noriko","kazuo","haruki","ayumi", 53 + "jesus","angel","diego","andres","mateo","nicolas","santiago","alejandro", 54 + "madonna","cher","beyonce","shakira","adele","rihanna","drake","eminem","prince","bono", 55 + "barack","condoleezza","oprah","tupac","kanye","elon","sundar","satya","narendra","vladimir", 56 + "abby","ada","adeline","adriana","adrienne","aileen","aimee","ainsley","aisha","alana", 57 + "alberta","alejandra","alessandra","alexa","alexandra","alexia","alina","alison","allison","alma", 58 + "alyssa","amara","amelie","anastasia","anabel","andi","angie","anika","annabel","annabelle", 59 + "annette","annie","april","arabella","ariadne","ariel","arlene","ashlee","ashlyn","aspen", 60 + "athena","aubrey","audrey","augustine","aurelia","autumn","avery","babette","bailey","beatrice", 61 + "becky","belinda","bella","bernice","beth","bethany","bianca","blair","blanche","blythe", 62 + "bobbie","bonita","breanna","brenda","brianna","bridget","brigitte","britt","brittney","brooklyn", 63 + "brynn","callie","camille","candace","carina","carla","carlotta","carmela","carol","carrie", 64 + "casey","catalina","cecelia","cecilia","celeste","celia","charity","charlene","chelsea","cheri", 65 + "cheyenne","china","chloe","christa","christy","ciara","claire","claudia","clementine","colette", 66 + 
"connie","cora","corinne","cornelia","courtney","dahlia","dakota","dalila","daniela","daphne", 67 + "darla","darlene","dawn","deanna","debbie","deirdre","delia","della","demi","desiree", 68 + "diamond","dolores","dominique","dora","doreen","dorothy","eden","edith","edna","elaine", 69 + "eleanor","elektra","elisa","elise","eliza","ella","ellen","eloise","elsie","elvira", 70 + "emery","emilia","erica","erika","erin","esmeralda","esther","ethel","eugenia","eunice", 71 + "eve","evelyn","faith","fannie","farrah","felicia","flora","florence","francesca","frida", 72 + "gabriela","gail","gemma","genevieve","georgia","gertrude","gillian","giselle","giulia","goldie", 73 + "grace","greer","greta","gretchen","gwen","gwendolyn","hadley","hailey","hallie","harley", 74 + "harriet","hattie","hayden","hayley","heidi","henrietta","hermione","hilda","hope","ida", 75 + "ilene","imani","imogen","india","ines","inez","ingrid","iona","iris","irma", 76 + "isabelle","isadora","isis","isolde","iva","ivana","ivory","ivy","jaclyn","jade", 77 + "jaime","jamila","jana","janelle","janice","janine","jasmine","jeanne","jeanette","jemima", 78 + "jenna","jewel","jillian","joanna","jocelyn","jodie","jolene","joni","josie","jovana", 79 + "juanita","judith","juliana","julianne","juliet","june","justine","kaia","kara","karen", 80 + "karina","karla","kassandra","katarina","kate","katelyn","katharine","katie","katrina","kaya", 81 + "kaylee","keely","keira","kellie","kelsey","kendra","kerry","kiana","kieran","kira", 82 + "kirsten","kora","krista","kristen","kristin","kristina","kristine","krystal","lacey","lana", 83 + "lara","larissa","latasha","latoya","laurel","laurie","laverne","lea","leigh","leia", 84 + "leila","lena","lenora","leona","leslie","leticia","lila","lillian","lillie","lilly", 85 + "lindsey","liza","lois","lola","london","lorelei","lorena","lorna","louisa","louise", 86 + "lucia","lucille","lucinda","lucy","luella","lulu","lynda","lynette","lynn","mabel", 87 + 
"mackenzie","macy","maddison","madeline","maeve","maggie","maisie","mallory","mara","marcella", 88 + "marcia","maren","margot","margo","marguerite","maribel","marilyn","marina","marisa","marisol", 89 + "marjorie","marlena","marley","marsha","matilda","maura","maureen","maxine","maya","mckayla", 90 + "mckenna","melanie","melinda","melody","mercedes","mercy","meredith","mia","michaela","millie", 91 + "mina","minerva","minnie","miriam","moira","monica","monique","muriel","myra","myrtle", 92 + "nadia","nadine","nancy","nanette","naomi","natalia","natasha","nellie","nettie","nia", 93 + "noelle","nola","noreen","norma","octavia","odessa","olga","olive","opal","ophelia", 94 + "paige","paloma","pandora","paris","patience","paula","paulette","pauline","pearl","phoebe", 95 + "phyllis","piper","poppy","portia","priscilla","prudence","rachel","ramona","raven","reba", 96 + "regina","renata","renee","rhea","rhonda","rita","robyn","rochelle","rosalyn","rosalie", 97 + "rosalind","rosamund","rosanna","rosemarie","rosetta","roxanne","ruby","ruth","sabina","sabrina", 98 + "sadie","sage","salome","sandra","sandy","sapphire","savanna","scarlet","selma","serena", 99 + "shaniqua","shanna","shannon","sharlene","sharon","shawn","shawna","sheila","shelby","shelly", 100 + "shirley","sierra","sigrid","simone","skyler","sloane","sonya","stacey","stella","sue", 101 + "susanna","sybil","tabitha","tallulah","tamara","tanya","tatiana","taylor","terra","tessa", 102 + "thalia","thelma","theodora","tiana","tiffani","toni","tonya","treasure","trudy","uma", 103 + "ursula","valerie","vanessa","venus","veronica","viola","viviana","wanda","whitney","willa", 104 + "wilma","winifred","xiomara","yadira","yesenia","yolanda","yvette","yvonne","zara","zelda", 105 + "zena","zora", 106 + "abraham","ace","adalberto","adolfo","adonis","agustin","aidan","aldo","alejandro","alexis", 107 + "alfredo","alonzo","alton","alvin","ambrose","amos","anderson","andre","andreas","andres", 108 + 
"angelo","angus","ansel","antoine","anton","antonio","archer","archie","arden","ari", 109 + "armando","arnold","arturo","asa","ashton","atlas","august","augusto","aurelio","axel", 110 + "bailey","barnaby","barrett","bartholomew","basil","beau","beckett","ben","benedict","benito", 111 + "benny","bernard","bertram","blaine","bo","boris","bradford","brady","branden","braxton", 112 + "brendan","brennan","brock","broderick","brodie","brooks","bruno","bryce","byron","cade", 113 + "cain","callum","calvin","camden","carl","carlo","carmelo","cassius","cedric","cesar", 114 + "chad","chance","chandler","charlie","chester","chris","clarence","clark","clay","clement", 115 + "cliff","clifton","clinton","clint","clive","coby","cody","colin","conrad","cooper", 116 + "corbin","cornelius","cortez","craig","cristian","crosby","cruz","cullen","curtis","cyrus", 117 + "damon","dane","darian","darius","darnell","darrell","darwin","dashiell","davis","dawson", 118 + "dean","declan","demetrius","denis","denny","desmond","devin","devonte","dexter","dimitri", 119 + "dirk","dion","donnie","donovan","dorian","drew","duane","duke","duncan","dwight", 120 + "eamon","ed","eddie","eden","edmond","edmundo","eduardo","eli","elias","elliot", 121 + "elliott","ellis","elmer","elton","elvis","emanuel","emery","emile","emilio","emmanuel", 122 + "emmett","enzo","ernest","ervin","esteban","eugene","evan","everett","ezekiel","ezra", 123 + "fabian","felix","finn","fletcher","floyd","forrest","fox","frank","franklin","freddie", 124 + "frederick","gabe","gabriel","gage","galen","gareth","garrett","garrison","garth","gene", 125 + "geoffrey","gerard","gianni","gideon","gilbert","glenn","gonzalo","grady","graham","griffin", 126 + "grover","guillermo","gunnar","gus","gustavo","guy","hank","hans","hardy","harlan", 127 + "harrison","hartley","harvey","hayes","heath","hendrix","herman","holden","homer","horace", 128 + "horatio","houston","howard","hubert","hudson","humphrey","ignacio","ike","irving","isaiah", 129 + 
"isidro","israel","ivan","jace","jackson","jaden","jaime","jake","jamal","jameson", 130 + "jamie","jared","jarrett","jarvis","jasper","javier","jay","jaylen","jefferson","jennings", 131 + "jeremiah","jerome","jesus","jett","jim","jimmy","joaquin","jody","joel","joey", 132 + "jonas","jonah","jorge","juan","julian","julius","junior","justice","kai","kaleb", 133 + "kane","kareem","kasper","kayden","keaton","keegan","keenan","keith","kelvin","kendall", 134 + "kendrick","kent","keon","kermit","kerry","khalid","kian","killian","king","kingston", 135 + "kirk","knox","kobe","kolby","kyle","lamar","lamont","lance","landon","lane", 136 + "larry","lars","lawrence","lawson","leandro","lee","leland","lennox","leo","leon", 137 + "leonardo","leroy","lester","levi","lewis","lincoln","linus","lionel","lloyd","lorenzo", 138 + "louie","luca","lucian","luis","luther","lyle","mac","magnus","malcolm","malik", 139 + "manfred","manuel","marcel","marco","marcos","mario","marshall","marvin","mason","matteo", 140 + "matthias","maurice","maverick","maximilian","maxwell","maynard","melvin","merlin","micah","miles", 141 + "milo","mohamed","monroe","monte","morgan","morris","morton","moses","muhammad","murphy", 142 + "murray","mustafa","myles","myron","nash","nate","nathaniel","ned","nelson","neville", 143 + "nico","nigel","noel","nolan","norbert","norman","odin","olen","oliver","omar", 144 + "oren","orion","orlando","orville","oscar","otis","otto","owen","pablo","parker", 145 + "pascal","paul","pedro","percy","perry","pete","phil","pierce","porter","preston", 146 + "prince","quentin","quincy","quinton","rafael","raheem","ramiro","ramon","randall","randolph", 147 + "raul","reed","reggie","reginald","reid","remy","rene","rex","reynaldo","rhett", 148 + "ricardo","riley","river","rocco","rocky","roderick","rodney","rodrigo","rogelio","roland", 149 + "roman","romeo","ronald","rory","roscoe","rowan","roy","ruben","rufus","rupert", 150 + 
"russ","rusty","salvador","sam","samson","sanchez","sanders","santiago","santos","saul", 151 + "sawyer","scott","seamus","sean","sergio","shane","shaun","shawn","sheldon","shelton", 152 + "shepherd","sherman","silas","silvio","simon","solomon","sonny","spencer","stan","stanford", 153 + "stanley","stefan","sterling","steve","stevie","sullivan","tad","tanner","tate","ted", 154 + "terrance","terrence","terry","thaddeus","theodore","titus","tobias","toby","todd","tom", 155 + "tomas","tommy","tony","trace","trent","trevon","trevor","trey","tristan","tucker", 156 + "ty","ulysses","uriel","val","valentino","vance","vaughn","vernon","vicente","vince", 157 + "virgil","wade","walker","wallace","walter","warren","waylon","wendell","wesley","weston", 158 + "wilbur","wiley","will","willard","willis","wilson","winston","wolfgang","woodrow","wyatt", 159 + "xander","xavier","yosef","zachariah","zane","zeke" 160 + ], 161 + "surnames": [ 162 + "smith","johnson","williams","brown","jones","garcia","miller","davis","rodriguez","martinez", 163 + "hernandez","lopez","gonzalez","wilson","anderson","thomas","taylor","moore","jackson","martin", 164 + "lee","perez","thompson","white","harris","sanchez","clark","ramirez","lewis","robinson", 165 + "walker","young","allen","king","wright","scott","torres","nguyen","hill","flores", 166 + "green","adams","nelson","baker","hall","rivera","campbell","mitchell","carter","roberts", 167 + "gomez","phillips","evans","turner","diaz","parker","cruz","edwards","collins","reyes", 168 + "stewart","morris","morales","murphy","cook","rogers","gutierrez","ortiz","morgan","cooper", 169 + "peterson","bailey","reed","kelly","howard","ramos","kim","cox","ward","richardson", 170 + "watson","brooks","chavez","wood","james","bennett","gray","mendoza","ruiz","hughes", 171 + "price","alvarez","castillo","sanders","patel","myers","long","ross","foster","jimenez", 172 + "powell","jenkins","perry","russell","sullivan","bell","coleman","butler","henderson","barnes", 173 + 
"gonzales","fisher","vasquez","simmons","griffin","mcdonald","hayes","murray","ford","graham", 174 + "hamilton","shaw","wallace","woods","cole","west","jordan","owens","reynolds", 175 + "ellis","harrison","gibson","marshall","freeman","wells", 176 + "webb","simpson","stevens","tucker","porter","hunter","hicks","crawford","henry","boyd", 177 + "mason","kennedy","warren","dixon","burns","gordon", 178 + "holmes","rice","robertson","hunt","black","daniels","palmer","mills","nichols","grant", 179 + "knight","ferguson","rose","stone","hawkins","dunn","perkins","hudson","spencer","gardner", 180 + "stephens","payne","pierce","berry","matthews","arnold","wagner","willis","ray","watkins", 181 + "olson","carroll","duncan","snyder","hart","cunningham","bradley","lane","andrews", 182 + "harper","fox","riley","armstrong","carpenter","weaver","greene","lawrence","elliott", 183 + "sims","austin","peters","kelley","franklin","lawson","fields","schmidt","carr", 184 + "wheeler","chapman","oliver","montgomery","richards","williamson","johnston","banks", 185 + "meyer","bishop","mccoy","howell","morrison","hansen","fernandez","garza","harvey", 186 + "little","burton","stanley","george","jacobs","reid","fuller","lynch","dean", 187 + "gilbert","garrett","romero","welch","larson","frazier","burke","hanson","day", 188 + "moreno","bowman","medina","fowler","brewer","hoffman","carlson","silva","pearson","holland", 189 + "douglas","fleming","jensen","vargas","byrd","davidson","hopkins","may","terry","herrera", 190 + "wade","soto","walters","curtis","neal","caldwell","lowe","jennings","barnett","graves", 191 + "horton","shelton","barrett","obrien","castro","sutton","gregory","mckinney","lucas", 192 + "miles","craig","chambers","holt","lambert","fletcher","watts","bates","hale", 193 + "rhodes","pena","beck","newman","haynes","mcdaniel","mendez","bush","vaughn","parks", 194 + "dawson","santiago","norris","hardy","love","steele","curry","powers","schultz","barker", 195 + 
"guzman","page","munoz","ball","keller","chandler","weber","leonard","walsh","lyons", 196 + "ramsey","wolfe","schneider","mullins","benson","sharp","bowen","barber","cummings","hines", 197 + "baldwin","griffith","valdez","hubbard","salazar","reeves","warner","stevenson","burgess","santos", 198 + "tate","cross","garner","mann","mack","moss","thornton","mcgee","farmer","delgado", 199 + "aguilar","vega","glover","manning","cohen","harmon","rodgers","robbins","newton","todd", 200 + "blair","higgins","ingram","reese","cannon","strickland","townsend","potter","goodwin","walton", 201 + "rowe","hampton","ortega","patton","swanson","joseph","francis","goodman","maldonado","yates", 202 + "becker","erickson","hodges","rios","conner","adkins","webster","norman","malone","hammond", 203 + "flowers","cobb","moody","quinn","blake","maxwell","pope","floyd","osborne","paul", 204 + "mccarthy","guerrero","lindsey","estrada","sandoval","gibbs","tyler","gross","fitzgerald","stokes", 205 + "doyle","sherman","saunders","wise","colon","gill","alvarado","greer","padilla","simon", 206 + "waters","nunez","ballard","schwartz","mcbride","houston","christensen","klein","pratt","briggs", 207 + "parsons","mclaughlin","zimmerman","french","buchanan","moran","copeland","pittman","brady","mccormick", 208 + "levy","friedman","goldstein","rosenberg","rosen","shapiro","kaplan","berger", 209 + "chen","wang","zhang","liu","yang","huang","zhao","wu","zhou","sun", 210 + "tanaka","yamamoto","watanabe","suzuki","takahashi","ito","nakamura","kobayashi","saito","kato", 211 + "kumar","sharma","singh","gupta","das","khan","ali","ahmed","hussain","rahman", 212 + "muller","fischer","schulz","bauer","koch","richter","wolf", 213 + "svensson","johansson","karlsson","nilsson","eriksson","larsson","olsson","persson","andersson","pettersson", 214 + "dubois","bernard","petit","durand","moreau","laurent","michel","leroy", 215 + "rossi","russo","ferrari","esposito","bianchi","romano","colombo","ricci","marino","greco", 216 
+ "byrne","oconnor", 217 + "abbott","acosta","acevedo","aguirre","akers","albert","albright","aldridge","alexander","alford", 218 + "allison","alston","amos","andersen","anthony","archer","arroyo","ashby","atkins","atkinson", 219 + "avery","avila","ayers","baird","ballard","banks","barlow","barnard","barrera","barron", 220 + "barry","bartlett","barton","bass","bates","battle","baxter","beach","bean","beard", 221 + "beasley","beattie","beaumont","becker","bell","bender","benjamin","berger","berkley","bernal", 222 + "best","bird","blackburn","blackwell","blanchard","bland","bloom","boles","bolton","bond", 223 + "booker","boone","booth","bowers","bowles","box","boyle","bradford","brady","bragg", 224 + "branch","bray","bridges","briggs","bright","britt","brock","brooks","browning","bruce", 225 + "bryan","bryant","buck","buckley","buckner","bueno","bullock","bunch","burch","burgess", 226 + "burke","burnett","burns","burris","burt","burton","bush","byrd","byrne","cain", 227 + "callahan","camacho","cameron","camp","cannon","cantrell","carey","carlson","carney","carpenter", 228 + "carr","carrillo","carson","case","casey","cash","cassidy","castellano","chambers","chan", 229 + "chance","chandler","chaney","chang","charles","chase","cherry","choi","church","cisneros", 230 + "clapp","clark","clay","clayton","clements","cleveland","clifton","cline","coats","cochran", 231 + "coffey","colbert","cole","coleman","collier","conley","conner","connolly","conway","cooke", 232 + "cooley","coombs","cooney","corbett","corcoran","corona","cortez","costa","cotton","couch", 233 + "coulter","cowan","craft","craig","crane","crandall","crawford","creech","crockett","crosby", 234 + "crowley","crum","crump","cuevas","cullen","cummins","curran","dahl","dale","dalton", 235 + "daly","damico","darby","dare","daugherty","davenport","david","davila","day","deal", 236 + "decker","dejesus","delacruz","delaney","deleon","delgado","dempsey","denning","denny","desai", 237 + 
"devlin","dewey","diamond","dickerson","dickey","dickson","dillard","dillon","dix","dixon", 238 + "dodson","dolan","donaldson","donnelly","donovan","dorsey","dotson","dougherty","dove","dowling", 239 + "downey","downs","doyle","draper","drew","driscoll","drummond","duff","duffy","dugan", 240 + "duke","dunbar","dunlap","dunn","dunne","dupont","durham","eaton","edgar","edmonds", 241 + "edmondson","eldridge","elkins","ellington","emerson","england","english","ennis","enriquez","epperson", 242 + "erwin","espinoza","estes","estrada","etheridge","eubanks","ewing","fagan","fairbanks","fallon", 243 + "fanning","farley","farr","farrell","faulkner","fay","feldman","fellows","fenton","ferrell", 244 + "figueroa","finch","finley","finn","finnegan","fitzpatrick","flanagan","flannery","flint","flood", 245 + "foley","fontana","foote","forbes","ford","foreman","forrest","fortune","foster","fowler", 246 + "fox","frame","franco","frank","fraser","frazier","frederick","freed","french","frost", 247 + "fry","fuentes","fuller","fulton","gagne","gaines","gallagher","gallegos","galloway","galvan", 248 + "gamble","gamboa","gannon","garland","garner","garrison","gates","gay","gentry","gibbons", 249 + "gibbs","gifford","giles","gilmore","glass","glenn","glover","godfrey","gold","golden", 250 + "goldberg","gomez","goodman","gorman","gould","grady","granger","graves","gray","grayson", 251 + "greer","gregg","gregory","grier","griffin","griffiths","grimes","grogan","gross","grove", 252 + "groves","guerra","guess","guillen","guthrie","guzman","haas","hacker","hadley","hagan", 253 + "hager","hahn","haley","halsey","hamlin","hammer","hammond","hampton","hancock","haney", 254 + "hanley","hanna","harding","hargrove","harkins","harlan","harmon","harp","harrell","harrington", 255 + "hart","hartley","hartman","hatfield","hauser","havens","hawk","hawkins","hay","hayden", 256 + "heard","heath","hedge","helton","henson","herman","herring","hess","hewitt","hickey", 257 + 
"hickman","hightower","hildebrand","hilton","hinton","hobbs","hodge","hodges","hoffman","hogan", 258 + "holbrook","holcomb","holder","holley","holloway","holman","holt","hood","hook","hooper", 259 + "hoover","hornsby","horowitz","horton","hoskins","houghton","house","houser","howell","howland", 260 + "huber","huff","huffman","hull","humphrey","hurst","hutchins","hutchinson","hyde","hyman", 261 + "ingram","irwin","isaac","iverson","ivory","ivy","jack","jacks","jacobson","jameson", 262 + "jarvis","jasso","jean","jeffers","jefferson","jennings","jewett","johns","joiner","jolly", 263 + "jordan","joy","juarez","jude","julian","kane","kaplan","kaufman","kaye","keane", 264 + "keating","keenan","keith","kellogg","kemp","kendall","kenney","kenny","kern","kerr", 265 + "keys","kidd","kimball","king","kinney","kirby","kirk","kirkland","kirkpatrick","kitchen", 266 + "kline","knapp","knott","knowles","knox","kramer","krause","krebs","kuhn","lacy", 267 + "laird","lamb","lambert","landers","landry","lang","langley","langston","lanier","larkin", 268 + "larsen","lau","laughlin","lavin","lawler","lawless","layne","leach","leal","leary", 269 + "leblanc","ledford","lehman","leon","leone","lester","levine","levy","leyva","li", 270 + "lim","lindsey","link","lipscomb","little","livingston","lockhart","logan","lomax","london", 271 + "looney","lord","lott","lovett","lozano","lugo","luna","lund","lutz","lynn", 272 + "macdonald","mack","mackey","madden","madigan","magee","maguire","mahoney","major","maloney", 273 + "mancini","manley","mann","manners","manning","march","marks","marlowe","marsh","marshall" 274 + ] 275 + }
+21 -2
extensions/entities/entity-matcher.js
··· 6 6 */ 7 7 8 8 import { findEntity, createEntity, addObservation, normalizeName, isEntitySuppressed } from './entity-store.js'; 9 + import { validateEntityType, ensureNameDatabase } from './name-validator.js'; 9 10 10 11 // Generic names that should never be stored as entities 11 12 const BLOCKLIST = new Set([ ··· 17 18 'privacy policy', 'terms of service', 'cookie policy', 'subscribe', 18 19 'sign in', 'sign up', 'log in', 'register', 'learn more', 'read more', 19 20 'click here', 'view more', 'see more', 'show more', 'load more', 21 + 'main page', 'front page', 'index', 'welcome', 'dashboard', 'feed', 22 + 'homepage', 'start page', 'new tab', 20 23 ]); 21 24 22 25 /** ··· 25 28 function isBlocklisted(name) { 26 29 if (!name) return true; 27 30 const n = name.trim(); 28 - // Too short 29 - if (n.length < 2) return true; 31 + // Too short (rejects 1-3 char abbreviations like "LOC", "WI", "Ed") 32 + if (n.length < 4) return true; 30 33 // All numbers 31 34 if (/^\d+$/.test(n)) return true; 32 35 // All punctuation/symbols ··· 50 53 export async function processEntities(rawEntities, pageContext, confidenceThreshold = 0.7) { 51 54 const processed = []; 52 55 56 + // Ensure name database is loaded before validation 57 + await ensureNameDatabase(); 58 + 53 59 // Deduplicate within the batch by name+type 54 60 const seen = new Map(); 55 61 const dedupedEntities = []; ··· 63 69 64 70 // Filter entities suppressed by user feedback 65 71 if (await isEntitySuppressed(entity.name, entity.entityType)) continue; 72 + 73 + // Validate entity type claim using name heuristics + name database 74 + const validation = validateEntityType(entity.name, entity.entityType); 75 + if (!validation.valid) { 76 + if (validation.suggestedType) { 77 + // Re-type the entity instead of dropping it 78 + entity.entityType = validation.suggestedType; 79 + entity.confidence *= 0.8; // reduce confidence for reclassified entities 80 + } else { 81 + // No valid type — skip this entity 82 + continue; 
83 + } 84 + } 66 85 67 86 const key = `${normalizeName(entity.name)}:${entity.entityType}`; 68 87
+91 -15
extensions/entities/extractors/structured-data.js
··· 2 2 * Structured data entity extractor 3 3 * 4 4 * Extracts entities from JSON-LD, schema.org markup, and Open Graph meta tags. 5 + * Includes coherence checks using name-validator for person/org validation. 5 6 */ 7 + 8 + import { classifyNamePattern, isLikelyPersonName } from '../name-validator.js'; 6 9 7 10 /** 8 11 * Extract structured data entities from HTML content ··· 78 81 const type = Array.isArray(item['@type']) ? item['@type'][0] : item['@type']; 79 82 const name = item.name || item.headline || item.title; 80 83 81 - if (!name || typeof name !== 'string' || name.trim().length < 2) return null; 84 + if (!name || typeof name !== 'string' || name.trim().length < 4) return null; 82 85 83 86 // Map schema.org types to entity types 84 87 const typeMap = { ··· 275 278 } 276 279 277 280 // Confidence based on field richness: 0.9 for 3+ fields, 0.85 for fewer 278 - const confidence = meaningfulFields >= 3 ? 0.9 : 0.85; 281 + let confidence = meaningfulFields >= 3 ? 0.9 : 0.85; 282 + 283 + // --- Coherence checks for person entities from JSON-LD --- 284 + if (entityType === 'person') { 285 + const trimmedName = name.trim(); 286 + const pattern = classifyNamePattern(trimmedName); 287 + 288 + // CamelCase single word claimed as Person → skip (e.g., "IndieWeb", "YouTube") 289 + if (pattern.likelyType === 'organization' && 290 + (pattern.reason === 'CamelCase brand name' || pattern.reason === 'leading CamelCase word')) { 291 + return null; 292 + } 293 + 294 + // Has person-like properties? 
(givenName, familyName, jobTitle, email) 295 + const hasPersonProps = item.givenName || item.familyName || item.jobTitle || 296 + item.email || attributes.email || attributes.jobTitle || attributes.organization; 297 + 298 + // If no person-like properties and name doesn't look like a person, skip 299 + if (!hasPersonProps) { 300 + const personCheck = isLikelyPersonName(trimmedName); 301 + if (!personCheck.likely && pattern.likelyType !== 'unknown' && pattern.likelyType !== 'ambiguous') { 302 + return null; 303 + } 304 + } 305 + } 279 306 280 307 return { 281 308 name: name.trim(), ··· 317 344 const ogSiteName = getMetaContent(doc, 'og:site_name'); 318 345 319 346 // Extract profile info (og:type = profile) 347 + // Only create person entity if we have real name parts OR the title contains a space 348 + // (single-word titles like "IndieWeb" are site names, not person names) 320 349 if (ogType === 'profile' && ogTitle) { 321 350 const firstName = getMetaContent(doc, 'profile:first_name'); 322 351 const lastName = getMetaContent(doc, 'profile:last_name'); 323 352 const fullName = (firstName && lastName) ? 
`${firstName} ${lastName}` : ogTitle; 353 + const trimmedFullName = fullName.trim(); 324 354 325 - entities.push({ 326 - name: fullName.trim(), 327 - entityType: 'person', 328 - confidence: 0.9, 329 - extractor: 'opengraph', 330 - attributes: { 331 - url, 332 - description: ogDescription || '', 333 - image: ogImage || '', 334 - source: ogSiteName || '' 335 - }, 336 - sourceUrl: url 337 - }); 355 + // Require at least 2 words — real person names have first + last 356 + if (!trimmedFullName.includes(' ')) return entities; 357 + 358 + // Coherence check: if OG says "profile" but name fails person checks, skip or re-type 359 + const pattern = classifyNamePattern(trimmedFullName); 360 + const personCheck = isLikelyPersonName(trimmedFullName); 361 + 362 + // CamelCase or org suffix → not a person, regardless of og:type=profile 363 + if (pattern.likelyType === 'organization' && pattern.confidence >= 0.7) { 364 + // Re-type as organization instead of dropping 365 + entities.push({ 366 + name: trimmedFullName, 367 + entityType: 'organization', 368 + confidence: 0.7, 369 + extractor: 'opengraph', 370 + attributes: { 371 + url, 372 + description: ogDescription || '', 373 + image: ogImage || '', 374 + source: ogSiteName || '' 375 + }, 376 + sourceUrl: url 377 + }); 378 + } else if (!personCheck.likely && !firstName && !lastName) { 379 + // No name DB match AND no explicit profile:first_name/last_name → skip 380 + // (sites like IndieWeb use og:type=profile for non-person pages) 381 + // But still allow if the name is ambiguous (could be a person we don't recognize) 382 + if (pattern.likelyType !== 'ambiguous' && pattern.likelyType !== 'unknown') { 383 + return entities; 384 + } 385 + // Ambiguous — allow with reduced confidence 386 + entities.push({ 387 + name: trimmedFullName, 388 + entityType: 'person', 389 + confidence: 0.7, 390 + extractor: 'opengraph', 391 + attributes: { 392 + url, 393 + description: ogDescription || '', 394 + image: ogImage || '', 395 + source: 
ogSiteName || '' 396 + }, 397 + sourceUrl: url 398 + }); 399 + } else { 400 + entities.push({ 401 + name: trimmedFullName, 402 + entityType: 'person', 403 + confidence: 0.9, 404 + extractor: 'opengraph', 405 + attributes: { 406 + url, 407 + description: ogDescription || '', 408 + image: ogImage || '', 409 + source: ogSiteName || '' 410 + }, 411 + sourceUrl: url 412 + }); 413 + } 338 414 } 339 415 340 416 // Extract business info (og:type = business.business)
+312
extensions/entities/name-validator.js
··· 1 + /** 2 + * Entity Name Validator — Phase 1 of the entity validation framework 3 + * 4 + * Pure JS, zero dependencies. Validates entity type claims using: 5 + * 1. Name pattern heuristics (CamelCase, org suffixes, honorifics, etc.) 6 + * 2. Local name database lookup (~2000 first names, ~1000 surnames) 7 + * 8 + * See docs/entity-validation-research.md for design rationale. 9 + */ 10 + 11 + // Lazy-loaded name sets (initialized on first use) 12 + let _firstNames = null; 13 + let _surnames = null; 14 + 15 + /** 16 + * Load the name database. Called lazily on first validation. 17 + * Works in both Node.js/Electron and browser (preload) contexts. 18 + */ 19 + async function loadNameDatabase() { 20 + if (_firstNames !== null) return; 21 + 22 + try { 23 + // Dynamic import works in both ESM contexts 24 + const data = await import('./data/names.json', { assert: { type: 'json' } }).then(m => m.default); 25 + _firstNames = new Set(data.firstNames.map(n => n.toLowerCase())); 26 + _surnames = new Set(data.surnames.map(n => n.toLowerCase())); 27 + } catch { 28 + // Fallback: try fetch (browser context) 29 + try { 30 + const url = new URL('./data/names.json', import.meta.url); 31 + const resp = await fetch(url); 32 + const data = await resp.json(); 33 + _firstNames = new Set(data.firstNames.map(n => n.toLowerCase())); 34 + _surnames = new Set(data.surnames.map(n => n.toLowerCase())); 35 + } catch { 36 + console.warn('[name-validator] Could not load name database, using empty sets'); 37 + _firstNames = new Set(); 38 + _surnames = new Set(); 39 + } 40 + } 41 + } 42 + 43 + // Synchronous name sets for the migration context (loaded separately) 44 + let _syncFirstNames = null; 45 + let _syncSurnames = null; 46 + 47 + /** 48 + * Load names synchronously from a pre-parsed object. 49 + * Used by the datastore migration which runs in Node.js synchronous context. 
50 + * @param {{ firstNames: string[], surnames: string[] }} data 51 + */ 52 + export function loadNameDatabaseSync(data) { 53 + _firstNames = new Set(data.firstNames.map(n => n.toLowerCase())); 54 + _surnames = new Set(data.surnames.map(n => n.toLowerCase())); 55 + } 56 + 57 + // --- Organization suffix patterns --- 58 + const ORG_SUFFIXES = /\b(Inc|LLC|Ltd|Corp|Corporation|Co|Company|Foundation|Association|Institute|University|Museum|Academy|Society|Bureau|Agency|Commission|Committee|Council|Board|Authority|Trust|Fund|Group|Holdings|Partners|Ventures|Studios|Labs|Media|Press|Publishing|Records|Entertainment|Networks?|Solutions|Services|Systems|Technologies|Consulting|International|Global|National|Federal|Worldwide)\b\.?$/i; 59 + 60 + const ORG_SUFFIX_ANYWHERE = /\b(Inc|LLC|Ltd|Corp|Corporation|Foundation|Association|Institute|University|Museum|Academy|Society)\b\.?/i; 61 + 62 + // --- Honorifics/titles that indicate persons --- 63 + const HONORIFIC_PATTERN = /^(Mr|Mrs|Ms|Miss|Dr|Prof|Professor|Sir|Dame|Lady|Lord|Rev|Reverend|Fr|Father|Sr|Sister|Sgt|Sargent|Lt|Cpt|Capt|Captain|Maj|Major|Col|Colonel|Gen|General|Adm|Admiral|Sen|Senator|Rep|Representative|Gov|Governor|Pres|President|Hon|Judge|Justice)\b\.?\s/i; 64 + 65 + // --- CamelCase detection (internal caps in a single word) --- 66 + // Matches: IndieWeb, YouTube, GitHub, WordPress, TikTok, LinkedIn 67 + // Does NOT match: John, JOHN, john 68 + const CAMELCASE_PATTERN = /^[A-Z][a-z]+(?:[A-Z][a-z]*)+$/; 69 + 70 + // --- ALL_CAPS detection --- 71 + const ALL_CAPS_PATTERN = /^[A-Z]{2,}$/; 72 + 73 + // --- "The X" pattern --- 74 + const THE_PATTERN = /^The\s+/; 75 + 76 + // --- URL/path-like names --- 77 + const URL_LIKE_PATTERN = /^(https?:\/\/|www\.|\/|[a-z]+\.[a-z]{2,})/i; 78 + const PATH_LIKE_PATTERN = /[/\\]|\.(?:com|org|net|edu|gov|io|html|php|asp|js|css)\b/i; 79 + 80 + 81 + /** 82 + * Classify a name based on structural patterns alone (no database lookup). 83 + * Returns the likely entity type and confidence. 
84 + * 85 + * @param {string} name - The entity name to classify 86 + * @returns {{ likelyType: string, confidence: number, reason: string }} 87 + */ 88 + export function classifyNamePattern(name) { 89 + if (!name || typeof name !== 'string') { 90 + return { likelyType: 'unknown', confidence: 0, reason: 'empty name' }; 91 + } 92 + 93 + const trimmed = name.trim(); 94 + const words = trimmed.split(/\s+/); 95 + 96 + // URL-like or path-like — not an entity name 97 + if (URL_LIKE_PATTERN.test(trimmed) || PATH_LIKE_PATTERN.test(trimmed)) { 98 + return { likelyType: 'not_entity', confidence: 0.9, reason: 'URL or path-like name' }; 99 + } 100 + 101 + // Org suffixes (high confidence) 102 + if (ORG_SUFFIXES.test(trimmed) || ORG_SUFFIX_ANYWHERE.test(trimmed)) { 103 + return { likelyType: 'organization', confidence: 0.9, reason: 'contains organization suffix' }; 104 + } 105 + 106 + // Honorific prefix → person 107 + if (HONORIFIC_PATTERN.test(trimmed)) { 108 + return { likelyType: 'person', confidence: 0.85, reason: 'starts with honorific/title' }; 109 + } 110 + 111 + // "The X" pattern → likely organization or creative work 112 + if (THE_PATTERN.test(trimmed) && words.length >= 2) { 113 + return { likelyType: 'organization', confidence: 0.7, reason: 'starts with "The"' }; 114 + } 115 + 116 + // Single-word checks 117 + if (words.length === 1) { 118 + // CamelCase single word → brand/org 119 + if (CAMELCASE_PATTERN.test(trimmed)) { 120 + return { likelyType: 'organization', confidence: 0.8, reason: 'CamelCase brand name' }; 121 + } 122 + 123 + // ALL CAPS → acronym/abbreviation 124 + if (ALL_CAPS_PATTERN.test(trimmed)) { 125 + return { likelyType: 'organization', confidence: 0.6, reason: 'ALL_CAPS abbreviation' }; 126 + } 127 + 128 + // Single word, title case — ambiguous (could be "Apple", "Madonna", etc.) 
129 + return { likelyType: 'ambiguous', confidence: 0.4, reason: 'single word name' }; 130 + } 131 + 132 + // Multi-word CamelCase (e.g., "IndieWeb Movement") — first word is CamelCase 133 + if (CAMELCASE_PATTERN.test(words[0])) { 134 + return { likelyType: 'organization', confidence: 0.7, reason: 'leading CamelCase word' }; 135 + } 136 + 137 + // Two-word title case — could be person or org, need name DB to distinguish 138 + if (words.length === 2 && words.every(w => /^[A-Z][a-z]+$/.test(w))) { 139 + return { likelyType: 'ambiguous', confidence: 0.5, reason: 'two-word title case' }; 140 + } 141 + 142 + // Names with "&" or "and" between words → likely organization 143 + if (/\b&\b/.test(trimmed) || /\band\b/i.test(trimmed)) { 144 + return { likelyType: 'organization', confidence: 0.6, reason: 'contains "&" or "and"' }; 145 + } 146 + 147 + return { likelyType: 'unknown', confidence: 0.3, reason: 'no strong pattern match' }; 148 + } 149 + 150 + 151 + /** 152 + * Check if a name matches known person name patterns using the local name database. 
153 + * 154 + * @param {string} name - The name to check 155 + * @returns {{ likely: boolean, confidence: number, reason: string }} 156 + */ 157 + export function isLikelyPersonName(name) { 158 + if (!name || typeof name !== 'string') { 159 + return { likely: false, confidence: 0, reason: 'empty name' }; 160 + } 161 + 162 + // Ensure name DB is loaded (should already be if validateEntityType was called) 163 + if (!_firstNames) { 164 + return { likely: false, confidence: 0.3, reason: 'name database not loaded' }; 165 + } 166 + 167 + const trimmed = name.trim(); 168 + const words = trimmed.split(/\s+/); 169 + 170 + // Strip honorific for lookup 171 + let lookupName = trimmed; 172 + const honorificMatch = trimmed.match(HONORIFIC_PATTERN); 173 + if (honorificMatch) { 174 + lookupName = trimmed.slice(honorificMatch[0].length).trim(); 175 + const lookupWords = lookupName.split(/\s+/); 176 + // With honorific, even one word is credible 177 + if (lookupWords.length >= 1) { 178 + const firstName = lookupWords[0].toLowerCase(); 179 + if (_firstNames.has(firstName)) { 180 + return { likely: true, confidence: 0.9, reason: 'honorific + known first name' }; 181 + } 182 + // Honorific alone is a strong signal 183 + return { likely: true, confidence: 0.75, reason: 'has honorific prefix' }; 184 + } 185 + } 186 + 187 + // Single word — check if it is a known first name (e.g., "Madonna", "Jesus") 188 + if (words.length === 1) { 189 + const lower = trimmed.toLowerCase(); 190 + if (_firstNames.has(lower)) { 191 + return { likely: true, confidence: 0.5, reason: 'single word matches known first name' }; 192 + } 193 + return { likely: false, confidence: 0.2, reason: 'single word, not a known first name' }; 194 + } 195 + 196 + // Multi-word: check first and last 197 + const first = words[0].toLowerCase(); 198 + const last = words[words.length - 1].toLowerCase(); 199 + 200 + const firstMatch = _firstNames.has(first); 201 + const lastMatch = _surnames.has(last); 202 + 203 + if (firstMatch && 
lastMatch) { 204 + return { likely: true, confidence: 0.9, reason: 'known first name + known surname' }; 205 + } 206 + if (firstMatch) { 207 + return { likely: true, confidence: 0.7, reason: 'known first name' }; 208 + } 209 + if (lastMatch) { 210 + return { likely: true, confidence: 0.5, reason: 'known surname' }; 211 + } 212 + 213 + // Two-word title case names are plausible as person names even without DB match 214 + if (words.length === 2 && words.every(w => /^[A-Z][a-z]+$/.test(w))) { 215 + return { likely: false, confidence: 0.4, reason: 'title case but no name DB match' }; 216 + } 217 + 218 + return { likely: false, confidence: 0.2, reason: 'no person name pattern match' }; 219 + } 220 + 221 + 222 + /** 223 + * Validate whether a name matches its claimed entity type. 224 + * This is the main entry point — call before storing an entity. 225 + * 226 + * @param {string} name - The entity name 227 + * @param {string} claimedType - The claimed entity type (person, organization, etc.) 228 + * @returns {{ valid: boolean, suggestedType?: string, reason?: string }} 229 + */ 230 + export function validateEntityType(name, claimedType) { 231 + if (!name || typeof name !== 'string' || !claimedType) { 232 + return { valid: false, reason: 'missing name or type' }; 233 + } 234 + 235 + const trimmed = name.trim(); 236 + 237 + // URL-like or path-like names should never be entities 238 + if (URL_LIKE_PATTERN.test(trimmed) || PATH_LIKE_PATTERN.test(trimmed)) { 239 + return { valid: false, reason: 'name looks like URL or path' }; 240 + } 241 + 242 + const pattern = classifyNamePattern(trimmed); 243 + const personCheck = isLikelyPersonName(trimmed); 244 + 245 + // --- Person type validation --- 246 + if (claimedType === 'person') { 247 + // CamelCase → definitely not a person 248 + if (pattern.likelyType === 'organization' && pattern.reason === 'CamelCase brand name') { 249 + return { valid: false, suggestedType: 'organization', reason: pattern.reason }; 250 + } 251 + 252 + // 
Org suffix → not a person 253 + if (pattern.likelyType === 'organization' && pattern.reason === 'contains organization suffix') { 254 + return { valid: false, suggestedType: 'organization', reason: pattern.reason }; 255 + } 256 + 257 + // "The X" pattern → not a person 258 + if (pattern.likelyType === 'organization' && pattern.reason === 'starts with "The"') { 259 + return { valid: false, suggestedType: 'organization', reason: pattern.reason }; 260 + } 261 + 262 + // ALL_CAPS → not a person 263 + if (pattern.likelyType === 'organization' && pattern.reason === 'ALL_CAPS abbreviation') { 264 + return { valid: false, suggestedType: 'organization', reason: pattern.reason }; 265 + } 266 + 267 + // Leading CamelCase → not a person 268 + if (pattern.likelyType === 'organization' && pattern.reason === 'leading CamelCase word') { 269 + return { valid: false, suggestedType: 'organization', reason: pattern.reason }; 270 + } 271 + 272 + // Single word: only valid if in first names DB 273 + if (trimmed.split(/\s+/).length === 1) { 274 + if (personCheck.likely) { 275 + return { valid: true }; // Known mononym (Madonna, Jesus, etc.) 
276 + } 277 + return { valid: false, reason: 'single-word name not in known first names database' }; 278 + } 279 + 280 + // Multi-word: if name DB says not likely AND pattern says org, reject 281 + if (!personCheck.likely && pattern.likelyType === 'organization') { 282 + return { valid: false, suggestedType: 'organization', reason: pattern.reason }; 283 + } 284 + 285 + // Multi-word: person check passed or ambiguous — allow 286 + return { valid: true }; 287 + } 288 + 289 + // --- Organization type validation --- 290 + if (claimedType === 'organization') { 291 + // Honorific prefix → this is a person, not an org 292 + if (pattern.likelyType === 'person' && pattern.reason === 'starts with honorific/title') { 293 + return { valid: false, suggestedType: 'person', reason: pattern.reason }; 294 + } 295 + 296 + // Otherwise, organizations are hard to invalidate by name alone 297 + return { valid: true }; 298 + } 299 + 300 + // --- Other types: minimal validation --- 301 + // For events, places, products, creative works — just check for URL/path 302 + return { valid: true }; 303 + } 304 + 305 + 306 + /** 307 + * Ensure the name database is loaded. Call this before using validateEntityType 308 + * or isLikelyPersonName in async contexts. 309 + */ 310 + export async function ensureNameDatabase() { 311 + await loadNameDatabase(); 312 + }