Add bidirectional first-word, token-subset, and prefix-token matching

+3 -6

apps/speakers/bootstrap.py

··· 31 31 import numpy as np 32 32 33 33 from apps.speakers.owner import load_owner_centroid 34 - from think.entities import entity_slug, find_matching_entity 34 + from think.entities import entity_slug, find_matching_entity, is_name_variant_match 35 35 from think.entities.journal import ( 36 36 ensure_journal_entity_memory, 37 37 get_or_create_journal_entity, ··· 517 517 canonical_name, alias_name = name_b, name_a 518 518 canonical_id = other_id 519 519 520 - # Check name variant pattern: alias is the first word of canonical 521 - canonical_first = canonical_name.split()[0].lower() 522 - alias_lower = alias_name.strip().lower() 523 - 524 - if canonical_first != alias_lower: 520 + # Check name variant pattern: first-word, token-subset, or prefix-token 521 + if not is_name_variant_match(alias_name, canonical_name): 525 522 stats["ambiguous"].append( 526 523 { 527 524 "name": name_a,

+272

tests/test_matching.py

# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Tests for entity matching and name variant resolution."""

from think.entities.matching import (
    build_name_resolution_map,
    find_matching_entity,
    is_name_variant_match,
)


def _entity(name, entity_id=None, aka=None):
    """Build a minimal entity dict; the id defaults to the snake_cased name."""
    record = {
        "id": entity_id or name.lower().replace(" ", "_"),
        "name": name,
    }
    if aka:
        record["aka"] = aka
    return record


def _matched_id(query, entities):
    """Resolve *query* against *entities* and return the id of the hit.

    Raises TypeError when nothing matched (subscripting ``None``), which
    fails the calling test exactly like an inline ``[...]["id"]`` would.
    """
    return find_matching_entity(query, entities)["id"]


# --- Tier 1-3 regression tests ---


class TestExistingTiers:
    def test_exact_name_match(self):
        pool = [_entity("Robert Johnson")]
        assert _matched_id("Robert Johnson", pool) == "robert_johnson"

    def test_exact_id_match(self):
        pool = [_entity("Robert Johnson")]
        assert _matched_id("robert_johnson", pool) == "robert_johnson"

    def test_exact_aka_match(self):
        pool = [_entity("Robert Johnson", aka=["Bob"])]
        assert _matched_id("Bob", pool) == "robert_johnson"

    def test_case_insensitive_match(self):
        pool = [_entity("Robert Johnson")]
        assert _matched_id("robert johnson", pool) == "robert_johnson"

    def test_no_match_returns_none(self):
        pool = [_entity("Robert Johnson")]
        assert find_matching_entity("Unknown Person", pool) is None

    def test_empty_inputs(self):
        assert find_matching_entity("", []) is None
        assert find_matching_entity("test", []) is None
        assert find_matching_entity("", [_entity("Test")]) is None


# --- Enhancement 1: Bidirectional first-word match ---


class TestBidirectionalFirstWord:
    def test_short_to_long(self):
        """Original tier 4: the detected name is an entity's first word."""
        pool = [_entity("Javier Garcia")]
        assert _matched_id("Javier", pool) == "javier_garcia"

    def test_long_to_short(self):
        """New: the first word of the detected name is an entity."""
        pool = [_entity("Javier")]
        assert _matched_id("Javier Garcia", pool) == "javier"

    def test_order_independent(self):
        """Either side may be the one that already exists as an entity."""
        short_only = [_entity("Javier")]
        assert _matched_id("Javier Garcia", short_only) == "javier"

        long_only = [_entity("Javier Garcia")]
        assert _matched_id("Javier", long_only) == "javier_garcia"

    def test_ambiguous_first_word_rejected(self):
        """Two entities sharing a first word: nothing is returned."""
        pool = [_entity("Javier Garcia"), _entity("Javier Rodriguez")]
        assert find_matching_entity("Javier", pool) is None

    def test_ambiguous_first_word_long_to_short(self):
        """Two entities sharing a first word: long→short is rejected too."""
        pool = [_entity("Javier"), _entity("Javier Rodriguez")]
        assert find_matching_entity("Javier Garcia", pool) is None

    def test_short_name_min_length(self):
        """A first word shorter than 3 characters never matches."""
        pool = [_entity("Li Wei")]
        assert find_matching_entity("Li", pool) is None


# --- Enhancement 2: Token-subset match ---


class TestTokenSubset:
    def test_subset_match_short_in_long(self):
        """The detected name's tokens are a subset of the entity's tokens."""
        pool = [_entity("Josh Jones Dilworth")]
        assert _matched_id("Jones Dilworth", pool) == "josh_jones_dilworth"

    def test_subset_match_long_detected(self):
        """The detected name carries more tokens than the entity."""
        pool = [_entity("Jones Dilworth")]
        assert _matched_id("Josh Jones Dilworth", pool) == "jones_dilworth"

    def test_single_token_not_subset(self):
        """A one-token name cannot subset-match (two tokens minimum)."""
        pool = [_entity("Josh Jones Dilworth")]
        # "Dilworth" is a single token: neither the first word nor a
        # two-token subset of the entity name.
        assert find_matching_entity("Dilworth", pool) is None

    def test_ambiguous_subset_rejected(self):
        """Several entities satisfy the subset rule: nothing is returned."""
        pool = [
            _entity("Josh Jones Dilworth"),
            _entity("Mary Jones Dilworth"),
        ]
        assert find_matching_entity("Jones Dilworth", pool) is None

    def test_subset_both_directions(self):
        """Subset matching does not care which name is the stored entity."""
        long_only = [_entity("Josh Jones Dilworth")]
        assert _matched_id("Jones Dilworth", long_only) == "josh_jones_dilworth"

        short_only = [_entity("Jones Dilworth")]
        assert _matched_id("Josh Jones Dilworth", short_only) == "jones_dilworth"


# --- Enhancement 3: Prefix-token match ---


class TestPrefixToken:
    def test_prefix_match_nickname(self):
        """Nickname is a prefix of the stored name (Chris → Christopher)."""
        pool = [_entity("Christopher DeWolfe")]
        assert _matched_id("Chris DeWolfe", pool) == "christopher_dewolfe"

    def test_prefix_match_reverse(self):
        """Full name detected while the nickname is the stored entity."""
        pool = [_entity("Chris DeWolfe")]
        assert _matched_id("Christopher DeWolfe", pool) == "chris_dewolfe"

    def test_prefix_min_length(self):
        """A prefix needs at least 4 characters."""
        pool = [_entity("Jonathan Smith")]
        # "Jon" has only 3 characters, so it is not accepted as a prefix.
        assert find_matching_entity("Jon Smith", pool) is None

    def test_prefix_four_chars_matches(self):
        """A prefix of exactly 4 characters is accepted."""
        pool = [_entity("Jonathan Smith")]
        assert _matched_id("Jona Smith", pool) == "jonathan_smith"

    def test_ambiguous_prefix_rejected(self):
        """Several entities satisfy the prefix rule: nothing is returned."""
        pool = [
            _entity("Christopher DeWolfe"),
            _entity("Christine DeWolfe"),
        ]
        assert find_matching_entity("Chris DeWolfe", pool) is None

    def test_different_token_count_no_prefix(self):
        """Names with different token counts never prefix-match."""
        pool = [_entity("Christopher James DeWolfe")]
        assert find_matching_entity("Chris DeWolfe", pool) is None


# --- Production duplicate cases ---


class TestProductionDuplicates:
    """Verify the three production duplicate pairs that motivated this spec."""

    def test_chris_dewolfe(self):
        """Chris DeWolfe ↔ Christopher DeWolfe (prefix-token match)."""
        full_only = [_entity("Christopher DeWolfe")]
        assert _matched_id("Chris DeWolfe", full_only) == "christopher_dewolfe"

        nick_only = [_entity("Chris DeWolfe")]
        assert _matched_id("Christopher DeWolfe", nick_only) == "chris_dewolfe"

    def test_javier_garcia(self):
        """Javier ↔ Javier Garcia (bidirectional first-word match)."""
        full_only = [_entity("Javier Garcia")]
        assert _matched_id("Javier", full_only) == "javier_garcia"

        short_only = [_entity("Javier")]
        assert _matched_id("Javier Garcia", short_only) == "javier"

    def test_jones_dilworth(self):
        """Jones Dilworth ↔ Josh Jones Dilworth (token-subset match)."""
        full_only = [_entity("Josh Jones Dilworth")]
        assert _matched_id("Jones Dilworth", full_only) == "josh_jones_dilworth"

        short_only = [_entity("Jones Dilworth")]
        assert _matched_id("Josh Jones Dilworth", short_only) == "jones_dilworth"


# --- build_name_resolution_map ---


class TestBuildNameResolutionMap:
    def test_bidirectional_first_word(self):
        pool = [_entity("Javier Garcia")]
        resolved = build_name_resolution_map(["Javier"], pool)
        assert resolved["Javier"] == "javier_garcia"

    def test_long_to_short_first_word(self):
        pool = [_entity("Javier")]
        resolved = build_name_resolution_map(["Javier Garcia"], pool)
        assert resolved["Javier Garcia"] == "javier"

    def test_token_subset(self):
        pool = [_entity("Josh Jones Dilworth")]
        resolved = build_name_resolution_map(["Jones Dilworth"], pool)
        assert resolved["Jones Dilworth"] == "josh_jones_dilworth"

    def test_prefix_token(self):
        pool = [_entity("Christopher DeWolfe")]
        resolved = build_name_resolution_map(["Chris DeWolfe"], pool)
        assert resolved["Chris DeWolfe"] == "christopher_dewolfe"

    def test_ambiguous_subset_skipped(self):
        pool = [
            _entity("Josh Jones Dilworth"),
            _entity("Mary Jones Dilworth"),
        ]
        resolved = build_name_resolution_map(["Jones Dilworth"], pool)
        assert "Jones Dilworth" not in resolved

    def test_all_three_production_cases(self):
        pool = [
            _entity("Christopher DeWolfe"),
            _entity("Javier Garcia"),
            _entity("Josh Jones Dilworth"),
        ]
        detected = ["Chris DeWolfe", "Javier", "Jones Dilworth"]
        resolved = build_name_resolution_map(detected, pool)
        assert resolved["Chris DeWolfe"] == "christopher_dewolfe"
        assert resolved["Javier"] == "javier_garcia"
        assert resolved["Jones Dilworth"] == "josh_jones_dilworth"


# --- is_name_variant_match ---


class TestIsNameVariantMatch:
    def test_first_word_match(self):
        assert is_name_variant_match("Javier", "Javier Garcia") is True
        assert is_name_variant_match("Javier Garcia", "Javier") is True

    def test_token_subset_match(self):
        assert is_name_variant_match("Jones Dilworth", "Josh Jones Dilworth") is True
        assert is_name_variant_match("Josh Jones Dilworth", "Jones Dilworth") is True

    def test_prefix_token_match(self):
        assert is_name_variant_match("Chris DeWolfe", "Christopher DeWolfe") is True
        assert is_name_variant_match("Christopher DeWolfe", "Chris DeWolfe") is True

    def test_no_match(self):
        assert is_name_variant_match("Alice Smith", "Bob Jones") is False

    def test_empty_strings(self):
        assert is_name_variant_match("", "Test") is False
        assert is_name_variant_match("Test", "") is False

    def test_single_token_first_word(self):
        """A lone token matches when it is the other name's first word."""
        assert is_name_variant_match("Jones", "Jones Dilworth") is True

    def test_single_token_not_first_word(self):
        """A lone token that is not the first word does not match."""
        assert is_name_variant_match("Dilworth", "Jones Dilworth") is False

+1

think/entities/__init__.py

··· 78 78 build_name_resolution_map, 79 79 find_entity_by_email, 80 80 find_matching_entity, 81 + is_name_variant_match, 81 82 resolve_entity, 82 83 validate_aka_uniqueness, 83 84 )

+107 -4

think/entities/matching.py

def _token_subset_match(name_a_lower: str, name_b_lower: str) -> bool:
    """Return True if every token of the shorter name appears in the longer one.

    Both arguments are whitespace-split as given (callers pass lowercased
    text). Tokens are compared as sets, so order and repeats are ignored.
    The smaller token set must contain at least two distinct tokens; a
    single shared token is not enough for a match.
    """
    tokens_a = set(name_a_lower.split())
    tokens_b = set(name_b_lower.split())
    shorter, longer = sorted((tokens_a, tokens_b), key=len)
    return len(shorter) >= 2 and shorter <= longer


def _tokens_pair_up(tokens_a: list[str], tokens_b: list[str]) -> bool:
    """Pair tokens by sorted position; each pair must be equal or a ≥4-char prefix."""
    return all(
        a == b or (len(a) >= 4 and b.startswith(a)) or (len(b) >= 4 and a.startswith(b))
        for a, b in zip(sorted(tokens_a), sorted(tokens_b))
    )


def _prefix_token_match(name_a_lower: str, name_b_lower: str) -> bool:
    """Return True if the names have equal token counts and tokens pair up.

    A pair of tokens is compatible when the two are equal, or when one of
    them is at least 4 characters long and is a prefix of the other.

    Names without any token never match: ``all()`` over an empty pairing
    would otherwise report a vacuous match for two blank strings.
    """
    tokens_a = name_a_lower.split()
    tokens_b = name_b_lower.split()
    if not tokens_a or len(tokens_a) != len(tokens_b):
        return False

    if _tokens_pair_up(tokens_a, tokens_b):
        return True

    # Pure positional pairing misaligns when a shared token sorts between a
    # nickname and its full form ("chris christensen" vs "christopher
    # christensen"). Cancel the tokens both names share, then pair only the
    # leftovers.
    rest_a: list[str] = []
    rest_b = list(tokens_b)
    for token in tokens_a:
        if token in rest_b:
            rest_b.remove(token)
        else:
            rest_a.append(token)
    return _tokens_pair_up(rest_a, rest_b)


def is_name_variant_match(name_a: str, name_b: str) -> bool:
    """Check if two names are plausible variants of each other.

    Both names are stripped and lowercased, then three strategies are tried
    in order:

    - First-word: one whole name equals the first word of the other
    - Token-subset: all tokens of the shorter name appear in the longer
      (min 2 tokens in the shorter)
    - Prefix-token: same token count, tokens pairwise equal or ≥4-char
      prefixes of each other

    Returns:
        True as soon as one strategy matches; False if none does or if
        either name is empty after stripping.
    """
    a_lower = name_a.strip().lower()
    b_lower = name_b.strip().lower()
    if not a_lower or not b_lower:
        return False

    # Non-empty after strip(), so split() yields at least one word each.
    a_first = a_lower.split()[0]
    b_first = b_lower.split()[0]
    if a_lower == b_first or b_lower == a_first:
        return True

    return _token_subset_match(a_lower, b_lower) or _prefix_token_match(a_lower, b_lower)
First-word match (unambiguous only, min 3 chars) 129 + 4. First-word match (bidirectional, unambiguous only, min 3 chars) 130 + 4b. Token-subset match (unambiguous only, min 2 tokens in shorter) 131 + 4c. Prefix-token match (unambiguous only, ≥4-char prefix) 83 132 5. Fuzzy match using rapidfuzz (score >= threshold) 84 133 85 134 Args: ··· 184 233 if detected_slug and detected_slug in id_map: 185 234 return id_map[detected_slug] 186 235 187 - # Tier 4: First-word match (only if unambiguous) 236 + # Tier 4: First-word match (bidirectional, only if unambiguous) 188 237 if len(detected_name) >= 3: 238 + # Short→long: detected name IS a first word of an entity 189 239 matches = first_word_map.get(detected_lower, []) 190 240 if len(matches) == 1: 191 241 return matches[0] 192 242 243 + # Long→short: detected name's first word matches an entity 244 + detected_first = detected_name.split()[0].lower() 245 + if detected_first != detected_lower and len(detected_first) >= 3: 246 + fw_matches = first_word_map.get(detected_first, []) 247 + if len(fw_matches) == 1: 248 + return fw_matches[0] 249 + 250 + # Tier 4b: Token-subset match (unambiguous only) 251 + subset_matches = [ 252 + e for e in entities 253 + if e.get("name") and _token_subset_match(detected_lower, e["name"].lower()) 254 + ] 255 + if len(subset_matches) == 1: 256 + return subset_matches[0] 257 + 258 + # Tier 4c: Prefix-token match (unambiguous only) 259 + prefix_matches = [ 260 + e for e in entities 261 + if e.get("name") and _prefix_token_match(detected_lower, e["name"].lower()) 262 + ] 263 + if len(prefix_matches) == 1: 264 + return prefix_matches[0] 265 + 193 266 # Tier 5: Fuzzy match 194 267 if len(detected_name) >= 4 and fuzzy_candidates: 195 268 try: ··· 226 299 1. Exact name, id, or aka match 227 300 2. Case-insensitive name, id, or aka match 228 301 3. Slugified query match against id 229 - 4. First-word match (unambiguous only, min 3 chars) 302 + 4. 
First-word match (bidirectional, unambiguous only, min 3 chars) 303 + 4b. Token-subset match (unambiguous only, min 2 tokens in shorter) 304 + 4c. Prefix-token match (unambiguous only, ≥4-char prefix) 230 305 5. Fuzzy match via rapidfuzz (score >= threshold) 231 306 232 307 Logs when ambiguous first-word matches prevent tier-4 resolution. ··· 247 322 id_set: set[str] = set() # all entity IDs for slug matching 248 323 first_word_map: dict[str, list[str]] = {} # lowercase first word → [entity_ids] 249 324 fuzzy_candidates: dict[str, str] = {} # candidate string → entity_id 325 + entity_name_info: list[tuple[str, str]] = [] # (entity_id, name_lower) for new tiers 250 326 251 327 for entity in entities: 252 328 name = entity.get("name", "") ··· 256 332 257 333 name_lower = name.lower() 258 334 id_set.add(entity_id) 335 + entity_name_info.append((entity_id, name_lower)) 259 336 260 337 # Tiers 1 & 2: exact and case-insensitive for name, id, akas 261 338 exact_map[name] = entity_id ··· 306 383 result[sname] = sname_slug 307 384 continue 308 385 309 - # Tier 4: first-word match (unambiguous only) 386 + # Tier 4: first-word match (bidirectional, unambiguous only) 310 387 if len(sname) >= 3: 311 388 matches = first_word_map.get(sname_lower, []) 312 389 if len(matches) == 1: ··· 319 396 len(matches), 320 397 matches, 321 398 ) 399 + 400 + # Long→short: first word of sname in first_word_map 401 + sname_first = sname.split()[0].lower() 402 + if sname_first != sname_lower and len(sname_first) >= 3: 403 + fw_matches = first_word_map.get(sname_first, []) 404 + if len(fw_matches) == 1: 405 + result[sname] = fw_matches[0] 406 + continue 407 + 408 + # Tier 4b: Token-subset match (unambiguous only) 409 + subset_matches = [ 410 + eid for eid, ename in entity_name_info 411 + if _token_subset_match(sname_lower, ename) 412 + ] 413 + if len(subset_matches) == 1: 414 + result[sname] = subset_matches[0] 415 + continue 416 + 417 + # Tier 4c: Prefix-token match (unambiguous only) 418 + 
prefix_matches = [ 419 + eid for eid, ename in entity_name_info 420 + if _prefix_token_match(sname_lower, ename) 421 + ] 422 + if len(prefix_matches) == 1: 423 + result[sname] = prefix_matches[0] 424 + continue 322 425 323 426 # Defer to fuzzy matching 324 427 unresolved.append(sname)

Configure Feed

Configure Feed