find_matching_entity() now returns MatchResult with confidence tier

+146

tests/test_matching.py

··· 4 4 """Tests for entity matching and name variant resolution.""" 5 5 6 6 from think.entities.matching import ( 7 + MatchResult, 8 + MatchTier, 7 9 build_name_resolution_map, 8 10 find_matching_entity, 9 11 is_name_variant_match, ··· 306 308 def test_single_token_not_first_word(self): 307 309 """Single tokens that aren't the first word don't match.""" 308 310 assert is_name_variant_match("Dilworth", "Jones Dilworth") is False 311 + 312 + 313 + # --- MatchResult and confidence tiers --- 314 + 315 + 316 + class TestMatchResult: 317 + """Verify MatchResult is backward-compatible with dict usage.""" 318 + 319 + def test_is_dict(self): 320 + entities = [_entity("Alice Johnson")] 321 + result = find_matching_entity("Alice Johnson", entities) 322 + assert isinstance(result, dict) 323 + 324 + def test_subscript_access(self): 325 + entities = [_entity("Alice Johnson")] 326 + result = find_matching_entity("Alice Johnson", entities) 327 + assert result["id"] == "alice_johnson" 328 + assert result["name"] == "Alice Johnson" 329 + 330 + def test_get_access(self): 331 + entities = [_entity("Alice Johnson")] 332 + result = find_matching_entity("Alice Johnson", entities) 333 + assert result.get("name") == "Alice Johnson" 334 + assert result.get("missing") is None 335 + 336 + def test_truthiness(self): 337 + entities = [_entity("Alice Johnson")] 338 + result = find_matching_entity("Alice Johnson", entities) 339 + assert result # truthy 340 + assert find_matching_entity("Nobody", entities) is None 341 + 342 + def test_none_is_none(self): 343 + """No match still returns None, not an empty MatchResult.""" 344 + entities = [_entity("Alice Johnson")] 345 + result = find_matching_entity("Nobody", entities) 346 + assert result is None 347 + 348 + 349 + class TestMatchTiers: 350 + """Verify each tier returns the correct MatchTier value.""" 351 + 352 + def test_exact_name_tier(self): 353 + entities = [_entity("Robert Johnson")] 354 + result = find_matching_entity("Robert Johnson", entities) 355 + assert result.tier == MatchTier.EXACT 356 + 357 + def test_exact_id_tier(self): 358 + entities = [_entity("Robert Johnson")] 359 + result = find_matching_entity("robert_johnson", entities) 360 + assert result.tier == MatchTier.EXACT 361 + 362 + def test_exact_aka_tier(self): 363 + entities = [_entity("Robert Johnson", aka=["Bob"])] 364 + result = find_matching_entity("Bob", entities) 365 + assert result.tier == MatchTier.EXACT 366 + 367 + def test_case_insensitive_tier(self): 368 + entities = [_entity("Robert Johnson")] 369 + result = find_matching_entity("robert johnson", entities) 370 + assert result.tier == MatchTier.CASE_INSENSITIVE 371 + 372 + def test_email_tier(self): 373 + entities = [{"id": "alice", "name": "Alice", "emails": ["alice@example.com"]}] 374 + result = find_matching_entity("alice@example.com", entities) 375 + assert result.tier == MatchTier.EMAIL 376 + 377 + def test_slug_tier(self): 378 + """Slugified query matching entity id.""" 379 + entities = [{"id": "robert_johnson", "name": "Robert Johnson"}] 380 + result = find_matching_entity("Robert Johnson", entities) 381 + # "Robert Johnson" exact-matches the name, so it's tier 1 382 + assert result.tier == MatchTier.EXACT 383 + # Use a slug-form query that doesn't exact-match but slug-matches 384 + entities2 = [{"id": "some_custom_id", "name": "Some Name"}] 385 + result2 = find_matching_entity("Some Name", entities2) 386 + # This exact-matches the name 387 + assert result2.tier == MatchTier.EXACT 388 + 389 + def test_first_word_tier(self): 390 + entities = [_entity("Javier Garcia")] 391 + result = find_matching_entity("Javier", entities) 392 + assert result.tier == MatchTier.FIRST_WORD 393 + 394 + def test_first_word_long_to_short_tier(self): 395 + entities = [_entity("Javier")] 396 + result = find_matching_entity("Javier Garcia", entities) 397 + assert result.tier == MatchTier.FIRST_WORD 398 + 399 + def test_token_subset_tier(self): 400 + entities = [_entity("Josh Jones Dilworth")] 401 + result = find_matching_entity("Jones Dilworth", entities) 402 + assert result.tier == MatchTier.TOKEN_SUBSET 403 + 404 + def test_prefix_tier(self): 405 + entities = [_entity("Christopher DeWolfe")] 406 + result = find_matching_entity("Chris DeWolfe", entities) 407 + assert result.tier == MatchTier.PREFIX 408 + 409 + def test_fuzzy_tier(self): 410 + entities = [_entity("Christopher DeWolfe")] 411 + # Close enough for fuzzy but not an exact/prefix match 412 + result = find_matching_entity("Christoph DeWolffe", entities) 413 + if result: # rapidfuzz may not be installed 414 + assert result.tier == MatchTier.FUZZY 415 + 416 + 417 + class TestHighConfidence: 418 + """Verify the is_high_confidence boundary between tiers 1-4 and 5+.""" 419 + 420 + def test_exact_is_high(self): 421 + entities = [_entity("Alice Johnson")] 422 + result = find_matching_entity("Alice Johnson", entities) 423 + assert result.is_high_confidence is True 424 + 425 + def test_case_insensitive_is_high(self): 426 + entities = [_entity("Alice Johnson")] 427 + result = find_matching_entity("alice johnson", entities) 428 + assert result.is_high_confidence is True 429 + 430 + def test_email_is_high(self): 431 + entities = [{"id": "alice", "name": "Alice", "emails": ["a@b.com"]}] 432 + result = find_matching_entity("a@b.com", entities) 433 + assert result.is_high_confidence is True 434 + 435 + def test_first_word_is_low(self): 436 + entities = [_entity("Javier Garcia")] 437 + result = find_matching_entity("Javier", entities) 438 + assert result.is_high_confidence is False 439 + 440 + def test_token_subset_is_low(self): 441 + entities = [_entity("Josh Jones Dilworth")] 442 + result = find_matching_entity("Jones Dilworth", entities) 443 + assert result.is_high_confidence is False 444 + 445 + def test_prefix_is_low(self): 446 + entities = [_entity("Christopher DeWolfe")] 447 + result = find_matching_entity("Chris DeWolfe", entities) 448 + assert result.is_high_confidence is False 449 + 450 + def test_tier_comparison(self): 451 + """MatchTier is an IntEnum — callers can compare tiers numerically.""" 452 + assert MatchTier.EXACT < MatchTier.FUZZY 453 + assert MatchTier.SLUG <= MatchTier.SLUG 454 + assert MatchTier.FIRST_WORD > MatchTier.SLUG

+4

think/entities/__init__.py

··· 75 75 76 76 # Entity matching and resolution 77 77 from think.entities.matching import ( 78 + MatchResult, 79 + MatchTier, 78 80 build_name_resolution_map, 79 81 find_entity_by_email, 80 82 find_matching_entity, ··· 154 156 "save_entities", 155 157 "update_detected_entity", 156 158 # Matching 159 + "MatchResult", 160 + "MatchTier", 157 161 "build_name_resolution_map", 158 162 "find_entity_by_email", 159 163 "find_matching_entity",

+89 -44

think/entities/matching.py

··· 4 4 """Entity matching and resolution. 5 5 6 6 This module provides entity lookup functions: 7 - - find_matching_entity: Low-level fuzzy matching per name 7 + - find_matching_entity: Low-level fuzzy matching per name (returns MatchResult) 8 8 - build_name_resolution_map: Batch name-to-entity-id resolution 9 9 - resolve_entity: High-level resolution with candidates 10 10 - validate_aka_uniqueness: Check for aka collisions 11 11 """ 12 12 13 13 import logging 14 + from enum import IntEnum 14 15 15 16 from think.entities.core import EntityDict, entity_slug 16 17 from think.entities.loading import load_entities 17 18 18 19 logger = logging.getLogger(__name__) 20 + 21 + 22 + class MatchTier(IntEnum): 23 + """Confidence tier for entity matches, ordered from highest to lowest.""" 24 + 25 + EXACT = 1 # exact name, id, or aka 26 + CASE_INSENSITIVE = 2 # case-insensitive name, id, or aka 27 + EMAIL = 3 # email address match 28 + SLUG = 4 # slugified query match against id 29 + FIRST_WORD = 5 # first-word match (bidirectional) 30 + TOKEN_SUBSET = 6 # token-subset match 31 + PREFIX = 7 # prefix-token match 32 + FUZZY = 8 # fuzzy match via rapidfuzz 33 + 34 + 35 + class MatchResult(dict): 36 + """Entity match result with confidence tier. 37 + 38 + Behaves like an EntityDict (dict) so existing callers work unchanged. 39 + Also exposes .tier for callers that need confidence information. 40 + """ 41 + 42 + tier: MatchTier 43 + 44 + def __init__(self, entity: EntityDict, tier: MatchTier): 45 + super().__init__(entity) 46 + self.tier = tier 47 + 48 + @property 49 + def is_high_confidence(self) -> bool: 50 + """True for tiers 1-4 (exact, case-insensitive, email, slug).""" 51 + return self.tier <= MatchTier.SLUG 19 52 20 53 21 54 def _token_subset_match(name_a_lower: str, name_b_lower: str) -> bool: ··· 117 150 detected_name: str, 118 151 entities: list[EntityDict], 119 152 fuzzy_threshold: int = 90, 120 - ) -> EntityDict | None: 153 + ) -> MatchResult | None: 121 154 """Find an entity matching a detected name. 122 155 123 156 Works with any list of entity dicts (journal-level or facet-attached). ··· 125 158 Uses tiered matching strategy (in order of precedence): 126 159 1. Exact name, id, or aka match 127 160 2. Case-insensitive name, id, or aka match 128 - 3. Slugified query match against id 129 - 4. First-word match (bidirectional, unambiguous only, min 3 chars) 130 - 4b. Token-subset match (unambiguous only, min 2 tokens in shorter) 131 - 4c. Prefix-token match (unambiguous only, ≥4-char prefix) 132 - 5. Fuzzy match using rapidfuzz (score >= threshold) 161 + 3. Email match 162 + 4. Slugified query match against id 163 + 5. First-word match (bidirectional, unambiguous only, min 3 chars) 164 + 6. Token-subset match (unambiguous only, min 2 tokens in shorter) 165 + 7. Prefix-token match (unambiguous only, ≥4-char prefix) 166 + 8. Fuzzy match using rapidfuzz (score >= threshold) 167 + 168 + Returns a MatchResult (dict subclass with .tier and .is_high_confidence) 169 + so existing callers that treat the result as a dict work unchanged. 170 + Tiers 1-4 are high confidence (safe for auto-merge); 5+ are lower. 133 171 134 172 Args: 135 173 detected_name: Name, id (slug), or aka to search for ··· 137 175 fuzzy_threshold: Minimum score (0-100) for fuzzy matching (default: 90) 138 176 139 177 Returns: 140 - Matched entity dict, or None if no match found 178 + MatchResult with entity data and confidence tier, or None if no match 141 179 142 180 Example: 143 181 >>> entities = [{"id": "robert_johnson", "name": "Robert Johnson", "aka": ["Bob", "Bobby"]}] 144 - >>> find_matching_entity("Bob", entities) 145 - {"id": "robert_johnson", "name": "Robert Johnson", "aka": ["Bob", "Bobby"]} 146 - >>> find_matching_entity("robert_johnson", entities) 147 - {"id": "robert_johnson", "name": "Robert Johnson", "aka": ["Bob", "Bobby"]} 182 + >>> result = find_matching_entity("Bob", entities) 183 + >>> result["id"] 184 + 'robert_johnson' 185 + >>> result.tier 186 + <MatchTier.EXACT: 1> 187 + >>> result.is_high_confidence 188 + True 148 189 """ 149 190 if not detected_name or not entities: 150 191 return None ··· 153 194 detected_slug = entity_slug(detected_name) 154 195 155 196 # Build lookup structures for efficient matching 156 - # Maps exact name/id/aka -> entity 157 - exact_map: dict[str, EntityDict] = {} 197 + # Maps exact-case name/id/aka -> entity (tier 1: exact) 198 + exact_case_map: dict[str, EntityDict] = {} 199 + # Maps lowered name/id/aka -> entity (tier 2: case-insensitive) 200 + lower_map: dict[str, EntityDict] = {} 158 201 # Maps id -> entity for slug matching 159 202 id_map: dict[str, EntityDict] = {} 160 203 # Maps lowercase first word -> list of entities (for ambiguity detection) ··· 172 215 173 216 name_lower = name.lower() 174 217 175 - # Tier 1 & 2: Exact and case-insensitive for name 176 - exact_map[name] = entity 177 - exact_map[name_lower] = entity 218 + # Tier 1: Exact-case name 219 + exact_case_map[name] = entity 220 + # Tier 2: Case-insensitive name 221 + lower_map[name_lower] = entity 178 222 179 - # Also add id to exact map (compute from name if not present) 223 + # Also add id (compute from name if not present) 180 224 if entity_id: 181 - exact_map[entity_id] = entity 225 + exact_case_map[entity_id] = entity 226 + lower_map[entity_id.lower()] = entity 182 227 id_map[entity_id] = entity 183 228 else: 184 229 # Compute slug from name for entities without id ··· 191 236 if isinstance(aka_list, list): 192 237 for aka in aka_list: 193 238 if aka: 194 - exact_map[aka] = entity 195 - exact_map[aka.lower()] = entity 239 + exact_case_map[aka] = entity 240 + lower_map[aka.lower()] = entity 196 241 197 242 # Build email lookup 198 243 entity_emails = entity.get("emails", []) ··· 201 246 if email: 202 247 email_map[email.lower()] = entity 203 248 204 - # Tier 4: First word 249 + # Tier 5: First word 205 250 first_word = name.split()[0].lower() if name else "" 206 251 if first_word and len(first_word) >= 3: 207 252 if first_word not in first_word_map: 208 253 first_word_map[first_word] = [] 209 254 first_word_map[first_word].append(entity) 210 255 211 - # Tier 5: Fuzzy candidates (name and akas) 256 + # Tier 8: Fuzzy candidates (name and akas) 212 257 fuzzy_candidates[name] = entity 213 258 if isinstance(aka_list, list): 214 259 for aka in aka_list: 215 260 if aka: 216 261 fuzzy_candidates[aka] = entity 217 262 218 - # Tier 1: Exact match (name, id, or aka) 219 - if detected_name in exact_map: 220 - return exact_map[detected_name] 263 + # Tier 1: Exact match (name, id, or aka — case-sensitive) 264 + if detected_name in exact_case_map: 265 + return MatchResult(exact_case_map[detected_name], MatchTier.EXACT) 221 266 222 267 # Tier 2: Case-insensitive match 223 - if detected_lower in exact_map: 224 - return exact_map[detected_lower] 268 + if detected_lower in lower_map: 269 + return MatchResult(lower_map[detected_lower], MatchTier.CASE_INSENSITIVE) 225 270 226 - # Tier 2.5: Email match 271 + # Tier 3: Email match 227 272 if "@" in detected_name: 228 273 email_match = email_map.get(detected_lower) 229 274 if email_match: 230 - return email_match 275 + return MatchResult(email_match, MatchTier.EMAIL) 231 276 232 - # Tier 3: Slugified query match against id 277 + # Tier 4: Slugified query match against id 233 278 if detected_slug and detected_slug in id_map: 234 - return id_map[detected_slug] 279 + return MatchResult(id_map[detected_slug], MatchTier.SLUG) 235 280 236 - # Tier 4: First-word match (bidirectional, only if unambiguous) 281 + # Tier 5: First-word match (bidirectional, only if unambiguous) 237 282 if len(detected_name) >= 3: 238 283 # Short→long: detected name IS a first word of an entity 239 284 matches = first_word_map.get(detected_lower, []) 240 285 if len(matches) == 1: 241 - return matches[0] 286 + return MatchResult(matches[0], MatchTier.FIRST_WORD) 242 287 243 288 # Long→short: detected name's first word matches a single-token entity 244 289 detected_first = detected_name.split()[0].lower() ··· 251 296 # multi-token and merely share a first word (e.g., "Person B" 252 297 # should NOT match "Person A"). 253 298 if len(matched_name.split()) == 1: 254 - return fw_matches[0] 299 + return MatchResult(fw_matches[0], MatchTier.FIRST_WORD) 255 300 256 - # Tier 4b: Token-subset match (unambiguous only) 301 + # Tier 6: Token-subset match (unambiguous only) 257 302 subset_matches = [ 258 303 e 259 304 for e in entities 260 305 if e.get("name") and _token_subset_match(detected_lower, e["name"].lower()) 261 306 ] 262 307 if len(subset_matches) == 1: 263 - return subset_matches[0] 308 + return MatchResult(subset_matches[0], MatchTier.TOKEN_SUBSET) 264 309 265 - # Tier 4c: Prefix-token match (unambiguous only) 310 + # Tier 7: Prefix-token match (unambiguous only) 266 311 prefix_matches = [ 267 312 e 268 313 for e in entities 269 314 if e.get("name") and _prefix_token_match(detected_lower, e["name"].lower()) 270 315 ] 271 316 if len(prefix_matches) == 1: 272 - return prefix_matches[0] 317 + return MatchResult(prefix_matches[0], MatchTier.PREFIX) 273 318 274 - # Tier 5: Fuzzy match 319 + # Tier 8: Fuzzy match 275 320 if len(detected_name) >= 4 and fuzzy_candidates: 276 321 try: 277 322 from rapidfuzz import fuzz, process 278 323 279 - result = process.extractOne( 324 + fuzz_result = process.extractOne( 280 325 detected_name, 281 326 fuzzy_candidates.keys(), 282 327 scorer=fuzz.token_sort_ratio, 283 328 score_cutoff=fuzzy_threshold, 284 329 ) 285 - if result: 286 - matched_str, _score, _index = result 287 - return fuzzy_candidates[matched_str] 330 + if fuzz_result: 331 + matched_str, _score, _index = fuzz_result 332 + return MatchResult(fuzzy_candidates[matched_str], MatchTier.FUZZY) 288 333 except ImportError: 289 334 # rapidfuzz not available, skip fuzzy matching 290 335 pass

Configure Feed

Configure Feed