fix(entities): tighten address regex, add label extraction, clean up false positives

+52

backend/electron/datastore.ts

/**
 * Remove false-positive place entities from the regex extractor.
 *
 * Previously ADDRESS_RE was too loose (case-insensitive, allowed zero-length
 * words) and matched strings like "16 yarn test" as street addresses.
 * Re-validates regex-extracted places against the tightened address pattern.
 * Non-regex places (from structured data, microformats) are left untouched
 * because the query below only selects rows whose metadata extractor is 'regex'.
 *
 * The migration is recorded under `validate_place_entities_v1` and is a no-op
 * once that record has status 'complete'. Deletions are best-effort: a failure
 * on one entity is logged and the remaining entities are still processed.
 */
function migrateInvalidPlaceEntities(): void {
  if (!db) return;
  // Capture the narrowed handle once so the loop needs no non-null assertions.
  const database = db;

  const MIGRATION_ID = 'validate_place_entities_v1';

  const migrationRecord = database
    .prepare('SELECT * FROM migrations WHERE id = ?')
    .get(MIGRATION_ID) as { status: string } | undefined;
  if (migrationRecord && migrationRecord.status === 'complete') return;

  // Tightened ADDRESS_RE: case-sensitive suffixes, 2+ char words, mandatory whitespace.
  // NOTE(review): this duplicates the street-address core of ADDRESS_RE in
  // features/entities/extractors/regex.js — keep the two in sync if either changes.
  // The pattern is unanchored, so names of the form "<label>, <address>" still pass.
  const validAddressPattern = /\b\d{1,5}\s+(?:[A-Z][a-zA-Z]+\s+){1,4}(?:Street|St|Avenue|Ave|Boulevard|Blvd|Drive|Dr|Road|Rd|Lane|Ln|Way|Court|Ct|Place|Pl|Circle|Cir|Trail|Trl|Parkway|Pkwy|Highway|Hwy)\.?\b/;

  const regexPlaceEntities = database.prepare(`
    SELECT id, content, metadata FROM items
    WHERE type = 'entity'
      AND deletedAt = 0
      AND json_extract(metadata, '$.entityType') = 'place'
      AND json_extract(metadata, '$.extractor') = 'regex'
  `).all() as { id: string; content: string; metadata: string }[];

  // Prepare the delete statements once instead of three times per entity.
  const deleteEvents = database.prepare('DELETE FROM item_events WHERE itemId = ?');
  const deleteTags = database.prepare('DELETE FROM item_tags WHERE itemId = ?');
  const deleteItem = database.prepare('DELETE FROM items WHERE id = ?');

  let removedCount = 0;

  for (const entity of regexPlaceEntities) {
    const name = (entity.content || '').trim();

    // Keep if it matches the tightened address pattern
    if (validAddressPattern.test(name)) continue;

    // Doesn't match — remove as false positive from old loose regex.
    // Dependent rows go first, then the item itself.
    try {
      deleteEvents.run(entity.id);
      deleteTags.run(entity.id);
      deleteItem.run(entity.id);
      removedCount++;
    } catch (error) {
      console.error(`Failed to remove invalid place entity ${entity.id}:`, error);
    }
  }

  database
    .prepare('INSERT OR REPLACE INTO migrations (id, status, completedAt) VALUES (?, ?, ?)')
    .run(MIGRATION_ID, 'complete', Date.now());
  if (removedCount > 0) {
    console.log(`[datastore] Place entity validation: removed ${removedCount} false-positive regex-extracted place entities`);
  }
}

+174

backend/electron/entity-frecency.test.ts

/**
 * Run the regex extractor over `text` and keep only the place entities.
 * Extractor output is untyped JS, so entities stay `any` like the rest of this file.
 */
const placesFrom = (text: string): any[] =>
  extractRegexEntities(text, 'https://example.com').filter((e: any) => e.entityType === 'place');

// ─── Address/place: true positives SHOULD be detected ────────────

describe('addresses: valid street addresses SHOULD be detected as places', () => {
  // [input text, address quoted in the test title and failure message]
  const simpleAddresses = [
    ['Our office is at 123 Main Street', '123 Main Street'],
    ['Located at 456 Oak Avenue', '456 Oak Avenue'],
    ['Visit 789 Elm Boulevard', '789 Elm Boulevard'],
    ['The address is 1600 Pennsylvania Ave', '1600 Pennsylvania Ave'],
    ['Find us at 42 Wallaby Way', '42 Wallaby Way'],
    ['Headquarters: 100 Technology Drive, Suite 200', '100 Technology Drive, Suite 200'],
  ] as const;

  for (const [text, address] of simpleAddresses) {
    it(`should detect "${address}"`, () => {
      assert.ok(placesFrom(text).length >= 1, `"${address}" should be detected`);
    });
  }

  it('should detect full address with city, state, zip', () => {
    const places = placesFrom('Leavening Heaven, 123 Main St, Dorktown, IA 52001');
    assert.ok(places.length >= 1, 'Full address should be detected');
    assert.ok(places[0].attributes.address.includes('Dorktown'), 'Should include city');
    assert.ok(places[0].attributes.address.includes('IA'), 'Should include state');
    // strictEqual reports the actual label on failure, unlike assert.ok(a === b).
    assert.strictEqual(places[0].attributes.label, 'Leavening Heaven', 'Should extract label');
  });

  it('should detect address with city and state but no zip', () => {
    const places = placesFrom('456 Oak Avenue, Portland, OR');
    assert.ok(places.length >= 1, 'Address with city/state should be detected');
    assert.ok(places[0].attributes.address.includes('Portland'), 'Should include city');
    assert.ok(places[0].attributes.address.includes('OR'), 'Should include state');
  });

  it('should detect address with zip+4', () => {
    const places = placesFrom('789 Elm Boulevard, Austin, TX 78701-1234');
    assert.ok(places.length >= 1, 'Address with zip+4 should be detected');
    assert.ok(places[0].attributes.address.includes('78701-1234'), 'Should include full zip');
  });
});

// ─── Address/place: false positives should NOT be detected ───────

describe('addresses: non-address text should NOT be detected as places', () => {
  const nonAddresses = [
    '16 yarn test',
    '9 yarn test',
    '90 npx playwright test',
    '5 Install Dependencies',
    '10 Create Account',
    '2 Run Tests',
  ];

  for (const text of nonAddresses) {
    it(`should NOT detect "${text}" as a place`, () => {
      assert.strictEqual(placesFrom(text).length, 0, `"${text}" should not be a place`);
    });
  }

  it('should NOT detect numbered list items as places', () => {
    assert.strictEqual(
      placesFrom('3 Configure Settings').length,
      0,
      'Numbered list item should not be a place'
    );
  });
});

// ─── Address/place: label/name context extraction ────────────────

describe('addresses: label/name context extraction', () => {
  it('should extract label from text before address (newline-separated)', () => {
    const places = placesFrom('Madison Square Garden\n4 Pennsylvania Ave');
    assert.ok(places.length >= 1, 'Should detect the address');
    assert.ok(places[0].attributes.label, 'Should have a label');
    assert.ok(places[0].attributes.label.includes('Madison Square Garden'));
    assert.ok(places[0].name.includes('Madison Square Garden'));
  });

  it('should extract label from "at" pattern', () => {
    const places = placesFrom("Joe's Pizza at 123 Doughy Ln");
    assert.ok(places.length >= 1, 'Should detect the address');
    assert.ok(places[0].attributes.label, 'Should have a label');
    assert.ok(places[0].attributes.label.includes("Joe's Pizza"));
  });

  it('should store address separately in attributes', () => {
    const places = placesFrom('Acme Corp\n456 Oak Avenue');
    assert.ok(places.length >= 1);
    assert.ok(places[0].attributes.address.includes('456 Oak Avenue'));
    assert.strictEqual(places[0].attributes.label, 'Acme Corp');
  });

  it('should have no label when no context precedes address', () => {
    const places = placesFrom('123 Main Street');
    assert.ok(places.length >= 1);
    assert.strictEqual(places[0].attributes.label, undefined);
    assert.strictEqual(places[0].name, '123 Main Street');
  });
});

+60 -8

features/entities/extractors/regex.js

··· 32 32 const PRICE_CONTEXT_RE = /(?:\$|EUR|GBP|USD|CAD|AUD|JPY|CHF)\s?[\d,]+(?:\.\d{2})?|\d+(?:,\d{3})*(?:\.\d{2})?\s?(?:USD|EUR|GBP|CAD|AUD|JPY|CHF)/gi; 33 33 34 34 // US/international street address pattern 35 - // Matches patterns like "123 Main St", "456 Oak Avenue, Suite 200" 36 - const ADDRESS_RE = /\b\d{1,5}\s+(?:[A-Z][a-zA-Z]*\s*){1,4}(?:Street|St|Avenue|Ave|Boulevard|Blvd|Drive|Dr|Road|Rd|Lane|Ln|Way|Court|Ct|Place|Pl|Circle|Cir|Trail|Trl|Parkway|Pkwy|Highway|Hwy)\.?(?:\s*,?\s*(?:Suite|Ste|Apt|Unit|#)\s*\w+)?\b/gi; 35 + // Matches patterns like "123 Main St", "456 Oak Avenue, Suite 200", "123 Main St, Portland, OR 97201" 36 + // Case-sensitive: street suffixes must be capitalized (prevents "test" matching "St") 37 + // Requires 2+ char words with mandatory whitespace between them 38 + // Optional tail: Suite/Apt, then City, State (2-letter), ZIP 39 + const ADDRESS_RE = /\b\d{1,5}\s+(?:[A-Z][a-zA-Z]+\s+){1,4}(?:Street|St|Avenue|Ave|Boulevard|Blvd|Drive|Dr|Road|Rd|Lane|Ln|Way|Court|Ct|Place|Pl|Circle|Cir|Trail|Trl|Parkway|Pkwy|Highway|Hwy)\.?\b(?:\s*,?\s*(?:Suite|Ste|Apt|Unit|#)\s*\w+)?(?:\s*,\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)?)?(?:\s*,\s*[A-Z]{2})?(?:\s+\d{5}(?:-\d{4})?)?\b/g; 37 40 38 41 // Tracking number patterns 39 42 const TRACKING_PATTERNS = [ ··· 173 176 } 174 177 175 178 /** 179 + * Extract the name/label associated with a street address. 180 + * Looks at the text before the address (up to 100 chars) for a business name, 181 + * person name, or venue title. 
Common patterns: 182 + * - "Joe's Pizza\n123 Main St" (newline-separated) 183 + * - "Madison Square Garden, 4 Pennsylvania Plaza" 184 + * - "Ship to: John Doe 456 Oak Ave" 185 + * @param {string} text - Full text 186 + * @param {number} matchIndex - Index of the address match in text 187 + * @returns {string|null} Name/label or null 188 + */ 189 + function extractAddressContext(text, matchIndex) { 190 + const beforeStart = Math.max(0, matchIndex - 100); 191 + const before = text.slice(beforeStart, matchIndex).trim(); 192 + 193 + // Split by newlines, bullets, pipes — the label is typically on the line above or same line 194 + const segments = before.split(/[\n\r|•·–—]/).filter(s => s.trim().length > 0); 195 + const lastSegment = (segments.length > 0 ? segments[segments.length - 1] : '').trim(); 196 + 197 + if (!lastSegment || lastSegment.length < 2) return null; 198 + 199 + // Remove trailing connector words ("at", "located at", "ship to:", etc.) 200 + const cleaned = lastSegment 201 + .replace(/\s*\b(?:at|located\s+at|ship\s+to|deliver\s+to|send\s+to|mail\s+to|address|located)\s*:?\s*$/i, '') 202 + .replace(/\s*[-:,]\s*$/, '') 203 + .trim(); 204 + 205 + if (!cleaned || cleaned.length < 2) return null; 206 + if (/^[\d\s.,;:!?$%/\-]+$/.test(cleaned)) return null; 207 + 208 + // Reject generic noise 209 + const noiseWords = /^(?:the|a|an|and|or|for|to|in|on|at|is|it|our|your|home|menu|contact|address|location|directions|map|visit|find)\s*$/i; 210 + if (noiseWords.test(cleaned)) return null; 211 + 212 + // Take up to ~60 chars, trim to word boundary 213 + let label = cleaned.slice(-60).trim(); 214 + if (cleaned.length > 60) { 215 + const spaceIdx = label.indexOf(' '); 216 + if (spaceIdx > 0) label = label.slice(spaceIdx + 1); 217 + } 218 + 219 + return label || null; 220 + } 221 + 222 + /** 176 223 * Extract person names from text using lightweight NER. 
177 224 * Scans for capitalized word sequences (2-3 words) where the first word 178 225 * matches a known first name from the name database. ··· 285 332 // The price regex matches currency codes in non-price contexts (e.g. "2025 AUD" as a price). 286 333 // Dates and prices from structured data (JSON-LD) are still extracted by other extractors. 287 334 288 - // Extract street addresses 335 + // Extract street addresses (with optional name/label context) 289 336 ADDRESS_RE.lastIndex = 0; 290 - const addresses = text.match(ADDRESS_RE) || []; 291 337 const seenAddresses = new Set(); 292 - for (const addr of addresses) { 293 - const normalized = addr.trim().replace(/\s+/g, ' '); 338 + let addrMatch; 339 + while ((addrMatch = ADDRESS_RE.exec(text)) !== null) { 340 + const normalized = addrMatch[0].trim().replace(/\s+/g, ' '); 294 341 const key = normalized.toLowerCase(); 295 342 if (seenAddresses.has(key)) continue; 296 343 seenAddresses.add(key); 344 + 345 + const label = extractAddressContext(text, addrMatch.index); 346 + const attrs = { address: normalized }; 347 + if (label) attrs.label = label; 348 + 297 349 entities.push({ 298 - name: normalized, 350 + name: label ? `${label}, ${normalized}` : normalized, 299 351 entityType: 'place', 300 352 confidence: 0.75, 301 353 extractor: 'regex', 302 - attributes: { address: normalized }, 354 + attributes: attrs, 303 355 sourceUrl: url 304 356 }); 305 357 }

+1 -1

features/entities/name-validator.js

··· 298 298 } 299 299 300 300 // --- Other types: minimal validation --- 301 - // For events, places, products, creative works — just check for URL/path 301 + // For events, products, creative works — just check for URL/path 302 302 return { valid: true }; 303 303 } 304 304

Configure Feed

Configure Feed