fix(entities): remove all regex-extracted prices/dates (too noisy) and disable the regex date/price extractors

+9 -59

2 changed files

expand all

backend

electron

datastore.ts

extensions

entities

extractors

regex.js

+5 -5

backend/electron/datastore.ts

··· 1731 1731 function migrateLowQualityEntities(): void { 1732 1732 if (!db) return; 1733 1733 1734 - const MIGRATION_ID = 'remove_low_quality_entities_v2'; 1734 + const MIGRATION_ID = 'remove_low_quality_entities_v3'; 1735 1735 1736 1736 const migrationRecord = db.prepare('SELECT * FROM migrations WHERE id = ?').get(MIGRATION_ID) as { status: string } | undefined; 1737 1737 if (migrationRecord && migrationRecord.status === 'complete') return; ··· 1798 1798 } 1799 1799 } 1800 1800 1801 - // 5. All regex-extracted places — the address regex is too noisy. 1802 - // Going forward, regex places get confidence 0.6 which is below the 0.7 threshold, 1803 - // so none will be stored. Clean up all existing ones. 1804 - if (entityType === 'place' && meta.extractor === 'regex') { 1801 + // 5. All regex-extracted places, prices, and dates — these extractors are too noisy. 1802 + // Places match random numbered text, prices match currency codes in non-price contexts 1803 + // (e.g. "2025 AUD" as a price), dates pull surrounding garbage text as "context". 1804 + if ((entityType === 'place' || entityType === 'price' || entityType === 'date') && meta.extractor === 'regex') { 1805 1805 removeEntity(entity.id); 1806 1806 continue; 1807 1807 }

+4 -54

extensions/entities/extractors/regex.js

··· 208 208 }); 209 209 } 210 210 211 - // Extract dates WITH context only - standalone dates are noise. 212 - // A date must be associated with a nearby event name, deadline, or meaningful label. 213 - for (const pattern of DATE_PATTERNS) { 214 - pattern.lastIndex = 0; 215 - let dateMatch; 216 - const seenDates = new Set(); 217 - while ((dateMatch = pattern.exec(text)) !== null) { 218 - const normalized = dateMatch[0].trim(); 219 - if (seenDates.has(normalized.toLowerCase())) continue; 220 - seenDates.add(normalized.toLowerCase()); 221 - 222 - const context = extractDateContext(text, dateMatch.index); 223 - if (context) { 224 - entities.push({ 225 - name: `${context} ${normalized}`, 226 - entityType: 'date', 227 - confidence: 0.8, 228 - extractor: 'regex', 229 - attributes: { originalText: normalized, context: context }, 230 - sourceUrl: url 231 - }); 232 - } 233 - // If no context found, skip this date — it's noise 234 - } 235 - } 236 - 237 - // Extract prices WITH context only - standalone prices are noise. 238 - // A price must be associated with a nearby product/service name. 239 - PRICE_CONTEXT_RE.lastIndex = 0; 240 - let priceMatch; 241 - const seenPrices = new Set(); 242 - while ((priceMatch = PRICE_CONTEXT_RE.exec(text)) !== null) { 243 - const priceText = priceMatch[0].trim(); 244 - if (seenPrices.has(priceText)) continue; 245 - seenPrices.add(priceText); 246 - 247 - const productName = extractPriceContext(text, priceMatch.index); 248 - if (productName) { 249 - // Create a meaningful price entity: "ProductName — $X.XX" 250 - entities.push({ 251 - name: `${productName} ${priceText}`, 252 - entityType: 'price', 253 - confidence: 0.85, 254 - extractor: 'regex', 255 - attributes: { 256 - price: priceText, 257 - product: productName, 258 - originalText: priceText 259 - }, 260 - sourceUrl: url 261 - }); 262 - } 263 - // If no product context found, skip this price — it's noise 264 - } 211 + // Date and price regex extraction DISABLED — too noisy. 
212 + // The date regex pulls surrounding garbage text as "context" (e.g. sports match titles). 213 + // The price regex matches currency codes in non-price contexts (e.g. "2025 AUD" as a price). 214 + // Dates and prices from structured data (JSON-LD) are still extracted by other extractors. 265 215 266 216 // Extract street addresses 267 217 ADDRESS_RE.lastIndex = 0;

Configure Feed

Configure Feed