···17311731function migrateLowQualityEntities(): void {
17321732 if (!db) return;
1733173317341734- const MIGRATION_ID = 'remove_low_quality_entities_v2';
17341734+ const MIGRATION_ID = 'remove_low_quality_entities_v3';
1735173517361736 const migrationRecord = db.prepare('SELECT * FROM migrations WHERE id = ?').get(MIGRATION_ID) as { status: string } | undefined;
17371737 if (migrationRecord && migrationRecord.status === 'complete') return;
···17981798 }
17991799 }
1800180018011801- // 5. All regex-extracted places — the address regex is too noisy.
18021802- // Going forward, regex places get confidence 0.6 which is below the 0.7 threshold,
18031803- // so none will be stored. Clean up all existing ones.
18041804- if (entityType === 'place' && meta.extractor === 'regex') {
18011801+ // 5. All regex-extracted places, prices, and dates — these extractors are too noisy.
18021802+ // Places match random numbered text, prices match currency codes in non-price contexts
18031803+ // (e.g. "2025 AUD" as a price), dates pull surrounding garbage text as "context".
18041804+ if ((entityType === 'place' || entityType === 'price' || entityType === 'date') && meta.extractor === 'regex') {
18051805 removeEntity(entity.id);
18061806 continue;
18071807 }
+4-54
extensions/entities/extractors/regex.js
···208208 });
209209 }
210210211211- // Extract dates WITH context only - standalone dates are noise.
212212- // A date must be associated with a nearby event name, deadline, or meaningful label.
213213- for (const pattern of DATE_PATTERNS) {
214214- pattern.lastIndex = 0;
215215- let dateMatch;
216216- const seenDates = new Set();
217217- while ((dateMatch = pattern.exec(text)) !== null) {
218218- const normalized = dateMatch[0].trim();
219219- if (seenDates.has(normalized.toLowerCase())) continue;
220220- seenDates.add(normalized.toLowerCase());
221221-222222- const context = extractDateContext(text, dateMatch.index);
223223- if (context) {
224224- entities.push({
225225- name: `${context} ${normalized}`,
226226- entityType: 'date',
227227- confidence: 0.8,
228228- extractor: 'regex',
229229- attributes: { originalText: normalized, context: context },
230230- sourceUrl: url
231231- });
232232- }
233233- // If no context found, skip this date — it's noise
234234- }
235235- }
236236-237237- // Extract prices WITH context only - standalone prices are noise.
238238- // A price must be associated with a nearby product/service name.
239239- PRICE_CONTEXT_RE.lastIndex = 0;
240240- let priceMatch;
241241- const seenPrices = new Set();
242242- while ((priceMatch = PRICE_CONTEXT_RE.exec(text)) !== null) {
243243- const priceText = priceMatch[0].trim();
244244- if (seenPrices.has(priceText)) continue;
245245- seenPrices.add(priceText);
246246-247247- const productName = extractPriceContext(text, priceMatch.index);
248248- if (productName) {
249249- // Create a meaningful price entity: "ProductName — $X.XX"
250250- entities.push({
251251- name: `${productName} ${priceText}`,
252252- entityType: 'price',
253253- confidence: 0.85,
254254- extractor: 'regex',
255255- attributes: {
256256- price: priceText,
257257- product: productName,
258258- originalText: priceText
259259- },
260260- sourceUrl: url
261261- });
262262- }
263263- // If no product context found, skip this price — it's noise
264264- }
211211+ // Date and price regex extraction DISABLED — too noisy.
212212+ // The date regex pulls surrounding garbage text as "context" (e.g. sports match titles).
213213+ // The price regex matches currency codes in non-price contexts (e.g. "2025 AUD" as a price).
214214+ // Dates and prices from structured data (JSON-LD) are still extracted by other extractors.
265215266216 // Extract street addresses
267217 ADDRESS_RE.lastIndex = 0;