search for standard sites pub-search.waow.tech
search zig blog atproto
11
fork

Configure Feed

Select the types of activity you want to include in your feed.

feat: ingestion-time content dedup + date filter UI

add content_hash (wyhash of title+content) to documents table. on
ingest, skip documents where the same author already has identical
title+content under a different rkey (cross-platform publishing dedup).

frontend: add date filter (any/week/month/year) with since param,
URL state sync, and active filter bar.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

zzstoatzz 641b6a86 a3da00c9

+147 -8
+4
backend/src/db/schema.zig
··· 141 141 // documents using site.standard.* that don't match a known platform are simply "other" 142 142 client.exec("UPDATE documents SET platform = 'other' WHERE platform = 'standardsite'", &.{}) catch {}; 143 143 144 + // content_hash: used for cross-platform dedup (same author + same content = skip) 145 + client.exec("ALTER TABLE documents ADD COLUMN content_hash TEXT", &.{}) catch {}; 146 + client.exec("CREATE INDEX IF NOT EXISTS idx_documents_did_content_hash ON documents(did, content_hash)", &.{}) catch {}; 147 + 144 148 // detect platform from publication basePath (site.standard.* is a lexicon, not a platform) 145 149 // known platforms (pckt, leaflet, offprint) use site.standard.* but have distinct basePaths 146 150 client.exec(
+31 -3
backend/src/ingest/indexer.zig
··· 1 1 const std = @import("std"); 2 + const logfire = @import("logfire"); 2 3 const db = @import("../db/mod.zig"); 4 + 5 + /// Hash title+content for cross-platform dedup. 6 + /// Returns a 16-char hex string (wyhash of "title\x00content"). 7 + fn computeContentHash(title: []const u8, content: []const u8) [16]u8 { 8 + var hasher = std.hash.Wyhash.init(0); 9 + hasher.update(title); 10 + hasher.update("\x00"); 11 + hasher.update(content); 12 + const hash = hasher.final(); 13 + return std.fmt.bytesToHex(std.mem.asBytes(&hash), .lower); 14 + } 3 15 4 16 pub fn insertDocument( 5 17 uri: []const u8, ··· 28 40 c.exec("DELETE FROM documents_fts WHERE uri = ?", &.{old_uri}) catch {}; 29 41 c.exec("DELETE FROM document_tags WHERE document_uri = ?", &.{old_uri}) catch {}; 30 42 c.exec("DELETE FROM documents WHERE uri = ?", &.{old_uri}) catch {}; 43 + } 44 + } 45 + } else |_| {} 46 + 47 + // cross-platform content dedup: if same author already has a document with 48 + // identical title+content (different rkey from a different platform), skip it. 49 + const content_hash: [16]u8 = computeContentHash(title, content); 50 + if (c.query("SELECT uri FROM documents WHERE did = ? 
AND content_hash = ?", &.{ did, &content_hash })) |res| { 51 + var result = res; 52 + defer result.deinit(); 53 + if (result.first()) |row| { 54 + const existing_uri = row.text(0); 55 + if (!std.mem.eql(u8, existing_uri, uri)) { 56 + logfire.debug("indexer: skipping dupe for {s} (existing: {s})", .{ uri, existing_uri }); 57 + return; 31 58 } 32 59 } 33 60 } else |_| {} ··· 122 149 // indexed_at uses strftime to record when this row was inserted/updated in Turso 123 150 // (created_at is the document's publication date, which can be old for resynced docs) 124 151 try c.exec( 125 - \\INSERT INTO documents (uri, did, rkey, title, content, created_at, publication_uri, platform, source_collection, path, base_path, has_publication, indexed_at) 126 - \\VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, strftime('%Y-%m-%dT%H:%M:%S', 'now')) 152 + \\INSERT INTO documents (uri, did, rkey, title, content, created_at, publication_uri, platform, source_collection, path, base_path, has_publication, content_hash, indexed_at) 153 + \\VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, strftime('%Y-%m-%dT%H:%M:%S', 'now')) 127 154 \\ON CONFLICT(uri) DO UPDATE SET 128 155 \\ did = excluded.did, 129 156 \\ rkey = excluded.rkey, ··· 136 163 \\ path = excluded.path, 137 164 \\ base_path = excluded.base_path, 138 165 \\ has_publication = excluded.has_publication, 166 + \\ content_hash = excluded.content_hash, 139 167 \\ indexed_at = strftime('%Y-%m-%dT%H:%M:%S', 'now'), 140 168 \\ embedded_at = documents.embedded_at 141 169 , 142 - &.{ uri, did, rkey, title, content, created_at orelse "", pub_uri, actual_platform, source_collection, path orelse "", base_path, has_pub }, 170 + &.{ uri, did, rkey, title, content, created_at orelse "", pub_uri, actual_platform, source_collection, path orelse "", base_path, has_pub, &content_hash }, 143 171 ); 144 172 145 173 // update FTS index
+112 -5
site/index.html
··· 417 417 color: #d4956a; 418 418 } 419 419 420 + .date-filter { 421 + margin-bottom: 1rem; 422 + } 423 + 424 + .date-filter-label { 425 + font-size: 11px; 426 + color: #444; 427 + margin-bottom: 0.5rem; 428 + } 429 + 430 + .date-filter-list { 431 + display: flex; 432 + gap: 0.5rem; 433 + } 434 + 435 + .date-option { 436 + font-size: 11px; 437 + padding: 3px 8px; 438 + background: #151515; 439 + border: 1px solid #252525; 440 + border-radius: 3px; 441 + cursor: pointer; 442 + color: #777; 443 + } 444 + 445 + .date-option:hover { 446 + background: #1a1a1a; 447 + border-color: #333; 448 + color: #aaa; 449 + } 450 + 451 + .date-option.active { 452 + background: rgba(14, 165, 233, 0.2); 453 + border-color: #0ea5e9; 454 + color: #38bdf8; 455 + } 456 + 420 457 421 458 .active-filter { 422 459 display: flex; ··· 512 549 } 513 550 514 551 /* ensure minimum 44px touch targets */ 515 - .tag, .platform-option, .mode-option, .suggestion, input.tag-input { 552 + .tag, .platform-option, .mode-option, .date-option, .suggestion, input.tag-input { 516 553 min-height: 44px; 517 554 display: inline-flex; 518 555 align-items: center; ··· 571 608 } 572 609 573 610 /* tags wrap better on mobile */ 574 - .tags-list, .platform-filter-list { 611 + .tags-list, .platform-filter-list, .date-filter-list { 575 612 gap: 0.5rem; 576 613 } 577 614 ··· 590 627 591 628 /* ensure touch targets on tablets too */ 592 629 @media (hover: none) and (pointer: coarse) { 593 - .tag, .platform-option, .mode-option, .suggestion, .related-item, input.tag-input { 630 + .tag, .platform-option, .mode-option, .date-option, .suggestion, .related-item, input.tag-input { 594 631 min-height: 44px; 595 632 display: inline-flex; 596 633 align-items: center; ··· 616 653 <div id="tags" class="tags"></div> 617 654 618 655 <div id="platform-filter" class="platform-filter"></div> 656 + 657 + <div id="date-filter" class="date-filter"></div> 619 658 620 659 <div id="results" class="results"> 621 660 <div class="empty-state"> 
··· 640 679 const activeFilterDiv = document.getElementById('active-filter'); 641 680 const suggestionsDiv = document.getElementById('suggestions'); 642 681 const platformFilterDiv = document.getElementById('platform-filter'); 682 + const dateFilterDiv = document.getElementById('date-filter'); 643 683 const modeToggleDiv = document.getElementById('mode-toggle'); 644 684 let currentTag = null; 645 685 let currentPlatform = null; 686 + let currentSince = null; 646 687 let currentMode = 'keyword'; 647 688 let allTags = []; 648 689 let popularSearches = []; ··· 674 715 let searchUrl = `${API_URL}/search?q=${encodeURIComponent(query || '')}&format=v2&limit=${PAGE_SIZE}&offset=${currentOffset}`; 675 716 if (tag) searchUrl += `&tag=${encodeURIComponent(tag)}`; 676 717 if (platform) searchUrl += `&platform=${encodeURIComponent(platform)}`; 718 + if (currentSince) searchUrl += `&since=${encodeURIComponent(currentSince)}`; 677 719 if (currentMode !== 'keyword') searchUrl += `&mode=${currentMode}`; 678 720 679 721 try { ··· 978 1020 if (q) params.set('q', q); 979 1021 if (currentTag) params.set('tag', currentTag); 980 1022 if (currentPlatform) params.set('platform', currentPlatform); 1023 + if (currentSince) params.set('since', currentSince); 981 1024 if (currentMode !== 'keyword') params.set('mode', currentMode); 982 1025 const url = params.toString() ? 
`?${params}` : '/'; 983 1026 history.pushState(null, '', url); ··· 1049 1092 platformFilterDiv.innerHTML = `<div class="platform-filter-label">filter by platform:</div><div class="platform-filter-list">${html}</div>`; 1050 1093 } 1051 1094 1095 + function sinceFromPreset(preset) { 1096 + const d = new Date(); 1097 + if (preset === 'week') d.setDate(d.getDate() - 7); 1098 + else if (preset === 'month') d.setMonth(d.getMonth() - 1); 1099 + else if (preset === 'year') d.setFullYear(d.getFullYear() - 1); 1100 + else return null; 1101 + return d.toISOString().slice(0, 10); 1102 + } 1103 + 1104 + // reverse: given a since ISO date, find the closest preset 1105 + function presetFromSince(since) { 1106 + if (!since) return null; 1107 + const now = new Date(); 1108 + const d = new Date(since); 1109 + const days = Math.round((now - d) / (1000 * 60 * 60 * 24)); 1110 + if (days <= 8) return 'week'; 1111 + if (days <= 32) return 'month'; 1112 + if (days <= 370) return 'year'; 1113 + return null; 1114 + } 1115 + 1116 + let currentDatePreset = null; 1117 + 1118 + function renderDateFilter() { 1119 + const presets = [ 1120 + { id: null, label: 'any' }, 1121 + { id: 'week', label: 'week' }, 1122 + { id: 'month', label: 'month' }, 1123 + { id: 'year', label: 'year' }, 1124 + ]; 1125 + const html = presets.map(p => ` 1126 + <span class="date-option${currentDatePreset === p.id ? ' active' : ''}" onclick="setDateFilter(${p.id === null ? 
'null' : `'${p.id}'`})">${p.label}</span> 1127 + `).join(''); 1128 + dateFilterDiv.innerHTML = `<div class="date-filter-label">date:</div><div class="date-filter-list">${html}</div>`; 1129 + } 1130 + 1131 + function setDateFilter(preset) { 1132 + if (currentDatePreset === preset) return; 1133 + currentDatePreset = preset; 1134 + currentSince = sinceFromPreset(preset); 1135 + renderDateFilter(); 1136 + renderActiveFilter(); 1137 + if (queryInput.value.trim() || currentTag || currentPlatform || currentSince) { 1138 + doSearch(); 1139 + } 1140 + } 1141 + 1052 1142 function renderModeToggle() { 1053 1143 const modes = [ 1054 1144 { id: 'keyword', label: 'keyword' }, ··· 1065 1155 if (currentMode === mode) return; 1066 1156 currentMode = mode; 1067 1157 renderModeToggle(); 1068 - // hide tags/since for non-keyword modes (tpuf doesn't support them) 1158 + // hide tags for semantic mode (tpuf doesn't support them); date works for keyword+hybrid 1069 1159 tagsDiv.style.display = currentMode === 'keyword' ? '' : 'none'; 1160 + dateFilterDiv.style.display = currentMode !== 'semantic' ? 
'' : 'none'; 1070 1161 if (currentMode !== 'keyword') { 1071 1162 currentTag = null; 1072 1163 renderActiveFilter(); 1073 1164 } 1165 + if (currentMode === 'semantic') { 1166 + currentSince = null; 1167 + currentDatePreset = null; 1168 + renderDateFilter(); 1169 + renderActiveFilter(); 1170 + } 1074 1171 // trigger search if there's a query 1075 1172 if (queryInput.value.trim() || currentTag || currentPlatform) { 1076 1173 doSearch(); ··· 1078 1175 } 1079 1176 1080 1177 function renderActiveFilter() { 1081 - if (!currentTag && !currentPlatform) { 1178 + if (!currentTag && !currentPlatform && !currentSince) { 1082 1179 activeFilterDiv.innerHTML = ''; 1083 1180 return; 1084 1181 } 1085 1182 let parts = []; 1086 1183 if (currentTag) parts.push(`tag: <strong>#${escapeHtml(currentTag)}</strong>`); 1087 1184 if (currentPlatform) parts.push(`platform: <strong>${escapeHtml(currentPlatform)}</strong>`); 1185 + if (currentSince) parts.push(`since: <strong>${escapeHtml(currentSince)}</strong>`); 1088 1186 const clearActions = []; 1089 1187 if (currentTag) clearActions.push(`<span class="clear" onclick="clearTag()">× tag</span>`); 1090 1188 if (currentPlatform) clearActions.push(`<span class="clear" onclick="clearPlatform()">× platform</span>`); 1189 + if (currentSince) clearActions.push(`<span class="clear" onclick="setDateFilter(null)">× date</span>`); 1091 1190 activeFilterDiv.innerHTML = ` 1092 1191 <div class="active-filter"> 1093 1192 <span>filtering by ${parts.join(', ')} <span style="color:#666;font-size:10px">(documents only)</span></span> ··· 1186 1285 queryInput.value = params.get('q') || ''; 1187 1286 currentTag = params.get('tag') || null; 1188 1287 currentPlatform = params.get('platform') || null; 1288 + currentSince = params.get('since') || null; 1289 + currentDatePreset = presetFromSince(currentSince); 1189 1290 currentMode = params.get('mode') || 'keyword'; 1190 1291 renderActiveFilter(); 1191 1292 renderTags(); 1192 1293 renderPlatformFilter(); 1294 + 
renderDateFilter(); 1193 1295 renderModeToggle(); 1194 1296 tagsDiv.style.display = currentMode === 'keyword' ? '' : 'none'; 1297 + dateFilterDiv.style.display = currentMode !== 'semantic' ? '' : 'none'; 1195 1298 if (queryInput.value || currentTag || currentPlatform) search(queryInput.value, currentTag, currentPlatform); 1196 1299 }); 1197 1300 ··· 1200 1303 const initialQuery = initialParams.get('q'); 1201 1304 const initialTag = initialParams.get('tag'); 1202 1305 const initialPlatform = initialParams.get('platform'); 1306 + const initialSince = initialParams.get('since'); 1203 1307 const initialMode = initialParams.get('mode'); 1204 1308 if (initialQuery) queryInput.value = initialQuery; 1205 1309 if (initialTag) currentTag = initialTag; 1206 1310 if (initialPlatform) currentPlatform = initialPlatform; 1311 + if (initialSince) { currentSince = initialSince; currentDatePreset = presetFromSince(initialSince); } 1207 1312 if (initialMode) currentMode = initialMode; 1208 1313 renderActiveFilter(); 1209 1314 renderPlatformFilter(); 1315 + renderDateFilter(); 1210 1316 renderModeToggle(); 1211 1317 tagsDiv.style.display = currentMode === 'keyword' ? '' : 'none'; 1318 + dateFilterDiv.style.display = currentMode !== 'semantic' ? '' : 'none'; 1212 1319 1213 1320 if (initialQuery || initialTag || initialPlatform) { 1214 1321 search(initialQuery || '', initialTag, initialPlatform);