search for standard sites pub-search.waow.tech
search zig blog atproto
11
fork

Configure Feed

Select the types of activity you want to include in your feed.

add similarity cache, cache stats, loading indicator, and planning doc

- similarity_cache table for /similar endpoint (invalidates on doc count change)
- cache_hits/cache_misses counters in stats
- loading indicator for "related to" results in frontend
- planning doc for standard-search expansion

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

zzstoatzz e264ee4e 009ecffc

+400 -9
+1 -1
backend/src/db/mod.zig
··· 1 1 const std = @import("std"); 2 2 3 - const Client = @import("Client.zig"); 4 3 const schema = @import("schema.zig"); 5 4 const result = @import("result.zig"); 6 5 7 6 // re-exports 7 + pub const Client = @import("Client.zig"); 8 8 pub const Row = result.Row; 9 9 pub const Result = result.Result; 10 10 pub const BatchResult = result.BatchResult;
+13
backend/src/db/schema.zig
··· 98 98 \\ deleted_at INTEGER NOT NULL 99 99 \\) 100 100 , &.{}); 101 + 102 + // similarity cache: stores precomputed similar documents 103 + // invalidated when doc_count changes (new docs added/removed) 104 + try client.exec( 105 + \\CREATE TABLE IF NOT EXISTS similarity_cache ( 106 + \\ source_uri TEXT PRIMARY KEY, 107 + \\ results TEXT NOT NULL, 108 + \\ doc_count INTEGER NOT NULL, 109 + \\ computed_at INTEGER NOT NULL 110 + \\) 111 + , &.{}); 101 112 } 102 113 103 114 fn runMigrations(client: *Client) !void { ··· 105 116 client.exec("ALTER TABLE documents ADD COLUMN publication_uri TEXT", &.{}) catch {}; 106 117 client.exec("ALTER TABLE publications ADD COLUMN base_path TEXT", &.{}) catch {}; 107 118 client.exec("ALTER TABLE stats ADD COLUMN service_started_at INTEGER", &.{}) catch {}; 119 + client.exec("ALTER TABLE stats ADD COLUMN cache_hits INTEGER DEFAULT 0", &.{}) catch {}; 120 + client.exec("ALTER TABLE stats ADD COLUMN cache_misses INTEGER DEFAULT 0", &.{}) catch {}; 108 121 109 122 // vector embeddings column already added by backfill script 110 123 }
+54 -2
backend/src/search.zig
··· 3 3 const Allocator = std.mem.Allocator; 4 4 const zql = @import("zql"); 5 5 const db = @import("db/mod.zig"); 6 + const stats = @import("stats.zig"); 6 7 7 8 // JSON output type for search results 8 9 const SearchResultJson = struct { ··· 175 176 } 176 177 177 178 /// Find documents similar to a given document using vector similarity 178 - /// Uses brute-force cosine distance (no index required, ~7s for 3500 docs) 179 + /// Uses brute-force cosine distance with caching (cache invalidated when doc count changes) 179 180 pub fn findSimilar(alloc: Allocator, uri: []const u8, limit: usize) ![]const u8 { 180 181 const c = db.getClient() orelse return error.NotInitialized; 181 182 183 + // get current doc count (for cache invalidation) 184 + const doc_count = getEmbeddedDocCount(c) orelse return error.QueryFailed; 185 + 186 + // check cache 187 + if (getCachedSimilar(alloc, c, uri, doc_count)) |cached| { 188 + stats.recordCacheHit(); 189 + return cached; 190 + } 191 + stats.recordCacheMiss(); 192 + 193 + // cache miss - compute similarity 182 194 var output: std.Io.Writer.Allocating = .init(alloc); 183 195 errdefer output.deinit(); 184 196 ··· 208 220 try jw.beginArray(); 209 221 for (res.rows) |row| try jw.write(Doc.fromRow(row).toJson()); 210 222 try jw.endArray(); 211 - return try output.toOwnedSlice(); 223 + 224 + const results = try output.toOwnedSlice(); 225 + 226 + // cache the results (fire and forget) 227 + cacheSimilarResults(c, uri, results, doc_count); 228 + 229 + return results; 230 + } 231 + 232 + fn getEmbeddedDocCount(c: *db.Client) ?i64 { 233 + var res = c.query("SELECT COUNT(*) FROM documents WHERE embedding IS NOT NULL", &.{}) catch return null; 234 + defer res.deinit(); 235 + if (res.rows.len == 0) return null; 236 + return res.rows[0].int(0); 237 + } 238 + 239 + fn getCachedSimilar(alloc: Allocator, c: *db.Client, uri: []const u8, current_doc_count: i64) ?[]const u8 { 240 + var count_buf: [20]u8 = undefined; 241 + const count_str = 
std.fmt.bufPrint(&count_buf, "{d}", .{current_doc_count}) catch return null; 242 + 243 + var res = c.query( 244 + "SELECT results FROM similarity_cache WHERE source_uri = ? AND doc_count = ?", 245 + &.{ uri, count_str }, 246 + ) catch return null; 247 + defer res.deinit(); 248 + 249 + if (res.rows.len == 0) return null; 250 + return alloc.dupe(u8, res.rows[0].text(0)) catch null; 251 + } 252 + 253 + fn cacheSimilarResults(c: *db.Client, uri: []const u8, results: []const u8, doc_count: i64) void { 254 + var count_buf: [20]u8 = undefined; 255 + const count_str = std.fmt.bufPrint(&count_buf, "{d}", .{doc_count}) catch return; 256 + 257 + var ts_buf: [20]u8 = undefined; 258 + const ts_str = std.fmt.bufPrint(&ts_buf, "{d}", .{std.time.timestamp()}) catch return; 259 + 260 + c.exec( 261 + "INSERT OR REPLACE INTO similarity_cache (source_uri, results, doc_count, computed_at) VALUES (?, ?, ?, ?)", 262 + &.{ uri, results, count_str, ts_str }, 263 + ) catch {}; 212 264 } 213 265 214 266 /// Build FTS5 query with OR between terms: "cat dog" -> "cat OR dog*"
+1 -1
backend/src/server.zig
··· 141 141 var response: std.ArrayList(u8) = .{}; 142 142 defer response.deinit(alloc); 143 143 144 - try response.print(alloc, "{{\"documents\":{d},\"publications\":{d}}}", .{ db_stats.documents, db_stats.publications }); 144 + try response.print(alloc, "{{\"documents\":{d},\"publications\":{d},\"cache_hits\":{d},\"cache_misses\":{d}}}", .{ db_stats.documents, db_stats.publications, db_stats.cache_hits, db_stats.cache_misses }); 145 145 146 146 try sendJson(request, response.items); 147 147 }
+20 -4
backend/src/stats.zig
··· 41 41 searches: i64, 42 42 errors: i64, 43 43 started_at: i64, 44 + cache_hits: i64, 45 + cache_misses: i64, 44 46 }; 45 47 46 48 pub fn getStats() Stats { 47 - const c = db.getClient() orelse return .{ .documents = 0, .publications = 0, .searches = 0, .errors = 0, .started_at = 0 }; 49 + const c = db.getClient() orelse return .{ .documents = 0, .publications = 0, .searches = 0, .errors = 0, .started_at = 0, .cache_hits = 0, .cache_misses = 0 }; 48 50 49 51 var res = c.query( 50 52 \\SELECT ··· 52 54 \\ (SELECT COUNT(*) FROM publications) as pubs, 53 55 \\ (SELECT total_searches FROM stats WHERE id = 1) as searches, 54 56 \\ (SELECT total_errors FROM stats WHERE id = 1) as errors, 55 - \\ (SELECT service_started_at FROM stats WHERE id = 1) as started_at 56 - , &.{}) catch return .{ .documents = 0, .publications = 0, .searches = 0, .errors = 0, .started_at = 0 }; 57 + \\ (SELECT service_started_at FROM stats WHERE id = 1) as started_at, 58 + \\ (SELECT COALESCE(cache_hits, 0) FROM stats WHERE id = 1) as cache_hits, 59 + \\ (SELECT COALESCE(cache_misses, 0) FROM stats WHERE id = 1) as cache_misses 60 + , &.{}) catch return .{ .documents = 0, .publications = 0, .searches = 0, .errors = 0, .started_at = 0, .cache_hits = 0, .cache_misses = 0 }; 57 61 defer res.deinit(); 58 62 59 - const row = res.first() orelse return .{ .documents = 0, .publications = 0, .searches = 0, .errors = 0, .started_at = 0 }; 63 + const row = res.first() orelse return .{ .documents = 0, .publications = 0, .searches = 0, .errors = 0, .started_at = 0, .cache_hits = 0, .cache_misses = 0 }; 60 64 return .{ 61 65 .documents = row.int(0), 62 66 .publications = row.int(1), 63 67 .searches = row.int(2), 64 68 .errors = row.int(3), 65 69 .started_at = row.int(4), 70 + .cache_hits = row.int(5), 71 + .cache_misses = row.int(6), 66 72 }; 67 73 } 68 74 ··· 84 90 pub fn recordError() void { 85 91 const c = db.getClient() orelse return; 86 92 c.exec("UPDATE stats SET total_errors = total_errors + 1 WHERE 
id = 1", &.{}) catch {}; 93 + } 94 + 95 + pub fn recordCacheHit() void { 96 + const c = db.getClient() orelse return; 97 + c.exec("UPDATE stats SET cache_hits = COALESCE(cache_hits, 0) + 1 WHERE id = 1", &.{}) catch {}; 98 + } 99 + 100 + pub fn recordCacheMiss() void { 101 + const c = db.getClient() orelse return; 102 + c.exec("UPDATE stats SET cache_misses = COALESCE(cache_misses, 0) + 1 WHERE id = 1", &.{}) catch {}; 87 103 } 88 104 89 105 pub fn getPopular(alloc: Allocator, limit: usize) ![]const u8 {
+290
docs/standard-search-planning.md
··· 1 + # standard-search planning 2 + 3 + expanding leaflet-search to index all standard.site records. 4 + 5 + ## references 6 + 7 + - [standard.site](https://standard.site/) - shared lexicons for long-form publishing on ATProto 8 + - [leaflet.pub](https://leaflet.pub/) - implements `pub.leaflet.*` lexicons 9 + - [pckt.blog](https://pckt.blog/) - implements `blog.pckt.*` lexicons 10 + - [offprint.app](https://offprint.app/) - implements `app.offprint.*` lexicons (early beta) 11 + - [ATProto docs](https://atproto.com/docs) - protocol documentation 12 + 13 + ## context 14 + 15 + discussion with pckt.blog team about building global search for standard.site ecosystem. 16 + current leaflet-search is tightly coupled to `pub.leaflet.*` lexicons. 17 + 18 + ### recent work (2026-01-05) 19 + 20 + added similarity cache to improve `/similar` endpoint performance: 21 + - `similarity_cache` table stores one row of computed results per `source_uri`, along with the `doc_count` at compute time; a lookup only hits when both `source_uri` and the current `doc_count` match 22 + - cache auto-invalidates when document count changes 23 + - `/stats` endpoint now shows `cache_hits` and `cache_misses` 24 + - first request ~3s (cold), cached requests ~0.15s 25 + 26 + also added loading indicator for "related to" results in frontend. 27 + 28 + ## what we know 29 + 30 + ### standard.site lexicons 31 + 32 + two shared lexicons for long-form publishing on ATProto: 33 + - `site.standard.document` - document content and metadata 34 + - `site.standard.publication` - publication/blog metadata 35 + 36 + implementing platforms: 37 + - leaflet.pub (`pub.leaflet.*`) 38 + - pckt.blog (`blog.pckt.*`) 39 + - offprint.app (`app.offprint.*`) 40 + 41 + ### site.standard.document schema 42 + 43 + examined real records from pckt.blog. 
key fields: 44 + 45 + ``` 46 + textContent - PRE-FLATTENED TEXT FOR SEARCH (the holy grail) 47 + content - platform-specific block structure 48 + .$type - identifies platform (e.g., "blog.pckt.content") 49 + title - document title 50 + tags - array of strings 51 + site - AT-URI reference to site.standard.publication 52 + path - URL path (e.g., "/my-post-abc123") 53 + publishedAt - ISO timestamp 54 + updatedAt - ISO timestamp 55 + coverImage - blob reference 56 + ``` 57 + 58 + ### the textContent field 59 + 60 + this is huge. platforms flatten their block content into a single text field: 61 + 62 + ```json 63 + { 64 + "content": { 65 + "$type": "blog.pckt.content", 66 + "items": [ /* platform-specific blocks */ ] 67 + }, 68 + "textContent": "i have been writing a lot of atproto things in zig!..." 69 + } 70 + ``` 71 + 72 + no need to parse platform-specific blocks - just index `textContent` directly. 73 + 74 + ### platform detection 75 + 76 + derive platform from `content.$type` prefix: 77 + - `blog.pckt.content` → pckt 78 + - `pub.leaflet.content` → leaflet (TBD - need to verify) 79 + - `app.offprint.content` → offprint (TBD - need to verify) 80 + 81 + ### current leaflet-search architecture 82 + 83 + ``` 84 + ATProto firehose (via tap) 85 + 86 + tap.zig - subscribes to pub.leaflet.document/publication 87 + 88 + indexer.zig - extracts content from nested pages[].blocks[] structure 89 + 90 + turso (sqlite) - documents table + FTS5 + embeddings 91 + 92 + search.zig - FTS5 queries + vector similarity 93 + 94 + server.zig - HTTP API (/search, /similar, /stats) 95 + ``` 96 + 97 + leaflet-specific code: 98 + - tap.zig lines 10-11: hardcoded collection names 99 + - tap.zig lines 234-268: block type extraction (pub.leaflet.blocks.*) 100 + - recursive page/block traversal logic 101 + 102 + generalizable code: 103 + - database schema (FTS5, tags, stats, similarity cache) 104 + - search/similar logic 105 + - HTTP API 106 + - embedding pipeline 107 + 108 + ## proposed 
architecture for standard-search 109 + 110 + ### ingestion changes 111 + 112 + subscribe to: 113 + - `site.standard.document` 114 + - `site.standard.publication` 115 + 116 + optionally also subscribe to platform-specific collections for richer data: 117 + - `pub.leaflet.document/publication` 118 + - `blog.pckt.document/publication` (if they have these) 119 + - `app.offprint.document/publication` (if they have these) 120 + 121 + ### content extraction 122 + 123 + for `site.standard.document`: 124 + 1. use `textContent` field directly - no block parsing! 125 + 2. fall back to title + description if textContent missing 126 + 127 + for platform-specific records (if needed): 128 + - keep existing leaflet block parser 129 + - add parsers for other platforms as needed 130 + 131 + ### database changes 132 + 133 + add to documents table: 134 + - `platform` TEXT - derived from content.$type (leaflet, pckt, offprint) 135 + - `source_collection` TEXT - the actual lexicon (site.standard.document, pub.leaflet.document) 136 + - `standard_uri` TEXT - if platform-specific record, link to corresponding site.standard.document 137 + 138 + ### API changes 139 + 140 + - `/search?q=...&platform=leaflet` - optional platform filter 141 + - results include `platform` field 142 + - `/similar` works across all platforms 143 + 144 + ### naming/deployment 145 + 146 + options: 147 + 1. rename leaflet-search → standard-search (breaking change) 148 + 2. new repo/deployment, keep leaflet-search as-is 149 + 3. branch and generalize, decide naming later 150 + 151 + leaning toward option 3 for now. 
152 + 153 + ## findings from exploration 154 + 155 + ### pckt.blog - READY 156 + - writes `site.standard.document` records 157 + - has `textContent` field (pre-flattened) 158 + - `content.$type` = `blog.pckt.content` 159 + - 6+ records found on pckt.blog service account 160 + 161 + ### leaflet.pub - NOT YET MIGRATED 162 + - still using `pub.leaflet.document` only 163 + - no `site.standard.document` records found 164 + - no `textContent` field - content is in nested `pages[].blocks[]` 165 + - will need to continue parsing blocks OR wait for migration 166 + 167 + ### offprint.app - LIKELY EARLY BETA 168 + - no `site.standard.document` records found on offprint.app account 169 + - no `app.offprint.document` collection visible 170 + - website shows no example users/content 171 + - probably in early/private beta - no public records yet 172 + 173 + ### implication for architecture 174 + 175 + two paths: 176 + 177 + **path A: wait for leaflet migration** 178 + - simpler: just index `site.standard.document` with `textContent` 179 + - all platforms converge on same schema 180 + - downside: loses existing leaflet search until they migrate 181 + 182 + **path B: hybrid approach** 183 + - index `site.standard.document` (pckt, future leaflet, offprint) 184 + - ALSO index `pub.leaflet.document` with existing block parser 185 + - dedupe by URI or store both with `source_collection` indicator 186 + - more complex but maintains backwards compat 187 + 188 + leaning toward **path B** - can't lose 3500 leaflet docs. 189 + 190 + ## open questions 191 + 192 + - [x] does leaflet write site.standard.document records? **NO, not yet** 193 + - [x] does offprint write site.standard.document records? **UNKNOWN - no public content yet** 194 + - [ ] when will leaflet migrate to standard.site? 195 + - [ ] should we dedupe platform-specific vs standard records? 196 + - [ ] embeddings: regenerate for all, or use same model? 197 + 198 + ## next steps 199 + 200 + 1. 
~~verify leaflet's site.standard.document structure~~ (done - they don't have any) 201 + 2. ~~find and examine offprint records~~ (done - no public content yet) 202 + 3. decide on hybrid vs wait approach 203 + 4. consider witness cache architecture (see below) 204 + 5. design database migration 205 + 6. implement generalized tap subscriber 206 + 7. test with multi-platform data 207 + 208 + --- 209 + 210 + ## architectural consideration: witness cache 211 + 212 + [paul frazee's post on witness caches](https://bsky.app/profile/pfrazee.com/post/3lfarplxvcs2e) (2026-01-05): 213 + 214 + > I'm increasingly convinced that many Atmosphere backends start with a local "witness cache" of the repositories. 215 + > 216 + > A witness cache is a copy of the repository records, plus a timestamp of when the record was indexed (the "witness time") which you want to keep 217 + > 218 + > The key feature is: you can replay it 219 + 220 + > With local replay, you can add new tables or indexes to your backend and quickly backfill the data. If you don't have a witness cache, you would have to do backfill from the network, which is slow 221 + 222 + ### current leaflet-search architecture (no witness cache) 223 + 224 + ``` 225 + Firehose → TAP → Parse & Transform → Store DERIVED data → Discard raw record 226 + ``` 227 + 228 + we store: 229 + - `uri`, `did`, `rkey` 230 + - `title` (extracted) 231 + - `content` (flattened from blocks) 232 + - `created_at`, `publication_uri` 233 + 234 + we discard: the raw record JSON 235 + 236 + ### witness cache architecture 237 + 238 + ``` 239 + Firehose → Store RAW record + witness_time → Derive indexes on demand (replayable) 240 + ``` 241 + 242 + would store: 243 + - `uri`, `collection`, `rkey` 244 + - `raw_record` (full JSON blob) 245 + - `witness_time` (when we indexed it) 246 + 247 + then derive FTS, embeddings, etc. from local data via replay. 
248 + 249 + ### comparison 250 + 251 + | scenario | current (no cache) | with witness cache | 252 + |----------|-------------------|-------------------| 253 + | add new parser (offprint) | re-crawl network | replay local | 254 + | leaflet adds textContent | wait for new records | replay & re-extract | 255 + | fix parsing bug | re-crawl affected | replay & re-derive | 256 + | change embedding model | re-fetch content | replay local | 257 + | add new index/table | backfill from network | replay locally | 258 + 259 + ### trade-offs 260 + 261 + **storage cost:** 262 + - ~3500 docs × ~10KB avg = ~35MB (not huge) 263 + - turso free tier: 9GB, so plenty of room 264 + 265 + **complexity:** 266 + - two-phase: store raw, then derive 267 + - vs current one-phase: derive immediately 268 + 269 + **benefits for standard-search:** 270 + - could add offprint/pckt parsers and replay existing data 271 + - when leaflet migrates to standard.site, re-derive without network 272 + - embedding backfill becomes local-only (no voyage API for content fetch) 273 + 274 + ### implementation options 275 + 276 + 1. **add `raw_record TEXT` column to existing tables** 277 + - simple, backwards compatible 278 + - can migrate incrementally 279 + 280 + 2. **separate `witness_cache` table** 281 + - `(uri PRIMARY KEY, collection, raw_record, witness_time)` 282 + - cleaner separation of concerns 283 + - documents/publications tables become derived views 284 + 285 + 3. **use duckdb/clickhouse for witness cache** (paul's suggestion) 286 + - better compression for JSON blobs 287 + - good for analytics queries 288 + - adds operational complexity 289 + 290 + for our scale, option 1 or 2 with turso is probably fine.
+21 -1
site/index.html
··· 234 234 margin-bottom: 0.75rem; 235 235 } 236 236 237 + .related-loading { 238 + font-size: 11px; 239 + color: #666; 240 + margin-top: 1rem; 241 + animation: pulse 1.5s ease-in-out infinite; 242 + } 243 + 244 + @keyframes pulse { 245 + 0%, 100% { opacity: 0.4; } 246 + 50% { opacity: 0.8; } 247 + } 248 + 237 249 .related-items { 238 250 display: flex; 239 251 flex-wrap: wrap; ··· 614 626 } 615 627 616 628 async function loadRelated(topResult) { 629 + // show loading state 630 + const loadingEl = document.createElement('div'); 631 + loadingEl.className = 'related-loading'; 632 + loadingEl.textContent = 'finding related...'; 633 + resultsDiv.appendChild(loadingEl); 634 + 617 635 try { 618 636 const res = await fetch(`${API_URL}/similar?uri=${encodeURIComponent(topResult.uri)}`); 619 637 const related = await res.json(); 638 + 639 + loadingEl.remove(); 620 640 621 641 if (!related || related.length === 0) return; 622 642 ··· 645 665 646 666 resultsDiv.insertAdjacentHTML('beforeend', relatedHtml); 647 667 } catch (e) { 648 - // silently fail - related is optional 668 + loadingEl.remove(); 649 669 } 650 670 } 651 671