add similarity cache, cache stats, loading indicator, and planning doc

+1 -1

backend/src/db/mod.zig

··· 1 1 const std = @import("std"); 2 2 3 - const Client = @import("Client.zig"); 4 3 const schema = @import("schema.zig"); 5 4 const result = @import("result.zig"); 6 5 7 6 // re-exports 7 + pub const Client = @import("Client.zig"); 8 8 pub const Row = result.Row; 9 9 pub const Result = result.Result; 10 10 pub const BatchResult = result.BatchResult;

+13

backend/src/db/schema.zig

··· 98 98 \\ deleted_at INTEGER NOT NULL 99 99 \\) 100 100 , &.{}); 101 + 102 + // similarity cache: stores precomputed similar documents 103 + // invalidated when doc_count changes (new docs added/removed) 104 + try client.exec( 105 + \\CREATE TABLE IF NOT EXISTS similarity_cache ( 106 + \\ source_uri TEXT PRIMARY KEY, 107 + \\ results TEXT NOT NULL, 108 + \\ doc_count INTEGER NOT NULL, 109 + \\ computed_at INTEGER NOT NULL 110 + \\) 111 + , &.{}); 101 112 } 102 113 103 114 fn runMigrations(client: *Client) !void { ··· 105 116 client.exec("ALTER TABLE documents ADD COLUMN publication_uri TEXT", &.{}) catch {}; 106 117 client.exec("ALTER TABLE publications ADD COLUMN base_path TEXT", &.{}) catch {}; 107 118 client.exec("ALTER TABLE stats ADD COLUMN service_started_at INTEGER", &.{}) catch {}; 119 + client.exec("ALTER TABLE stats ADD COLUMN cache_hits INTEGER DEFAULT 0", &.{}) catch {}; 120 + client.exec("ALTER TABLE stats ADD COLUMN cache_misses INTEGER DEFAULT 0", &.{}) catch {}; 108 121 109 122 // vector embeddings column already added by backfill script 110 123 }

+54 -2

backend/src/search.zig

··· 3 3 const Allocator = std.mem.Allocator; 4 4 const zql = @import("zql"); 5 5 const db = @import("db/mod.zig"); 6 + const stats = @import("stats.zig"); 6 7 7 8 // JSON output type for search results 8 9 const SearchResultJson = struct { ··· 175 176 } 176 177 177 178 /// Find documents similar to a given document using vector similarity 178 - /// Uses brute-force cosine distance (no index required, ~7s for 3500 docs) 179 + /// Uses brute-force cosine distance with caching (cache invalidated when doc count changes) 179 180 pub fn findSimilar(alloc: Allocator, uri: []const u8, limit: usize) ![]const u8 { 180 181 const c = db.getClient() orelse return error.NotInitialized; 181 182 183 + // get current doc count (for cache invalidation) 184 + const doc_count = getEmbeddedDocCount(c) orelse return error.QueryFailed; 185 + 186 + // check cache 187 + if (getCachedSimilar(alloc, c, uri, doc_count)) |cached| { 188 + stats.recordCacheHit(); 189 + return cached; 190 + } 191 + stats.recordCacheMiss(); 192 + 193 + // cache miss - compute similarity 182 194 var output: std.Io.Writer.Allocating = .init(alloc); 183 195 errdefer output.deinit(); 184 196 ··· 208 220 try jw.beginArray(); 209 221 for (res.rows) |row| try jw.write(Doc.fromRow(row).toJson()); 210 222 try jw.endArray(); 211 - return try output.toOwnedSlice(); 223 + 224 + const results = try output.toOwnedSlice(); 225 + 226 + // cache the results (fire and forget) 227 + cacheSimilarResults(c, uri, results, doc_count); 228 + 229 + return results; 230 + } 231 + 232 + fn getEmbeddedDocCount(c: *db.Client) ?i64 { 233 + var res = c.query("SELECT COUNT(*) FROM documents WHERE embedding IS NOT NULL", &.{}) catch return null; 234 + defer res.deinit(); 235 + if (res.rows.len == 0) return null; 236 + return res.rows[0].int(0); 237 + } 238 + 239 + fn getCachedSimilar(alloc: Allocator, c: *db.Client, uri: []const u8, current_doc_count: i64) ?[]const u8 { 240 + var count_buf: [20]u8 = undefined; 241 + const count_str = 
std.fmt.bufPrint(&count_buf, "{d}", .{current_doc_count}) catch return null; 242 + 243 + var res = c.query( 244 + "SELECT results FROM similarity_cache WHERE source_uri = ? AND doc_count = ?", 245 + &.{ uri, count_str }, 246 + ) catch return null; 247 + defer res.deinit(); 248 + 249 + if (res.rows.len == 0) return null; 250 + return alloc.dupe(u8, res.rows[0].text(0)) catch null; 251 + } 252 + 253 + fn cacheSimilarResults(c: *db.Client, uri: []const u8, results: []const u8, doc_count: i64) void { 254 + var count_buf: [20]u8 = undefined; 255 + const count_str = std.fmt.bufPrint(&count_buf, "{d}", .{doc_count}) catch return; 256 + 257 + var ts_buf: [20]u8 = undefined; 258 + const ts_str = std.fmt.bufPrint(&ts_buf, "{d}", .{std.time.timestamp()}) catch return; 259 + 260 + c.exec( 261 + "INSERT OR REPLACE INTO similarity_cache (source_uri, results, doc_count, computed_at) VALUES (?, ?, ?, ?)", 262 + &.{ uri, results, count_str, ts_str }, 263 + ) catch {}; 212 264 } 213 265 214 266 /// Build FTS5 query with OR between terms: "cat dog" -> "cat OR dog*"

+1 -1

backend/src/server.zig

··· 141 141 var response: std.ArrayList(u8) = .{}; 142 142 defer response.deinit(alloc); 143 143 144 - try response.print(alloc, "{{\"documents\":{d},\"publications\":{d}}}", .{ db_stats.documents, db_stats.publications }); 144 + try response.print(alloc, "{{\"documents\":{d},\"publications\":{d},\"cache_hits\":{d},\"cache_misses\":{d}}}", .{ db_stats.documents, db_stats.publications, db_stats.cache_hits, db_stats.cache_misses }); 145 145 146 146 try sendJson(request, response.items); 147 147 }

+20 -4

backend/src/stats.zig

··· 41 41 searches: i64, 42 42 errors: i64, 43 43 started_at: i64, 44 + cache_hits: i64, 45 + cache_misses: i64, 44 46 }; 45 47 46 48 pub fn getStats() Stats { 47 - const c = db.getClient() orelse return .{ .documents = 0, .publications = 0, .searches = 0, .errors = 0, .started_at = 0 }; 49 + const c = db.getClient() orelse return .{ .documents = 0, .publications = 0, .searches = 0, .errors = 0, .started_at = 0, .cache_hits = 0, .cache_misses = 0 }; 48 50 49 51 var res = c.query( 50 52 \\SELECT ··· 52 54 \\ (SELECT COUNT(*) FROM publications) as pubs, 53 55 \\ (SELECT total_searches FROM stats WHERE id = 1) as searches, 54 56 \\ (SELECT total_errors FROM stats WHERE id = 1) as errors, 55 - \\ (SELECT service_started_at FROM stats WHERE id = 1) as started_at 56 - , &.{}) catch return .{ .documents = 0, .publications = 0, .searches = 0, .errors = 0, .started_at = 0 }; 57 + \\ (SELECT service_started_at FROM stats WHERE id = 1) as started_at, 58 + \\ (SELECT COALESCE(cache_hits, 0) FROM stats WHERE id = 1) as cache_hits, 59 + \\ (SELECT COALESCE(cache_misses, 0) FROM stats WHERE id = 1) as cache_misses 60 + , &.{}) catch return .{ .documents = 0, .publications = 0, .searches = 0, .errors = 0, .started_at = 0, .cache_hits = 0, .cache_misses = 0 }; 57 61 defer res.deinit(); 58 62 59 - const row = res.first() orelse return .{ .documents = 0, .publications = 0, .searches = 0, .errors = 0, .started_at = 0 }; 63 + const row = res.first() orelse return .{ .documents = 0, .publications = 0, .searches = 0, .errors = 0, .started_at = 0, .cache_hits = 0, .cache_misses = 0 }; 60 64 return .{ 61 65 .documents = row.int(0), 62 66 .publications = row.int(1), 63 67 .searches = row.int(2), 64 68 .errors = row.int(3), 65 69 .started_at = row.int(4), 70 + .cache_hits = row.int(5), 71 + .cache_misses = row.int(6), 66 72 }; 67 73 } 68 74 ··· 84 90 pub fn recordError() void { 85 91 const c = db.getClient() orelse return; 86 92 c.exec("UPDATE stats SET total_errors = total_errors + 1 WHERE 
id = 1", &.{}) catch {}; 93 + } 94 + 95 + pub fn recordCacheHit() void { 96 + const c = db.getClient() orelse return; 97 + c.exec("UPDATE stats SET cache_hits = COALESCE(cache_hits, 0) + 1 WHERE id = 1", &.{}) catch {}; 98 + } 99 + 100 + pub fn recordCacheMiss() void { 101 + const c = db.getClient() orelse return; 102 + c.exec("UPDATE stats SET cache_misses = COALESCE(cache_misses, 0) + 1 WHERE id = 1", &.{}) catch {}; 87 103 } 88 104 89 105 pub fn getPopular(alloc: Allocator, limit: usize) ![]const u8 {

+290

docs/standard-search-planning.md

··· 1 + # standard-search planning 2 + 3 + expanding leaflet-search to index all standard.site records. 4 + 5 + ## references 6 + 7 + - [standard.site](https://standard.site/) - shared lexicons for long-form publishing on ATProto 8 + - [leaflet.pub](https://leaflet.pub/) - implements `pub.leaflet.*` lexicons 9 + - [pckt.blog](https://pckt.blog/) - implements `blog.pckt.*` lexicons 10 + - [offprint.app](https://offprint.app/) - implements `app.offprint.*` lexicons (early beta) 11 + - [ATProto docs](https://atproto.com/docs) - protocol documentation 12 + 13 + ## context 14 + 15 + discussion with pckt.blog team about building global search for standard.site ecosystem. 16 + current leaflet-search is tightly coupled to `pub.leaflet.*` lexicons. 17 + 18 + ### recent work (2026-01-05) 19 + 20 + added similarity cache to improve `/similar` endpoint performance: 21 + - `similarity_cache` table stores one row of computed results per `source_uri`, tagged with the `doc_count` at compute time 22 + - a cached row is served only while its `doc_count` equals the current count of documents with embeddings, so the cache auto-invalidates when that count changes 23 + - `/stats` endpoint now shows `cache_hits` and `cache_misses` 24 + - first request ~3s (cold), cached requests ~0.15s 25 + 26 + also added loading indicator for "related to" results in frontend. 27 + 28 + ## what we know 29 + 30 + ### standard.site lexicons 31 + 32 + two shared lexicons for long-form publishing on ATProto: 33 + - `site.standard.document` - document content and metadata 34 + - `site.standard.publication` - publication/blog metadata 35 + 36 + implementing platforms: 37 + - leaflet.pub (`pub.leaflet.*`) 38 + - pckt.blog (`blog.pckt.*`) 39 + - offprint.app (`app.offprint.*`) 40 + 41 + ### site.standard.document schema 42 + 43 + examined real records from pckt.blog. 
key fields: 44 + 45 + ``` 46 + textContent - PRE-FLATTENED TEXT FOR SEARCH (the holy grail) 47 + content - platform-specific block structure 48 + .$type - identifies platform (e.g., "blog.pckt.content") 49 + title - document title 50 + tags - array of strings 51 + site - AT-URI reference to site.standard.publication 52 + path - URL path (e.g., "/my-post-abc123") 53 + publishedAt - ISO timestamp 54 + updatedAt - ISO timestamp 55 + coverImage - blob reference 56 + ``` 57 + 58 + ### the textContent field 59 + 60 + this is huge. platforms flatten their block content into a single text field: 61 + 62 + ```json 63 + { 64 + "content": { 65 + "$type": "blog.pckt.content", 66 + "items": [ /* platform-specific blocks */ ] 67 + }, 68 + "textContent": "i have been writing a lot of atproto things in zig!..." 69 + } 70 + ``` 71 + 72 + no need to parse platform-specific blocks - just index `textContent` directly. 73 + 74 + ### platform detection 75 + 76 + derive platform from `content.$type` prefix: 77 + - `blog.pckt.content` → pckt 78 + - `pub.leaflet.content` → leaflet (TBD - need to verify) 79 + - `app.offprint.content` → offprint (TBD - need to verify) 80 + 81 + ### current leaflet-search architecture 82 + 83 + ``` 84 + ATProto firehose (via tap) 85 + ↓ 86 + tap.zig - subscribes to pub.leaflet.document/publication 87 + ↓ 88 + indexer.zig - extracts content from nested pages[].blocks[] structure 89 + ↓ 90 + turso (sqlite) - documents table + FTS5 + embeddings 91 + ↓ 92 + search.zig - FTS5 queries + vector similarity 93 + ↓ 94 + server.zig - HTTP API (/search, /similar, /stats) 95 + ``` 96 + 97 + leaflet-specific code: 98 + - tap.zig lines 10-11: hardcoded collection names 99 + - tap.zig lines 234-268: block type extraction (pub.leaflet.blocks.*) 100 + - recursive page/block traversal logic 101 + 102 + generalizable code: 103 + - database schema (FTS5, tags, stats, similarity cache) 104 + - search/similar logic 105 + - HTTP API 106 + - embedding pipeline 107 + 108 + ## proposed 
architecture for standard-search 109 + 110 + ### ingestion changes 111 + 112 + subscribe to: 113 + - `site.standard.document` 114 + - `site.standard.publication` 115 + 116 + optionally also subscribe to platform-specific collections for richer data: 117 + - `pub.leaflet.document/publication` 118 + - `blog.pckt.document/publication` (if they have these) 119 + - `app.offprint.document/publication` (if they have these) 120 + 121 + ### content extraction 122 + 123 + for `site.standard.document`: 124 + 1. use `textContent` field directly - no block parsing! 125 + 2. fall back to title + description if textContent missing 126 + 127 + for platform-specific records (if needed): 128 + - keep existing leaflet block parser 129 + - add parsers for other platforms as needed 130 + 131 + ### database changes 132 + 133 + add to documents table: 134 + - `platform` TEXT - derived from content.$type (leaflet, pckt, offprint) 135 + - `source_collection` TEXT - the actual lexicon (site.standard.document, pub.leaflet.document) 136 + - `standard_uri` TEXT - if platform-specific record, link to corresponding site.standard.document 137 + 138 + ### API changes 139 + 140 + - `/search?q=...&platform=leaflet` - optional platform filter 141 + - results include `platform` field 142 + - `/similar` works across all platforms 143 + 144 + ### naming/deployment 145 + 146 + options: 147 + 1. rename leaflet-search → standard-search (breaking change) 148 + 2. new repo/deployment, keep leaflet-search as-is 149 + 3. branch and generalize, decide naming later 150 + 151 + leaning toward option 3 for now. 
152 + 153 + ## findings from exploration 154 + 155 + ### pckt.blog - READY 156 + - writes `site.standard.document` records 157 + - has `textContent` field (pre-flattened) 158 + - `content.$type` = `blog.pckt.content` 159 + - 6+ records found on pckt.blog service account 160 + 161 + ### leaflet.pub - NOT YET MIGRATED 162 + - still using `pub.leaflet.document` only 163 + - no `site.standard.document` records found 164 + - no `textContent` field - content is in nested `pages[].blocks[]` 165 + - will need to continue parsing blocks OR wait for migration 166 + 167 + ### offprint.app - LIKELY EARLY BETA 168 + - no `site.standard.document` records found on offprint.app account 169 + - no `app.offprint.document` collection visible 170 + - website shows no example users/content 171 + - probably in early/private beta - no public records yet 172 + 173 + ### implication for architecture 174 + 175 + two paths: 176 + 177 + **path A: wait for leaflet migration** 178 + - simpler: just index `site.standard.document` with `textContent` 179 + - all platforms converge on same schema 180 + - downside: loses existing leaflet search until they migrate 181 + 182 + **path B: hybrid approach** 183 + - index `site.standard.document` (pckt, future leaflet, offprint) 184 + - ALSO index `pub.leaflet.document` with existing block parser 185 + - dedupe by URI or store both with `source_collection` indicator 186 + - more complex but maintains backwards compat 187 + 188 + leaning toward **path B** - can't lose 3500 leaflet docs. 189 + 190 + ## open questions 191 + 192 + - [x] does leaflet write site.standard.document records? **NO, not yet** 193 + - [x] does offprint write site.standard.document records? **UNKNOWN - no public content yet** 194 + - [ ] when will leaflet migrate to standard.site? 195 + - [ ] should we dedupe platform-specific vs standard records? 196 + - [ ] embeddings: regenerate for all, or use same model? 197 + 198 + ## next steps 199 + 200 + 1. 
~~verify leaflet's site.standard.document structure~~ (done - they don't have any) 201 + 2. ~~find and examine offprint records~~ (done - no public content yet) 202 + 3. decide on hybrid vs wait approach 203 + 4. consider witness cache architecture (see below) 204 + 5. design database migration 205 + 6. implement generalized tap subscriber 206 + 7. test with multi-platform data 207 + 208 + --- 209 + 210 + ## architectural consideration: witness cache 211 + 212 + [paul frazee's post on witness caches](https://bsky.app/profile/pfrazee.com/post/3lfarplxvcs2e) (2026-01-05): 213 + 214 + > I'm increasingly convinced that many Atmosphere backends start with a local "witness cache" of the repositories. 215 + > 216 + > A witness cache is a copy of the repository records, plus a timestamp of when the record was indexed (the "witness time") which you want to keep 217 + > 218 + > The key feature is: you can replay it 219 + 220 + > With local replay, you can add new tables or indexes to your backend and quickly backfill the data. If you don't have a witness cache, you would have to do backfill from the network, which is slow 221 + 222 + ### current leaflet-search architecture (no witness cache) 223 + 224 + ``` 225 + Firehose → TAP → Parse & Transform → Store DERIVED data → Discard raw record 226 + ``` 227 + 228 + we store: 229 + - `uri`, `did`, `rkey` 230 + - `title` (extracted) 231 + - `content` (flattened from blocks) 232 + - `created_at`, `publication_uri` 233 + 234 + we discard: the raw record JSON 235 + 236 + ### witness cache architecture 237 + 238 + ``` 239 + Firehose → Store RAW record + witness_time → Derive indexes on demand (replayable) 240 + ``` 241 + 242 + would store: 243 + - `uri`, `collection`, `rkey` 244 + - `raw_record` (full JSON blob) 245 + - `witness_time` (when we indexed it) 246 + 247 + then derive FTS, embeddings, etc. from local data via replay. 
248 + 249 + ### comparison 250 + 251 + | scenario | current (no cache) | with witness cache | 252 + |----------|-------------------|-------------------| 253 + | add new parser (offprint) | re-crawl network | replay local | 254 + | leaflet adds textContent | wait for new records | replay & re-extract | 255 + | fix parsing bug | re-crawl affected | replay & re-derive | 256 + | change embedding model | re-fetch content | replay local | 257 + | add new index/table | backfill from network | replay locally | 258 + 259 + ### trade-offs 260 + 261 + **storage cost:** 262 + - ~3500 docs × ~10KB avg = ~35MB (not huge) 263 + - turso free tier: 9GB, so plenty of room 264 + 265 + **complexity:** 266 + - two-phase: store raw, then derive 267 + - vs current one-phase: derive immediately 268 + 269 + **benefits for standard-search:** 270 + - could add offprint/pckt parsers and replay existing data 271 + - when leaflet migrates to standard.site, re-derive without network 272 + - embedding backfill becomes local-only (no voyage API for content fetch) 273 + 274 + ### implementation options 275 + 276 + 1. **add `raw_record TEXT` column to existing tables** 277 + - simple, backwards compatible 278 + - can migrate incrementally 279 + 280 + 2. **separate `witness_cache` table** 281 + - `(uri PRIMARY KEY, collection, raw_record, witness_time)` 282 + - cleaner separation of concerns 283 + - documents/publications tables become derived views 284 + 285 + 3. **use duckdb/clickhouse for witness cache** (paul's suggestion) 286 + - better compression for JSON blobs 287 + - good for analytics queries 288 + - adds operational complexity 289 + 290 + for our scale, option 1 or 2 with turso is probably fine.

+21 -1

site/index.html

··· 234 234 margin-bottom: 0.75rem; 235 235 } 236 236 237 + .related-loading { 238 + font-size: 11px; 239 + color: #666; 240 + margin-top: 1rem; 241 + animation: pulse 1.5s ease-in-out infinite; 242 + } 243 + 244 + @keyframes pulse { 245 + 0%, 100% { opacity: 0.4; } 246 + 50% { opacity: 0.8; } 247 + } 248 + 237 249 .related-items { 238 250 display: flex; 239 251 flex-wrap: wrap; ··· 614 626 } 615 627 616 628 async function loadRelated(topResult) { 629 + // show loading state 630 + const loadingEl = document.createElement('div'); 631 + loadingEl.className = 'related-loading'; 632 + loadingEl.textContent = 'finding related...'; 633 + resultsDiv.appendChild(loadingEl); 634 + 617 635 try { 618 636 const res = await fetch(`${API_URL}/similar?uri=${encodeURIComponent(topResult.uri)}`); 619 637 const related = await res.json(); 638 + 639 + loadingEl.remove(); 620 640 621 641 if (!related || related.length === 0) return; 622 642 ··· 645 665 646 666 resultsDiv.insertAdjacentHTML('beforeend', relatedHtml); 647 667 } catch (e) { 648 - // silently fail - related is optional 668 + loadingEl.remove(); 649 669 } 650 670 } 651 671

Configure Feed

Configure Feed