GET /xrpc/app.bsky.actor.searchActorsTypeahead typeahead.waow.tech
16
fork

Configure Feed

Select the types of activity you want to include in your feed.

fix ingester OOM: bloom filter dedup + eliminate wasted Turso writes

replace unbounded hash map dedup (grew to 256MB → OOM every ~4h) with a
fixed-size bloom filter (~1.2MB, 10M bits, 7 hashes). in the worker, handle
bare-DID events separately: they now use INSERT OR IGNORE (0 Turso writes for
known actors) instead of the full UPSERT that triggered FTS5 churn on every hit.

also clean up /docs page: accurate indexing description, remove speculative
comparisons, add syntax highlighting to code blocks.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

+117 -67
+66 -28
ingester/src/main.zig
··· 8 8 const log = std.log.scoped(.ingester); 9 9 10 10 const MAX_BATCH: usize = 100; 11 - const SEEN_CAP: usize = 500_000; 11 + const BLOOM_BITS: usize = 10_000_000; // ~1.2MB fixed 12 + const BLOOM_HASHES: usize = 7; 13 + 14 + const BloomFilter = struct { 15 + bits: std.DynamicBitSetUnmanaged, 16 + num_bits: usize, 17 + num_hashes: usize, 18 + count: usize = 0, 19 + 20 + fn init(allocator: Allocator, num_bits: usize, num_hashes: usize) !BloomFilter { 21 + const bits = try std.DynamicBitSetUnmanaged.initEmpty(allocator, num_bits); 22 + return .{ 23 + .bits = bits, 24 + .num_bits = num_bits, 25 + .num_hashes = num_hashes, 26 + }; 27 + } 28 + 29 + fn deinit(self: *BloomFilter, allocator: Allocator) void { 30 + self.bits.deinit(allocator); 31 + } 32 + 33 + fn hashIndices(self: *const BloomFilter, key: []const u8) [BLOOM_HASHES]usize { 34 + const h1 = std.hash.Wyhash.hash(0, key); 35 + const h2 = std.hash.Wyhash.hash(1, key); 36 + var indices: [BLOOM_HASHES]usize = undefined; 37 + for (0..self.num_hashes) |i| { 38 + indices[i] = @intCast((h1 +% i *% h2) % self.num_bits); 39 + } 40 + return indices; 41 + } 42 + 43 + fn insert(self: *BloomFilter, key: []const u8) void { 44 + const indices = self.hashIndices(key); 45 + for (indices) |idx| { 46 + self.bits.set(idx); 47 + } 48 + self.count += 1; 49 + } 50 + 51 + fn contains(self: *const BloomFilter, key: []const u8) bool { 52 + const indices = self.hashIndices(key); 53 + for (indices) |idx| { 54 + if (!self.bits.isSet(idx)) return false; 55 + } 56 + return true; 57 + } 58 + 59 + fn reset(self: *BloomFilter) void { 60 + self.bits.unsetAll(); 61 + self.count = 0; 62 + } 63 + }; 12 64 13 65 const Config = struct { 14 66 worker_url: []const u8, ··· 40 92 delete_buffer: std.ArrayList([]const u8), 41 93 /// arena owns all string data in buffer/delete_buffer 42 94 arena: std.heap.ArenaAllocator, 43 - /// dedup set: tracks DIDs we've already enqueued from non-profile commits 44 - seen: std.StringHashMapUnmanaged(void) = .{}, 45 - 
seen_arena: std.heap.ArenaAllocator, 46 - seen_count: usize = 0, 95 + /// bloom filter: fixed-size dedup for non-profile DIDs (~1.2MB) 96 + bloom: BloomFilter, 47 97 pending_cursor: i64 = 0, 48 98 flushed_cursor: i64 = 0, 49 99 total_ingested: u64 = 0, ··· 51 101 last_flush: i64 = 0, 52 102 retry_after: i64 = 0, // timestamp before which we skip flush attempts 53 103 54 - fn init(allocator: Allocator, config: Config) IngestHandler { 104 + fn init(allocator: Allocator, config: Config) !IngestHandler { 55 105 return .{ 56 106 .allocator = allocator, 57 107 .config = config, 58 108 .buffer = .{}, 59 109 .delete_buffer = .{}, 60 110 .arena = std.heap.ArenaAllocator.init(allocator), 61 - .seen_arena = std.heap.ArenaAllocator.init(allocator), 111 + .bloom = try BloomFilter.init(allocator, BLOOM_BITS, BLOOM_HASHES), 62 112 .last_flush = std.time.timestamp(), 63 113 }; 64 114 } 65 115 66 116 fn deinit(self: *IngestHandler) void { 67 - self.seen.deinit(self.allocator); 68 - self.seen_arena.deinit(); 117 + self.bloom.deinit(self.allocator); 69 118 self.arena.deinit(); 70 119 self.buffer.deinit(self.allocator); 71 120 self.delete_buffer.deinit(self.allocator); ··· 75 124 return self.arena.allocator().dupe(u8, s) catch null; 76 125 } 77 126 78 - fn dupeForSeen(self: *IngestHandler, s: []const u8) ?[]const u8 { 79 - return self.seen_arena.allocator().dupe(u8, s) catch null; 80 - } 81 - 82 - fn pruneSeen(self: *IngestHandler) void { 83 - log.info("pruning seen set ({d} entries)", .{self.seen_count}); 84 - self.seen.clearRetainingCapacity(); 85 - _ = self.seen_arena.reset(.retain_capacity); 86 - self.seen_count = 0; 127 + fn resetBloom(self: *IngestHandler) void { 128 + log.info("resetting bloom filter ({d} insertions)", .{self.bloom.count}); 129 + self.bloom.reset(); 87 130 } 88 131 89 132 pub fn onEvent(self: *IngestHandler, event: zat.JetstreamEvent) void { ··· 105 148 self.buffer.clearRetainingCapacity(); 106 149 self.delete_buffer.clearRetainingCapacity(); 107 150 _ = 
self.arena.reset(.retain_capacity); 108 - self.pruneSeen(); 151 + self.resetBloom(); 109 152 self.flushed_cursor = self.pending_cursor; 110 153 self.retry_after = 0; 111 154 } ··· 165 208 // non-profile collections: just discover the DID 166 209 if (c.operation == .delete) return; 167 210 168 - // dedup: skip if we've seen this DID recently 169 - if (self.seen.contains(c.did)) return; 170 - 171 - const seen_key = self.dupeForSeen(c.did) orelse return; 172 - self.seen.put(self.allocator, seen_key, {}) catch return; 173 - self.seen_count += 1; 211 + // dedup: skip if bloom filter says we've seen this DID recently 212 + if (self.bloom.contains(c.did)) return; 213 + self.bloom.insert(c.did); 174 214 175 215 const did = self.dupe(c.did) orelse return; 176 216 self.buffer.append(self.allocator, .{ .did = did }) catch return; 177 - 178 - if (self.seen_count >= SEEN_CAP) self.pruneSeen(); 179 217 } 180 218 } 181 219 ··· 407 445 log.info("no cursor found, starting from live", .{}); 408 446 } 409 447 410 - var handler = IngestHandler.init(allocator, config); 448 + var handler = try IngestHandler.init(allocator, config); 411 449 defer handler.deinit(); 412 450 413 451 var client = zat.JetstreamClient.init(allocator, .{
+51 -39
src/index.ts
··· 473 473 return json({ error: "batch too large (max 10000)" }, 400); 474 474 } 475 475 476 - // batch upsert — use COALESCE to preserve existing fields on partial updates 476 + // batch upsert — bare-DID events use INSERT OR IGNORE (0 Turso writes for known actors), 477 + // profile/identity events use full UPSERT with COALESCE to preserve existing fields 477 478 const stmts = events.map((e) => { 479 + const isBareDID = !e.handle && !e.display_name && !e.avatar_cid && e.hidden === undefined; 480 + if (isBareDID) { 481 + return db.prepare( 482 + "INSERT OR IGNORE INTO actors (did) VALUES (?1)" 483 + ).bind(e.did); 484 + } 478 485 const avatarCid = e.avatar_cid || null; 479 486 const hidden = e.hidden !== undefined ? (e.hidden ? 1 : 0) : null; 480 487 return db.prepare( ··· 809 816 <body> 810 817 <div class="container"> 811 818 <div class="header"> 812 - <h1><strong>typeahead</strong> stats</h1> 819 + <h1><a href="/" style="color:inherit;text-decoration:none"><strong>typeahead</strong></a> stats</h1> 813 820 </div> 814 821 <p class="subtitle">index health and search activity</p> 815 822 ··· 1408 1415 pre code { background: none; border: none; padding: 0; font-size: 0.72rem; color: #bbb; } 1409 1416 .diff-add { color: #4a9; } 1410 1417 .diff-del { color: #a55; } 1418 + .kw { color: #c678dd; } 1419 + .str { color: #98c379; } 1420 + .fn { color: #61afef; } 1421 + .cm { color: #5c6370; font-style: italic; } 1422 + .op { color: #abb2bf; } 1411 1423 .callout { background: #111; border: 1px solid #222; border-left: 3px solid #4a9; 1412 1424 border-radius: 6px; padding: 0.7rem 0.9rem; margin-bottom: 1rem; 1413 1425 font-size: 0.78rem; color: #999; line-height: 1.6; } ··· 1434 1446 <body> 1435 1447 <div class="container"> 1436 1448 <div class="header"> 1437 - <h1><strong>typeahead</strong> docs</h1> 1449 + <h1><a href="/" style="color:inherit;text-decoration:none"><strong>typeahead</strong></a> docs</h1> 1438 1450 </div> 1439 1451 <p class="subtitle">switching from the 
bluesky typeahead API</p> 1440 1452 1441 1453 <div class="callout"> 1442 - <strong>tl;dr</strong> — change the base URL. the endpoint path and query params are the same. 1443 - response shape is compatible but slimmer. 1454 + <strong>tl;dr</strong> — replace <code>public.api.bsky.app</code> with <code>typeahead.waow.tech</code>. 1444 1455 </div> 1445 1456 1446 1457 <h2>what this is</h2> 1447 1458 <p> 1448 - typeahead is a community-run actor search for <a href="https://atproto.com">atproto</a>. 1449 - it's designed as a drop-in replacement for bluesky's 1450 - <code>app.bsky.actor.searchActorsTypeahead</code> endpoint. 1459 + typeahead is a community-run actor search for <a href="https://atproto.com">atproto</a>, 1460 + aiming to be a drop-in replacement for bluesky's 1461 + <code>app.bsky.actor.searchActorsTypeahead</code> endpoint. the endpoint path and query 1462 + params are identical; the response shape is compatible but slimmer (see 1463 + <a href="#response-comparison">response comparison</a> below). 1451 1464 </p> 1452 1465 <p> 1453 - the index is populated via <a href="https://docs.bsky.app/blog/jetstream">jetstream</a>, 1454 - so it tracks the full network. searches use FTS5 prefix matching against handles and 1455 - display names, with results edge-cached for 60s. 1466 + the index is built from a few <a href="https://docs.bsky.app/blog/jetstream">jetstream</a> 1467 + collections — profiles, posts, likes, and follows — so any account that creates or 1468 + interacts with content gets discovered automatically. for accounts that predate the index 1469 + or haven't been seen yet, search queries trigger a throttled backfill from the bluesky API 1470 + to fill gaps on demand. searches use FTS5 prefix matching against handles and display names, 1471 + with results edge-cached for 60s. 
1456 1472 </p> 1457 1473 1458 1474 <h2>the change</h2> ··· 1461 1477 <span class="diff-add">+ https://typeahead.waow.tech</span>/xrpc/app.bsky.actor.searchActorsTypeahead?q=...&amp;limit=10</code></pre> 1462 1478 1463 1479 <h3>before</h3> 1464 - <pre><code>const response = await fetch( 1465 - \`https://public.api.bsky.app/xrpc/app.bsky.actor.searchActorsTypeahead?q=\${encodeURIComponent(query)}&amp;limit=10\` 1480 + <pre><code><span class="kw">const</span> response <span class="op">=</span> <span class="kw">await</span> <span class="fn">fetch</span>( 1481 + <span class="str">\`https://public.api.bsky.app/xrpc/app.bsky.actor.searchActorsTypeahead?q=\${<span class="fn">encodeURIComponent</span>(query)}&amp;limit=10\`</span> 1466 1482 );</code></pre> 1467 1483 1468 1484 <h3>after</h3> 1469 - <pre><code>const TYPEAHEAD_URL = 'https://typeahead.waow.tech'; 1485 + <pre><code><span class="kw">const</span> TYPEAHEAD_URL <span class="op">=</span> <span class="str">'https://typeahead.waow.tech'</span>; 1470 1486 1471 - const response = await fetch( 1472 - \`\${TYPEAHEAD_URL}/xrpc/app.bsky.actor.searchActorsTypeahead?q=\${encodeURIComponent(query)}&amp;limit=10\` 1487 + <span class="kw">const</span> response <span class="op">=</span> <span class="kw">await</span> <span class="fn">fetch</span>( 1488 + <span class="str">\`\${TYPEAHEAD_URL}/xrpc/app.bsky.actor.searchActorsTypeahead?q=\${<span class="fn">encodeURIComponent</span>(query)}&amp;limit=10\`</span> 1473 1489 );</code></pre> 1474 1490 1475 1491 <p> ··· 1482 1498 set the <code>X-Client</code> header so your app shows up by name in our 1483 1499 <a href="/stats">traffic stats</a> instead of as "unknown": 1484 1500 </p> 1485 - <pre><code>const response = await fetch( 1486 - \`\${TYPEAHEAD_URL}/xrpc/app.bsky.actor.searchActorsTypeahead?q=\${encodeURIComponent(query)}&amp;limit=10\`, 1487 - { headers: { 'X-Client': 'my-app.example.com' } } 1501 + <pre><code><span class="kw">const</span> response <span class="op">=</span> 
<span class="kw">await</span> <span class="fn">fetch</span>( 1502 + <span class="str">\`\${TYPEAHEAD_URL}/xrpc/app.bsky.actor.searchActorsTypeahead?q=\${<span class="fn">encodeURIComponent</span>(query)}&amp;limit=10\`</span>, 1503 + { headers: { <span class="str">'X-Client'</span>: <span class="str">'my-app.example.com'</span> } } 1488 1504 );</code></pre> 1489 1505 <p> 1490 1506 use your domain or app name — whatever you want to be identified as. ··· 1492 1508 but <code>X-Client</code> is preferred since it works everywhere (server-side, CLI, native apps). 1493 1509 </p> 1494 1510 1495 - <h2>response comparison</h2> 1511 + <h2 id="response-comparison">response comparison</h2> 1496 1512 <p> 1497 1513 both return <code>{ "actors": [...] }</code>. the actor objects differ: 1498 1514 </p> ··· 1516 1532 fetch those fields separately. 1517 1533 </div> 1518 1534 1519 - <h2>other differences</h2> 1535 + <h2>operational notes</h2> 1520 1536 <ul> 1521 - <li><strong>no auth required</strong> — public CORS endpoint, no token needed</li> 1522 - <li><strong>rate limited</strong> — 60 req/min per IP (vs bluesky's per-token limits)</li> 1537 + <li><strong>rate limited</strong> — 60 req/min per IP</li> 1523 1538 <li><strong>cached</strong> — results are edge-cached for 60s, so very recent profile changes may lag</li> 1524 - <li><strong>limit range</strong> — <code>1–100</code> (bluesky caps at 10 by default)</li> 1525 - <li><strong>moderation</strong> — actors flagged with <code>!hide</code> or <code>!takedown</code> labels are excluded from results</li> 1539 + <li><strong>limit range</strong> — <code>1–100</code> (bluesky defaults to 10)</li> 1540 + <li><strong>moderation</strong> — actors with <code>!hide</code> or <code>!takedown</code> labels are excluded</li> 1541 + <li><strong>CORS</strong> — enabled, so browser-based apps can call it directly</li> 1526 1542 </ul> 1527 1543 1528 1544 <h2>example: plyr.fm</h2> 1529 1545 <p> 1530 - <a 
href="https://tangled.sh/zzstoatzz.io/plyr.fm">plyr.fm</a> switched from a backend proxy 1531 - (which called the bluesky API) to calling typeahead directly from the frontend. the diff was 1532 - roughly: 1546 + <a href="https://tangled.sh/zzstoatzz.io/plyr.fm">plyr.fm</a> uses typeahead for actor 1547 + search. the integration looks roughly like: 1533 1548 </p> 1534 - <pre><code><span class="diff-add">// config.ts</span> 1535 - <span class="diff-add">export const TYPEAHEAD_URL = 'https://typeahead.waow.tech';</span> 1549 + <pre><code><span class="cm">// config.ts</span> 1550 + <span class="kw">export const</span> TYPEAHEAD_URL <span class="op">=</span> <span class="str">'https://typeahead.waow.tech'</span>; 1536 1551 1537 - <span class="diff-add">// HandleSearch.svelte</span> 1538 - const response = await fetch( 1539 - \`\${TYPEAHEAD_URL}/xrpc/app.bsky.actor.searchActorsTypeahead?q=\${encodeURIComponent(query)}&amp;limit=10\` 1552 + <span class="cm">// HandleSearch.svelte</span> 1553 + <span class="kw">const</span> response <span class="op">=</span> <span class="kw">await</span> <span class="fn">fetch</span>( 1554 + <span class="str">\`\${TYPEAHEAD_URL}/xrpc/app.bsky.actor.searchActorsTypeahead?q=\${<span class="fn">encodeURIComponent</span>(query)}&amp;limit=10\`</span> 1540 1555 ); 1541 - const data = await response.json(); 1542 - const actors = (data.actors ?? []).map(actor => ({ 1556 + <span class="kw">const</span> data <span class="op">=</span> <span class="kw">await</span> response.<span class="fn">json</span>(); 1557 + <span class="kw">const</span> actors <span class="op">=</span> (data.actors ?? []).<span class="fn">map</span>(actor <span class="op">=></span> ({ 1543 1558 did: actor.did, 1544 1559 handle: actor.handle, 1545 1560 display_name: actor.displayName ?? actor.handle, 1546 - avatar_url: actor.avatar ?? null, 1561 + avatar_url: actor.avatar ?? <span class="kw">null</span>, 1547 1562 }));</code></pre> 1548 - <p> 1549 - no backend proxy needed. 
the API supports CORS, so frontend calls work directly. 1550 - </p> 1551 1563 1552 1564 <h2>request indexing</h2> 1553 1565 <p>