fuzzy find my records ken.waow.tech
embeddings pds search
6
fork

Configure Feed

Select the types of activity you want to include in your feed.

truncate huge repos to 50k instead of rejecting them

the previous commit rejected repos over 50k records entirely. wrong —
we should index what we can and tell the user the rest was dropped.

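now: if post-filter records > ABSOLUTE_MAX_RECORDS (50k), truncate to
the cap and set pack.truncated_from to the original count. truncation
keeps the tail of the lex-sorted (collection/rkey) record list, so it
biases toward alphabetically later collections rather than being a true
"most recent N". the UI shows the cap message inline in the pack-meta
line with a DM prompt. the user gets 50k searchable records instead of
an error page.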

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

+25 -9
+4
backend/src/assets/main.js
··· 316 316 const cutoffDate = new Date(cutoffMs).toISOString().slice(0, 10); 317 317 parts.push(`indexed records after ${cutoffDate} (${skippedTime.toLocaleString()} older records skipped)`); 318 318 } 319 + const truncFrom = j.truncated_from || 0; 320 + if (truncFrom > 0) { 321 + parts.push(`capped at ${rec.toLocaleString()} of ${truncFrom.toLocaleString()} — hey, we know you have a big PDS, but ken can only handle so much right now. DM @zzstoatzz.io if you want to talk about it`); 322 + } 319 323 packStatsEl.textContent = parts.join(" · "); 320 324 searchForm.classList.remove("hidden"); 321 325 searchInput.focus();
+20 -9
backend/src/indexer.zig
··· 143 143 /// unix ms of the applied time cutoff, or 0 if no cutoff was applied. 144 144 /// UI displays this as "indexed records newer than <date>". 145 145 applied_tid_cutoff_ms: i64, 146 + /// if the post-filter record count exceeded ABSOLUTE_MAX_RECORDS, we 147 + /// truncated to the cap and stored the original count here. 0 means 148 + /// no truncation. UI shows a friendly message when this is set. 149 + truncated_from: usize, 146 150 147 151 pub fn count(self: *const IndexedPack) usize { 148 152 if (self.dim == 0) return 0; ··· 236 240 .skipped_by_collection = 0, 237 241 .skipped_by_time = 0, 238 242 .applied_tid_cutoff_ms = 0, 243 + .truncated_from = 0, 239 244 }; 240 245 241 246 // duplicate the DID for the hash map key so the cache owns it ··· 324 329 ); 325 330 } 326 331 327 - const walked = try repo_walk.walkOpened(arena, &opened, pack.did, filter); 332 + var walked = try repo_walk.walkOpened(arena, &opened, pack.did, filter); 328 333 329 334 if (walked.records.len > ABSOLUTE_MAX_RECORDS) { 330 - pack.error_msg = std.fmt.allocPrint( 331 - pack.arena.allocator(), 332 - "hey, we know you have a big PDS — everybody knows you have a big PDS — " ++ 333 - "but ken can only handle {d} records right now and yours has {d} after filtering. " ++ 334 - "DM @zzstoatzz.io on bluesky if you want to talk about it", 335 - .{ ABSOLUTE_MAX_RECORDS, walked.records.len }, 336 - ) catch "repo too large"; 337 - return error.OutOfMemory; 335 + const total = walked.records.len; 336 + // keep only the most recent records (they're lex-sorted by 337 + // collection/rkey, and within each collection TID rkeys are 338 + // chronological — but across collections the ordering is 339 + // alphabetical, not temporal. a perfect "most recent N" would 340 + // require a global sort by TID. for now we just take the tail 341 + // of the lex-sorted list, which biases toward later collections 342 + // alphabetically. good enough for a cap that rarely fires.) 
343 + walked.records = walked.records[total - ABSOLUTE_MAX_RECORDS ..]; 344 + pack.truncated_from = total; 345 + std.log.info( 346 + " capped at {d} records (had {d} after filtering)", 347 + .{ ABSOLUTE_MAX_RECORDS, total }, 348 + ); 338 349 } 339 350 340 351 return walked;
+1
backend/src/server.zig
··· 644 644 try buf.print(alloc, "\"skipped_by_collection\":{d},", .{pack.skipped_by_collection}); 645 645 try buf.print(alloc, "\"skipped_by_time\":{d},", .{pack.skipped_by_time}); 646 646 try buf.print(alloc, "\"applied_tid_cutoff_ms\":{d},", .{pack.applied_tid_cutoff_ms}); 647 + try buf.print(alloc, "\"truncated_from\":{d},", .{pack.truncated_from}); 647 648 try buf.print(alloc, "\"count\":{d},", .{pack.count()}); 648 649 try buf.print(alloc, "\"indexed_at_ms\":{d},", .{pack.indexed_at_ms}); 649 650 try buf.print(alloc, "\"build_ms\":{d},", .{pack.build_ms});