apply filter + cutoff on listRecords fallback; fix skip copy

review followups on the filter commit:

- the listRecords fallback was skipping denylisted collections but
never running the count pass or applying the TID cutoff, so any repo
that hit the fallback (old/flaky PDS, CAR walker failure) regressed
to the old unbounded behavior. now the fallback paginates non-skipped
collections, then post-hoc applies the same 2-year TID cutoff if the
total crosses LARGE_REPO_THRESHOLD. post-hoc is less efficient than
the pre-filter count pass but listRecords doesn't report per-
collection totals, so we can't decide before fetching. fallback is
rare enough that the extra transient memory is acceptable; revisit
if it starts firing routinely.
- made decodeTidMicros public so the fallback can reuse it without
copy-pasting the TID alphabet.
- derived per_collection from collection_of post-cutoff in the
fallback, matching the CAR path (previously the count was taken inline
during pagination, which would have left cutoff-dropped records in
the per-collection totals).
- skipped_by_collection stays 0 on the fallback path (documented) —
listRecords can't cheaply report what we chose not to fetch.
- fixed pack-meta copy to say "records with no text content" instead
of "likes/follows/reposts" — the denylist covers blocks, listitems,
threadgates, postgates, actor status, chat declarations, and tangled
graph follows/stars too. the old copy was misleading for any repo
with those record types.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

zzstoatzz 1 month ago 9a69eeb9 49e1b8a1

+79 -10

3 changed files

expand all

backend

src

assets

main.js

indexer.zig

repo_walk.zig

+5 -1

backend/src/assets/main.js

··· 301 301 const skippedTime = j.skipped_by_time || 0; 302 302 const cutoffMs = j.applied_tid_cutoff_ms || 0; 303 303 if (skippedColl > 0) { 304 - parts.push(`skipped ${skippedColl.toLocaleString()} likes/follows/reposts`); 304 + // the backend skip list covers likes, follows, reposts, blocks, 305 + // listitems, threadgates, postgates, actor status, chat declarations, 306 + // and tangled graph follows/stars — anything with no meaningful 307 + // prose. the short copy here rounds to "no text content". 308 + parts.push(`skipped ${skippedColl.toLocaleString()} records with no text content`); 305 309 } 306 310 if (cutoffMs > 0) { 307 311 const cutoffDate = new Date(cutoffMs).toISOString().slice(0, 10);

+70 -8

backend/src/indexer.zig

··· 392 392 393 393 for (collections) |collection| { 394 394 // apply the same collection allow-list as the CAR walker. 395 - // listRecords can't filter server-side, so we just skip the 396 - // entire pagination loop for collections in the deny list. 395 + // listRecords can't report per-collection totals cheaply, so 396 + // skipped_by_collection stays 0 on this path — we skip the 397 + // pagination entirely rather than fetching-then-dropping, 398 + // which is the whole point of the filter. 397 399 var skip = false; 398 400 for (DEFAULT_SKIP_COLLECTIONS) |sc| { 399 401 if (std.mem.eql(u8, collection, sc)) { ··· 423 425 try all_records.append(scratch_alloc, r); 424 426 try collection_of.append(scratch_alloc, collection); 425 427 count += 1; 426 - pack.records_fetched = all_records.items.len; 427 428 if (job.max_per_collection > 0 and count >= job.max_per_collection) break; 428 429 } 430 + pack.records_fetched = all_records.items.len; 429 431 430 432 if (job.max_per_collection > 0 and count >= job.max_per_collection) break; 431 433 cursor = page.cursor; 432 434 if (cursor == null or page.records.len == 0) break; 433 435 } 436 + } 434 437 435 - if (count > 0) { 436 - try per_collection.append(arena, .{ 437 - .nsid = try arena.dupe(u8, collection), 438 - .count = count, 439 - }); 438 + // mirror the CAR path's auto time cutoff. listRecords can't 439 + // pre-count, so we collect first and then drop — less efficient, 440 + // but the fallback is a rare edge case (old/flaky PDSes) so the 441 + // extra transient memory is acceptable. if the CAR walker 442 + // routinely failed we'd revisit. 
443 + if (all_records.items.len > LARGE_REPO_THRESHOLD) { 444 + const now_ns = Io.Timestamp.now(job.io, .real).nanoseconds; 445 + const now_us: i64 = @intCast(@divTrunc(now_ns, 1000)); 446 + const cutoff_us = now_us - AUTO_CUTOFF_WINDOW_US; 447 + pack.applied_tid_cutoff_ms = @divTrunc(cutoff_us, 1000); 448 + std.log.info( 449 + " large repo ({d} > {d}) — applying time cutoff at {d} ms", 450 + .{ all_records.items.len, LARGE_REPO_THRESHOLD, pack.applied_tid_cutoff_ms }, 451 + ); 452 + 453 + var kept_records: std.ArrayList(pds.Record) = .empty; 454 + var kept_coll: std.ArrayList([]const u8) = .empty; 455 + try kept_records.ensureTotalCapacity(scratch_alloc, all_records.items.len); 456 + try kept_coll.ensureTotalCapacity(scratch_alloc, collection_of.items.len); 457 + 458 + var n_time_skipped: usize = 0; 459 + for (all_records.items, collection_of.items) |r, c| { 460 + // at-uri is "at://{did}/{collection}/{rkey}" — rkey is 461 + // the tail after the last slash. non-TID rkeys (profile 462 + // `self`, etc.) fall through decodeTidMicros as null and 463 + // are always kept. 464 + const last_slash = std.mem.lastIndexOfScalar(u8, r.uri, '/') orelse { 465 + kept_records.appendAssumeCapacity(r); 466 + kept_coll.appendAssumeCapacity(c); 467 + continue; 468 + }; 469 + const rkey = r.uri[last_slash + 1 ..]; 470 + if (repo_walk.decodeTidMicros(rkey)) |tid_us| { 471 + if (tid_us < cutoff_us) { 472 + n_time_skipped += 1; 473 + continue; 474 + } 475 + } 476 + kept_records.appendAssumeCapacity(r); 477 + kept_coll.appendAssumeCapacity(c); 440 478 } 479 + 480 + all_records = kept_records; 481 + collection_of = kept_coll; 482 + pack.skipped_by_time = n_time_skipped; 483 + pack.records_fetched = all_records.items.len; 484 + } 485 + 486 + // derive per-collection counts from the (possibly cutoff-trimmed) 487 + // record list, matching the CAR path. records are lex-sorted by 488 + // listRecords pagination order so same-collection runs are 489 + // contiguous. 
490 + var cursor2: usize = 0; 491 + while (cursor2 < collection_of.items.len) { 492 + const start = cursor2; 493 + const coll = collection_of.items[start]; 494 + while (cursor2 < collection_of.items.len and 495 + std.mem.eql(u8, collection_of.items[cursor2], coll)) 496 + { 497 + cursor2 += 1; 498 + } 499 + try per_collection.append(arena, .{ 500 + .nsid = try arena.dupe(u8, coll), 501 + .count = cursor2 - start, 502 + }); 441 503 } 442 504 } 443 505

+4 -1

backend/src/repo_walk.zig

··· 466 466 /// TID layout: 64 bits, top bit always 0 (sort-safe), next 53 bits = 467 467 /// microseconds since unix epoch, bottom 10 bits = clock id. 468 468 /// Alphabet: `234567abcdefghijklmnopqrstuvwxyz`. 469 - fn decodeTidMicros(rkey: []const u8) ?i64 { 469 + /// 470 + /// public so the listRecords fallback in indexer.zig can apply the 471 + /// same time cutoff as the CAR walker for large repos. 472 + pub fn decodeTidMicros(rkey: []const u8) ?i64 { 470 473 if (rkey.len != 13) return null; 471 474 var val: u64 = 0; 472 475 for (rkey) |c| {

Configure Feed

Configure Feed