···2929rustls = { version = "0.23", default-features = false, features = ["aws-lc-rs"] }
3030rustversion = "1"
3131serde = { version = "1", features = ["derive"] }
3232+serde_ipld_dagcbor = "0.6"
3233serde_json = "1"
3334thiserror = "2.0.18"
3435tokio = { version = "1.49.0", features = ["full"] }
+10-7
hacking.md
···7878 - [-] with disk spilling for huge repo (maybe later)
7979 - [-] with queueing resync for large repos if resources are taken?? (maybe later)
8080 - [x] (self-reminder: get_repo should be rare in lightrail)
8181-- [ ] "deep crawl" mode for relays that listHosts -> listRepos on host instead of relying on relay listRepos
8282- - [ ] defensive loop-cursor handling
8181+- [x] prefix-merge walker (limit by total collections to be merged?)
8282+ - [x] add an all-collections index
8383- [~] actually firehose-index!!
8484 - [x] extract collections-added/removed directly from CAR slice
8585- - [ ] (spend some time on tests here)
8585+ - [x] (spend some time on tests here)
8686 - [x] do the thing (write them to the db)
8787- - [ ] swap in repo-stream
8888-- [~] prefix-merge walker (limit by total collections to be merged?)
8989- - [x] add an all-collections index
9090-- [ ] lenient sync1.1
8787+ - [x] swap in repo-stream
8888+- [ ] actually wire in the resync buffer (oops)
8989+- [ ] "deep crawl" mode for relays that listHosts -> listRepos on host instead of relying on relay listRepos
9090+ - [ ] defensive loop-cursor handling
9191+- [ ] lenient pre-sync1.1
9192 - [ ] *don't* allow non-validating commits that look like sync1.1
9293 - [ ] ratchet by PDS host: be lenient if we have never seen a sync1.1-looking commit, always strict after we see one.
9494+ - [ ] boooo we probably need *even more* special handling for pre-sync1.1 repos since they don't include adjacent keys!!!
9395- [ ] account status convergence: if we receive commits from apparently-inactive accounts, should we check upstream status to make sure we're not stale?
9496- [ ] split the keyspace: put the rbc/cbr indexes on a second keyspace with larger block size, expect hits on main keyspace
9597- [ ] websocket ping/pong (unless jacquard is already doing it)
9698- [ ] websocket no-events-received timeout reconnect
9999+9710098101very much still todo but i'm getting tired
99102- [ ] multi-relay listener
+4-3
readme.md
···11# lightrail: `listReposByCollection` service
2233-**status: in development**
33+**status: almost working well but not stable yet!!**
4455lightrail uses the adjacent keys included in CAR slices from firehose commits to detect the first record added and last record removed from a collection in an atproto repo.
66···18181919### wishlist features (probably doable?):
20202121-- accept multiple collections for `listReposbyCollection` (merge + dedup by DID; works bc key is `<collection>||<did>`)
2222-- `listReposByCollectionPrefix`, either with additional indexes up the NSID hierarchy, or via merge+dedup.
2121+- [x] DONE accept multiple collections for `listReposByCollection` (merge + dedup by DID; works bc key is `<collection>||<did>`)
2222+- [x] DONE "wildcard" for `listReposByCollection` by omitting the `collection` query param entirely
2323+- ~~`listReposByCollectionPrefix`, either with additional indexes up the NSID hierarchy, or via merge+dedup.~~ not doing
2324- subscribe to multiple relays
2425- use authenticated repo contents for backfill instead of `com.atproto.repo.describeRepo` (see [./authenticated-collection-list.md](./authenticated-collection-list.md))
2526
+445-9
src/mst/mortality.rs
···33333434type Result<T> = std::result::Result<T, MstMortalityError>;
35353636+/// Collect every MST leaf path visible in a (possibly partial) CAR.
3737+///
3838+/// Uses `next_keys()` which silently skips subtrees whose MST node blocks are
3939+/// absent, so this works on both full and proof-only CARs.
4040+fn collect_visible_paths(parsed: jacquard_repo::car::reader::ParsedCar) -> Result<Vec<String>> {
4141+ let mut car = DriverBuilder::new().load_jacquard_parsed_car(parsed)?;
4242+ let mut visible = Vec::new();
4343+ while let Some((path, _)) = car.next_keys()? {
4444+ visible.push(path.to_string());
4545+ }
4646+ Ok(visible)
4747+}
4848+3649/// Walk the partial CAR's MST to detect which collections are newly added
3750/// ("born") or fully removed ("died") by this commit.
3851///
···6174 }
62756376 // ── Walk the partial CAR's MST to collect visible leaf keys ──────────────
6464- //
6565- // next_keys() silently skips subtrees whose MST node blocks are absent,
6666- // giving us all leaves reachable through blocks that ARE in the CAR —
6767- // exactly the proof nodes for keys adjacent to the changes.
6868- let mut car = DriverBuilder::new().load_jacquard_parsed_car(parsed)?;
6969- let mut visible: Vec<String> = Vec::new();
7070- while let Some((path, _)) = car.next_keys()? {
7171- visible.push(path.to_string());
7272- }
7777+ let visible = collect_visible_paths(parsed)?;
73787479 // ── Check collection death (all visible keys in C are being deleted) ──────
7580 let deleted_collections: HashSet<&str> = deleted
···268273 assert_eq!(died, vec![nsid("app.bsky.graph.follow")]);
269274 }
270275}
276276+277277+// =============================================================================
278278+// Fixture-driven tests
279279+// =============================================================================
280280+//
281281+// Two fixture sources:
282282+//
283283+// 1. `../atproto-interop-tests/firehose/commit-proof-fixtures.json`
284284+// Bluesky's interop fixture set — 6 commit scenarios with known
285285+// `blocksInProof` CIDs and expected tree state. We build the "after" MST
286286+// from scratch using jacquard_repo, filter to the fixture's proof CIDs, and
287287+// check the adjacent-key invariants that guard against spurious births and
288288+// deaths. Fixture 6 uses real `app.bsky.*` NSIDs, so we can also assert the
289289+// actual mortality result.
290290+//
291291+// 2. `../mst-test-suite/tests/diff/exhaustive/*.json` +
292292+// `../mst-test-suite/cars/exhaustive/*.car`
293293+// 16 384 exhaustive MST diff pairs. We load the "B" (after) CAR file
294294+// directly, filter its blocks to `created_nodes ∪ proof_nodes` (the minimal
295295+// firehose CAR content), and assert the key safety property: if a collection
296296+// still has records in the after-tree, at least one survivor must be visible
297297+// in the proof blocks so that our mortality logic cannot spuriously declare
298298+// the collection dead.
299299+//
300300+// Both test functions skip gracefully when the fixture directories are absent
301301+// (e.g. in CI environments that don't clone the sibling repos).
302302+303303+#[cfg(test)]
304304+mod fixture_tests {
305305+ use super::{collect_visible_paths, extract};
306306+ use std::collections::{BTreeMap, HashSet};
307307+ use std::sync::Arc;
308308+309309+ use bytes::Bytes;
310310+ use cid::Cid as IpldCid;
311311+ use jacquard_api::com_atproto::sync::subscribe_repos::RepoOp;
312312+ use jacquard_common::CowStr;
313313+ use jacquard_common::types::string::{Did, Nsid};
314314+ use jacquard_common::types::tid::Tid;
315315+ use jacquard_repo::car::reader::ParsedCar;
316316+ use jacquard_repo::commit::Commit;
317317+ use jacquard_repo::{MemoryBlockStore, Mst};
318318+319319+ // ── Helpers ───────────────────────────────────────────────────────────────
320320+321321+ fn parse_cid(s: &str) -> IpldCid {
322322+ s.parse().unwrap_or_else(|e| panic!("bad CID {s:?}: {e}"))
323323+ }
324324+325325+ /// Add a fake commit block that points to the current MST root.
326326+ ///
327327+ /// After this call `parsed.root` is the commit CID and the original MST
328328+ /// root CID is returned (needed to know what the commit's `data` field is).
329329+ fn attach_fake_commit(parsed: &mut ParsedCar) -> IpldCid {
330330+ let mst_root = parsed.root;
331331+ let commit = Commit {
332332+ did: Did::new_owned("did:web:example.com").unwrap(),
333333+ version: 3,
334334+ data: mst_root,
335335+ rev: Tid::now_0(),
336336+ prev: None,
337337+ sig: Bytes::from(vec![0u8; 64]),
338338+ };
339339+ let commit_cid = commit.to_cid().unwrap();
340340+ let commit_cbor = Bytes::from(commit.to_cbor().unwrap());
341341+ parsed.blocks.insert(commit_cid, commit_cbor);
342342+ parsed.root = commit_cid;
343343+ mst_root
344344+ }
345345+346346+ /// Build an MST from `after_keys` (each mapped to `leaf_cid`), collect all
347347+ /// its blocks, then return a `ParsedCar` filtered to `proof_cids` plus a
348348+ /// fake commit block. Also returns the computed MST root CID so callers can
349349+ /// verify it matches a fixture's `rootAfterCommit`.
350350+ async fn build_proof_car(
351351+ after_keys: &[&str],
352352+ leaf_cid: IpldCid,
353353+ proof_cids: &HashSet<String>,
354354+ ) -> (ParsedCar, IpldCid) {
355355+ let storage = Arc::new(MemoryBlockStore::new());
356356+ let mut mst = Mst::new(storage);
357357+ for key in after_keys {
358358+ mst = mst.add(key, leaf_cid).await.unwrap();
359359+ }
360360+ let (mst_root, all_blocks) = mst.collect_blocks().await.unwrap();
361361+362362+ let filtered: BTreeMap<IpldCid, Bytes> = all_blocks
363363+ .into_iter()
364364+ .filter(|(cid, _)| proof_cids.contains(&cid.to_string()))
365365+ .collect();
366366+367367+ let commit = Commit {
368368+ did: Did::new_owned("did:web:example.com").unwrap(),
369369+ version: 3,
370370+ data: mst_root,
371371+ rev: Tid::now_0(),
372372+ prev: None,
373373+ sig: Bytes::from(vec![0u8; 64]),
374374+ };
375375+ let commit_cid = commit.to_cid().unwrap();
376376+ let mut blocks = filtered;
377377+ blocks.insert(commit_cid, Bytes::from(commit.to_cbor().unwrap()));
378378+379379+ let parsed = ParsedCar {
380380+ root: commit_cid,
381381+ blocks,
382382+ };
383383+ (parsed, mst_root)
384384+ }
/// Check the two safety invariants for a single commit scenario:
///
/// - **No spurious death**: if a collection touched by `dels` still has
///   survivors in the after-tree, at least one survivor must be visible in
///   the proof CAR.
/// - **No spurious birth**: if a collection touched by `adds` already had
///   records before the commit (keys in the after-tree that are not being
///   created), at least one such preexisting key must be visible.
///
/// `after_keys` is the full key set after the commit and `visible` is the set
/// of keys reachable in the proof CAR. Each offending collection is reported
/// exactly once per invariant, deaths first, in order of first appearance.
///
/// Returns a list of violation descriptions (empty → all good).
///
/// # Panics
///
/// Panics if a path in `adds` or `dels` has no `/` separating the collection
/// from the record key.
fn check_invariants(
    adds: &[&str],
    dels: &[&str],
    after_keys: &[&str],
    visible: &[String],
) -> Vec<String> {
    let mut violations = collection_violations("death", "survivors", dels, after_keys, visible);
    violations.extend(collection_violations(
        "birth",
        "preexisting keys",
        adds,
        after_keys,
        visible,
    ));
    violations
}

/// Report every collection touched by `changed` that still has untouched keys
/// in `after_keys` while none of those untouched keys is in `visible`.
///
/// `event` and `noun` only shape the message text ("death"/"survivors" or
/// "birth"/"preexisting keys").
fn collection_violations(
    event: &str,
    noun: &str,
    changed: &[&str],
    after_keys: &[&str],
    visible: &[String],
) -> Vec<String> {
    let changed_set: HashSet<&str> = changed.iter().copied().collect();
    // Several changed paths may share a collection; report each one once.
    let mut seen: HashSet<&str> = HashSet::new();
    let mut violations = Vec::new();

    for path in changed {
        let (coll, _) = path
            .split_once('/')
            .unwrap_or_else(|| panic!("record path {path:?} has no '/' separator"));
        if !seen.insert(coll) {
            continue;
        }

        let prefix = format!("{coll}/");
        let untouched = |key: &str| key.starts_with(&prefix) && !changed_set.contains(key);

        let expected: Vec<&str> = after_keys
            .iter()
            .copied()
            .filter(|&key| untouched(key))
            .collect();
        if expected.is_empty() {
            continue; // nothing else lives in this collection: a genuine birth/death
        }

        if !visible.iter().any(|v| untouched(v.as_str())) {
            violations.push(format!(
                "spurious {event} possible: collection '{coll}' has {noun} \
                 {expected:?} but none appear in proof (visible={visible:?})"
            ));
        }
    }

    violations
}
455455+456456+ // ── atproto-interop-tests ─────────────────────────────────────────────────
457457+458458+ /// Leaf CID shared across all atproto-interop-tests fixtures.
459459+ const LEAF_CID_STR: &str = "bafyreie5cvv4h45feadgeuwhbcutmh6t2ceseocckahdoe6uat64zmz454";
460460+461461+ #[tokio::test]
462462+ async fn atproto_interop_fixtures() {
463463+ let fixtures_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
464464+ .join("../atproto-interop-tests/firehose/commit-proof-fixtures.json");
465465+466466+ if !fixtures_path.exists() {
467467+ println!("SKIP: atproto-interop-tests not found at {fixtures_path:?}");
468468+ return;
469469+ }
470470+471471+ let leaf_cid = parse_cid(LEAF_CID_STR);
472472+ let fixtures: Vec<serde_json::Value> =
473473+ serde_json::from_str(&std::fs::read_to_string(&fixtures_path).unwrap()).unwrap();
474474+475475+ let mut failures: Vec<String> = Vec::new();
476476+477477+ for fixture in &fixtures {
478478+ let comment = fixture["comment"].as_str().unwrap();
479479+480480+ let keys: Vec<&str> = fixture["keys"]
481481+ .as_array()
482482+ .unwrap()
483483+ .iter()
484484+ .map(|v| v.as_str().unwrap())
485485+ .collect();
486486+ let adds: Vec<&str> = fixture["adds"]
487487+ .as_array()
488488+ .unwrap()
489489+ .iter()
490490+ .map(|v| v.as_str().unwrap())
491491+ .collect();
492492+ let dels: Vec<&str> = fixture["dels"]
493493+ .as_array()
494494+ .unwrap()
495495+ .iter()
496496+ .map(|v| v.as_str().unwrap())
497497+ .collect();
498498+ let expected_root = fixture["rootAfterCommit"].as_str().unwrap();
499499+ let proof_cids: HashSet<String> = fixture["blocksInProof"]
500500+ .as_array()
501501+ .unwrap()
502502+ .iter()
503503+ .map(|v| v.as_str().unwrap().to_string())
504504+ .collect();
505505+506506+ // Build after-tree: keys + adds - dels
507507+ let dels_set: HashSet<&str> = dels.iter().copied().collect();
508508+ let mut after_keys: Vec<&str> = keys
509509+ .iter()
510510+ .copied()
511511+ .filter(|k| !dels_set.contains(k))
512512+ .collect();
513513+ after_keys.extend(adds.iter().copied());
514514+ after_keys.sort_unstable();
515515+516516+ let (parsed, computed_root) = build_proof_car(&after_keys, leaf_cid, &proof_cids).await;
517517+518518+ // Sanity: verify our MST matches the fixture.
519519+ if computed_root.to_string() != expected_root {
520520+ failures.push(format!(
521521+ "'{comment}': MST root mismatch \
522522+ (got {computed_root}, expected {expected_root}) — \
523523+ jacquard_repo may be incompatible with this fixture"
524524+ ));
525525+ continue;
526526+ }
527527+528528+ let visible = collect_visible_paths(parsed.clone()).unwrap();
529529+ let violations = check_invariants(&adds, &dels, &after_keys, &visible);
530530+ for v in violations {
531531+ failures.push(format!("'{comment}': {v}"));
532532+ }
533533+534534+ // Fixture 6 uses real app.bsky.* NSIDs — check the actual result.
535535+ if comment == "split with earlier leaves on same layer" {
536536+ let ops: Vec<_> = adds
537537+ .iter()
538538+ .map(|p| RepoOp {
539539+ action: CowStr::Owned("create".into()),
540540+ path: CowStr::Owned((*p).into()),
541541+ cid: None,
542542+ prev: None,
543543+ extra_data: Default::default(),
544544+ })
545545+ .chain(dels.iter().map(|p| RepoOp {
546546+ action: CowStr::Owned("delete".into()),
547547+ path: CowStr::Owned((*p).into()),
548548+ cid: None,
549549+ prev: None,
550550+ extra_data: Default::default(),
551551+ }))
552552+ .collect();
553553+554554+ let (born, died) = extract(&ops, parsed).unwrap();
555555+ // Adding to an existing app.bsky.feed.post collection:
556556+ // the adjacent key (3lon5cqsbwrj2) must be visible → no birth.
557557+ if !born.is_empty() || !died.is_empty() {
558558+ failures.push(format!(
559559+ "'{comment}': expected born=[], died=[] \
560560+ but got born={born:?}, died={died:?}; \
561561+ visible={visible:?}"
562562+ ));
563563+ }
564564+ }
565565+ }
566566+567567+ assert!(
568568+ failures.is_empty(),
569569+ "atproto-interop fixture violations:\n{}",
570570+ failures.join("\n")
571571+ );
572572+ }
573573+574574+ // ── mst-test-suite ────────────────────────────────────────────────────────
575575+576576+ /// Property test over all 16 384 exhaustive MST diff fixtures.
577577+ ///
578578+ /// Loads the "B" (after-commit) CAR directly, filters its blocks to
579579+ /// `created_nodes ∪ proof_nodes` (the blocks a compliant firehose sender
580580+ /// must include), and asserts that the "no spurious death" invariant holds:
581581+ /// if a collection still has surviving records in B, at least one survivor
582582+ /// must be visible in the partial proof CAR.
583583+ #[tokio::test]
584584+ async fn mst_suite_no_spurious_death() {
585585+ let suite_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../mst-test-suite");
586586+587587+ if !suite_dir.exists() {
588588+ println!("SKIP: mst-test-suite not found at {suite_dir:?}");
589589+ return;
590590+ }
591591+592592+ let fixture_dir = suite_dir.join("tests/diff/exhaustive");
593593+ let mut entries: Vec<_> = std::fs::read_dir(&fixture_dir)
594594+ .unwrap()
595595+ .filter_map(|e| e.ok())
596596+ .filter(|e| e.path().extension().map_or(false, |x| x == "json"))
597597+ .collect();
598598+ entries.sort_by_key(|e| e.path());
599599+600600+ let mut checked = 0usize;
601601+ let mut failures: Vec<String> = Vec::new();
602602+603603+ for entry in &entries {
604604+ let text = std::fs::read_to_string(entry.path()).unwrap();
605605+ let fixture: serde_json::Value = serde_json::from_str(&text).unwrap();
606606+ let results = &fixture["results"];
607607+608608+ let deleted_rpaths: Vec<&str> = results["record_ops"]
609609+ .as_array()
610610+ .unwrap()
611611+ .iter()
612612+ .filter(|op| op["new_value"].is_null())
613613+ .map(|op| op["rpath"].as_str().unwrap())
614614+ .collect();
615615+616616+ if deleted_rpaths.is_empty() {
617617+ continue; // no deletions → no possible spurious deaths
618618+ }
619619+620620+ // Load the B (after) CAR.
621621+ let b_car_rel = fixture["inputs"]["mst_b"].as_str().unwrap();
622622+ let b_car_path = suite_dir.join(b_car_rel.trim_start_matches("./"));
623623+ let b_car_bytes = std::fs::read(&b_car_path).unwrap();
624624+ let mut full_parsed = jacquard_repo::car::parse_car_bytes(&b_car_bytes)
625625+ .await
626626+ .unwrap();
627627+628628+ // Attach a fake commit so repo-stream can load it.
629629+ attach_fake_commit(&mut full_parsed);
630630+ let commit_cid = full_parsed.root;
631631+632632+ // Walk the full B tree to collect all surviving keys.
633633+ let all_keys: HashSet<String> = collect_visible_paths(full_parsed.clone())
634634+ .unwrap()
635635+ .into_iter()
636636+ .collect();
637637+638638+ // Build the proof-filtered partial CAR.
639639+ let proof_cid_set: HashSet<String> = results["created_nodes"]
640640+ .as_array()
641641+ .unwrap()
642642+ .iter()
643643+ .chain(results["proof_nodes"].as_array().unwrap().iter())
644644+ .map(|v| v.as_str().unwrap().to_string())
645645+ .collect();
646646+647647+ let partial_blocks: BTreeMap<IpldCid, Bytes> = full_parsed
648648+ .blocks
649649+ .into_iter()
650650+ .filter(|(cid, _)| proof_cid_set.contains(&cid.to_string()) || *cid == commit_cid)
651651+ .collect();
652652+ let partial_parsed = ParsedCar {
653653+ root: commit_cid,
654654+ blocks: partial_blocks,
655655+ };
656656+657657+ let visible: HashSet<String> = collect_visible_paths(partial_parsed)
658658+ .unwrap()
659659+ .into_iter()
660660+ .collect();
661661+662662+ // Invariant: for every deleted key, if the collection has survivors
663663+ // in the after-tree, at least one must be visible in the proof CAR.
664664+ for del in &deleted_rpaths {
665665+ let (coll, _) = del.split_once('/').unwrap();
666666+ let prefix = format!("{coll}/");
667667+668668+ let survivors: Vec<&str> = all_keys
669669+ .iter()
670670+ .filter(|k| k.starts_with(&prefix))
671671+ .map(String::as_str)
672672+ .collect();
673673+674674+ if survivors.is_empty() {
675675+ continue; // collection truly died; fine
676676+ }
677677+678678+ let visible_survivor = visible.iter().any(|v| v.starts_with(&prefix));
679679+ if !visible_survivor {
680680+ let name = entry.path();
681681+ let name = name.file_name().unwrap().to_string_lossy();
682682+ failures.push(format!(
683683+ "{name}: deleting '{del}' — collection '{coll}' has survivors \
684684+ {survivors:?} but none visible in proof (visible keys: {visible:?})"
685685+ ));
686686+ }
687687+ }
688688+689689+ checked += 1;
690690+ }
691691+692692+ assert!(
693693+ checked > 0,
694694+ "no fixtures with deletions found in {fixture_dir:?}"
695695+ );
696696+ assert!(
697697+ failures.is_empty(),
698698+ "{} spurious-death invariant violations across {checked} fixtures:\n{}",
699699+ failures.len(),
700700+ failures.join("\n")
701701+ );
702702+ println!(
703703+ "mst_suite_no_spurious_death: checked {checked} fixtures with deletions — all passed"
704704+ );
705705+ }
706706+}