···100100- [x] firehose websocket
101101 - [-] ~~ping/pong (unless jacquard is already doing it):~~ seems like no but we can skip it
102102 - [x] no-events-received timeout reconnect
103103+- [x] account status convergence: if we receive commits from apparently-inactive accounts, should we check upstream status to make sure we're not stale?
103104- [ ] resync short-circuit: tiny repos may actually return their entire CAR for getRecord
104105- [ ] commit CAR handling: generate a list of keys with gaps noted, to reliably detect missing adjacent keys
105105-- [ ] account status convergence: if we receive commits from apparently-inactive accounts, should we check upstream status to make sure we're not stale?
106106107107108108very much still todo but i'm getting tired
+3-2
readme.md
···66666767[`relay.fire.hose.cam`](https://relay.fire.hose.cam/) is one of [microcosm](https://www.microcosm.blue/)'s full-network relays. Lightrail works with a relay or PDS host upstream, or any other service that implements at least:
68686969-- `com.atproto.sync.subscribeRepos` and
7070-- `com.atproto.sync.listRepos`
6969+- `com.atproto.sync.subscribeRepos`,
7070+- `com.atproto.sync.listRepos`, and
7171+- `com.atproto.sync.getRepoStatus`
717272737374### Key configs
+2
src/main.rs
···139139 let db = db.clone();
140140 let host = subscribe_host.clone();
141141 let resolver = resolver.clone();
142142+ let client = client.clone();
142143 async move {
143144 let mut sub = firehose::Subscriber::new(
144145 host,
···146147 resolver,
147148 args.max_firehose_workers,
148149 Duration::from_secs(args.cursor_save_interval_secs),
150150+ client,
149151 );
150152 sub.run(token)
151153 .await
+60-1
src/sync/firehose/commit_event.rs
···4444 seq: i64,
4545 resolver: &Resolver,
4646 db: &DbRef,
4747+ client: &crate::http::ThrottledClient,
4748) -> crate::error::Result<()> {
4849 let did = commit.repo.clone();
4950···6768 let (info, prev, pds_mode) = match step2 {
6869 Step2Result::Proceed(info, prev, mode) => (info, prev, mode),
6970 Step2Result::Drop => return Ok(()),
7171+ Step2Result::InactiveAccount(info, _prev) => {
7272+ // Our local record says inactive, but we may have missed a
7373+ // reactivation #account event. Check upstream before dropping.
7474+ if !validate::upstream_says_active(&pds_host, &did, client).await {
7575+ metrics::counter!("lightrail_event_dropped_total",
7676+ "event_type" => "commit", "reason" => "account_inactive")
7777+ .increment(1);
7878+ debug!(did = %did, status = info.status.as_str(),
7979+ "commit dropped: account not active (confirmed upstream)");
8080+ return Ok(());
8181+ }
8282+ // Upstream says active — persist the reactivation and re-run
8383+ // the step-2 checks with the updated status.
8484+ metrics::counter!("lightrail_account_reactivated_total",
8585+ "trigger" => "commit")
8686+ .increment(1);
8787+ info!(did = %did, "account reactivated via upstream check (triggered by #commit)");
8888+ let db_ra = db.clone();
8989+ let did_ra = did.clone();
9090+ let rev_ra = commit.rev.clone();
9191+ let pds_host_ra = pds_host.clone();
9292+ let step2_retry = tokio::task::spawn_blocking(move || {
9393+ reactivate_and_recheck(&db_ra, &did_ra, &rev_ra, pds_host_ra)
9494+ })
9595+ .await??;
9696+ match step2_retry {
9797+ Step2Result::Proceed(i, p, m) => (i, p, m),
9898+ _ => return Ok(()),
9999+ }
100100+ }
70101 Step2Result::Buffer => {
71102 // Repo is mid-resync. Serialize the commit and buffer it so it can
72103 // be replayed after the resync fetch completes.
···318349enum Step2Result {
319350 /// All good — proceed with signature verification and the rest of the pipeline.
320351 Proceed(RepoInfo, Option<RepoPrev>, Sync11Mode),
321321- /// Drop this event (inactive account, desynchronized state, stale rev, etc.).
352352+ /// Drop this event (desynchronized state, stale rev, future rev, etc.).
322353 Drop,
323354 /// Repo is mid-resync. Caller should buffer the commit for replay.
324355 Buffer,
356356+ /// Account is locally inactive. Caller may check upstream before dropping.
357357+ InactiveAccount(RepoInfo, Option<RepoPrev>),
325358}
326359327360/// Step 2: load the repo state and decide how to handle this commit.
···375408 if info.state == RepoState::Resyncing {
376409 return Ok(Step2Result::Buffer);
377410 }
411411+ // Separate inactive check so the caller can probe upstream before dropping.
412412+ if !info.status.is_active() {
413413+ return Ok(Step2Result::InactiveAccount(info, prev));
414414+ }
378415 if validate::should_drop(&info, prev.as_ref(), rev, "commit", &did) {
379416 return Ok(Step2Result::Drop);
380417 }
···383420 None => Sync11Mode::Lenient,
384421 };
385422 Ok(Step2Result::Proceed(info, prev, mode))
423423+}
424424+425425+/// Write `AccountStatus::Active` to storage for `did`, then re-run the
426426+/// step-2 checks so the commit pipeline can continue as normal.
427427+fn reactivate_and_recheck(
428428+ db: &DbRef,
429429+ did: &Did<'static>,
430430+ rev: &Tid,
431431+ pds_host: Option<Host>,
432432+) -> crate::error::Result<Step2Result> {
433433+ let Some((mut info, _)) = storage::repo::get(db, did)? else {
434434+ return Ok(Step2Result::Drop);
435435+ };
436436+ info.status = AccountStatus::Active;
437437+ let mut batch = db.database.batch();
438438+ storage::repo::put_info_into(&mut batch, db, did, &info);
439439+ batch
440440+ .commit()
441441+ .map_err(Into::<crate::storage::StorageError>::into)?;
442442+ // Re-run the full step-2 check; the updated Active status means
443443+ // InactiveAccount will not be returned again.
444444+ check_step2_blocking(db, did.clone(), rev, pds_host)
386445}
387446388447/// Perform the storage-backed validation steps (6–9) and, if all pass,
+28-9
src/sync/firehose/event_dispatcher.rs
···5050 max_concurrent: usize,
5151 resolver: Arc<crate::identity::Resolver>,
5252 db: DbRef,
5353+ client: crate::http::ThrottledClient,
5354}
54555556struct PendingCommit {
···101102// ---------------------------------------------------------------------------
102103103104impl CommitDispatcher {
104104- pub fn new(resolver: Arc<crate::identity::Resolver>, db: DbRef, max_concurrent: usize) -> Self {
105105+ pub fn new(
106106+ resolver: Arc<crate::identity::Resolver>,
107107+ db: DbRef,
108108+ max_concurrent: usize,
109109+ client: crate::http::ThrottledClient,
110110+ ) -> Self {
105111 Self {
106112 queues: HashMap::new(),
107113 busy: HashSet::new(),
···111117 max_concurrent,
112118 resolver,
113119 db,
120120+ client,
114121 }
115122 }
116123···202209203210 let resolver = self.resolver.clone();
204211 let db = self.db.clone();
212212+ let client = self.client.clone();
205213 let did_for_result = did.clone();
206214 let seq = pending.seq();
207215 let handle = self.workers.spawn(async move {
208216 match pending {
209217 PendingWork::Commit(p) => {
210210- run_commit_event_worker(p.commit, p.seq, did_for_result, resolver, db).await
218218+ run_commit_event_worker(
219219+ p.commit,
220220+ p.seq,
221221+ did_for_result,
222222+ resolver,
223223+ db,
224224+ client,
225225+ )
226226+ .await
211227 }
212228 PendingWork::Sync(p) => {
213213- run_sync_event_worker(p.sync, p.seq, did_for_result, resolver, db).await
229229+ run_sync_event_worker(p.sync, p.seq, did_for_result, resolver, db, client)
230230+ .await
214231 }
215232 PendingWork::Account(p) => {
216233 run_account_event_worker(p.account, p.seq, did_for_result, db).await
···328345 did: Did<'static>,
329346 resolver: Arc<crate::identity::Resolver>,
330347 db: DbRef,
348348+ client: crate::http::ThrottledClient,
331349) -> CommitWorkerResult {
332332- let outcome = super::commit_event::process_commit_event(commit, seq, &resolver, &db)
350350+ let outcome = super::commit_event::process_commit_event(commit, seq, &resolver, &db, &client)
333351 .await
334352 .map_err(|e| e.to_string());
335353 CommitWorkerResult { did, seq, outcome }
···341359 did: Did<'static>,
342360 resolver: Arc<crate::identity::Resolver>,
343361 db: DbRef,
362362+ client: crate::http::ThrottledClient,
344363) -> CommitWorkerResult {
345345- let outcome = super::sync_event::process_sync_event(sync, &resolver, &db)
364364+ let outcome = super::sync_event::process_sync_event(sync, &resolver, &db, &client)
346365 .await
347366 .map_err(|e| e.to_string());
348367 CommitWorkerResult { did, seq, outcome }
···434453 async fn commits_for_same_did_are_sequential() {
435454 let db = crate::storage::open_temporary().unwrap();
436455 let resolver = make_resolver();
437437- let mut d = CommitDispatcher::new(resolver, db, 4);
456456+ let mut d = CommitDispatcher::new(resolver, db, 4, crate::http::build_client());
438457439458 let did: Did<'static> = Did::new_owned("did:plc:testsequential").unwrap();
440459 let c1 = {
···474493 async fn commits_for_different_dids_run_in_parallel() {
475494 let db = crate::storage::open_temporary().unwrap();
476495 let resolver = make_resolver();
477477- let mut d = CommitDispatcher::new(resolver, db, 4);
496496+ let mut d = CommitDispatcher::new(resolver, db, 4, crate::http::build_client());
478497479498 let did_a: Did<'static> = Did::new_owned("did:plc:testa").unwrap();
480499 let did_b: Did<'static> = Did::new_owned("did:plc:testb").unwrap();
···491510 async fn watermark_advances_after_completion() {
492511 let db = crate::storage::open_temporary().unwrap();
493512 let resolver = make_resolver();
494494- let mut d = CommitDispatcher::new(resolver, db, 4);
513513+ let mut d = CommitDispatcher::new(resolver, db, 4, crate::http::build_client());
495514496515 let did_a: Did<'static> = Did::new_owned("did:plc:testwma").unwrap();
497516 let did_b: Did<'static> = Did::new_owned("did:plc:testwmb").unwrap();
···512531 async fn stalled_seq_evicted_from_watermark() {
513532 let db = crate::storage::open_temporary().unwrap();
514533 let resolver = make_resolver();
515515- let mut d = CommitDispatcher::new(resolver, db, 4);
534534+ let mut d = CommitDispatcher::new(resolver, db, 4, crate::http::build_client());
516535517536 // Manually inject an old entry into outstanding without spawning a worker.
518537 let stale_instant = Instant::now() - std::time::Duration::from_secs(STALL_EVICT_SECS + 1);