···5959 let key = key(did, seq);
6060 db.ks.insert(key, cbor)?;
6161 db.stats.resync_buffer_count.fetch_add(1, Ordering::Relaxed);
6262+ metrics::gauge!("lightrail_resync_buffer_depth").increment(1);
6263 Ok(())
6364}
6465···8687 Ok(events)
8788}
88899090+/// Delete all buffered events for `did`, returning the number removed.
9191+///
9292+/// Called when a DID is permanently abandoned (e.g. max not-found retries
9393+/// exhausted) to prevent orphaned buffer entries from inflating the count.
9494+pub fn drain_buffer(db: &DbRef, did: Did<'_>) -> StorageResult<u64> {
9595+ let prefix = key_prefix(did);
9696+ let mut count: u64 = 0;
9797+ for guard in db.ks.prefix(&prefix) {
9898+ let (key_slice, _) = guard.into_inner()?;
9999+ db.ks.remove(key_slice.as_ref())?;
100100+ db.stats.resync_buffer_count.fetch_sub(1, Ordering::Relaxed);
101101+ metrics::gauge!("lightrail_resync_buffer_depth").decrement(1);
102102+ count += 1;
103103+ }
104104+ Ok(count)
105105+}
106106+89107/// Delete a single buffered event after it has been successfully applied.
90108///
91109/// Call this after confirming the event was processed, not before — if the
···97115 let key = key(did, seq);
98116 db.ks.remove(key)?;
99117 db.stats.resync_buffer_count.fetch_sub(1, Ordering::Relaxed);
118118+ metrics::gauge!("lightrail_resync_buffer_depth").decrement(1);
100119 Ok(())
101120}
102121
+1
src/sync/firehose/account_event.rs
···5858 },
5959 );
6060 let n_removed = if tombstone {
6161+ // TODO: we might actually want to queue tombstones and work in bounded-sized delete batches
6162 storage::collection_index::remove_all_into(&mut batch, &db, &account.did)?
6263 } else {
6364 0
+32-4
src/sync/resync/dispatcher.rs
···244244 metrics::counter!("lightrail_resync_completed_total",
245245 "outcome" => "rate_limited")
246246 .increment(1);
247247+ drain_stale_buffer(&did, &db).await;
247248 cooling_hosts.insert(pds.clone(), Instant::now() + RATE_LIMIT_COOLDOWN);
248249 warn!(did = %did, pds = %pds, cooldown_secs = RATE_LIMIT_COOLDOWN.as_secs(),
249250 "PDS rate-limited; cooling down");
···443444 WorkerOutcome::Retry { error, retry_count } => {
444445 metrics::counter!("lightrail_resync_completed_total", "outcome" => "retry")
445446 .increment(1);
447447+ drain_stale_buffer(&did, &db).await;
446448 transition_state(
447449 db.clone(),
448450 did.clone(),
···467469 WorkerOutcome::NotFound { retry_count } => {
468470 metrics::counter!("lightrail_resync_completed_total", "outcome" => "not_found")
469471 .increment(1);
472472+ drain_stale_buffer(&did, &db).await;
470473 let new_retry = retry_count.saturating_add(1);
471474 if let Some(delay) = not_found_backoff(new_retry) {
472475 let item = ResyncItem {
···540543 // Ack to avoid the entry accumulating across future resyncs.
541544 let did_ack = did.clone();
542545 let db_ack = db.clone();
543543- if let Err(e) = tokio::task::spawn_blocking(move || {
546546+ match tokio::task::spawn_blocking(move || {
544547 crate::storage::resync_buffer::ack_buffer_entry(&db_ack, did_ack, seq)
545548 })
546549 .await
547550 {
548548- warn!(did = %did, seq, error = %e, "failed to ack malformed buffered commit");
551551+ Ok(Ok(())) => {}
552552+ Ok(Err(e)) => {
553553+ warn!(did = %did, seq, error = %e, "failed to ack malformed buffered commit")
554554+ }
555555+ Err(e) => {
556556+ warn!(did = %did, seq, error = %e, "ack task panicked for malformed buffered commit")
557557+ }
549558 }
550559 continue;
551560 }
···561570562571 let did_ack = did.clone();
563572 let db_ack = db.clone();
564564- if let Err(e) = tokio::task::spawn_blocking(move || {
573573+ match tokio::task::spawn_blocking(move || {
565574 crate::storage::resync_buffer::ack_buffer_entry(&db_ack, did_ack, seq)
566575 })
567576 .await
568577 {
569569- warn!(did = %did, seq, error = %e, "failed to ack buffered commit");
578578+ Ok(Ok(())) => {}
579579+ Ok(Err(e)) => warn!(did = %did, seq, error = %e, "failed to ack buffered commit"),
580580+ Err(e) => warn!(did = %did, seq, error = %e, "ack task panicked for buffered commit"),
570581 }
571582 }
572583···627638 })
628639 .await??;
629640 Ok(())
641641+}
642642+643643+/// Drain all buffered firehose events for a DID after a failed resync.
644644+///
645645+/// The next resync attempt will establish fresh ground truth, so events
646646+/// buffered during the failed attempt are stale and should be discarded.
647647+async fn drain_stale_buffer(did: &Did<'static>, db: &DbRef) {
648648+ let did_owned = did.clone();
649649+ let db = db.clone();
650650+ match tokio::task::spawn_blocking(move || storage::resync_buffer::drain_buffer(&db, did_owned))
651651+ .await
652652+ {
653653+ Ok(Ok(0)) => {}
654654+ Ok(Ok(n)) => debug!(did = %did, drained = n, "drained stale buffer entries"),
655655+ Ok(Err(e)) => warn!(did = %did, error = %e, "failed to drain stale buffer"),
656656+ Err(e) => warn!(did = %did, error = %e, "drain task panicked"),
657657+ }
630658}
631659632660fn backoff(retry_count: u16) -> Duration {