lightweight com.atproto.sync.listReposByCollection
45
fork

Configure Feed

Select the types of activity you want to include in your feed.

non-active account states cleanup

more to clean up tbh

phil abd8c92c 35678889

+191 -43
+49 -2
src/storage/repo.rs
··· 60 60 Desynchronized, 61 61 Resyncing, 62 62 Active, 63 + Throttled, 63 64 Takendown, 64 65 Suspended, 65 66 Deactivated, 67 + Deleted, 66 68 Error, 67 69 /// The PDS returned a definitive "no repo for this DID" response. 68 70 /// The DID identity exists but the PDS doesn't have a repo for it — ··· 78 80 RepoState::Desynchronized => "desynchronized", 79 81 RepoState::Resyncing => "resyncing", 80 82 RepoState::Active => "active", 83 + RepoState::Throttled => "throttled", 81 84 RepoState::Takendown => "takendown", 82 85 RepoState::Suspended => "suspended", 83 86 RepoState::Deactivated => "deactivated", 87 + RepoState::Deleted => "deleted", 84 88 RepoState::Error => "error", 85 89 RepoState::NotFound => "notFound", 86 90 } ··· 92 96 "desynchronized" => RepoState::Desynchronized, 93 97 "resyncing" => RepoState::Resyncing, 94 98 "active" => RepoState::Active, 99 + "throttled" => RepoState::Throttled, 95 100 "takendown" => RepoState::Takendown, 96 101 "suspended" => RepoState::Suspended, 97 102 "deactivated" => RepoState::Deactivated, 103 + "deleted" => RepoState::Deleted, 98 104 "error" => RepoState::Error, 99 105 "notFound" => RepoState::NotFound, 100 106 _ => return None, 101 107 }) 102 108 } 109 + 110 + pub fn to_account_inactive(&self) -> Option<AccountStatus> { 111 + match self { 112 + RepoState::Pending => None, 113 + RepoState::Desynchronized => None, 114 + RepoState::Resyncing => None, 115 + RepoState::Active => None, 116 + RepoState::Throttled => Some(AccountStatus::Throttled), 117 + RepoState::Takendown => Some(AccountStatus::Takendown), 118 + RepoState::Suspended => Some(AccountStatus::Suspended), 119 + RepoState::Deactivated => Some(AccountStatus::Deactivated), 120 + RepoState::Deleted => Some(AccountStatus::Deleted), 121 + RepoState::Error => None, 122 + RepoState::NotFound => None, 123 + } 124 + } 103 125 } 104 126 105 127 /// tap's "AccountStatus" type ··· 107 129 #[derive(Debug, Clone, PartialEq, Eq)] 108 130 pub enum AccountStatus { 109 131 Active, 132 + Throttled, 110 133 Takendown, 111 134 Suspended, 112 135 Deactivated, 136 + Desynchronized, 113 137 Deleted, 114 138 } 115 139 ··· 118 142 matches!(self, AccountStatus::Active) 119 143 } 120 144 121 - pub fn status(&self) -> Option<&str> { 145 + pub fn status(&self) -> Option<&'static str> { 122 146 match self { 123 147 AccountStatus::Active => None, 148 + AccountStatus::Throttled => Some("throttled"), 124 149 AccountStatus::Takendown => Some("takendown"), 125 150 AccountStatus::Suspended => Some("suspended"), 126 151 AccountStatus::Deactivated => Some("deactivated"), 152 + AccountStatus::Desynchronized => Some("desynchronized"), 127 153 AccountStatus::Deleted => Some("deleted"), 128 154 } 129 155 } 130 156 131 - pub fn as_str(&self) -> &str { 157 + pub fn as_str(&self) -> &'static str { 132 158 match self { 133 159 AccountStatus::Active => "active", 160 + AccountStatus::Throttled => "throttled", 134 161 AccountStatus::Takendown => "takendown", 135 162 AccountStatus::Suspended => "suspended", 136 163 AccountStatus::Deactivated => "deactivated", 164 + AccountStatus::Desynchronized => "desynchronized", 137 165 AccountStatus::Deleted => "deleted", 138 166 } 139 167 } 140 168 169 + /// for inactive accounts, map what the repo state should be 170 + /// 171 + /// (is this duplicated setup a bit silly? probably) 172 + /// 173 + /// panics: if the account status is active 174 + pub fn to_repo_inactive(&self) -> Option<RepoState> { 175 + match self { 176 + AccountStatus::Active => None, 177 + AccountStatus::Throttled => Some(RepoState::Throttled), 178 + AccountStatus::Takendown => Some(RepoState::Takendown), 179 + AccountStatus::Suspended => Some(RepoState::Suspended), 180 + AccountStatus::Deactivated => Some(RepoState::Deactivated), 181 + AccountStatus::Desynchronized => Some(RepoState::Desynchronized), 182 + AccountStatus::Deleted => Some(RepoState::Deleted), 183 + } 184 + } 185 + 141 186 fn from_str(s: &str) -> Option<Self> { 142 187 Some(match s { 143 188 "active" => AccountStatus::Active, 189 + "throttled" => AccountStatus::Throttled, 144 190 "takendown" => AccountStatus::Takendown, 145 191 "suspended" => AccountStatus::Suspended, 146 192 "deactivated" => AccountStatus::Deactivated, 193 + "desynchronized" => AccountStatus::Desynchronized, 147 194 "deleted" => AccountStatus::Deleted, 148 195 _ => return None, 149 196 })
+137 -32
src/sync/backfill.rs
··· 5 5 //! on, so the walk can be safely resumed after a restart. Already-known repos 6 6 //! (any state) are skipped — the dispatcher's retry mechanism handles repos 7 7 //! that need re-syncing. 8 + //! 9 + //! Accounts listed as non-active (takendown, suspended, deactivated, deleted) 10 + //! have their status recorded without queuing a resync — there's nothing to 11 + //! fetch. 8 12 9 13 use std::sync::Arc; 10 14 11 - use jacquard_api::com_atproto::sync::list_repos::ListRepos; 15 + use jacquard_api::com_atproto::sync::list_repos::{ListRepos, RepoStatus}; 12 16 use jacquard_common::{ 13 17 error::ClientErrorKind, 14 18 types::string::Did, ··· 17 21 }; 18 22 use tokio::time::Duration; 19 23 use tokio_util::sync::CancellationToken; 20 - use tracing::{error, info, trace, warn}; 24 + use tracing::{debug, error, info, trace, warn}; 21 25 22 26 use crate::{ 23 27 error::Result, ··· 26 30 storage::{ 27 31 self, DbRef, 28 32 backfill_progress::{BackfillProgress, get, set}, 33 + repo::{AccountStatus, RepoInfo, RepoState}, 29 34 }, 30 35 sync::discovery_queue::{DiscoveryItem, DiscoveryQueue}, 31 36 util::TokenExt, ··· 39 44 /// Maximum consecutive transient failures before giving up on this host. 40 45 const MAX_PAGE_FAILURES: u32 = 3; 41 46 47 + // --------------------------------------------------------------------------- 48 + // Listed account state 49 + // --------------------------------------------------------------------------- 50 + 51 + /// Map the `active`/`status` fields from a listRepos `Repo` entry to a 52 + /// [`ListedAccountState`]. Follows the same mapping as firehose `#account` 53 + /// events in `account_event.rs`. 54 + fn classify_repo(active: Option<bool>, status: &Option<RepoStatus<'_>>) -> RepoState { 55 + // active-as-fallback is safe: 56 + if active.unwrap_or(true) { 57 + return RepoState::Active; 58 + } 59 + let Some(jac_status) = status else { 60 + return RepoState::Active; 61 + }; 62 + match jac_status { 63 + RepoStatus::Takendown => RepoState::Takendown, 64 + RepoStatus::Suspended => RepoState::Suspended, 65 + RepoStatus::Deleted => RepoState::Deleted, 66 + RepoStatus::Deactivated => RepoState::Deactivated, 67 + RepoStatus::Desynchronized => RepoState::Desynchronized, 68 + RepoStatus::Throttled => RepoState::Throttled, 69 + RepoStatus::Other(_) => RepoState::Error, 70 + } 71 + } 72 + 73 + // --------------------------------------------------------------------------- 74 + // Main loop 75 + // --------------------------------------------------------------------------- 76 + 42 77 /// Walk `listRepos` on `host` and enqueue new repos for resync. 43 78 /// 44 79 /// When `validate` is true (deep crawl / untrusted PDS), DIDs are verified to 45 - /// actually live on `host` and non-matching ones are rejected. The resolver is 46 - /// always used for PDS-host discovery so that enqueue timestamps can be 47 - /// staggered per actual PDS, spreading work across hosts in the queue. 80 + /// actually live on `host` and non-matching ones are rejected. Validated DIDs 81 + /// are considered authoritative for account status — if the PDS says an account 82 + /// is deactivated, we trust it even if our local state is Active. Without 83 + /// validation (relay backfill), we only write non-active status for accounts 84 + /// we haven't seen before, to avoid overwriting Active state with stale relay 85 + /// data (e.g. accounts that migrated off and appear deactivated on the old PDS). 48 86 pub async fn run( 49 87 host: Host, 50 88 db: DbRef, ··· 95 133 dids 96 134 }; 97 135 98 - // Resolve each DID's actual PDS host for per-host stagger. 136 + // Resolve each DID's actual PDS host for discovery queue routing. 99 137 // Cache hits are free; misses fall back to the listed host. 100 138 // 101 139 // TODO: ...this is basically redundant with validate_dids now? 102 - let dids_with_hosts: Vec<(Did<'static>, Arc<Url>)> = { 140 + let dids_with_hosts: Vec<(Did<'static>, Arc<Url>, RepoState)> = { 103 141 let mut out = Vec::with_capacity(dids.len()); 104 - for did in dids { 142 + for (did, account_state) in dids { 105 143 let Some(res) = token.run(resolver.resolve(&did)).await else { 106 144 return Ok(false); // cancelled 107 145 }; 108 146 let pds_host = match res { 109 147 Ok(resolved) => resolved.pds.clone(), 110 148 Err(e) => { 111 - error!(did = %did, error = %e, "failed to resolve host for validated did; not enqueuing resync"); 149 + error!(did = %did, error = %e, "failed to resolve host for validated did; skipping"); 112 150 continue; 113 151 } 114 152 }; 115 - out.push((did, pds_host)); 153 + out.push((did, pds_host, account_state)); 116 154 } 117 155 out 118 156 }; 119 157 120 158 let progress_cursor = next_cursor.clone().unwrap_or_default(); 121 - let (page_queued, new_items) = { 159 + let (page_queued, page_inactive, new_items) = { 122 160 let db = db.clone(); 123 161 let host = host.clone(); 124 162 tokio::task::spawn_blocking(move || { 125 - persist_page(&db, &host, dids_with_hosts, progress_cursor) 163 + persist_page(&db, &host, dids_with_hosts, progress_cursor, validate) 126 164 }) 127 165 .await?? 128 166 }; ··· 144 182 host = %host, 145 183 page_repos = page_len, 146 184 page_queued, 185 + page_inactive, 147 186 total_queued, 148 187 next_cursor = next_cursor.as_deref().unwrap_or("(done)"), 149 188 "backfill page" ··· 192 231 cursor: Option<&str>, 193 232 client: &ThrottledClient, 194 233 token: &CancellationToken, 195 - ) -> Option<(Vec<Did<'static>>, Option<String>)> { 234 + ) -> Option<(Vec<(Did<'static>, RepoState)>, Option<String>)> { 196 235 let req = ListRepos { 197 236 cursor: cursor.map(Into::into), 198 237 limit: Some(PAGE_LIMIT), ··· 216 255 Ok(resp) => match resp.parse() { 217 256 Ok(out) => { 218 257 let next = out.cursor.as_deref().map(str::to_owned); 219 - let dids = out 258 + let listed = out 220 259 .repos 221 260 .into_iter() 222 - .map(|r| r.did.into_static()) 261 + .map(|r| { 262 + let state = classify_repo(r.active, &r.status); 263 + (r.did.into_static(), state) 264 + }) 223 265 .collect::<Vec<_>>(); 224 - Some((dids, next)) 266 + Some((listed, next)) 225 267 } 226 268 Err(e) => { 227 269 warn!(error = %e, host = %host, "listRepos response parse failed"); ··· 253 295 254 296 /// Filter `dids` to those whose resolved PDS endpoint matches `host`. 255 297 /// 256 - /// Bypasses the identity cache to avoid populating it with entries that may 257 298 /// Returns early with whatever has been validated so far if the token is cancelled. 258 299 async fn validate_dids( 259 - dids: Vec<Did<'static>>, 300 + dids: Vec<(Did<'static>, RepoState)>, 260 301 resolver: &Resolver, 261 302 host: &Host, 262 303 token: &CancellationToken, 263 - ) -> Vec<Did<'static>> { 304 + ) -> Vec<(Did<'static>, RepoState)> { 264 305 let host_str = host.to_string(); 265 306 let mut valid = Vec::with_capacity(dids.len()); 266 - for did in dids { 307 + for (did, account_state) in dids { 267 308 let Some(r) = token.run(resolver.resolve(&did)).await else { 268 309 break; 269 310 }; 270 311 match r { 271 312 Ok(resolved) if resolved.pds.host_str() == Some(host_str.as_str()) => { 272 - valid.push(did); 313 + valid.push((did, account_state)); 273 314 } 274 315 Ok(resolved) => { 275 316 metrics::counter!("lightrail_backfill_did_rejected_total", "reason" => "pds_mismatch") ··· 296 337 297 338 /// Insert newly-seen DIDs into the repo table and persist the backfill cursor. 298 339 /// 299 - /// Returns `(count_inserted, new_items)` — the caller pushes `new_items` into 300 - /// the in-memory discovery queue (async, with backpressure). 340 + /// Active accounts are returned for the caller to push into the discovery 341 + /// queue. Non-active accounts have their status written directly (no resync 342 + /// needed). When `authoritative` is false (relay backfill), non-active status 343 + /// is only written for accounts that have no existing Active record — we don't 344 + /// overwrite a locally-Active account with stale relay data. 345 + /// 346 + /// Returns `(active_count, inactive_count, new_active_items)`. 301 347 fn persist_page( 302 348 db: &DbRef, 303 349 host: &Host, 304 - items: Vec<DidWithPds>, 350 + items: Vec<(Did<'static>, Arc<Url>, RepoState)>, 305 351 progress_cursor: String, 306 - ) -> Result<(u64, Vec<DidWithPds>)> { 307 - let mut new_items: Vec<DidWithPds> = Vec::new(); 308 - for (did, pds) in items { 309 - let newly_inserted = storage::repo::ensure_repo(db, &did)?; 310 - if newly_inserted { 311 - db.stats.repos_queued_total.fetch_add(1, Ordering::Relaxed); 312 - new_items.push((did, pds)); 352 + authoritative: bool, 353 + ) -> Result<(u64, u64, Vec<DidWithPds>)> { 354 + let mut new_active: Vec<DidWithPds> = Vec::new(); 355 + let mut inactive_count: u64 = 0; 356 + for (did, pds, repo_state) in items { 357 + if let Some(deactiveated_account_status) = repo_state.to_account_inactive() { 358 + if write_inactive_status( 359 + db, 360 + &did, 361 + deactiveated_account_status, 362 + repo_state, 363 + authoritative, 364 + )? { 365 + inactive_count += 1; 366 + } 367 + } else { 368 + let newly_inserted = storage::repo::ensure_repo(db, &did)?; 369 + if newly_inserted { 370 + db.stats.repos_queued_total.fetch_add(1, Ordering::Relaxed); 371 + new_active.push((did, pds)); 372 + } 313 373 } 314 374 } 315 375 // Persist progress before advancing so a crash during the next ··· 322 382 completed_at: None, 323 383 }, 324 384 )?; 325 - Ok((new_items.len() as u64, new_items)) 385 + Ok((new_active.len() as u64, inactive_count, new_active)) 386 + } 387 + 388 + /// Write a non-active account status from a listRepos entry. 389 + /// 390 + /// Returns `true` if the status was written, `false` if skipped. 391 + /// 392 + /// When `authoritative` is true (deep crawl — PDS confirmed via identity 393 + /// resolution), the status is always written. When false (relay backfill), 394 + /// we only write if the existing local status is not Active, to avoid 395 + /// overwriting with stale data (e.g. an account that migrated off this PDS 396 + /// and shows as deactivated on the old host). 397 + fn write_inactive_status( 398 + db: &DbRef, 399 + did: &Did<'_>, 400 + status: AccountStatus, 401 + state: RepoState, 402 + authoritative: bool, 403 + ) -> Result<bool> { 404 + let existing = storage::repo::get_status(db, did)?; 405 + match existing { 406 + Some(ref existing_status) if existing_status.is_active() && !authoritative => { 407 + debug!( 408 + did = %did, 409 + listed_status = status.as_str(), 410 + "skipping non-active status from non-authoritative source; local account is active" 411 + ); 412 + return Ok(false); 413 + } 414 + Some(ref existing_status) if *existing_status == status => { 415 + return Ok(false); // already matches 416 + } 417 + _ => {} 418 + } 419 + let info = RepoInfo { 420 + state, 421 + status: status.clone(), 422 + error: None, 423 + }; 424 + storage::repo::put_info(db, did, &info)?; 425 + metrics::counter!( 426 + "lightrail_backfill_inactive_total", 427 + "status" => status.as_str() 428 + ) 429 + .increment(1); 430 + Ok(true) 326 431 }
+5 -9
src/sync/firehose/account_event.rs
··· 17 17 } else { 18 18 use jacquard_api::com_atproto::sync::subscribe_repos::AccountStatus; 19 19 match account.status.as_ref() { 20 + Some(AccountStatus::Throttled) => storage::repo::AccountStatus::Throttled, 20 21 Some(AccountStatus::Takendown) => storage::repo::AccountStatus::Takendown, 21 22 Some(AccountStatus::Suspended) => storage::repo::AccountStatus::Suspended, 22 23 Some(AccountStatus::Deactivated) => storage::repo::AccountStatus::Deactivated, ··· 25 26 } 26 27 }; 27 28 28 - let tombstone = matches!( 29 - new_status, 30 - storage::repo::AccountStatus::Takendown | storage::repo::AccountStatus::Deleted 31 - ); 29 + let tombstone = new_status == storage::repo::AccountStatus::Deleted; 32 30 33 31 // For active accounts: preserve the current repo state — a #sync event 34 32 // will follow if the repo needs to be refreshed. For inactive: set the ··· 38 36 .map(|(info, _)| info.state) 39 37 .unwrap_or(storage::repo::RepoState::Active) 40 38 } else { 41 - match &new_status { 42 - storage::repo::AccountStatus::Suspended => storage::repo::RepoState::Suspended, 43 - storage::repo::AccountStatus::Deactivated => storage::repo::RepoState::Deactivated, 44 - _ => storage::repo::RepoState::Takendown, 45 - } 39 + new_status 40 + .to_repo_inactive() 41 + .unwrap_or(storage::repo::RepoState::Error) 46 42 }; 47 43 48 44 // Atomically: update repo state + remove index entries if tombstoned.