lightweight com.atproto.sync.listReposByCollection
45
fork

Configure Feed

Select the types of activity you want to include in your feed.

initial mortality spin-out

phil 89ce75fe cec729ac

+283 -130
+3 -1
hacking.md
··· 91 91 - [ ] *don't* allow non-validating commits that look like sync1.1 92 92 - [ ] ratchet by PDS host: be lenient if we have never seen a sync1.1-looking commit, always strict after we see one. 93 93 - [ ] account status convergence: if we receive commits from apparently-inactive accounts, should we check upstream status to make sure we're not stale? 94 + - [ ] split the keyspace: put the rbc/cbr indexes on a second keyspace with larger block size, expect hits on main keyspace 95 + - [ ] websocket ping/pong (unless jacquard is already doing it) 96 + - [ ] websocket no-events-received timeout reconnect 94 97 95 98 very much still todo but i'm getting tired 96 99 - [ ] multi-relay listener ··· 98 101 - [ ] admin view of backfill state etc 99 102 - [ ] vanity stats for optimizations, like how many in-flight repos were saved from resync due to high-water-mark firehose cursor persistence 100 103 - [ ] if the upstream is a PDS (check with describeServer?) then only accept events for DIDs that have it as their PDS 101 - - [ ] split the keyspace: put the rbc/cbr indexes on a second keyspace with larger block size, expect hits on main keyspace 102 104 103 105 ## some choices 104 106
+2
src/error.rs
··· 11 11 TaskPanic(#[from] tokio::task::JoinError), 12 12 #[error(transparent)] 13 13 Resync(#[from] crate::sync::resync::ResyncError), 14 + #[error("collection mortality: {0}")] 15 + MstMortality(#[from] crate::mst::mortality::MstMortalityError), 14 16 #[error("{0}")] 15 17 Other(String), 16 18 }
+1
src/mst/mod.rs
··· 1 1 //! mst (merkle search tree, the atproto repo structure) utils 2 2 3 3 pub mod collections; 4 + pub mod mortality;
+270
src/mst/mortality.rs
··· 1 + //! Collection birth/death detection from a partial CAR slice. 2 + //! 3 + //! ATProto firehose commit CARs include MST proof nodes for each changed key. 4 + //! Those proof nodes usually include the immediately adjacent keys (left and 5 + //! right neighbours in sorted order) for each change. By walking the partial 6 + //! CAR we collect all visible leaf keys: the changed keys themselves plus 7 + //! (usually) their in-collection neighbours. 8 + //! 9 + //! A collection **dies** when every visible key in it is being deleted — all 10 + //! included adjacent keys are not in its collection. 11 + //! 12 + //! A collection is **born** when every visible key in it is being created (no 13 + //! neighbours share its collection name). Rarely a neighbour in the same 14 + //! collection might be present in the repo but not in the proof, producing a 15 + //! spurious re-birth; that is harmless (the index entry already exists). 16 + //! 17 + //! Multi-op commits are handled because all ops are considered together: a key 18 + //! is a "survivor" only if it is visible AND not in the deleted set. 19 + 20 + use std::collections::HashSet; 21 + 22 + use jacquard_api::com_atproto::sync::subscribe_repos::RepoOp; 23 + use jacquard_common::types::string::Nsid; 24 + use repo_stream::{DriverBuilder, JacquardLoadError, WalkError}; 25 + 26 + #[derive(Debug, thiserror::Error)] 27 + pub enum MstMortalityError { 28 + #[error("failed to load CAR: {0}")] 29 + Load(#[from] JacquardLoadError), 30 + #[error("MST walk error: {0}")] 31 + Walk(#[from] WalkError), 32 + } 33 + 34 + type Result<T> = std::result::Result<T, MstMortalityError>; 35 + 36 + /// Walk the partial CAR's MST to detect which collections are newly added 37 + /// ("born") or fully removed ("died") by this commit. 38 + /// 39 + /// Returns `(born, died)` — both lists may be empty. 
40 + pub fn extract( 41 + ops: &[RepoOp<'_>], 42 + parsed: jacquard_repo::car::reader::ParsedCar, 43 + ) -> Result<(Vec<Nsid<'static>>, Vec<Nsid<'static>>)> { 44 + // ── Build create/delete path sets ──────────────────────────────────────── 45 + let mut created: HashSet<String> = HashSet::new(); 46 + let mut deleted: HashSet<String> = HashSet::new(); 47 + for op in ops { 48 + match op.action.as_ref() { 49 + "create" => { 50 + created.insert(op.path.to_string()); 51 + } 52 + "delete" => { 53 + deleted.insert(op.path.to_string()); 54 + } 55 + _ => {} // updates don't affect collection mortality 56 + } 57 + } 58 + 59 + if created.is_empty() && deleted.is_empty() { 60 + return Ok((vec![], vec![])); 61 + } 62 + 63 + // ── Walk the partial CAR's MST to collect visible leaf keys ────────────── 64 + // 65 + // next_keys() silently skips subtrees whose MST node blocks are absent, 66 + // giving us all leaves reachable through blocks that ARE in the CAR — 67 + // exactly the proof nodes for keys adjacent to the changes. 68 + let mut car = DriverBuilder::new().load_jacquard_parsed_car(parsed)?; 69 + let mut visible: Vec<String> = Vec::new(); 70 + while let Some((path, _)) = car.next_keys()? 
{ 71 + visible.push(path.to_string()); 72 + } 73 + 74 + // ── Check collection death (all visible keys in C are being deleted) ────── 75 + let deleted_collections: HashSet<&str> = deleted 76 + .iter() 77 + .filter_map(|p| p.split_once('/').map(|(c, _)| c)) 78 + .collect(); 79 + 80 + let mut died: Vec<Nsid<'static>> = Vec::new(); 81 + for coll in deleted_collections { 82 + let prefix = format!("{coll}/"); 83 + let has_survivor = visible 84 + .iter() 85 + .any(|k| k.starts_with(&prefix) && !deleted.contains(k.as_str())); 86 + if !has_survivor && let Ok(nsid) = Nsid::new_owned(coll) { 87 + died.push(nsid); 88 + } 89 + } 90 + 91 + // ── Check collection birth (all visible keys in C are being created) ────── 92 + let created_collections: HashSet<&str> = created 93 + .iter() 94 + .filter_map(|p| p.split_once('/').map(|(c, _)| c)) 95 + .collect(); 96 + 97 + let mut born: Vec<Nsid<'static>> = Vec::new(); 98 + for coll in created_collections { 99 + let prefix = format!("{coll}/"); 100 + let has_preexisting = visible 101 + .iter() 102 + .any(|k| k.starts_with(&prefix) && !created.contains(k.as_str())); 103 + if !has_preexisting && let Ok(nsid) = Nsid::new_owned(coll) { 104 + born.push(nsid); 105 + } 106 + } 107 + 108 + Ok((born, died)) 109 + } 110 + 111 + #[cfg(test)] 112 + mod tests { 113 + use super::*; 114 + use std::sync::Arc; 115 + 116 + use bytes::Bytes; 117 + use jacquard_common::CowStr; 118 + use jacquard_common::types::string::Did; 119 + use jacquard_common::types::tid::Tid; 120 + use jacquard_repo::commit::Commit; 121 + use jacquard_repo::{BlockStore, MemoryBlockStore, Mst, car::write_car_bytes}; 122 + 123 + fn nsid(s: &str) -> Nsid<'static> { 124 + Nsid::new_owned(s).unwrap() 125 + } 126 + 127 + fn op_create(path: &'static str) -> RepoOp<'static> { 128 + RepoOp { 129 + action: CowStr::Borrowed("create"), 130 + path: CowStr::Borrowed(path), 131 + cid: None, 132 + prev: None, 133 + extra_data: Default::default(), 134 + } 135 + } 136 + 137 + fn op_delete(path: 
&'static str) -> RepoOp<'static> { 138 + RepoOp { 139 + action: CowStr::Borrowed("delete"), 140 + path: CowStr::Borrowed(path), 141 + cid: None, 142 + prev: None, 143 + extra_data: Default::default(), 144 + } 145 + } 146 + 147 + /// Build a ParsedCar containing all the given MST keys. 148 + async fn make_parsed_car(keys: &[&str]) -> jacquard_repo::car::reader::ParsedCar { 149 + let storage = Arc::new(MemoryBlockStore::new()); 150 + let mut mst = Mst::new(storage.clone()); 151 + let dummy_cid = storage.put(b"record").await.unwrap(); 152 + for key in keys { 153 + mst = mst.add(key, dummy_cid).await.unwrap(); 154 + } 155 + let (mst_root, mut blocks) = mst.collect_blocks().await.unwrap(); 156 + let commit = Commit { 157 + did: Did::new_owned("did:web:example.com").unwrap(), 158 + version: 3, 159 + data: mst_root, 160 + rev: Tid::now_0(), 161 + prev: None, 162 + sig: Bytes::from(vec![0u8; 64]), 163 + }; 164 + let commit_cid = commit.to_cid().unwrap(); 165 + let commit_cbor = Bytes::from(commit.to_cbor().unwrap()); 166 + blocks.insert(commit_cid, commit_cbor); 167 + let car_bytes = write_car_bytes(commit_cid, blocks).await.unwrap(); 168 + jacquard_repo::car::parse_car_bytes(&car_bytes) 169 + .await 170 + .unwrap() 171 + } 172 + 173 + // --------------------------------------------------------------------------- 174 + // Basic cases 175 + // --------------------------------------------------------------------------- 176 + 177 + #[tokio::test] 178 + async fn empty_ops_returns_empty() { 179 + let parsed = make_parsed_car(&["app.bsky.feed.post/abc123"]).await; 180 + let (born, died) = extract(&[], parsed).unwrap(); 181 + assert!(born.is_empty()); 182 + assert!(died.is_empty()); 183 + } 184 + 185 + #[tokio::test] 186 + async fn update_ops_ignored() { 187 + let parsed = make_parsed_car(&["app.bsky.feed.post/abc123"]).await; 188 + let ops = [RepoOp { 189 + action: CowStr::Borrowed("update"), 190 + path: CowStr::Borrowed("app.bsky.feed.post/abc123"), 191 + cid: None, 192 + prev: 
None, 193 + extra_data: Default::default(), 194 + }]; 195 + let (born, died) = extract(&ops, parsed).unwrap(); 196 + assert!(born.is_empty()); 197 + assert!(died.is_empty()); 198 + } 199 + 200 + // --------------------------------------------------------------------------- 201 + // Birth detection 202 + // --------------------------------------------------------------------------- 203 + 204 + #[tokio::test] 205 + async fn new_collection_is_born() { 206 + // CAR contains only the created key → no preexisting neighbours. 207 + let parsed = make_parsed_car(&["app.bsky.feed.post/abc123"]).await; 208 + let ops = [op_create("app.bsky.feed.post/abc123")]; 209 + let (born, died) = extract(&ops, parsed).unwrap(); 210 + assert_eq!(born, vec![nsid("app.bsky.feed.post")]); 211 + assert!(died.is_empty()); 212 + } 213 + 214 + #[tokio::test] 215 + async fn birth_suppressed_when_preexisting_key_visible() { 216 + // CAR contains both the created key and a preexisting sibling. 217 + let parsed = 218 + make_parsed_car(&["app.bsky.feed.post/abc123", "app.bsky.feed.post/def456"]).await; 219 + let ops = [op_create("app.bsky.feed.post/abc123")]; 220 + let (born, died) = extract(&ops, parsed).unwrap(); 221 + assert!(born.is_empty()); 222 + assert!(died.is_empty()); 223 + } 224 + 225 + // --------------------------------------------------------------------------- 226 + // Death detection 227 + // --------------------------------------------------------------------------- 228 + 229 + #[tokio::test] 230 + async fn collection_dies_when_last_key_deleted() { 231 + let parsed = make_parsed_car(&["app.bsky.feed.post/abc123"]).await; 232 + let ops = [op_delete("app.bsky.feed.post/abc123")]; 233 + let (born, died) = extract(&ops, parsed).unwrap(); 234 + assert!(born.is_empty()); 235 + assert_eq!(died, vec![nsid("app.bsky.feed.post")]); 236 + } 237 + 238 + #[tokio::test] 239 + async fn death_suppressed_when_survivor_visible() { 240 + // CAR shows a sibling key that is not being deleted. 
241 + let parsed = 242 + make_parsed_car(&["app.bsky.feed.post/abc123", "app.bsky.feed.post/def456"]).await; 243 + let ops = [op_delete("app.bsky.feed.post/abc123")]; 244 + let (born, died) = extract(&ops, parsed).unwrap(); 245 + assert!(born.is_empty()); 246 + assert!(died.is_empty()); 247 + } 248 + 249 + // --------------------------------------------------------------------------- 250 + // Multi-collection / multi-op 251 + // --------------------------------------------------------------------------- 252 + 253 + #[tokio::test] 254 + async fn birth_and_death_in_same_commit() { 255 + let parsed = make_parsed_car(&[ 256 + "app.bsky.feed.post/abc123", // will be created (born) 257 + "app.bsky.graph.follow/old", // will be deleted (died) 258 + ]) 259 + .await; 260 + let ops = [ 261 + op_create("app.bsky.feed.post/abc123"), 262 + op_delete("app.bsky.graph.follow/old"), 263 + ]; 264 + let (mut born, mut died) = extract(&ops, parsed).unwrap(); 265 + born.sort_unstable(); 266 + died.sort_unstable(); 267 + assert_eq!(born, vec![nsid("app.bsky.feed.post")]); 268 + assert_eq!(died, vec![nsid("app.bsky.graph.follow")]); 269 + } 270 + }
+7 -129
src/sync/firehose/commit_event.rs
··· 22 22 use jacquard_api::com_atproto::sync::subscribe_repos::{Commit, RepoOp}; 23 23 use jacquard_common::types::cid::CidLink; 24 24 use jacquard_common::types::{string::Did, string::Nsid, tid::Tid}; 25 - use jacquard_repo::mst::{CursorPosition, MstCursor, VerifiedWriteOp}; 25 + use jacquard_repo::mst::VerifiedWriteOp; 26 26 use jacquard_repo::{MemoryBlockStore, Mst}; 27 27 use tracing::{debug, info, warn}; 28 28 ··· 85 85 } 86 86 }; 87 87 88 - // Build the block store once; the MST is cheap to clone (just a CID + 89 - // Arc to the store). Both step 5 and collection mortality share it. 88 + // Clone the parsed CAR before consuming its blocks into the MST block store. 89 + // not super-cheap, but the Bytes values are refcounted at least so whatever 90 + let parsed_clone = parsed.clone(); 91 + 90 92 let storage = Arc::new(MemoryBlockStore::new_from_blocks(parsed.blocks)); 91 93 let new_mst = Mst::load(Arc::clone(&storage), mst_root_cid, None); 92 94 93 95 // ── Step 5: Inductive proof ─────────────────────────────────────────────── 94 - // Invert the ops on a clone of the new MST; the resulting root must equal 95 - // prevData. Uses a clone so the original `new_mst` is untouched for the 96 - // collection mortality check below. 97 - if !verify_inductive_proof(new_mst.clone(), &commit.ops, commit.prev_data.as_ref()).await? { 96 + if !verify_inductive_proof(new_mst, &commit.ops, commit.prev_data.as_ref()).await? 
{ 98 97 metrics::counter!("lightrail_commit_dropped_total", "reason" => "proof_failed") 99 98 .increment(1); 100 99 debug!(did = %did, "commit dropped: inductive proof failed"); ··· 104 103 metrics::histogram!("lightrail_commit_ops").record(commit.ops.len() as f64); 105 104 106 105 // ── Collection birth/death detection ───────────────────────────────────── 107 - let (born, died) = extract_collection_mortality(&commit.ops, new_mst).await?; 106 + let (born, died) = crate::mst::mortality::extract(&commit.ops, parsed_clone)?; 108 107 109 108 // ── Steps 2, 6–9: Blocking storage checks + repo_prev update ──────────── 110 109 let db = db.clone(); ··· 271 270 } 272 271 _ => None, 273 272 } 274 - } 275 - 276 - // --------------------------------------------------------------------------- 277 - // Collection birth/death detection (async, MST cursor) 278 - // --------------------------------------------------------------------------- 279 - 280 - /// Walk the partial CAR's MST to detect which collections are newly added 281 - /// ("born") or fully removed ("died") by this commit. 282 - /// 283 - /// ## How it works 284 - /// 285 - /// ATProto firehose commit CARs include MST proof nodes for each changed key. 286 - /// Those proof nodes usually include the immediately adjacent keys (left and 287 - /// right neighbours in sorted order) for each change. By walking the partial 288 - /// CAR we collect all visible leaf keys: the changed keys themselves plus 289 - /// (usually) their in-collection neighbours. 290 - /// 291 - /// A collection dies when every visible key in it is being deleted - all 292 - /// included adjacent keys are *not* in its collection. (immediately-adjacent 293 - /// keys are always included when a key is deleted... i think?????????) 294 - /// 295 - /// A collection is born when every visible key in it is being created (no 296 - /// neighbours share its collection name). 
*rarely,* a neighbour in the same 297 - /// collection might be present in the repo but not in the proof, and we end up 298 - /// re-birthing (ew) a collection / adding it to the index twice. this is 299 - /// harmless. we still get to avoid almost all redundant inserts. 300 - /// 301 - /// Multi-op commits are handled because we consider all ops together: a key is 302 - /// a "survivor" only if it is visible AND not in the deleted set. 303 - async fn extract_collection_mortality( 304 - ops: &[RepoOp<'_>], 305 - mst: Mst<MemoryBlockStore>, 306 - ) -> crate::error::Result<(Vec<Nsid<'static>>, Vec<Nsid<'static>>)> { 307 - use std::collections::HashSet; 308 - 309 - // ── Build create/delete path sets ──────────────────────────────────────── 310 - let mut created: HashSet<String> = HashSet::new(); 311 - let mut deleted: HashSet<String> = HashSet::new(); 312 - for op in ops { 313 - match op.action.as_ref() { 314 - "create" => { 315 - created.insert(op.path.to_string()); 316 - } 317 - "delete" => { 318 - deleted.insert(op.path.to_string()); 319 - } 320 - _ => {} // updates don't affect collection mortality 321 - } 322 - } 323 - 324 - if created.is_empty() && deleted.is_empty() { 325 - return Ok((vec![], vec![])); 326 - } 327 - 328 - // ── Walk the partial CAR's MST to collect visible leaf keys ────────────── 329 - // 330 - // Strategy: try to descend into each Tree node (which succeeds if the 331 - // block is in the CAR). On error (block absent), skip the subtree instead. 332 - // This gives us all leaves reachable through blocks that ARE in the CAR — 333 - // exactly the proof nodes for keys adjacent to the changes. 334 - let mut cursor = MstCursor::new(mst); 335 - let mut visible: Vec<String> = Vec::new(); 336 - 337 - while !cursor.is_end() { 338 - match cursor.current() { 339 - CursorPosition::Leaf { key, .. 
} => { 340 - visible.push(key.to_string()); 341 - cursor 342 - .advance() 343 - .await 344 - .map_err(|e| crate::error::Error::Other(e.to_string()))?; 345 - } 346 - CursorPosition::Tree { .. } => { 347 - // Try to descend; fall back to skip if the block is absent. 348 - match cursor.advance().await { 349 - Ok(()) => {} 350 - Err(_) => cursor 351 - .skip_subtree() 352 - .await 353 - .map_err(|e| crate::error::Error::Other(e.to_string()))?, 354 - } 355 - } 356 - CursorPosition::End => break, 357 - } 358 - } 359 - 360 - // ── Check collection death (all visible keys in C are being deleted) ────── 361 - let deleted_collections: HashSet<&str> = deleted 362 - .iter() 363 - .filter_map(|p| p.split_once('/').map(|(c, _)| c)) 364 - .collect(); 365 - 366 - let mut died: Vec<Nsid<'static>> = Vec::new(); 367 - for coll in deleted_collections { 368 - let prefix = format!("{coll}/"); 369 - let has_survivor = visible 370 - .iter() 371 - .any(|k| k.starts_with(&prefix) && !deleted.contains(k.as_str())); 372 - if !has_survivor && let Ok(nsid) = Nsid::new_owned(coll) { 373 - died.push(nsid); 374 - } 375 - } 376 - 377 - // ── Check collection birth (all visible keys in C are being created) ────── 378 - let created_collections: HashSet<&str> = created 379 - .iter() 380 - .filter_map(|p| p.split_once('/').map(|(c, _)| c)) 381 - .collect(); 382 - 383 - let mut born: Vec<Nsid<'static>> = Vec::new(); 384 - for coll in created_collections { 385 - let prefix = format!("{coll}/"); 386 - let has_preexisting = visible 387 - .iter() 388 - .any(|k| k.starts_with(&prefix) && !created.contains(k.as_str())); 389 - if !has_preexisting && let Ok(nsid) = Nsid::new_owned(coll) { 390 - born.push(nsid); 391 - } 392 - } 393 - 394 - Ok((born, died)) 395 273 } 396 274 397 275 // ---------------------------------------------------------------------------