lightweight com.atproto.sync.listReposByCollection
45
fork

Configure Feed

Select the types of activity you want to include in your feed.

tests and a RepoReport thing

phil 93bd60ee 523258e3

+253 -21
+1 -1
src/mst/mod.rs
··· 6 6 7 7 use std::collections::BTreeMap; 8 8 9 - struct Span<T: Ord> { 9 + pub struct Span<T: Ord> { 10 10 gap_before: bool, 11 11 things: BTreeMap<T, bool>, // gap after 12 12 }
+252 -20
src/mst/slice_tricks.rs
··· 1 1 //! glean more from a `sync.getRecord` slice than we have any right to 2 2 3 3 use super::Span; 4 - use jacquard_common::types::string::Nsid; 4 + use cid::Cid as RawCid; 5 + use jacquard_common::types::string::{Nsid, Tid}; 5 6 use repo_stream::{MemCar, WalkItem}; 6 7 use std::cmp::Ordering; 7 8 use std::collections::BTreeSet; ··· 10 11 pub enum MstSliceTricksError { 11 12 #[error("repo-stream WalkError: {0}")] 12 13 WalkError(#[from] repo_stream::WalkError), 13 - #[error("bad repo path: {0}")] 14 - BadPath(String), 14 + #[error("bad data in repo: {0}")] 15 + InvalidData(String), 15 16 } 16 17 17 18 type Result<T> = std::result::Result<T, MstSliceTricksError>; ··· 25 26 /// there is probably a nice way to implement PartialOrd, but... we're just 26 27 /// going to tack a `/` on the end and call it a day 27 28 #[derive(Debug, PartialEq, PartialOrd, Eq, Ord)] 28 - struct TerminatedNsid(String); 29 + pub struct TerminatedNsid(String); 29 30 30 31 impl<'a> From<&Nsid<'a>> for TerminatedNsid { 31 32 fn from(nsid: &Nsid<'a>) -> TerminatedNsid { ··· 55 56 /// 56 57 /// a CollectionSpan can be complete even if it contains gaps, as long as 57 58 /// those gaps are *within* collection-bounding keys 58 - fn complete(&self) -> Option<Vec<Nsid<'static>>> { 59 + fn complete(&self) -> Option<BTreeSet<Nsid<'static>>> { 59 60 self.is_complete() 60 61 .then(|| self.things.keys().map(Into::into).collect()) 61 62 } ··· 89 90 in_gap = *gap_after; 90 91 (next_key, gap_after) = match spans.next() { 91 92 Some(n) => n, 92 - None => return candidates.next().is_none(), 93 + // last span key matched: remaining candidates are covered 94 + // only if there's a gap after the matched key 95 + None => return *gap_after || candidates.next().is_none(), 93 96 }; 94 97 candidate = match candidates.next() { 95 98 Some(c) => c, ··· 124 127 prev_gap = true; 125 128 continue; 126 129 }; 127 - let collection: Nsid<'_> = key 128 - .parse::<Nsid<'_>>() 129 - .map_err(|e| MstSliceTricksError::BadPath(format!("nsid parse: {e}")))?; 130 + let (nsid_str, _) = key.split_once('/').ok_or_else(|| { 131 + MstSliceTricksError::InvalidData(format!("missing '/' in key: {key}")) 132 + })?; 133 + let collection: Nsid<'_> = nsid_str 134 + .parse() 135 + .map_err(|e| MstSliceTricksError::InvalidData(format!("nsid parse: {e}")))?; 130 136 131 137 if let Some(prev) = prev_collection { 132 138 // last-from-collection wins setting gap_after ··· 152 158 /// 153 159 /// pretty basic rn, just checks if its root node is layer 0 or 1 (hard max 200, 154 160 /// 800 entries, respectively) 155 - fn likely_from_small_repo(car: &mut MemCar) -> bool { 161 + fn likely_from_small_repo(car: &MemCar) -> bool { 156 162 car.height() <= 1 157 163 } 158 164 165 + /// Calculated information from a CAR slice 166 + pub struct RepoReport { 167 + pub rev: Tid, 168 + pub data: RawCid, 169 + pub collections: RepoCollections, 170 + } 171 + 172 + /// Repositories are categorized for possibly-different processing 173 + pub enum RepoCollections { 174 + /// the CAR slice provably covered every collection! 175 + Complete(BTreeSet<Nsid<'static>>), 176 + /// the repo is likely very small 177 + Tiny(CollectionSpan), 178 + /// non-tiny incomplete car, collections we know about 179 + Otherwise(CollectionSpan), 180 + } 181 + 182 + /// Assess a CAR slice 183 + pub fn report(car: &mut MemCar) -> Result<RepoReport> { 184 + let rev = Tid::new(&car.commit.rev) 185 + .map_err(|e| MstSliceTricksError::InvalidData(format!("bad rev in commit: {e}")))?; 186 + let data = car.commit.data.into(); 187 + let span = span_from_slice(car)?; 188 + let collections = if let Some(c) = span.complete() { 189 + RepoCollections::Complete(c) 190 + } else if likely_from_small_repo(car) { 191 + RepoCollections::Tiny(span) 192 + } else { 193 + RepoCollections::Otherwise(span) 194 + }; 195 + Ok(RepoReport { 196 + rev, 197 + data, 198 + collections, 199 + }) 200 + } 201 + 159 202 #[cfg(test)] 160 203 mod tests { 161 204 use super::*; 162 205 use jacquard_common::types::string::Nsid; 163 206 use std::collections::BTreeSet; 164 207 208 + // --- helpers shared across all tests --- 209 + 210 + fn nsid(s: &str) -> Nsid<'static> { 211 + Nsid::new_owned(s).unwrap() 212 + } 213 + 165 214 fn make_span(gap_before: bool, things: &[(&str, bool)]) -> CollectionSpan { 166 215 CollectionSpan { 167 216 gap_before, ··· 235 284 assert!(s.could_cover(&nsids(&["a.b.d"]))); 236 285 } 237 286 238 - // NOTE: bug — Equal on the last span key returns candidates.next().is_none(), 239 - // ignoring gap_after. So a trailing candidate after an exact match is rejected 240 - // even when gap_after=true. 241 287 #[test] 242 - fn gap_after_last_matched_key_does_not_cover_remaining_candidates() { 288 + fn gap_after_last_matched_key_covers_remaining_candidates() { 243 289 let s = make_span(false, &[("a.b.c", true)]); 244 - assert!(!s.could_cover(&nsids(&["a.b.c", "a.b.d"]))); 290 + assert!(s.could_cover(&nsids(&["a.b.c", "a.b.d"]))); 245 291 } 246 292 247 293 // --- gap between two keys --------------------------------------------- ··· 260 306 261 307 // --- multiple collections in query set -------------------------------- 262 308 263 - // NOTE: bug — Equal advances the span but not the candidate. After matching 264 - // "a.b.a/", the same candidate is compared to "a.b.b/" → Less with in_gap=false 265 - // → false. Consecutive exact matches always fail when there are more span keys. 266 309 #[test] 267 310 fn consecutive_exact_matches_return_false() { 268 311 let s = make_span( ··· 274 317 275 318 #[test] 276 319 fn subset_of_exact_matches_returns_false() { 277 - // same root cause: after matching "a.b.a/", candidate stays "a.b.a/" and 278 - // compares Less to "a.b.b/" with no gap → false, even though "a.b.c" would match 279 320 let s = make_span( 280 321 false, 281 322 &[("a.b.a", false), ("a.b.b", false), ("a.b.c", false)], ··· 312 353 fn sub_namespace_covered_by_gap_before() { 313 354 let s = make_span(true, &[("a.b.c", false)]); 314 355 assert!(s.could_cover(&nsids(&["a.b.c.d"]))); 356 + } 357 + 358 + // ========================================================================== 359 + // span_from_slice 360 + // ========================================================================== 361 + 362 + use std::sync::Arc; 363 + 364 + use bytes::Bytes; 365 + use cid::Cid as IpldCid; 366 + use jacquard_common::types::string::Did; 367 + use jacquard_common::types::tid::Tid; 368 + use jacquard_repo::car::reader::ParsedCar; 369 + use jacquard_repo::commit::Commit; 370 + use jacquard_repo::{BlockStore, MemoryBlockStore, Mst, car::write_car_bytes}; 371 + use repo_stream::DriverBuilder; 372 + use std::collections::BTreeMap; 373 + 374 + /// Build a MemCar from `collection/rkey` keys. 375 + /// 376 + /// Only MST node blocks are included (no record data blocks), so records 377 + /// appear as `MissingRecord` during the walk. `span_from_slice` handles 378 + /// those exactly like present records — both carry a key. 379 + async fn make_mem_car(keys: &[&str]) -> repo_stream::MemCar { 380 + let storage = Arc::new(MemoryBlockStore::new()); 381 + let mut mst = Mst::new(storage.clone()); 382 + let dummy_cid = storage.put(b"record").await.unwrap(); 383 + for key in keys { 384 + mst = mst.add(key, dummy_cid).await.unwrap(); 385 + } 386 + let (mst_root, mut blocks) = mst.collect_blocks().await.unwrap(); 387 + let commit = Commit { 388 + did: Did::new_owned("did:web:example.com").unwrap(), 389 + version: 3, 390 + data: mst_root, 391 + rev: Tid::now_0(), 392 + prev: None, 393 + sig: Bytes::from(vec![0u8; 64]), 394 + }; 395 + let commit_cid = commit.to_cid().unwrap(); 396 + blocks.insert(commit_cid, Bytes::from(commit.to_cbor().unwrap())); 397 + let car_bytes = write_car_bytes(commit_cid, blocks).await.unwrap(); 398 + DriverBuilder::new() 399 + .load_car(tokio::io::BufReader::new(std::io::Cursor::new(car_bytes))) 400 + .await 401 + .unwrap() 402 + } 403 + 404 + /// Build a sparse MemCar containing only the root MST node and commit blocks. 405 + /// 406 + /// All subtree node blocks are absent, so subtrees surface as 407 + /// `MissingSubtree` during the walk. Useful for testing gap detection. 408 + async fn make_root_only_car(keys: &[&str]) -> repo_stream::MemCar { 409 + let storage = Arc::new(MemoryBlockStore::new()); 410 + let mut mst = Mst::new(storage.clone()); 411 + let dummy_cid = storage.put(b"record").await.unwrap(); 412 + for key in keys { 413 + mst = mst.add(key, dummy_cid).await.unwrap(); 414 + } 415 + let (mst_root, all_blocks) = mst.collect_blocks().await.unwrap(); 416 + let commit = Commit { 417 + did: Did::new_owned("did:web:example.com").unwrap(), 418 + version: 3, 419 + data: mst_root, 420 + rev: Tid::now_0(), 421 + prev: None, 422 + sig: Bytes::from(vec![0u8; 64]), 423 + }; 424 + let commit_cid = commit.to_cid().unwrap(); 425 + let commit_cbor = Bytes::from(commit.to_cbor().unwrap()); 426 + let root_bytes = all_blocks 427 + .get(&mst_root) 428 + .expect("root MST node not in blocks") 429 + .clone(); 430 + let mut sparse: BTreeMap<IpldCid, Bytes> = BTreeMap::new(); 431 + sparse.insert(commit_cid, commit_cbor); 432 + sparse.insert(mst_root, root_bytes); 433 + DriverBuilder::new() 434 + .load_jacquard_parsed_car(ParsedCar { 435 + root: commit_cid, 436 + blocks: sparse, 437 + }) 438 + .unwrap() 439 + } 440 + 441 + // --- full-CAR (no-gap) tests --- 442 + 443 + #[tokio::test] 444 + async fn span_empty_repo() { 445 + let mut car = make_mem_car(&[]).await; 446 + let span = span_from_slice(&mut car).unwrap(); 447 + assert!(span.is_complete()); 448 + assert_eq!(span.complete(), Some(BTreeSet::from_iter([]))); 449 + } 450 + 451 + #[tokio::test] 452 + async fn span_single_collection() { 453 + let mut car = make_mem_car(&["app.bsky.feed.post/abc123"]).await; 454 + let span = span_from_slice(&mut car).unwrap(); 455 + assert!(span.is_complete()); 456 + assert_eq!( 457 + span.complete(), 458 + Some(BTreeSet::from_iter([nsid("app.bsky.feed.post")])) 459 + ); 460 + } 461 + 462 + #[tokio::test] 463 + async fn span_multiple_records_same_collection_one_entry() { 464 + let mut car = make_mem_car(&[ 465 + "app.bsky.feed.post/aaa111", 466 + "app.bsky.feed.post/bbb222", 467 + "app.bsky.feed.post/ccc333", 468 + ]) 469 + .await; 470 + let span = span_from_slice(&mut car).unwrap(); 471 + assert!(span.is_complete()); 472 + assert_eq!( 473 + span.complete(), 474 + Some(BTreeSet::from_iter([nsid("app.bsky.feed.post")])) 475 + ); 476 + } 477 + 478 + #[tokio::test] 479 + async fn span_multiple_collections_complete_in_mst_order() { 480 + let mut car = make_mem_car(&[ 481 + "app.bsky.actor.profile/self", 482 + "app.bsky.feed.post/abc123", 483 + "app.bsky.graph.follow/def456", 484 + ]) 485 + .await; 486 + let span = span_from_slice(&mut car).unwrap(); 487 + assert!(span.is_complete()); 488 + assert_eq!( 489 + span.complete(), 490 + Some(BTreeSet::from_iter([ 491 + nsid("app.bsky.actor.profile"), 492 + nsid("app.bsky.feed.post"), 493 + nsid("app.bsky.graph.follow"), 494 + ])) 495 + ); 496 + // nothing outside a complete span is covered 497 + assert!(!span.could_cover(&BTreeSet::from([nsid("app.bsky.feed.like")]))); 498 + } 499 + 500 + #[tokio::test] 501 + async fn span_sub_namespace_sorts_before_parent_in_mst_order() { 502 + // '.' (0x2E) < '/' (0x2F), so TerminatedNsid ordering gives: 503 + // "sh.tangled.repo.issue/" < "sh.tangled.repo/" 504 + let mut car = make_mem_car(&["sh.tangled.repo/self", "sh.tangled.repo.issue/abc123"]).await; 505 + let span = span_from_slice(&mut car).unwrap(); 506 + assert!(span.is_complete()); 507 + assert_eq!( 508 + span.complete(), 509 + Some(BTreeSet::from_iter([ 510 + nsid("sh.tangled.repo.issue"), // sub-namespace first 511 + nsid("sh.tangled.repo"), 512 + ])) 513 + ); 514 + } 515 + 516 + // --- sparse-CAR (gap) tests --- 517 + 518 + // `app.bsky.feed.post/454397e440ec` is a known layer-4 key per the 519 + // atproto interop fixtures (see repo-stream/tests/mst-depth.rs). 520 + // Adding layer-0 keys on both sides makes the root node (height 4) have 521 + // left and right child pointers. Keeping only the root block means those 522 + // children surface as `MissingSubtree`, producing gaps on both sides of 523 + // the one visible collection. 524 + #[tokio::test] 525 + async fn span_root_only_is_incomplete_and_covers_flanking_collections() { 526 + let mut car = make_root_only_car(&[ 527 + "app.bsky.actor.profile/self", // layer 0, sorts before feed.post 528 + "app.bsky.feed.post/454397e440ec", // layer 4, appears in root node 529 + "app.bsky.graph.follow/self", // layer 0, sorts after feed.post 530 + ]) 531 + .await; 532 + let span = span_from_slice(&mut car).unwrap(); 533 + 534 + // feed.post is visible; subtrees produce gaps on both sides 535 + assert!(!span.is_complete()); 536 + assert_eq!(span.is_empty(), Some(false)); 537 + 538 + // the known collection must be coverable 539 + assert!(span.could_cover(&BTreeSet::from([nsid("app.bsky.feed.post")]))); 540 + 541 + // collections in the gaps are also coverable 542 + assert!(span.could_cover(&BTreeSet::from([ 543 + nsid("app.bsky.actor.profile"), 544 + nsid("app.bsky.feed.post"), 545 + nsid("app.bsky.graph.follow"), 546 + ]))); 315 547 } 316 548 }