lightweight com.atproto.sync.listReposByCollection
45
fork

Configure Feed

Select the types of activity you want to include in your feed.

wip

phil 285adc5c ba69cef4

+245 -18
+2
Cargo.lock
··· 2184 2184 version = "0.1.0" 2185 2185 dependencies = [ 2186 2186 "axum", 2187 + "bytes", 2187 2188 "clap", 2188 2189 "fjall", 2189 2190 "iroh-car", ··· 2193 2194 "jacquard-repo", 2194 2195 "metrics", 2195 2196 "metrics-exporter-prometheus", 2197 + "reqwest", 2196 2198 "serde", 2197 2199 "thiserror 2.0.18", 2198 2200 "tokio",
+3 -1
Cargo.toml
··· 10 10 iroh-car = "0.5.1" 11 11 jacquard-api = { version = "0.9.5", default-features = false, features = ["com_atproto"] } 12 12 jacquard-axum = { version = "0.9.6", default-features = false, features = ["tracing"] } 13 - jacquard-common = { version = "0.9.5", features = ["websocket"] } 13 + jacquard-common = { version = "0.9.5", features = ["websocket", "reqwest-client"] } 14 14 jacquard-repo = "0.9.6" 15 15 metrics = "0.24.3" 16 16 metrics-exporter-prometheus = { version = "0.18.1", features = ["http-listener"] } 17 + bytes = "1" 18 + reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } 17 19 serde = { version = "1", features = ["derive"] } 18 20 thiserror = "2.0.18" 19 21 tokio = { version = "1.49.0", features = ["full"] }
+146 -10
src/backfill/probe.rs
··· 1 1 //! `getRecord`-probing for large-repo backfill. 2 2 //! 3 3 //! MST keys have the form `<collection>/<rkey>`, where `collection` is an NSID 4 - //! and `rkey` is a Record Key, both subject to format restrictions and a total 5 - //! byte-length cap defined in the AT Protocol specs. 4 + //! and `rkey` is a Record Key, both subject to format restrictions defined in 5 + //! the AT Protocol specs. 6 6 //! 7 - //! `getRecord` always includes the keys adjacent to the queried key in its CAR 8 - //! slice response, even when the record does not exist. The probing algorithm 9 - //! exploits this to enumerate every collection with one request per collection: 7 + //! `getRecord` usually includes the keys adjacent to the queried key in its CAR 8 + //! slice response, and always does when the requested key does not exist. The 9 + //! probing algorithm exploits this to enumerate every collection with one 10 + //! request per collection: 10 11 //! 11 12 //! 1. Query `getRecord` with the **minimum legal MST key** — the 12 13 //! lexicographically lowest string that is a valid `<collection>/<rkey>`. ··· 16 17 //! 2. For that collection, compute the **maximum legal rkey** and query 17 18 //! `getRecord` with `<collection>/<max_rkey>`. The right-adjacent key in 18 19 //! the response is the first key of the *next* collection in the repo. 19 - //! 3. Repeat step 2 for each newly discovered collection until no right-adjacent 20 - //! key is returned, signalling that all collections have been found. 20 + //! 3. Repeat step 2 for each newly discovered collection until no right- 21 + //! adjacent key is returned, signalling that all collections have been 22 + //! found. 23 + //! 24 + //! It is *possible* for the maximum legal rkey of a collection to be present in 25 + //! a repo, and for its immediate right-adjacent key *not* to be present in the 26 + //! CAR slice response. In such cases, we can compute the very next legal 27 + //! collection and request it with the minimum legal rkey. 28 + //! 29 + //! Repositories can update while being probed. This is detectable because every 30 + //! probe response includes at least one parent block on any changed key's path. 31 + //! Need to get in the weeds but I think if we maintain a sparse tree from the 32 + //! probes, we might even be able to know whether an update added or removed any 33 + //! collections within the area we've already covered? 34 + //! 35 + //! TODO also: there is a `getBlocks` endpoint, which might be an alternative 36 + //! to probing: we could do one probe to get the root, then walk down the tree 37 + //! (on parallel paths even) to build out the sparse collection-boundary 38 + //! skeleton tree. Is this more efficient than probing with min/max rkeys? 39 + //! 40 + //! in either case, if handling repo updates leads to too many re-fetches, we 41 + //! should fall back to `getRepo` and full mst walking. 21 42 //! 22 43 //! Each discovered `(did, collection)` pair is written to the rbc/cbr index 23 44 //! via `db::index::insert`. 24 45 46 + use bytes::Bytes; 47 + use jacquard_api::com_atproto::sync::get_record::{GetRecord, GetRecordError}; 48 + use jacquard_common::{ 49 + error::ClientErrorKind, 50 + types::string::{Did, Nsid, RecordKey, Rkey}, 51 + xrpc::{XrpcError, XrpcExt}, 52 + }; 53 + 25 54 use crate::db::DbRef; 26 - use crate::error::Result; 55 + use crate::error::{Error, Result}; 56 + 57 + /// minimum legal NSID 58 + /// 59 + /// - whole domain authority must be lowercase 60 + /// - top level domain must start with an alphabetic character 61 + /// - other domain segments cannot begin or end with hyphens 62 + /// - max 253 chars of domain authority before the last name segment 63 + /// - name segment accepts uppercase and must begin with an alphabetic 64 + const MIN_COLLECTION: &str = "a-------------------------------------------------------------0.0-------------------------------------------------------------0.0-------------------------------------------------------------0.0-----------------------------------------------------------0.A"; 65 + 66 + /// minimum legal rkey: `-` = ordinal 45 (.:_~ are 46, 58, 95, 127 respectively) 67 + const MIN_RKEY: &str = "-"; 68 + 69 + /// maximum legal rkey: 512 of the max legal character 70 + const MAX_RKEY: &str = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"; 71 + 72 + /// Extract the collection segment from an MST key of the form 73 + /// `<collection>/<rkey>`. 74 + fn collection_from_key(key: &str) -> Option<&str> { 75 + key.split_once('/').map(|(col, _)| col) 76 + } 27 77 28 78 /// Probe `did` to enumerate its collections via sequential `getRecord` requests. 29 79 /// ··· 31 81 /// collection at a time until the end of the repo is reached. One XRPC request 32 82 /// is issued per collection present in the repo. 33 83 pub async fn probe_repo(host: &str, did: &str, db: DbRef) -> Result<()> { 34 - let _ = (host, did, db); 35 - todo!("sequential getRecord probing: walk right-adjacent keys to enumerate collections") 84 + // MAX_RKEY is 512 '~' characters — the lexicographically largest valid rkey. 85 + let max_rkey: String = "~".repeat(512); 86 + 87 + let client = reqwest::Client::new(); 88 + let base: jacquard_common::url::Url = format!("https://{}", host) 89 + .parse() 90 + .map_err(|e: jacquard_common::url::ParseError| Error::Other(e.to_string()))?; 91 + 92 + // Step 1: probe the minimum legal MST key to discover the first collection. 93 + let probe_key = format!("{}/{}", MIN_COLLECTION, MIN_RKEY); 94 + let car = match fetch_car(&client, &base, did, MIN_COLLECTION, MIN_RKEY).await? { 95 + Some(bytes) => bytes, 96 + None => return Ok(()), // repo is inaccessible or does not exist 97 + }; 98 + let adjacent = crate::mst::adjacent::extract_adjacent(&car, &probe_key).await?; 99 + let mut current_collection = match adjacent.next.as_deref().and_then(collection_from_key) { 100 + Some(col) => col.to_owned(), 101 + None => return Ok(()), // repo has no records 102 + }; 103 + 104 + // Steps 2+: for each discovered collection, insert it and walk to the next. 105 + loop { 106 + crate::db::index::insert(&db, did, &current_collection)?; 107 + 108 + let probe_key = format!("{}/{}", current_collection, max_rkey); 109 + let car = match fetch_car(&client, &base, did, &current_collection, &max_rkey).await? { 110 + Some(bytes) => bytes, 111 + None => break, // repo became inaccessible mid-probe 112 + }; 113 + let adjacent = crate::mst::adjacent::extract_adjacent(&car, &probe_key).await?; 114 + let next_collection = match adjacent.next.as_deref().and_then(collection_from_key) { 115 + Some(col) => col.to_owned(), 116 + None => break, // no more collections 117 + }; 118 + 119 + if next_collection == current_collection { 120 + // Safety guard: the adjacent key should always be in the next 121 + // collection, but avoid an infinite loop if it is not. 122 + break; 123 + } 124 + current_collection = next_collection; 125 + } 126 + 127 + Ok(()) 128 + } 129 + 130 + /// Make one `com.atproto.sync.getRecord` request and return the raw CAR bytes. 131 + /// 132 + /// Returns `None` if the repository is inaccessible (taken down, suspended, 133 + /// deactivated, or not found) or if an unexpected HTTP error occurs. 134 + async fn fetch_car( 135 + client: &reqwest::Client, 136 + base: &jacquard_common::url::Url, 137 + did: &str, 138 + collection: &str, 139 + rkey: &str, 140 + ) -> Result<Option<Bytes>> { 141 + let req = GetRecord { 142 + collection: Nsid::new_owned(collection).map_err(|e| Error::Other(e.to_string()))?, 143 + did: Did::new_owned(did).map_err(|e| Error::Other(e.to_string()))?, 144 + rkey: RecordKey(Rkey::new_owned(rkey).map_err(|e| Error::Other(e.to_string()))?), 145 + }; 146 + 147 + let resp = match client.xrpc(base.clone()).send(&req).await { 148 + Ok(resp) => resp, 149 + Err(e) => { 150 + return match e.kind() { 151 + // Network or unexpected HTTP-level errors: skip this repo. 152 + ClientErrorKind::Transport | ClientErrorKind::Http { .. } => Ok(None), 153 + _ => Err(Error::Other(e.to_string())), 154 + }; 155 + } 156 + }; 157 + 158 + // resp is HTTP 200 or 400 at this point (401 with WWW-Authenticate is 159 + // already surfaced as Err by send()). 160 + match resp.parse() { 161 + Ok(output) => Ok(Some(output.body)), 162 + Err(XrpcError::Xrpc(err)) => match err { 163 + GetRecordError::RepoNotFound(_) 164 + | GetRecordError::RepoTakendown(_) 165 + | GetRecordError::RepoSuspended(_) 166 + | GetRecordError::RepoDeactivated(_) 167 + | GetRecordError::RecordNotFound(_) 168 + | GetRecordError::Unknown(_) => Ok(None), 169 + }, 170 + Err(e) => Err(Error::Other(e.to_string())), 171 + } 36 172 }
+94 -7
src/mst/adjacent.rs
··· 1 1 //! Extract adjacent MST keys from a CAR slice. 2 2 //! 3 - //! Given the raw blocks included in a `getRecord` or firehose commit CAR slice, 4 - //! this module uses `jacquard-repo` primitives to find neighbouring record keys 5 - //! and detect collection boundaries. 3 + //! Given the raw blocks included in a `getRecord` CAR slice, this module 4 + //! uses `jacquard-repo` primitives to find neighbouring record keys and detect 5 + //! collection boundaries. 6 + //! 7 + //! `getRecord` CAR slices are *partial* snapshots of the MST: they include only 8 + //! the blocks on the path from the root to the queried key. Sibling subtrees 9 + //! are referenced by CID but their blocks are absent. The cursor walks only 10 + //! the blocks that are present, skipping any subtree whose block is missing. 11 + 12 + use std::sync::Arc; 13 + 14 + use jacquard_repo::{ 15 + MemoryBlockStore, Mst, 16 + car::parse_car_bytes, 17 + commit::Commit, 18 + mst::{CursorPosition, MstCursor}, 19 + }; 20 + 21 + use crate::error::{Error, Result}; 6 22 7 23 /// The adjacent keys returned by the MST for a given probe key. 8 24 #[derive(Debug, Clone)] ··· 15 31 16 32 /// Extract adjacent keys for `probe_key` from the given CAR block bytes. 17 33 /// 18 - /// Loads the blocks into an in-memory block store and uses `MstCursor` from 19 - /// `jacquard-repo` to walk the MST and find neighbours. 20 - pub fn extract_adjacent(_car_bytes: &[u8], _probe_key: &str) -> crate::error::Result<AdjacentKeys> { 21 - todo!("load CAR blocks into MemoryBlockStore, use MstCursor to find adjacent keys") 34 + /// Parses the MST from the CAR, then walks the tree in sorted order using 35 + /// `MstCursor`, collecting the largest visible key less than `probe_key` 36 + /// (prev) and the smallest visible key greater than `probe_key` (next). 37 + /// 38 + /// Subtrees whose blocks are absent from the CAR are silently skipped; this 39 + /// is expected for `getRecord` slices, which contain only the proof path. 40 + pub async fn extract_adjacent(car_bytes: &[u8], probe_key: &str) -> Result<AdjacentKeys> { 41 + // Parse the CAR bytes into a root CID + block map. 42 + let parsed = parse_car_bytes(car_bytes) 43 + .await 44 + .map_err(|e| Error::Other(e.to_string()))?; 45 + 46 + // The CAR root is the signed commit; its `data` field is the MST root CID. 47 + let mst_root = { 48 + let commit_bytes = parsed 49 + .blocks 50 + .get(&parsed.root) 51 + .ok_or_else(|| Error::Other("getRecord CAR has no commit block".into()))?; 52 + let commit = Commit::from_cbor(commit_bytes.as_ref()) 53 + .map_err(|e| Error::Other(format!("bad commit in getRecord CAR: {}", e)))?; 54 + *commit.data() 55 + }; 56 + 57 + // Load all CAR blocks into an in-memory store and mount the MST lazily. 58 + let storage = Arc::new(MemoryBlockStore::new_from_blocks(parsed.blocks)); 59 + let mst = Mst::load(storage, mst_root, None); 60 + 61 + // Walk the tree with a cursor in sorted key order. 62 + // The cursor starts pointing at the MST root (a Tree position); we process 63 + // each position as we encounter it. 64 + let mut cursor = MstCursor::new(mst); 65 + let mut prev: Option<String> = None; 66 + let mut next: Option<String> = None; 67 + 68 + loop { 69 + match cursor.current() { 70 + CursorPosition::End => break, 71 + 72 + CursorPosition::Leaf { key, .. } => { 73 + let k = key.as_str(); 74 + if k < probe_key { 75 + // This is a candidate for prev; keep the latest one seen. 76 + prev = Some(k.to_owned()); 77 + // Advance past this leaf (step_over; never fails). 78 + if cursor.advance().await.is_err() { 79 + break; 80 + } 81 + } else if k > probe_key { 82 + // First key greater than probe_key in sorted order. 83 + next = Some(k.to_owned()); 84 + break; 85 + } else { 86 + // k == probe_key: the record actually exists; skip it. 87 + if cursor.advance().await.is_err() { 88 + break; 89 + } 90 + } 91 + } 92 + 93 + CursorPosition::Tree { .. } => { 94 + // Descend into the subtree. If the block is absent from the 95 + // CAR (the cursor returns Err), skip the subtree instead. 96 + match cursor.advance().await { 97 + Ok(()) => {} 98 + Err(_) => { 99 + if cursor.skip_subtree().await.is_err() { 100 + break; 101 + } 102 + } 103 + } 104 + } 105 + } 106 + } 107 + 108 + Ok(AdjacentKeys { prev, next }) 22 109 } 23 110 24 111 /// Estimate whether this is a small repo by inspecting the MST level of the