Fast and robust atproto CAR file processing in rust
14
fork

Configure Feed

Select the types of activity you want to include in your feed.

whew

phil a28cc59c 356cbc3a

+777 -282
+6 -1
examples/read-file/main.rs
··· 32 32 log::info!("got commit: {:?}", mem_car.commit); 33 33 34 34 while let Step::Value(records) = mem_car.next_chunk(256)? { 35 - for Output { key: _, cid: _, data: _ } in records { 35 + for Output { 36 + key: _, 37 + cid: _, 38 + data: _, 39 + } in records 40 + { 36 41 // process records 37 42 } 38 43 }
-85
src/block.rs
··· 1 - use crate::{Bytes, mst::MstNode}; 2 - 3 - #[derive(Debug, Clone)] 4 - pub enum MaybeProcessedBlock { 5 - /// A block that's *probably* a Node (but we can't know yet) 6 - /// 7 - /// It *can be* a record that suspiciously looks a lot like a node, so we 8 - /// cannot eagerly turn it into a Node. We only know for sure what it is 9 - /// when we actually walk down the MST 10 - Raw(Bytes), 11 - /// A processed record from a block that was definitely not a Node 12 - /// 13 - /// Processing has to be fallible because the CAR can have totally-unused 14 - /// blocks, which can just be garbage. since we're eagerly trying to process 15 - /// record blocks without knowing for sure that they *are* records, we 16 - /// discard any definitely-not-nodes that fail processing and keep their 17 - /// error in the buffer for them. if we later try to retreive them as a 18 - /// record, then we can surface the error. 19 - /// 20 - /// If we _never_ needed this block, then we may have wasted a bit of effort 21 - /// trying to process it. Oh well. 22 - /// 23 - /// There's an alternative here, which would be to kick unprocessable blocks 24 - /// back to Raw, or maybe even a new RawUnprocessable variant. Then we could 25 - /// surface the typed error later if needed by trying to reprocess. 26 - Processed(Bytes), 27 - } 28 - 29 - impl MaybeProcessedBlock { 30 - pub fn to_node(&self) -> Option<MstNode> { 31 - let Self::Raw(bytes) = self else { 32 - return None; 33 - }; 34 - serde_ipld_dagcbor::from_slice(bytes).ok() 35 - } 36 - pub fn unknown_depth(&self) -> bool { 37 - let Self::Raw(bytes) = self else { 38 - return false; 39 - }; 40 - let Ok(node) = serde_ipld_dagcbor::from_slice::<MstNode>(bytes) else { 41 - return false; 42 - }; 43 - node.depth.is_none() 44 - } 45 - pub(crate) fn maybe(process: fn(Bytes) -> Bytes, data: Bytes) -> Self { 46 - if MstNode::could_be(&data) { 47 - MaybeProcessedBlock::Raw(data) 48 - } else { 49 - MaybeProcessedBlock::Processed(process(data)) 50 - } 51 - } 52 - pub(crate) fn len(&self) -> usize { 53 - match self { 54 - MaybeProcessedBlock::Raw(b) => b.len(), 55 - MaybeProcessedBlock::Processed(b) => b.len(), 56 - } 57 - } 58 - pub(crate) fn into_bytes(self) -> Bytes { 59 - match self { 60 - MaybeProcessedBlock::Raw(mut b) => { 61 - b.push(0x00); 62 - b 63 - } 64 - MaybeProcessedBlock::Processed(mut b) => { 65 - b.push(0x01); 66 - b 67 - } 68 - } 69 - } 70 - pub(crate) fn from_bytes(mut b: Bytes) -> Self { 71 - // TODO: make sure bytes is not empty, that it's explicitly 0 or 1, etc 72 - let suffix = b.pop().unwrap(); 73 - if suffix == 0x00 { 74 - MaybeProcessedBlock::Raw(b) 75 - } else { 76 - MaybeProcessedBlock::Processed(b) 77 - } 78 - } 79 - } 80 - 81 - /// Processor that just returns the raw blocks 82 - #[inline] 83 - pub fn noop(block: Bytes) -> Bytes { 84 - block 85 - }
+254 -13
src/disk.rs
··· 1 1 /*! 2 - Disk storage for blocks on disk 3 - 4 - Currently this uses sqlite. In testing sqlite wasn't the fastest, but it seemed 5 - to be the best behaved in terms of both on-disk space usage and memory usage. 2 + Disk storage and disk-based MST walking. 6 3 7 4 ```no_run 8 5 # use repo_stream::{DiskBuilder, DiskError}; ··· 17 14 ``` 18 15 */ 19 16 20 - use crate::{Bytes, drive::DriveError}; 17 + use crate::{ 18 + Bytes, Step, 19 + mst::ThingKind, 20 + walk::{MaybeProcessedBlock, MstError, Output, WalkError, WalkItem, Walker}, 21 + }; 21 22 use fjall::{Database, Error as FjallError, Keyspace, KeyspaceCreateOptions}; 23 + use std::convert::Infallible; 22 24 use std::path::PathBuf; 25 + use thiserror::Error; 26 + use tokio::sync::mpsc; 27 + 28 + // --------------------------------------------------------------------------- 29 + // Disk storage errors 30 + // --------------------------------------------------------------------------- 23 31 24 32 #[derive(Debug, thiserror::Error)] 25 33 pub enum DiskError { 26 34 /// A wrapped database error 27 - /// 28 - /// (The wrapped err should probably be obscured to remove public-facing 29 - /// sqlite bits) 30 35 #[error(transparent)] 31 36 DbError(#[from] FjallError), 32 37 /// A tokio blocking task failed to join 33 38 #[error("Failed to join a tokio blocking task: {0}")] 34 39 JoinError(#[from] tokio::task::JoinError), 35 40 /// The total size of stored blocks exceeded the allowed size 36 - /// 37 - /// If you need to process *really* big CARs, you can configure a higher 38 - /// limit. 39 41 #[error("Maximum disk size reached")] 40 42 MaxSizeExceeded, 41 43 } 42 44 45 + // --------------------------------------------------------------------------- 46 + // Disk driver errors 47 + // --------------------------------------------------------------------------- 48 + 49 + /// Errors that can happen while consuming blocks via the disk path 50 + #[derive(Debug, Error)] 51 + pub enum DriveError { 52 + #[error("Error from iroh_car: {0}")] 53 + CarReader(#[from] iroh_car::Error), 54 + #[error("Failed to decode commit block: {0}")] 55 + BadBlock(#[from] serde_ipld_dagcbor::DecodeError<Infallible>), 56 + #[error("The Commit block referenced by the root was not found")] 57 + MissingCommit, 58 + #[error("Failed to walk the MST: {0}")] 59 + WalkError(#[from] WalkError), 60 + #[error("CAR file had no roots")] 61 + MissingRoot, 62 + #[error("Storage error: {0}")] 63 + StorageError(#[from] DiskError), 64 + #[error("Unexpected missing block: {0:?}")] 65 + MissingBlock(cid::Cid), 66 + #[error("Tried to send on a closed channel")] 67 + ChannelSendError, 68 + #[error("Failed to join a task: {0}")] 69 + JoinError(#[from] tokio::task::JoinError), 70 + } 71 + 72 + impl From<MstError> for DriveError { 73 + fn from(me: MstError) -> DriveError { 74 + DriveError::WalkError(WalkError::MstError(me)) 75 + } 76 + } 77 + 78 + // --------------------------------------------------------------------------- 79 + // Disk store 80 + // --------------------------------------------------------------------------- 81 + 43 82 /// Builder-style disk store setup 44 83 #[derive(Debug, Clone)] 45 84 pub struct DiskBuilder { ··· 134 173 pub(crate) fn put_many( 135 174 &mut self, 136 175 kv: impl Iterator<Item = (Vec<u8>, Bytes)>, 137 - ) -> Result<(), DriveError> { 176 + ) -> Result<(), DiskError> { 138 177 let mut batch = self.db.batch(); 139 178 for (k, v) in kv { 140 179 self.stored += v.len(); 141 180 if self.stored > self.max_stored { 142 - return Err(DiskError::MaxSizeExceeded.into()); 181 + return Err(DiskError::MaxSizeExceeded); 143 182 } 144 183 batch.insert(&self.keyspace, k, v); 145 184 } ··· 158 197 Ok(tokio::task::spawn_blocking(move || keyspace.clear()).await??) 159 198 } 160 199 } 200 + 201 + // --------------------------------------------------------------------------- 202 + // disk_step on Walker (impl in this module to avoid walk.rs → disk.rs dep) 203 + // --------------------------------------------------------------------------- 204 + 205 + impl Walker { 206 + /// blocking!!!!! 207 + pub(crate) fn disk_step( 208 + &mut self, 209 + blocks: &DiskStore, 210 + process: impl Fn(Bytes) -> Bytes, 211 + ) -> Result<Option<WalkItem>, WalkError> { 212 + while let Some(thing) = self.next_todo() { 213 + let Some(block_slice) = blocks.get(&thing.link.to_bytes())? else { 214 + return Ok(Some(match thing.kind { 215 + ThingKind::Record(key) => WalkItem::MissingRecord { 216 + key, 217 + cid: thing.link.into(), 218 + }, 219 + ThingKind::ChildNode => WalkItem::MissingSubtree { 220 + cid: thing.link.into(), 221 + }, 222 + })); 223 + }; 224 + let mpb = MaybeProcessedBlock::from_bytes(block_slice.to_vec()); 225 + if let Some(out) = self.mpb_step(thing, &mpb, &process)? { 226 + return Ok(Some(WalkItem::Record(out))); 227 + } 228 + } 229 + Ok(None) 230 + } 231 + } 232 + 233 + // --------------------------------------------------------------------------- 234 + // Disk driver 235 + // --------------------------------------------------------------------------- 236 + 237 + struct BigState { 238 + store: DiskStore, 239 + walker: Walker, 240 + } 241 + 242 + /// MST walker that reads from disk instead of an in-memory hashmap 243 + pub struct DiskDriver { 244 + process: fn(Bytes) -> Bytes, 245 + state: Option<BigState>, 246 + } 247 + 248 + // for doctests only 249 + #[doc(hidden)] 250 + pub fn _get_fake_disk_driver() -> DiskDriver { 251 + DiskDriver { 252 + process: crate::walk::noop, 253 + state: None, 254 + } 255 + } 256 + 257 + impl DiskDriver { 258 + /// Walk the MST returning up to `n` key + record pairs 259 + /// 260 + /// ```no_run 261 + /// # use repo_stream::{disk::{DiskDriver, DriveError, _get_fake_disk_driver}, Step}; 262 + /// # #[tokio::main] 263 + /// # async fn main() -> Result<(), DriveError> { 264 + /// # let mut disk_driver = _get_fake_disk_driver(); 265 + /// while let Step::Value(outputs) = disk_driver.next_chunk(256).await? { 266 + /// for output in outputs { 267 + /// println!("{}: size={}", output.key, output.data.len()); 268 + /// } 269 + /// } 270 + /// # Ok(()) 271 + /// # } 272 + /// ``` 273 + pub async fn next_chunk(&mut self, n: usize) -> Result<Step<Vec<Output>>, DriveError> { 274 + let process = self.process; 275 + 276 + let mut state = self.state.take().expect("DiskDriver must have Some(state)"); 277 + 278 + let (state, res) = 279 + tokio::task::spawn_blocking(move || -> (BigState, Result<Vec<Output>, DriveError>) { 280 + let mut out = Vec::with_capacity(n); 281 + 282 + for _ in 0..n { 283 + match state.walker.disk_step(&state.store, process) { 284 + Err(e) => return (state, Err(e.into())), 285 + Ok(Some(WalkItem::Record(output))) => out.push(output), 286 + Ok(Some(WalkItem::MissingRecord { cid, .. })) 287 + | Ok(Some(WalkItem::MissingSubtree { cid })) => { 288 + return (state, Err(DriveError::MissingBlock(cid))); 289 + } 290 + Ok(None) => break, 291 + } 292 + } 293 + 294 + (state, Ok::<_, DriveError>(out)) 295 + }) 296 + .await?; 297 + 298 + self.state = Some(state); 299 + 300 + let out = res?; 301 + 302 + if out.is_empty() { 303 + Ok(Step::End(None)) 304 + } else { 305 + Ok(Step::Value(out)) 306 + } 307 + } 308 + 309 + fn read_tx_blocking( 310 + &mut self, 311 + n: usize, 312 + tx: mpsc::Sender<Result<Step<Vec<Output>>, DriveError>>, 313 + ) -> Result<(), mpsc::error::SendError<Result<Step<Vec<Output>>, DriveError>>> { 314 + let BigState { store, walker } = self.state.as_mut().expect("valid state"); 315 + 316 + loop { 317 + let mut out: Vec<Output> = Vec::with_capacity(n); 318 + 319 + for _ in 0..n { 320 + match walker.disk_step(store, self.process) { 321 + Err(e) => return tx.blocking_send(Err(e.into())), 322 + Ok(Some(WalkItem::Record(output))) => out.push(output), 323 + Ok(Some(WalkItem::MissingRecord { cid, .. })) 324 + | Ok(Some(WalkItem::MissingSubtree { cid })) => { 325 + return tx.blocking_send(Err(DriveError::MissingBlock(cid))); 326 + } 327 + Ok(None) => break, 328 + } 329 + } 330 + 331 + if out.is_empty() { 332 + break; 333 + } 334 + tx.blocking_send(Ok(Step::Value(out)))?; 335 + } 336 + 337 + Ok(()) 338 + } 339 + 340 + /// Spawn the disk reading task into a tokio blocking thread 341 + /// 342 + /// ```no_run 343 + /// # use repo_stream::{disk::{DiskDriver, DriveError, _get_fake_disk_driver}, Step}; 344 + /// # #[tokio::main] 345 + /// # async fn main() -> Result<(), DriveError> { 346 + /// # let mut disk_driver = _get_fake_disk_driver(); 347 + /// let (mut rx, join) = disk_driver.to_channel(512); 348 + /// while let Some(recvd) = rx.recv().await { 349 + /// let outputs = recvd?; 350 + /// let Step::Value(outputs) = outputs else { break; }; 351 + /// for output in outputs { 352 + /// println!("{}: size={}", output.key, output.data.len()); 353 + /// } 354 + /// 355 + /// } 356 + /// # Ok(()) 357 + /// # } 358 + /// ``` 359 + pub fn to_channel( 360 + mut self, 361 + n: usize, 362 + ) -> ( 363 + mpsc::Receiver<Result<Step<Vec<Output>>, DriveError>>, 364 + tokio::task::JoinHandle<Self>, 365 + ) { 366 + let (tx, rx) = mpsc::channel::<Result<Step<Vec<Output>>, DriveError>>(1); 367 + 368 + let chan_task = tokio::task::spawn_blocking(move || { 369 + if let Err(mpsc::error::SendError(_)) = self.read_tx_blocking(n, tx) { 370 + log::debug!("big car reader exited early due to dropped receiver channel"); 371 + } 372 + self 373 + }); 374 + 375 + (rx, chan_task) 376 + } 377 + 378 + /// Reset the disk storage so it can be reused. 379 + pub async fn reset_store(mut self) -> Result<DiskStore, DriveError> { 380 + let BigState { store, .. } = self.state.take().expect("valid state"); 381 + store.reset().await?; 382 + Ok(store) 383 + } 384 + } 385 + 386 + // --------------------------------------------------------------------------- 387 + // PartialCar::finish_loading lives in mem.rs but needs DiskDriver — it's 388 + // imported there from this module. 389 + // --------------------------------------------------------------------------- 390 + 391 + /// Build a `DiskDriver` from a walker and store. Used by `PartialCar::finish_loading`. 392 + pub(crate) fn make_disk_driver( 393 + store: DiskStore, 394 + walker: Walker, 395 + process: fn(Bytes) -> Bytes, 396 + ) -> DiskDriver { 397 + DiskDriver { 398 + process, 399 + state: Some(BigState { store, walker }), 400 + } 401 + }
+1 -3
src/drive.rs
··· 38 38 /// The partial state is returned so the caller can decide what to do 39 39 /// (e.g. resume with disk storage via `PartialCar::finish_loading`). 40 40 #[error("partially loaded car")] 41 - MemoryLimitReached(PartialCar<R>), 41 + MemoryLimitReached(Box<PartialCar<R>>), 42 42 } 43 43 44 44 ··· 166 166 MaybeProcessedBlock::Raw(bytes) => serde_ipld_dagcbor::from_slice(bytes)?, 167 167 }; 168 168 let mut walker = Walker::new(root_node); 169 - 170 - let prev_key = walker.step_to_edge(&mem_blocks)?; 171 169 172 170 Ok(MemCar { 173 171 commit,
+5 -9
src/lib.rs
··· 13 13 14 14 Some MST validations are applied: 15 15 - Keys must appear in order 16 - - Keys must be at the correct MST tree depth 16 + - Keys must be at the correct MST tree layer 17 17 18 18 `iroh_car` additionally applies a block size limit of `2MiB`. 19 19 ··· 74 74 75 75 */ 76 76 77 - pub mod block; 78 77 pub mod disk; 79 - pub mod drive; 80 - pub mod link; 78 + pub mod mem; 81 79 pub mod mst; 82 80 pub mod walk; 83 81 84 - pub use disk::{DiskBuilder, DiskError, DiskStore}; 85 - pub use block::noop; 86 - pub use drive::{DriveError, DriverBuilder, LoadError, MemCar, PartialCar}; 87 - pub use link::NodeThing; 82 + pub use disk::{DiskBuilder, DiskDriver, DiskError, DiskStore, DriveError}; 83 + pub use mem::{DriverBuilder, LoadError, MemCar, PartialCar}; 88 84 pub use mst::Commit; 89 - pub use walk::{Output, Step, WalkError, WalkItem}; 85 + pub use walk::{Output, Step, WalkError, WalkItem, noop}; 90 86 91 87 pub type Bytes = Vec<u8>; 92 88
-43
src/link.rs
··· 1 - use cid::Cid; 2 - 3 - #[derive(Debug, serde::Deserialize, Clone, PartialEq, Eq, Hash)] 4 - pub struct ObjectLink(Cid); 5 - 6 - impl ObjectLink { 7 - pub fn to_bytes(&self) -> Vec<u8> { 8 - self.0.to_bytes() 9 - } 10 - } 11 - 12 - impl From<Cid> for ObjectLink { 13 - fn from(cid: Cid) -> ObjectLink { 14 - ObjectLink(cid) 15 - } 16 - } 17 - 18 - impl From<ObjectLink> for Cid { 19 - fn from(link: ObjectLink) -> Cid { 20 - link.0 21 - } 22 - } 23 - 24 - #[derive(Debug, Clone, PartialEq)] 25 - pub struct NodeThing { 26 - pub link: ObjectLink, 27 - pub kind: ThingKind, 28 - } 29 - 30 - impl NodeThing { 31 - pub fn is_record(&self) -> bool { 32 - match self.kind { 33 - ThingKind::ChildNode => false, 34 - ThingKind::Record(_) => true, 35 - } 36 - } 37 - } 38 - 39 - #[derive(Debug, Clone, PartialEq)] 40 - pub enum ThingKind { 41 - ChildNode, 42 - Record(crate::RepoPath), 43 - }
+351
src/mem.rs
··· 1 + //! Load a CAR file into memory and walk its MST 2 + 3 + use crate::{ 4 + Bytes, HashMap, RepoPath, Step, 5 + disk::{DiskDriver, DiskError, DiskStore, DriveError, make_disk_driver}, 6 + mst::{Commit, MstNode, ObjectLink}, 7 + walk::{MaybeProcessedBlock, Output, WalkError, WalkItem, Walker}, 8 + }; 9 + use iroh_car::CarReader; 10 + use std::convert::Infallible; 11 + use thiserror::Error; 12 + use tokio::io::AsyncRead; 13 + 14 + /// Errors that can occur while loading a CAR into memory 15 + #[derive(Debug, Error)] 16 + pub enum LoadError<R: AsyncRead + Unpin> { 17 + #[error("failed reading CAR: {0}")] 18 + CarReader(#[from] iroh_car::Error), 19 + #[error("failed to decode cbor: {0}")] 20 + BadBlock(#[from] serde_ipld_dagcbor::DecodeError<Infallible>), 21 + #[error("missing commit")] 22 + MissingCommit, 23 + #[error("missing mst root node")] 24 + MissingRoot, 25 + #[error("failed to walk mst: {0}")] 26 + WalkError(#[from] WalkError), 27 + /// The memory limit was reached before all blocks were loaded. 28 + /// 29 + /// The partial state is returned so the caller can decide what to do 30 + /// (e.g. resume with disk storage via `PartialCar::finish_loading`). 31 + #[error("partially loaded car")] 32 + MemoryLimitReached(PartialCar<R>), 33 + } 34 + 35 + /// A partially memory-loaded CAR file that hit the memory limit mid-stream. 36 + /// 37 + /// Can be resumed with disk storage via `finish_loading`, or discarded. 38 + #[derive(Debug)] 39 + pub struct PartialCar<R: AsyncRead + Unpin> { 40 + pub(crate) car: CarReader<R>, 41 + pub(crate) root: cid::Cid, 42 + pub(crate) process: fn(Bytes) -> Bytes, 43 + pub(crate) max_size: usize, 44 + pub(crate) blocks: HashMap<ObjectLink, MaybeProcessedBlock>, 45 + /// The commit block, if it was seen before the memory limit was reached 46 + pub commit: Option<Commit>, 47 + } 48 + 49 + /// Builder-style driver setup 50 + #[derive(Debug, Clone)] 51 + pub struct DriverBuilder { 52 + pub mem_limit_mb: usize, 53 + pub block_processor: fn(Bytes) -> Bytes, 54 + } 55 + 56 + impl Default for DriverBuilder { 57 + fn default() -> Self { 58 + Self { 59 + mem_limit_mb: 10, 60 + block_processor: crate::walk::noop, 61 + } 62 + } 63 + } 64 + 65 + impl DriverBuilder { 66 + /// Begin configuring the driver with defaults 67 + pub fn new() -> Self { 68 + Default::default() 69 + } 70 + 71 + /// Set the in-memory size limit, in MiB 72 + /// 73 + /// Default: 10 MiB 74 + pub fn with_mem_limit_mb(mut self, new_limit: usize) -> Self { 75 + self.mem_limit_mb = new_limit; 76 + self 77 + } 78 + 79 + /// Set the block processor 80 + /// 81 + /// Default: noop, raw blocks will be emitted 82 + pub fn with_block_processor(mut self, new_processor: fn(Bytes) -> Bytes) -> Self { 83 + self.block_processor = new_processor; 84 + self 85 + } 86 + 87 + /// Load an atproto repository CAR into memory. 88 + /// 89 + /// Returns a `MemCar` ready for walking. If the blocks exceed the memory 90 + /// limit, returns `Err(LoadError::MemoryLimitReached(partial))` containing 91 + /// the partial state, which can be resumed with disk storage. 92 + pub async fn load_car<R: AsyncRead + Unpin>(&self, reader: R) -> Result<MemCar, LoadError<R>> { 93 + load_car(reader, self.block_processor, self.mem_limit_mb).await 94 + } 95 + } 96 + 97 + async fn load_car<R: AsyncRead + Unpin>( 98 + reader: R, 99 + process: fn(Bytes) -> Bytes, 100 + mem_limit_mb: usize, 101 + ) -> Result<MemCar, LoadError<R>> { 102 + let mut block_count = 0; 103 + 104 + let max_size = mem_limit_mb * 2_usize.pow(20); 105 + let mut mem_blocks = HashMap::new(); 106 + 107 + let mut car = CarReader::new(reader).await?; 108 + 109 + let roots = car.header().roots(); 110 + assert_eq!(roots.len(), 1); 111 + 112 + let root = *roots.first().ok_or(LoadError::MissingRoot)?; 113 + log::debug!("root: {root:?}"); 114 + 115 + let mut commit = None; 116 + 117 + let mut mem_size = 0; 118 + while let Some((cid, data)) = car.next_block().await? { 119 + block_count += 1; 120 + // The root commit block is handled separately — never passed to the processor 121 + if cid == root { 122 + let c: Commit = serde_ipld_dagcbor::from_slice(&data)?; 123 + commit = Some(c); 124 + continue; 125 + } 126 + 127 + let maybe_processed = MaybeProcessedBlock::maybe(process, data); 128 + 129 + mem_size += maybe_processed.len(); 130 + mem_blocks.insert(cid.into(), maybe_processed); 131 + if mem_size >= max_size { 132 + log::debug!("blocks loaded before memory limit: {block_count}"); 133 + return Err(LoadError::MemoryLimitReached(PartialCar { 134 + car, 135 + root, 136 + process, 137 + max_size, 138 + blocks: mem_blocks, 139 + commit, 140 + })); 141 + } 142 + } 143 + 144 + log::debug!("blocks: {block_count}"); 145 + 146 + let commit = commit.ok_or(LoadError::MissingCommit)?; 147 + 148 + let root_node: MstNode = match mem_blocks 149 + .get(&commit.data) 150 + .ok_or(LoadError::MissingCommit)? 151 + { 152 + MaybeProcessedBlock::Processed(_) => Err(WalkError::BadCommitFingerprint)?, 153 + MaybeProcessedBlock::Raw(bytes) => serde_ipld_dagcbor::from_slice(bytes)?, 154 + }; 155 + 156 + Ok(MemCar { 157 + commit, 158 + prev_key: None, 159 + blocks: mem_blocks, 160 + walker: Walker::new(root_node), 161 + process, 162 + trailing_key: None, 163 + }) 164 + } 165 + 166 + /// A fully loaded in-memory CAR file, ready for MST walking. 167 + #[derive(Debug)] 168 + pub struct MemCar { 169 + pub commit: Commit, 170 + /// For CAR slices: the key of the last record before this slice's leading edge. 171 + /// `None` if this slice (or full CAR) starts from the leftmost record in the tree. 172 + pub prev_key: Option<RepoPath>, 173 + pub blocks: HashMap<ObjectLink, MaybeProcessedBlock>, 174 + walker: Walker, 175 + process: fn(Bytes) -> Bytes, 176 + /// `None` = no gap encountered yet; `Some(k)` = trailing edge determined. 177 + trailing_key: Option<Option<RepoPath>>, 178 + } 179 + 180 + impl MemCar { 181 + /// Seek forward to the first record at or after `target`. 182 + /// 183 + /// Uses the MST structure to skip entire subtrees efficiently. 184 + /// After this returns, the next `next` or `next_chunk` call will start at or after `target`. 185 + pub fn seek(&mut self, target: &str) -> Result<(), WalkError> { 186 + self.walker.seek(target, &self.blocks) 187 + } 188 + 189 + /// Walk forward past any gaps to determine the trailing edge key. 190 + fn find_trailing_edge(&mut self) -> Result<Option<RepoPath>, WalkError> { 191 + let trailing = loop { 192 + match self.walker.step(&self.blocks, self.process)? { 193 + Some(WalkItem::Record(r)) => break Some(r.key), 194 + Some(WalkItem::MissingRecord { key, .. }) => break Some(key), 195 + Some(WalkItem::MissingSubtree { .. }) => continue, 196 + None => break None, 197 + } 198 + }; 199 + self.trailing_key = Some(trailing.clone()); 200 + Ok(trailing) 201 + } 202 + 203 + /// Get the next record. 204 + /// 205 + /// Returns `Step::Value(output)` for each record in key order, then 206 + /// `Step::End(None)` at the end of a full CAR, or `Step::End(Some(key))` 207 + /// for CAR slices where `key` is the first key immediately after the slice. 208 + /// 209 + /// TODO: make this an implementation of Iterator 210 + pub fn next(&mut self) -> Result<Step, WalkError> { 211 + if let Some(trailing) = &self.trailing_key { 212 + return Ok(Step::End(trailing.clone())); 213 + } 214 + match self.walker.step(&self.blocks, self.process)? { 215 + Some(WalkItem::Record(out)) => Ok(Step::Value(out)), 216 + Some(WalkItem::MissingRecord { key, .. }) => { 217 + self.trailing_key = Some(Some(key.clone())); 218 + Ok(Step::End(Some(key))) 219 + } 220 + Some(WalkItem::MissingSubtree { .. }) => { 221 + let trailing = self.find_trailing_edge()?; 222 + Ok(Step::End(trailing)) 223 + } 224 + None => { 225 + self.trailing_key = Some(None); 226 + Ok(Step::End(None)) 227 + } 228 + } 229 + } 230 + 231 + /// Iterate up to `n` records in key order. 232 + /// 233 + /// Returns `Step::Value(records)` while records remain, then `Step::End(next_key)` 234 + /// where `next_key` is the first key after the slice (for CAR slices), or `None`. 235 + pub fn next_chunk(&mut self, n: usize) -> Result<Step<Vec<Output>>, WalkError> { 236 + if let Some(trailing) = &self.trailing_key { 237 + return Ok(Step::End(trailing.clone())); 238 + } 239 + let mut out = Vec::with_capacity(n); 240 + for _ in 0..n { 241 + match self.walker.step(&self.blocks, self.process)? { 242 + Some(WalkItem::Record(record)) => out.push(record), 243 + Some(WalkItem::MissingRecord { key, .. }) => { 244 + self.trailing_key = Some(Some(key.clone())); 245 + return Ok(Step::Value(out)); // may be empty 246 + } 247 + Some(WalkItem::MissingSubtree { .. }) => { 248 + let trailing = self.find_trailing_edge()?; 249 + self.trailing_key = Some(trailing); 250 + return Ok(Step::Value(out)); // may be empty 251 + } 252 + None => break, 253 + } 254 + } 255 + if out.is_empty() { 256 + self.trailing_key = Some(None); 257 + Ok(Step::End(None)) 258 + } else { 259 + Ok(Step::Value(out)) 260 + } 261 + } 262 + } 263 + 264 + // --------------------------------------------------------------------------- 265 + // Resuming a partial load on disk 266 + // --------------------------------------------------------------------------- 267 + 268 + impl<R: AsyncRead + Unpin> PartialCar<R> { 269 + pub async fn finish_loading( 270 + mut self, 271 + mut store: DiskStore, 272 + ) -> Result<(Commit, Option<RepoPath>, DiskDriver), DriveError> { 273 + use tokio::sync::mpsc; 274 + 275 + store = tokio::task::spawn(async move { 276 + let kvs = self 277 + .blocks 278 + .into_iter() 279 + .map(|(k, v)| (k.to_bytes(), v.into_bytes())); 280 + 281 + store.put_many(kvs)?; 282 + Ok::<_, DriveError>(store) 283 + }) 284 + .await??; 285 + 286 + let (tx, mut rx) = mpsc::channel::<Vec<(ObjectLink, MaybeProcessedBlock)>>(1); 287 + 288 + let store_worker = tokio::task::spawn_blocking(move || { 289 + while let Some(chunk) = rx.blocking_recv() { 290 + let kvs = chunk 291 + .into_iter() 292 + .map(|(k, v)| (k.to_bytes(), v.into_bytes())); 293 + store.put_many(kvs)?; 294 + } 295 + Ok::<_, DriveError>(store) 296 + }); 297 + 298 + log::debug!("dumping the rest of the stream..."); 299 + loop { 300 + let mut mem_size = 0; 301 + let mut chunk = vec![]; 302 + loop { 303 + let Some((cid, data)) = self.car.next_block().await? else { 304 + break; 305 + }; 306 + if cid == self.root { 307 + let c: Commit = serde_ipld_dagcbor::from_slice(&data)?; 308 + self.commit = Some(c); 309 + continue; 310 + } 311 + 312 + let link = cid.into(); 313 + let data = Bytes::from(data); 314 + 315 + let maybe_processed = MaybeProcessedBlock::maybe(self.process, data); 316 + mem_size += maybe_processed.len(); 317 + chunk.push((link, maybe_processed)); 318 + if mem_size >= (self.max_size / 2) { 319 + break; 320 + } 321 + } 322 + if chunk.is_empty() { 323 + break; 324 + } 325 + tx.send(chunk) 326 + .await 327 + .map_err(|_| DriveError::ChannelSendError)?; 328 + } 329 + drop(tx); 330 + log::debug!("done. waiting for worker to finish..."); 331 + 332 + store = store_worker.await??; 333 + 334 + log::debug!("worker finished."); 335 + 336 + let commit = self.commit.ok_or(DriveError::MissingCommit)?; 337 + 338 + let db_bytes = store 339 + .get(&commit.data.to_bytes()) 340 + .map_err(|e| DriveError::StorageError(DiskError::DbError(e)))? 341 + .ok_or(DriveError::MissingCommit)?; 342 + 343 + let node: MstNode = match MaybeProcessedBlock::from_bytes(db_bytes.to_vec()) { 344 + MaybeProcessedBlock::Processed(_) => Err(WalkError::BadCommitFingerprint)?, 345 + MaybeProcessedBlock::Raw(bytes) => serde_ipld_dagcbor::from_slice(&bytes)?, 346 + }; 347 + let walker = Walker::new(node); 348 + 349 + Ok((commit, None, make_disk_driver(store, walker, self.process))) 350 + } 351 + }
+52 -14
src/mst.rs
··· 1 - //! Low-level types for parsing raw atproto MST CARs 1 + //! low-level types for parsing raw atproto MST CARs 2 2 //! 3 - //! The primary aim is to work through the **tree** structure. Non-node blocks 4 - //! are left as raw bytes, for upper levels to parse into DAG-CBOR or whatever. 3 + //! The primary aim is to work through the tree structure. Non-node blocks are 4 + //! left as raw bytes, for upper levels to parse into DAG-CBOR or whatever. 5 5 6 - use crate::link::{NodeThing, ObjectLink, ThingKind}; 7 6 use cid::Cid; 8 7 use serde::Deserialize; 9 8 use serde::de::{self, Deserializer, MapAccess, Unexpected, Visitor}; 10 9 use sha2::{Digest, Sha256}; 11 10 use std::fmt; 12 11 13 - pub type Depth = u32; 12 + #[derive(Debug, serde::Deserialize, Clone, PartialEq, Eq, Hash)] 13 + pub struct ObjectLink(Cid); 14 + 15 + impl ObjectLink { 16 + pub fn to_bytes(&self) -> Vec<u8> { 17 + self.0.to_bytes() 18 + } 19 + } 20 + 21 + impl From<Cid> for ObjectLink { 22 + fn from(cid: Cid) -> ObjectLink { 23 + ObjectLink(cid) 24 + } 25 + } 26 + 27 + impl From<ObjectLink> for Cid { 28 + fn from(link: ObjectLink) -> Cid { 29 + link.0 30 + } 31 + } 32 + 33 + #[derive(Debug, Clone, PartialEq)] 34 + pub struct NodeThing { 35 + pub link: ObjectLink, 36 + pub kind: ThingKind, 37 + } 38 + 39 + impl NodeThing { 40 + pub fn is_record(&self) -> bool { 41 + matches!(self.kind, ThingKind::Record(_)) 42 + } 43 + } 44 + 45 + #[derive(Debug, Clone, PartialEq)] 46 + pub enum ThingKind { 47 + ChildNode, 48 + Record(crate::RepoPath), 49 + } 50 + 51 + pub type Layer = u32; 14 52 15 53 /// The top-level data object in a repository's tree is a signed commit. 16 54 #[derive(Debug, Deserialize)] ··· 42 80 } 43 81 44 82 #[inline(always)] 45 - pub fn atproto_mst_depth(key: &str) -> Depth { 83 + pub fn atproto_mst_layer(key: &str) -> Layer { 46 84 // 128 bits oughta be enough: https://bsky.app/profile/retr0.id/post/3jwwbf4izps24 47 85 u128::from_be_bytes(Sha256::digest(key).split_at(16).0.try_into().unwrap()).leading_zeros() / 2 48 86 } 49 87 50 88 #[derive(Debug, Clone)] 51 89 pub struct MstNode { 52 - pub depth: Option<Depth>, // known for nodes with entries (required for root) 90 + pub layer: Option<Layer>, // known for nodes with entries (required for root) 53 91 pub things: Vec<NodeThing>, 54 92 } 55 93 ··· 74 112 let mut left = None; 75 113 let mut found_entries = false; 76 114 let mut things = Vec::new(); 77 - let mut depth = None; 115 + let mut layer = None; 78 116 79 117 while let Some(key) = map.next_key()? { 80 118 match key { ··· 118 156 ) 119 157 })?; 120 158 121 - let key_depth = atproto_mst_depth(&rkey_s); 122 - if depth.is_none() { 123 - depth = Some(key_depth); 124 - } else if Some(key_depth) != depth { 159 + let key_layer = atproto_mst_layer(&rkey_s); 160 + if layer.is_none() { 161 + layer = Some(key_layer); 162 + } else if Some(key_layer) != layer { 125 163 return Err(de::Error::invalid_value( 126 164 Unexpected::Bytes(&prefix), 127 - &"all rkeys to have equal MST depth", 165 + &"all rkeys to have equal MST layer", 128 166 )); 129 167 } 130 168 ··· 158 196 things.push(l); 159 197 } 160 198 161 - Ok(MstNode { depth, things }) 199 + Ok(MstNode { layer, things }) 162 200 } 163 201 } 164 202
+103 -109
src/walk.rs
··· 1 1 //! Depth-first MST traversal 2 2 3 - use crate::link::{NodeThing, ObjectLink, ThingKind}; 4 - use crate::mst::{Depth, MstNode}; 5 - use crate::{Bytes, HashMap, RepoPath, disk::DiskStore, block::MaybeProcessedBlock, noop}; 3 + use crate::mst::{Layer, MstNode, NodeThing, ObjectLink, ThingKind}; 4 + use crate::{Bytes, HashMap, RepoPath}; 6 5 use cid::Cid; 7 6 use std::convert::Infallible; 8 7 8 + // --------------------------------------------------------------------------- 9 + // Block representation (formerly block.rs) 10 + // --------------------------------------------------------------------------- 11 + 12 + /// A block that may or may not have been passed through the user's processor. 13 + /// 14 + /// `Raw` means we haven't processed it yet (it could still be an MST node). 15 + /// `Processed` means it's definitely a record and the processor has already run. 16 + #[derive(Debug, Clone)] 17 + pub enum MaybeProcessedBlock { 18 + Raw(Bytes), 19 + Processed(Bytes), 20 + } 21 + 22 + impl MaybeProcessedBlock { 23 + /// Apply `process` to `data` unless the block looks like an MST node. 24 + pub fn maybe(process: fn(Bytes) -> Bytes, data: Bytes) -> Self { 25 + if MstNode::could_be(&data) { 26 + MaybeProcessedBlock::Raw(data) 27 + } else { 28 + MaybeProcessedBlock::Processed(process(data)) 29 + } 30 + } 31 + 32 + pub fn from_bytes(data: Bytes) -> Self { 33 + if MstNode::could_be(&data) { 34 + MaybeProcessedBlock::Raw(data) 35 + } else { 36 + MaybeProcessedBlock::Processed(data) 37 + } 38 + } 39 + 40 + pub fn len(&self) -> usize { 41 + match self { 42 + MaybeProcessedBlock::Raw(b) | MaybeProcessedBlock::Processed(b) => b.len(), 43 + } 44 + } 45 + 46 + pub fn into_bytes(self) -> Bytes { 47 + match self { 48 + MaybeProcessedBlock::Raw(b) | MaybeProcessedBlock::Processed(b) => b, 49 + } 50 + } 51 + } 52 + 53 + /// Identity block processor — returns the block unchanged. 54 + pub fn noop(block: Bytes) -> Bytes { 55 + block 56 + } 57 + 58 + // --------------------------------------------------------------------------- 59 + // Walker errors 60 + // --------------------------------------------------------------------------- 61 + 9 62 /// Errors that can happen while walking 10 63 #[derive(Debug, thiserror::Error)] 11 64 pub enum WalkError { ··· 24 77 pub enum MstError { 25 78 #[error("Nodes cannot be empty (except for an entirely empty MST)")] 26 79 EmptyNode, 27 - #[error("Expected node to be at depth {expected}, but it was at {depth}")] 28 - WrongDepth { depth: Depth, expected: Depth }, 29 - #[error("MST depth underflow: depth-0 node with child trees")] 30 - DepthUnderflow, 80 + #[error("Expected node to be at layer {expected}, but it was at {layer}")] 81 + WrongLayer { layer: Layer, expected: Layer }, 82 + #[error("MST layer underflow: layer-0 node with child trees")] 83 + LayerUnderflow, 31 84 #[error("Encountered key {key:?} which cannot follow the previous: {prev:?}")] 32 85 KeyOutOfOrder { prev: RepoPath, key: RepoPath }, 33 86 } 34 87 88 + // --------------------------------------------------------------------------- 89 + // Walker output types 90 + // --------------------------------------------------------------------------- 91 + 35 92 /// An item yielded by `Walker::step`. 36 93 #[derive(Debug, PartialEq)] 37 94 pub enum WalkItem { ··· 43 100 MissingSubtree { cid: Cid }, 44 101 } 45 102 46 - /// Walker outputs 103 + /// A single record emitted by the walker. 47 104 #[derive(Debug, PartialEq)] 48 105 pub struct Output<T = Bytes> { 49 106 pub key: RepoPath, ··· 57 114 End(Option<RepoPath>), 58 115 } 59 116 60 - /// Traverser of an atproto MST 117 + /// Walker: traverser of an atproto MST 61 118 /// 62 - /// Walks the tree from left-to-right in depth-first order 119 + /// Walks the tree left-to-right in depth-first order (is also lexicographic order) 63 120 #[derive(Debug, Clone)] 64 121 pub struct Walker { 65 - links: usize, 66 - prev_key: RepoPath, 67 - root_depth: Depth, 68 - todo: Vec<Vec<NodeThing>>, 122 + pub(crate) prev_key: Option<RepoPath>, 123 + pub(crate) root_layer: Layer, 124 + pub(crate) todo: Vec<Vec<NodeThing>>, 69 125 } 70 126 71 127 impl Walker { 72 128 pub fn new(root_node: MstNode) -> Self { 73 129 Self { 74 - links: 0, 75 - prev_key: "".to_string(), 76 - root_depth: root_node.depth.unwrap_or(0), // empty root node = empty mst 130 + prev_key: None, 131 + root_layer: root_node.layer.unwrap_or(0), // empty root node = empty mst 77 132 todo: vec![root_node.things], 78 133 } 79 134 } 80 135 81 - fn mpb_step( 136 + pub(crate) fn mpb_step( 82 137 &mut self, 83 138 thing: NodeThing, 84 139 mpb: &MaybeProcessedBlock, ··· 91 146 MaybeProcessedBlock::Processed(t) => t.clone(), 92 147 }; 93 148 94 - if key <= self.prev_key { 149 + if Some(&key) <= self.prev_key.as_ref() { 95 150 return Err(WalkError::MstError(MstError::KeyOutOfOrder { 96 151 key, 97 - prev: self.prev_key.clone(), 152 + prev: self.prev_key.clone().unwrap_or("[no prev key]".to_string()), 98 153 })); 99 154 } 100 - self.prev_key = key.clone(); 155 + self.prev_key = Some(key.clone()); 101 156 102 157 log::trace!("val @ {key}"); 103 158 Ok(Some(Output { ··· 118 173 return Err(WalkError::MstError(MstError::EmptyNode)); 119 174 } 120 175 121 - let current_depth = self.root_depth - (self.todo.len() - 1) as u32; 122 - let next_depth = current_depth 176 + let current_layer = self.root_layer - (self.todo.len() - 1) as u32; 177 + let next_layer = current_layer 123 178 .checked_sub(1) 124 - .ok_or(MstError::DepthUnderflow)?; 125 - if let Some(d) = node.depth 126 - && d != next_depth 179 + .ok_or(MstError::LayerUnderflow)?; 180 + if let Some(d) = node.layer 181 + && d != next_layer 127 182 { 128 - return Err(WalkError::MstError(MstError::WrongDepth { 129 - depth: d, 130 - expected: next_depth, 183 + return Err(WalkError::MstError(MstError::WrongLayer { 184 + layer: d, 185 + expected: next_layer, 131 186 })); 132 187 } 133 188 134 - let n = node.things.len(); 135 - log::trace!("node into depth {next_depth} with {n} links"); 136 189 self.todo.push(node.things); 137 - self.links += n; 138 190 Ok(None) 139 191 } 140 192 } 141 193 } 142 194 143 195 #[inline(always)] 144 - fn next_todo(&mut self) -> Option<NodeThing> { 196 + pub(crate) fn next_todo(&mut self) -> Option<NodeThing> { 145 197 while let Some(last) = self.todo.last_mut() { 146 198 let Some(thing) = last.pop() else { 147 199 self.todo.pop(); ··· 156 208 /// 157 209 /// Returns `Ok(Some(item))` for each block encountered (record, missing 158 210 /// record, or missing subtree), or `Ok(None)` when traversal is complete. 159 - /// Only errors on structural MST violations (wrong depth, out-of-order keys). 211 + /// Only errors on structural MST violations (wrong layer, out-of-order keys). 160 212 pub fn step( 161 213 &mut self, 162 214 blocks: &HashMap<ObjectLink, MaybeProcessedBlock>, ··· 165 217 while let Some(thing) = self.next_todo() { 166 218 let Some(mpb) = blocks.get(&thing.link) else { 167 219 return Ok(Some(match thing.kind { 168 - ThingKind::Record(key) => { 169 - WalkItem::MissingRecord { key, cid: thing.link.into() } 170 - } 171 - ThingKind::ChildNode => WalkItem::MissingSubtree { cid: thing.link.into() }, 220 + ThingKind::Record(key) => WalkItem::MissingRecord { 221 + key, 222 + cid: thing.link.into(), 223 + }, 224 + ThingKind::ChildNode => WalkItem::MissingSubtree { 225 + cid: thing.link.into(), 226 + }, 172 227 })); 173 228 }; 174 229 if let Some(out) = self.mpb_step(thing, mpb, &process)? { 175 230 return Ok(Some(WalkItem::Record(out))); 176 231 } 177 232 } 178 - log::debug!("total links: {}", self.links); 179 233 Ok(None) 180 234 } 181 235 182 - /// Advance past leading missing blocks to find the first present record. 183 - /// 184 - /// Returns the key of the last missing *record* encountered before the 185 - /// first present record — i.e., the `prev_key` for a CAR slice's leading 186 - /// edge. After this returns, the next `step` call yields the first present 187 - /// record (or `None` if the whole tree is absent). 188 - pub fn step_to_edge( 189 - &mut self, 190 - blocks: &HashMap<ObjectLink, MaybeProcessedBlock>, 191 - ) -> Result<Option<RepoPath>, WalkError> { 192 - let mut ant = self.clone(); 193 - let mut prev_key = None; 194 - loop { 195 - match ant.step(blocks, noop)? { 196 - Some(WalkItem::Record(_)) => { 197 - // ant went one step too far; self holds the leading-edge position 198 - return Ok(prev_key); 199 - } 200 - Some(WalkItem::MissingRecord { key, .. }) => { 201 - prev_key = Some(key); 202 - *self = ant; 203 - ant = self.clone(); 204 - } 205 - Some(WalkItem::MissingSubtree { .. }) => { 206 - *self = ant; 207 - ant = self.clone(); 208 - } 209 - None => return Ok(prev_key), 210 - } 211 - } 212 - } 213 - 214 236 /// Skip forward to the first record at or after `target`, without emitting anything. 215 237 /// 216 238 /// Uses the tree structure to skip entire subtrees that are provably before `target`, 217 - /// only loading child nodes on the path to `target`. O(depth × branching_factor). 239 + /// only loading child nodes on the path to `target`. O(layer × branching_factor). 218 240 /// 219 241 /// After this returns `Ok(())`, the next call to `step` will yield the first record 220 242 /// at or after `target`, or `None` if no such record exists. ··· 270 292 } 271 293 SeekStep::SkipRecord(key) => { 272 294 self.todo.last_mut().unwrap().pop(); 273 - self.prev_key = key; 295 + self.prev_key = Some(key); 274 296 } 275 297 SeekStep::SkipSubtree => { 276 298 self.todo.last_mut().unwrap().pop(); 277 299 } 278 300 SeekStep::Descend => { 279 301 let child = self.todo.last_mut().unwrap().pop().unwrap(); 280 - // Note: self.todo borrow released before push below 281 302 282 303 let Some(mpb) = blocks.get(&child.link) else { 283 304 // Missing subtree on the seek path; skip it and continue 284 - // (seek is best-effort for sparse trees) 285 305 continue; 286 306 }; 287 307 let MaybeProcessedBlock::Raw(data) = mpb else { ··· 292 312 if node.is_empty() { 293 313 return Err(WalkError::MstError(MstError::EmptyNode)); 294 314 } 295 - // Depth validation mirrors mpb_step: todo still has the (possibly empty) 296 - // parent level, so todo.len()-1 is the parent's depth delta from root. 297 - let current_depth = self.root_depth - (self.todo.len() - 1) as u32; 298 - let next_depth = current_depth 315 + let current_layer = self.root_layer - (self.todo.len() - 1) as u32; 316 + let next_layer = current_layer 299 317 .checked_sub(1) 300 - .ok_or(MstError::DepthUnderflow)?; 301 - if let Some(d) = node.depth 302 - && d != next_depth 318 + .ok_or(MstError::LayerUnderflow)?; 319 + if let Some(d) = node.layer 320 + && d != next_layer 303 321 { 304 - return Err(WalkError::MstError(MstError::WrongDepth { 305 - depth: d, 306 - expected: next_depth, 322 + return Err(WalkError::MstError(MstError::WrongLayer { 323 + layer: d, 324 + expected: next_layer, 307 325 })); 308 326 } 309 - self.links += node.things.len(); 310 327 self.todo.push(node.things); 311 328 } 312 329 } 313 330 } 314 - } 315 - 316 - /// blocking!!!!! 317 - pub fn disk_step( 318 - &mut self, 319 - blocks: &DiskStore, 320 - process: impl Fn(Bytes) -> Bytes, 321 - ) -> Result<Option<WalkItem>, WalkError> { 322 - while let Some(thing) = self.next_todo() { 323 - let Some(block_slice) = blocks.get(&thing.link.to_bytes())? else { 324 - return Ok(Some(match thing.kind { 325 - ThingKind::Record(key) => { 326 - WalkItem::MissingRecord { key, cid: thing.link.into() } 327 - } 328 - ThingKind::ChildNode => WalkItem::MissingSubtree { cid: thing.link.into() }, 329 - })); 330 - }; 331 - let mpb = MaybeProcessedBlock::from_bytes(block_slice.to_vec()); 332 - if let Some(out) = self.mpb_step(thing, &mpb, &process)? { 333 - return Ok(Some(WalkItem::Record(out))); 334 - } 335 - } 336 - Ok(None) 337 331 } 338 332 }
+5 -5
tests/mst-depth.rs
··· 1 1 // use repo_stream::Driver; 2 - use repo_stream::mst::atproto_mst_depth; 2 + use repo_stream::mst::atproto_mst_layer; 3 3 4 4 // https://github.com/bluesky-social/atproto-interop-tests/blob/main/mst/example_keys.txt 5 5 const INTEROP_EXAMPLE_KEYS: &str = "\ ··· 164 164 fn test_interop_example_keys() { 165 165 for key in INTEROP_EXAMPLE_KEYS.split('\n') { 166 166 let expected: u32 = key.chars().nth(1).unwrap().to_digit(16).unwrap(); 167 - let computed: u32 = atproto_mst_depth(key); 167 + let computed: u32 = atproto_mst_layer(key); 168 168 assert_eq!(computed, expected); 169 169 } 170 170 } ··· 183 183 ("app.bsky.feed.post/454397e440ec", 4), 184 184 ("app.bsky.feed.post/9adeb165882c", 8), 185 185 ] { 186 - let computed = atproto_mst_depth(key); 186 + let computed = atproto_mst_layer(key); 187 187 assert_eq!(computed, expected); 188 188 } 189 189 } ··· 197 197 ("app.bsky.feed.post/454397e440ec", 4), 198 198 ("app.bsky.feed.post/9adeb165882c", 8), 199 199 ] { 200 - let computed = atproto_mst_depth(key); 200 + let computed = atproto_mst_layer(key); 201 201 assert_eq!(computed, expected); 202 202 } 203 203 } ··· 206 206 fn test_ietf_example_keys() { 207 207 // https://atproto.com/specs/repository#mst-structure 208 208 for (key, expected) in [("key1", 0), ("key7", 1), ("key515", 4)] { 209 - let computed = atproto_mst_depth(key); 209 + let computed = atproto_mst_layer(key); 210 210 assert_eq!(computed, expected); 211 211 } 212 212 }