···11extern crate repo_stream;
22-use repo_stream::{DriverBuilder, Step};
22+use repo_stream::DriverBuilder;
33use std::collections::HashSet;
44use std::path::Path;
55···24242525 let mut seen = HashSet::new();
2626 let mut collections = vec![];
2727- loop {
2828- match mem_car.next_chunk(256).unwrap() {
2929- Step::End(_) => break,
3030- Step::Value(outputs) => {
3131- for output in outputs {
3232- let collection = output.key.split_once('/').unwrap().0.to_string();
3333- if seen.insert(collection.clone()) {
3434- collections.push(collection);
3535- }
3636- }
2727+ while let Some(outputs) = mem_car.next_chunk_strict(256).unwrap() {
2828+ for output in outputs {
2929+ let collection = output.key.split_once('/').unwrap().0.to_string();
3030+ if seen.insert(collection.clone()) {
3131+ collections.push(collection);
3732 }
3833 }
3934 }
···55505651 let mut collections = vec![];
5752 loop {
5858- match mem_car.next().unwrap() {
5959- Step::End(_) => break,
6060- Step::Value(output) => {
5353+ match mem_car.next_strict().unwrap() {
5454+ None => break,
5555+ Some(output) => {
6156 let collection = output.key.split_once('/').unwrap().0.to_string();
6257 collections.push(collection.clone());
6358 mem_car.seek(&format!("{collection}/{tilde_max}")).unwrap();
···77727873 let mut seen = HashSet::new();
7974 let mut collections = vec![];
8080- loop {
8181- match mem_car.next_chunk(256).unwrap() {
8282- Step::End(_) => break,
8383- Step::Value(outputs) => {
8484- for output in outputs {
8585- let collection = output.key.split_once('/').unwrap().0.to_string();
8686- if seen.insert(collection.clone()) {
8787- collections.push(collection);
8888- }
8989- }
7575+ while let Some(outputs) = mem_car.next_chunk_strict(256).unwrap() {
7676+ for output in outputs {
7777+ let collection = output.key.split_once('/').unwrap().0.to_string();
7878+ if seen.insert(collection.clone()) {
7979+ collections.push(collection);
9080 }
9181 }
9282 }
···1049410595 let mut collections = vec![];
10696 loop {
107107- match mem_car.next().unwrap() {
108108- Step::End(_) => break,
109109- Step::Value(output) => {
9797+ match mem_car.next_strict().unwrap() {
9898+ None => break,
9999+ Some(output) => {
110100 let collection = output.key.split_once('/').unwrap().0.to_string();
111101 collections.push(collection.clone());
112102 mem_car.seek(&format!("{collection}/{tilde_max}")).unwrap();
+8-6
benches/huge-car.rs
···11extern crate repo_stream;
22-use repo_stream::{Driver, Step};
22+use repo_stream::DriverBuilder;
33use std::path::{Path, PathBuf};
4455use criterion::{Criterion, criterion_group, criterion_main};
···3232 let reader = tokio::fs::File::open(filename).await.unwrap();
3333 let reader = tokio::io::BufReader::new(reader);
34343535- let mut driver = match Driver::load_car(reader, ser, 1024).await.unwrap() {
3636- Driver::Memory(_, _, mem_driver) => mem_driver,
3737- Driver::Disk(_) => panic!("not doing disk for benchmark"),
3838- };
3535+ let mut driver = DriverBuilder::new()
3636+ .with_mem_limit_mb(1024)
3737+ .with_block_processor(ser)
3838+ .load_car(reader)
3939+ .await
4040+ .unwrap();
39414042 let mut n = 0;
4141- while let Step::Value(pairs) = driver.next_chunk(256).await.unwrap() {
4343+ while let Some(pairs) = driver.next_chunk(256).unwrap() {
4244 n += pairs.len();
4345 }
4446 n
+8-6
benches/non-huge-cars.rs
···11extern crate repo_stream;
22-use repo_stream::{Driver, Step};
22+use repo_stream::DriverBuilder;
3344use criterion::{Criterion, criterion_group, criterion_main};
55···3939}
40404141async fn drive_car(bytes: &[u8]) -> usize {
4242- let mut driver = match Driver::load_car(bytes, ser, 32).await.unwrap() {
4343- Driver::Memory(_, _, mem_driver) => mem_driver,
4444- Driver::Disk(_) => panic!("not benching big cars here"),
4545- };
4242+ let mut mem_car = DriverBuilder::new()
4343+ .with_mem_limit_mb(32)
4444+ .with_block_processor(ser)
4545+ .load_car(bytes)
4646+ .await
4747+ .unwrap();
46484749 let mut n = 0;
4848- while let Step::Value(pairs) = driver.next_chunk(256).await.unwrap() {
5050+ while let Some(pairs) = mem_car.next_chunk_strict(256).unwrap() {
4951 n += pairs.len();
5052 }
5153 n
+3-6
examples/disk-read-file/main.rs
···99static GLOBAL: MiMalloc = MiMalloc;
10101111use clap::Parser;
1212-use repo_stream::{DiskBuilder, DriverBuilder, LoadError, Step};
1212+use repo_stream::{DiskBuilder, DriverBuilder, LoadError};
1313use std::path::PathBuf;
1414use std::time::Instant;
1515···6666 // this example uses the disk driver's channel mode: the tree walking is
6767 // spawned onto a blocking thread, and we get chunks of rkey+blocks back
6868 let (mut rx, join) = driver.to_channel(512);
6969- while let Some(step) = rx.recv().await {
7070- let step = step?;
7171- let Step::Value(outputs) = step else {
7272- break;
7373- };
6969+ while let Some(outputs) = rx.recv().await {
7070+ let outputs = outputs?;
74717572 // keep a count of the total number of blocks seen
7673 n += outputs.len();
+2-2
examples/read-file/main.rs
···4455extern crate repo_stream;
66use clap::Parser;
77-use repo_stream::{DriverBuilder, Output, Step};
77+use repo_stream::{DriverBuilder, Output};
88use std::path::PathBuf;
991010type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
···31313232 log::info!("got commit: {:?}", mem_car.commit);
33333434- while let Step::Value(records) = mem_car.next_chunk(256)? {
3434+ while let Some(records) = mem_car.next_chunk_strict(256)? {
3535 for Output {
3636 key: _,
3737 cid: _,
+32-20
examples/read-slice/main.rs
···33*/
4455extern crate repo_stream;
66-use repo_stream::{DriverBuilder, LoadError, Output, Step};
66+use repo_stream::{DriverBuilder, LoadError, Output, WalkItem};
7788type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
99···2626 "\nthis slice is from {}, repo rev {}",
2727 mem_car.commit.did, mem_car.commit.rev
2828 );
2929- if let Some(key) = &mem_car.prev_key {
3030- println!(" -> key immediately before CAR slice: {key}");
3131- } else {
3232- println!(
3333- " -> no key preceeding the CAR slice, so it includes the leading edge of the tree."
3434- );
3535- }
36293730 println!("included records:");
3838- let end = loop {
3939- match mem_car.next_chunk(256)? {
4040- Step::Value(chunk) => {
4141- for Output { cid, key, .. } in chunk {
3131+3232+ let mut preceding: Option<String> = None;
3333+ let mut trailing: Option<String> = None;
3434+ let mut after_records = false;
3535+3636+ while let Some(items) = mem_car.next_chunk(256)? {
3737+ for item in items {
3838+ match item {
3939+ WalkItem::Record(Output { cid, key, .. }) => {
4040+ after_records = true;
4141+ trailing = None;
4242 print!(" SHA256 ");
4343 for byte in cid.to_bytes().iter().skip(4).take(5) {
4444 print!("{byte:02x}");
4545 }
4646 println!("...\t{key}");
4747 }
4848+ WalkItem::MissingRecord { key, .. } => {
4949+ if !after_records {
5050+ preceding = Some(key);
5151+ } else if trailing.is_none() {
5252+ trailing = Some(key);
5353+ }
5454+ }
5555+ WalkItem::MissingSubtree { .. } => {}
4856 }
4949- Step::End(e) => break e,
5057 }
5151- };
5858+ }
52595360 println!("done walking records present in the slice.");
5454- if let Some(key) = end {
5555- println!(" -> key immediately after CAR slice: {key}");
5656- } else {
5757- println!(
5858- " -> no key proceeding the CAR slice, so it includes the trailing edge of the tree."
5959- );
6161+ match preceding {
6262+ Some(key) => println!(" -> key immediately before CAR slice: {key}"),
6363+ None => println!(
6464+ " -> no key preceding the CAR slice, so it includes the leading edge of the tree."
6565+ ),
6666+ }
6767+ match trailing {
6868+ Some(key) => println!(" -> key immediately after CAR slice: {key}"),
6969+ None => println!(
7070+ " -> no key following the CAR slice, so it includes the trailing edge of the tree."
7171+ ),
6072 }
61736274 Ok(())
+3-3
readme.md
···1111[sponsor-badge]: https://img.shields.io/badge/at-microcosm-b820f9?labelColor=b820f9&logo=githubsponsors&logoColor=fff
12121313```rust no_run
1414-use repo_stream::{DriverBuilder, LoadError, DiskBuilder, Output, Step};
1414+use repo_stream::{DriverBuilder, LoadError, DiskBuilder, Output};
15151616#[tokio::main]
1717async fn main() -> Result<(), Box<dyn std::error::Error>> {
···3131 {
3232 // if all blocks fit within memory
3333 Ok(mut mem_car) => {
3434- while let Step::Value(chunk) = mem_car.next_chunk(256)? {
3434+ while let Some(chunk) = mem_car.next_chunk_strict(256)? {
3535 for Output { key: _, cid: _, data } in chunk {
3636 let size = usize::from_ne_bytes(<[u8; 8]>::try_from(data).unwrap());
3737 total_size += size;
···4646 // do the spilling, get back a disk driver
4747 let (_commit, _prev_key, mut driver) = partial.finish_loading(store).await?;
48484949- while let Step::Value(chunk) = driver.next_chunk(256).await? {
4949+ while let Some(chunk) = driver.next_chunk(256).await? {
5050 for Output { key: _, cid: _, data } in chunk {
5151 let size = usize::from_ne_bytes(<[u8; 8]>::try_from(data).unwrap());
5252 total_size += size;
+25-22
src/disk.rs
···1515*/
16161717use crate::{
1818- Bytes, Step,
1818+ Bytes,
1919 mst::ThingKind,
2020 walk::{MaybeProcessedBlock, MstError, Output, WalkError, WalkItem, Walker},
2121};
···6262 #[error("Storage error: {0}")]
6363 StorageError(#[from] DiskError),
6464 #[error("Unexpected missing block: {0:?}")]
6565- MissingBlock(cid::Cid),
6565+ MissingBlock(Box<cid::Cid>),
6666 #[error("Tried to send on a closed channel")]
6767 ChannelSendError,
6868 #[error("Failed to join a task: {0}")]
···255255}
256256257257impl DiskDriver {
258258- /// Walk the MST returning up to `n` key + record pairs
258258+ /// Walk the MST returning up to `n` key + record pairs.
259259+ ///
260260+ /// Returns `Ok(Some(outputs))` while records remain, `Ok(None)` when done.
261261+ /// Errors if any block is absent (disk path always expects all blocks present).
259262 ///
260263 /// ```no_run
261261- /// # use repo_stream::{disk::{DiskDriver, DriveError, _get_fake_disk_driver}, Step};
264264+ /// # use repo_stream::disk::{DiskDriver, DriveError, _get_fake_disk_driver};
262265 /// # #[tokio::main]
263266 /// # async fn main() -> Result<(), DriveError> {
264267 /// # let mut disk_driver = _get_fake_disk_driver();
265265- /// while let Step::Value(outputs) = disk_driver.next_chunk(256).await? {
268268+ /// while let Some(outputs) = disk_driver.next_chunk(256).await? {
266269 /// for output in outputs {
267270 /// println!("{}: size={}", output.key, output.data.len());
268271 /// }
···270273 /// # Ok(())
271274 /// # }
272275 /// ```
273273- pub async fn next_chunk(&mut self, n: usize) -> Result<Step<Vec<Output>>, DriveError> {
276276+ pub async fn next_chunk(&mut self, n: usize) -> Result<Option<Vec<Output>>, DriveError> {
274277 let process = self.process;
275278276279 let mut state = self.state.take().expect("DiskDriver must have Some(state)");
···285288 Ok(Some(WalkItem::Record(output))) => out.push(output),
286289 Ok(Some(WalkItem::MissingRecord { cid, .. }))
287290 | Ok(Some(WalkItem::MissingSubtree { cid })) => {
288288- return (state, Err(DriveError::MissingBlock(cid)));
291291+ return (state, Err(DriveError::MissingBlock(Box::new(cid))));
289292 }
290293 Ok(None) => break,
291294 }
···300303 let out = res?;
301304302305 if out.is_empty() {
303303- Ok(Step::End(None))
306306+ Ok(None)
304307 } else {
305305- Ok(Step::Value(out))
308308+ Ok(Some(out))
306309 }
307310 }
308311309312 fn read_tx_blocking(
310313 &mut self,
311314 n: usize,
312312- tx: mpsc::Sender<Result<Step<Vec<Output>>, DriveError>>,
313313- ) -> Result<(), mpsc::error::SendError<Result<Step<Vec<Output>>, DriveError>>> {
315315+ tx: mpsc::Sender<Result<Vec<Output>, DriveError>>,
316316+ ) -> Result<(), mpsc::error::SendError<Result<Vec<Output>, DriveError>>> {
314317 let BigState { store, walker } = self.state.as_mut().expect("valid state");
315318316319 loop {
···322325 Ok(Some(WalkItem::Record(output))) => out.push(output),
323326 Ok(Some(WalkItem::MissingRecord { cid, .. }))
324327 | Ok(Some(WalkItem::MissingSubtree { cid })) => {
325325- return tx.blocking_send(Err(DriveError::MissingBlock(cid)));
328328+ return tx.blocking_send(Err(DriveError::MissingBlock(Box::new(cid))));
326329 }
327330 Ok(None) => break,
328331 }
···331334 if out.is_empty() {
332335 break;
333336 }
334334- tx.blocking_send(Ok(Step::Value(out)))?;
337337+ tx.blocking_send(Ok(out))?;
335338 }
336339337340 Ok(())
338341 }
339342340340- /// Spawn the disk reading task into a tokio blocking thread
343343+ /// Spawn the disk reading task into a tokio blocking thread.
344344+ ///
345345+ /// The channel sends `Ok(chunk)` for each batch of records. When the walk
346346+ /// is complete the sender is dropped and `rx.recv()` returns `None`.
341347 ///
342348 /// ```no_run
343343- /// # use repo_stream::{disk::{DiskDriver, DriveError, _get_fake_disk_driver}, Step};
349349+ /// # use repo_stream::disk::{DiskDriver, DriveError, _get_fake_disk_driver};
344350 /// # #[tokio::main]
345351 /// # async fn main() -> Result<(), DriveError> {
346352 /// # let mut disk_driver = _get_fake_disk_driver();
347353 /// let (mut rx, join) = disk_driver.to_channel(512);
348348- /// while let Some(recvd) = rx.recv().await {
349349- /// let outputs = recvd?;
350350- /// let Step::Value(outputs) = outputs else { break; };
351351- /// for output in outputs {
354354+ /// while let Some(chunk) = rx.recv().await {
355355+ /// for output in chunk? {
352356 /// println!("{}: size={}", output.key, output.data.len());
353357 /// }
354354- ///
355358 /// }
356359 /// # Ok(())
357360 /// # }
···360363 mut self,
361364 n: usize,
362365 ) -> (
363363- mpsc::Receiver<Result<Step<Vec<Output>>, DriveError>>,
366366+ mpsc::Receiver<Result<Vec<Output>, DriveError>>,
364367 tokio::task::JoinHandle<Self>,
365368 ) {
366366- let (tx, rx) = mpsc::channel::<Result<Step<Vec<Output>>, DriveError>>(1);
369369+ let (tx, rx) = mpsc::channel::<Result<Vec<Output>, DriveError>>(1);
367370368371 let chan_task = tokio::task::spawn_blocking(move || {
369372 if let Err(mpsc::error::SendError(_)) = self.read_tx_blocking(n, tx) {
+5-5
src/lib.rs
···1818`iroh_car` additionally applies a block size limit of `2MiB`.
19192020```
2121-use repo_stream::{DriverBuilder, Step};
2121+use repo_stream::DriverBuilder;
22222323# #[tokio::main]
2424# async fn main() -> Result<(), Box<dyn std::error::Error>> {
···3131 .load_car(reader)
3232 .await?;
33333434-while let Step::Value(chunk) = mem_car.next_chunk(256)? {
3434+while let Some(chunk) = mem_car.next_chunk_strict(256)? {
3535 for output in chunk {
3636 let size = usize::from_ne_bytes(output.data.try_into().unwrap());
3737 total_size += size;
···4545If the CAR is too large for memory, handle the `MemoryLimitReached` error:
46464747```no_run
4848-use repo_stream::{DriverBuilder, LoadError, Step};
4848+use repo_stream::{DriverBuilder, LoadError};
49495050# #[tokio::main]
5151# async fn main() -> Result<(), Box<dyn std::error::Error>> {
···5656 .await
5757{
5858 Ok(mut mem_car) => {
5959- while let Step::Value(chunk) = mem_car.next_chunk(256)? {
5959+ while let Some(chunk) = mem_car.next_chunk_strict(256)? {
6060 // process records
6161 }
6262 }
···8282pub use disk::{DiskBuilder, DiskDriver, DiskError, DiskStore, DriveError};
8383pub use mem::{DriverBuilder, LoadError, MemCar, PartialCar};
8484pub use mst::Commit;
8585-pub use walk::{Output, Step, WalkError, WalkItem, noop};
8585+pub use walk::{Output, WalkError, WalkItem, noop};
86868787pub type Bytes = Vec<u8>;
8888
+57-55
src/mem.rs
···11//! Load a CAR file into memory and walk its MST
2233use crate::{
44- Bytes, HashMap, RepoPath, Step,
44+ Bytes, HashMap, RepoPath,
55 disk::{DiskDriver, DiskError, DiskStore, DriveError, make_disk_driver},
66 mst::{Commit, MstNode, ObjectLink},
77 walk::{MaybeProcessedBlock, Output, WalkError, WalkItem, Walker},
···161161 blocks: mem_blocks,
162162 walker: Walker::new(root_node),
163163 process,
164164- trailing_key: None,
165164 })
166165}
167166···171170 pub commit: Commit,
172171 /// For CAR slices: the key of the last record before this slice's leading edge.
173172 /// `None` if this slice (or full CAR) starts from the leftmost record in the tree.
173173+ /// Not set automatically — callers may derive it from leading `MissingRecord` items.
174174 pub prev_key: Option<RepoPath>,
175175 pub blocks: HashMap<ObjectLink, MaybeProcessedBlock>,
176176 walker: Walker,
177177 process: fn(Bytes) -> Bytes,
178178- /// `None` = no gap encountered yet; `Some(k)` = trailing edge determined.
179179- trailing_key: Option<Option<RepoPath>>,
180178}
181179182180impl MemCar {
183181 /// Seek forward to the first record at or after `target`.
184182 ///
185183 /// Uses the MST structure to skip entire subtrees efficiently.
186186- /// After this returns, the next `next` or `next_chunk` call will start at or after `target`.
184184+ /// After this returns, the next call to `next*` will start at or after `target`.
187185 pub fn seek(&mut self, target: &str) -> Result<(), WalkError> {
188186 self.walker.seek(target, &self.blocks)
189187 }
190188191191- /// Walk forward past any gaps to determine the trailing edge key.
192192- fn find_trailing_edge(&mut self) -> Result<Option<RepoPath>, WalkError> {
193193- let trailing = loop {
189189+ /// Get the next item from the walk.
190190+ ///
191191+ /// Returns all `WalkItem` variants as-is, including `MissingRecord` and
192192+ /// `MissingSubtree` for sparse trees and CAR slices. Returns `Ok(None)`
193193+ /// when the walk is complete.
194194+ ///
195195+ /// TODO: make this an implementation of Iterator
196196+ pub fn next(&mut self) -> Result<Option<WalkItem>, WalkError> {
197197+ self.walker.step(&self.blocks, self.process)
198198+ }
199199+200200+ /// Collect up to `n` walk items.
201201+ ///
202202+ /// Like `next`, passes through `MissingRecord` and `MissingSubtree` items.
203203+ /// Returns `Ok(None)` when the walk is complete.
204204+ pub fn next_chunk(&mut self, n: usize) -> Result<Option<Vec<WalkItem>>, WalkError> {
205205+ let mut out = Vec::with_capacity(n);
206206+ for _ in 0..n {
194207 match self.walker.step(&self.blocks, self.process)? {
195195- Some(WalkItem::Record(r)) => break Some(r.key),
196196- Some(WalkItem::MissingRecord { key, .. }) => break Some(key),
197197- Some(WalkItem::MissingSubtree { .. }) => continue,
198198- None => break None,
208208+ Some(item) => out.push(item),
209209+ None => break,
199210 }
200200- };
201201- self.trailing_key = Some(trailing.clone());
202202- Ok(trailing)
211211+ }
212212+ if out.is_empty() {
213213+ Ok(None)
214214+ } else {
215215+ Ok(Some(out))
216216+ }
203217 }
204218205205- /// Get the next record.
219219+ /// Get the next present record, erroring if any block is absent.
206220 ///
207207- /// Returns `Step::Value(output)` for each record in key order, then
208208- /// `Step::End(None)` at the end of a full CAR, or `Step::End(Some(key))`
209209- /// for CAR slices where `key` is the first key immediately after the slice.
210210- ///
211211- /// TODO: make this an implementation of Iterator
212212- pub fn next(&mut self) -> Result<Step, WalkError> {
213213- if let Some(trailing) = &self.trailing_key {
214214- return Ok(Step::End(trailing.clone()));
215215- }
221221+ /// Returns `Ok(None)` when the walk is complete. Returns
222222+ /// `Err(WalkError::MissingBlock)` if a record block is absent, or
223223+ /// `Err(WalkError::MissingNode)` if an MST node block is absent.
224224+ pub fn next_strict(&mut self) -> Result<Option<Output>, WalkError> {
216225 match self.walker.step(&self.blocks, self.process)? {
217217- Some(WalkItem::Record(out)) => Ok(Step::Value(out)),
218218- Some(WalkItem::MissingRecord { key, .. }) => {
219219- self.trailing_key = Some(Some(key.clone()));
220220- Ok(Step::End(Some(key)))
221221- }
222222- Some(WalkItem::MissingSubtree { .. }) => {
223223- let trailing = self.find_trailing_edge()?;
224224- Ok(Step::End(trailing))
225225- }
226226- None => {
227227- self.trailing_key = Some(None);
228228- Ok(Step::End(None))
226226+ None => Ok(None),
227227+ Some(WalkItem::Record(out)) => Ok(Some(out)),
228228+ Some(WalkItem::MissingRecord { key, cid }) => Err(WalkError::MissingBlock {
229229+ key,
230230+ cid: Box::new(cid),
231231+ }),
232232+ Some(WalkItem::MissingSubtree { cid }) => {
233233+ Err(WalkError::MissingNode { cid: Box::new(cid) })
229234 }
230235 }
231236 }
232237233233- /// Iterate up to `n` records in key order.
238238+ /// Collect up to `n` present records, erroring if any block is absent.
234239 ///
235235- /// Returns `Step::Value(records)` while records remain, then `Step::End(next_key)`
236236- /// where `next_key` is the first key after the slice (for CAR slices), or `None`.
237237- pub fn next_chunk(&mut self, n: usize) -> Result<Step<Vec<Output>>, WalkError> {
238238- if let Some(trailing) = &self.trailing_key {
239239- return Ok(Step::End(trailing.clone()));
240240- }
240240+ /// Returns `Ok(None)` when the walk is complete. Returns
241241+ /// `Err(WalkError::MissingBlock)` if a record block is absent, or
242242+ /// `Err(WalkError::MissingNode)` if an MST node block is absent.
243243+ pub fn next_chunk_strict(&mut self, n: usize) -> Result<Option<Vec<Output>>, WalkError> {
241244 let mut out = Vec::with_capacity(n);
242245 for _ in 0..n {
243246 match self.walker.step(&self.blocks, self.process)? {
247247+ None => break,
244248 Some(WalkItem::Record(record)) => out.push(record),
245245- Some(WalkItem::MissingRecord { key, .. }) => {
246246- self.trailing_key = Some(Some(key.clone()));
247247- return Ok(Step::Value(out)); // may be empty
249249+ Some(WalkItem::MissingRecord { key, cid }) => {
250250+ return Err(WalkError::MissingBlock {
251251+ key,
252252+ cid: Box::new(cid),
253253+ });
248254 }
249249- Some(WalkItem::MissingSubtree { .. }) => {
250250- let trailing = self.find_trailing_edge()?;
251251- self.trailing_key = Some(trailing);
252252- return Ok(Step::Value(out)); // may be empty
255255+ Some(WalkItem::MissingSubtree { cid }) => {
256256+ return Err(WalkError::MissingNode { cid: Box::new(cid) });
253257 }
254254- None => break,
255258 }
256259 }
257260 if out.is_empty() {
258258- self.trailing_key = Some(None);
259259- Ok(Step::End(None))
261261+ Ok(None)
260262 } else {
261261- Ok(Step::Value(out))
263263+ Ok(Some(out))
262264 }
263265 }
264266}
+9-6
src/walk.rs
···7070 MstError(#[from] MstError),
7171 #[error("storage error: {0}")]
7272 StorageError(#[from] fjall::Error),
7373+ /// Returned by `next_strict`/`next_chunk_strict` when a record block is absent.
7474+ #[error("record block absent: key={key:?} cid={cid}")]
7575+ MissingBlock {
7676+ key: crate::RepoPath,
7777+ cid: Box<cid::Cid>,
7878+ },
7979+ /// Returned by `next_strict`/`next_chunk_strict` when an MST node block is absent.
8080+ #[error("MST node block absent: cid={cid}")]
8181+ MissingNode { cid: Box<cid::Cid> },
7382}
74837584/// Errors from invalid repo path keys
···106115 pub key: RepoPath,
107116 pub cid: Cid,
108117 pub data: T,
109109-}
110110-111111-#[derive(Debug, PartialEq)]
112112-pub enum Step<T = Output> {
113113- Value(T),
114114- End(Option<RepoPath>),
115118}
116119117120/// Walker: traverser of an atproto MST
+43-25
tests/car-slices.rs
···11extern crate repo_stream;
22-use repo_stream::{DriverBuilder, LoadError, Output, Step};
22+use repo_stream::{DriverBuilder, LoadError, Output, WalkItem};
3344-const RECORD_SLICE: &'static [u8] = include_bytes!("../car-samples/slice-one.car");
55-const RECORD_NODE_FIRST_KEY: &'static [u8] =
66- include_bytes!("../car-samples/slice-node-first-key.car");
77-const RECORD_NODE_AFTER: &'static [u8] = include_bytes!("../car-samples/slice-node-after.car");
88-const RECORD_NODE_ABSENT: &'static [u8] =
99- include_bytes!("../car-samples/slice-proving-absence.car");
44+const RECORD_SLICE: &[u8] = include_bytes!("../car-samples/slice-one.car");
55+const RECORD_NODE_FIRST_KEY: &[u8] = include_bytes!("../car-samples/slice-node-first-key.car");
66+const RECORD_NODE_AFTER: &[u8] = include_bytes!("../car-samples/slice-node-after.car");
77+const RECORD_NODE_ABSENT: &[u8] = include_bytes!("../car-samples/slice-proving-absence.car");
10899+/// Walk a CAR slice and assert on:
1010+/// - `expect_preceding`: the last `MissingRecord` key before any present records
1111+/// (i.e. the key just before the slice's window)
1212+/// - `expected_records`: count of present records
1313+/// - `expected_sum`: sum of record sizes (via processor)
1414+/// - `expect_key`: a specific key that must appear among the present records
1515+/// - `expect_trailing`: the first `MissingRecord` key after the last present record
1616+/// (i.e. the key just after the slice's window)
1117async fn test_car_slice(
1218 bytes: &[u8],
1319 expected_records: usize,
1420 expected_sum: usize,
1515- expect_preceeding: Option<&str>,
2121+ expect_preceding: Option<&str>,
1622 expect_key: Option<&str>,
1717- expect_proceeding: Option<&str>,
2323+ expect_trailing: Option<&str>,
1824) {
1925 let mut mem_car = match DriverBuilder::new()
2026 .with_block_processor(|block| block.len().to_ne_bytes().to_vec())
···2531 Err(LoadError::MemoryLimitReached(_)) => panic!("too big"),
2632 Err(e) => panic!("{e}"),
2733 };
2828-2929- assert_eq!(mem_car.prev_key.as_deref(), expect_preceeding);
30343135 let mut found_records = 0;
3236 let mut sum = 0;
3337 let mut found_expected_key = false;
3438 let mut prev_key = "".to_string();
35393636- loop {
3737- match mem_car.next_chunk(256).unwrap() {
3838- Step::Value(records) => {
3939- for Output { key, cid: _, data } in records {
4040+ // The last MissingRecord key seen before the first present record.
4141+ let mut preceding: Option<String> = None;
4242+ // The first MissingRecord key seen after the last present record.
4343+ let mut trailing: Option<String> = None;
4444+ let mut after_records = false;
4545+4646+ while let Some(items) = mem_car.next_chunk(256).unwrap() {
4747+ for item in items {
4848+ match item {
4949+ WalkItem::Record(Output { key, cid: _, data }) => {
5050+ after_records = true;
5151+ trailing = None; // a later MissingRecord replaces this
4052 found_records += 1;
41534254 let (int_bytes, _) = data.split_at(size_of::<usize>());
4355 let size = usize::from_ne_bytes(int_bytes.try_into().unwrap());
4444-4556 sum += size;
5757+4658 if Some(key.as_str()) == expect_key {
4759 found_expected_key = true;
4860 }
4949- eprintln!("!!!! {key}");
5061 assert!(key > prev_key, "keys are streamed in order");
5162 prev_key = key;
5263 }
5353- }
5454- Step::End(proceeding) => {
5555- assert_eq!(proceeding.as_deref(), expect_proceeding);
5656- break;
6464+ WalkItem::MissingRecord { key, .. } => {
6565+ if !after_records {
6666+ preceding = Some(key);
6767+ } else if trailing.is_none() {
6868+ trailing = Some(key);
6969+ }
7070+ }
7171+ WalkItem::MissingSubtree { .. } => {}
5772 }
5873 }
5974 }
60756176 assert_eq!(found_records, expected_records);
7777+ assert_eq!(preceding.as_deref(), expect_preceding);
7878+ assert_eq!(trailing.as_deref(), expect_trailing);
7979+6280 if expected_records > 0 {
6381 assert!(found_expected_key);
6482 assert_eq!(sum, expected_sum);
6565- } else {
6666- assert!(!found_expected_key);
6783 }
6884}
6985···108124109125#[tokio::test]
110126async fn test_record_slice_proving_absence() {
111111- // missing key is `app.bsky.feed.like/3lohfzs6qea23`
112112- // NOTE: repo-stream output here isn't enough info for proof
127127+ // proves `app.bsky.feed.like/3lohfzs6qea23` is absent.
128128+ // the included MST nodes contain entries for neighbouring keys whose
129129+ // record blocks are not in this CAR — they surface as MissingRecord items.
130130+ // no present records; the last MissingRecord key seen is the neighbour.
113131 test_car_slice(
114132 RECORD_NODE_ABSENT,
115133 0,
+2-2
tests/non-huge-cars.rs
···11extern crate repo_stream;
22-use repo_stream::{DriverBuilder, Output, Step};
22+use repo_stream::{DriverBuilder, Output};
3344const EMPTY_CAR: &'static [u8] = include_bytes!("../car-samples/empty.car");
55const TINY_CAR: &'static [u8] = include_bytes!("../car-samples/tiny.car");
···2424 let mut found_bsky_profile = false;
2525 let mut prev_key = "".to_string();
26262727- while let Step::Value(pairs) = mem_car.next_chunk(256).unwrap() {
2727+ while let Some(pairs) = mem_car.next_chunk_strict(256).unwrap() {
2828 for Output { key, cid: _, data } in pairs {
2929 records += 1;
3030