···1818serde = { version = "1.0.228", features = ["derive"] }
1919serde_bytes = "0.11.19"
2020serde_ipld_dagcbor = "0.6.4"
2121+sha2 = "0.10.9"
2122thiserror = "2.0.17"
2223tokio = { version = "1.47.1", features = ["rt", "sync"] }
2324
+12-1
examples/read-file/main.rs
···11extern crate repo_stream;
22use clap::Parser;
33+use repo_stream::drive::Processable;
44+use serde::{Deserialize, Serialize};
35use std::path::PathBuf;
4657type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
···1012 file: PathBuf,
1113}
12141515+#[derive(Clone, Serialize, Deserialize)]
1616+struct S(usize);
1717+1818+impl Processable for S {
1919+ fn get_size(&self) -> usize {
2020+ 0 // no additional space taken, just its stack size (newtype is free)
2121+ }
2222+}
2323+1324#[tokio::main]
1425async fn main() -> Result<()> {
1526 env_logger::init();
···1930 let reader = tokio::io::BufReader::new(reader);
20312132 let (commit, mut driver) =
2222- match repo_stream::drive::load_car(reader, |block| block.len(), 1024 * 1024).await? {
3333+ match repo_stream::drive::load_car(reader, |block| S(block.len()), 1024 * 1024).await? {
2334 repo_stream::drive::Vehicle::Lil(commit, mem_driver) => (commit, mem_driver),
2435 repo_stream::drive::Vehicle::Big(_) => panic!("can't handle big cars yet"),
2536 };
-4
src/mst.rs
···8383 /// with an empty array of entries. This is the only situation in which a
8484 /// tree may contain an empty leaf node which does not either contain keys
8585 /// ("entries") or point to a sub-tree containing entries.
8686- ///
8787- /// TODO: to me this is slightly unclear with respect to `l` (ask someone).
8888- /// ...is that what "The top of the tree must not be a an empty node which
8989- /// only points to a sub-tree." is referring to?
9086 pub fn is_empty(&self) -> bool {
9187 self.left.is_none() && self.entries.is_empty()
9288 }
+169-28
src/walk.rs
···44use crate::drive::{MaybeProcessedBlock, Processable};
55use crate::mst::Node;
66use ipld_core::cid::Cid;
77+use sha2::{Digest, Sha256};
78use std::collections::HashMap;
89use std::convert::Infallible;
910···1718 #[error("Failed to decode commit block: {0}")]
1819 BadCommit(#[from] serde_ipld_dagcbor::DecodeError<Infallible>),
1920 #[error("Action node error: {0}")]
2020- RkeyError(#[from] RkeyError),
2121+ MstError(#[from] MstError),
2122 #[error("Encountered an rkey out of order while walking the MST")]
2223 RkeyOutOfOrder,
2324}
···3435}
35363637/// Errors from invalid Rkeys
3737-#[derive(Debug, thiserror::Error)]
3838-pub enum RkeyError {
3838+#[derive(Debug, PartialEq, thiserror::Error)]
3939+pub enum MstError {
3940 #[error("Failed to compute an rkey due to invalid prefix_len")]
4041 EntryPrefixOutOfbounds,
4142 #[error("RKey was not utf-8")]
4243 EntryRkeyNotUtf8(#[from] std::string::FromUtf8Error),
4444+ #[error("Nodes cannot be empty (except for an entirely empty MST)")]
4545+ EmptyNode,
4646+ #[error("Found an entry with rkey at the wrong depth")]
4747+ WrongDepth,
4848+ #[error("Lost track of our depth (possible bug?)")]
4949+ LostDepth,
5050+ #[error("MST depth underflow: depth-0 node with child trees")]
5151+ DepthUnderflow,
4352}
44534554/// Walker outputs
···55645665#[derive(Debug, Clone, PartialEq)]
5766enum Need {
5858- Node(Cid),
6767+ Node { depth: Depth, cid: Cid },
5968 Record { rkey: String, cid: Cid },
6069}
61706262-fn push_from_node(stack: &mut Vec<Need>, node: &Node) -> Result<(), RkeyError> {
7171+#[derive(Debug, Clone, Copy, PartialEq)]
7272+enum Depth {
7373+ Root,
7474+ Depth(u32),
7575+}
7676+7777+impl Depth {
7878+ fn from_key(key: &[u8]) -> Self {
7979+ let mut zeros = 0;
8080+ for byte in Sha256::digest(key) {
8181+ let leading = byte.leading_zeros();
8282+ zeros += leading;
8383+ if leading < 8 {
8484+ break;
8585+ }
8686+ }
8787+ Self::Depth(zeros / 2) // truncating divide (rounds down)
8888+ }
8989+ fn next_expected(&self) -> Result<Option<u32>, MstError> {
9090+ match self {
9191+ Self::Root => Ok(None),
9292+ Self::Depth(d) => d.checked_sub(1).ok_or(MstError::DepthUnderflow).map(Some),
9393+ }
9494+ }
9595+}
9696+9797+fn push_from_node(stack: &mut Vec<Need>, node: &Node, parent_depth: Depth) -> Result<(), MstError> {
9898+ // empty nodes are not allowed in the MST
9999+ // ...except for a single one for empty MST, but we wouldn't be pushing that
100100+ if node.is_empty() {
101101+ return Err(MstError::EmptyNode);
102102+ }
103103+63104 let mut entries = Vec::with_capacity(node.entries.len());
6464-65105 let mut prefix = vec![];
106106+ let mut this_depth = parent_depth.next_expected()?;
107107+66108 for entry in &node.entries {
67109 let mut rkey = vec![];
68110 let pre_checked = prefix
69111 .get(..entry.prefix_len)
7070- .ok_or(RkeyError::EntryPrefixOutOfbounds)?;
112112+ .ok_or(MstError::EntryPrefixOutOfbounds)?;
71113 rkey.extend_from_slice(pre_checked);
72114 rkey.extend_from_slice(&entry.keysuffix);
115115+116116+ let Depth::Depth(key_depth) = Depth::from_key(&rkey) else {
117117+ return Err(MstError::WrongDepth);
118118+ };
119119+120120+ // this_depth is `none` if we are the deepest child (directly below root)
121121+ // in that case we accept whatever highest depth is claimed
122122+ let expected_depth = match this_depth {
123123+ Some(d) => d,
124124+ None => {
125125+ this_depth = Some(key_depth);
126126+ key_depth
127127+ }
128128+ };
129129+130130+ // all keys we find should be this depth
131131+ if key_depth != expected_depth {
132132+ return Err(MstError::DepthUnderflow);
133133+ }
134134+73135 prefix = rkey.clone();
7413675137 entries.push(Need::Record {
···77139 cid: entry.value,
78140 });
79141 if let Some(ref tree) = entry.tree {
8080- entries.push(Need::Node(*tree));
142142+ entries.push(Need::Node {
143143+ depth: Depth::Depth(key_depth),
144144+ cid: *tree,
145145+ });
81146 }
82147 }
8314884149 entries.reverse();
85150 stack.append(&mut entries);
86151152152+ let d = this_depth.ok_or(MstError::LostDepth)?;
153153+87154 if let Some(tree) = node.left {
8888- stack.push(Need::Node(tree));
155155+ stack.push(Need::Node {
156156+ depth: Depth::Depth(d),
157157+ cid: tree,
158158+ });
89159 }
90160 Ok(())
91161}
···102172impl Walker {
103173 pub fn new(tree_root_cid: Cid) -> Self {
104174 Self {
105105- stack: vec![Need::Node(tree_root_cid)],
175175+ stack: vec![Need::Node {
176176+ depth: Depth::Root,
177177+ cid: tree_root_cid,
178178+ }],
106179 prev: "".to_string(),
107180 }
108181 }
···114187 process: impl Fn(Vec<u8>) -> T,
115188 ) -> Result<Step<T>, Trip> {
116189 loop {
117117- let Some(mut need) = self.stack.last() else {
190190+ let Some(need) = self.stack.last_mut() else {
118191 log::trace!("tried to walk but we're actually done.");
119192 return Ok(Step::Finish);
120193 };
121194122122- match &mut need {
123123- Need::Node(cid) => {
195195+ match need {
196196+ &mut Need::Node { depth, cid } => {
124197 log::trace!("need node {cid:?}");
125125- let Some(block) = blocks.remove(cid) else {
198198+ let Some(block) = blocks.remove(&cid) else {
126199 log::trace!("node not found, resting");
127127- return Ok(Step::Missing(*cid));
200200+ return Ok(Step::Missing(cid));
128201 };
129202130203 let MaybeProcessedBlock::Raw(data) = block else {
···137210 self.stack.pop();
138211139212 // queue up work on the found node next
140140- push_from_node(&mut self.stack, &node)?;
213213+ push_from_node(&mut self.stack, &node, depth)?;
141214 }
142215 Need::Record { rkey, cid } => {
143216 log::trace!("need record {cid:?}");
···176249 process: impl Fn(Vec<u8>) -> T,
177250 ) -> Result<Step<T>, DiskTrip> {
178251 loop {
179179- let Some(mut need) = self.stack.last() else {
252252+ let Some(need) = self.stack.last_mut() else {
180253 log::trace!("tried to walk but we're actually done.");
181254 return Ok(Step::Finish);
182255 };
183256184184- match &mut need {
185185- Need::Node(cid) => {
257257+ match need {
258258+ &mut Need::Node { depth, cid } => {
186259 let cid_bytes = cid.to_bytes();
187260 log::trace!("need node {cid:?}");
188261 let Some(block_bytes) = reader.get(cid_bytes)? else {
189262 log::trace!("node not found, resting");
190190- return Ok(Step::Missing(*cid));
263263+ return Ok(Step::Missing(cid));
191264 };
192265193266 let block: MaybeProcessedBlock<T> = crate::drive::decode(&block_bytes)?;
···202275 self.stack.pop();
203276204277 // queue up work on the found node next
205205- push_from_node(&mut self.stack, &node).map_err(Trip::RkeyError)?;
278278+ push_from_node(&mut self.stack, &node, depth).map_err(Trip::MstError)?;
206279 }
207280 Need::Record { rkey, cid } => {
208281 log::trace!("need record {cid:?}");
···289362 // }
290363291364 #[test]
292292- fn test_next_from_node_empty() {
293293- let node = Node {
365365+ fn test_depth_spec_0() {
366366+ let d = Depth::from_key(b"2653ae71");
367367+ assert_eq!(d, Depth::Depth(0))
368368+ }
369369+370370+ #[test]
371371+ fn test_depth_spec_1() {
372372+ let d = Depth::from_key(b"blue");
373373+ assert_eq!(d, Depth::Depth(1))
374374+ }
375375+376376+ #[test]
377377+ fn test_depth_spec_4() {
378378+ let d = Depth::from_key(b"app.bsky.feed.post/454397e440ec");
379379+ assert_eq!(d, Depth::Depth(4))
380380+ }
381381+382382+ #[test]
383383+ fn test_depth_spec_8() {
384384+ let d = Depth::from_key(b"app.bsky.feed.post/9adeb165882c");
385385+ assert_eq!(d, Depth::Depth(8))
386386+ }
387387+388388+ #[test]
389389+ fn test_depth_ietf_draft_0() {
390390+ let d = Depth::from_key(b"key1");
391391+ assert_eq!(d, Depth::Depth(0))
392392+ }
393393+394394+ #[test]
395395+ fn test_depth_ietf_draft_1() {
396396+ let d = Depth::from_key(b"key7");
397397+ assert_eq!(d, Depth::Depth(1))
398398+ }
399399+400400+ #[test]
401401+ fn test_depth_ietf_draft_4() {
402402+ let d = Depth::from_key(b"key515");
403403+ assert_eq!(d, Depth::Depth(4))
404404+ }
405405+406406+ #[test]
407407+ fn test_depth_interop() {
408408+ // examples from https://github.com/bluesky-social/atproto-interop-tests/blob/main/mst/key_heights.json
409409+ for (k, expected) in [
410410+ ("", 0),
411411+ ("asdf", 0),
412412+ ("blue", 1),
413413+ ("2653ae71", 0),
414414+ ("88bfafc7", 2),
415415+ ("2a92d355", 4),
416416+ ("884976f5", 6),
417417+ ("app.bsky.feed.post/454397e440ec", 4),
418418+ ("app.bsky.feed.post/9adeb165882c", 8),
419419+ ] {
420420+ let d = Depth::from_key(k.as_bytes());
421421+ assert_eq!(d, Depth::Depth(expected), "key: {}", k);
422422+ }
423423+ }
424424+425425+ #[test]
426426+ fn test_push_empty_fails() {
427427+ let empty_node = Node {
294428 left: None,
295429 entries: vec![],
296430 };
297431 let mut stack = vec![];
298298- push_from_node(&mut stack, &node).unwrap();
299299- assert_eq!(stack.last(), None);
432432+ let err = push_from_node(&mut stack, &empty_node, Depth::Depth(4));
433433+ assert_eq!(err, Err(MstError::EmptyNode));
300434 }
301435302436 #[test]
303303- fn test_needs_from_node_just_left() {
437437+ fn test_push_one_node() {
304438 let node = Node {
305439 left: Some(cid1()),
306440 entries: vec![],
307441 };
308442 let mut stack = vec![];
309309- push_from_node(&mut stack, &node).unwrap();
310310- assert_eq!(stack.last(), Some(Need::Node(cid1())).as_ref());
443443+ push_from_node(&mut stack, &node, Depth::Depth(4)).unwrap();
444444+ assert_eq!(
445445+ stack.last(),
446446+ Some(Need::Node {
447447+ depth: Depth::Depth(3),
448448+ cid: cid1()
449449+ })
450450+ .as_ref()
451451+ );
311452 }
312453313454 // #[test]
+29-25
tests/non-huge-cars.rs
···11extern crate repo_stream;
22-use futures::TryStreamExt;
33-use iroh_car::CarReader;
44-use std::convert::Infallible;
22+use repo_stream::drive::Processable;
33+use serde::{Deserialize, Serialize};
5465const TINY_CAR: &'static [u8] = include_bytes!("../car-samples/tiny.car");
76const LITTLE_CAR: &'static [u8] = include_bytes!("../car-samples/little.car");
87const MIDSIZE_CAR: &'static [u8] = include_bytes!("../car-samples/midsize.car");
981010-async fn test_car(bytes: &[u8], expected_records: usize, expected_sum: usize) {
1111- let reader = CarReader::new(bytes).await.unwrap();
99+#[derive(Clone, Serialize, Deserialize)]
1010+struct S(usize);
12111313- let root = reader
1414- .header()
1515- .roots()
1616- .first()
1717- .ok_or("missing root")
1818- .unwrap()
1919- .clone();
1212+impl Processable for S {
1313+ fn get_size(&self) -> usize {
1414+ 0 // no additional space taken, just its stack size (newtype is free)
1515+ }
1616+}
20172121- let stream = std::pin::pin!(reader.stream());
1818+async fn test_car(bytes: &[u8], expected_records: usize, expected_sum: usize) {
1919+ let mb = 2_usize.pow(20);
22202323- let (_commit, v) =
2424- repo_stream::drive::Vehicle::init(root, stream, |block| Ok::<_, Infallible>(block.len()))
2525- .await
2626- .unwrap();
2727- let mut record_stream = std::pin::pin!(v.stream());
2121+ let mut driver = match repo_stream::drive::load_car(bytes, |block| S(block.len()), 10 * mb)
2222+ .await
2323+ .unwrap()
2424+ {
2525+ repo_stream::drive::Vehicle::Lil(_commit, mem_driver) => mem_driver,
2626+ repo_stream::drive::Vehicle::Big(_) => panic!("too big"),
2727+ };
28282929 let mut records = 0;
3030 let mut sum = 0;
3131 let mut found_bsky_profile = false;
3232 let mut prev_rkey = "".to_string();
3333- while let Some((rkey, size)) = record_stream.try_next().await.unwrap() {
3434- records += 1;
3535- sum += size;
3636- if rkey == "app.bsky.actor.profile/self" {
3737- found_bsky_profile = true;
3333+3434+ while let Some(pairs) = driver.next_chunk(256).await.unwrap() {
3535+ for (rkey, S(size)) in pairs {
3636+ records += 1;
3737+ sum += size;
3838+ if rkey == "app.bsky.actor.profile/self" {
3939+ found_bsky_profile = true;
4040+ }
4141+ assert!(rkey > prev_rkey, "rkeys are streamed in order");
4242+ prev_rkey = rkey;
3843 }
3939- assert!(rkey > prev_rkey, "rkeys are streamed in order");
4040- prev_rkey = rkey;
4144 }
4545+4246 assert_eq!(records, expected_records);
4347 assert_eq!(sum, expected_sum);
4448 assert!(found_bsky_profile);