···11[package]
22name = "repo-stream"
33-version = "0.1.0"
33+version = "0.1.1"
44edition = "2024"
55license = "MIT OR Apache-2.0"
66description = "Fast and robust atproto CAR file processing in rust"
77+repository = "https://tangled.org/@microcosm.blue/repo-stream"
7889[dependencies]
910futures = "0.3.31"
+28-13
src/drive.rs
···11+//! Consume an MST block stream, producing an ordered stream of records
22+13use futures::{Stream, TryStreamExt};
24use ipld_core::cid::Cid;
35use std::collections::HashMap;
···68use crate::mst::{Commit, Node};
79use crate::walk::{Step, Trip, Walker};
8101111+/// Errors that can happen while consuming and emitting blocks and records
912#[derive(Debug, thiserror::Error)]
1013pub enum DriveError<E: Error> {
1114 #[error("Failed to initialize CarReader: {0}")]
···2023 MissingBlock(Cid),
2124 #[error("Failed to walk the mst tree: {0}")]
2225 Tripped(#[from] Trip<E>),
2323- #[error("Encountered an rkey out of order while walking the MST")]
2424- RkeyOutOfOrder,
2526}
26272728type CarBlock<E> = Result<(Cid, Vec<u8>), E>;
28293030+/// Newtype because i'll mix up strings somewhere if i don't
2931#[derive(Debug)]
3032pub struct Rkey(pub String);
3133···5557 Processed(Result<T, E>),
5658}
57596060+/// The core driver between the block stream and MST walker
5861pub struct Vehicle<SE, S, T, P, PE>
5962where
6063 S: Stream<Item = CarBlock<SE>>,
···6568 blocks: HashMap<Cid, MaybeProcessedBlock<T, PE>>,
6669 walker: Walker,
6770 process: P,
6868- prev_rkey: String,
6971}
70727173impl<SE, S, T: Clone, P, PE> Vehicle<SE, S, T, P, PE>
···7577 P: Fn(&[u8]) -> Result<T, PE>,
7678 PE: Error,
7779{
8080+ /// Set up the stream
8181+ ///
8282+ /// This will eagerly consume blocks until the `Commit` object is found.
8383+ /// *Usually* it's the first block, but there is no guarantee.
8484+ ///
8585+ /// ### Parameters
8686+ ///
8787+ /// `root`: CID of the commit object that is the root of the MST
8888+ ///
8989+ /// `block_stream`: Input stream of raw CAR blocks
9090+ ///
9191+ /// `process`: record-transforming callback:
9292+ ///
9393+ /// For tasks where records can be quickly processed into a *smaller*
9494+ /// useful representation, you can do that eagerly as blocks come in by
9595+ /// passing the processor as a callback here. This can reduce overall
9696+ /// memory usage.
7897 pub async fn init(
7998 root: Cid,
8099 mut block_stream: S,
···116135 blocks,
117136 walker,
118137 process,
119119- prev_rkey: "".to_string(),
120138 };
121139 Ok((commit, me))
122140 }
···145163 Err(DriveError::MissingBlock(cid_needed))
146164 }
147165148148- pub async fn next_record(&mut self) -> Result<Option<(Rkey, T)>, DriveError<PE>> {
166166+ /// Manually step through the record outputs
167167+ pub async fn next_record(&mut self) -> Result<Option<(String, T)>, DriveError<PE>> {
149168 loop {
150169 // walk as far as we can until we run out of blocks or find a record
151151- let cid_needed = match self.walker.walk(&mut self.blocks, &self.process)? {
170170+ let cid_needed = match self.walker.step(&mut self.blocks, &self.process)? {
152171 Step::Rest(cid) => cid,
153172 Step::Finish => return Ok(None),
154154- Step::Step { rkey, data } => {
155155- if rkey <= self.prev_rkey {
156156- return Err(DriveError::RkeyOutOfOrder);
157157- }
158158- return Ok(Some((Rkey(rkey), data)));
159159- }
173173+ Step::Step { rkey, data } => return Ok(Some((rkey, data))),
160174 };
161175162176 // load blocks until we reach that cid
···164178 }
165179 }
166180167167- pub fn stream(self) -> impl Stream<Item = Result<(Rkey, T), DriveError<PE>>> {
181181+ /// Convert to a futures::stream of record outputs
182182+ pub fn stream(self) -> impl Stream<Item = Result<(String, T), DriveError<PE>>> {
168183 futures::stream::try_unfold(self, |mut this| async move {
169184 let maybe_record = this.next_record().await?;
170185 Ok(maybe_record.map(|b| (b, this)))
+4
src/lib.rs
···11+//! repo-stream
22+//!
33+//! For now see the [examples](https://tangled.org/@microcosm.blue/repo-stream/tree/main/examples)
44+15pub mod drive;
26pub mod mst;
37pub mod walk;
+29-5
src/walk.rs
···66use std::collections::HashMap;
77use std::error::Error;
8899+/// Errors that can happen while walking
910#[derive(Debug, thiserror::Error)]
1011pub enum Trip<E: Error> {
1112 #[error("empty mst nodes are not allowed")]
···1314 #[error("Failed to decode commit block: {0}")]
1415 BadCommit(Box<dyn std::error::Error>),
1516 #[error("Action node error: {0}")]
1616- ActionNode(#[from] ActionNodeError),
1717+ RkeyError(#[from] RkeyError),
1718 #[error("Process failed: {0}")]
1819 ProcessFailed(E),
2020+ #[error("Encountered an rkey out of order while walking the MST")]
2121+ RkeyOutOfOrder,
1922}
20232424+/// Errors from invalid Rkeys
2125#[derive(Debug, thiserror::Error)]
2222-pub enum ActionNodeError {
2626+pub enum RkeyError {
2327 #[error("Failed to compute an rkey due to invalid prefix_len")]
2428 EntryPrefixOutOfbounds,
2529 #[error("RKey was not utf-8")]
2630 EntryRkeyNotUtf8(#[from] std::string::FromUtf8Error),
2731}
28323333+/// Walker outputs
2934#[derive(Debug)]
3035pub enum Step<T> {
3636+ /// We need a CID but it's not in the block store
3737+ ///
3838+ /// Give the needed CID to the driver so it can load blocks until it's found
3139 Rest(Cid),
4040+ /// Reached the end of the MST! yay!
3241 Finish,
4242+ /// A record was found!
3343 Step { rkey: String, data: T },
3444}
3545···3949 Record { rkey: String, cid: Cid },
4050}
41514242-fn push_from_node(stack: &mut Vec<Need>, node: &Node) -> Result<(), ActionNodeError> {
5252+fn push_from_node(stack: &mut Vec<Need>, node: &Node) -> Result<(), RkeyError> {
4353 let mut entries = Vec::with_capacity(node.entries.len());
44544555 let mut prefix = vec![];
···4757 let mut rkey = vec![];
4858 let pre_checked = prefix
4959 .get(..entry.prefix_len)
5050- .ok_or(ActionNodeError::EntryPrefixOutOfbounds)?;
6060+ .ok_or(RkeyError::EntryPrefixOutOfbounds)?;
5161 rkey.extend_from_slice(pre_checked);
5262 rkey.extend_from_slice(&entry.keysuffix);
5363 prefix = rkey.clone();
···7080 Ok(())
7181}
72828383+/// Traverser of an atproto MST
8484+///
8585+/// Walks the tree from left-to-right in depth-first order
7386#[derive(Debug)]
7487pub struct Walker {
7588 stack: Vec<Need>,
8989+ prev: String,
7690}
77917892impl Walker {
7993 pub fn new(tree_root_cid: Cid) -> Self {
8094 Self {
8195 stack: vec![Need::Node(tree_root_cid)],
9696+ prev: "".to_string(),
8297 }
8398 }
84998585- pub fn walk<T: Clone, E: Error>(
100100+ /// Advance through nodes until we find a record or can't go further
101101+ pub fn step<T: Clone, E: Error>(
86102 &mut self,
87103 blocks: &mut HashMap<Cid, MaybeProcessedBlock<T, E>>,
88104 process: impl Fn(&[u8]) -> Result<T, E>,
···140156141157 log::trace!("emitting a block as a step. depth={}", self.stack.len());
142158 let data = data.map_err(Trip::ProcessFailed)?;
159159+160160+ // rkeys *must* be in order or else the tree is invalid (or
161161+ // we have a bug)
162162+ if rkey <= self.prev {
163163+ return Err(Trip::RkeyOutOfOrder);
164164+ }
165165+ self.prev = rkey.clone();
166166+143167 return Ok(Step::Step { rkey, data });
144168 }
145169 }
+3-3
tests/non-huge-cars.rs
···3333 while let Some((rkey, size)) = record_stream.try_next().await.unwrap() {
3434 records += 1;
3535 sum += size;
3636- if rkey.0 == "app.bsky.actor.profile/self" {
3636+ if rkey == "app.bsky.actor.profile/self" {
3737 found_bsky_profile = true;
3838 }
3939- assert!(rkey.0 > prev_rkey, "rkeys are streamed in order");
4040- prev_rkey = rkey.0;
3939+ assert!(rkey > prev_rkey, "rkeys are streamed in order");
4040+ prev_rkey = rkey;
4141 }
4242 assert_eq!(records, expected_records);
4343 assert_eq!(sum, expected_sum);