···2424 let reader = tokio::io::BufReader::new(reader);
25252626 let (commit, mut driver) = match DriverBuilder::new()
2727+ .with_mem_limit_mb(1000)
2728 .with_block_processor(|block| block.len().to_ne_bytes().to_vec())
2829 .load_car(reader)
2930 .await?
···36373738 while let Step::Value(records) = driver.next_chunk(256).await? {
3839 for Output { rkey, cid, data } in records {
3939- let size = usize::from_ne_bytes(data.try_into().unwrap());
4040- print!("0x");
4141- for byte in cid.to_bytes() {
4242- print!("{byte:>02x}");
4343- }
4444- println!(": {rkey} => record of len {}", size);
4040+ // let size = usize::from_ne_bytes(data.try_into().unwrap());
4141+ // print!("0x");
4242+ // for byte in cid.to_bytes() {
4343+ // print!("{byte:>02x}");
4444+ // }
4545+ // println!(": {rkey} => record of len {}", size);
4546 }
4647 }
4748
+84-7
src/drive.rs
···77 mst::MstNode,
88 walk::{MstError, Output},
99};
1010+use std::collections::BTreeMap;
1011use cid::Cid;
1112use iroh_car::CarReader;
1213use std::convert::Infallible;
···4647pub type BlockChunk = Vec<Output>;
47484849#[derive(Debug, Clone)]
4949-pub(crate) enum MaybeProcessedBlock {
5050+pub enum MaybeProcessedBlock {
5051 /// A block that's *probably* a Node (but we can't know yet)
5152 ///
5253 /// It *can be* a record that suspiciously looks a lot like a node, so we
···7273}
73747475impl MaybeProcessedBlock {
7676+ pub fn to_node(&self) -> Option<MstNode> {
7777+ let Self::Raw(bytes) = self else {
7878+ return None;
7979+ };
8080+ serde_ipld_dagcbor::from_slice(bytes).ok()
8181+ }
8282+ pub fn unknown_depth(&self) -> bool {
8383+ let Self::Raw(bytes) = self else {
8484+ return false;
8585+ };
8686+ let Ok(node) = serde_ipld_dagcbor::from_slice::<MstNode>(bytes) else {
8787+ return false;
8888+ };
8989+ node.depth.is_none()
9090+ }
7591 pub(crate) fn maybe(process: fn(Bytes) -> Bytes, data: Bytes) -> Self {
7692 if MstNode::could_be(&data) {
7793 MaybeProcessedBlock::Raw(data)
···168184 /// Begin processing an atproto MST from a CAR file
169185 pub async fn load_car<R: AsyncRead + Unpin>(&self, reader: R) -> Result<Driver<R>, DriveError> {
170186 Driver::load_car(reader, self.block_processor, self.mem_limit_mb).await
187187+ }
188188+189189+ /// Begin processing an atproto MST from a CAR file
190190+ pub async fn count_entries<R: AsyncRead + Unpin>(&self, reader: R) -> Result<Option<BTreeMap<usize, usize>>, DriveError> {
191191+ Driver::count_entries(reader).await
171192 }
172193}
173194174195impl<R: AsyncRead + Unpin> Driver<R> {
196196+ pub async fn count_entries(
197197+ reader: R,
198198+ ) -> Result<Option<BTreeMap<usize, usize>>, DriveError> {
199199+ let mut mem_blocks: HashMap<ObjectLink, _> = HashMap::new();
200200+201201+ let mut car = CarReader::new(reader).await?;
202202+203203+ let roots = car.header().roots();
204204+ assert_eq!(roots.len(), 1);
205205+206206+ let root = *roots.first().ok_or(DriveError::MissingRoot)?;
207207+ log::debug!("root: {root:?}");
208208+209209+ let mut commit = None;
210210+211211+212212+ // try to load all the blocks into memory
213213+ while let Some((cid, data)) = car.next_block().await? {
214214+ // the root commit is a Special Third Kind of block that we need to make
215215+ // sure not to optimistically send to the processing function
216216+ if cid == root {
217217+ let c: Commit = serde_ipld_dagcbor::from_slice(&data)?;
218218+ commit = Some(c);
219219+ continue;
220220+ }
221221+ let maybe_processed = MaybeProcessedBlock::maybe(|_| vec![], data);
222222+223223+ // stash (maybe processed) blocks in memory as long as we have room
224224+ mem_blocks.insert(cid.into(), maybe_processed);
225225+ }
226226+227227+ let commit = commit.ok_or(DriveError::MissingCommit)?;
228228+229229+ // the commit always must point to a Node; empty node => empty MST special case
230230+ let root_node: MstNode = match mem_blocks
231231+ .get(&commit.data)
232232+ .ok_or(DriveError::MissingCommit)?
233233+ {
234234+ MaybeProcessedBlock::Processed(_) => Err(WalkError::BadCommitFingerprint)?,
235235+ MaybeProcessedBlock::Raw(bytes) => serde_ipld_dagcbor::from_slice(bytes)?,
236236+ };
237237+238238+ if root_node.depth.unwrap_or(0) < 4 {
239239+ return Ok(None);
240240+ }
241241+242242+ let mut walker = Walker::new(root_node);
243243+ Ok(Some(walker.count_entries(&mut mem_blocks)?))
244244+ }
245245+175246 /// Begin processing an atproto MST from a CAR file
176247 ///
177248 /// Blocks will be loaded, processed, and buffered in memory. If the entire
···186257 process: fn(Bytes) -> Bytes,
187258 mem_limit_mb: usize,
188259 ) -> Result<Driver<R>, DriveError> {
260260+ let mut block_count = 0;
261261+189262 let max_size = mem_limit_mb * 2_usize.pow(20);
190263 let mut mem_blocks = HashMap::new();
191264192265 let mut car = CarReader::new(reader).await?;
193266194194- let root = *car
195195- .header()
196196- .roots()
197197- .first()
198198- .ok_or(DriveError::MissingRoot)?;
267267+ let roots = car.header().roots();
268268+ assert_eq!(roots.len(), 1);
269269+270270+ let root = *roots.first().ok_or(DriveError::MissingRoot)?;
199271 log::debug!("root: {root:?}");
200272201273 let mut commit = None;
···203275 // try to load all the blocks into memory
204276 let mut mem_size = 0;
205277 while let Some((cid, data)) = car.next_block().await? {
278278+ block_count += 1;
206279 // the root commit is a Special Third Kind of block that we need to make
207280 // sure not to optimistically send to the processing function
208281 if cid == root {
···218291 mem_size += maybe_processed.len();
219292 mem_blocks.insert(cid.into(), maybe_processed);
220293 if mem_size >= max_size {
294294+ log::debug!("blocks loaded before disk needed: {block_count}");
295295+221296 return Ok(Driver::Disk(NeedDisk {
222297 car,
223298 root,
···228303 }));
229304 }
230305 }
306306+307307+ log::debug!("blocks: {block_count}");
231308232309 // all blocks loaded and we fit in memory! hopefully we found the commit...
233310 let commit = commit.ok_or(DriveError::MissingCommit)?;
···274351/// so the sync/async boundaries become a little easier to work around.
275352#[derive(Debug)]
276353pub struct MemDriver {
277277- blocks: HashMap<ObjectLink, MaybeProcessedBlock>,
354354+ pub blocks: HashMap<ObjectLink, MaybeProcessedBlock>,
278355 walker: Walker,
279356 process: fn(Bytes) -> Bytes, // TODO: impl Fn(bytes) -> Bytes?
280357 next_missing: Option<NodeThing>,