···2222 let reader = tokio::fs::File::open(filename).await.unwrap();
2323 let reader = tokio::io::BufReader::new(reader);
24242525- let mb = 2_usize.pow(20);
2626-2727- let mut driver = match Driver::load_car(reader, |block| block.len(), 1024 * mb)
2525+ let mut driver = match Driver::load_car(reader, |block| block.len(), 1024)
2826 .await
2927 .unwrap()
3028 {
3131- Driver::Lil(_, mem_driver) => mem_driver,
3232- Driver::Big(_) => panic!("not doing disk for benchmark"),
2929+ Driver::Memory(_, mem_driver) => mem_driver,
3030+ Driver::Disk(_) => panic!("not doing disk for benchmark"),
3331 };
34323533 let mut n = 0;
+3-3
benches/non-huge-cars.rs
···2525}
26262727async fn drive_car(bytes: &[u8]) -> usize {
2828- let mut driver = match Driver::load_car(bytes, |block| block.len(), 32 * 2_usize.pow(20))
2828+ let mut driver = match Driver::load_car(bytes, |block| block.len(), 32)
2929 .await
3030 .unwrap()
3131 {
3232- Driver::Lil(_, mem_driver) => mem_driver,
3333- Driver::Big(_) => panic!("not benching big cars here"),
3232+ Driver::Memory(_, mem_driver) => mem_driver,
3333+ Driver::Disk(_) => panic!("not benching big cars here"),
3434 };
35353636 let mut n = 0;
+12-5
examples/disk-read-file/main.rs
···11+/*!
22+Read a CAR file by spilling to disk
33+*/
44+15extern crate repo_stream;
26use clap::Parser;
33-use repo_stream::{Driver, noop};
77+use repo_stream::{Driver, process::noop};
48use std::path::PathBuf;
59610#[derive(Debug, Parser)]
···24282529 // configure how much memory can be used before spilling to disk.
2630 // real memory usage may differ somewhat.
2727- let in_mem_limit = 10 * 2_usize.pow(20);
3131+ let in_mem_limit = 10; // MiB
28322933 // configure how much memory sqlite is allowed to use when dumping to disk
3030- let db_cache_mb = 32;
3434+ let db_cache_mb = 32; // MiB
31353236 log::info!("hello! reading the car...");
33373438 // in this example we only bother handling CARs that are too big for memory
3539 // `noop` helper means: do no block processing, store the raw blocks
3640 let driver = match Driver::load_car(reader, noop, in_mem_limit).await? {
3737- Driver::Lil(_, _) => panic!("try this on a bigger car"),
3838- Driver::Big(big_stuff) => {
4141+ Driver::Memory(_, _) => panic!("try this on a bigger car"),
4242+ Driver::Disk(big_stuff) => {
3943 // we reach here if the repo was too big and needs to be spilled to
4044 // disk to continue
4145···80848185 log::info!("arrived! joining rx...");
82868787+ // clean up the database. it would be nice to do this in `Drop` so it
8888+ // happens automatically, but the cleanup does blocking work, which isn't
8989+ // allowed there in async rust. 🤷‍♀️
8390 join.await?.reset_store().await?;
84918592 log::info!("done. n={n} zeros={zeros}");
+8-4
examples/read-file/main.rs
···11+/*!
22+Read a CAR file with in-memory processing
33+*/
44+15extern crate repo_stream;
26use clap::Parser;
37use repo_stream::Driver;
···2024 let reader = tokio::io::BufReader::new(reader);
21252226 let (commit, mut driver) =
2323- match Driver::load_car(reader, |block| block.len(), 16 * 1024 * 1024).await? {
2424- Driver::Lil(commit, mem_driver) => (commit, mem_driver),
2525- Driver::Big(_) => panic!("can't handle big cars yet"),
2727+ match Driver::load_car(reader, |block| block.len(), 16 /* MiB */).await? {
2828+ Driver::Memory(commit, mem_driver) => (commit, mem_driver),
2929+ Driver::Disk(_) => panic!("this example doesn't handle big CARs"),
2630 };
27312832 log::info!("got commit: {commit:?}");
···3236 n += pairs.len();
3337 // log::info!("got {rkey:?}");
3438 }
3535- log::info!("bye! {n}");
3939+ log::info!("bye! total records={n}");
36403741 Ok(())
3842}
+1-1
src/disk.rs
···4343 // let insert_stmt = tx.prepare("INSERT INTO blocks (key, val) VALUES (?1, ?2)")?;
4444 Ok(SqliteWriter { tx })
4545 }
4646- pub fn get_reader(&'_ self) -> Result<SqliteReader<'_>, rusqlite::Error> {
4646+ pub fn get_reader<'conn>(&'conn self) -> Result<SqliteReader<'conn>, rusqlite::Error> {
4747 let select_stmt = self.conn.prepare("SELECT val FROM blocks WHERE key = ?1")?;
4848 Ok(SqliteReader { select_stmt })
4949 }
+83-37
src/drive.rs
···101101}
102102103103pub enum Driver<R: AsyncRead + Unpin, T: Processable> {
104104- Lil(Commit, MemDriver<T>),
105105- Big(BigCar<R, T>),
104104+ Memory(Commit, MemDriver<T>),
105105+ Disk(BigCar<R, T>),
106106}
107107108108impl<R: AsyncRead + Unpin, T: Processable> Driver<R, T> {
109109 pub async fn load_car(
110110 reader: R,
111111 process: fn(Vec<u8>) -> T,
112112- max_size: usize,
112112+ max_size_mb: usize,
113113 ) -> Result<Driver<R, T>, DriveError> {
114114+ let max_size = max_size_mb * 2_usize.pow(20);
114115 let mut mem_blocks = HashMap::new();
115116116117 let mut car = CarReader::new(reader).await?;
···142143 mem_size += std::mem::size_of::<Cid>() + maybe_processed.get_size();
143144 mem_blocks.insert(cid, maybe_processed);
144145 if mem_size >= max_size {
145145- return Ok(Driver::Big(BigCar {
146146+ return Ok(Driver::Disk(BigCar {
146147 car,
147148 root,
148149 process,
···158159159160 let walker = Walker::new(commit.data);
160161161161- Ok(Driver::Lil(
162162+ Ok(Driver::Memory(
162163 commit,
163164 MemDriver {
164165 blocks: mem_blocks,
···321322 commit,
322323 BigCarReady {
323324 process: self.process,
324324- store,
325325- walker,
325325+ state: Some(BigState { store, walker }),
326326 },
327327 ))
328328 }
329329}
330330331331+struct BigState {
332332+ store: SqliteStore,
333333+ walker: Walker,
334334+}
335335+331336pub struct BigCarReady<T: Clone> {
332337 process: fn(Vec<u8>) -> T,
333333- store: SqliteStore,
334334- walker: Walker,
338338+ state: Option<BigState>,
335339}
336340337341impl<T: Processable + Send + 'static> BigCarReady<T> {
338338- pub async fn next_chunk(
339339- mut self,
340340- n: usize,
341341- ) -> Result<(Self, Option<BlockChunk<T>>), DriveError> {
342342- let mut out = Vec::with_capacity(n);
343343- (self, out) = tokio::task::spawn_blocking(move || {
344344- let store = self.store;
345345- let mut reader = store.get_reader()?;
342342+ pub async fn next_chunk(&mut self, n: usize) -> Result<Option<BlockChunk<T>>, DriveError> {
343343+ let process = self.process;
344344+345345+ // state should only *ever* be None transiently while inside here
346346+ let mut state = self
347347+ .state
348348+ .take()
349349+ .expect("BigCarReady must have Some(state)");
346350347347- for _ in 0..n {
348348- // walk as far as we can until we run out of blocks or find a record
349349- match self.walker.disk_step(&mut reader, self.process)? {
350350- Step::Missing(cid) => return Err(DriveError::MissingBlock(cid)),
351351- Step::Finish => break,
352352- Step::Found { rkey, data } => {
353353- out.push((rkey, data));
354354- continue;
351351+ // the big pain here is that we don't want to leave self.state in an
352352+ // invalid state (None), so all the error paths have to make sure it
353353+ // comes out again.
354354+ let (state, res) = tokio::task::spawn_blocking(
355355+ move || -> (BigState, Result<BlockChunk<T>, DriveError>) {
356356+ let mut reader_res = state.store.get_reader();
357357+ let reader: &mut _ = match reader_res {
358358+ Ok(ref mut r) => r,
359359+ Err(ref mut e) => {
360360+ // unfortunately we can't return the error directly because
361361+ // (for some reason) it's attached to the lifetime of the
362362+ // reader?
363363+ // hack a mem::swap so we can get it out :/
364364+ let mut e_swapped =
365365+ rusqlite::Error::InvalidParameterName("this error was stolen".into());
366366+ std::mem::swap(e, &mut e_swapped);
367367+ // the pain: `state` *has to* outlive the reader
368368+ drop(reader_res);
369369+ return (state, Err(e_swapped.into()));
355370 }
356371 };
357357- }
358372359359- drop(reader); // cannot outlive store
360360- self.store = store;
361361- Ok::<_, DriveError>((self, out))
362362- })
363363- .await??;
373373+ let mut out = Vec::with_capacity(n);
374374+375375+ for _ in 0..n {
376376+ // walk as far as we can until we run out of blocks or find a record
377377+ let step = match state.walker.disk_step(reader, process) {
378378+ Ok(s) => s,
379379+ Err(e) => {
380380+ // the pain: `state` *has to* outlive the reader
381381+ drop(reader_res);
382382+ return (state, Err(e.into()));
383383+ }
384384+ };
385385+ match step {
386386+ Step::Missing(cid) => {
387387+ // the pain: `state` *has to* outlive the reader
388388+ drop(reader_res);
389389+ return (state, Err(DriveError::MissingBlock(cid)));
390390+ }
391391+ Step::Finish => break,
392392+ Step::Found { rkey, data } => out.push((rkey, data)),
393393+ };
394394+ }
395395+396396+ // `state` *has to* outlive the reader
397397+ drop(reader_res);
398398+399399+ (state, Ok::<_, DriveError>(out))
400400+ },
401401+ )
402402+ .await?; // on tokio JoinError, we'll be left with invalid state :(
403403+404404+ // *must* restore state before dealing with the actual result
405405+ self.state = Some(state);
406406+407407+ let out = res?;
364408365409 if out.is_empty() {
366366- Ok((self, None))
410410+ Ok(None)
367411 } else {
368368- Ok((self, Some(out)))
412412+ Ok(Some(out))
369413 }
370414 }
371415···374418 n: usize,
375419 tx: mpsc::Sender<Result<BlockChunk<T>, DriveError>>,
376420 ) -> Result<(), mpsc::error::SendError<Result<BlockChunk<T>, DriveError>>> {
377377- let mut reader = match self.store.get_reader() {
421421+ let BigState { store, walker } = self.state.as_mut().expect("valid state");
422422+ let mut reader = match store.get_reader() {
378423 Ok(r) => r,
379424 Err(e) => return tx.blocking_send(Err(e.into())),
380425 };
···385430 for _ in 0..n {
386431 // walk as far as we can until we run out of blocks or find a record
387432388388- let step = match self.walker.disk_step(&mut reader, self.process) {
433433+ let step = match walker.disk_step(&mut reader, self.process) {
389434 Ok(s) => s,
390435 Err(e) => return tx.blocking_send(Err(e.into())),
391436 };
···433478434479 pub async fn reset_store(mut self) -> Result<SqliteStore, DriveError> {
435480 tokio::task::spawn_blocking(move || {
436436- self.store.reset()?;
437437- Ok(self.store)
481481+ let BigState { mut store, .. } = self.state.take().expect("valid state");
482482+ store.reset()?;
483483+ Ok(store)
438484 })
439485 .await?
440486 }
+67-4
src/lib.rs
···11-//! Fast and robust atproto CAR file processing in rust
22-//!
33-//! For now see the [examples](https://tangled.org/@microcosm.blue/repo-stream/tree/main/examples)
11+/*!
22+A robust CAR file -> MST walker for atproto
33+44+Small CARs have their blocks buffered in memory. If a configurable memory limit
55+is reached while reading blocks, CAR reading is suspended, and can be continued
66+by providing disk storage to buffer the CAR blocks instead.
77+88+A `process` function can be provided for tasks where records are transformed
99+into a smaller representation, to save memory (and disk) during block reading.
1010+1111+Once blocks are loaded, the MST is walked and emitted as chunks of
1212+`(rkey, processed_block)` pairs, in order (depth first, left-to-right).
1313+1414+Some MST validations are applied:
1515+- Keys must appear in order
1616+- Keys must be at the correct MST tree depth
1717+1818+`iroh_car` additionally applies a block size limit of `2MiB`.
1919+2020+```
2121+use repo_stream::{Driver, SqliteStore};
2222+2323+# #[tokio::main]
2424+# async fn main() -> Result<(), Box<dyn std::error::Error>> {
2525+# let reader = include_bytes!("../car-samples/tiny.car").as_slice();
2626+let mut total_size = 0;
2727+let process = |rec: Vec<u8>| rec.len(); // block processing: just extract the size
2828+let in_mem_limit = 10; /* MiB */
2929+let db_cache_size = 32; /* MiB */
3030+3131+match Driver::load_car(reader, process, in_mem_limit).await? {
3232+3333+ // if all blocks fit within memory
3434+ Driver::Memory(_commit, mut driver) => {
3535+ while let Some(chunk) = driver.next_chunk(256).await? {
3636+ for (_rkey, size) in chunk {
3737+ total_size += size;
3838+ }
3939+ }
4040+ },
4141+4242+ // if the CAR was too big for in-memory processing
4343+ Driver::Disk(paused) => {
4444+ // set up a disk store we can spill to
4545+ let store = SqliteStore::new("some/path.sqlite".into(), db_cache_size).await?;
4646+ // do the spilling, get back a (similar) driver
4747+ let (_commit, mut driver) = paused.finish_loading(store).await?;
4848+4949+ while let Some(chunk) = driver.next_chunk(256).await? {
5050+ for (_rkey, size) in chunk {
5151+ total_size += size;
5252+ }
5353+ }
5454+5555+ // clean up the disk store (drop tables etc)
5656+ driver.reset_store().await?;
5757+ }
5858+};
5959+println!("sum of size of all records: {total_size}");
6060+# Ok(())
6161+# }
6262+```
6363+6464+Find more [examples in the repo](https://tangled.org/@microcosm.blue/repo-stream/tree/main/examples).
6565+6666+*/
467568mod mst;
669mod walk;
···11741275pub use disk::SqliteStore;
1376pub use drive::{DriveError, Driver};
1414-pub use process::{Processable, noop};
7777+pub use process::Processable;