···33use repo_stream::{Driver, noop};
44use std::path::PathBuf;
5566-type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
77-86#[derive(Debug, Parser)]
97struct Args {
108 #[arg()]
···1412}
15131614#[tokio::main]
1717-async fn main() -> Result<()> {
1515+async fn main() -> Result<(), Box<dyn std::error::Error>> {
1816 env_logger::init();
19172018 let Args { car, tmpfile } = Args::parse();
1919+2020+ // repo-stream takes an AsyncRead as input. wrapping a filesystem read in
2121+ // BufReader can provide a really significant performance win.
2122 let reader = tokio::fs::File::open(car).await?;
2223 let reader = tokio::io::BufReader::new(reader);
23242424- // let kb = 2_usize.pow(10);
2525- let mb = 2_usize.pow(20);
2525+ // configure how much memory can be used before spilling to disk.
2626+ // real memory usage may differ somewhat.
2727+ let in_mem_limit = 10 * 2_usize.pow(20);
26282727- let limit_mb = 32;
2929+ // configure how much memory sqlite is allowed to use when dumping to disk
3030+ let db_cache_mb = 32;
28312929- let driver = match Driver::load_car(reader, noop, 10 * mb).await? {
3232+ log::info!("hello! reading the car...");
3333+3434+ // in this example we only bother handling CARs that are too big for memory
3535+ // `noop` helper means: do no block processing, store the raw blocks
3636+ let driver = match Driver::load_car(reader, noop, in_mem_limit).await? {
3037 Driver::Lil(_, _) => panic!("try this on a bigger car"),
3138 Driver::Big(big_stuff) => {
3232- let disk_store = repo_stream::disk::SqliteStore::new(tmpfile.clone(), limit_mb).await?;
3939+ // we reach here if the repo was too big and needs to be spilled to
4040+ // disk to continue
4141+4242+ // set up a disk store we can spill to
4343+ let disk_store =
4444+ repo_stream::disk::SqliteStore::new(tmpfile.clone(), db_cache_mb).await?;
4545+4646+ // do the spilling, get back a (similar) driver
3347 let (commit, driver) = big_stuff.finish_loading(disk_store).await?;
3434- log::warn!("big: {:?}", commit);
4848+4949+ // at this point you might want to fetch the account's signing key
5050+ // via the DID from the commit, and then verify the signature.
5151+ log::warn!("big's comit: {:?}", commit);
5252+5353+ // pop the driver back out to get some code indentation relief
3554 driver
3655 }
3756 };
38575858+ // collect some random stats about the blocks
3959 let mut n = 0;
4060 let mut zeros = 0;
4141- let mut rx = driver.to_channel(512);
42614343- log::debug!("walking...");
6262+ log::info!("walking...");
6363+6464+ // this example uses the disk driver's channel mode: the tree walking is
6565+ // spawned onto a blocking thread, and we get chunks of rkey+blocks back
6666+ let (mut rx, join) = driver.to_channel(512);
4467 while let Some(r) = rx.recv().await {
4568 let pairs = r?;
6969+7070+ // keep a count of the total number of blocks seen
4671 n += pairs.len();
7272+4773 for (_, block) in pairs {
7474+ // for each block, count how many bytes are equal to '0'
7575+ // (this is just an example, you probably want to do something more
7676+ // interesting)
4877 zeros += block.into_iter().filter(|&b| b == b'0').count()
4978 }
5079 }
5151- log::debug!("done walking!");
8080+8181+ log::info!("arrived! joining rx...");
52825353- // log::info!("now is the time to check mem...");
5454- // tokio::time::sleep(std::time::Duration::from_secs(22)).await;
5555- log::info!("bye! n={n} zeros={zeros}");
8383+ join.await?.reset_store().await?;
56845757- std::fs::remove_file(tmpfile).unwrap(); // need to also remove -shm -wal
8585+ log::info!("done. n={n} zeros={zeros}");
58865987 Ok(())
6088}
···9090 }
9191}
92929393+impl<T> MaybeProcessedBlock<T> {
9494+ fn maybe(process: fn(Vec<u8>) -> T, data: Vec<u8>) -> Self {
9595+ if Node::could_be(&data) {
9696+ MaybeProcessedBlock::Raw(data)
9797+ } else {
9898+ MaybeProcessedBlock::Processed(process(data))
9999+ }
100100+ }
101101+}
102102+93103pub enum Driver<R: AsyncRead + Unpin, T: Processable> {
94104 Lil(Commit, MemDriver<T>),
95105 Big(BigCar<R, T>),
···126136 }
127137128138 // remaining possible types: node, record, other. optimistically process
129129- let maybe_processed = if Node::could_be(&data) {
130130- MaybeProcessedBlock::Raw(data)
131131- } else {
132132- MaybeProcessedBlock::Processed(process(data))
133133- };
139139+ let maybe_processed = MaybeProcessedBlock::maybe(process, data);
134140135141 // stash (maybe processed) blocks in memory as long as we have room
136142 mem_size += std::mem::size_of::<Cid>() + maybe_processed.get_size();
···192198 match self.walker.step(&mut self.blocks, self.process)? {
193199 Step::Missing(cid) => return Err(DriveError::MissingBlock(cid)),
194200 Step::Finish => break,
195195- Step::Step { rkey, data } => {
201201+ Step::Found { rkey, data } => {
196202 out.push((rkey, data));
197203 continue;
198204 }
···283289 }
284290 // remaining possible types: node, record, other. optimistically process
285291 // TODO: get the actual in-memory size to compute disk spill
286286- let maybe_processed = if Node::could_be(&data) {
287287- MaybeProcessedBlock::Raw(data)
288288- } else {
289289- MaybeProcessedBlock::Processed((self.process)(data))
290290- };
292292+ let maybe_processed = MaybeProcessedBlock::maybe(self.process, data);
291293 mem_size += std::mem::size_of::<Cid>() + maybe_processed.get_size();
292294 chunk.push((cid, maybe_processed));
293295 if mem_size >= self.max_size {
···347349 match self.walker.disk_step(&mut reader, self.process)? {
348350 Step::Missing(cid) => return Err(DriveError::MissingBlock(cid)),
349351 Step::Finish => break,
350350- Step::Step { rkey, data } => {
352352+ Step::Found { rkey, data } => {
351353 out.push((rkey, data));
352354 continue;
353355 }
···368370 }
369371370372 fn read_tx_blocking(
371371- mut self,
373373+ &mut self,
372374 n: usize,
373375 tx: mpsc::Sender<Result<BlockChunk<T>, DriveError>>,
374376 ) -> Result<(), mpsc::error::SendError<Result<BlockChunk<T>, DriveError>>> {
···393395 return tx.blocking_send(Err(DriveError::MissingBlock(cid)));
394396 }
395397 Step::Finish => return Ok(()),
396396- Step::Step { rkey, data } => {
398398+ Step::Found { rkey, data } => {
397399 out.push((rkey, data));
398400 continue;
399401 }
···409411 Ok(())
410412 }
411413412412- pub fn to_channel(self, n: usize) -> mpsc::Receiver<Result<BlockChunk<T>, DriveError>> {
414414+ pub fn to_channel(
415415+ mut self,
416416+ n: usize,
417417+ ) -> (
418418+ mpsc::Receiver<Result<BlockChunk<T>, DriveError>>,
419419+ tokio::task::JoinHandle<Self>,
420420+ ) {
413421 let (tx, rx) = mpsc::channel::<Result<BlockChunk<T>, DriveError>>(1);
414422415423 // sketch: this worker is going to be allowed to execute without a join handle
416416- tokio::task::spawn_blocking(move || {
424424+ let chan_task = tokio::task::spawn_blocking(move || {
417425 if let Err(mpsc::error::SendError(_)) = self.read_tx_blocking(n, tx) {
418426 log::debug!("big car reader exited early due to dropped receiver channel");
419427 }
428428+ self
420429 });
421430422422- rx
431431+ (rx, chan_task)
432432+ }
433433+434434+ pub async fn reset_store(mut self) -> Result<SqliteStore, DriveError> {
435435+ tokio::task::spawn_blocking(move || {
436436+ self.store.reset()?;
437437+ Ok(self.store)
438438+ })
439439+ .await?
423440 }
424441}
+3-2
src/lib.rs
···22//!
33//! For now see the [examples](https://tangled.org/@microcosm.blue/repo-stream/tree/main/examples)
4455+mod mst;
66+mod walk;
77+58pub mod disk;
69pub mod drive;
77-pub mod mst;
810pub mod process;
99-pub mod walk;
10111112pub use disk::SqliteStore;
1213pub use drive::{DriveError, Driver};
+3-3
src/walk.rs
···5151 /// Reached the end of the MST! yay!
5252 Finish,
5353 /// A record was found!
5454- Step { rkey: String, data: T },
5454+ Found { rkey: String, data: T },
5555}
56565757#[derive(Debug, Clone, PartialEq)]
···227227 }
228228 self.prev = rkey.clone();
229229230230- return Ok(Step::Step { rkey, data });
230230+ return Ok(Step::Found { rkey, data });
231231 }
232232 }
233233 }
···294294 }
295295 self.prev = rkey.clone();
296296297297- return Ok(Step::Step { rkey, data });
297297+ return Ok(Step::Found { rkey, data });
298298 }
299299 }
300300 }