···4455extern crate repo_stream;
66use clap::Parser;
77-use repo_stream::{DiskStore, Driver, process::noop};
77+use repo_stream::{DiskBuilder, Driver, DriverBuilder};
88use std::path::PathBuf;
991010#[derive(Debug, Parser)]
···2626 let reader = tokio::fs::File::open(car).await?;
2727 let reader = tokio::io::BufReader::new(reader);
28282929- // configure how much memory can be used before spilling to disk.
3030- // real memory usage may differ somewhat.
3131- let in_mem_limit = 10; // MiB
3232-3333- // configure how much memory sqlite is allowed to use when dumping to disk
3434- let db_cache_mb = 32; // MiB
3535-3629 log::info!("hello! reading the car...");
37303831 // in this example we only bother handling CARs that are too big for memory
3932 // no block processor is set, so the default (`noop`) applies: do no block processing, store the raw blocks
4040- let driver = match Driver::load_car(reader, noop, in_mem_limit).await? {
3333+ let driver = match DriverBuilder::new()
3434+ .with_mem_limit_mb(10) // how much memory can be used before disk spill
3535+ .load_car(reader)
3636+ .await?
3737+ {
4138 Driver::Memory(_, _) => panic!("try this on a bigger car"),
4239 Driver::Disk(big_stuff) => {
4340 // we reach here if the repo was too big and needs to be spilled to
4441 // disk to continue
45424643 // set up a disk store we can spill to
4747- let disk_store = DiskStore::new(tmpfile.clone(), db_cache_mb).await?;
4444+ let disk_store = DiskBuilder::new().open(tmpfile).await?;
48454946 // do the spilling, get back a (similar) driver
5047 let (commit, driver) = big_stuff.finish_loading(disk_store).await?;
+9-6
examples/read-file/main.rs
···4455extern crate repo_stream;
66use clap::Parser;
77-use repo_stream::Driver;
77+use repo_stream::{Driver, DriverBuilder};
88use std::path::PathBuf;
991010type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
···2323 let reader = tokio::fs::File::open(file).await?;
2424 let reader = tokio::io::BufReader::new(reader);
25252626- let (commit, mut driver) =
2727- match Driver::load_car(reader, |block| block.len(), 16 /* MiB */).await? {
2828- Driver::Memory(commit, mem_driver) => (commit, mem_driver),
2929- Driver::Disk(_) => panic!("this example doesn't handle big CARs"),
3030- };
2626+ let (commit, mut driver) = match DriverBuilder::new()
2727+ .with_block_processor(|block| block.len())
2828+ .load_car(reader)
2929+ .await?
3030+ {
3131+ Driver::Memory(commit, mem_driver) => (commit, mem_driver),
3232+ Driver::Disk(_) => panic!("this example doesn't handle big CARs"),
3333+ };
31343235 log::info!("got commit: {commit:?}");
3336
+84-6
src/disk.rs
···55to be the best behaved in terms of both on-disk space usage and memory usage.
6677```no_run
88-# use repo_stream::{DiskStore, DiskError};
88+# use repo_stream::{DiskBuilder, DiskError};
99# #[tokio::main]
1010# async fn main() -> Result<(), DiskError> {
1111-let db_cache_size = 32; // MiB
1212-let store = DiskStore::new("/some/path.db".into(), db_cache_size).await?;
1111+let store = DiskBuilder::new()
1212+ .with_cache_size_mb(32)
1313+ .with_max_stored_mb(1024) // errors when >1GiB of processed blocks are inserted
1414+ .open("/some/path.db".into()).await?;
1315# Ok(())
1416# }
1517```
···3032 /// A tokio blocking task failed to join
3133 #[error("Failed to join a tokio blocking task: {0}")]
3234 JoinError(#[from] tokio::task::JoinError),
3535+ /// The total size of stored blocks exceeded the allowed size
3636+ ///
3737+ /// If you need to process *really* big CARs, you can configure a higher
3838+ /// limit.
3939+ #[error("Maximum disk size reached")]
4040+ MaxSizeExceeded,
3341 #[error("this error was replaced, seeing this is a bug.")]
3442 #[doc(hidden)]
3543 Stolen,
···4452 }
4553}
46545555+/// Builder-style disk store setup
5656+pub struct DiskBuilder {
5757+ /// Database in-memory cache allowance
5858+ ///
5959+ /// Default: 32 MiB
6060+ pub cache_size_mb: usize,
6161+ /// Database stored block size limit
6262+ ///
6363+ /// Default: 10 GiB
6464+ ///
6565+ /// Note: actual size on disk may be more, but should approximately scale
6666+ /// with this limit
6767+ pub max_stored_mb: usize,
6868+}
6969+7070+impl Default for DiskBuilder {
7171+ fn default() -> Self {
7272+ Self {
7373+ cache_size_mb: 32,
7474+ max_stored_mb: 10 * 1024, // 10 GiB
7575+ }
7676+ }
7777+}
7878+7979+impl DiskBuilder {
8080+ /// Begin configuring the storage with defaults
8181+ pub fn new() -> Self {
8282+ Default::default()
8383+ }
8484+ /// Set the in-memory cache allowance for the database
8585+ ///
8686+ /// Default: 32 MiB
8787+ pub fn with_cache_size_mb(mut self, size: usize) -> Self {
8888+ self.cache_size_mb = size;
8989+ self
9090+ }
9191+ /// Set the approximate stored block size limit
9292+ ///
9393+ /// Default: 10 GiB
9494+ pub fn with_max_stored_mb(mut self, max: usize) -> Self {
9595+ self.max_stored_mb = max;
9696+ self
9797+ }
9898+ /// Open and initialize the actual disk storage
9999+ pub async fn open(self, path: PathBuf) -> Result<DiskStore, DiskError> {
100100+ DiskStore::new(path, self.cache_size_mb, self.max_stored_mb).await
101101+ }
102102+}
103103+47104/// On-disk block storage
48105pub struct DiskStore {
    /// SQLite connection backing the store; opened in `DiskStore::new`.
49106 conn: rusqlite::Connection,
    /// Upper bound for `stored`; `new` computes it as `max_stored_mb * 2^20`.
107107+ max_stored: usize,
    /// Running total of value lengths passed through the writer; starts at 0.
108108+ stored: usize,
50109}
5111052111impl DiskStore {
53112 /// Initialize a new disk store
5454- pub async fn new(path: PathBuf, cache_mb: usize) -> Result<Self, DiskError> {
113113+ pub async fn new(
114114+ path: PathBuf,
115115+ cache_mb: usize,
116116+ max_stored_mb: usize,
117117+ ) -> Result<Self, DiskError> {
118118+ let max_stored = max_stored_mb * 2_usize.pow(20);
55119 let conn = tokio::task::spawn_blocking(move || {
56120 let conn = rusqlite::Connection::open(path)?;
57121···73137 })
74138 .await??;
751397676- Ok(Self { conn })
140140+ Ok(Self {
141141+ conn,
142142+ max_stored,
143143+ stored: 0,
144144+ })
77145 }
78146 pub(crate) fn get_writer(&'_ mut self) -> Result<SqliteWriter<'_>, DiskError> {
    // Begin a transaction on the store's connection; the writer owns it.
79147 let tx = self.conn.transaction()?;
8080- Ok(SqliteWriter { tx })
    // The writer mutably borrows the store's running `stored` counter and
    // takes a copy of the limit, so it can enforce the limit while inserting.
148148+ Ok(SqliteWriter {
149149+ tx,
150150+ stored: &mut self.stored,
151151+ max: self.max_stored,
152152+ })
81153 }
82154 pub(crate) fn get_reader<'conn>(&'conn self) -> Result<SqliteReader<'conn>, DiskError> {
83155 let select_stmt = self.conn.prepare("SELECT val FROM blocks WHERE key = ?1")?;
···106178107179pub(crate) struct SqliteWriter<'conn> {
    /// Transaction obtained from the store's connection in `get_writer`.
108180 tx: rusqlite::Transaction<'conn>,
    /// Borrowed running total from the owning `DiskStore`; each inserted
    /// value's length is added to it.
181181+ stored: &'conn mut usize,
    /// Limit for `stored`; exceeding it yields `DiskError::MaxSizeExceeded`.
182182+ max: usize,
109183}
110184111185impl SqliteWriter<'_> {
···119193 .map_err(DiskError::DbError)?;
120194 for pair in kv {
121195 let (k, v) = pair?;
196196+ *self.stored += v.len();
197197+ if *self.stored > self.max {
198198+ return Err(DiskError::MaxSizeExceeded.into());
199199+ }
122200 insert_stmt.execute((k, v)).map_err(DiskError::DbError)?;
123201 }
124202 Ok(())
+75-5
src/drive.rs
···115115 Disk(NeedDisk<R, T>),
116116}
117117118118+/// Builder-style driver setup
119119+pub struct DriverBuilder {
120120+ pub mem_limit_mb: usize,
121121+}
122122+123123+impl Default for DriverBuilder {
124124+ fn default() -> Self {
125125+ Self { mem_limit_mb: 16 }
126126+ }
127127+}
128128+129129+impl DriverBuilder {
130130+ /// Begin configuring the driver with defaults
131131+ pub fn new() -> Self {
132132+ Default::default()
133133+ }
134134+ /// Set the in-memory size limit, in MiB
135135+ ///
136136+ /// Default: 16 MiB
137137+ pub fn with_mem_limit_mb(self, new_limit: usize) -> Self {
138138+ Self {
139139+ mem_limit_mb: new_limit,
140140+ }
141141+ }
142142+ /// Set the block processor
143143+ ///
144144+ /// Default: noop, raw blocks will be emitted
145145+ pub fn with_block_processor<T: Processable>(
146146+ self,
147147+ p: fn(Vec<u8>) -> T,
148148+ ) -> DriverBuilderWithProcessor<T> {
149149+ DriverBuilderWithProcessor {
150150+ mem_limit_mb: self.mem_limit_mb,
151151+ block_processor: p,
152152+ }
153153+ }
154154+ /// Begin processing an atproto MST from a CAR file
155155+ pub async fn load_car<R: AsyncRead + Unpin>(
156156+ self,
157157+ reader: R,
158158+ ) -> Result<Driver<R, Vec<u8>>, DriveError> {
159159+ Driver::load_car(reader, crate::process::noop, self.mem_limit_mb).await
160160+ }
161161+}
162162+163163+/// Builder-style driver intermediate step
164164+///
165165+/// start from `DriverBuilder`
166166+pub struct DriverBuilderWithProcessor<T: Processable> {
167167+ pub mem_limit_mb: usize,
168168+ pub block_processor: fn(Vec<u8>) -> T,
169169+}
170170+171171+impl<T: Processable> DriverBuilderWithProcessor<T> {
172172+ /// Set the in-memory size limit, in MiB
173173+ ///
174174+ /// Default: 16 MiB
175175+ pub fn with_mem_limit_mb(mut self, new_limit: usize) -> Self {
176176+ self.mem_limit_mb = new_limit;
177177+ self
178178+ }
179179+ /// Begin processing an atproto MST from a CAR file
180180+ pub async fn load_car<R: AsyncRead + Unpin>(
181181+ self,
182182+ reader: R,
183183+ ) -> Result<Driver<R, T>, DriveError> {
184184+ Driver::load_car(reader, self.block_processor, self.mem_limit_mb).await
185185+ }
186186+}
187187+118188impl<R: AsyncRead + Unpin, T: Processable> Driver<R, T> {
119189 /// Begin processing an atproto MST from a CAR file
120190 ///
121191 /// Blocks will be loaded, processed, and buffered in memory. If the entire
122122- /// processed size is under the `max_size_mb` limit, a `Driver::Memory` will
123123- /// be returned along with a `Commit` ready for validation.
192192+ /// processed size is under the `mem_limit_mb` limit, a `Driver::Memory`
193193+ /// will be returned along with a `Commit` ready for validation.
124194 ///
125125- /// If the `max_size_mb` limit is reached before loading all blocks, the
195195+ /// If the `mem_limit_mb` limit is reached before loading all blocks, the
126196 /// partial state will be returned as `Driver::Disk(needed)`, which can be
128198 /// resumed by providing a `DiskStore` for on-disk block storage.
128198 pub async fn load_car(
129199 reader: R,
130200 process: fn(Vec<u8>) -> T,
131131- max_size_mb: usize,
201201+ mem_limit_mb: usize,
132202 ) -> Result<Driver<R, T>, DriveError> {
133133- let max_size = max_size_mb * 2_usize.pow(20);
203203+ let max_size = mem_limit_mb * 2_usize.pow(20);
134204 let mut mem_blocks = HashMap::new();
135205136206 let mut car = CarReader::new(reader).await?;
+10-8
src/lib.rs
···1818`iroh_car` additionally applies a block size limit of `2MiB`.
19192020```
2121-use repo_stream::{Driver, DiskStore};
2121+use repo_stream::{Driver, DriverBuilder, DiskBuilder};
22222323# #[tokio::main]
2424# async fn main() -> Result<(), Box<dyn std::error::Error>> {
2525# let reader = include_bytes!("../car-samples/tiny.car").as_slice();
2626let mut total_size = 0;
2727-let process = |rec: Vec<u8>| rec.len(); // block processing: just extract the size
2828-let in_mem_limit = 10; /* MiB */
2929-let db_cache_size = 32; /* MiB */
30273131-match Driver::load_car(reader, process, in_mem_limit).await? {
2828+match DriverBuilder::new()
2929+ .with_mem_limit_mb(10)
3030+ .with_block_processor(|rec| rec.len()) // block processing: just extract the raw record size
3131+ .load_car(reader)
3232+ .await?
3333+{
32343335 // if all blocks fit within memory
3436 Driver::Memory(_commit, mut driver) => {
···4244 // if the CAR was too big for in-memory processing
4345 Driver::Disk(paused) => {
4446 // set up a disk store we can spill to
4545- let store = DiskStore::new("some/path.db".into(), db_cache_size).await?;
4747+ let store = DiskBuilder::new().open("some/path.db".into()).await?;
4648 // do the spilling, get back a (similar) driver
4749 let (_commit, mut driver) = paused.finish_loading(store).await?;
4850···7981pub mod drive;
8082pub mod process;
81838282-pub use disk::{DiskError, DiskStore};
8383-pub use drive::{DriveError, Driver};
8484+pub use disk::{DiskBuilder, DiskError, DiskStore};
8585+pub use drive::{DriveError, Driver, DriverBuilder};
8486pub use mst::Commit;
8587pub use process::Processable;