···11+/*!
22+Sqlite storage for blocks on disk
33+44+In testing sqlite wasn't the fastest, but it seemed to be the best behaved in
55+terms of both on-disk space usage and memory usage.
66+77+```no_run
88+# use repo_stream::SqliteStore;
99+# #[tokio::main]
1010+# async fn main() -> Result<(), rusqlite::Error> {
1111+let db_cache_size = 32; // MiB
1212+let store = SqliteStore::new("/some/path.sqlite".into(), db_cache_size).await?;
1313+# Ok(())
1414+# }
1515+```
1616+*/
1717+118use crate::drive::DriveError;
219use rusqlite::OptionalExtension;
320use std::path::PathBuf;
4212222+/// On-disk block storage
523pub struct SqliteStore {
624 conn: rusqlite::Connection,
725}
···38563957 Ok(Self { conn })
4058 }
4141- pub fn get_writer(&'_ mut self) -> Result<SqliteWriter<'_>, rusqlite::Error> {
5959+ pub(crate) fn get_writer(&'_ mut self) -> Result<SqliteWriter<'_>, rusqlite::Error> {
4260 let tx = self.conn.transaction()?;
4361 // let insert_stmt = tx.prepare("INSERT INTO blocks (key, val) VALUES (?1, ?2)")?;
4462 Ok(SqliteWriter { tx })
4563 }
4646- pub fn get_reader<'conn>(&'conn self) -> Result<SqliteReader<'conn>, rusqlite::Error> {
6464+ pub(crate) fn get_reader<'conn>(&'conn self) -> Result<SqliteReader<'conn>, rusqlite::Error> {
4765 let select_stmt = self.conn.prepare("SELECT val FROM blocks WHERE key = ?1")?;
4866 Ok(SqliteReader { select_stmt })
4967 }
···5371 }
5472}
55735656-pub struct SqliteWriter<'conn> {
7474+pub(crate) struct SqliteWriter<'conn> {
5775 tx: rusqlite::Transaction<'conn>,
5876}
5977···7795 }
7896}
79978080-pub struct SqliteReader<'conn> {
9898+pub(crate) struct SqliteReader<'conn> {
8199 select_stmt: rusqlite::Statement<'conn>,
82100}
83101
+82-14
src/drive.rs
···11-//! Consume an MST block stream, producing an ordered stream of records
11+//! Consume a CAR from an AsyncRead, producing an ordered stream of records
2233use crate::disk::SqliteStore;
44use crate::process::Processable;
···100100 }
101101}
102102103103+/// Read a CAR file buffering blocks in memory or to disk
103104pub enum Driver<R: AsyncRead + Unpin, T: Processable> {
105105+ /// All blocks fit within the memory limit
106106+ ///
107107+ /// You probably want to check the commit's signature. You can go ahead and
108108+ /// walk the MST right away.
104109 Memory(Commit, MemDriver<T>),
105105- Disk(BigCar<R, T>),
110110+ /// Blocks exceed the memory limit
111111+ ///
112112+ /// You'll need to provide a disk storage to continue. The commit will be
113113+ /// returned and can be validated only once all blocks are loaded.
114114+ Disk(NeedDisk<R, T>),
106115}
107116108117impl<R: AsyncRead + Unpin, T: Processable> Driver<R, T> {
···143152 mem_size += std::mem::size_of::<Cid>() + maybe_processed.get_size();
144153 mem_blocks.insert(cid, maybe_processed);
145154 if mem_size >= max_size {
146146- return Ok(Driver::Disk(BigCar {
155155+ return Ok(Driver::Disk(NeedDisk {
147156 car,
148157 root,
149158 process,
···214223 }
215224}
216225217217-/// a paritally memory-loaded car file that needs disk spillover to continue
218218-pub struct BigCar<R: AsyncRead + Unpin, T: Processable> {
226226+/// A partially memory-loaded car file that needs disk spillover to continue
227227+pub struct NeedDisk<R: AsyncRead + Unpin, T: Processable> {
219228 car: CarReader<R>,
220229 root: Cid,
221230 process: fn(Vec<u8>) -> T,
···236245 Ok(t)
237246}
238247239239-impl<R: AsyncRead + Unpin, T: Processable + Send + 'static> BigCar<R, T> {
248248+impl<R: AsyncRead + Unpin, T: Processable + Send + 'static> NeedDisk<R, T> {
240249 pub async fn finish_loading(
241250 mut self,
242251 mut store: SqliteStore,
243243- ) -> Result<(Commit, BigCarReady<T>), DriveError> {
252252+ ) -> Result<(Commit, DiskDriver<T>), DriveError> {
244253 // move store in and back out so we can manage lifetimes
245254 // dump mem blocks into the store
246255 store = tokio::task::spawn(async move {
···320329321330 Ok((
322331 commit,
323323- BigCarReady {
332332+ DiskDriver {
324333 process: self.process,
325334 state: Some(BigState { store, walker }),
326335 },
···333342 walker: Walker,
334343}
335344336336-pub struct BigCarReady<T: Clone> {
345345+/// MST walker that reads from disk instead of an in-memory hashmap
346346+pub struct DiskDriver<T: Clone> {
337347 process: fn(Vec<u8>) -> T,
338348 state: Option<BigState>,
339349}
340350341341-impl<T: Processable + Send + 'static> BigCarReady<T> {
351351+// for doctests only
352352+#[doc(hidden)]
353353+pub fn _get_fake_disk_driver() -> DiskDriver<Vec<u8>> {
354354+ use crate::process::noop;
355355+ DiskDriver {
356356+ process: noop,
357357+ state: None,
358358+ }
359359+}
360360+361361+impl<T: Processable + Send + 'static> DiskDriver<T> {
362362+ /// Walk the MST returning up to `n` rkey + record pairs
363363+ ///
364364+ /// ```no_run
365365+ /// # use repo_stream::{drive::{DiskDriver, DriveError, _get_fake_disk_driver}, process::noop};
366366+ /// # #[tokio::main]
367367+ /// # async fn main() -> Result<(), DriveError> {
368368+ /// # let mut disk_driver = _get_fake_disk_driver();
369369+ /// while let Some(pairs) = disk_driver.next_chunk(256).await? {
370370+ /// for (rkey, record) in pairs {
371371+ /// println!("{rkey}: size={}", record.len());
372372+ /// }
373373+ /// }
374374+ /// let store = disk_driver.reset_store().await?;
375375+ /// # Ok(())
376376+ /// # }
377377+ /// ```
342378 pub async fn next_chunk(&mut self, n: usize) -> Result<Option<BlockChunk<T>>, DriveError> {
343379 let process = self.process;
344380345381 // state should only *ever* be None transiently while inside here
346346- let mut state = self
347347- .state
348348- .take()
349349- .expect("BigCarReady must have Some(state)");
382382+ let mut state = self.state.take().expect("DiskDriver must have Some(state)");
350383351384 // the big pain here is that we don't want to leave self.state in an
352385 // invalid state (None), so all the error paths have to make sure it
···456489 Ok(())
457490 }
458491492492+ /// Spawn the disk reading task into a tokio blocking thread
493493+ ///
494494+ /// The idea is to avoid so much sending back and forth to the blocking
495495+ /// thread, letting a blocking task do all the disk reading work and sending
496496+ /// records and rkeys back through an `mpsc` channel instead.
497497+ ///
498498+ /// This might also allow the disk work to continue while processing the
499499+ /// records. It's still not yet clear if this method actually has much
500500+ /// benefit over just using `.next_chunk(n)`.
501501+ ///
502502+ /// ```no_run
503503+ /// # use repo_stream::{drive::{DiskDriver, DriveError, _get_fake_disk_driver}, process::noop};
504504+ /// # #[tokio::main]
505505+ /// # async fn main() -> Result<(), DriveError> {
506506+ /// # let mut disk_driver = _get_fake_disk_driver();
507507+ /// let (mut rx, join) = disk_driver.to_channel(512);
508508+ /// while let Some(recvd) = rx.recv().await {
509509+ /// let pairs = recvd?;
510510+ /// for (rkey, record) in pairs {
511511+ /// println!("{rkey}: size={}", record.len());
512512+ /// }
513513+ ///
514514+ /// }
515515+ /// let store = join.await?.reset_store().await?;
516516+ /// # Ok(())
517517+ /// # }
518518+ /// ```
459519 pub fn to_channel(
460520 mut self,
461521 n: usize,
···476536 (rx, chan_task)
477537 }
478538539539+ /// Reset the disk storage so it can be reused. You must call this.
540540+ ///
541541+ /// Ideally we'd put this in an `impl Drop`, but since it makes blocking
542542+ /// calls, that would be risky in an async context. For now you just have to
543543+ /// carefully make sure you call it.
544544+ ///
545545+ /// The sqlite store is returned, so it can be reused for another
546546+ /// `DiskDriver`.
479547 pub async fn reset_store(mut self) -> Result<SqliteStore, DriveError> {
480548 tokio::task::spawn_blocking(move || {
481549 let BigState { mut store, .. } = self.state.take().expect("valid state");
+2-1
src/lib.rs
···65656666*/
67676868-mod mst;
6868+pub mod mst;
6969mod walk;
70707171pub mod disk;
···74747575pub use disk::SqliteStore;
7676pub use drive::{DriveError, Driver};
7777+pub use mst::Commit;
7778pub use process::Processable;
+9-1
src/process.rs
···11+/*!
22+Record processor function output trait
33+*/
44+15use serde::{Serialize, de::DeserializeOwned};
2677+/// Output trait for record processing
38pub trait Processable: Clone + Serialize + DeserializeOwned {
44- /// the additional size taken up (not including its mem::size_of)
99+ /// Any additional in-memory size taken by the processed type
1010+ ///
1111+ /// Do not include stack size (`std::mem::size_of`)
512 fn get_size(&self) -> usize;
613}
7141515+/// Processor that just returns the raw blocks
816#[inline]
917pub fn noop(block: Vec<u8>) -> Vec<u8> {
1018 block