···11extern crate repo_stream;
22-use repo_stream::drive::Processable;
33-use serde::{Deserialize, Serialize};
42use std::path::{Path, PathBuf};
5364use criterion::{Criterion, criterion_group, criterion_main};
77-88-#[derive(Clone, Serialize, Deserialize)]
99-struct S(usize);
1010-1111-impl Processable for S {
1212- fn get_size(&self) -> usize {
1313- 0 // no additional space taken, just its stack size (newtype is free)
1414- }
1515-}
165176pub fn criterion_benchmark(c: &mut Criterion) {
187 let rt = tokio::runtime::Builder::new_multi_thread()
···34233524 let mb = 2_usize.pow(20);
36253737- let mut driver = match repo_stream::drive::load_car(reader, |block| S(block.len()), 1024 * mb)
2626+ let mut driver = match repo_stream::drive::load_car(reader, |block| block.len(), 1024 * mb)
3827 .await
3928 .unwrap()
4029 {
+1-12
benches/non-huge-cars.rs
···11extern crate repo_stream;
2233use criterion::{Criterion, criterion_group, criterion_main};
44-use repo_stream::drive::Processable;
55-use serde::{Deserialize, Serialize};
6475const TINY_CAR: &'static [u8] = include_bytes!("../car-samples/tiny.car");
86const LITTLE_CAR: &'static [u8] = include_bytes!("../car-samples/little.car");
97const MIDSIZE_CAR: &'static [u8] = include_bytes!("../car-samples/midsize.car");
1010-1111-#[derive(Clone, Serialize, Deserialize)]
1212-struct S(usize);
1313-1414-impl Processable for S {
1515- fn get_size(&self) -> usize {
1616- 0 // no additional space taken, just its stack size (newtype is free)
1717- }
1818-}
198209pub fn criterion_benchmark(c: &mut Criterion) {
2110 let rt = tokio::runtime::Builder::new_multi_thread()
···36253726async fn drive_car(bytes: &[u8]) -> usize {
3827 let mut driver =
3939- match repo_stream::drive::load_car(bytes, |block| S(block.len()), 32 * 2_usize.pow(20))
2828+ match repo_stream::drive::load_car(bytes, |block| block.len(), 32 * 2_usize.pow(20))
4029 .await
4130 .unwrap()
4231 {
+2-14
examples/disk-read-file/main.rs
···11extern crate repo_stream;
22use clap::Parser;
33-use repo_stream::drive::Processable;
44-use serde::{Deserialize, Serialize};
53use std::path::PathBuf;
6475type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
···1412 tmpfile: PathBuf,
1513}
16141717-#[derive(Clone, Serialize, Deserialize)]
1818-struct S(usize);
1919-2020-impl Processable for S {
2121- fn get_size(&self) -> usize {
2222- 0 // no additional space taken, just its stack size (newtype is free)
2323- }
2424-}
2525-2615#[tokio::main]
2716async fn main() -> Result<()> {
2817 env_logger::init();
···36253726 let limit_mb = 32;
38273939- let driver = match repo_stream::drive::load_car(reader, |block| S(block.len()), 10 * mb).await?
4040- {
2828+ let driver = match repo_stream::drive::load_car(reader, |block| block.len(), 10 * mb).await? {
4129 repo_stream::drive::Vehicle::Lil(_, _) => panic!("try this on a bigger car"),
4230 repo_stream::drive::Vehicle::Big(big_stuff) => {
4343- let disk_store = repo_stream::disk::SqliteStore::new(tmpfile.clone(), limit_mb);
3131+ let disk_store = repo_stream::disk::SqliteStore::new(tmpfile.clone(), limit_mb).await?;
4432 let (commit, driver) = big_stuff.finish_loading(disk_store).await?;
4533 log::warn!("big: {:?}", commit);
4634 driver
+1-12
examples/read-file/main.rs
···11extern crate repo_stream;
22use clap::Parser;
33-use repo_stream::drive::Processable;
44-use serde::{Deserialize, Serialize};
53use std::path::PathBuf;
6475type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
···1210 file: PathBuf,
1311}
14121515-#[derive(Clone, Serialize, Deserialize)]
1616-struct S(usize);
1717-1818-impl Processable for S {
1919- fn get_size(&self) -> usize {
2020- 0 // no additional space taken, just its stack size (newtype is free)
2121- }
2222-}
2323-2413#[tokio::main]
2514async fn main() -> Result<()> {
2615 env_logger::init();
···3019 let reader = tokio::io::BufReader::new(reader);
31203221 let (commit, mut driver) =
3333- match repo_stream::drive::load_car(reader, |block| S(block.len()), 1024 * 1024).await? {
2222+ match repo_stream::drive::load_car(reader, |block| block.len(), 1024 * 1024).await? {
3423 repo_stream::drive::Vehicle::Lil(commit, mem_driver) => (commit, mem_driver),
3524 repo_stream::drive::Vehicle::Big(_) => panic!("can't handle big cars yet"),
3625 };
+9-21
src/disk.rs
···33use std::path::PathBuf;
4455pub struct SqliteStore {
66- path: PathBuf,
77- limit_mb: usize,
66+ conn: rusqlite::Connection,
87}
98109impl SqliteStore {
1111- pub fn new(path: PathBuf, limit_mb: usize) -> Self {
1212- Self { path, limit_mb }
1313- }
1414-}
1515-1616-impl SqliteStore {
1717- pub async fn get_access(&mut self) -> Result<SqliteAccess, rusqlite::Error> {
1818- let path = self.path.clone();
1919- let limit_mb = self.limit_mb;
1010+ pub async fn new(path: PathBuf, cache_mb: usize) -> Result<Self, rusqlite::Error> {
2011 let conn = tokio::task::spawn_blocking(move || {
2112 let conn = rusqlite::Connection::open(path)?;
22132323- let sq_mb = -(2_i64.pow(10)); // negative is kibibytes for sqlite cache_size
1414+ let sqlite_one_mb = -(2_i64.pow(10)); // negative is kibibytes for sqlite cache_size
24152516 // conn.pragma_update(None, "journal_mode", "OFF")?;
2617 // conn.pragma_update(None, "journal_mode", "MEMORY")?;
2718 conn.pragma_update(None, "journal_mode", "WAL")?;
2819 // conn.pragma_update(None, "wal_autocheckpoint", "0")?; // this lets things get a bit big on disk
2920 conn.pragma_update(None, "synchronous", "OFF")?;
3030- conn.pragma_update(None, "cache_size", (limit_mb as i64 * sq_mb).to_string())?;
2121+ conn.pragma_update(
2222+ None,
2323+ "cache_size",
2424+ (cache_mb as i64 * sqlite_one_mb).to_string(),
2525+ )?;
3126 conn.execute(
3227 "CREATE TABLE blocks (
3328 key BLOB PRIMARY KEY NOT NULL,
···4136 .await
4237 .expect("join error")?;
43384444- Ok(SqliteAccess { conn })
3939+ Ok(Self { conn })
4540 }
4646-}
4747-4848-pub struct SqliteAccess {
4949- conn: rusqlite::Connection,
5050-}
5151-5252-impl SqliteAccess {
5341 pub fn get_writer(&'_ mut self) -> Result<SqliteWriter<'_>, rusqlite::Error> {
5442 let tx = self.conn.transaction()?;
5543 // let insert_stmt = tx.prepare("INSERT INTO blocks (key, val) VALUES (?1, ?2)")?;
+18-26
src/drive.rs
···11//! Consume an MST block stream, producing an ordered stream of records
2233-use crate::disk::{SqliteAccess, SqliteStore};
33+use crate::disk::SqliteStore;
44+use crate::process::Processable;
45use ipld_core::cid::Cid;
56use iroh_car::CarReader;
66-use serde::de::DeserializeOwned;
77use serde::{Deserialize, Serialize};
88use std::collections::HashMap;
99use std::convert::Infallible;
···4343 BincodeDecodeError(#[from] bincode::error::DecodeError),
4444 #[error("extra bytes remained after decoding")]
4545 ExtraGarbage,
4646-}
4747-4848-pub trait Processable: Clone + Serialize + DeserializeOwned {
4949- /// the additional size taken up (not including its mem::size_of)
5050- fn get_size(&self) -> usize;
5146}
52475348#[derive(Debug, Clone, Serialize, Deserialize)]
···191186 mut self,
192187 mut store: SqliteStore,
193188 ) -> Result<(Commit, BigCarReady<T>), DriveError> {
194194- // set up access for real
195195- let mut access = store.get_access().await?;
196196-197197- // move access in and back out so we can manage lifetimes
189189+ // move store in and back out so we can manage lifetimes
198190 // dump mem blocks into the store
199199- access = tokio::task::spawn(async move {
200200- let mut writer = access.get_writer()?;
191191+ store = tokio::task::spawn(async move {
192192+ let mut writer = store.get_writer()?;
201193202194 let kvs = self
203195 .mem_blocks
···206198207199 writer.put_many(kvs)?;
208200 writer.commit()?;
209209- Ok::<_, DriveError>(access)
201201+ Ok::<_, DriveError>(store)
210202 })
211203 .await??;
212204213205 let (tx, mut rx) = tokio::sync::mpsc::channel::<Vec<(Cid, MaybeProcessedBlock<T>)>>(2);
214206215215- let access_worker = tokio::task::spawn_blocking(move || {
216216- let mut writer = access.get_writer()?;
207207+ let store_worker = tokio::task::spawn_blocking(move || {
208208+ let mut writer = store.get_writer()?;
217209218210 while let Some(chunk) = rx.blocking_recv() {
219211 let kvs = chunk
···223215 }
224216225217 writer.commit()?;
226226- Ok::<_, DriveError>(access)
218218+ Ok::<_, DriveError>(store)
227219 }); // await later
228220229221 // dump the rest to disk (in chunks)
···267259 drop(tx);
268260 log::debug!("done. waiting for worker to finish...");
269261270270- access = access_worker.await??;
262262+ store = store_worker.await??;
271263272264 log::debug!("worker finished.");
273265···279271 commit,
280272 BigCarReady {
281273 process: self.process,
282282- access,
274274+ store,
283275 walker,
284276 },
285277 ))
···288280289281pub struct BigCarReady<T: Clone> {
290282 process: fn(Vec<u8>) -> T,
291291- access: SqliteAccess,
283283+ store: SqliteStore,
292284 walker: Walker,
293285}
294286···299291 ) -> Result<(Self, Option<Vec<(String, T)>>), DriveError> {
300292 let mut out = Vec::with_capacity(n);
301293 (self, out) = tokio::task::spawn_blocking(move || {
302302- let access = self.access;
303303- let mut reader = access.get_reader()?;
294294+ let store = self.store;
295295+ let mut reader = store.get_reader()?;
304296305297 for _ in 0..n {
306298 // walk as far as we can until we run out of blocks or find a record
···314306 };
315307 }
316308317317- drop(reader); // cannot outlive access
318318- self.access = access;
309309+ drop(reader); // cannot outlive store
310310+ self.store = store;
319311 Ok::<_, DriveError>((self, out))
320312 })
321313 .await??;
···343335 // ...should we return the join handle here so the caller at least knows about it?
344336 // yes probably for error handling?? (orrr put errors in the channel)
345337 let worker = tokio::task::spawn_blocking(move || {
346346- let mut reader = self.access.get_reader()?;
338338+ let mut reader = self.store.get_reader()?;
347339348340 loop {
349341 let mut out = Vec::with_capacity(n);
···367359 .map_err(|_| DriveError::ChannelSendError)?;
368360 }
369361370370- drop(reader); // cannot outlive access
362362+ drop(reader); // cannot outlive store
371363 Ok(())
372364 }); // await later
373365
+1
src/lib.rs
···55pub mod disk;
66pub mod drive;
77pub mod mst;
88+pub mod process;
89pub mod walk;
+12
src/process.rs
···11+use serde::{Serialize, de::DeserializeOwned};
22+33+pub trait Processable: Clone + Serialize + DeserializeOwned {
44+ /// Extra size this value takes up beyond its own `mem::size_of` (presumably bytes of owned/heap data -- TODO confirm unit with callers)
55+ fn get_size(&self) -> usize;
66+}
77+88+impl Processable for usize {
99+ fn get_size(&self) -> usize {
1010+ 0 // no additional space taken: a usize is stored inline, nothing beyond its mem::size_of
1111+ }
1212+}