···11extern crate repo_stream;
22use clap::Parser;
33-use repo_stream::disk::RedbStore;
43use repo_stream::drive::Processable;
54use serde::{Deserialize, Serialize};
65use std::path::PathBuf;
···1817#[derive(Clone, Serialize, Deserialize)]
1918struct S(usize);
20192121-impl Processable for S {}
2020+impl Processable for S {
2121+ fn get_size(&self) -> usize {
2222+ 0 // no additional space taken, just its stack size (newtype is free)
2323+ }
2424+}
22252326#[tokio::main]
2427async fn main() -> Result<()> {
···2831 let reader = tokio::fs::File::open(car).await?;
2932 let reader = tokio::io::BufReader::new(reader);
30333434+ // let kb = 2_usize.pow(10);
3535+ let mb = 2_usize.pow(20);
3636+3137 let mut driver =
3232- match repo_stream::drive::load_car(reader, |block| S(block.len()), 1024).await? {
3838+ match repo_stream::drive::load_car(reader, |block| S(block.len()), 16 * mb).await? {
3339 repo_stream::drive::Vehicle::Lil(_, _) => panic!("try this on a bigger car"),
3440 repo_stream::drive::Vehicle::Big(big_stuff) => {
3535- let disk_store = RedbStore::new(tmpfile);
4141+ // let disk_store = repo_stream::disk::SqliteStore::new(tmpfile);
4242+ let disk_store = repo_stream::disk::RedbStore::new(tmpfile);
3643 let (commit, driver) = big_stuff.finish_loading(disk_store).await?;
3744 log::warn!("big: {:?}", commit);
3845 driver
+7-2
src/disk.rs
···61616262 conn.pragma_update(None, "journal_mode", "WAL")?;
6363 conn.pragma_update(None, "synchronous", "OFF")?;
6464- conn.pragma_update(None, "cache_size", (-32 * 2_i64.pow(10)).to_string())?;
6464+ conn.pragma_update(None, "cache_size", (-4 * 2_i64.pow(10)).to_string())?;
6565 conn.execute(
6666 "CREATE TABLE blocks (
6767 key BLOB PRIMARY KEY NOT NULL,
···144144 type Access = RedbAccess;
145145 async fn get_access(&mut self) -> Result<RedbAccess, redb::Error> {
146146 let path = self.path.clone();
147147+ let kb = 2_usize.pow(10);
147148 let db = tokio::task::spawn_blocking(move || {
148148- let db = redb::Database::create(path)?;
149149+ let db = redb::Database::builder()
150150+ .set_cache_size(16 * kb)
151151+ .create(path)?;
149152 Ok::<_, Self::StorageError>(db)
150153 })
151154 .await
···204207 Ok(rv)
205208 }
206209}
210210+211211+///// TODO: that other single file db thing to try
+26-5
src/drive.rs
···5757// DiskDriveError(#[from] DiskDriveError<E>),
5858// }
59596060-pub trait Processable: Clone + Serialize + DeserializeOwned {}
6060+pub trait Processable: Clone + Serialize + DeserializeOwned {
6161+ /// the additional size taken up (not including its mem::size_of)
6262+ fn get_size(&self) -> usize;
6363+}
61646265#[derive(Debug, Clone, Serialize, Deserialize)]
6366pub enum MaybeProcessedBlock<T> {
···8588 Processed(T),
8689}
87908888-impl<T: Processable> Processable for MaybeProcessedBlock<T> {}
9191+impl<T: Processable> Processable for MaybeProcessedBlock<T> {
9292+ /// TODO this is probably a little broken
9393+ fn get_size(&self) -> usize {
9494+ use std::{cmp::max, mem::size_of};
9595+9696+ // enum is always as big as its biggest member?
9797+ let base_size = max(size_of::<Vec<u8>>(), size_of::<T>());
9898+9999+ let extra = match self {
100100+ Self::Raw(bytes) => bytes.len(),
101101+ Self::Processed(t) => t.get_size(),
102102+ };
103103+104104+ base_size + extra
105105+ }
106106+}
8910790108pub enum Vehicle<R: AsyncRead + Unpin, T: Processable> {
91109 Lil(Commit, MemDriver<T>),
···111129 let mut commit = None;
112130113131 // try to load all the blocks into memory
132132+ let mut mem_size = 0;
114133 while let Some((cid, data)) = car.next_block().await? {
115134 // the root commit is a Special Third Kind of block that we need to make
116135 // sure not to optimistically send to the processing function
···129148 };
130149131150 // stash (maybe processed) blocks in memory as long as we have room
151151+ mem_size += std::mem::size_of::<Cid>() + maybe_processed.get_size();
132152 mem_blocks.insert(cid, maybe_processed);
133133- if mem_blocks.len() >= max_size {
153153+ if mem_size >= max_size {
134154 return Ok(Vehicle::Big(BigCar {
135155 car,
136156 root,
···207227 // dump the rest to disk (in chunks)
208228 loop {
209229 let mut chunk = vec![];
230230+ let mut mem_size = 0;
210231 loop {
211232 let Some((cid, data)) = self.car.next_block().await? else {
212233 break;
···224245 } else {
225246 MaybeProcessedBlock::Processed((self.process)(&data))
226247 };
248248+ mem_size += std::mem::size_of::<Cid>() + maybe_processed.get_size();
227249 chunk.push((cid, maybe_processed));
228228- if chunk.len() >= self.max_size {
229229- // eventually this won't be .len()
250250+ if mem_size >= self.max_size {
230251 break;
231252 }
232253 }