···33use std::collections::VecDeque;
44use std::error::Error;
5566-use crate::disk_walk::{Trip, Walker, RkeyError};
77-use crate::mst::Commit;
66+use crate::disk_walk::{RkeyError, Trip, Walker};
77+use crate::mst::{Commit, Node};
8899use ipld_core::cid::Cid;
1010-use serde::{Serialize, de::DeserializeOwned};
1010+use serde::{Deserialize, Serialize, de::DeserializeOwned};
11111212/// Errors that can happen while consuming and emitting blocks and records
1313#[derive(Debug, thiserror::Error)]
1414-pub enum DriveError<E: Error> {
1414+pub enum DriveError {
1515 #[error("Failed to initialize CarReader: {0}")]
1616 CarReader(#[from] iroh_car::Error),
1717 #[error("Car block stream error: {0}")]
···2828 WalkingProblem(#[from] WalkError),
2929 #[error("whatever: {0}")]
3030 Boooooo(String),
3131- #[error("processing error: {0}")]
3232- ProcessingError(E),
3131+ #[error("Error while encoding: {0}")]
3232+ EncodingError(#[from] bincode::error::EncodeError),
3333+ #[error("Error while decoding: {0}")]
3434+ DecodingError(#[from] bincode::error::DecodeError),
3335}
34363537/// Limited subset of errors that can happen while walking
···5658 #[error("Could not find block: {0}")]
5759 MissingBlock(Cid),
5860}
6161+6262+#[derive(Serialize, Deserialize)]
6363+pub enum MaybeProcessedBlock<T: Serialize> {
6464+ Raw(Vec<u8>),
6565+ Processed(T),
6666+}
6767+6868+pub type Records = Vec<(String, Vec<u8>)>;
59696070/// Storage backend for caching large-repo blocks
6171///
6272/// Since
6363-pub trait BlockStore<MPB: Serialize + DeserializeOwned> {
6464- fn put_batch(&self, blocks: Vec<(Cid, MPB)>) -> impl Future<Output = Result<(), BlockStoreError>>; // unwraps for now
7373+pub trait BlockStore {
7474+ fn put_batch(
7575+ &self,
7676+ blocks: Vec<(Cid, Vec<u8>)>,
7777+ ) -> impl Future<Output = Result<(), BlockStoreError>>; // unwraps for now
6578 fn walk_batch(
6679 &self,
6780 walker: Walker,
6881 n: usize,
6969- ) -> impl Future<Output = Result<(Walker, Vec<(String, MPB)>), BlockStoreError>>; // boo string error for now because
8282+ ) -> impl Future<Output = Result<(Walker, Records), BlockStoreError>>; // boo string error for now because
7083}
71847285type CarBlock<E> = Result<(Cid, Vec<u8>), E>;
73867487/// The core driver between the block stream and MST walker
7575-pub struct Vehicle<SE, S, T, BS, P, PE>
8888+pub struct Vehicle<SE, S, T, BS, P>
7689where
7790 SE: Error + 'static,
7891 S: Stream<Item = CarBlock<SE>>,
7992 T: Clone + Serialize + DeserializeOwned,
8080- BS: BlockStore<Vec<u8>>,
8181- P: Fn(&[u8]) -> Result<T, PE>,
8282- PE: Error,
9393+ BS: BlockStore,
9494+ P: Fn(&[u8]) -> T,
8395{
8496 #[allow(dead_code)]
8597 block_stream: Option<S>,
···89101 out_cache: VecDeque<(String, T)>,
90102}
911039292-impl<SE, S, T, BS, P, PE> Vehicle<SE, S, T, BS, P, PE>
104104+impl<SE, S, T, BS, P> Vehicle<SE, S, T, BS, P>
93105where
94106 SE: Error + 'static,
95107 S: Stream<Item = CarBlock<SE>> + Unpin + Send,
96108 T: Clone + Serialize + DeserializeOwned + Send,
9797- BS: BlockStore<Vec<u8>> + Send,
9898- P: Fn(&[u8]) -> Result<T, PE> + Send,
9999- PE: Error,
109109+ BS: BlockStore + Send,
110110+ P: Fn(&[u8]) -> T,
100111{
101112 /// Set up the stream
102113 ///
···120131 block_stream: S,
121132 block_store: BS,
122133 process: P,
123123- ) -> Result<(Commit, Self), DriveError<PE>> {
134134+ ) -> Result<(Commit, Self), DriveError> {
124135 let mut commit = None;
125136126137 log::warn!("init: load blocks");
127138128128- let mut chunked = block_stream.try_chunks(4096);
139139+ let mut chunked = block_stream.try_chunks(256);
129140130141 // go ahead and put all blocks in the block store
131142 while let Some(chunk) = chunked
···140151 .map_err(|e| DriveError::BadCommit(e.into()))?;
141152 commit = Some(c);
142153 } else {
143143- to_insert.push((cid, data));
154154+ let wrapped = if Node::could_be(&data) {
155155+ MaybeProcessedBlock::Raw(data)
156156+ } else {
157157+ MaybeProcessedBlock::Processed(process(&data))
158158+ };
159159+ let bytes =
160160+ bincode::serde::encode_to_vec(wrapped, bincode::config::standard())?;
161161+162162+ to_insert.push((cid, bytes));
144163 }
145164 }
146165 block_store
···168187 Ok((commit, me))
169188 }
170189171171- async fn load_chunk(&mut self, n: usize) -> Result<(), DriveError<PE>> {
190190+ async fn load_chunk(&mut self, n: usize) -> Result<(), DriveError> {
172191 let walker = std::mem::take(&mut self.walker);
173192 let (walker, batch) = self
174193 .block_store
175194 .walk_batch(walker, n)
176195 .await
177177- .map_err(|e| DriveError::Boooooo(format!("booo! {e}")))?; // TODO
196196+ .map_err(|e| DriveError::Boooooo(format!("booo! (here right?) {e}")))?; // TODO
178197 self.walker = walker;
179198180199 let processed = batch
181200 .into_iter()
182182- .map(|(k, raw)| (self.process)(&raw).map(|t| (k, t)))
183183- .collect::<Result<Vec<_>, _>>()
184184- .map_err(DriveError::ProcessingError)?;
201201+ .map(|(k, encoded)| {
202202+ let (decoded, n): (MaybeProcessedBlock<T>, usize) =
203203+ bincode::serde::decode_from_slice(&encoded, bincode::config::standard())?;
204204+ assert_eq!(n, encoded.len());
205205+ let processed = match decoded {
206206+ MaybeProcessedBlock::Processed(t) => t,
207207+ MaybeProcessedBlock::Raw(block) => (self.process)(&block),
208208+ };
209209+ Ok((k, processed))
210210+ })
211211+ .collect::<Result<Vec<_>, DriveError>>()?;
185212186213 self.out_cache.extend(processed);
187214 Ok(())
···193220 /// (but non-zero), even if it's not the last chunk.
194221 ///
195222 /// an empty vec will be returned to signal the end.
196196- pub async fn next_chunk(&mut self, n: usize) -> Result<Vec<(String, T)>, DriveError<PE>> {
223223+ pub async fn next_chunk(&mut self, n: usize) -> Result<Vec<(String, T)>, DriveError> {
197224 if self.out_cache.is_empty() {
198225 self.load_chunk(n).await?;
199226 }
···201228 }
202229203230 /// Manually step through the record outputs
204204- pub async fn next_record(&mut self) -> Result<Option<(String, T)>, DriveError<PE>> {
231231+ pub async fn next_record(&mut self) -> Result<Option<(String, T)>, DriveError> {
205232 if self.out_cache.is_empty() {
206206- self.load_chunk(64).await?; // TODO
233233+ self.load_chunk(128).await?; // TODO
207234 }
208235 Ok(self.out_cache.pop_front())
209236 }
210237211238 /// Convert to a futures::stream of record outputs
212212- pub fn stream(self) -> impl Stream<Item = Result<(String, T), DriveError<PE>>> {
239239+ pub fn stream(self) -> impl Stream<Item = Result<(String, T), DriveError>> {
213240 futures::stream::try_unfold(self, |mut this| async move {
214241 let maybe_record = this.next_record().await?;
215242 Ok(maybe_record.map(|b| (b, this)))
+14-5
src/disk_redb.rs
···11-use crate::disk_drive::{BlockStore, BlockStoreError};
11+use crate::disk_drive::{BlockStore, BlockStoreError, MaybeProcessedBlock, Records};
22use crate::disk_walk::{Need, Walker};
33use ipld_core::cid::Cid;
44use redb::{Database, Durability, Error, ReadableDatabase, TableDefinition};
···4040 }
4141}
42424343-impl BlockStore<Vec<u8>> for RedbStore {
4343+impl BlockStore for RedbStore {
4444 async fn put_batch(&self, blocks: Vec<(Cid, Vec<u8>)>) -> Result<(), BlockStoreError> {
4545 let db = self.db.clone();
4646 tokio::task::spawn_blocking(move || -> Result<(), BlockStoreError> {
···6565 &self,
6666 mut walker: Walker,
6767 n: usize,
6868- ) -> Result<(Walker, Vec<(String, Vec<u8>)>), BlockStoreError> {
6868+ ) -> Result<(Walker, Records), BlockStoreError> {
6969 let db = self.db.clone();
7070 tokio::task::spawn_blocking(move || -> Result<_, BlockStoreError> {
7171 let tx = db.begin_read()?;
···8383 let block = res.value();
84848585 match need {
8686- Need::Node(_) => walker
8787- .handle_node(block)?,
8686+ Need::Node(_) => {
8787+ let (mpb, n) =
8888+ bincode::serde::decode_from_slice(block, bincode::config::standard())
8989+ .unwrap();
9090+ assert_eq!(n, block.len());
9191+ // DANGER: we're throwing in unit () as a placeholder here and assuming bincode will still work since Raw is the first variant
9292+ let MaybeProcessedBlock::Raw(bytes): MaybeProcessedBlock<()> = mpb else {
9393+ panic!("should have not been processed"); // tODO
9494+ };
9595+ walker.handle_node(&bytes)?
9696+ }
8897 Need::Record { rkey, .. } => {
8998 out.push((rkey, block.to_vec()));
9099 if out.len() >= n {
+2-2
src/disk_walk.rs
···2828 EntryRkeyNotUtf8(#[from] std::string::FromUtf8Error),
2929 #[error("Encountered an rkey out of order while walking the MST")]
3030 RkeyOutOfOrder,
3131- #[error("Failed to decode commit block: {0}")]
3232- BlockDecodeError(#[from] serde_ipld_dagcbor::DecodeError<Infallible>),
3131+ #[error("Failed to decode node block: {0}")]
3232+ NodeDecodeError(#[from] serde_ipld_dagcbor::DecodeError<Infallible>),
3333}
34343535/// Walker outputs