Fast and robust atproto CAR file processing in rust
14
fork

Configure Feed

Select the types of activity you want to include in your feed.

slightly simplify disk channel interface

phil d21329e8 46b76dc8

+93 -86
+4 -7
examples/disk-read-file/main.rs
··· 38 38 39 39 let mut n = 0; 40 40 let mut zeros = 0; 41 - let (mut rx, worker) = driver.rx(512).await?; 41 + let mut rx = driver.to_channel(512); 42 42 43 43 log::debug!("walking..."); 44 - while let Some(pairs) = rx.recv().await { 44 + while let Some(r) = rx.recv().await { 45 + let pairs = r?; 45 46 n += pairs.len(); 46 47 for (_, block) in pairs { 47 48 zeros += block.into_iter().filter(|&b| b == b'0').count() 48 49 } 49 50 } 50 - log::debug!("done walking! joining..."); 51 - 52 - worker.await.unwrap().unwrap(); 53 - 54 - log::debug!("joined."); 51 + log::debug!("done walking!"); 55 52 56 53 // log::info!("now is the time to check mem..."); 57 54 // tokio::time::sleep(std::time::Duration::from_secs(22)).await;
+89 -79
src/drive.rs
··· 7 7 use serde::{Deserialize, Serialize}; 8 8 use std::collections::HashMap; 9 9 use std::convert::Infallible; 10 - use tokio::io::AsyncRead; 10 + use tokio::{io::AsyncRead, sync::mpsc}; 11 11 12 12 use crate::mst::{Commit, Node}; 13 13 use crate::walk::{Step, WalkError, Walker}; ··· 44 44 #[error("extra bytes remained after decoding")] 45 45 ExtraGarbage, 46 46 } 47 + 48 + pub type BlockChunk<T> = Vec<(String, T)>; 47 49 48 50 #[derive(Debug, Clone, Serialize, Deserialize)] 49 51 pub enum MaybeProcessedBlock<T> { ··· 161 163 } 162 164 } 163 165 166 + /// The core driver between the block stream and MST walker 167 + /// 168 + /// In the future, PDSs will export CARs in a stream-friendly order that will 169 + /// enable processing them with tiny memory overhead. But that future is not 170 + /// here yet. 171 + /// 172 + /// CARs are almost always in a stream-unfriendly order, so I'm reverting the 173 + /// optimistic stream features: we load all block first, then walk the MST. 174 + /// 175 + /// This makes things much simpler: we only need to worry about spilling to disk 176 + /// in one place, and we always have a reasonable expecatation about how much 177 + /// work the init function will do. We can drop the CAR reader before walking, 178 + /// so the sync/async boundaries become a little easier to work around. 179 + #[derive(Debug)] 180 + pub struct MemDriver<T: Processable> { 181 + blocks: HashMap<Cid, MaybeProcessedBlock<T>>, 182 + walker: Walker, 183 + process: fn(Vec<u8>) -> T, 184 + } 185 + 186 + impl<T: Processable> MemDriver<T> { 187 + /// Manually step through the record outputs 188 + pub async fn next_chunk(&mut self, n: usize) -> Result<Option<BlockChunk<T>>, DriveError> { 189 + let mut out = Vec::with_capacity(n); 190 + for _ in 0..n { 191 + // walk as far as we can until we run out of blocks or find a record 192 + match self.walker.step(&mut self.blocks, self.process)? { 193 + Step::Missing(cid) => return Err(DriveError::MissingBlock(cid)), 194 + Step::Finish => break, 195 + Step::Step { rkey, data } => { 196 + out.push((rkey, data)); 197 + continue; 198 + } 199 + }; 200 + } 201 + 202 + if out.is_empty() { 203 + Ok(None) 204 + } else { 205 + Ok(Some(out)) 206 + } 207 + } 208 + } 209 + 164 210 /// a paritally memory-loaded car file that needs disk spillover to continue 165 211 pub struct BigCar<R: AsyncRead + Unpin, T: Processable> { 166 212 car: CarReader<R>, ··· 204 250 }) 205 251 .await??; 206 252 207 - let (tx, mut rx) = tokio::sync::mpsc::channel::<Vec<(Cid, MaybeProcessedBlock<T>)>>(2); 253 + let (tx, mut rx) = mpsc::channel::<Vec<(Cid, MaybeProcessedBlock<T>)>>(2); 208 254 209 255 let store_worker = tokio::task::spawn_blocking(move || { 210 256 let mut writer = store.get_writer()?; ··· 290 336 pub async fn next_chunk( 291 337 mut self, 292 338 n: usize, 293 - ) -> Result<(Self, Option<Vec<(String, T)>>), DriveError> { 339 + ) -> Result<(Self, Option<BlockChunk<T>>), DriveError> { 294 340 let mut out = Vec::with_capacity(n); 295 341 (self, out) = tokio::task::spawn_blocking(move || { 296 342 let store = self.store; ··· 321 367 } 322 368 } 323 369 324 - pub async fn rx( 370 + fn read_tx_blocking( 325 371 mut self, 326 372 n: usize, 327 - ) -> Result< 328 - ( 329 - tokio::sync::mpsc::Receiver<Vec<(String, T)>>, 330 - tokio::task::JoinHandle<Result<(), DriveError>>, 331 - ), 332 - DriveError, 333 - > { 334 - let (tx, rx) = tokio::sync::mpsc::channel::<Vec<(String, T)>>(1); 373 + tx: mpsc::Sender<Result<BlockChunk<T>, DriveError>>, 374 + ) -> Result<(), mpsc::error::SendError<Result<BlockChunk<T>, DriveError>>> { 375 + let mut reader = match self.store.get_reader() { 376 + Ok(r) => r, 377 + Err(e) => return tx.blocking_send(Err(e.into())), 378 + }; 335 379 336 - // sketch: this worker is going to be allowed to execute without a join handle 337 - // ...should we return the join handle here so the caller at least knows about it? 338 - // yes probably for error handling?? (orrr put errors in the channel) 339 - let worker = tokio::task::spawn_blocking(move || { 340 - let mut reader = self.store.get_reader()?; 380 + loop { 381 + let mut out: BlockChunk<T> = Vec::with_capacity(n); 341 382 342 - loop { 343 - let mut out = Vec::with_capacity(n); 383 + for _ in 0..n { 384 + // walk as far as we can until we run out of blocks or find a record 344 385 345 - for _ in 0..n { 346 - // walk as far as we can until we run out of blocks or find a record 347 - match self.walker.disk_step(&mut reader, self.process)? { 348 - Step::Missing(cid) => return Err(DriveError::MissingBlock(cid)), 349 - Step::Finish => break, 350 - Step::Step { rkey, data } => { 351 - out.push((rkey, data)); 352 - continue; 353 - } 354 - }; 355 - } 386 + let step = match self.walker.disk_step(&mut reader, self.process) { 387 + Ok(s) => s, 388 + Err(e) => return tx.blocking_send(Err(e.into())), 389 + }; 356 390 357 - if out.is_empty() { 358 - break; 359 - } 360 - tx.blocking_send(out) 361 - .map_err(|_| DriveError::ChannelSendError)?; 391 + match step { 392 + Step::Missing(cid) => { 393 + return tx.blocking_send(Err(DriveError::MissingBlock(cid))); 394 + } 395 + Step::Finish => return Ok(()), 396 + Step::Step { rkey, data } => { 397 + out.push((rkey, data)); 398 + continue; 399 + } 400 + }; 362 401 } 363 402 364 - drop(reader); // cannot outlive store 365 - Ok(()) 366 - }); // await later 403 + if out.is_empty() { 404 + break; 405 + } 406 + tx.blocking_send(Ok(out))?; 407 + } 367 408 368 - Ok((rx, worker)) 409 + Ok(()) 369 410 } 370 - } 371 411 372 - /// The core driver between the block stream and MST walker 373 - /// 374 - /// In the future, PDSs will export CARs in a stream-friendly order that will 375 - /// enable processing them with tiny memory overhead. But that future is not 376 - /// here yet. 377 - /// 378 - /// CARs are almost always in a stream-unfriendly order, so I'm reverting the 379 - /// optimistic stream features: we load all block first, then walk the MST. 380 - /// 381 - /// This makes things much simpler: we only need to worry about spilling to disk 382 - /// in one place, and we always have a reasonable expecatation about how much 383 - /// work the init function will do. We can drop the CAR reader before walking, 384 - /// so the sync/async boundaries become a little easier to work around. 385 - #[derive(Debug)] 386 - pub struct MemDriver<T: Processable> { 387 - blocks: HashMap<Cid, MaybeProcessedBlock<T>>, 388 - walker: Walker, 389 - process: fn(Vec<u8>) -> T, 390 - } 412 + pub fn to_channel(self, n: usize) -> mpsc::Receiver<Result<BlockChunk<T>, DriveError>> { 413 + let (tx, rx) = mpsc::channel::<Result<BlockChunk<T>, DriveError>>(1); 391 414 392 - impl<T: Processable> MemDriver<T> { 393 - /// Manually step through the record outputs 394 - pub async fn next_chunk(&mut self, n: usize) -> Result<Option<Vec<(String, T)>>, DriveError> { 395 - let mut out = Vec::with_capacity(n); 396 - for _ in 0..n { 397 - // walk as far as we can until we run out of blocks or find a record 398 - match self.walker.step(&mut self.blocks, self.process)? { 399 - Step::Missing(cid) => return Err(DriveError::MissingBlock(cid)), 400 - Step::Finish => break, 401 - Step::Step { rkey, data } => { 402 - out.push((rkey, data)); 403 - continue; 404 - } 405 - }; 406 - } 415 + // sketch: this worker is going to be allowed to execute without a join handle 416 + tokio::task::spawn_blocking(move || { 417 + if let Err(mpsc::error::SendError(_)) = self.read_tx_blocking(n, tx) { 418 + log::debug!("big car reader exited early due to dropped receiver channel"); 419 + } 420 + }); 407 421 408 - if out.is_empty() { 409 - Ok(None) 410 - } else { 411 - Ok(Some(out)) 412 - } 422 + rx 413 423 } 414 424 }