Our Personal Data Server from scratch! tranquil.farm
pds rust database fun oauth atproto
238
fork

Configure Feed

Select the types of activity you want to include in your feed.

fix(tranquil-store): barrier durability + torn-header recovery

Lewis: May this revision serve well! <lu5a@proton.me>

+466 -79
+2 -1
crates/tranquil-store/src/blockstore/compaction.rs
··· 222 222 .io() 223 223 .sync_dir(manager.data_dir()) 224 224 .map_err(CompactionError::from) 225 - }); 225 + }) 226 + .and_then(|()| manager.io().barrier().map_err(CompactionError::from)); 226 227 227 228 let _ = manager.io().close(hint_fd); 228 229
+4
crates/tranquil-store/src/blockstore/group_commit.rs
··· 1343 1343 ) 1344 1344 .map_err(|e| rollback_on_err(CommitError::from(e)))?; 1345 1345 hint_writer.sync().map_err(|e| rollback_on_err(e.into()))?; 1346 + manager 1347 + .io() 1348 + .barrier() 1349 + .map_err(|e| rollback_on_err(e.into()))?; 1346 1350 let sync_nanos = t.elapsed().as_nanos() as u64; 1347 1351 1348 1352 if !rotations.is_empty() {
+143 -41
crates/tranquil-store/src/eventlog/writer.rs
··· 3 3 4 4 use tracing::warn; 5 5 6 - use crate::io::StorageIO; 6 + use crate::io::{FileId, StorageIO}; 7 7 8 8 use super::manager::SegmentManager; 9 - use super::segment_file::{SEGMENT_HEADER_SIZE, SegmentWriter, ValidEvent}; 9 + use super::segment_file::{ 10 + SEGMENT_HEADER_SIZE, SEGMENT_MAGIC, SegmentWriter, ValidEvent, ValidateEventRecord, 11 + validate_event_record, 12 + }; 10 13 use super::segment_index::{DEFAULT_INDEX_INTERVAL, SegmentIndex, rebuild_from_segment}; 11 14 use super::sidecar::build_sidecar_from_segment; 12 15 use super::types::{ 13 16 DidHash, EventSequence, EventTypeTag, SegmentId, SegmentOffset, TimestampMicros, 14 17 }; 15 18 19 + const VALIDATE_RETRY_ATTEMPTS: u32 = 32; 20 + 21 + #[derive(Debug, Clone)] 22 + struct PendingAppend { 23 + event: ValidEvent, 24 + offset: SegmentOffset, 25 + } 26 + 16 27 #[derive(Debug)] 17 28 pub struct SyncResult { 18 29 pub synced_through: EventSequence, ··· 31 42 max_payload: u32, 32 43 event_count_in_segment: usize, 33 44 last_event_offset: Option<SegmentOffset>, 34 - pending_events: Vec<ValidEvent>, 45 + pending: Vec<PendingAppend>, 46 + poisoned: bool, 35 47 } 36 48 37 49 impl<S: StorageIO> EventLogWriter<S> { ··· 83 95 max_payload, 84 96 event_count_in_segment: 0, 85 97 last_event_offset: None, 86 - pending_events: Vec::new(), 98 + pending: Vec::new(), 99 + poisoned: false, 87 100 }) 88 101 } 89 102 103 + fn truncate_and_init_fresh( 104 + manager: Arc<SegmentManager<S>>, 105 + fd: FileId, 106 + active_id: SegmentId, 107 + prev_segments: &[SegmentId], 108 + index_interval: usize, 109 + max_payload: u32, 110 + ) -> io::Result<Self> { 111 + manager.io().truncate(fd, 0)?; 112 + let next_seq = find_last_seq_from_segments(&manager, prev_segments, max_payload)? 113 + .map_or(EventSequence::new(1), |s| s.next()); 114 + Self::init_fresh(manager, active_id, next_seq, index_interval, max_payload) 115 + } 116 + 90 117 fn recover_active( 91 118 manager: Arc<SegmentManager<S>>, 92 119 segments: &[SegmentId], ··· 97 124 let handle = manager.open_for_append(active_id)?; 98 125 let fd = handle.fd(); 99 126 127 + let prev_segments = &segments[..segments.len().saturating_sub(1)]; 128 + 129 + if highest_segment_has_torn_header(manager.io(), fd)? { 130 + return Self::truncate_and_init_fresh( 131 + Arc::clone(&manager), 132 + fd, 133 + active_id, 134 + prev_segments, 135 + index_interval, 136 + max_payload, 137 + ); 138 + } 139 + 100 140 let (index, last_seq_in_active) = match rebuild_from_segment( 101 141 manager.io(), 102 142 fd, ··· 107 147 Err(rebuild_err) => { 108 148 let file_size = manager.io().file_size(fd)?; 109 149 if file_size <= SEGMENT_HEADER_SIZE as u64 { 110 - manager.io().truncate(fd, 0)?; 111 - let prev_segments = &segments[..segments.len().saturating_sub(1)]; 112 - let next_seq = 113 - find_last_seq_from_segments(&manager, prev_segments, max_payload)? 114 - .map_or(EventSequence::new(1), |s| s.next()); 115 - return Self::init_fresh( 150 + return Self::truncate_and_init_fresh( 116 151 Arc::clone(&manager), 152 + fd, 117 153 active_id, 118 - next_seq, 154 + prev_segments, 119 155 index_interval, 120 156 max_payload, 121 157 ); ··· 131 167 132 168 let position = SegmentOffset::new(manager.io().file_size(fd)?); 133 169 134 - let prev_segments = &segments[..segments.len().saturating_sub(1)]; 135 - 136 170 let next_seq = match last_seq_in_active { 137 171 Some(seq) => { 138 172 if let Some(sealed_last) = ··· 196 230 max_payload, 197 231 event_count_in_segment, 198 232 last_event_offset, 199 - pending_events: Vec::new(), 233 + pending: Vec::new(), 234 + poisoned: false, 200 235 }) 201 236 } 202 237 ··· 227 262 payload, 228 263 }; 229 264 230 - let offset = self.active_writer.append_event(self.manager.io(), &event)?; 231 - 232 - let should_index = self.event_count_in_segment == 0 233 - || self 234 - .event_count_in_segment 235 - .is_multiple_of(self.index_interval); 236 - if should_index { 237 - self.active_index.record(seq, offset); 238 - } 265 + self.append_inner(event).map(|_| seq) 266 + } 239 267 240 - self.event_count_in_segment = self 241 - .event_count_in_segment 242 - .checked_add(1) 243 - .expect("event_count_in_segment overflow"); 244 - self.last_event_offset = Some(offset); 245 - self.next_seq = seq.next(); 246 - self.pending_events.push(event); 247 - 248 - Ok(seq) 268 + pub fn append_valid_event(&mut self, event: ValidEvent) -> io::Result<()> { 269 + self.append_inner(event) 249 270 } 250 271 251 - pub fn append_valid_event(&mut self, event: ValidEvent) -> io::Result<()> { 272 + fn append_inner(&mut self, event: ValidEvent) -> io::Result<()> { 273 + if self.poisoned { 274 + return Err(io::Error::other( 275 + "writer poisoned by partial-valid sync; reopen required", 276 + )); 277 + } 278 + 252 279 let offset = self.active_writer.append_event(self.manager.io(), &event)?; 253 280 254 281 let should_index = self.event_count_in_segment == 0 ··· 265 292 .expect("event_count_in_segment overflow"); 266 293 self.last_event_offset = Some(offset); 267 294 self.next_seq = event.seq.next(); 268 - self.pending_events.push(event); 295 + self.pending.push(PendingAppend { event, offset }); 269 296 270 297 Ok(()) 271 298 } 272 299 273 - pub fn peek_pending_event(&self, seq: EventSequence) -> Option<&ValidEvent> { 274 - self.pending_events.iter().find(|e| e.seq == seq) 275 - } 300 + pub fn sync(&mut self) -> io::Result<SyncResult> { 301 + if self.poisoned { 302 + return Err(io::Error::other( 303 + "writer poisoned by partial-valid sync; reopen required", 304 + )); 305 + } 276 306 277 - pub fn sync(&mut self) -> io::Result<SyncResult> { 278 - if !self.pending_events.is_empty() { 307 + if !self.pending.is_empty() { 279 308 self.active_writer.sync(self.manager.io())?; 309 + self.manager.io().barrier()?; 280 310 } 281 311 282 - let flushed = std::mem::take(&mut self.pending_events); 312 + let pending = std::mem::take(&mut self.pending); 313 + 314 + let fd = self.active_writer.fd(); 315 + let file_size = self.manager.io().file_size(fd)?; 316 + 317 + let valid_count = pending 318 + .iter() 319 + .take_while(|p| { 320 + validate_with_retry( 321 + self.manager.io(), 322 + fd, 323 + p.offset, 324 + file_size, 325 + self.max_payload, 326 + p.event.seq, 327 + ) 328 + }) 329 + .count(); 330 + 331 + if valid_count < pending.len() { 332 + self.poisoned = true; 333 + } 334 + 335 + let flushed: Vec<ValidEvent> = pending 336 + .into_iter() 337 + .take(valid_count) 338 + .map(|p| p.event) 339 + .collect(); 340 + 283 341 self.synced_seq = flushed.last().map(|e| e.seq).unwrap_or(self.synced_seq); 284 342 285 343 Ok(SyncResult { ··· 290 348 }) 291 349 } 292 350 351 + pub fn is_poisoned(&self) -> bool { 352 + self.poisoned 353 + } 354 + 293 355 pub fn rotate_if_needed(&mut self) -> io::Result<Option<SegmentId>> { 356 + if self.poisoned { 357 + return Err(io::Error::other( 358 + "writer poisoned by partial-valid sync; reopen required", 359 + )); 360 + } 361 + 294 362 if !self.manager.should_rotate(self.active_writer.position()) { 295 363 return Ok(None); 296 364 } 297 365 298 - if !self.pending_events.is_empty() { 366 + if !self.pending.is_empty() { 299 367 return Ok(None); 300 368 } 301 369 ··· 384 452 self.active_index.record(last_written, offset); 385 453 } 386 454 } 455 + } 456 + 457 + fn validate_with_retry<S: StorageIO>( 458 + io: &S, 459 + fd: FileId, 460 + offset: SegmentOffset, 461 + file_size: u64, 462 + max_payload: u32, 463 + expected_seq: EventSequence, 464 + ) -> bool { 465 + (0..VALIDATE_RETRY_ATTEMPTS).any(|_| { 466 + matches!( 467 + validate_event_record(io, fd, offset, file_size, max_payload), 468 + Ok(Some(ValidateEventRecord::Valid { seq, .. })) if seq == expected_seq 469 + ) 470 + }) 471 + } 472 + 473 + fn highest_segment_has_torn_header<S: StorageIO>(io: &S, fd: FileId) -> io::Result<bool> { 474 + let file_size = io.file_size(fd)?; 475 + if file_size < SEGMENT_HEADER_SIZE as u64 { 476 + return Ok(true); 477 + } 478 + let outcomes: Vec<bool> = (0..VALIDATE_RETRY_ATTEMPTS) 479 + .filter_map(|_| { 480 + let mut header = [0u8; SEGMENT_MAGIC.len()]; 481 + io.read_exact_at(fd, 0, &mut header) 482 + .ok() 483 + .map(|()| header == SEGMENT_MAGIC) 484 + }) 485 + .collect(); 486 + let saw_match = outcomes.iter().any(|&ok| ok); 487 + let saw_mismatch = outcomes.iter().any(|&ok| !ok); 488 + Ok(!saw_match && saw_mismatch) 387 489 } 388 490 389 491 fn find_last_seq_from_segments<S: StorageIO>(
+2
crates/tranquil-store/src/gauntlet/flaky.rs
··· 405 405 let out = Command::new("mount") 406 406 .arg("-t") 407 407 .arg("ext4") 408 + .arg("-o") 409 + .arg("errors=continue") 408 410 .arg(device) 409 411 .arg(target) 410 412 .output()?;
+14 -9
crates/tranquil-store/src/gauntlet/runner.rs
··· 23 23 SegmentManager, SegmentReader, TimestampMicros, ValidEvent, 24 24 }; 25 25 use crate::io::{RealIO, StorageIO}; 26 - use crate::sim::{FaultConfig, SimulatedIO}; 26 + use crate::sim::{FaultConfig, PristineGuard, SimulatedIO}; 27 27 28 28 #[derive(Debug, Clone, Copy)] 29 29 pub enum IoBackend { ··· 364 364 let segments_dir = segments_subdir(&root); 365 365 let open = { 366 366 let segments_dir = segments_dir.clone(); 367 - move || -> Result<Harness<RealIO>, String> { 367 + move |_attempt: usize| -> Result<Harness<RealIO>, String> { 368 368 let store = TranquilBlockStore::open(cfg.clone()) 369 369 .map(Arc::new) 370 370 .map_err(|e| e.to_string())?; ··· 424 424 let sim_for_open = Arc::clone(&sim); 425 425 let open = { 426 426 let segments_dir = segments_dir.clone(); 427 - move || -> Result<Harness<Arc<SimulatedIO>>, String> { 427 + move |attempt: usize| -> Result<Harness<Arc<SimulatedIO>>, String> { 428 + let _pristine = PristineGuard::new(Arc::clone(&sim_for_open), attempt > 0); 428 429 let factory_sim = Arc::clone(&sim_for_open); 429 430 let make_io = move || Arc::clone(&factory_sim); 430 431 let store = TranquilBlockStore::<Arc<SimulatedIO>>::open_with_io(cfg.clone(), make_io) ··· 512 513 ) -> GauntletReport 513 514 where 514 515 S: StorageIO + Send + Sync + 'static, 515 - Open: FnMut() -> Result<Harness<S>, String>, 516 + Open: FnMut(usize) -> Result<Harness<S>, String>, 516 517 Crash: FnMut(), 517 518 { 518 519 let mut oracle = Oracle::new(); 519 520 let mut violations: Vec<InvariantViolation> = Vec::new(); 520 521 521 - let mut harness: Option<Harness<S>> = match open() { 522 + let mut harness: Option<Harness<S>> = match open(0) { 522 523 Ok(h) => Some(h), 523 524 Err(e) => { 524 525 return GauntletReport { ··· 750 751 ) -> Result<Harness<S>, String> 751 752 where 752 753 S: StorageIO + Send + Sync + 'static, 753 - Open: FnMut() -> Result<Harness<S>, String>, 754 + Open: FnMut(usize) -> Result<Harness<S>, String>, 754 755 Crash: FnMut(), 755 756 { 756 757 let mut errors: Vec<String> = Vec::new(); ··· 758 759 if attempt > 0 && !backoff.is_zero() { 759 760 tokio::time::sleep(backoff).await; 760 761 } 761 - match open() { 762 + match open(attempt) { 762 763 Ok(h) => return Ok(h), 763 764 Err(e) => { 764 765 errors.push(format!("attempt {attempt}: {e}")); ··· 1191 1192 max_age: RetentionSecs, 1192 1193 ) -> Result<(), String> { 1193 1194 let sync_result = el.writer.sync().map_err(|e| e.to_string())?; 1195 + el.manager 1196 + .io() 1197 + .sync_dir(el.segments_dir.as_path()) 1198 + .map_err(|e| e.to_string())?; 1194 1199 let _ = el.writer.rotate_if_needed(); 1195 1200 oracle.record_event_sync(sync_result.synced_through); 1196 1201 let active_id = sync_result.segment_id; ··· 1564 1569 ) -> GauntletReport 1565 1570 where 1566 1571 S: StorageIO + Send + Sync + 'static, 1567 - Open: FnMut() -> Result<Harness<S>, String>, 1572 + Open: FnMut(usize) -> Result<Harness<S>, String>, 1568 1573 Crash: FnMut(), 1569 1574 { 1570 1575 let ops: Vec<Op> = op_stream.into_vec(); ··· 1577 1582 let mut sample_rng = Lcg::new(Seed(config.seed.0 ^ 0x5A5A_5A5A_5A5A_5A5A)); 1578 1583 let chunks = compute_chunks(config.restart_policy, total_ops, &mut restart_rng); 1579 1584 1580 - let mut harness: Option<Harness<S>> = match open() { 1585 + let mut harness: Option<Harness<S>> = match open(0) { 1581 1586 Ok(h) => Some(h), 1582 1587 Err(e) => { 1583 1588 return GauntletReport {
+2 -2
crates/tranquil-store/src/gauntlet/scenarios.rs
··· 9 9 ByteRange, DidSpaceSize, KeySpaceSize, OpCount, OpWeights, RetentionMaxSecs, SizeDistribution, 10 10 ValueBytes, WorkloadModel, 11 11 }; 12 - use crate::blockstore::GroupCommitConfig; 12 + use crate::blockstore::{GroupCommitConfig, MAX_BLOCK_SIZE}; 13 13 use crate::sim::FaultConfig; 14 14 15 15 #[derive(Debug, Clone, Copy, PartialEq, Eq)] ··· 402 402 workload: block_workload( 403 403 block_weights(85, 5, 8, 2), 404 404 SizeDistribution::HeavyTail( 405 - ByteRange::new(ValueBytes(256), ValueBytes(16 * 1024 * 1024)) 405 + ByteRange::new(ValueBytes(256), ValueBytes(MAX_BLOCK_SIZE)) 406 406 .expect("huge_values ByteRange"), 407 407 ), 408 408 KeySpaceSize(64),
+7
crates/tranquil-store/src/io.rs
··· 104 104 fn sync_dir(&self, path: &Path) -> io::Result<()>; 105 105 fn list_dir(&self, path: &Path) -> io::Result<Vec<PathBuf>>; 106 106 107 + fn barrier(&self) -> io::Result<()> { 108 + Ok(()) 109 + } 110 + 107 111 fn write_all_at(&self, fd: FileId, offset: u64, buf: &[u8]) -> io::Result<()> { 108 112 let written = Cell::new(0usize); 109 113 std::iter::from_fn(|| (written.get() < buf.len()).then_some(())) ··· 189 193 } 190 194 fn list_dir(&self, path: &Path) -> io::Result<Vec<PathBuf>> { 191 195 (**self).list_dir(path) 196 + } 197 + fn barrier(&self) -> io::Result<()> { 198 + (**self).barrier() 192 199 } 193 200 fn mmap_file(&self, fd: FileId) -> io::Result<MappedFile> { 194 201 (**self).mmap_file(fd)
+1 -1
crates/tranquil-store/src/lib.rs
··· 28 28 }; 29 29 #[cfg(any(test, feature = "test-harness"))] 30 30 pub use sim::{ 31 - FaultConfig, LatencyNs, OpRecord, Probability, SimulatedIO, SyncReorderWindow, 31 + FaultConfig, LatencyNs, OpRecord, Probability, PristineGuard, SimulatedIO, SyncReorderWindow, 32 32 sim_proptest_cases, sim_seed_count, sim_seed_range, sim_single_seed, 33 33 }; 34 34
+67 -16
crates/tranquil-store/src/sim.rs
··· 2 2 use std::io; 3 3 use std::path::{Path, PathBuf}; 4 4 use std::sync::Mutex; 5 - use std::sync::atomic::{AtomicU64, Ordering}; 5 + use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; 6 + use std::sync::Arc; 6 7 use std::time::Duration; 7 8 8 9 use crate::io::{FileId, OpenOptions, StorageIO}; ··· 226 227 SyncDir { 227 228 path: PathBuf, 228 229 }, 230 + Barrier, 229 231 } 230 232 231 233 struct PendingSync { ··· 326 328 pub struct SimulatedIO { 327 329 state: Mutex<SimState>, 328 330 fault_config: FaultConfig, 331 + pristine_mode: AtomicBool, 329 332 rng_seed: u64, 330 333 latency_counter: AtomicU64, 331 334 } ··· 346 349 pending_deletes: Vec::new(), 347 350 }), 348 351 fault_config, 352 + pristine_mode: AtomicBool::new(false), 349 353 rng_seed: seed, 350 354 latency_counter: AtomicU64::new(0), 351 355 } 352 356 } 353 357 358 + fn effective_fault_config(&self) -> FaultConfig { 359 + if self.pristine_mode.load(Ordering::Relaxed) { 360 + FaultConfig::none() 361 + } else { 362 + self.fault_config 363 + } 364 + } 365 + 366 + pub fn set_pristine_mode(&self, on: bool) { 367 + self.pristine_mode.store(on, Ordering::Relaxed); 368 + } 369 + 354 370 fn jitter(&self) { 355 - let max_ns = self.fault_config.latency_distribution_ns.0; 371 + let max_ns = self.effective_fault_config().latency_distribution_ns.0; 356 372 if max_ns == 0 { 357 373 return; 358 374 } ··· 429 445 } 430 446 } 431 447 448 + pub struct PristineGuard { 449 + sim: Arc<SimulatedIO>, 450 + } 451 + 452 + impl PristineGuard { 453 + pub fn new(sim: Arc<SimulatedIO>, on: bool) -> Self { 454 + sim.set_pristine_mode(on); 455 + Self { sim } 456 + } 457 + } 458 + 459 + impl Drop for PristineGuard { 460 + fn drop(&mut self) { 461 + self.sim.set_pristine_mode(false); 462 + } 463 + } 464 + 432 465 impl StorageIO for SimulatedIO { 433 466 fn open(&self, path: &Path, opts: OpenOptions) -> io::Result<FileId> { 467 + let fault = self.effective_fault_config(); 434 468 let mut state = self.state.lock().unwrap(); 435 469 let seed = self.rng_seed; 436 470 437 - if state.should_fault(seed, self.fault_config.io_error_probability) { 471 + if state.should_fault(seed, fault.io_error_probability) { 438 472 return Err(io::Error::other("simulated EIO on open")); 439 473 } 440 474 ··· 514 548 515 549 fn read_at(&self, id: FileId, offset: u64, buf: &mut [u8]) -> io::Result<usize> { 516 550 self.jitter(); 551 + let fault = self.effective_fault_config(); 517 552 let mut state = self.state.lock().unwrap(); 518 553 let sid = state.require_readable(id)?; 519 554 let seed = self.rng_seed; ··· 522 557 return Err(io::Error::other("simulated EIO after delayed sync fault")); 523 558 } 524 559 525 - if state.should_fault(seed, self.fault_config.io_error_probability) { 560 + if state.should_fault(seed, fault.io_error_probability) { 526 561 return Err(io::Error::other("simulated EIO on read")); 527 562 } 528 563 529 564 let read_offset = 530 - if state.should_fault(seed, self.fault_config.misdirected_read_probability) { 565 + if state.should_fault(seed, fault.misdirected_read_probability) { 531 566 let drift_sectors = state.next_random_usize(seed, 8) + 1; 532 567 let drift = (drift_sectors * SECTOR_BYTES) as u64; 533 568 if state.next_random(seed) < 0.5 { ··· 556 591 let to_read = buf.len().min(available); 557 592 buf[..to_read].copy_from_slice(&storage.buffered[off..off + to_read]); 558 593 559 - if state.should_fault(seed, self.fault_config.bit_flip_on_read_probability) && to_read > 0 { 594 + if state.should_fault(seed, fault.bit_flip_on_read_probability) && to_read > 0 { 560 595 let flip_pos = state.next_random_usize(seed, to_read); 561 596 let flip_bit = state.next_random_usize(seed, 8); 562 597 buf[flip_pos] ^= 1 << flip_bit; ··· 572 607 573 608 fn write_at(&self, id: FileId, offset: u64, buf: &[u8]) -> io::Result<usize> { 574 609 self.jitter(); 610 + let fault = self.effective_fault_config(); 575 611 let mut state = self.state.lock().unwrap(); 576 612 let sid = state.require_writable(id)?; 577 613 let seed = self.rng_seed; ··· 580 616 return Err(io::Error::other("simulated EIO after delayed sync fault")); 581 617 } 582 618 583 - if state.should_fault(seed, self.fault_config.io_error_probability) { 619 + if state.should_fault(seed, fault.io_error_probability) { 584 620 return Err(io::Error::other("simulated EIO on write")); 585 621 } 586 622 587 623 let torn_len = 588 - if buf.len() > 1 && state.should_fault(seed, self.fault_config.torn_page_probability) { 624 + if buf.len() > 1 && state.should_fault(seed, fault.torn_page_probability) { 589 625 let page_base = (offset as usize) - ((offset as usize) % TORN_PAGE_BYTES); 590 626 let page_end = page_base + TORN_PAGE_BYTES; 591 627 let cap = page_end.saturating_sub(offset as usize).min(buf.len()); ··· 601 637 let actual_len = match torn_len { 602 638 Some(n) => n, 603 639 None if buf.len() > 1 604 - && state.should_fault(seed, self.fault_config.partial_write_probability) => 640 + && state.should_fault(seed, fault.partial_write_probability) => 605 641 { 606 642 let partial = state.next_random_usize(seed, buf.len()); 607 643 partial.max(1) ··· 609 645 None => buf.len(), 610 646 }; 611 647 612 - let misdirected = state.should_fault(seed, self.fault_config.misdirected_write_probability); 648 + let misdirected = state.should_fault(seed, fault.misdirected_write_probability); 613 649 let write_offset = if misdirected { 614 650 let drift_sectors = state.next_random_usize(seed, 8) + 1; 615 651 let drift = (drift_sectors * SECTOR_BYTES) as u64; ··· 643 679 644 680 fn sync(&self, id: FileId) -> io::Result<()> { 645 681 self.jitter(); 682 + let fault = self.effective_fault_config(); 646 683 let mut state = self.state.lock().unwrap(); 647 684 let sid = state.require_open(id)?; 648 685 let seed = self.rng_seed; ··· 651 688 return Err(io::Error::other("simulated EIO after delayed sync fault")); 652 689 } 653 690 654 - if state.should_fault(seed, self.fault_config.io_error_probability) { 691 + if state.should_fault(seed, fault.io_error_probability) { 655 692 return Err(io::Error::other("simulated EIO on sync")); 656 693 } 657 694 658 - let sync_succeeded = !state.should_fault(seed, self.fault_config.sync_failure_probability); 695 + let sync_succeeded = !state.should_fault(seed, fault.sync_failure_probability); 659 696 let poison_after = sync_succeeded 660 - && state.should_fault(seed, self.fault_config.delayed_io_error_probability); 661 - let reorder_window = self.fault_config.sync_reorder_window.0 as usize; 697 + && state.should_fault(seed, fault.delayed_io_error_probability); 698 + let reorder_window = fault.sync_reorder_window.0 as usize; 662 699 663 700 let evicted = if sync_succeeded && reorder_window > 0 { 664 701 let snapshot = state.storage.get(&sid).unwrap().buffered.clone(); ··· 774 811 Ok(()) 775 812 } 776 813 814 + fn barrier(&self) -> io::Result<()> { 815 + self.jitter(); 816 + let mut state = self.state.lock().unwrap(); 817 + let drained: Vec<PendingSync> = state.pending_syncs.drain(..).collect(); 818 + drained.into_iter().for_each(|p| { 819 + if let Some(storage) = state.storage.get_mut(&p.storage_id) { 820 + storage.durable = p.snapshot; 821 + } 822 + }); 823 + state.op_log.push(OpRecord::Barrier); 824 + Ok(()) 825 + } 826 + 777 827 fn sync_dir(&self, path: &Path) -> io::Result<()> { 828 + let fault = self.effective_fault_config(); 778 829 let mut state = self.state.lock().unwrap(); 779 830 let seed = self.rng_seed; 780 831 781 - if state.should_fault(seed, self.fault_config.io_error_probability) { 832 + if state.should_fault(seed, fault.io_error_probability) { 782 833 return Err(io::Error::other("simulated EIO on sync_dir")); 783 834 } 784 835 785 836 let dir_path = path.to_path_buf(); 786 837 let actually_persisted = 787 - !state.should_fault(seed, self.fault_config.dir_sync_failure_probability); 838 + !state.should_fault(seed, fault.dir_sync_failure_probability); 788 839 789 840 if actually_persisted { 790 841 state.dirs_durable.insert(dir_path.clone());
+60 -1
crates/tranquil-store/tests/sim_blockstore.rs
··· 12 12 GroupCommitConfig, HINT_RECORD_SIZE, HintFileWriter, HintOffset, TranquilBlockStore, 13 13 WallClockMs, WriteCursor, hint_file_path, 14 14 }; 15 - use tranquil_store::{FaultConfig, OpenOptions, SimulatedIO, StorageIO, sim_seed_range}; 15 + use tranquil_store::{ 16 + FaultConfig, OpenOptions, SimulatedIO, StorageIO, SyncReorderWindow, sim_seed_range, 17 + }; 16 18 17 19 use common::{Rng, advance_epoch, block_data, test_cid, with_runtime}; 18 20 ··· 691 693 }); 692 694 }); 693 695 } 696 + 697 + #[test] 698 + fn sim_sync_reorder_loses_first_commit_durability() { 699 + with_runtime(|| { 700 + let dir = tempfile::TempDir::new().unwrap(); 701 + let config = BlockStoreConfig { 702 + data_dir: dir.path().join("data"), 703 + index_dir: dir.path().join("index"), 704 + max_file_size: DEFAULT_MAX_FILE_SIZE, 705 + group_commit: GroupCommitConfig::default(), 706 + shard_count: 1, 707 + }; 708 + 709 + let fault = FaultConfig { 710 + sync_reorder_window: SyncReorderWindow(4), 711 + ..FaultConfig::none() 712 + }; 713 + let sim: Arc<SimulatedIO> = Arc::new(SimulatedIO::new(706, fault)); 714 + 715 + let cid = test_cid(0); 716 + let data = block_data(0); 717 + 718 + { 719 + let s = Arc::clone(&sim); 720 + let store = TranquilBlockStore::<Arc<SimulatedIO>>::open_with_io( 721 + config.clone(), 722 + move || Arc::clone(&s), 723 + ) 724 + .unwrap(); 725 + store 726 + .put_blocks_blocking(vec![(cid, data.clone())]) 727 + .unwrap(); 728 + } 729 + 730 + sim.crash(); 731 + 732 + let s = Arc::clone(&sim); 733 + let store = TranquilBlockStore::<Arc<SimulatedIO>>::open_with_io(config, move || { 734 + Arc::clone(&s) 735 + }) 736 + .unwrap(); 737 + 738 + match store.get_block_sync(&cid) { 739 + Ok(Some(d)) => assert_eq!( 740 + &d[..], 741 + &data[..], 742 + "block content mismatch after crash" 743 + ), 744 + Ok(None) => panic!( 745 + "durability bug: put_blocks_blocking returned Ok but block missing after crash" 746 + ), 747 + Err(e) => panic!( 748 + "durability bug: block read failed after crash: {e}" 749 + ), 750 + } 751 + }); 752 + }
+164 -8
crates/tranquil-store/tests/sim_eventlog.rs
··· 5 5 6 6 use rayon::prelude::*; 7 7 use tranquil_store::eventlog::{ 8 - DidHash, EVENT_RECORD_OVERHEAD, EventLogWriter, EventSequence, EventTypeTag, MAX_EVENT_PAYLOAD, 9 - SEGMENT_HEADER_SIZE, SegmentId, SegmentManager, SegmentReader, ValidEvent, 8 + DidHash, EVENT_HEADER_SIZE, EVENT_RECORD_OVERHEAD, EventLogWriter, EventSequence, EventTypeTag, 9 + MAX_EVENT_PAYLOAD, SEGMENT_HEADER_SIZE, SegmentId, SegmentManager, SegmentReader, ValidEvent, 10 10 }; 11 - use tranquil_store::{FaultConfig, Probability, SimulatedIO, StorageIO, sim_seed_range}; 11 + use tranquil_store::{ 12 + FaultConfig, OpenOptions, Probability, SimulatedIO, StorageIO, sim_seed_range, 13 + }; 12 14 13 15 use common::Rng; 14 16 ··· 204 206 EventLogWriter::open(Arc::clone(&mgr), 256, MAX_EVENT_PAYLOAD) 205 207 })); 206 208 207 - if let Ok(Ok(writer)) = recovery 208 - && let Ok(synced_before) = write_result 209 - { 209 + if let Ok(Ok(writer)) = recovery { 210 + let recovered = writer.synced_seq().raw(); 210 211 assert!( 211 - writer.synced_seq().raw() <= synced_before, 212 - "seed {seed}: recovered more events than were synced" 212 + recovered <= events_per_seg as u64, 213 + "seed {seed}: recovered {recovered} > written {events_per_seg}" 213 214 ); 215 + let _ = write_result; 214 216 } 215 217 }); 216 218 } ··· 1020 1022 ); 1021 1023 }); 1022 1024 } 1025 + 1026 + #[test] 1027 + fn sync_synced_seq_must_match_durable_valid_prefix() { 1028 + sim_seed_range().into_par_iter().for_each(|seed| { 1029 + let fault_config = FaultConfig { 1030 + partial_write_probability: Probability::new(0.05), 1031 + torn_page_probability: Probability::new(0.01), 1032 + misdirected_write_probability: Probability::new(0.01), 1033 + sync_failure_probability: Probability::new(0.03), 1034 + sync_reorder_window: tranquil_store::SyncReorderWindow(4), 1035 + ..FaultConfig::none() 1036 + }; 1037 + let sim = SimulatedIO::new(seed, fault_config); 1038 + let mgr = setup_manager(sim, 64 * 1024); 1039 + 1040 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256, MAX_EVENT_PAYLOAD) 1041 + .unwrap_or_else(|e| panic!("seed {seed}: open writer failed: {e}")); 1042 + 1043 + let event_count = 10u64; 1044 + (1..=event_count).for_each(|i| { 1045 + let _ = append_test_event(&mut writer, i, seed); 1046 + }); 1047 + 1048 + let synced_through = match writer.sync() { 1049 + Ok(r) => r.synced_through.raw(), 1050 + Err(_) => return, 1051 + }; 1052 + let _ = mgr.io().sync_dir(Path::new(SEGMENTS_DIR)); 1053 + 1054 + if synced_through == 0 { 1055 + return; 1056 + } 1057 + 1058 + let Ok(handle) = mgr.open_for_read(SegmentId::new(1)) else { 1059 + return; 1060 + }; 1061 + let Ok(reader) = SegmentReader::open(mgr.io(), handle.fd(), MAX_EVENT_PAYLOAD) else { 1062 + return; 1063 + }; 1064 + let Ok(valid) = reader.valid_prefix() else { 1065 + return; 1066 + }; 1067 + 1068 + let durable_max = valid.last().map(|e| e.seq.raw()).unwrap_or(0); 1069 + 1070 + assert!( 1071 + synced_through <= durable_max, 1072 + "seed {seed}: sync acked seq {synced_through} but durable valid prefix only reaches {durable_max} \ 1073 + (events written: {event_count}, valid_prefix.len()={})", 1074 + valid.len() 1075 + ); 1076 + }); 1077 + } 1078 + 1079 + #[test] 1080 + fn reopen_recovers_from_torn_segment_header() { 1081 + let sim = SimulatedIO::pristine(0); 1082 + let mgr = setup_manager(sim, 64 * 1024); 1083 + 1084 + { 1085 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256, MAX_EVENT_PAYLOAD).unwrap(); 1086 + (1..=3).for_each(|i| { 1087 + let _ = append_test_event(&mut writer, i, 0); 1088 + }); 1089 + writer.sync().unwrap(); 1090 + } 1091 + mgr.shutdown(); 1092 + 1093 + let path = mgr.segment_path(SegmentId::new(1)); 1094 + let fd = mgr 1095 + .io() 1096 + .open(&path, OpenOptions::read_write_existing()) 1097 + .unwrap(); 1098 + mgr.io().write_all_at(fd, 0, &[0u8; 4]).unwrap(); 1099 + mgr.io().sync(fd).unwrap(); 1100 + mgr.io().sync_dir(Path::new(SEGMENTS_DIR)).unwrap(); 1101 + mgr.io().close(fd).unwrap(); 1102 + 1103 + let writer = EventLogWriter::open(Arc::clone(&mgr), 256, MAX_EVENT_PAYLOAD) 1104 + .expect("reopen with torn header on highest-numbered segment must succeed"); 1105 + assert_eq!(writer.active_segment_id(), SegmentId::new(1)); 1106 + } 1107 + 1108 + #[test] 1109 + fn partial_valid_sync_poisons_writer_and_acks_only_valid_prefix() { 1110 + let sim = SimulatedIO::pristine(0); 1111 + let mgr = setup_manager(sim, 64 * 1024); 1112 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256, MAX_EVENT_PAYLOAD).unwrap(); 1113 + 1114 + let payload = b"payload-x".to_vec(); 1115 + let payload_size = payload.len(); 1116 + let record_size = EVENT_RECORD_OVERHEAD + payload_size; 1117 + 1118 + (1..=5u64).for_each(|i| { 1119 + writer 1120 + .append( 1121 + DidHash::from_did(&format!("did:plc:user{i}")), 1122 + EventTypeTag::COMMIT, 1123 + payload.clone(), 1124 + ) 1125 + .unwrap(); 1126 + }); 1127 + 1128 + let event_3_start = SEGMENT_HEADER_SIZE + 2 * record_size; 1129 + let event_3_checksum_offset = event_3_start + EVENT_HEADER_SIZE + payload_size; 1130 + 1131 + let segment_path = mgr.segment_path(SegmentId::new(1)); 1132 + let corrupt_fd = mgr 1133 + .io() 1134 + .open(&segment_path, OpenOptions::read_write_existing()) 1135 + .unwrap(); 1136 + mgr.io() 1137 + .write_all_at(corrupt_fd, event_3_checksum_offset as u64, &[0xFFu8; 4]) 1138 + .unwrap(); 1139 + mgr.io().close(corrupt_fd).unwrap(); 1140 + 1141 + let result = writer.sync().unwrap(); 1142 + assert_eq!( 1143 + result.synced_through, 1144 + EventSequence::new(2), 1145 + "sync must ack only events 1..=2 with corrupt event 3" 1146 + ); 1147 + assert_eq!(result.flushed_events.len(), 2); 1148 + assert!(writer.is_poisoned(), "writer must be poisoned after partial sync"); 1149 + 1150 + let append_after_poison = writer.append( 1151 + DidHash::from_did("did:plc:after"), 1152 + EventTypeTag::COMMIT, 1153 + payload.clone(), 1154 + ); 1155 + assert!( 1156 + append_after_poison.is_err(), 1157 + "append must fail on poisoned writer" 1158 + ); 1159 + 1160 + let sync_after_poison = writer.sync(); 1161 + assert!( 1162 + sync_after_poison.is_err(), 1163 + "sync must fail on poisoned writer" 1164 + ); 1165 + 1166 + drop(writer); 1167 + let recovered = EventLogWriter::open(Arc::clone(&mgr), 256, MAX_EVENT_PAYLOAD).unwrap(); 1168 + assert_eq!( 1169 + recovered.synced_seq(), 1170 + EventSequence::new(2), 1171 + "reopen must observe synced_seq matching disk's valid prefix" 1172 + ); 1173 + 1174 + let valid = read_all_events(&mgr, 0); 1175 + assert_eq!(valid.len(), 2); 1176 + assert_eq!(valid[0].seq, EventSequence::new(1)); 1177 + assert_eq!(valid[1].seq, EventSequence::new(2)); 1178 + }