Our Personal Data Server from scratch!
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

feat(tranquil-store): sweep subcommand with axis override fan-out

Lewis: May this revision serve well! <lu5a@proton.me>

+643 -75
+403 -13
crates/tranquil-store/src/bin/tranquil_gauntlet.rs
··· 15 15 }; 16 16 17 17 const MAX_HOURS: f64 = 1.0e6; 18 + const DEFAULT_SWEEP_RUN_CAP: u64 = 10_000; 18 19 19 20 /// Deterministic storage-engine gauntlet: scenario fuzzing, shrinking, regression replay. 20 21 /// ··· 71 72 #[arg(long, default_value_t = DEFAULT_MAX_SHRINK_ITERATIONS, conflicts_with = "no_shrink")] 72 73 shrink_budget: usize, 73 74 }, 75 + /// Fan out a scenario across the cartesian product of declared axes. 76 + /// 77 + /// Reads a Toml config with [axes] lists (writer_concurrency, key_space, value_bytes, 78 + /// fault_density_scale, fault_density_uniform, restart_every_n_ops, commit_batch_size, 79 + /// max_file_size). For each combination, runs --seeds seeds and emits one NDjson record 80 + /// per (combo, seed). 81 + Sweep { 82 + /// Toml config declaring scenario & axes. 83 + #[arg(long)] 84 + config: PathBuf, 85 + 86 + /// First seed in the batch range. Default 0. 87 + #[arg(long)] 88 + seed_start: Option<u64>, 89 + 90 + /// Seeds per axis combination. Default 8. Must be > 0. 91 + #[arg(long)] 92 + seeds: Option<u64>, 93 + 94 + /// Directory to dump regression Json on failure. 95 + #[arg(long)] 96 + dump_regressions: Option<PathBuf>, 97 + 98 + /// Skip shrinking when dumping regressions. 99 + #[arg(long)] 100 + no_shrink: bool, 101 + 102 + /// Max shrink attempts per failing seed. 103 + #[arg(long, default_value_t = DEFAULT_MAX_SHRINK_ITERATIONS, conflicts_with = "no_shrink")] 104 + shrink_budget: usize, 105 + 106 + /// Hard cap on total (combinations x seeds). Default 10_000. Set 0 to disable. 107 + #[arg(long, default_value_t = DEFAULT_SWEEP_RUN_CAP)] 108 + max_runs: u64, 109 + }, 74 110 /// Replay a single seed or a saved regression file. 75 111 /// 76 112 /// With --from, replays a regression Json produced by `farm --dump-regressions`. 
··· 131 167 toml::from_str(&raw).map_err(|e| format!("parse {}: {e}", path.display())) 132 168 } 133 169 170 + #[derive(Debug, Deserialize)] 171 + #[serde(deny_unknown_fields)] 172 + struct SweepConfigFile { 173 + scenario: Scenario, 174 + #[serde(default)] 175 + seed_start: Option<u64>, 176 + #[serde(default)] 177 + seeds: Option<u64>, 178 + #[serde(default)] 179 + dump_regressions: Option<PathBuf>, 180 + #[serde(default)] 181 + base_overrides: ConfigOverrides, 182 + #[serde(default)] 183 + axes: SweepAxes, 184 + } 185 + 186 + #[derive(Debug, Default, Deserialize)] 187 + #[serde(deny_unknown_fields)] 188 + struct SweepAxes { 189 + #[serde(default)] 190 + writer_concurrency: Vec<usize>, 191 + #[serde(default)] 192 + key_space: Vec<u32>, 193 + #[serde(default)] 194 + value_bytes: Vec<u32>, 195 + #[serde(default)] 196 + fault_density_scale: Vec<f64>, 197 + #[serde(default)] 198 + fault_density_uniform: Vec<f64>, 199 + #[serde(default)] 200 + restart_every_n_ops: Vec<usize>, 201 + #[serde(default)] 202 + commit_batch_size: Vec<usize>, 203 + #[serde(default)] 204 + max_file_size: Vec<u64>, 205 + } 206 + 207 + #[derive(Debug, Clone, Copy, Default)] 208 + struct SweepAxisValues { 209 + writer_concurrency: Option<usize>, 210 + key_space: Option<u32>, 211 + value_bytes: Option<u32>, 212 + fault_density_scale: Option<f64>, 213 + fault_density_uniform: Option<f64>, 214 + restart_every_n_ops: Option<usize>, 215 + commit_batch_size: Option<usize>, 216 + max_file_size: Option<u64>, 217 + } 218 + 219 + impl SweepAxisValues { 220 + fn apply_to(self, o: &mut ConfigOverrides) { 221 + if let Some(v) = self.writer_concurrency { 222 + o.writer_concurrency = Some(v); 223 + } 224 + if let Some(v) = self.key_space { 225 + o.key_space = Some(v); 226 + } 227 + if let Some(v) = self.value_bytes { 228 + o.value_bytes = Some(v); 229 + } 230 + if let Some(v) = self.fault_density_scale { 231 + o.fault_density_scale = Some(v); 232 + } 233 + if let Some(v) = self.fault_density_uniform { 234 + 
o.fault_density_uniform = Some(v); 235 + } 236 + if let Some(v) = self.restart_every_n_ops { 237 + o.restart_every_n_ops = Some(v); 238 + } 239 + if let Some(v) = self.commit_batch_size { 240 + o.store.group_commit.max_batch_size = Some(v); 241 + } 242 + if let Some(v) = self.max_file_size { 243 + o.store.max_file_size = Some(v); 244 + } 245 + } 246 + } 247 + 248 + impl SweepAxes { 249 + fn axis_values(&self) -> Vec<SweepAxisValues> { 250 + expand(&self.writer_concurrency) 251 + .into_iter() 252 + .flat_map(|wc| { 253 + expand(&self.key_space).into_iter().flat_map(move |ks| { 254 + expand(&self.value_bytes).into_iter().flat_map(move |vb| { 255 + expand(&self.fault_density_scale) 256 + .into_iter() 257 + .flat_map(move |fds| { 258 + expand(&self.fault_density_uniform).into_iter().flat_map( 259 + move |fdu| { 260 + expand(&self.restart_every_n_ops).into_iter().flat_map( 261 + move |rc| { 262 + expand(&self.commit_batch_size) 263 + .into_iter() 264 + .flat_map(move |cb| { 265 + expand(&self.max_file_size).into_iter().map( 266 + move |mfs| SweepAxisValues { 267 + writer_concurrency: wc, 268 + key_space: ks, 269 + value_bytes: vb, 270 + fault_density_scale: fds, 271 + fault_density_uniform: fdu, 272 + restart_every_n_ops: rc, 273 + commit_batch_size: cb, 274 + max_file_size: mfs, 275 + }, 276 + ) 277 + }) 278 + }, 279 + ) 280 + }, 281 + ) 282 + }) 283 + }) 284 + }) 285 + }) 286 + .collect() 287 + } 288 + } 289 + 290 + fn expand<T: Copy>(values: &[T]) -> Vec<Option<T>> { 291 + if values.is_empty() { 292 + vec![None] 293 + } else { 294 + values.iter().copied().map(Some).collect() 295 + } 296 + } 297 + 134 298 #[derive(Debug, Serialize)] 135 299 struct NdjsonResult { 136 300 scenario: &'static str, ··· 142 306 violations: Vec<NdjsonViolation>, 143 307 wall_ms: u64, 144 308 ops_in_stream: usize, 309 + #[serde(skip_serializing_if = "serde_json::Value::is_null")] 310 + overrides: serde_json::Value, 145 311 } 146 312 147 313 #[derive(Debug, Serialize)] ··· 180 346 } 181 347 } 
182 348 183 - fn emit(scenario: Scenario, report: &GauntletReport, elapsed: Duration) -> io::Result<()> { 349 + fn overrides_json(overrides: &ConfigOverrides) -> serde_json::Value { 350 + match serde_json::to_value(overrides) { 351 + Ok(v) if v.as_object().map(|m| m.is_empty()).unwrap_or(true) => serde_json::Value::Null, 352 + Ok(v) => v, 353 + Err(_) => serde_json::Value::Null, 354 + } 355 + } 356 + 357 + fn emit( 358 + scenario: Scenario, 359 + report: &GauntletReport, 360 + elapsed: Duration, 361 + overrides: &ConfigOverrides, 362 + ) -> io::Result<()> { 184 363 let result = NdjsonResult { 185 364 scenario: scenario.cli_name(), 186 365 seed: report.seed.0, ··· 198 377 .collect(), 199 378 wall_ms: u64::try_from(elapsed.as_millis()).unwrap_or(u64::MAX), 200 379 ops_in_stream: report.ops.len(), 380 + overrides: overrides_json(overrides), 201 381 }; 202 382 let line = serde_json::to_string(&result).map_err(io::Error::other)?; 203 383 let stdout = io::stdout(); ··· 206 386 w.flush() 207 387 } 208 388 209 - fn emit_or_log(scenario: Scenario, report: &GauntletReport, elapsed: Duration) { 210 - if let Err(e) = emit(scenario, report, elapsed) 389 + fn emit_or_log( 390 + scenario: Scenario, 391 + report: &GauntletReport, 392 + elapsed: Duration, 393 + overrides: &ConfigOverrides, 394 + ) { 395 + if let Err(e) = emit(scenario, report, elapsed, overrides) 211 396 && e.kind() != io::ErrorKind::BrokenPipe 212 397 { 213 398 eprintln!("ndjson emit failed: {e}"); ··· 353 538 return; 354 539 } 355 540 f.store(true, Ordering::Relaxed); 356 - eprintln!( 357 - "interrupt received, stopping after current batch; press Ctrl-C again to abort" 358 - ); 541 + eprintln!("interrupt received, stopping after current batch; press Ctrl-C again to abort"); 359 542 if tokio::signal::ctrl_c().await.is_ok() { 360 543 eprintln!("second interrupt, aborting"); 361 544 std::process::exit(130); ··· 437 620 }; 438 621 run_repro(plan, &rt) 439 622 } 623 + Cmd::Sweep { 624 + config, 625 + seed_start, 626 + 
seeds, 627 + dump_regressions, 628 + no_shrink, 629 + shrink_budget, 630 + max_runs, 631 + } => { 632 + let plan = match resolve_sweep( 633 + config, 634 + seed_start, 635 + seeds, 636 + dump_regressions, 637 + !no_shrink, 638 + shrink_budget, 639 + max_runs, 640 + ) { 641 + Ok(p) => p, 642 + Err(e) => { 643 + eprintln!("{e}"); 644 + return ExitCode::from(2); 645 + } 646 + }; 647 + let rt = match build_runtime() { 648 + Ok(rt) => rt, 649 + Err(code) => return code, 650 + }; 651 + let interrupt = install_interrupt(&rt); 652 + run_sweep(plan, &rt, interrupt) 653 + } 654 + } 655 + } 656 + 657 + struct SweepPlan { 658 + scenario: Scenario, 659 + seed_start: u64, 660 + seeds: u64, 661 + dump_regressions: Option<PathBuf>, 662 + shrink: bool, 663 + shrink_budget: usize, 664 + base_overrides: ConfigOverrides, 665 + axes: Vec<SweepAxisValues>, 666 + } 667 + 668 + fn resolve_sweep( 669 + config: PathBuf, 670 + seed_start: Option<u64>, 671 + seeds: Option<u64>, 672 + dump_regressions: Option<PathBuf>, 673 + shrink: bool, 674 + shrink_budget: usize, 675 + max_runs: u64, 676 + ) -> Result<SweepPlan, String> { 677 + let raw = 678 + std::fs::read_to_string(&config).map_err(|e| format!("read {}: {e}", config.display()))?; 679 + let file: SweepConfigFile = 680 + toml::from_str(&raw).map_err(|e| format!("parse {}: {e}", config.display()))?; 681 + let seed_start = seed_start.or(file.seed_start).unwrap_or(0); 682 + let seeds = seeds.or(file.seeds).unwrap_or(8); 683 + if seeds == 0 { 684 + return Err("--seeds must be greater than zero".to_string()); 685 + } 686 + if shrink && shrink_budget == 0 { 687 + return Err("--shrink-budget must be greater than zero".to_string()); 688 + } 689 + let dump_regressions = dump_regressions.or(file.dump_regressions.clone()); 690 + let axes = file.axes.axis_values(); 691 + if axes.is_empty() { 692 + return Err("sweep produced no combinations".to_string()); 693 + } 694 + if max_runs > 0 { 695 + let combos = axes.len() as u64; 696 + let total = 
combos.saturating_mul(seeds); 697 + if total > max_runs { 698 + return Err(format!( 699 + "sweep would run {total} cases (combinations={combos} x seeds={seeds}), exceeds --max-runs {max_runs}; raise --max-runs or shrink axes" 700 + )); 701 + } 702 + } 703 + Ok(SweepPlan { 704 + scenario: file.scenario, 705 + seed_start, 706 + seeds, 707 + dump_regressions, 708 + shrink, 709 + shrink_budget, 710 + base_overrides: file.base_overrides, 711 + axes, 712 + }) 713 + } 714 + 715 + #[derive(Debug, Serialize)] 716 + struct NdjsonSweepSummary { 717 + #[serde(rename = "type")] 718 + kind: &'static str, 719 + scenario: &'static str, 720 + combinations: u64, 721 + seeds_per_combination: u64, 722 + total_seeds: u64, 723 + clean: u64, 724 + failed: u64, 725 + wall_ms: u64, 726 + interrupted: bool, 727 + } 728 + 729 + fn emit_sweep_summary(summary: &NdjsonSweepSummary) { 730 + let line = match serde_json::to_string(summary) { 731 + Ok(s) => s, 732 + Err(e) => { 733 + eprintln!("sweep summary serialize failed: {e}"); 734 + return; 735 + } 736 + }; 737 + let stdout = io::stdout(); 738 + let mut w = stdout.lock(); 739 + if let Err(e) = writeln!(w, "{line}").and_then(|()| w.flush()) 740 + && e.kind() != io::ErrorKind::BrokenPipe 741 + { 742 + eprintln!("sweep summary emit failed: {e}"); 743 + } 744 + } 745 + 746 + fn run_sweep(plan: SweepPlan, rt: &Runtime, interrupt: Arc<AtomicBool>) -> ExitCode { 747 + let SweepPlan { 748 + scenario, 749 + seed_start, 750 + seeds, 751 + dump_regressions, 752 + shrink, 753 + shrink_budget, 754 + base_overrides, 755 + axes, 756 + } = plan; 757 + let run_start = Instant::now(); 758 + let mut any_failed = false; 759 + let mut total_seeds: u64 = 0; 760 + let mut total_clean: u64 = 0; 761 + let mut total_failed: u64 = 0; 762 + let combos = axes.len() as u64; 763 + eprintln!( 764 + "sweep {}: {} combinations x {} seeds = {} runs", 765 + scenario.cli_name(), 766 + combos, 767 + seeds, 768 + combos.saturating_mul(seeds), 769 + ); 770 + let end = match 
seed_start.checked_add(seeds) { 771 + Some(e) => e, 772 + None => { 773 + eprintln!("seed range overflowed u64: seed_start={seed_start} seeds={seeds}"); 774 + return ExitCode::from(2); 775 + } 776 + }; 777 + for (combo_idx, axis_values) in axes.iter().enumerate() { 778 + if interrupt.load(Ordering::Relaxed) { 779 + break; 780 + } 781 + let mut overrides = base_overrides.clone(); 782 + axis_values.apply_to(&mut overrides); 783 + let combo_start = Instant::now(); 784 + let overrides_for_farm = overrides.clone(); 785 + let reports = farm::run_many_timed( 786 + move |s| { 787 + let mut cfg = config_for(scenario, s); 788 + overrides_for_farm.apply_to(&mut cfg); 789 + cfg 790 + }, 791 + (seed_start..end).map(Seed), 792 + ); 793 + let combo_wall = combo_start.elapsed(); 794 + let combo_failed = reports.iter().filter(|(r, _)| !r.is_clean()).count(); 795 + let combo_clean = reports.len().saturating_sub(combo_failed); 796 + reports.iter().for_each(|(r, elapsed)| { 797 + if !r.is_clean() { 798 + any_failed = true; 799 + if let Some(root) = &dump_regressions { 800 + dump_regression(scenario, r, root, &overrides, shrink, shrink_budget, rt); 801 + } 802 + } 803 + emit_or_log(scenario, r, *elapsed, &overrides); 804 + }); 805 + total_seeds += reports.len() as u64; 806 + total_clean += combo_clean as u64; 807 + total_failed += combo_failed as u64; 808 + eprintln!( 809 + "combo {}/{}: {} clean, {} failed, {:.1}s", 810 + combo_idx + 1, 811 + combos, 812 + combo_clean, 813 + combo_failed, 814 + combo_wall.as_secs_f64(), 815 + ); 816 + } 817 + let wall_ms = u64::try_from(run_start.elapsed().as_millis()).unwrap_or(u64::MAX); 818 + emit_sweep_summary(&NdjsonSweepSummary { 819 + kind: "sweep_summary", 820 + scenario: scenario.cli_name(), 821 + combinations: combos, 822 + seeds_per_combination: seeds, 823 + total_seeds, 824 + clean: total_clean, 825 + failed: total_failed, 826 + wall_ms, 827 + interrupted: interrupt.load(Ordering::Relaxed), 828 + }); 829 + if any_failed { 830 + 
ExitCode::from(1) 831 + } else { 832 + ExitCode::SUCCESS 440 833 } 441 834 } 442 835 ··· 489 882 let batch_wall = batch_start.elapsed(); 490 883 let batch_failed = reports.iter().filter(|(r, _)| !r.is_clean()).count(); 491 884 let batch_clean = reports.len().saturating_sub(batch_failed); 492 - let batch_ops: u64 = reports 493 - .iter() 494 - .map(|(r, _)| r.ops_executed.0 as u64) 495 - .sum(); 885 + let batch_ops: u64 = reports.iter().map(|(r, _)| r.ops_executed.0 as u64).sum(); 496 886 reports.iter().for_each(|(r, elapsed)| { 497 887 if !r.is_clean() { 498 888 any_failed = true; ··· 500 890 dump_regression(scenario, r, root, &overrides, shrink, shrink_budget, rt); 501 891 } 502 892 } 503 - emit_or_log(scenario, r, *elapsed); 893 + emit_or_log(scenario, r, *elapsed, &overrides); 504 894 }); 505 895 total_seeds += reports.len() as u64; 506 896 total_clean += batch_clean as u64; ··· 624 1014 rt, 625 1015 ); 626 1016 } 627 - emit_or_log(scenario, &report, elapsed); 1017 + emit_or_log(scenario, &report, elapsed, &overrides); 628 1018 if report.is_clean() { 629 1019 ExitCode::SUCCESS 630 1020 } else { ··· 696 1086 rt, 697 1087 ); 698 1088 } 699 - emit_or_log(scenario, &report, elapsed); 1089 + emit_or_log(scenario, &report, elapsed, &overrides); 700 1090 if report.is_clean() { 701 1091 ExitCode::SUCCESS 702 1092 } else {
+8 -5
crates/tranquil-store/src/blockstore/compaction.rs
··· 201 201 Ok::<_, CompactionError>(()) 202 202 }); 203 203 204 - let record_count = u32::try_from( 205 - (live_count as u128).saturating_add(dead_count as u128), 206 - ) 207 - .unwrap_or(u32::MAX); 204 + let record_count = 205 + u32::try_from((live_count as u128).saturating_add(dead_count as u128)).unwrap_or(u32::MAX); 208 206 let writer_position = writer.position(); 209 207 let finalize_result = scan_result 210 208 .and_then(|()| writer.sync().map_err(CompactionError::from)) ··· 219 217 .map_err(CompactionError::from) 220 218 }) 221 219 .and_then(|()| hint_writer.sync().map_err(CompactionError::from)) 222 - .and_then(|()| manager.io().sync_dir(manager.data_dir()).map_err(CompactionError::from)); 220 + .and_then(|()| { 221 + manager 222 + .io() 223 + .sync_dir(manager.data_dir()) 224 + .map_err(CompactionError::from) 225 + }); 223 226 224 227 let _ = manager.io().close(hint_fd); 225 228
+2 -3
crates/tranquil-store/src/blockstore/group_commit.rs
··· 1087 1087 entries: &[([u8; CID_SIZE], BlockLocation)], 1088 1088 ) -> Result<(), CommitError> { 1089 1089 use std::collections::BTreeMap; 1090 - let by_file: BTreeMap<DataFileId, Vec<(&[u8; CID_SIZE], BlockLocation)>> = entries 1091 - .iter() 1092 - .fold(BTreeMap::new(), |mut acc, (cid, loc)| { 1090 + let by_file: BTreeMap<DataFileId, Vec<(&[u8; CID_SIZE], BlockLocation)>> = 1091 + entries.iter().fold(BTreeMap::new(), |mut acc, (cid, loc)| { 1093 1092 acc.entry(loc.file_id).or_default().push((cid, *loc)); 1094 1093 acc 1095 1094 });
+2 -8
crates/tranquil-store/src/blockstore/manager.rs
··· 223 223 assert_eq!(mgr.io().file_size(next_handle.fd()).unwrap(), 0); 224 224 mgr.io().sync_dir(mgr.data_dir()).unwrap(); 225 225 mgr.commit_rotation(next_id, &next_handle); 226 - assert_eq!( 227 - mgr.open_for_read(next_id).unwrap().fd(), 228 - next_handle.fd() 229 - ); 226 + assert_eq!(mgr.open_for_read(next_id).unwrap().fd(), next_handle.fd()); 230 227 } 231 228 232 229 #[test] ··· 236 233 let (next_id, next_handle) = mgr.prepare_rotation(DataFileId::new(0)).unwrap(); 237 234 mgr.commit_rotation(next_id, &next_handle); 238 235 239 - assert_eq!( 240 - mgr.open_for_read(next_id).unwrap().fd(), 241 - next_handle.fd() 242 - ); 236 + assert_eq!(mgr.open_for_read(next_id).unwrap().fd(), next_handle.fd()); 243 237 drop(next_handle); 244 238 mgr.rollback_rotation(next_id); 245 239
+1 -4
crates/tranquil-store/src/blockstore/store.rs
··· 49 49 )), 50 50 CommitError::VerifyFailed { file_id, offset } => RepoError::storage(io::Error::new( 51 51 io::ErrorKind::InvalidData, 52 - format!( 53 - "post-sync verify failed at {file_id}:{}", 54 - offset.raw() 55 - ), 52 + format!("post-sync verify failed at {file_id}:{}", offset.raw()), 56 53 )), 57 54 } 58 55 }
+2 -8
crates/tranquil-store/src/eventlog/manager.rs
··· 418 418 assert_eq!(next_id, SegmentId::new(2)); 419 419 assert_eq!(mgr.io().file_size(next_handle.fd()).unwrap(), 0); 420 420 mgr.commit_rotation(next_id, &next_handle); 421 - assert_eq!( 422 - mgr.open_for_read(next_id).unwrap().fd(), 423 - next_handle.fd() 424 - ); 421 + assert_eq!(mgr.open_for_read(next_id).unwrap().fd(), next_handle.fd()); 425 422 } 426 423 427 424 #[test] ··· 431 428 let (next_id, next_handle) = mgr.prepare_rotation(SegmentId::new(1)).unwrap(); 432 429 mgr.commit_rotation(next_id, &next_handle); 433 430 434 - assert_eq!( 435 - mgr.open_for_read(next_id).unwrap().fd(), 436 - next_handle.fd() 437 - ); 431 + assert_eq!(mgr.open_for_read(next_id).unwrap().fd(), next_handle.fd()); 438 432 drop(next_handle); 439 433 mgr.rollback_rotation(next_id); 440 434
+8 -3
crates/tranquil-store/src/eventlog/writer.rs
··· 68 68 ) -> io::Result<Self> { 69 69 let handle = manager.open_for_append(segment_id)?; 70 70 manager.io().truncate(handle.fd(), 0)?; 71 - let writer = SegmentWriter::new(manager.io(), handle.fd(), segment_id, next_seq, max_payload)?; 71 + let writer = 72 + SegmentWriter::new(manager.io(), handle.fd(), segment_id, next_seq, max_payload)?; 72 73 writer.sync(manager.io())?; 73 74 manager.io().sync_dir(manager.segments_dir())?; 74 75 ··· 400 401 Err(e) if e.kind() != io::ErrorKind::InvalidData => Err(e), 401 402 _ => { 402 403 let handle = manager.open_for_read(seg_id)?; 403 - let (_, last_seq) = 404 - rebuild_from_segment(manager.io(), handle.fd(), DEFAULT_INDEX_INTERVAL, max_payload)?; 404 + let (_, last_seq) = rebuild_from_segment( 405 + manager.io(), 406 + handle.fd(), 407 + DEFAULT_INDEX_INTERVAL, 408 + max_payload, 409 + )?; 405 410 Ok(last_seq) 406 411 } 407 412 }
+127 -3
crates/tranquil-store/src/gauntlet/overrides.rs
··· 1 1 use serde::{Deserialize, Serialize}; 2 2 3 - use super::runner::{GauntletConfig, MaxFileSize, RunLimits, ShardCount, WallMs}; 4 - use super::workload::OpCount; 3 + use super::runner::{ 4 + GauntletConfig, IoBackend, MaxFileSize, OpInterval, RestartPolicy, RunLimits, ShardCount, 5 + WallMs, WriterConcurrency, 6 + }; 7 + use super::workload::{KeySpaceSize, OpCount, SizeDistribution, ValueBytes}; 8 + use crate::sim::FaultConfig; 5 9 6 - #[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)] 10 + #[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)] 7 11 #[serde(deny_unknown_fields)] 8 12 pub struct ConfigOverrides { 9 13 #[serde(default, skip_serializing_if = "Option::is_none")] 10 14 pub op_count: Option<usize>, 11 15 #[serde(default, skip_serializing_if = "Option::is_none")] 12 16 pub max_wall_ms: Option<u64>, 17 + #[serde(default, skip_serializing_if = "Option::is_none")] 18 + pub writer_concurrency: Option<usize>, 19 + #[serde(default, skip_serializing_if = "Option::is_none")] 20 + pub key_space: Option<u32>, 21 + #[serde(default, skip_serializing_if = "Option::is_none")] 22 + pub value_bytes: Option<u32>, 23 + #[serde(default, skip_serializing_if = "Option::is_none")] 24 + pub fault_density_scale: Option<f64>, 25 + #[serde(default, skip_serializing_if = "Option::is_none")] 26 + pub fault_density_uniform: Option<f64>, 27 + #[serde(default, skip_serializing_if = "Option::is_none")] 28 + pub restart_every_n_ops: Option<usize>, 13 29 #[serde(default, skip_serializing_if = "StoreOverrides::is_empty")] 14 30 pub store: StoreOverrides, 15 31 } ··· 66 82 max_wall_ms: Some(WallMs(ms)), 67 83 }; 68 84 } 85 + if let Some(n) = self.writer_concurrency { 86 + cfg.writer_concurrency = WriterConcurrency(n.max(1)); 87 + } 88 + if let Some(n) = self.key_space { 89 + cfg.workload.key_space = KeySpaceSize(n.max(1)); 90 + } 91 + if let Some(n) = self.value_bytes { 92 + cfg.workload.size_distribution = SizeDistribution::Fixed(ValueBytes(n)); 93 + 
} 94 + if let Some(m) = self.fault_density_scale 95 + && let IoBackend::Simulated { fault } = cfg.io 96 + { 97 + cfg.io = IoBackend::Simulated { 98 + fault: fault.scale_probabilities(m), 99 + }; 100 + } 101 + if let Some(d) = self.fault_density_uniform { 102 + cfg.io = IoBackend::Simulated { 103 + fault: FaultConfig::uniform_density(d.clamp(0.0, 1.0)), 104 + }; 105 + } 106 + if let Some(n) = self.restart_every_n_ops { 107 + cfg.restart_policy = if n == 0 { 108 + RestartPolicy::Never 109 + } else { 110 + RestartPolicy::EveryNOps(OpInterval(n)) 111 + }; 112 + } 69 113 if let Some(n) = self.store.max_file_size { 70 114 cfg.store.max_file_size = MaxFileSize(n); 71 115 } ··· 106 150 fn round_trip_preserves_set_fields() { 107 151 let o = ConfigOverrides { 108 152 op_count: Some(42), 153 + writer_concurrency: Some(16), 154 + key_space: Some(1_000_000), 155 + value_bytes: Some(4096), 156 + fault_density_scale: Some(1e-3), 157 + fault_density_uniform: Some(5e-4), 158 + restart_every_n_ops: Some(10_000), 109 159 store: StoreOverrides { 110 160 max_file_size: Some(4096), 111 161 group_commit: GroupCommitOverrides { ··· 119 169 let json = serde_json::to_string(&o).unwrap(); 120 170 let back: ConfigOverrides = serde_json::from_str(&json).unwrap(); 121 171 assert_eq!(o, back); 172 + } 173 + 174 + #[test] 175 + fn fault_density_scale_scales_moderate() { 176 + use crate::gauntlet::op::Seed; 177 + use crate::gauntlet::scenarios::{Scenario, config_for}; 178 + let mut cfg = config_for(Scenario::ModerateFaults, Seed(1)); 179 + let o = ConfigOverrides { 180 + fault_density_scale: Some(0.1), 181 + ..ConfigOverrides::default() 182 + }; 183 + o.apply_to(&mut cfg); 184 + match cfg.io { 185 + IoBackend::Simulated { fault } => { 186 + assert!(fault.torn_page_probability.raw() < 0.02); 187 + assert!(fault.torn_page_probability.raw() > 0.0); 188 + } 189 + _ => panic!("expected simulated io"), 190 + } 191 + } 192 + 193 + #[test] 194 + fn fault_density_scale_zero_kills_probabilities() { 195 + 
use crate::gauntlet::op::Seed; 196 + use crate::gauntlet::scenarios::{Scenario, config_for}; 197 + let mut cfg = config_for(Scenario::ModerateFaults, Seed(1)); 198 + let o = ConfigOverrides { 199 + fault_density_scale: Some(0.0), 200 + ..ConfigOverrides::default() 201 + }; 202 + o.apply_to(&mut cfg); 203 + match cfg.io { 204 + IoBackend::Simulated { fault } => { 205 + assert_eq!(fault.partial_write_probability.raw(), 0.0); 206 + assert_eq!(fault.torn_page_probability.raw(), 0.0); 207 + assert_eq!(fault.io_error_probability.raw(), 0.0); 208 + assert_eq!(fault.sync_failure_probability.raw(), 0.0); 209 + } 210 + _ => panic!("expected simulated io"), 211 + } 212 + } 213 + 214 + #[test] 215 + fn fault_density_scale_is_noop_on_real_backend() { 216 + use crate::gauntlet::op::Seed; 217 + use crate::gauntlet::scenarios::{Scenario, config_for}; 218 + let mut cfg = config_for(Scenario::SmokePR, Seed(1)); 219 + assert!(matches!(cfg.io, IoBackend::Real)); 220 + let o = ConfigOverrides { 221 + fault_density_scale: Some(0.5), 222 + ..ConfigOverrides::default() 223 + }; 224 + o.apply_to(&mut cfg); 225 + assert!(matches!(cfg.io, IoBackend::Real)); 226 + } 227 + 228 + #[test] 229 + fn fault_density_uniform_forces_simulated_backend() { 230 + use crate::gauntlet::op::Seed; 231 + use crate::gauntlet::scenarios::{Scenario, config_for}; 232 + let mut cfg = config_for(Scenario::SmokePR, Seed(1)); 233 + assert!(matches!(cfg.io, IoBackend::Real)); 234 + let o = ConfigOverrides { 235 + fault_density_uniform: Some(0.25), 236 + ..ConfigOverrides::default() 237 + }; 238 + o.apply_to(&mut cfg); 239 + match cfg.io { 240 + IoBackend::Simulated { fault } => { 241 + assert_eq!(fault.torn_page_probability.raw(), 0.25); 242 + assert_eq!(fault.io_error_probability.raw(), 0.25); 243 + } 244 + _ => panic!("expected simulated io"), 245 + } 122 246 } 123 247 }
+4 -5
crates/tranquil-store/src/gauntlet/runner.rs
··· 595 595 let mut segment_last_ts: Vec<(SegmentId, u64)> = Vec::new(); 596 596 segments.iter().for_each(|&id| { 597 597 let per_segment: Vec<ValidEvent> = match s.manager.open_for_read(id) { 598 - Ok(handle) => match SegmentReader::open(s.manager.io(), handle.fd(), MAX_EVENT_PAYLOAD) { 598 + Ok(handle) => match SegmentReader::open(s.manager.io(), handle.fd(), MAX_EVENT_PAYLOAD) 599 + { 599 600 Ok(reader) => reader.valid_prefix().unwrap_or_default(), 600 601 Err(_) => Vec::new(), 601 602 }, ··· 991 992 if !oracle.contains_record(collection, rkey) { 992 993 return Ok(()); 993 994 } 994 - let new_root = 995 - delete_record_atomic(&harness.store, old_root, collection, rkey).await?; 995 + let new_root = delete_record_atomic(&harness.store, old_root, collection, rkey).await?; 996 996 oracle.delete(collection, rkey); 997 997 *root = Some(new_root); 998 998 Ok(()) ··· 1288 1288 if !state.oracle.contains_record(collection, rkey) { 1289 1289 return Ok(()); 1290 1290 } 1291 - let new_root = 1292 - delete_record_atomic(&shared.store, old_root, collection, rkey).await?; 1291 + let new_root = delete_record_atomic(&shared.store, old_root, collection, rkey).await?; 1293 1292 state.oracle.delete(collection, rkey); 1294 1293 state.root = Some(new_root); 1295 1294 Ok(())
+3 -1
crates/tranquil-store/src/gauntlet/scenarios.rs
··· 99 99 "Eventlog-heavy workload with FSYNC_ORDERING / MONOTONIC_SEQ / TOMBSTONE_BOUND invariants." 100 100 } 101 101 Self::ContendedReaders => "60% reads, 64 writer tasks, simulated moderate faults.", 102 - Self::ContendedWriters => "Add/delete heavy, 32 writer tasks, simulated moderate faults.", 102 + Self::ContendedWriters => { 103 + "Add/delete heavy, 32 writer tasks, simulated moderate faults." 104 + } 103 105 } 104 106 } 105 107
+38
crates/tranquil-store/src/sim.rs
··· 129 129 || self.delayed_io_error_probability.is_nonzero() 130 130 || self.sync_reorder_window.0 > 0 131 131 } 132 + 133 + pub fn scale_probabilities(self, factor: f64) -> Self { 134 + let scale = |p: Probability| Probability::new((p.raw() * factor).clamp(0.0, 1.0)); 135 + Self { 136 + partial_write_probability: scale(self.partial_write_probability), 137 + bit_flip_on_read_probability: scale(self.bit_flip_on_read_probability), 138 + sync_failure_probability: scale(self.sync_failure_probability), 139 + dir_sync_failure_probability: scale(self.dir_sync_failure_probability), 140 + misdirected_write_probability: scale(self.misdirected_write_probability), 141 + io_error_probability: scale(self.io_error_probability), 142 + torn_page_probability: scale(self.torn_page_probability), 143 + misdirected_read_probability: scale(self.misdirected_read_probability), 144 + delayed_io_error_probability: scale(self.delayed_io_error_probability), 145 + sync_reorder_window: self.sync_reorder_window, 146 + latency_distribution_ns: self.latency_distribution_ns, 147 + } 148 + } 149 + 150 + pub fn uniform_density(density: f64) -> Self { 151 + assert!( 152 + density.is_finite() && (0.0..=1.0).contains(&density), 153 + "fault density out of range: {density}" 154 + ); 155 + let p = Probability::new(density); 156 + Self { 157 + partial_write_probability: p, 158 + bit_flip_on_read_probability: p, 159 + sync_failure_probability: p, 160 + dir_sync_failure_probability: p, 161 + misdirected_write_probability: p, 162 + io_error_probability: p, 163 + torn_page_probability: p, 164 + misdirected_read_probability: p, 165 + delayed_io_error_probability: p, 166 + sync_reorder_window: SyncReorderWindow(0), 167 + latency_distribution_ns: LatencyNs(0), 168 + } 169 + } 132 170 } 133 171 134 172 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+8 -5
crates/tranquil-store/tests/eventlog_manager_race.rs
··· 10 10 let sim: Arc<SimulatedIO> = Arc::new(SimulatedIO::pristine(0x1eed7a11)); 11 11 let segments_dir = PathBuf::from("/segments"); 12 12 13 - let manager = Arc::new( 14 - SegmentManager::new(Arc::clone(&sim), segments_dir.clone(), 1 << 20).unwrap(), 15 - ); 13 + let manager = 14 + Arc::new(SegmentManager::new(Arc::clone(&sim), segments_dir.clone(), 1 << 20).unwrap()); 16 15 17 16 let seg_id = SegmentId::new(1); 18 17 19 18 let write_handle = manager.open_for_append(seg_id).unwrap(); 20 - sim.write_at(write_handle.fd(), 0, b"arbitrary seed bytes for the segment") 21 - .unwrap(); 19 + sim.write_at( 20 + write_handle.fd(), 21 + 0, 22 + b"arbitrary seed bytes for the segment", 23 + ) 24 + .unwrap(); 22 25 sim.sync(write_handle.fd()).unwrap(); 23 26 sim.sync_dir(&segments_dir).unwrap(); 24 27 drop(write_handle);
+3 -1
crates/tranquil-store/tests/fd_lifecycle.rs
/// Builds a deterministic 64-byte test block derived from `seed`.
///
/// Byte `i` is the `i % 8`-th little-endian byte of `seed` XOR-ed with
/// `i * 31` (wrapping in `u8`).
fn tiny_block(seed: u64) -> Vec<u8> {
    let seed_bytes = seed.to_le_bytes();
    let mut block = Vec::with_capacity(64);
    for idx in 0u8..64 {
        let mask = idx.wrapping_mul(31);
        block.push(seed_bytes[usize::from(idx % 8)] ^ mask);
    }
    block
}
+5 -2
crates/tranquil-store/tests/rotation_robustness.rs
··· 3 3 use std::collections::HashMap; 4 4 use std::io; 5 5 use std::path::{Path, PathBuf}; 6 - use std::sync::{Arc, Mutex}; 7 6 use std::sync::atomic::{AtomicBool, Ordering}; 7 + use std::sync::{Arc, Mutex}; 8 8 9 9 use tranquil_store::blockstore::{ 10 10 BlockStoreConfig, DataFileId, DataFileManager, DataFileWriter, GroupCommitConfig, ··· 59 59 impl StorageIO for FailingIO { 60 60 fn open(&self, path: &Path, opts: OpenOptions) -> io::Result<FileId> { 61 61 let fd = self.inner.open(path, opts)?; 62 - self.fd_to_path.lock().unwrap().insert(fd, path.to_path_buf()); 62 + self.fd_to_path 63 + .lock() 64 + .unwrap() 65 + .insert(fd, path.to_path_buf()); 63 66 Ok(fd) 64 67 } 65 68
+4 -6
crates/tranquil-store/tests/sim_blockstore.rs
··· 657 657 }; 658 658 659 659 let block_count = ((seed % 25) + 10) as u32; 660 - let all_cids: Vec<CidBytes> = 661 - (0..block_count).map(test_cid).collect(); 660 + let all_cids: Vec<CidBytes> = (0..block_count).map(test_cid).collect(); 662 661 663 662 { 664 663 let store = TranquilBlockStore::open(config.clone()).unwrap(); 665 - (0..block_count).try_for_each(|i| { 666 - store.put_blocks_blocking(vec![(test_cid(i), block_data(i))]) 667 - }) 668 - .unwrap(); 664 + (0..block_count) 665 + .try_for_each(|i| store.put_blocks_blocking(vec![(test_cid(i), block_data(i))])) 666 + .unwrap(); 669 667 670 668 let files = store.list_data_files().unwrap(); 671 669 assert!(
+16 -8
crates/tranquil-store/tests/sim_eventlog.rs
··· 39 39 .flat_map(|&seg_id| { 40 40 let fd = mgr 41 41 .open_for_read(seg_id) 42 - .unwrap_or_else(|e| panic!("seed {seed}: open_for_read({seg_id}) failed: {e}")).fd(); 42 + .unwrap_or_else(|e| panic!("seed {seed}: open_for_read({seg_id}) failed: {e}")) 43 + .fd(); 43 44 SegmentReader::open(mgr.io(), fd, MAX_EVENT_PAYLOAD) 44 45 .unwrap_or_else(|e| { 45 46 panic!( ··· 256 257 257 258 let seg2_fd = mgr 258 259 .open_for_read(SegmentId::new(2)) 259 - .unwrap_or_else(|e| panic!("seed {seed}: open_for_read(2) failed: {e}")).fd(); 260 + .unwrap_or_else(|e| panic!("seed {seed}: open_for_read(2) failed: {e}")) 261 + .fd(); 260 262 let seg2_events = SegmentReader::open(mgr.io(), seg2_fd, MAX_EVENT_PAYLOAD) 261 263 .unwrap_or_else(|e| { 262 264 panic!("seed {seed}: SegmentReader::open(2, MAX_EVENT_PAYLOAD) failed: {e}") ··· 271 273 272 274 let seg3_fd = mgr 273 275 .open_for_read(SegmentId::new(3)) 274 - .unwrap_or_else(|e| panic!("seed {seed}: open_for_read(3) failed: {e}")).fd(); 276 + .unwrap_or_else(|e| panic!("seed {seed}: open_for_read(3) failed: {e}")) 277 + .fd(); 275 278 let seg3_events = SegmentReader::open(mgr.io(), seg3_fd, MAX_EVENT_PAYLOAD) 276 279 .unwrap_or_else(|e| { 277 280 panic!("seed {seed}: SegmentReader::open(3, MAX_EVENT_PAYLOAD) failed: {e}") ··· 388 391 389 392 let fd = mgr 390 393 .open_for_read(SegmentId::new(1)) 391 - .unwrap_or_else(|e| panic!("seed {seed}: open_for_read(1) failed: {e}")).fd(); 394 + .unwrap_or_else(|e| panic!("seed {seed}: open_for_read(1) failed: {e}")) 395 + .fd(); 392 396 let recovered = SegmentReader::open(mgr.io(), fd, MAX_EVENT_PAYLOAD) 393 397 .unwrap_or_else(|e| { 394 398 panic!("seed {seed}: SegmentReader::open(1, MAX_EVENT_PAYLOAD) failed: {e}") ··· 458 462 459 463 let fd = mgr 460 464 .open_for_read(SegmentId::new(1)) 461 - .unwrap_or_else(|e| panic!("seed {seed}: open_for_read(1) failed: {e}")).fd(); 465 + .unwrap_or_else(|e| panic!("seed {seed}: open_for_read(1) failed: {e}")) 466 + .fd(); 462 467 let recovered 
= SegmentReader::open(mgr.io(), fd, MAX_EVENT_PAYLOAD) 463 468 .unwrap_or_else(|e| { 464 469 panic!("seed {seed}: SegmentReader::open(1, MAX_EVENT_PAYLOAD) failed: {e}") ··· 579 584 580 585 let fd = mgr 581 586 .open_for_read(SegmentId::new(1)) 582 - .unwrap_or_else(|e| panic!("seed {seed}: open_for_read(1) failed: {e}")).fd(); 587 + .unwrap_or_else(|e| panic!("seed {seed}: open_for_read(1) failed: {e}")) 588 + .fd(); 583 589 let events = SegmentReader::open(mgr.io(), fd, MAX_EVENT_PAYLOAD) 584 590 .unwrap_or_else(|e| { 585 591 panic!("seed {seed}: SegmentReader::open(1, MAX_EVENT_PAYLOAD) failed: {e}") ··· 652 658 653 659 let fd = mgr 654 660 .open_for_read(SegmentId::new(1)) 655 - .unwrap_or_else(|e| panic!("seed {seed}: open_for_read(1) failed: {e}")).fd(); 661 + .unwrap_or_else(|e| panic!("seed {seed}: open_for_read(1) failed: {e}")) 662 + .fd(); 656 663 let events = SegmentReader::open(mgr.io(), fd, MAX_EVENT_PAYLOAD) 657 664 .unwrap_or_else(|e| { 658 665 panic!("seed {seed}: SegmentReader::open(1, MAX_EVENT_PAYLOAD) failed: {e}") ··· 938 945 939 946 let pristine_fd = pristine_mgr 940 947 .open_for_read(SegmentId::new(1)) 941 - .unwrap_or_else(|e| panic!("seed {seed}: pristine open_for_read(1) failed: {e}")).fd(); 948 + .unwrap_or_else(|e| panic!("seed {seed}: pristine open_for_read(1) failed: {e}")) 949 + .fd(); 942 950 let pristine_events = SegmentReader::open( 943 951 pristine_mgr.io(), 944 952 pristine_fd,
+9
justfile
··· 37 37 gauntlet-repro SEED SCENARIO="smoke-pr": 38 38 SQLX_OFFLINE=true cargo run --release -p tranquil-store --bin tranquil-gauntlet --features tranquil-store/gauntlet-cli -- repro --scenario {{SCENARIO}} --seed {{SEED}} 39 39 40 + gauntlet-repro-config CONFIG SEED: 41 + SQLX_OFFLINE=true cargo run --release -p tranquil-store --bin tranquil-gauntlet --features tranquil-store/gauntlet-cli -- repro --config {{CONFIG}} --seed {{SEED}} 42 + 40 43 gauntlet-repro-from FILE: 41 44 SQLX_OFFLINE=true cargo run --release -p tranquil-store --bin tranquil-gauntlet --features tranquil-store/gauntlet-cli -- repro --from {{FILE}} 45 + 46 + gauntlet-sweep CONFIG SEEDS="8" DUMP="proptest-regressions": 47 + SQLX_OFFLINE=true cargo run --release -p tranquil-store --bin tranquil-gauntlet --features tranquil-store/gauntlet-cli -- sweep --config {{CONFIG}} --seeds {{SEEDS}} --dump-regressions {{DUMP}} 48 + 49 + gauntlet-soak HOURS="24" OUTPUT="": 50 + SQLX_OFFLINE=true GAUNTLET_SOAK_HOURS={{HOURS}} GAUNTLET_SOAK_OUTPUT={{OUTPUT}} cargo nextest run -p tranquil-store --features tranquil-store/test-harness --profile gauntlet-soak --test gauntlet_soak --run-ignored all -- soak_long_leak_gate 42 51 43 52 test-unit: 44 53 SQLX_OFFLINE=true cargo test --test dpop_unit --test validation_edge_cases --test scope_edge_cases