Our Personal Data Server from scratch! tranquil.farm
pds rust database fun oauth atproto
238
fork

Configure Feed

Select the types of activity you want to include in your feed.

fix(tranquil-store): atomic record commits, hint-as-truth recovery

Lewis: May this revision serve well! <lu5a@proton.me>

+456 -189
+9 -4
crates/tranquil-lexicon/src/dynamic.rs
··· 61 61 62 62 impl DynamicRegistry { 63 63 pub fn new() -> Self { 64 - let network_disabled = 65 - std::env::var("TRANQUIL_LEXICON_OFFLINE").is_ok_and(|v| v == "1" || v == "true"); 66 64 Self { 67 65 store: RwLock::new(SchemaStore { 68 66 schemas: HashMap::new(), ··· 70 68 }), 71 69 negative_cache: RwLock::new(HashMap::new()), 72 70 in_flight: RwLock::new(HashMap::new()), 73 - network_disabled: AtomicBool::new(network_disabled), 71 + network_disabled: AtomicBool::new(false), 74 72 } 75 73 } 76 74 77 - #[allow(dead_code)] 75 + pub fn from_env() -> Self { 76 + let registry = Self::new(); 77 + let disabled = 78 + std::env::var("TRANQUIL_LEXICON_OFFLINE").is_ok_and(|v| v == "1" || v == "true"); 79 + registry.set_network_disabled(disabled); 80 + registry 81 + } 82 + 78 83 pub fn set_network_disabled(&self, disabled: bool) { 79 84 self.network_disabled.store(disabled, Ordering::Relaxed); 80 85 }
+1 -1
crates/tranquil-lexicon/src/registry.rs
··· 25 25 Self { 26 26 schemas: HashMap::new(), 27 27 #[cfg(feature = "resolve")] 28 - dynamic: crate::dynamic::DynamicRegistry::new(), 28 + dynamic: crate::dynamic::DynamicRegistry::from_env(), 29 29 } 30 30 } 31 31
+141 -12
crates/tranquil-store/src/blockstore/group_commit.rs
··· 11 11 use super::BlocksSynced; 12 12 use crate::io::{FileId, OpenOptions, StorageIO}; 13 13 14 - use super::data_file::{CID_SIZE, DataFileWriter}; 14 + use super::data_file::{CID_SIZE, DataFileWriter, ReadBlockRecord, decode_block_record}; 15 15 use super::hash_index::{BlockIndex, BlockIndexError, CheckpointPositions}; 16 16 use super::hint::{HintFileWriter, hint_file_path}; 17 17 use super::manager::DataFileManager; ··· 106 106 Io(Arc<io::Error>), 107 107 Index(String), 108 108 ChannelClosed, 109 + VerifyFailed { 110 + file_id: DataFileId, 111 + offset: BlockOffset, 112 + }, 109 113 } 110 114 111 115 impl std::fmt::Display for CommitError { ··· 114 118 Self::Io(e) => write!(f, "io: {}", e.as_ref()), 115 119 Self::Index(e) => write!(f, "index: {e}"), 116 120 Self::ChannelClosed => write!(f, "commit channel closed"), 121 + Self::VerifyFailed { file_id, offset } => write!( 122 + f, 123 + "post-sync verify failed at {file_id}:{} (misdirected write or durable corruption)", 124 + offset.raw() 125 + ), 117 126 } 118 127 } 119 128 } ··· 122 131 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { 123 132 match self { 124 133 Self::Io(e) => Some(e.as_ref()), 125 - Self::Index(_) | Self::ChannelClosed => None, 134 + Self::Index(_) | Self::ChannelClosed | Self::VerifyFailed { .. } => None, 126 135 } 127 136 } 128 137 } ··· 181 190 pub channel_capacity: usize, 182 191 pub checkpoint_interval_ms: u64, 183 192 pub checkpoint_write_threshold: u64, 193 + pub verify_persisted_blocks: bool, 184 194 } 185 195 186 196 impl Default for GroupCommitConfig { ··· 190 200 channel_capacity: 4096, 191 201 checkpoint_interval_ms: 60_000, 192 202 checkpoint_write_threshold: 100_000, 203 + verify_persisted_blocks: false, 193 204 } 194 205 } 195 206 } ··· 200 211 file_ids: Arc<FileIdAllocator>, 201 212 active_files: Arc<ActiveFileSet>, 202 213 hint_positions: Arc<ShardHintPositions>, 214 + verify_persisted_blocks: bool, 203 215 } 204 216 205 217 struct ActiveState { ··· 403 415 file_ids: Arc::clone(&file_ids), 404 416 active_files: Arc::clone(&active_files), 405 417 hint_positions: Arc::clone(&hint_positions), 418 + verify_persisted_blocks: config.verify_persisted_blocks, 406 419 }; 407 420 SingleShardWriter::spawn( 408 421 ctx, ··· 1057 1070 hint_fd: FileId, 1058 1071 } 1059 1072 1073 + fn verify_persisted_blocks<S: StorageIO>( 1074 + manager: &DataFileManager<S>, 1075 + entries: &[([u8; CID_SIZE], BlockLocation)], 1076 + ) -> Result<(), CommitError> { 1077 + use std::collections::BTreeMap; 1078 + let by_file: BTreeMap<DataFileId, Vec<(&[u8; CID_SIZE], BlockLocation)>> = entries 1079 + .iter() 1080 + .fold(BTreeMap::new(), |mut acc, (cid, loc)| { 1081 + acc.entry(loc.file_id).or_default().push((cid, *loc)); 1082 + acc 1083 + }); 1084 + 1085 + by_file.into_iter().try_for_each(|(file_id, locations)| { 1086 + let path = manager.data_file_path(file_id); 1087 + let fd = match manager.io().open(&path, OpenOptions::read_only_existing()) { 1088 + Ok(fd) => fd, 1089 + Err(_) => return Ok(()), 1090 + }; 1091 + let file_size = match manager.io().file_size(fd) { 1092 + Ok(s) => s, 1093 + Err(_) => { 1094 + let _ = manager.io().close(fd); 1095 + return Ok(()); 1096 + } 1097 + }; 1098 + let result = locations.into_iter().try_for_each(|(expected_cid, loc)| { 1099 + verify_block_at(manager, fd, file_size, expected_cid, loc) 1100 + }); 1101 + let _ = manager.io().close(fd); 1102 + result 1103 + }) 1104 + } 1105 + 1106 + #[derive(Debug)] 1107 + enum VerifyOutcome { 1108 + NoFaultDetected, 1109 + Faulted, 1110 + } 1111 + 1112 + fn verify_block_at<S: StorageIO>( 1113 + manager: &DataFileManager<S>, 1114 + fd: FileId, 1115 + file_size: u64, 1116 + expected_cid: &[u8; CID_SIZE], 1117 + loc: BlockLocation, 1118 + ) -> Result<(), CommitError> { 1119 + let passed = (0..VERIFY_RETRY_ATTEMPTS).any(|_| { 1120 + matches!( 1121 + verify_once(manager, fd, file_size, expected_cid, loc), 1122 + VerifyOutcome::NoFaultDetected 1123 + ) 1124 + }); 1125 + match passed { 1126 + true => Ok(()), 1127 + false => Err(CommitError::VerifyFailed { 1128 + file_id: loc.file_id, 1129 + offset: loc.offset, 1130 + }), 1131 + } 1132 + } 1133 + 1134 + fn verify_once<S: StorageIO>( 1135 + manager: &DataFileManager<S>, 1136 + fd: FileId, 1137 + file_size: u64, 1138 + expected_cid: &[u8; CID_SIZE], 1139 + loc: BlockLocation, 1140 + ) -> VerifyOutcome { 1141 + match decode_block_record(manager.io(), fd, loc.offset, file_size) { 1142 + Ok(Some(ReadBlockRecord::Valid { cid_bytes, .. })) if cid_bytes == *expected_cid => { 1143 + VerifyOutcome::NoFaultDetected 1144 + } 1145 + Ok(Some(ReadBlockRecord::Valid { .. })) => { 1146 + tracing::warn!( 1147 + file_id = %loc.file_id, 1148 + offset = loc.offset.raw(), 1149 + "verify: stored CID mismatch (misdirected write)" 1150 + ); 1151 + VerifyOutcome::Faulted 1152 + } 1153 + Ok(Some(ReadBlockRecord::Corrupted { .. } | ReadBlockRecord::Truncated { .. })) 1154 + | Ok(None) => { 1155 + tracing::warn!( 1156 + file_id = %loc.file_id, 1157 + offset = loc.offset.raw(), 1158 + "verify: block undecodable at location" 1159 + ); 1160 + VerifyOutcome::Faulted 1161 + } 1162 + Err(_) => VerifyOutcome::NoFaultDetected, 1163 + } 1164 + } 1165 + 1166 + const VERIFY_RETRY_ATTEMPTS: u32 = 4; 1167 + 1168 + fn rollback_batch<S: StorageIO>( 1169 + manager: &DataFileManager<S>, 1170 + state: &ActiveState, 1171 + rotations: &[RotationState], 1172 + ) { 1173 + let _ = manager 1174 + .io() 1175 + .truncate(state.hint_fd, state.hint_position.raw()); 1176 + let _ = manager.io().sync(state.hint_fd); 1177 + rotations.iter().for_each(|rot| { 1178 + manager.rollback_rotation(rot.file_id, rot.fd); 1179 + let _ = manager.io().close(rot.hint_fd); 1180 + let _ = manager 1181 + .io() 1182 + .delete(&hint_file_path(manager.data_dir(), rot.file_id)); 1183 + }); 1184 + } 1185 + 1060 1186 fn process_batch<S: StorageIO>( 1061 1187 manager: &DataFileManager<S>, 1062 1188 index: &BlockIndex, ··· 1154 1280 }); 1155 1281 1156 1282 if let Err(e) = write_result { 1157 - rotations.into_iter().for_each(|rot| { 1158 - manager.rollback_rotation(rot.file_id, rot.fd); 1159 - let _ = manager.io().close(rot.hint_fd); 1160 - let _ = manager 1161 - .io() 1162 - .delete(&hint_file_path(manager.data_dir(), rot.file_id)); 1163 - }); 1283 + rollback_batch(manager, state, &rotations); 1164 1284 return Err(e); 1165 1285 } 1166 1286 ··· 1169 1289 let current_epoch = epoch.current(); 1170 1290 let now = crate::wall_clock_ms(); 1171 1291 1292 + let rollback_on_err = |e: CommitError| -> CommitError { 1293 + rollback_batch(manager, state, &rotations); 1294 + e 1295 + }; 1296 + 1172 1297 all_decrements 1173 1298 .iter() 1174 - .try_for_each(|cid| hint_writer.append_decrement(cid, current_epoch, now))?; 1299 + .try_for_each(|cid| hint_writer.append_decrement(cid, current_epoch, now)) 1300 + .map_err(|e| rollback_on_err(CommitError::from(e)))?; 1175 1301 1176 1302 let t = std::time::Instant::now(); 1177 - data_writer.sync()?; 1178 - hint_writer.sync()?; 1303 + data_writer.sync().map_err(|e| rollback_on_err(e.into()))?; 1304 + if ctx.verify_persisted_blocks { 1305 + verify_persisted_blocks(manager, &index_entries).map_err(rollback_on_err)?; 1306 + } 1307 + hint_writer.sync().map_err(|e| rollback_on_err(e.into()))?; 1179 1308 let sync_nanos = t.elapsed().as_nanos() as u64; 1180 1309 1181 1310 if !rotations.is_empty() {
+6 -5
crates/tranquil-store/src/blockstore/manager.rs
··· 127 127 pub fn rollback_rotation(&self, file_id: DataFileId, fd: FileId) { 128 128 let _ = self.io.close(fd); 129 129 self.handles.write().remove(&file_id); 130 + let _ = self.io.delete(&self.data_file_path(file_id)); 130 131 } 131 132 132 133 pub fn should_rotate(&self, position: BlockOffset) -> bool { ··· 216 217 } 217 218 218 219 #[test] 219 - fn rotation_rollback_cleans_handle() { 220 + fn rotation_rollback_cleans_handle_and_deletes_file() { 220 221 let mgr = setup_manager(1024); 221 222 let _fd0 = mgr.open_for_append(DataFileId::new(0)).unwrap(); 222 223 let (next_id, next_fd) = mgr.prepare_rotation(DataFileId::new(0)).unwrap(); ··· 225 226 assert_eq!(mgr.open_for_read(next_id).unwrap(), next_fd); 226 227 mgr.rollback_rotation(next_id, next_fd); 227 228 228 - let reopened_fd = mgr.open_for_read(next_id).unwrap(); 229 - assert_ne!( 230 - reopened_fd, next_fd, 231 - "rollback should have closed the cached fd" 229 + let reopen = mgr.open_for_read(next_id); 230 + assert!( 231 + reopen.is_err_and(|e| e.kind() == io::ErrorKind::NotFound), 232 + "rollback must delete the data file so recovery cannot resurrect uncommitted bytes" 232 233 ); 233 234 } 234 235
+29 -18
crates/tranquil-store/src/blockstore/reader.rs
··· 127 127 file_size: u64, 128 128 location: BlockLocation, 129 129 ) -> Result<Bytes, ReadError> { 130 - match decode_block_record(self.manager.io(), fd, location.offset, file_size)? { 131 - Some(ReadBlockRecord::Valid { data, .. }) 132 - if data.len() == location.length.raw() as usize => 133 - { 134 - Ok(Bytes::from(data)) 135 - } 136 - Some(ReadBlockRecord::Valid { .. }) => Err(ReadError::Corrupted { 137 - file_id: location.file_id, 138 - offset: location.offset, 139 - }), 140 - Some(ReadBlockRecord::Corrupted { offset } | ReadBlockRecord::Truncated { offset }) => { 141 - Err(ReadError::Corrupted { 130 + let attempt_once = || -> Result<Bytes, ReadError> { 131 + match decode_block_record(self.manager.io(), fd, location.offset, file_size)? { 132 + Some(ReadBlockRecord::Valid { data, .. }) 133 + if data.len() == location.length.raw() as usize => 134 + { 135 + Ok(Bytes::from(data)) 136 + } 137 + Some(ReadBlockRecord::Valid { .. }) => Err(ReadError::Corrupted { 138 + file_id: location.file_id, 139 + offset: location.offset, 140 + }), 141 + Some( 142 + ReadBlockRecord::Corrupted { offset } | ReadBlockRecord::Truncated { offset }, 143 + ) => Err(ReadError::Corrupted { 142 144 file_id: location.file_id, 143 145 offset, 144 - }) 146 + }), 147 + None => Err(ReadError::Corrupted { 148 + file_id: location.file_id, 149 + offset: location.offset, 150 + }), 145 151 } 146 - None => Err(ReadError::Corrupted { 147 - file_id: location.file_id, 148 - offset: location.offset, 149 - }), 150 - } 152 + }; 153 + (0..READ_RETRY_ATTEMPTS.saturating_sub(1)) 154 + .find_map(|_| match attempt_once() { 155 + Ok(bytes) => Some(Ok(bytes)), 156 + Err(ReadError::Corrupted { .. }) => None, 157 + Err(e) => Some(Err(e)), 158 + }) 159 + .unwrap_or_else(attempt_once) 151 160 } 152 161 } 162 + 163 + const READ_RETRY_ATTEMPTS: u32 = 4;
+38 -28
crates/tranquil-store/src/blockstore/store.rs
··· 20 20 use super::manager::DataFileManager; 21 21 use super::reader::{BlockStoreReader, ReadError}; 22 22 use super::types::{ 23 - BlockLength, BlockLocation, BlockOffset, CollectionResult, CompactionResult, DataFileId, 24 - EpochCounter, LivenessInfo, WallClockMs, WriteCursor, 23 + BlockLocation, BlockOffset, CollectionResult, CompactionResult, DataFileId, EpochCounter, 24 + LivenessInfo, WallClockMs, 25 25 }; 26 26 27 27 fn cid_to_bytes(cid: &Cid) -> Result<[u8; CID_SIZE], RepoError> { ··· 35 35 ), 36 36 )) 37 37 }) 38 - } 39 - 40 - fn block_index_err_to_repo(e: super::hash_index::BlockIndexError) -> RepoError { 41 - RepoError::storage(io::Error::other(e.to_string())) 42 38 } 43 39 44 40 fn commit_error_to_repo(e: CommitError) -> RepoError { ··· 50 46 CommitError::ChannelClosed => RepoError::storage(io::Error::new( 51 47 io::ErrorKind::BrokenPipe, 52 48 "blockstore commit channel closed", 49 + )), 50 + CommitError::VerifyFailed { file_id, offset } => RepoError::storage(io::Error::new( 51 + io::ErrorKind::InvalidData, 52 + format!( 53 + "post-sync verify failed at {file_id}:{}", 54 + offset.raw() 55 + ), 53 56 )), 54 57 } 55 58 } ··· 299 302 Err(e) => return Err(RepoError::storage(e)), 300 303 }; 301 304 302 - let result = Self::scan_and_index(io, index, fd, file_id, start_offset); 305 + let hint_path = super::hint::hint_file_path(data_dir, file_id); 306 + let hint_exists = io 307 + .open(&hint_path, OpenOptions::read_only_existing()) 308 + .map(|fd| { 309 + let _ = io.close(fd); 310 + }) 311 + .is_ok(); 312 + 313 + let result = Self::scan_and_index(io, index, fd, file_id, start_offset, hint_exists); 303 314 304 315 let _ = io.close(fd); 305 316 ··· 312 323 fd: crate::io::FileId, 313 324 file_id: DataFileId, 314 325 start_offset: BlockOffset, 326 + hint_exists: bool, 315 327 ) -> Result<(), RepoError> { 316 328 let file_size = io.file_size(fd).map_err(RepoError::storage)?; 317 329 ··· 320 332 } 321 333 322 334 let scan_pos = &mut { start_offset }; 323 - let (recovered_entries, last_valid_end) = std::iter::from_fn(|| { 335 + let (scanned_entries, last_valid_end) = std::iter::from_fn(|| { 324 336 match super::data_file::decode_block_record(io, fd, *scan_pos, file_size) { 325 337 Err(e) => { 326 338 tracing::warn!( ··· 341 353 Ok(n) if n <= super::types::MAX_BLOCK_SIZE => n, 342 354 _ => return None, 343 355 }; 344 - let length = BlockLength::new(raw_len); 356 + let length = super::types::BlockLength::new(raw_len); 345 357 let record_size = BLOCK_RECORD_OVERHEAD as u64 + u64::from(raw_len); 346 - *scan_pos = scan_pos.advance(record_size); 358 + let new_end = offset.advance(record_size); 359 + *scan_pos = new_end; 347 360 Some(( 348 361 cid_bytes, 349 362 BlockLocation { ··· 351 364 offset, 352 365 length, 353 366 }, 367 + new_end, 354 368 )) 355 369 } 356 370 Ok(Some(ReadBlockRecord::Corrupted { .. } | ReadBlockRecord::Truncated { .. })) => { ··· 360 374 }) 361 375 .fold( 362 376 (Vec::new(), start_offset), 363 - |(mut entries, _), (cid_bytes, location)| { 364 - let new_end = location 365 - .offset 366 - .advance(BLOCK_RECORD_OVERHEAD as u64 + location.length.as_u64()); 367 - entries.push((cid_bytes, location)); 377 + |(mut entries, _), (cid, loc, new_end)| { 378 + entries.push((cid, loc)); 368 379 (entries, new_end) 369 380 }, 370 381 ); ··· 374 385 file_id = %file_id, 375 386 truncating_from = last_valid_end.raw(), 376 387 file_size, 377 - "truncating partial/corrupted tail" 388 + scanned_count = scanned_entries.len(), 389 + "truncating partial/unacked tail" 378 390 ); 379 391 io.truncate(fd, last_valid_end.raw()) 380 392 .map_err(RepoError::storage)?; 381 393 io.sync(fd).map_err(RepoError::storage)?; 382 394 } 383 395 384 - if !recovered_entries.is_empty() { 385 - let new_cursor = WriteCursor { 396 + if !hint_exists && !scanned_entries.is_empty() { 397 + tracing::info!( 398 + file_id = %file_id, 399 + scanned = scanned_entries.len(), 400 + "rebuilding index from data file (no hint file, treating as restored backup)" 401 + ); 402 + let cursor = super::types::WriteCursor { 386 403 file_id, 387 404 offset: last_valid_end, 388 405 }; 389 - let inserted = index 390 - .batch_put_if_absent(&recovered_entries, new_cursor) 391 - .map_err(block_index_err_to_repo)?; 392 - tracing::info!( 393 - file_id = %file_id, 394 - scanned = recovered_entries.len(), 395 - inserted, 396 - new_cursor_offset = last_valid_end.raw(), 397 - "recovery data file scan" 398 - ); 406 + index 407 + .batch_put_if_absent(&scanned_entries, cursor) 408 + .map_err(|e| RepoError::storage(io::Error::other(e.to_string())))?; 399 409 } 400 410 401 411 Ok(())
+6
crates/tranquil-store/src/gauntlet/overrides.rs
··· 42 42 pub checkpoint_interval_ms: Option<u64>, 43 43 #[serde(default, skip_serializing_if = "Option::is_none")] 44 44 pub checkpoint_write_threshold: Option<u64>, 45 + #[serde(default, skip_serializing_if = "Option::is_none")] 46 + pub verify_persisted_blocks: Option<bool>, 45 47 } 46 48 47 49 impl GroupCommitOverrides { ··· 50 52 && self.channel_capacity.is_none() 51 53 && self.checkpoint_interval_ms.is_none() 52 54 && self.checkpoint_write_threshold.is_none() 55 + && self.verify_persisted_blocks.is_none() 53 56 } 54 57 } 55 58 ··· 81 84 } 82 85 if let Some(n) = gc.checkpoint_write_threshold { 83 86 cfg.store.group_commit.checkpoint_write_threshold = n; 87 + } 88 + if let Some(b) = gc.verify_persisted_blocks { 89 + cfg.store.group_commit.verify_persisted_blocks = b; 84 90 } 85 91 } 86 92 }
+94 -120
crates/tranquil-store/src/gauntlet/runner.rs
··· 6 6 7 7 use cid::Cid; 8 8 use jacquard_repo::mst::Mst; 9 - use jacquard_repo::storage::BlockStore; 10 9 11 10 use super::invariants::{ 12 11 EventLogSnapshot, InvariantCtx, InvariantSet, InvariantViolation, SnapshotEvent, invariants_for, ··· 16 15 use super::workload::{Lcg, OpCount, SizeDistribution, ValueBytes, WorkloadModel}; 17 16 use crate::blockstore::{ 18 17 BlockStoreConfig, CidBytes, CompactionError, GroupCommitConfig, TranquilBlockStore, 19 - hash_to_cid_bytes, 18 + hash_to_cid, hash_to_cid_bytes, 20 19 }; 21 20 use crate::eventlog::{ 22 21 DEFAULT_INDEX_INTERVAL, DidHash, EventLogWriter, EventTypeTag, MAX_EVENT_PAYLOAD, SegmentId, ··· 128 127 129 128 #[derive(Debug, thiserror::Error)] 130 129 enum OpError { 131 - #[error("put record: {0}")] 132 - PutRecord(String), 133 130 #[error("mst add: {0}")] 134 131 MstAdd(String), 135 132 #[error("mst delete: {0}")] ··· 970 967 value_seed, 971 968 } => { 972 969 let record_bytes = make_record_bytes(*value_seed, workload.size_distribution); 973 - let record_cid = harness 974 - .store 975 - .put(&record_bytes) 976 - .await 977 - .map_err(|e| OpError::PutRecord(e.to_string()))?; 970 + let record_cid = hash_to_cid(&record_bytes); 978 971 let record_cid_bytes = try_cid_to_fixed(&record_cid)?; 979 972 980 - let outcome = add_record_inner( 973 + let (new_root, applied) = add_record_atomic( 981 974 &harness.store, 982 975 *root, 983 976 collection, 984 977 rkey, 985 978 record_cid, 986 979 record_cid_bytes, 980 + record_bytes, 987 981 ) 988 - .await; 989 - match outcome { 990 - Ok((new_root, applied)) => { 991 - *root = Some(new_root); 992 - if applied { 993 - oracle.add(collection.clone(), rkey.clone(), record_cid_bytes); 994 - } 995 - Ok(()) 996 - } 997 - Err(e) => { 998 - if let Err(cleanup_err) = 999 - decrement_obsolete(&harness.store, vec![record_cid_bytes]).await 1000 - { 1001 - tracing::warn!( 1002 - op_error = %e, 1003 - cleanup_error = %cleanup_err, 1004 - "AddRecord cleanup decrement failed", 1005 - ); 1006 - } 1007 - Err(e) 1008 - } 982 + .await?; 983 + *root = Some(new_root); 984 + if applied { 985 + oracle.add(collection.clone(), rkey.clone(), record_cid_bytes); 1009 986 } 987 + Ok(()) 1010 988 } 1011 989 Op::DeleteRecord { collection, rkey } => { 1012 990 let Some(old_root) = *root else { return Ok(()) }; 1013 991 if !oracle.contains_record(collection, rkey) { 1014 992 return Ok(()); 1015 993 } 1016 - let key = format!("{}/{}", collection.0, rkey.0); 1017 - let loaded = Mst::load(harness.store.clone(), old_root, None); 1018 - let updated = loaded 1019 - .delete(&key) 1020 - .await 1021 - .map_err(|e| OpError::MstDelete(e.to_string()))?; 1022 - let new_root = updated 1023 - .persist() 1024 - .await 1025 - .map_err(|e| OpError::MstPersist(e.to_string()))?; 1026 - apply_mst_diff(&harness.store, old_root, new_root).await?; 994 + let new_root = 995 + delete_record_atomic(&harness.store, old_root, collection, rkey).await?; 1027 996 oracle.delete(collection, rkey); 1028 997 *root = Some(new_root); 1029 998 Ok(()) ··· 1145 1114 Ok(events.last().map(|e: &ValidEvent| e.timestamp.raw())) 1146 1115 } 1147 1116 1148 - async fn add_record_inner<S: StorageIO + Send + Sync + 'static>( 1117 + async fn add_record_atomic<S: StorageIO + Send + Sync + 'static>( 1149 1118 store: &Arc<TranquilBlockStore<S>>, 1150 1119 root: Option<Cid>, 1151 1120 collection: &super::op::CollectionName, 1152 1121 rkey: &super::op::RecordKey, 1153 1122 record_cid: Cid, 1154 1123 record_cid_bytes: CidBytes, 1124 + record_bytes: Vec<u8>, 1155 1125 ) -> Result<(Cid, bool), OpError> { 1156 1126 let key = format!("{}/{}", collection.0, rkey.0); 1157 1127 let loaded = match root { ··· 1162 1132 .add(&key, record_cid) 1163 1133 .await 1164 1134 .map_err(|e| OpError::MstAdd(e.to_string()))?; 1135 + let diff = loaded 1136 + .diff(&updated) 1137 + .await 1138 + .map_err(|e| OpError::MstDiff(e.to_string()))?; 1165 1139 let new_root = updated 1166 - .persist() 1140 + .get_pointer() 1167 1141 .await 1168 1142 .map_err(|e| OpError::MstPersist(e.to_string()))?; 1169 1143 1170 - match root { 1171 - Some(old_root) if old_root == new_root => { 1172 - decrement_obsolete(store, vec![record_cid_bytes]).await?; 1173 - Ok((new_root, false)) 1174 - } 1175 - Some(old_root) => { 1176 - apply_mst_diff(store, old_root, new_root).await?; 1177 - Ok((new_root, true)) 1178 - } 1179 - None => Ok((new_root, true)), 1144 + if matches!(root, Some(r) if r == new_root) { 1145 + return Ok((new_root, false)); 1180 1146 } 1181 - } 1182 1147 1183 - async fn decrement_obsolete<S: StorageIO + Send + Sync + 'static>( 1184 - store: &Arc<TranquilBlockStore<S>>, 1185 - obsolete: Vec<CidBytes>, 1186 - ) -> Result<(), OpError> { 1187 - let s = store.clone(); 1188 - tokio::task::spawn_blocking(move || { 1189 - s.apply_commit_blocking(vec![], obsolete) 1190 - .map_err(|e| e.to_string()) 1191 - }) 1192 - .await 1193 - .map_err(|e| OpError::Join(e.to_string()))? 1194 - .map_err(OpError::ApplyCommit) 1148 + let blocks = diff_blocks_plus_record(diff.new_mst_blocks, record_cid_bytes, record_bytes)?; 1149 + let obsolete = diff_obsolete(diff.removed_mst_blocks, diff.removed_cids)?; 1150 + commit_atomic(store, blocks, obsolete).await?; 1151 + Ok((new_root, true)) 1195 1152 } 1196 1153 1197 - async fn apply_mst_diff<S: StorageIO + Send + Sync + 'static>( 1154 + async fn delete_record_atomic<S: StorageIO + Send + Sync + 'static>( 1198 1155 store: &Arc<TranquilBlockStore<S>>, 1199 1156 old_root: Cid, 1200 - new_root: Cid, 1201 - ) -> Result<(), OpError> { 1202 - let old_m = Mst::load(store.clone(), old_root, None); 1203 - let new_m = Mst::load(store.clone(), new_root, None); 1204 - let diff = old_m 1205 - .diff(&new_m) 1157 + collection: &super::op::CollectionName, 1158 + rkey: &super::op::RecordKey, 1159 + ) -> Result<Cid, OpError> { 1160 + let key = format!("{}/{}", collection.0, rkey.0); 1161 + let loaded = Mst::load(store.clone(), old_root, None); 1162 + let updated = loaded 1163 + .delete(&key) 1164 + .await 1165 + .map_err(|e| OpError::MstDelete(e.to_string()))?; 1166 + let diff = loaded 1167 + .diff(&updated) 1206 1168 .await 1207 1169 .map_err(|e| OpError::MstDiff(e.to_string()))?; 1208 - let obsolete: Vec<CidBytes> = diff 1209 - .removed_mst_blocks 1170 + let new_root = updated 1171 + .get_pointer() 1172 + .await 1173 + .map_err(|e| OpError::MstPersist(e.to_string()))?; 1174 + let blocks: Vec<(CidBytes, Vec<u8>)> = diff 1175 + .new_mst_blocks 1210 1176 .into_iter() 1211 - .chain(diff.removed_cids.into_iter()) 1212 - .map(|c| try_cid_to_fixed(&c)) 1177 + .map(|(c, b)| Ok::<_, OpError>((try_cid_to_fixed(&c)?, b.to_vec()))) 1213 1178 .collect::<Result<_, _>>()?; 1179 + let obsolete = diff_obsolete(diff.removed_mst_blocks, diff.removed_cids)?; 1180 + commit_atomic(store, blocks, obsolete).await?; 1181 + Ok(new_root) 1182 + } 1183 + 1184 + fn diff_blocks_plus_record( 1185 + new_mst_blocks: std::collections::BTreeMap<Cid, bytes::Bytes>, 1186 + record_cid_bytes: CidBytes, 1187 + record_bytes: Vec<u8>, 1188 + ) -> Result<Vec<(CidBytes, Vec<u8>)>, OpError> { 1189 + let mut blocks: Vec<(CidBytes, Vec<u8>)> = Vec::with_capacity(new_mst_blocks.len() + 1); 1190 + blocks.push((record_cid_bytes, record_bytes)); 1191 + new_mst_blocks.into_iter().try_for_each(|(c, b)| { 1192 + let cb = try_cid_to_fixed(&c)?; 1193 + blocks.push((cb, b.to_vec())); 1194 + Ok::<_, OpError>(()) 1195 + })?; 1196 + Ok(blocks) 1197 + } 1198 + 1199 + fn diff_obsolete( 1200 + removed_mst_blocks: Vec<Cid>, 1201 + removed_cids: Vec<Cid>, 1202 + ) -> Result<Vec<CidBytes>, OpError> { 1203 + removed_mst_blocks 1204 + .into_iter() 1205 + .chain(removed_cids.into_iter()) 1206 + .map(|c| try_cid_to_fixed(&c)) 1207 + .collect::<Result<_, _>>() 1208 + .map_err(OpError::from) 1209 + } 1210 + 1211 + async fn commit_atomic<S: StorageIO + Send + Sync + 'static>( 1212 + store: &Arc<TranquilBlockStore<S>>, 1213 + blocks: Vec<(CidBytes, Vec<u8>)>, 1214 + obsolete: Vec<CidBytes>, 1215 + ) -> Result<(), OpError> { 1214 1216 let s = store.clone(); 1215 1217 tokio::task::spawn_blocking(move || { 1216 - s.apply_commit_blocking(vec![], obsolete) 1218 + s.apply_commit_blocking(blocks, obsolete) 1217 1219 .map_err(|e| e.to_string()) 1218 1220 }) 1219 1221 .await ··· 1239 1241 .try_for_each(|fid| match store.compact_file(fid, 0) { 1240 1242 Ok(_) => Ok(()), 1241 1243 Err(CompactionError::ActiveFileCannotBeCompacted) => Ok(()), 1244 + Err(CompactionError::Io(e)) if e.kind() == std::io::ErrorKind::NotFound => Ok(()), 1242 1245 Err(e) => Err(OpError::CompactFile(format!("{fid}: {e}"))), 1243 1246 }) 1244 1247 } ··· 1255 1258 value_seed, 1256 1259 } => { 1257 1260 let record_bytes = make_record_bytes(*value_seed, workload.size_distribution); 1258 - let record_cid = shared 1259 - .store 1260 - .put(&record_bytes) 1261 - .await 1262 - .map_err(|e| OpError::PutRecord(e.to_string()))?; 1261 + let record_cid = hash_to_cid(&record_bytes); 1263 1262 let record_cid_bytes = try_cid_to_fixed(&record_cid)?; 1264 1263 1265 1264 let mut state = shared.write.lock().await; 1266 - let outcome = add_record_inner( 1265 + let (new_root, applied) = add_record_atomic( 1267 1266 &shared.store, 1268 1267 state.root, 1269 1268 collection, 1270 1269 rkey, 1271 1270 record_cid, 1272 1271 record_cid_bytes, 1272 + record_bytes, 1273 1273 ) 1274 - .await; 1275 - match outcome { 1276 - Ok((new_root, applied)) => { 1277 - state.root = Some(new_root); 1278 - if applied { 1279 - state 1280 - .oracle 1281 - .add(collection.clone(), rkey.clone(), record_cid_bytes); 1282 - } 1283 - Ok(()) 1284 - } 1285 - Err(e) => { 1286 - drop(state); 1287 - if let Err(cleanup) = 1288 - decrement_obsolete(&shared.store, vec![record_cid_bytes]).await 1289 - { 1290 - tracing::warn!( 1291 - op_error = %e, 1292 - cleanup_error = %cleanup, 1293 - "AddRecord concurrent cleanup decrement failed", 1294 - ); 1295 - } 1296 - Err(e) 1297 - } 1274 + .await?; 1275 + state.root = Some(new_root); 1276 + if applied { 1277 + state 1278 + .oracle 1279 + .add(collection.clone(), rkey.clone(), record_cid_bytes); 1298 1280 } 1281 + Ok(()) 1299 1282 } 1300 1283 Op::DeleteRecord { collection, rkey } => { 1301 1284 let mut state = shared.write.lock().await; ··· 1305 1288 if !state.oracle.contains_record(collection, rkey) { 1306 1289 return Ok(()); 1307 1290 } 1308 - let key = format!("{}/{}", collection.0, rkey.0); 1309 - let loaded = Mst::load(shared.store.clone(), old_root, None); 1310 - let updated = loaded 1311 - .delete(&key) 1312 - .await 1313 - .map_err(|e| OpError::MstDelete(e.to_string()))?; 1314 - let new_root = updated 1315 - .persist() 1316 - .await 1317 - .map_err(|e| OpError::MstPersist(e.to_string()))?; 1318 - apply_mst_diff(&shared.store, old_root, new_root).await?; 1291 + let new_root = 1292 + delete_record_atomic(&shared.store, old_root, collection, rkey).await?; 1319 1293 state.oracle.delete(collection, rkey); 1320 1294 state.root = Some(new_root); 1321 1295 Ok(())
+4 -1
crates/tranquil-store/src/gauntlet/scenarios.rs
··· 523 523 fn sim_store() -> StoreConfig { 524 524 StoreConfig { 525 525 max_file_size: MaxFileSize(16 * 1024), 526 - group_commit: GroupCommitConfig::default(), 526 + group_commit: GroupCommitConfig { 527 + verify_persisted_blocks: true, 528 + ..GroupCommitConfig::default() 529 + }, 527 530 shard_count: ShardCount(1), 528 531 } 529 532 }
+128
crates/tranquil-store/tests/verify_rollback_orphan.rs
··· 1 + mod common; 2 + 3 + use std::sync::Arc; 4 + 5 + use tranquil_store::OpenOptions; 6 + use tranquil_store::RealIO; 7 + use tranquil_store::StorageIO; 8 + use tranquil_store::blockstore::BlockLength; 9 + use tranquil_store::blockstore::{ 10 + BlockLocation, BlockOffset, BlockStoreConfig, DataFileId, DataFileManager, DataFileWriter, 11 + GroupCommitConfig, HintFileWriter, HintOffset, TranquilBlockStore, hint_file_path, 12 + }; 13 + 14 + use common::{test_cid, with_runtime}; 15 + 16 + fn fresh_store_dir() -> (tempfile::TempDir, BlockStoreConfig) { 17 + let dir = tempfile::TempDir::new().unwrap(); 18 + let data_dir = dir.path().join("data"); 19 + let index_dir = dir.path().join("index"); 20 + std::fs::create_dir_all(&data_dir).unwrap(); 21 + std::fs::create_dir_all(&index_dir).unwrap(); 22 + let config = BlockStoreConfig { 23 + data_dir, 24 + index_dir, 25 + max_file_size: 8192, 26 + group_commit: GroupCommitConfig::default(), 27 + shard_count: 1, 28 + }; 29 + (dir, config) 30 + } 31 + 32 + fn hint_file_size(path: &std::path::Path) -> u64 { 33 + let io = RealIO::new(); 34 + let fd = io.open(path, OpenOptions::read_write()).unwrap(); 35 + let size = io.file_size(fd).unwrap(); 36 + let _ = io.close(fd); 37 + size 38 + } 39 + 40 + #[test] 41 + fn rollback_rotation_does_not_leave_orphan_data_file() { 42 + with_runtime(|| { 43 + let (_dir, config) = fresh_store_dir(); 44 + let data_dir = config.data_dir.clone(); 45 + 46 + { 47 + let store = TranquilBlockStore::open(config.clone()).unwrap(); 48 + store 49 + .put_blocks_blocking(vec![(test_cid(1), vec![0x11; 64])]) 50 + .unwrap(); 51 + drop(store); 52 + } 53 + 54 + let orphan_cid = test_cid(99_999); 55 + { 56 + let io: Arc<RealIO> = Arc::new(RealIO::new()); 57 + let manager = DataFileManager::new(Arc::clone(&io), data_dir.clone(), 4096); 58 + let (next_id, next_fd) = manager.prepare_rotation(DataFileId::new(0)).unwrap(); 59 + manager.commit_rotation(next_id, next_fd); 60 + 61 + let mut writer = DataFileWriter::new(&*io, next_fd, next_id).unwrap(); 62 + let _ = writer.append_block(&orphan_cid, &vec![0xAB; 256]).unwrap(); 63 + writer.sync().unwrap(); 64 + io.sync_dir(&data_dir).unwrap(); 65 + 66 + let _ = io.delete(&hint_file_path(&data_dir, next_id)); 67 + manager.rollback_rotation(next_id, next_fd); 68 + } 69 + 70 + let store = TranquilBlockStore::open(config).unwrap(); 71 + assert!( 72 + store.get_block_sync(&orphan_cid).unwrap().is_none(), 73 + "rollback_rotation must delete the uncommitted data file; otherwise recovery's \ 74 + backup-restore branch resurrects rejected blocks" 75 + ); 76 + }); 77 + } 78 + 79 + #[test] 80 + fn truncated_old_hint_drops_rejected_entry_on_reopen() { 81 + with_runtime(|| { 82 + let (_dir, config) = fresh_store_dir(); 83 + let data_dir = config.data_dir.clone(); 84 + let old_file_id = DataFileId::new(0); 85 + let old_hint_path = hint_file_path(&data_dir, old_file_id); 86 + 87 + let keep_cid = test_cid(1); 88 + { 89 + let store = TranquilBlockStore::open(config.clone()).unwrap(); 90 + store 91 + .put_blocks_blocking(vec![(keep_cid, vec![0x11; 64])]) 92 + .unwrap(); 93 + drop(store); 94 + } 95 + 96 + let hint_len_before = hint_file_size(&old_hint_path); 97 + let rejected_cid = test_cid(42_424); 98 + { 99 + let io: Arc<RealIO> = Arc::new(RealIO::new()); 100 + let fd = io.open(&old_hint_path, OpenOptions::read_write()).unwrap(); 101 + let mut writer = HintFileWriter::resume(&*io, fd, HintOffset::new(hint_len_before)); 102 + writer 103 + .append_hint( 104 + &rejected_cid, 105 + &BlockLocation { 106 + file_id: old_file_id, 107 + offset: BlockOffset::new(4096), 108 + length: BlockLength::new(64), 109 + }, 110 + ) 111 + .unwrap(); 112 + writer.sync().unwrap(); 113 + io.truncate(fd, hint_len_before).unwrap(); 114 + io.sync(fd).unwrap(); 115 + let _ = io.close(fd); 116 + } 117 + 118 + let store = TranquilBlockStore::open(config).unwrap(); 119 + assert!( 120 + store.get_block_sync(&rejected_cid).unwrap().is_none(), 121 + "after rollback_batch truncates state.hint_fd, the rejected hint is gone and reopen is clean" 122 + ); 123 + assert!( 124 + store.get_block_sync(&keep_cid).unwrap().is_some(), 125 + "legitimate pre-batch block remains readable after rollback" 126 + ); 127 + }); 128 + }