Our Personal Data Server from scratch!

feat(tranquil-store): eventlog

Lewis <lu5a@proton.me>: May this revision serve well!

+8423 -10
+14
Cargo.lock
···
 checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"

 [[package]]
+name = "memmap2"
+version = "0.9.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3"
+dependencies = [
+ "libc",
+]
+
+[[package]]
 name = "metrics"
 version = "0.24.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 dependencies = [
  "async-trait",
  "bytes",
+ "chrono",
  "cid",
  "fjall",
  "flume 0.11.1",
···
  "rand 0.8.5",
  "serde",
  "serde_ipld_dagcbor",
+ "serde_json",
  "sha2",
  "sqlx",
  "tempfile",
+ "thiserror 2.0.18",
  "tokio",
  "tracing",
+ "tranquil-db-traits",
  "tranquil-repo",
+ "tranquil-types",
  "xxhash-rust",
 ]
+21
crates/tranquil-store/Cargo.toml
···
 bytes = "1"
 memmap2 = "0.9"
 tracing = { workspace = true }
+chrono = { workspace = true }
+serde_json = { workspace = true }
+thiserror = { workspace = true }
+tranquil-db-traits = { workspace = true }
+tranquil-types = { workspace = true }
 jacquard-repo = { workspace = true }
 cid = { workspace = true }
 multihash = { workspace = true }
···
 [[bench]]
 name = "blockstore"
 harness = false
+
+[[bench]]
+name = "eventlog"
+harness = false
+
+[[bench]]
+name = "metastore"
+harness = false
+
+[[bench]]
+name = "metastore_scale"
+harness = false
+
+[[bench]]
+name = "profile_reads"
+harness = false
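Each of the new [[bench]] targets sets harness = false, so Cargo builds it as a plain binary with its own main rather than wiring it into the libtest harness. An individual benchmark can then be run with the standard Cargo invocation, e.g. `cargo bench -p tranquil-store --bench eventlog` (package name inferred from the crate path above).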
+959
crates/tranquil-store/benches/eventlog.rs
use std::path::Path;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::time::{Duration, Instant};

use chrono::Utc;
use futures::StreamExt;
use tranquil_db_traits::{RepoEventType, SequenceNumber, SequencedEvent};
use tranquil_types::Did;

use tranquil_store::RealIO;
use tranquil_store::eventlog::{EventLog, EventLogConfig, EventSequence};

fn make_did(index: usize) -> Did {
    let suffix: String = format!("{index:024x}");
    Did::new(format!("did:plc:{suffix}")).unwrap()
}

fn make_event(index: usize) -> SequencedEvent {
    let ops_size = match index % 4 {
        0 => 64,
        1 => 256,
        2 => 1024,
        _ => 4096,
    };

    let ops_payload: String = (0..ops_size)
        .map(|i| ((index.wrapping_mul(31).wrapping_add(i)) % 26 + 97) as u8 as char)
        .collect();

    SequencedEvent {
        seq: SequenceNumber::from_raw(i64::try_from(index + 1).expect("event index overflow")),
        did: make_did(index % 10_000),
        created_at: Utc::now(),
        event_type: match index % 4 {
            0 => RepoEventType::Commit,
            1 => RepoEventType::Identity,
            2 => RepoEventType::Account,
            _ => RepoEventType::Sync,
        },
        commit_cid: None,
        prev_cid: None,
        prev_data_cid: None,
        ops: Some(serde_json::json!({ "data": ops_payload })),
        blobs: None,
        blocks_cids: None,
        handle: None,
        active: None,
        status: None,
        rev: None,
    }
}

fn estimated_payload_size(event: &SequencedEvent) -> usize {
    tranquil_store::eventlog::encode_payload(event).len()
}

struct LatencyStats {
    p50: Duration,
    p95: Duration,
    p99: Duration,
    max: Duration,
    mean: Duration,
}

fn compute_stats(durations: &mut [Duration]) -> Option<LatencyStats> {
    if durations.is_empty() {
        return None;
    }
    durations.sort();
    let len = durations.len();
    let sum: Duration = durations.iter().sum();
    let divisor = u32::try_from(len).unwrap_or(u32::MAX);
    let last = len - 1;
    Some(LatencyStats {
        p50: durations[last * 50 / 100],
        p95: durations[last * 95 / 100],
        p99: durations[last * 99 / 100],
        max: durations[last],
        mean: sum / divisor,
    })
}

fn format_latency(stats: Option<&LatencyStats>) -> String {
    match stats {
        Some(s) => format!(
            " | p50={:?} p95={:?} p99={:?} max={:?} mean={:?}",
            s.p50, s.p95, s.p99, s.max, s.mean
        ),
        None => String::new(),
    }
}

fn open_eventlog(dir: &Path) -> EventLog<RealIO> {
    let segments_dir = dir.join("segments");
    std::fs::create_dir_all(&segments_dir).unwrap();
    EventLog::open(
        EventLogConfig {
            segments_dir,
            ..EventLogConfig::default()
        },
        RealIO::new(),
    )
    .unwrap()
}

fn bench_sequential_append(event_count: usize) {
    println!("-- sequential append: {event_count} events --");
    let dir = tempfile::TempDir::new().unwrap();
    let log = open_eventlog(dir.path());

    let events: Vec<SequencedEvent> = (0..event_count).map(make_event).collect();
    let total_bytes: usize = events.iter().map(estimated_payload_size).sum();
    let mut latencies = Vec::with_capacity(event_count);

    let start = Instant::now();
    events.iter().enumerate().for_each(|(i, event)| {
        let t = Instant::now();
        log.append_event(&make_did(i % 10_000), RepoEventType::Commit, event)
            .unwrap();
        if (i + 1) % 256 == 0 {
            log.sync().unwrap();
        }
        latencies.push(t.elapsed());
    });
    log.sync().unwrap();
    let elapsed = start.elapsed();

    let lat = format_latency(compute_stats(&mut latencies).as_ref());
    println!(
        "{:.0} events/sec, {:.1} MB/sec, {:.1}ms{lat}",
        event_count as f64 / elapsed.as_secs_f64(),
        total_bytes as f64 / elapsed.as_secs_f64() / (1024.0 * 1024.0),
        elapsed.as_secs_f64() * 1000.0,
    );
    let _ = log.shutdown();
}

fn bench_concurrent_producers(event_count: usize, producers: usize) {
    println!("-- {producers} concurrent producers, {event_count} total events --");
    let dir = tempfile::TempDir::new().unwrap();
    let log = Arc::new(open_eventlog(dir.path()));

    let events_per_producer = event_count / producers;
    let actual_count = events_per_producer * producers;
    let avg_payload: usize = (0..4)
        .map(|i| estimated_payload_size(&make_event(i)))
        .sum::<usize>()
        / 4;

    let start = Instant::now();

    let handles: Vec<_> = (0..producers)
        .map(|pid| {
            let log = Arc::clone(&log);
            std::thread::spawn(move || {
                let mut latencies = Vec::with_capacity(events_per_producer);
                (0..events_per_producer).for_each(|i| {
                    let global = pid * events_per_producer + i;
                    let event = make_event(global);
                    let t = Instant::now();
                    log.append_and_sync(&make_did(global % 10_000), RepoEventType::Commit, &event)
                        .unwrap();
                    latencies.push(t.elapsed());
                });
                latencies
            })
        })
        .collect();

    let mut all_latencies: Vec<Duration> = handles
        .into_iter()
        .flat_map(|h| h.join().unwrap())
        .collect();
    let elapsed = start.elapsed();

    let total_bytes = actual_count * avg_payload;
    let lat = format_latency(compute_stats(&mut all_latencies).as_ref());
    println!(
        "{:.0} events/sec, {:.1} MB/sec, {:.1}ms{lat}",
        actual_count as f64 / elapsed.as_secs_f64(),
        total_bytes as f64 / elapsed.as_secs_f64() / (1024.0 * 1024.0),
        elapsed.as_secs_f64() * 1000.0,
    );
    let _ = log.shutdown();
}

fn bench_batch_append(event_count: usize, batch_size: usize) {
    println!("-- batch append: {event_count} events, batch_size={batch_size} --");
    let dir = tempfile::TempDir::new().unwrap();
    let log = open_eventlog(dir.path());

    let events: Vec<SequencedEvent> = (0..event_count).map(make_event).collect();
    let dids: Vec<Did> = (0..event_count).map(|i| make_did(i % 10_000)).collect();
    let total_bytes: usize = events.iter().map(estimated_payload_size).sum();
    let mut batch_latencies = Vec::with_capacity(event_count / batch_size + 1);

    let start = Instant::now();
    events
        .chunks(batch_size)
        .enumerate()
        .for_each(|(chunk_idx, chunk)| {
            let base = chunk_idx * batch_size;
            let batch: Vec<(&Did, RepoEventType, &SequencedEvent)> = chunk
                .iter()
                .enumerate()
                .map(|(j, event)| (&dids[base + j], RepoEventType::Commit, event))
                .collect();
            let t = Instant::now();
            log.append_batch(batch).unwrap();
            log.sync().unwrap();
            batch_latencies.push(t.elapsed());
        });
    let elapsed = start.elapsed();

    let lat = format_latency(compute_stats(&mut batch_latencies).as_ref());
    println!(
        "{:.0} events/sec, {:.1} MB/sec, {:.1}ms{lat}",
        event_count as f64 / elapsed.as_secs_f64(),
        total_bytes as f64 / elapsed.as_secs_f64() / (1024.0 * 1024.0),
        elapsed.as_secs_f64() * 1000.0,
    );
    let _ = log.shutdown();
}

fn bench_rotation_under_load(event_count: usize) {
    println!("-- rotation: {event_count} events, 256KB segments --");
    let dir = tempfile::TempDir::new().unwrap();
    let segments_dir = dir.path().join("segments");
    std::fs::create_dir_all(&segments_dir).unwrap();
    let log = EventLog::open(
        EventLogConfig {
            segments_dir,
            max_segment_size: 256 * 1024,
            ..EventLogConfig::default()
        },
        RealIO::new(),
    )
    .unwrap();

    let events: Vec<SequencedEvent> = (0..event_count).map(make_event).collect();
    let append_latencies = Vec::with_capacity(event_count);
    let rotation_latencies = Vec::new();

    let start = Instant::now();
    let (mut append_latencies, mut rotation_latencies, _) = events.iter().enumerate().fold(
        (append_latencies, rotation_latencies, false),
        |(mut appends, mut rotations, fd_limited), (i, event)| {
            let t = Instant::now();
            log.append_and_sync(&make_did(i % 10_000), RepoEventType::Commit, event)
                .unwrap();
            appends.push(t.elapsed());

            match fd_limited {
                true => (appends, rotations, true),
                false => {
                    let rt = Instant::now();
                    match log.maybe_rotate() {
                        Ok(true) => {
                            rotations.push(rt.elapsed());
                            (appends, rotations, false)
                        }
                        Ok(false) => (appends, rotations, false),
                        Err(e) => {
                            println!("fd limit hit at {} segments: {e}", log.segment_count());
                            (appends, rotations, true)
                        }
                    }
                }
            }
        },
    );
    let elapsed = start.elapsed();

    let append_lat = format_latency(compute_stats(&mut append_latencies).as_ref());
    let rotation_lat = format_latency(compute_stats(&mut rotation_latencies).as_ref());
    println!(
        "{:.0} events/sec, {} segments, {} rotations, {:.1}ms",
        event_count as f64 / elapsed.as_secs_f64(),
        log.segment_count(),
        rotation_latencies.len(),
        elapsed.as_secs_f64() * 1000.0,
    );
    println!("append{append_lat}");
    println!("rotation{rotation_lat}");
    let _ = log.shutdown();
}

fn scan_all_events(log: &EventLog<RealIO>, batch_size: usize) -> usize {
    scan_all_events_from(log, EventSequence::BEFORE_ALL, batch_size, 0)
}

fn scan_all_events_from(
    log: &EventLog<RealIO>,
    cursor: EventSequence,
    batch_size: usize,
    accumulated: usize,
) -> usize {
    let batch = log.get_events_since(cursor, batch_size).unwrap();
    match batch.last() {
        None => accumulated,
        Some(last) => {
            let next_cursor = EventSequence::new(u64::try_from(last.seq.as_i64()).unwrap());
            scan_all_events_from(log, next_cursor, batch_size, accumulated + batch.len())
        }
    }
}

fn bench_sequential_scan(event_count: usize) {
    println!("-- sequential scan: {event_count} events --");
    let dir = tempfile::TempDir::new().unwrap();
    let log = open_eventlog(dir.path());

    let events: Vec<SequencedEvent> = (0..event_count).map(make_event).collect();
    let total_bytes: usize = events.iter().map(estimated_payload_size).sum();

    events.iter().enumerate().for_each(|(i, event)| {
        log.append_event(&make_did(i % 10_000), RepoEventType::Commit, event)
            .unwrap();
    });
    log.sync().unwrap();

    let start = Instant::now();
    let read_count = scan_all_events(&log, 4096);
    let elapsed = start.elapsed();

    println!(
        "{:.0} events/sec, {:.1} MB/sec, {read_count} events, {:.1}ms",
        read_count as f64 / elapsed.as_secs_f64(),
        total_bytes as f64 / elapsed.as_secs_f64() / (1024.0 * 1024.0),
        elapsed.as_secs_f64() * 1000.0,
    );
    let _ = log.shutdown();
}

fn bench_parallel_readers(event_count: usize, readers: usize) {
    println!("-- parallel readers: {event_count} events, {readers} readers --");
    let dir = tempfile::TempDir::new().unwrap();
    let log = Arc::new(open_eventlog(dir.path()));

    let events: Vec<SequencedEvent> = (0..event_count).map(make_event).collect();

    events.iter().enumerate().for_each(|(i, event)| {
        log.append_event(&make_did(i % 10_000), RepoEventType::Commit, event)
            .unwrap();
    });
    log.sync().unwrap();

    let total_read = Arc::new(AtomicU64::new(0));
    let batch_size = 4096;

    let start = Instant::now();

    let handles: Vec<_> = (0..readers)
        .map(|_| {
            let log = Arc::clone(&log);
            let total_read = Arc::clone(&total_read);
            std::thread::spawn(move || {
                let count = scan_all_events(&log, batch_size) as u64;
                total_read.fetch_add(count, Ordering::Relaxed);
            })
        })
        .collect();

    handles.into_iter().for_each(|h| h.join().unwrap());
    let elapsed = start.elapsed();

    let total = total_read.load(Ordering::Relaxed);
    let avg_payload: usize = (0..4)
        .map(|i| estimated_payload_size(&make_event(i)))
        .sum::<usize>()
        / 4;
    println!(
        "{:.0} total events/sec across {readers} readers ({:.0} per reader)",
        total as f64 / elapsed.as_secs_f64(),
        (total as f64 / readers as f64) / elapsed.as_secs_f64(),
    );
    println!(
        "aggregate {:.1} MB/sec, {:.1}ms",
        (total as f64 * avg_payload as f64) / elapsed.as_secs_f64() / (1024.0 * 1024.0),
        elapsed.as_secs_f64() * 1000.0,
    );
    let _ = log.shutdown();
}

fn bench_stampede(event_count: usize, producers: usize, readers: usize, subscribers: usize) {
    println!(
        "-- stampede: {event_count} events, {producers} producers, {readers} readers, {subscribers} subscribers --"
    );
    let dir = tempfile::TempDir::new().unwrap();
    let log = Arc::new(open_eventlog(dir.path()));

    let events_per_producer = event_count / producers;
    let actual_events = events_per_producer * producers;

    let writes_done = Arc::new(AtomicBool::new(false));
    let total_written = Arc::new(AtomicU64::new(0));
    let total_read = Arc::new(AtomicU64::new(0));
    let total_subscribed = Arc::new(AtomicU64::new(0));

    let rt = tokio::runtime::Builder::new_multi_thread()
        .worker_threads(4)
        .enable_all()
        .build()
        .unwrap();

    let start = Instant::now();

    let subscriber_handles: Vec<_> = (0..subscribers)
        .map(|_| {
            let log = Arc::clone(&log);
            let writes_done = Arc::clone(&writes_done);
            let total_subscribed = Arc::clone(&total_subscribed);
            let total_written = Arc::clone(&total_written);
            rt.spawn(async move {
                let mut sub = log.subscriber(EventSequence::BEFORE_ALL);
                let mut count = 0u64;
                loop {
                    match tokio::time::timeout(Duration::from_millis(100), sub.next()).await {
                        Ok(Some(_)) => {
                            count += 1;
                        }
                        Ok(None) => break,
                        Err(_) => {
                            if writes_done.load(Ordering::Acquire) {
                                let written = total_written.load(Ordering::Acquire);
                                if count >= written {
                                    break;
                                }
                                match tokio::time::timeout(Duration::from_secs(2), sub.next()).await
                                {
                                    Ok(Some(_)) => count += 1,
                                    _ => break,
                                }
                            }
                        }
                    }
                }
                total_subscribed.fetch_add(count, Ordering::Relaxed);
            })
        })
        .collect();

    let writer_handles: Vec<_> = (0..producers)
        .map(|pid| {
            let log = Arc::clone(&log);
            let total_written = Arc::clone(&total_written);
            std::thread::spawn(move || {
                let mut latencies = Vec::with_capacity(events_per_producer);
                (0..events_per_producer).for_each(|i| {
                    let global = pid * events_per_producer + i;
                    let event = make_event(global);
                    let t = Instant::now();
                    log.append_and_sync(&make_did(global % 10_000), RepoEventType::Commit, &event)
                        .unwrap();
                    latencies.push(t.elapsed());
                    total_written.fetch_add(1, Ordering::Release);
                });
                latencies
            })
        })
        .collect();

    let reader_handles: Vec<_> = (0..readers)
        .map(|_| {
            let log = Arc::clone(&log);
            let writes_done = Arc::clone(&writes_done);
            let total_read = Arc::clone(&total_read);
            std::thread::spawn(move || {
                let mut cursor = EventSequence::BEFORE_ALL;
                let mut count = 0u64;
                loop {
                    let batch = log.get_events_since(cursor, 1024).unwrap();
                    match batch.last() {
                        Some(last) => {
                            count += batch.len() as u64;
                            cursor = EventSequence::new(u64::try_from(last.seq.as_i64()).unwrap());
                        }
                        None if writes_done.load(Ordering::Acquire) => {
                            let final_batch = log.get_events_since(cursor, 1024).unwrap();
                            match final_batch.last() {
                                Some(last) => {
                                    count += final_batch.len() as u64;
                                    cursor = EventSequence::new(
                                        u64::try_from(last.seq.as_i64()).unwrap(),
                                    );
                                }
                                None => break,
                            }
                        }
                        None => {
                            std::thread::yield_now();
                        }
                    }
                }
                total_read.fetch_add(count, Ordering::Relaxed);
            })
        })
        .collect();

    let mut write_latencies: Vec<Duration> = writer_handles
        .into_iter()
        .flat_map(|h| h.join().unwrap())
        .collect();
    let write_elapsed = start.elapsed();

    writes_done.store(true, Ordering::Release);

    reader_handles.into_iter().for_each(|h| h.join().unwrap());
    let read_elapsed = start.elapsed();

    rt.block_on(async {
        let _ = tokio::time::timeout(
            Duration::from_secs(10),
            futures::future::join_all(subscriber_handles),
        )
        .await;
    });
    let total_elapsed = start.elapsed();

    let reads = total_read.load(Ordering::Relaxed);
    let subscribed = total_subscribed.load(Ordering::Relaxed);

    let write_lat = format_latency(compute_stats(&mut write_latencies).as_ref());
    println!(
        "writes: {:.0} events/sec, {actual_events} events, {:.1}ms{write_lat}",
        actual_events as f64 / write_elapsed.as_secs_f64(),
        write_elapsed.as_secs_f64() * 1000.0,
    );
    println!(
        "reads: {:.0} events/sec, {readers} readers, {reads} events, {:.1}ms",
        reads as f64 / read_elapsed.as_secs_f64(),
        read_elapsed.as_secs_f64() * 1000.0,
    );
    println!(
        "subscribers: {subscribed} events across {subscribers} subscribers, {:.1}ms",
        total_elapsed.as_secs_f64() * 1000.0,
    );
    println!("segments: {}", log.segment_count());
    let _ = log.shutdown();
}

fn bench_broadcast_fanout(subscriber_count: usize) {
    println!("-- broadcast fanout: 10000 events, {subscriber_count} subscribers --");
    let dir = tempfile::TempDir::new().unwrap();
    let log = Arc::new(open_eventlog(dir.path()));

    let event_count = 10_000usize;

    let rt = tokio::runtime::Builder::new_multi_thread()
        .worker_threads(
            std::thread::available_parallelism()
                .map(|n| n.get())
                .unwrap_or(8),
        )
        .enable_all()
        .build()
        .unwrap();

    rt.block_on(async {
        let received_counts: Arc<Vec<AtomicU64>> =
            Arc::new((0..subscriber_count).map(|_| AtomicU64::new(0)).collect());

        let sub_handles: Vec<_> = (0..subscriber_count)
            .map(|sub_id| {
                let mut subscriber = log.subscriber(EventSequence::BEFORE_ALL);
                let received_counts = Arc::clone(&received_counts);
                tokio::spawn(async move {
                    let mut count = 0u64;
                    while count < event_count as u64 {
                        match tokio::time::timeout(Duration::from_secs(10), subscriber.next()).await
                        {
                            Ok(Some(_)) => count += 1,
                            _ => break,
                        }
                    }
                    received_counts[sub_id].store(count, Ordering::Relaxed);
                })
            })
            .collect();

        let log_writer = Arc::clone(&log);
        let write_handle = tokio::task::spawn_blocking(move || {
            let mut latencies = Vec::with_capacity(event_count);
            (0..event_count).for_each(|i| {
                let event = make_event(i);
                let t = Instant::now();
                log_writer
                    .append_and_sync(&make_did(i % 10_000), RepoEventType::Commit, &event)
                    .unwrap();
                latencies.push(t.elapsed());
            });
            latencies
        });

        let mut write_latencies = write_handle.await.unwrap();

        let _ = tokio::time::timeout(
            Duration::from_secs(30),
            futures::future::join_all(sub_handles),
        )
        .await;

        let total_received: u64 = received_counts
            .iter()
            .map(|c| c.load(Ordering::Relaxed))
            .sum();
        let min_received = received_counts
            .iter()
            .map(|c| c.load(Ordering::Relaxed))
            .min()
            .unwrap_or(0);

        let write_lat = format_latency(compute_stats(&mut write_latencies).as_ref());
        println!("write{write_lat}");
        println!(
            "total received: {total_received}/{}, min per sub: {min_received}/{event_count}",
            event_count as u64 * subscriber_count as u64,
        );
    });

    let _ = log.shutdown();
}

async fn bench_pg_write_throughput(event_count: usize, concurrency: usize) {
    let database_url = match std::env::var("DATABASE_URL") {
        Ok(url) => url,
        Err(_) => {
            println!("skipped, set DATABASE_URL to enable");
            return;
        }
    };

    let max_conns = u32::try_from(concurrency)
        .unwrap_or(u32::MAX)
        .saturating_add(10);
    let pool = sqlx::postgres::PgPoolOptions::new()
        .max_connections(max_conns)
        .acquire_timeout(Duration::from_secs(30))
        .connect(&database_url)
        .await
        .unwrap();

    sqlx::query(
        "CREATE TABLE IF NOT EXISTS bench_repo_seq (
            seq BIGSERIAL PRIMARY KEY,
            did TEXT NOT NULL,
            created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
            event_type TEXT NOT NULL,
            ops JSONB
        )",
    )
    .execute(&pool)
    .await
    .unwrap();
    sqlx::query("TRUNCATE bench_repo_seq")
        .execute(&pool)
        .await
        .unwrap();

    let events_per_task = event_count / concurrency;
    let actual_count = events_per_task * concurrency;

    let start = Instant::now();

    let handles: Vec<_> = (0..concurrency)
        .map(|task_id| {
            let pool = pool.clone();
            tokio::spawn(async move {
                futures::stream::iter(0..events_per_task)
                    .then(|i| {
                        let pool = pool.clone();
                        async move {
                            let global = task_id * events_per_task + i;
                            let did = format!("did:plc:{global:024x}");
                            let ops_size = match global % 4 {
                                0 => 64,
                                1 => 256,
                                2 => 1024,
                                _ => 4096,
                            };
                            let payload: String = (0..ops_size)
                                .map(|j| {
                                    ((global.wrapping_mul(31).wrapping_add(j)) % 26 + 97) as u8
                                        as char
                                })
                                .collect();
                            let ops = serde_json::json!({ "data": payload });
                            let t = Instant::now();
                            sqlx::query(
                                "INSERT INTO bench_repo_seq (did, event_type, ops) VALUES ($1, $2, $3)",
                            )
                            .bind(&did)
                            .bind("commit")
                            .bind(&ops)
                            .execute(&pool)
                            .await
                            .unwrap();
                            t.elapsed()
                        }
                    })
                    .collect::<Vec<Duration>>()
                    .await
            })
        })
        .collect();

    let mut all_latencies: Vec<Duration> = futures::future::join_all(handles)
        .await
        .into_iter()
        .flat_map(Result::unwrap)
        .collect();
    let elapsed = start.elapsed();

    let lat = format_latency(compute_stats(&mut all_latencies).as_ref());
    println!(
        "{:.0} events/sec, {:.1}ms{lat}",
        actual_count as f64 / elapsed.as_secs_f64(),
        elapsed.as_secs_f64() * 1000.0,
    );

    sqlx::query("TRUNCATE bench_repo_seq")
        .execute(&pool)
        .await
        .unwrap();
    pool.close().await;
}

async fn bench_pg_read_throughput(event_count: usize, concurrency: usize) {
    let database_url = match std::env::var("DATABASE_URL") {
        Ok(url) => url,
        Err(_) => {
            println!("skipped, set DATABASE_URL to enable");
            return;
        }
    };

    let max_conns = u32::try_from(concurrency)
        .unwrap_or(u32::MAX)
        .saturating_add(5);
    let pool = sqlx::postgres::PgPoolOptions::new()
        .max_connections(max_conns)
        .connect(&database_url)
        .await
        .unwrap();

    sqlx::query(
        "CREATE TABLE IF NOT EXISTS bench_repo_seq (
            seq BIGSERIAL PRIMARY KEY,
            did TEXT NOT NULL,
            created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
            event_type TEXT NOT NULL,
            ops JSONB
        )",
    )
    .execute(&pool)
    .await
    .unwrap();

    let row: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM bench_repo_seq")
        .fetch_one(&pool)
        .await
        .unwrap();
    if (row.0 as usize) < event_count {
        sqlx::query("TRUNCATE bench_repo_seq")
            .execute(&pool)
            .await
            .unwrap();
        println!("populating {event_count} events");
        futures::stream::iter(0..event_count)
            .map(|i| {
                let pool = pool.clone();
                async move {
                    let did = format!("did:plc:{i:024x}");
                    let ops = serde_json::json!({ "data": "x".repeat(256) });
                    sqlx::query(
                        "INSERT INTO bench_repo_seq (did, event_type, ops) VALUES ($1, $2, $3)",
                    )
                    .bind(&did)
                    .bind("commit")
                    .bind(&ops)
                    .execute(&pool)
                    .await
                    .unwrap();
                }
            })
            .buffer_unordered(50)
            .collect::<Vec<()>>()
            .await;
    }

    let total_events = Arc::new(AtomicU64::new(0));

    let start = Instant::now();

    let handles: Vec<_> = (0..concurrency)
        .map(|_| {
            let pool = pool.clone();
            let total_events = Arc::clone(&total_events);
            tokio::spawn(async move {
                let mut cursor = 0i64;
                let mut count = 0u64;
                loop {
                    let rows: Vec<(i64,)> = sqlx::query_as(
                        "SELECT seq FROM bench_repo_seq WHERE seq > $1 ORDER BY seq LIMIT $2",
                    )
                    .bind(cursor)
                    .bind(1000i64)
                    .fetch_all(&pool)
                    .await
                    .unwrap();
                    if rows.is_empty() {
                        break;
                    }
                    count += rows.len() as u64;
                    cursor = rows.last().unwrap().0;
                }
                total_events.fetch_add(count, Ordering::Relaxed);
            })
        })
        .collect();

    futures::future::join_all(handles).await;
    let elapsed = start.elapsed();

    let total = total_events.load(Ordering::Relaxed);
    println!(
        "{:.0} total events/sec across {concurrency} readers, {total} events, {:.1}ms",
        total as f64 / elapsed.as_secs_f64(),
        elapsed.as_secs_f64() * 1000.0,
    );

    pool.close().await;
}

fn main() {
    println!("-- eventlog benchmarks --");
    let cpus = std::thread::available_parallelism()
        .map(|n| n.get())
        .unwrap_or(8);
    println!("available parallelism: {cpus}");

    let parse_env_list = |var: &str, defaults: Vec<usize>| -> Vec<usize> {
        std::env::var(var).map_or(defaults, |s| {
            s.split(',')
                .map(|n| {
                    n.trim()
                        .replace('_', "")
                        .parse::<usize>()
                        .unwrap_or_else(|e| panic!("{var}: {e}"))
                })
                .collect()
        })
    };

    let event_counts = parse_env_list("BENCH_EVENT_COUNTS", vec![10_000, 100_000]);
    let large_event_counts = parse_env_list("BENCH_LARGE_EVENT_COUNTS", vec![1_000_000]);
    let producer_counts = parse_env_list("BENCH_PRODUCERS", vec![1, 10, 50, 100, 500]);

    let all_write_counts: Vec<usize> = event_counts
        .iter()
        .chain(large_event_counts.iter())
        .copied()
        .collect();

    println!("event counts: {event_counts:?}, large: {large_event_counts:?}");
    println!("producer counts: {producer_counts:?}");

    println!("-- write throughput --");

    all_write_counts.iter().for_each(|&n| {
        bench_sequential_append(n);
    });

    all_write_counts.iter().for_each(|&n| {
        producer_counts.iter().for_each(|&p| {
            if n >= p {
                bench_concurrent_producers(n, p);
            }
        });
    });

    all_write_counts.iter().for_each(|&n| {
        [256usize, 1024, 4096].iter().for_each(|&batch| {
            bench_batch_append(n, batch);
        });
    });

    println!("-- rotation --");

    event_counts.iter().for_each(|&n| {
        bench_rotation_under_load(n);
    });

    println!("-- read throughput --");

    event_counts.iter().for_each(|&n| {
        bench_sequential_scan(n);
    });

    event_counts.iter().for_each(|&n| {
        [2usize, 4, 8, 16, 32].iter().for_each(|&r| {
            bench_parallel_readers(n, r);
        });
    });

    println!("-- broadcast fanout --");

    [1usize, 10, 100, 500, 1000].iter().for_each(|&s| {
        bench_broadcast_fanout(s);
    });

    println!("-- stampede --");

    bench_stampede(100_000, 50, 8, 10);
    bench_stampede(100_000, 100, 16, 50);
    bench_stampede(500_000, 100, 16, 50);

    let rt = tokio::runtime::Builder::new_multi_thread()
        .worker_threads(cpus)
        .enable_all()
        .build()
        .unwrap();

    if std::env::var("DATABASE_URL").is_ok() {
        println!("-- postgres comparison --");

        event_counts.iter().for_each(|&n| {
            producer_counts.iter().for_each(|&p| {
                if n >= p {
                    println!("-- postgres write: {n} events, {p} writers --");
                    rt.block_on(bench_pg_write_throughput(n, p));
                }
            });
        });

        event_counts.iter().for_each(|&n| {
            [1usize, 4, 16, 32].iter().for_each(|&r| {
                println!("-- postgres read: {n} events, {r} readers --");
                rt.block_on(bench_pg_read_throughput(n, r));
            });
        });

        rt.block_on(async {
            let url = std::env::var("DATABASE_URL").unwrap();
            let pool = sqlx::postgres::PgPoolOptions::new()
                .max_connections(5)
                .connect(&url)
                .await
                .unwrap();
            sqlx::query("DROP TABLE IF EXISTS bench_repo_seq")
                .execute(&pool)
                .await
                .unwrap();
            pool.close().await;
        });
    } else {
        println!("set DATABASE_URL for postgres comparison");
    }
}
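The workload is tunable through the environment variables read in main: BENCH_EVENT_COUNTS, BENCH_LARGE_EVENT_COUNTS, and BENCH_PRODUCERS take comma-separated counts with optional underscore separators, e.g. `BENCH_EVENT_COUNTS=50_000,250_000 cargo bench -p tranquil-store --bench eventlog`. Setting DATABASE_URL enables the Postgres comparison runs; those create, truncate, and finally drop a bench_repo_seq table, so they should be pointed at a scratch database.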
+44 -6
crates/tranquil-store/src/blockstore/group_commit.rs
···
 use std::sync::Arc;
 use std::thread;

+use crate::fsync_order::PostBlockstoreHook;
+
+use super::BlocksSynced;
 use crate::io::{FileId, OpenOptions, StorageIO};

 use super::data_file::{CID_SIZE, DataFileWriter};
···
         index: Arc<KeyIndex>,
         config: GroupCommitConfig,
     ) -> Result<Self, CommitError> {
+        Self::spawn_with_hook(manager, index, config, None)
+    }
+
+    pub fn spawn_with_hook<S: StorageIO + 'static>(
+        manager: DataFileManager<S>,
+        index: Arc<KeyIndex>,
+        config: GroupCommitConfig,
+        post_sync_hook: Option<Arc<dyn PostBlockstoreHook>>,
+    ) -> Result<Self, CommitError> {
         let cursor = index.read_write_cursor().map_err(CommitError::from)?;
         let mut state = initialize_active_state(&manager, cursor)?;
···
         let handle = thread::Builder::new()
             .name("blockstore-group-commit".into())
             .spawn(move || {
-                commit_loop(&manager, &*index, &receiver, &config, &mut state);
+                commit_loop(
+                    &manager,
+                    &index,
+                    &receiver,
+                    &config,
+                    &mut state,
+                    post_sync_hook.as_deref(),
+                );
             })
             .map_err(|e| CommitError::from(io::Error::other(e)))?;
···
    receiver: &flume::Receiver<CommitRequest>,
    config: &GroupCommitConfig,
    state: &mut ActiveState,
+    post_sync_hook: Option<&dyn PostBlockstoreHook>,
 ) {
     loop {
         let first = match receiver.recv() {
···

         let result = process_batch(manager, index, &batch, state);

+        if let Ok((ref _dedup, ref proof)) = result {
+            run_post_sync_hook(post_sync_hook, proof);
+        }
+
         if let Err(ref e) = result {
             tracing::warn!(error = %e, "commit batch failed");
         }

-        dispatch_responses(batch, result);
+        dispatch_responses(batch, result.map(|(dedup, _proof)| dedup));

         if shutdown_after {
-            drain_and_process_remaining(manager, index, receiver, state);
+            drain_and_process_remaining(manager, index, receiver, state, post_sync_hook);
             return;
         }
     }
 }

+fn run_post_sync_hook(hook: Option<&dyn PostBlockstoreHook>, proof: &BlocksSynced) {
+    if let Some(hook) = hook
+        && let Err(e) = hook.on_blocks_synced(proof)
+    {
+        tracing::error!(error = %e, "post-blockstore sync hook failed");
+    }
+}
+
 fn drain_and_process_remaining<S: StorageIO>(
     manager: &DataFileManager<S>,
     index: &KeyIndex,
     receiver: &flume::Receiver<CommitRequest>,
     state: &mut ActiveState,
+    post_sync_hook: Option<&dyn PostBlockstoreHook>,
 ) {
     let entries: Vec<BatchEntry> = std::iter::from_fn(|| receiver.try_recv().ok())
         .filter_map(|req| classify_request(req).ok())
···
     }

     let result = process_batch(manager, index, &entries, state);
-    dispatch_responses(entries, result);
+
+    if let Ok((ref _dedup, ref proof)) = result {
+        run_post_sync_hook(post_sync_hook, proof);
+    }
+
+    dispatch_responses(entries, result.map(|(dedup, _proof)| dedup));
 }

 struct RotationState {
···
    index: &KeyIndex,
    batch: &[BatchEntry],
    state: &mut ActiveState,
-) -> Result<HashMap<[u8; CID_SIZE], BlockLocation>, CommitError> {
+) -> Result<(HashMap<[u8; CID_SIZE], BlockLocation>, BlocksSynced), CommitError> {
     let mut dedup: HashMap<[u8; CID_SIZE], BlockLocation> = HashMap::new();
     let mut index_entries: Vec<([u8; CID_SIZE], BlockLocation)> = Vec::new();
     let mut all_decrements: Vec<[u8; CID_SIZE]> = Vec::new();
···
         .batch_put(&index_entries, &all_decrements, cursor)
         .map_err(CommitError::from)?;

-    Ok(dedup)
+    Ok((dedup, BlocksSynced::new()))
 }

 fn dispatch_responses(
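The PostBlockstoreHook trait itself lives in crate::fsync_order and is not part of this diff, but the call sites above pin down its rough shape: object-safe, shared into the group-commit thread (so Send + Sync), invoked with the BlocksSynced proof after a batch commits, and free to fail without failing the batch (errors are only logged). A minimal sketch of a conforming implementation, with the signature assumed from those call sites (io::Error as the error type is a guess):

use std::io;

use crate::blockstore::BlocksSynced;

// Assumed trait shape, reconstructed from the call sites above; the real
// definition in crate::fsync_order may differ, in particular in its error type.
pub trait PostBlockstoreHook: Send + Sync {
    fn on_blocks_synced(&self, proof: &BlocksSynced) -> io::Result<()>;
}

// Hypothetical hook: release deferred eventlog broadcasts once blocks are durable.
struct NotifyEventLog;

impl PostBlockstoreHook for NotifyEventLog {
    fn on_blocks_synced(&self, _proof: &BlocksSynced) -> io::Result<()> {
        // Receiving a &BlocksSynced at all is the interesting part: the token
        // cannot be constructed outside the blockstore module, so control
        // reaching here proves process_batch committed and synced the batch
        // (see the BlocksSynced definition in mod.rs just below).
        Ok(())
    }
}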
+8
crates/tranquil-store/src/blockstore/mod.rs
···

 use crate::io::StorageIO;

+pub struct BlocksSynced(());
+
+impl BlocksSynced {
+    pub(in crate::blockstore) fn new() -> Self {
+        Self(())
+    }
+}
+
 pub(crate) fn list_files_by_extension<S: StorageIO>(
     io: &S,
     dir: &Path,
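BlocksSynced is a zero-sized proof token: the private (()) field plus the pub(in crate::blockstore) constructor mean code outside the blockstore module can name the type but never mint a value, so holding a &BlocksSynced is compile-time evidence that process_batch handed it out after a durable commit. An illustration of the visibility barrier (hypothetical code in a module outside crate::blockstore):

use crate::blockstore::BlocksSynced;

fn forge() -> BlocksSynced {
    // Neither of these lines compiles, which is the point of the pattern:
    //   BlocksSynced(())    // error: field of tuple struct is private
    //   BlocksSynced::new() // error: `new` is pub(in crate::blockstore)
    todo!("a BlocksSynced can only be obtained from the blockstore itself")
}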
+15 -3
crates/tranquil-store/src/blockstore/store.rs
···
 use multihash::Multihash;
 use sha2::{Digest, Sha256};

+use crate::fsync_order::PostBlockstoreHook;
 use crate::io::{OpenOptions, RealIO, StorageIO};

 use super::data_file::{BLOCK_RECORD_OVERHEAD, CID_SIZE, ReadBlockRecord};
···

 impl TranquilBlockStore {
     pub fn open(config: BlockStoreConfig) -> Result<Self, RepoError> {
+        Self::open_with_hook(config, None)
+    }
+
+    pub fn open_with_hook(
+        config: BlockStoreConfig,
+        post_sync_hook: Option<Arc<dyn PostBlockstoreHook>>,
+    ) -> Result<Self, RepoError> {
         if config.data_dir == config.index_dir {
             return Err(RepoError::storage(io::Error::new(
                 io::ErrorKind::InvalidInput,
···

         let manager_for_writer =
             DataFileManager::new(RealIO::new(), config.data_dir.clone(), config.max_file_size);
-        let writer =
-            GroupCommitWriter::spawn(manager_for_writer, Arc::clone(&index), config.group_commit)
-                .map_err(commit_error_to_repo)?;
+        let writer = GroupCommitWriter::spawn_with_hook(
+            manager_for_writer,
+            Arc::clone(&index),
+            config.group_commit,
+            post_sync_hook,
+        )
+        .map_err(commit_error_to_repo)?;
         let sender = writer.sender().clone();

         let manager_for_reader = Arc::new(DataFileManager::new(
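With this plumbing, a caller opts into fsync ordering at store construction, while open keeps its old behavior by delegating with None. A hedged sketch of the wiring, reusing the hypothetical NotifyEventLog hook from the sketch above (the prepared config value is assumed):

use std::sync::Arc;

// Hypothetical wrapper: `config` is a prepared BlockStoreConfig and
// NotifyEventLog is the assumed hook type sketched earlier.
fn open_store_with_ordering(config: BlockStoreConfig) -> Result<TranquilBlockStore, RepoError> {
    TranquilBlockStore::open_with_hook(config, Some(Arc::new(NotifyEventLog)))
}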
+296
crates/tranquil-store/src/eventlog/bridge.rs
use std::io;
use std::sync::Arc;

use chrono::{DateTime, Utc};
use tracing::warn;
use tranquil_db_traits::{DbError, SequenceNumber, SequencedEvent};

use super::notifier::EventLogNotifier;
use super::types::{EventSequence, TimestampMicros};
use super::writer::SyncResult;
use super::{EventLog, EventWithMutations, decode_payload, to_sequenced_event};
use crate::io::StorageIO;

pub struct DeferredBroadcast(SyncResult);

fn io_to_db(e: io::Error) -> DbError {
    DbError::Query(e.to_string())
}

fn seq_to_event(seq: SequenceNumber) -> EventSequence {
    let raw = seq.as_i64();
    if raw < 0 {
        warn!(
            seq = raw,
            "negative SequenceNumber passed to eventlog bridge, treating as BEFORE_ALL"
        );
        return EventSequence::BEFORE_ALL;
    }
    EventSequence::cursor_from_i64(raw).unwrap_or(EventSequence::BEFORE_ALL)
}

fn datetime_to_micros(dt: &DateTime<Utc>) -> u64 {
    let micros = dt.timestamp_micros();
    debug_assert!(micros >= 0, "pre-epoch DateTime passed to eventlog bridge");
    u64::try_from(micros).unwrap_or(0)
}

pub struct EventLogBridge<S: StorageIO> {
    log: Arc<EventLog<S>>,
}

impl<S: StorageIO> EventLogBridge<S> {
    pub fn new(log: Arc<EventLog<S>>) -> Self {
        Self { log }
    }

    pub fn notifier(&self) -> EventLogNotifier<S> {
        EventLogNotifier::new(Arc::clone(&self.log))
    }

    pub fn log(&self) -> &Arc<EventLog<S>> {
        &self.log
    }

    pub fn get_max_seq(&self) -> SequenceNumber {
        let es = self.log.max_seq();
        SequenceNumber::from_raw(es.as_i64())
    }

    pub fn get_events_since_seq(
        &self,
        since: SequenceNumber,
        limit: Option<i64>,
    ) -> Result<Vec<SequencedEvent>, DbError> {
        let cap = limit
            .and_then(|l| usize::try_from(l).ok())
            .unwrap_or(usize::MAX);
        self.get_events_impl(since, cap)
    }

    pub fn get_events_since_cursor(
        &self,
        cursor: SequenceNumber,
        limit: i64,
    ) -> Result<Vec<SequencedEvent>, DbError> {
        let cap = usize::try_from(limit).unwrap_or(usize::MAX);
        self.get_events_impl(cursor, cap)
    }

    fn get_events_impl(
        &self,
        since: SequenceNumber,
        limit: usize,
    ) -> Result<Vec<SequencedEvent>, DbError> {
        let cursor = seq_to_event(since);
        self.log.get_events_since(cursor, limit).map_err(io_to_db)
    }

    pub fn get_event_by_seq(&self, seq: SequenceNumber) -> Result<Option<SequencedEvent>, DbError> {
        let es = EventSequence::from_i64(seq.as_i64())
            .ok_or_else(|| DbError::Query("invalid sequence number".into()))?;
        self.log.get_event(es).map_err(io_to_db)
    }

    pub fn get_events_in_seq_range(
        &self,
        start: SequenceNumber,
        end: SequenceNumber,
    ) -> Result<Vec<SequencedEvent>, DbError> {
        let end_raw = match u64::try_from(end.as_i64()) {
            Ok(v) => v,
            Err(_) => return Ok(Vec::new()),
        };
        let cursor = seq_to_event(start);
        if end_raw <= cursor.raw().saturating_add(1) {
            return Ok(Vec::new());
        }
        let range_size =
            usize::try_from(end_raw.saturating_sub(cursor.raw())).unwrap_or(usize::MAX);
        let raw_events = self
            .log
            .reader()
            .read_events_from(cursor, range_size)
            .map_err(io_to_db)?;

        raw_events
            .iter()
            .take_while(|e| e.seq.raw() < end_raw)
            .map(|raw| {
                let payload =
                    decode_payload(&raw.payload).map_err(|e| DbError::Query(e.to_string()))?;
                to_sequenced_event(raw, &payload).map_err(|e| DbError::Query(e.to_string()))
            })
            .collect()
    }

    pub fn get_min_seq_since(
        &self,
        since: DateTime<Utc>,
    ) -> Result<Option<SequenceNumber>, DbError> {
        let target_us = datetime_to_micros(&since);
        let target_ts = TimestampMicros::new(target_us);
        let reader = self.log.reader();

        let segments = self.log.manager().list_segments().map_err(io_to_db)?;
        if segments.is_empty() {
            return Ok(None);
        }

        let scan_from_seg = self.find_segment_for_timestamp(&segments, target_ts)?;

        let start_seq = match scan_from_seg {
            Some(idx) => reader
                .load_index(segments[idx])
                .map_err(io_to_db)?
                .first_seq()
                .map(|s| s.prev_or_before_all())
                .unwrap_or(EventSequence::BEFORE_ALL),
            None => return Ok(None),
        };

        const SCAN_BATCH: usize = 1024;
        self.scan_for_timestamp(reader, start_seq, target_ts, SCAN_BATCH)
    }

    fn scan_for_timestamp(
        &self,
        reader: &super::EventLogReader<S>,
        cursor: EventSequence,
        target_ts: TimestampMicros,
        batch_size: usize,
    ) -> Result<Option<SequenceNumber>, DbError> {
        let batch = reader
            .read_events_from(cursor, batch_size)
            .map_err(io_to_db)?;
        if batch.is_empty() {
            return Ok(None);
        }
        match batch.iter().find(|e| e.timestamp >= target_ts) {
            Some(e) => Ok(Some(SequenceNumber::from_raw(e.seq.as_i64()))),
            None => {
                let next_cursor = batch.last().map(|e| e.seq).unwrap_or(cursor);
                self.scan_for_timestamp(reader, next_cursor, target_ts, batch_size)
            }
        }
    }

    fn find_segment_for_timestamp(
        &self,
        segments: &[super::SegmentId],
        target_ts: TimestampMicros,
    ) -> Result<Option<usize>, DbError> {
        let reader = self.log.reader();

        let last_seg_idx = segments.len() - 1;
        let last_index = reader
            .load_index(segments[last_seg_idx])
            .map_err(io_to_db)?;

        let last_ts = last_index
            .first_seq()
            .and_then(|seq| reader.read_event_at(seq).ok().flatten())
            .map(|e| e.timestamp);

        match last_ts {
            Some(ts) if ts < target_ts => {
                let tail_ts = last_index
                    .last_seq()
                    .and_then(|seq| reader.read_event_at(seq).ok().flatten())
                    .map(|e| e.timestamp);
                match tail_ts {
                    Some(ts) if ts < target_ts => return Ok(None),
                    _ => return Ok(Some(last_seg_idx)),
                }
            }
            None => return Ok(None),
            _ => {}
        }

        Ok(self
            .binary_search_segment(reader, segments, target_ts, 0, last_seg_idx, None)?
            .map(|r| r.saturating_sub(1)))
    }

    fn binary_search_segment(
        &self,
        reader: &super::EventLogReader<S>,
        segments: &[super::SegmentId],
        target_ts: TimestampMicros,
        lo: usize,
        hi: usize,
        best: Option<usize>,
    ) -> Result<Option<usize>, DbError> {
        if lo > hi {
            return Ok(best);
        }
        let mid = lo + (hi - lo) / 2;
        let seg_ts = reader
            .load_index(segments[mid])
            .map_err(io_to_db)?
            .first_seq()
            .and_then(|seq| reader.read_event_at(seq).ok().flatten())
            .map(|e| e.timestamp);

        match seg_ts {
            Some(ts) if ts < target_ts => {
                self.binary_search_segment(reader, segments, target_ts, mid + 1, hi, best)
            }
            Some(_) => match mid {
                0 => Ok(Some(0)),
                _ => {
                    self.binary_search_segment(reader, segments, target_ts, lo, mid - 1, Some(mid))
                }
            },
            None => self.binary_search_segment(reader, segments, target_ts, mid + 1, hi, best),
        }
    }

    pub fn get_events_with_mutations_since(
        &self,
        since: SequenceNumber,
        limit: usize,
    ) -> Result<Vec<EventWithMutations>, DbError> {
        let cursor = seq_to_event(since);
        self.log
            .get_events_with_mutations_since(cursor, limit)
            .map_err(io_to_db)
    }

    pub fn insert_event(&self, event: &SequencedEvent) -> Result<SequenceNumber, io::Error> {
        let seq = self
            .log
            .append_and_sync(&event.did, event.event_type, event)?;
        Ok(SequenceNumber::from_raw(seq.as_i64()))
    }

    pub fn insert_event_deferred(
        &self,
        event: &SequencedEvent,
    ) -> Result<(SequenceNumber, DeferredBroadcast), io::Error> {
        let seq = self.log.append_event(&event.did, event.event_type, event)?;
        let sync_result = self.log.sync_data()?;
        Ok((
            SequenceNumber::from_raw(seq.as_i64()),
            DeferredBroadcast(sync_result),
        ))
    }

    pub fn insert_event_deferred_raw(
        &self,
        did: &tranquil_types::Did,
        event_type: tranquil_db_traits::RepoEventType,
        payload: Vec<u8>,
    ) -> Result<(SequenceNumber, DeferredBroadcast), io::Error> {
        let seq = self.log.append_raw_payload(did, event_type, payload)?;
        let sync_result = self.log.sync_data()?;
        Ok((
            SequenceNumber::from_raw(seq.as_i64()),
            DeferredBroadcast(sync_result),
        ))
    }

    pub fn complete_broadcast(&self, deferred: DeferredBroadcast) {
        self.log.broadcast_result(&deferred.0);
    }
}
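insert_event_deferred splits durability from visibility: it appends and syncs the event data (sync_data) but hands back a DeferredBroadcast wrapping the private SyncResult instead of waking subscribers, so the caller can make anything else that must precede visibility durable first and only then call complete_broadcast. A hedged sketch of the intended call pattern (the wrapper function and the interleaved blockstore work are assumptions):

use std::io;

// Hypothetical wrapper around the deferred two-phase API shown above.
fn publish_after_blockstore(
    bridge: &EventLogBridge<crate::io::RealIO>,
    event: &SequencedEvent,
) -> io::Result<SequenceNumber> {
    // Phase 1: the event is durable on disk but not yet visible to subscribers.
    let (seq, deferred) = bridge.insert_event_deferred(event)?;
    // ... whatever must be durable before readers may observe `seq` goes here,
    // e.g. waiting on the blockstore's post-sync hook ...
    // Phase 2: wake subscribers.
    bridge.complete_broadcast(deferred);
    Ok(seq)
}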
+659
crates/tranquil-store/src/eventlog/manager.rs
use std::collections::HashMap;
use std::io;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicU64, Ordering};

use parking_lot::RwLock;

use crate::io::{FileId, OpenOptions, StorageIO};

use super::segment_file::SEGMENT_HEADER_SIZE;
use super::segment_index::SegmentIndex;
use super::types::{SegmentId, SegmentOffset};

pub(crate) const SEGMENT_FILE_EXTENSION: &str = "tqe";
pub(crate) const INDEX_FILE_EXTENSION: &str = "tqi";

struct CachedSegmentHandle {
    fd: FileId,
    sealed: bool,
    writable: bool,
}

pub struct SegmentManager<S: StorageIO> {
    io: S,
    segments_dir: PathBuf,
    max_segment_size: u64,
    handles: RwLock<HashMap<SegmentId, CachedSegmentHandle>>,
    retention_epoch: AtomicU64,
}

impl<S: StorageIO> SegmentManager<S> {
    pub fn new(io: S, segments_dir: PathBuf, max_segment_size: u64) -> io::Result<Self> {
        assert!(
            max_segment_size > SEGMENT_HEADER_SIZE as u64,
            "max_segment_size ({max_segment_size}) must exceed SEGMENT_HEADER_SIZE ({SEGMENT_HEADER_SIZE})"
        );
        io.mkdir(&segments_dir)?;
        Ok(Self {
            io,
            segments_dir,
            max_segment_size,
            handles: RwLock::new(HashMap::new()),
            retention_epoch: AtomicU64::new(0),
        })
    }

    pub fn io(&self) -> &S {
        &self.io
    }

    pub fn segments_dir(&self) -> &Path {
        &self.segments_dir
    }

    pub fn max_segment_size(&self) -> u64 {
        self.max_segment_size
    }

    pub fn segment_path(&self, id: SegmentId) -> PathBuf {
        self.segments_dir
            .join(format!("{id}.{SEGMENT_FILE_EXTENSION}"))
    }

    pub fn index_path(&self, id: SegmentId) -> PathBuf {
        self.segments_dir
            .join(format!("{id}.{INDEX_FILE_EXTENSION}"))
    }

    pub fn list_segments(&self) -> io::Result<Vec<SegmentId>> {
        let entries = self.io.list_dir(&self.segments_dir)?;
        let mut ids: Vec<SegmentId> = entries
            .iter()
            .filter_map(|path| {
                let stem = path.file_stem()?.to_str()?;
                let ext = path.extension()?.to_str()?;
                (ext == SEGMENT_FILE_EXTENSION)
                    .then(|| stem.parse::<u32>().ok().map(SegmentId::new))?
            })
            .collect();
        ids.sort();
        Ok(ids)
    }

    pub fn open_for_read(&self, id: SegmentId) -> io::Result<FileId> {
        if let Some(entry) = self.handles.read().get(&id) {
            return Ok(entry.fd);
        }
        let path = self.segment_path(id);
        let fd = self.io.open(&path, OpenOptions::read_only_existing())?;
        let mut cache = self.handles.write();
        match cache.get(&id) {
            Some(entry) => {
                let _ = self.io.close(fd);
                Ok(entry.fd)
            }
            None => {
                cache.insert(
                    id,
                    CachedSegmentHandle {
                        fd,
                        sealed: false,
                        writable: false,
                    },
                );
                Ok(fd)
            }
        }
    }

    pub fn open_for_append(&self, id: SegmentId) -> io::Result<FileId> {
        {
            let cache = self.handles.read();
            if let Some(entry) = cache.get(&id) {
                if entry.sealed {
                    return Err(io::Error::new(
                        io::ErrorKind::InvalidInput,
                        format!("cannot append to sealed segment {id}"),
                    ));
                }
                if entry.writable {
                    return Ok(entry.fd);
                }
            }
        }
        let path = self.segment_path(id);
        let fd = self.io.open(&path, OpenOptions::read_write())?;
        let mut cache = self.handles.write();
        match cache.get(&id) {
            Some(entry) if entry.sealed => {
                let _ = self.io.close(fd);
                Err(io::Error::new(
                    io::ErrorKind::InvalidInput,
                    format!("cannot append to sealed segment {id}"),
                ))
            }
            Some(entry) if entry.writable => {
                let _ = self.io.close(fd);
                Ok(entry.fd)
            }
            Some(entry) => {
                let old_fd = entry.fd;
                cache.insert(
                    id,
                    CachedSegmentHandle {
                        fd,
                        sealed: false,
                        writable: true,
                    },
                );
                let _ = self.io.close(old_fd);
                Ok(fd)
            }
            None => {
                cache.insert(
                    id,
                    CachedSegmentHandle {
                        fd,
                        sealed: false,
                        writable: true,
                    },
                );
                Ok(fd)
            }
        }
    }

    pub fn should_rotate(&self, position: SegmentOffset) -> bool {
        position.raw() >= self.max_segment_size
    }

    pub fn prepare_rotation(&self, current_id: SegmentId) -> io::Result<(SegmentId, FileId)> {
        let next = current_id.next();
        let path = self.segment_path(next);
        let fd = self.io.open(&path, OpenOptions::read_write())?;
        self.io.truncate(fd, 0)?;
        self.io.sync_dir(&self.segments_dir)?;
        Ok((next, fd))
    }

    pub fn commit_rotation(&self, new_id: SegmentId, fd: FileId) {
        self.handles.write().insert(
            new_id,
            CachedSegmentHandle {
                fd,
                sealed: false,
                writable: true,
            },
        );
    }

    pub fn seal_segment(&self, id: SegmentId, index: &SegmentIndex) -> io::Result<()> {
        let path = self.index_path(id);
        index.save(&self.io, &path)?;
        let mut cache = self.handles.write();
        let entry = cache.get_mut(&id).ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidInput,
                format!("seal_segment: segment {id} not in handle cache"),
            )
        })?;
        entry.sealed = true;
        Ok(())
    }

    pub fn is_sealed(&self, id: SegmentId) -> bool {
        self.handles
            .read()
            .get(&id)
            .is_some_and(|entry| entry.sealed)
    }

    pub fn rollback_rotation(&self, new_id: SegmentId, fd: FileId) {
        let _ = self.io.close(fd);
        self.handles.write().remove(&new_id);
        let _ = self.io.delete(&self.segment_path(new_id));
    }

    pub fn delete_segment(&self, id: SegmentId) -> io::Result<()> {
        {
            let mut cache = self.handles.write();
            if let Some(entry) = cache.remove(&id) {
                let _ = self.io.close(entry.fd);
            }
        }
        match self.io.delete(&self.index_path(id)) {
            Ok(()) => {}
            Err(e) if e.kind() == io::ErrorKind::NotFound => {}
            Err(e) => return Err(e),
        }
        self.io.delete(&self.segment_path(id))?;
        self.io.sync_dir(&self.segments_dir)?;
        self.retention_epoch.fetch_add(1, Ordering::Relaxed);
        Ok(())
    }

    pub fn oldest_segment(&self) -> io::Result<Option<SegmentId>> {
        self.list_segments().map(|segs| segs.into_iter().next())
    }

    pub fn retention_epoch(&self) -> u64 {
        self.retention_epoch.load(Ordering::Relaxed)
    }

    pub fn shutdown(&self) {
        self.handles.write().drain().for_each(|(_, handle)| {
            let _ = self.io.close(handle.fd);
        });
    }
}

impl<S: StorageIO> Drop for SegmentManager<S> {
    fn drop(&mut self) {
        self.shutdown();
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::eventlog::segment_file::{SegmentWriter, ValidEvent};
    use crate::eventlog::segment_index::{DEFAULT_INDEX_INTERVAL, rebuild_from_segment};
    use crate::eventlog::types::{
        DidHash, EventSequence, EventTypeTag, SegmentOffset, TimestampMicros,
    };
    use crate::sim::SimulatedIO;

    fn setup_manager(max_segment_size: u64) -> SegmentManager<SimulatedIO> {
        let sim = SimulatedIO::pristine(42);
        SegmentManager::new(sim, PathBuf::from("/segments"), max_segment_size).unwrap()
    }

    fn test_event(seq: u64, payload: &[u8]) -> ValidEvent {
        ValidEvent {
            seq: EventSequence::new(seq),
            timestamp: TimestampMicros::new(seq * 1_000_000),
            did_hash: DidHash::from_did(&format!("did:plc:test{seq}")),
            event_type: EventTypeTag::COMMIT,
            payload: payload.to_vec(),
        }
    }

    #[test]
    fn new_creates_directory() {
        let sim = SimulatedIO::pristine(42);
        let mgr = SegmentManager::new(sim, PathBuf::from("/eventlog/segments"), 1024).unwrap();
        let entries = mgr.io().list_dir(Path::new("/eventlog/segments")).unwrap();
        assert!(entries.is_empty());
    }

    #[test]
    fn segment_path_format() {
        let mgr = setup_manager(1024);
        assert_eq!(
            mgr.segment_path(SegmentId::new(0)),
            Path::new("/segments/00000000.tqe")
        );
        assert_eq!(
            mgr.segment_path(SegmentId::new(42)),
            Path::new("/segments/00000042.tqe")
        );
    }

    #[test]
    fn index_path_format() {
        let mgr = setup_manager(1024);
        assert_eq!(
            mgr.index_path(SegmentId::new(0)),
            Path::new("/segments/00000000.tqi")
        );
        assert_eq!(
            mgr.index_path(SegmentId::new(7)),
            Path::new("/segments/00000007.tqi")
        );
    }

    #[test]
    fn open_for_append_creates_file() {
        let mgr = setup_manager(1024);
        let fd = mgr.open_for_append(SegmentId::new(1)).unwrap();
        assert_eq!(mgr.io().file_size(fd).unwrap(), 0);
    }

    #[test]
    fn open_for_read_missing_file_errors() {
        let mgr = setup_manager(1024);
        assert!(mgr.open_for_read(SegmentId::new(99)).is_err());
    }

    #[test]
    fn handle_cache_returns_same_fd() {
        let mgr = setup_manager(1024);
        let fd1 = mgr.open_for_append(SegmentId::new(1)).unwrap();
        let fd2 = mgr.open_for_append(SegmentId::new(1)).unwrap();
        assert_eq!(fd1, fd2);
    }

    #[test]
    fn open_for_read_uses_cache_from_append() {
        let mgr = setup_manager(1024);
        let fd_write = mgr.open_for_append(SegmentId::new(1)).unwrap();
        let fd_read = mgr.open_for_read(SegmentId::new(1)).unwrap();
        assert_eq!(fd_write, fd_read);
    }

    #[test]
    fn list_segments_finds_segment_files() {
        let mgr = setup_manager(1024);
        mgr.open_for_append(SegmentId::new(1)).unwrap();
        mgr.open_for_append(SegmentId::new(3)).unwrap();

        let segments = mgr.list_segments().unwrap();
        assert_eq!(segments, vec![SegmentId::new(1), SegmentId::new(3)]);
    }

    #[test]
    fn list_segments_ignores_non_segment_files() {
        let mgr = setup_manager(1024);
        mgr.open_for_append(SegmentId::new(1)).unwrap();
        mgr.io()
            .open(Path::new("/segments/notes.txt"), OpenOptions::read_write())
            .unwrap();

        let segments = mgr.list_segments().unwrap();
        assert_eq!(segments, vec![SegmentId::new(1)]);
    }

    #[test]
    fn list_segments_ignores_index_files() {
        let mgr = setup_manager(1024);
        mgr.open_for_append(SegmentId::new(1)).unwrap();
        mgr.io()
            .open(
                Path::new("/segments/00000001.tqi"),
                OpenOptions::read_write(),
            )
            .unwrap();

        let segments = mgr.list_segments().unwrap();
        assert_eq!(segments, vec![SegmentId::new(1)]);
    }

    #[test]
    fn list_segments_sorted_ascending() {
        let mgr = setup_manager(1024);
        mgr.open_for_append(SegmentId::new(5)).unwrap();
        mgr.open_for_append(SegmentId::new(1)).unwrap();
        mgr.open_for_append(SegmentId::new(3)).unwrap();

        let segments = mgr.list_segments().unwrap();
        assert_eq!(
            segments,
            vec![SegmentId::new(1), SegmentId::new(3), SegmentId::new(5)]
        );
    }

    #[test]
    fn should_rotate_respects_threshold() {
        let mgr = setup_manager(1024);
        assert!(!mgr.should_rotate(SegmentOffset::new(100)));
        assert!(!mgr.should_rotate(SegmentOffset::new(1023)));
        assert!(mgr.should_rotate(SegmentOffset::new(1024)));
        assert!(mgr.should_rotate(SegmentOffset::new(2000)));
    }

    #[test]
    fn rotation_lifecycle_prepare_commit() {
        let mgr = setup_manager(1024);
        let _fd0 = mgr.open_for_append(SegmentId::new(1)).unwrap();
        let (next_id, next_fd) = mgr.prepare_rotation(SegmentId::new(1)).unwrap();
        assert_eq!(next_id, SegmentId::new(2));
        assert_eq!(mgr.io().file_size(next_fd).unwrap(), 0);
        mgr.commit_rotation(next_id, next_fd);
        assert_eq!(mgr.open_for_read(next_id).unwrap(), next_fd);
    }

    #[test]
    fn rotation_rollback_cleans_up() {
        let mgr = setup_manager(1024);
        let _fd0 = mgr.open_for_append(SegmentId::new(1)).unwrap();
        let (next_id, next_fd) = mgr.prepare_rotation(SegmentId::new(1)).unwrap();
        mgr.commit_rotation(next_id, next_fd);

        assert_eq!(mgr.open_for_read(next_id).unwrap(), next_fd);
        mgr.rollback_rotation(next_id, next_fd);

        let segments = mgr.list_segments().unwrap();
        assert_eq!(segments, vec![SegmentId::new(1)]);
    }

    #[test]
    fn seal_segment_persists_index_and_marks_sealed() {
        let mgr = setup_manager(64 * 1024);
        let fd = mgr.open_for_append(SegmentId::new(1)).unwrap();
        let mut writer =
            SegmentWriter::new(mgr.io(), fd, SegmentId::new(1), EventSequence::new(1)).unwrap();

        (1u64..=10).for_each(|i| {
            writer
                .append_event(mgr.io(), &test_event(i, format!("payload-{i}").as_bytes()))
                .unwrap();
        });
        writer.sync(mgr.io()).unwrap();

        let (index, _) = rebuild_from_segment(mgr.io(), fd, DEFAULT_INDEX_INTERVAL).unwrap();

        assert!(!mgr.is_sealed(SegmentId::new(1)));
        mgr.seal_segment(SegmentId::new(1), &index).unwrap();
        assert!(mgr.is_sealed(SegmentId::new(1)));

        let loaded = SegmentIndex::load(mgr.io(), &mgr.index_path(SegmentId::new(1)))
            .unwrap()
            .unwrap();
        assert_eq!(loaded, index);
    }

    #[test]
    fn delete_segment_removes_files_and_handle() {
        let mgr = setup_manager(64 * 1024);
        let fd = mgr.open_for_append(SegmentId::new(1)).unwrap();
        let mut writer =
            SegmentWriter::new(mgr.io(), fd, SegmentId::new(1), EventSequence::new(1)).unwrap();
        writer
            .append_event(mgr.io(), &test_event(1, b"will be deleted"))
            .unwrap();
        writer.sync(mgr.io()).unwrap();

        let (index, _) = rebuild_from_segment(mgr.io(), fd, DEFAULT_INDEX_INTERVAL).unwrap();
        mgr.seal_segment(SegmentId::new(1), &index).unwrap();

        let epoch_before = mgr.retention_epoch();
        mgr.delete_segment(SegmentId::new(1)).unwrap();
        assert_eq!(mgr.retention_epoch(), epoch_before + 1);

        assert!(mgr.list_segments().unwrap().is_empty());
        assert!(mgr.open_for_read(SegmentId::new(1)).is_err());
    }

    #[test]
    fn oldest_segment_returns_first() {
        let mgr = setup_manager(1024);
        assert_eq!(mgr.oldest_segment().unwrap(), None);

        mgr.open_for_append(SegmentId::new(3)).unwrap();
        mgr.open_for_append(SegmentId::new(1)).unwrap();
        mgr.open_for_append(SegmentId::new(5)).unwrap();

        assert_eq!(mgr.oldest_segment().unwrap(), Some(SegmentId::new(1)));
    }

    #[test]
    fn retention_epoch_starts_at_zero() {
        let mgr = setup_manager(1024);
        assert_eq!(mgr.retention_epoch(), 0);
    }

    #[test]
    fn rotate_and_write_across_segments() {
        let mgr = setup_manager(1024);

        let fd1 = mgr.open_for_append(SegmentId::new(1)).unwrap();
        let mut writer1 =
            SegmentWriter::new(mgr.io(), fd1, SegmentId::new(1), EventSequence::new(1)).unwrap();
        writer1
            .append_event(mgr.io(), &test_event(1, b"first segment"))
            .unwrap();
        writer1.sync(mgr.io()).unwrap();

        let (id2, fd2) = mgr.prepare_rotation(SegmentId::new(1)).unwrap();
        mgr.commit_rotation(id2, fd2);

        let mut writer2 = SegmentWriter::new(mgr.io(), fd2, id2, EventSequence::new(2)).unwrap();
        writer2
            .append_event(mgr.io(), &test_event(2, b"second segment"))
            .unwrap();
        writer2.sync(mgr.io()).unwrap();

        let fd1_read = mgr.open_for_read(SegmentId::new(1)).unwrap();
        let events1 = crate::eventlog::SegmentReader::open(mgr.io(), fd1_read)
            .unwrap()
            .valid_prefix()
            .unwrap();
        assert_eq!(events1.len(), 1);
        assert_eq!(events1[0].payload, b"first segment");

        let fd2_read = mgr.open_for_read(id2).unwrap();
        let events2 = crate::eventlog::SegmentReader::open(mgr.io(), fd2_read)
            .unwrap()
            .valid_prefix()
            .unwrap();
        assert_eq!(events2.len(), 1);
        assert_eq!(events2[0].payload, b"second segment");
    }

    #[test]
fn seal_then_append_errors() { 536 + let mgr = setup_manager(64 * 1024); 537 + let fd = mgr.open_for_append(SegmentId::new(1)).unwrap(); 538 + SegmentWriter::new(mgr.io(), fd, SegmentId::new(1), EventSequence::new(1)).unwrap(); 539 + 540 + let index = SegmentIndex::new(); 541 + mgr.seal_segment(SegmentId::new(1), &index).unwrap(); 542 + 543 + let result = mgr.open_for_append(SegmentId::new(1)); 544 + assert!(result.is_err()); 545 + } 546 + 547 + #[test] 548 + fn accessors() { 549 + let mgr = setup_manager(999); 550 + assert_eq!(mgr.max_segment_size(), 999); 551 + assert_eq!(mgr.segments_dir(), Path::new("/segments")); 552 + } 553 + 554 + #[test] 555 + fn multiple_deletions_increment_epoch() { 556 + let mgr = setup_manager(1024); 557 + mgr.open_for_append(SegmentId::new(1)).unwrap(); 558 + mgr.open_for_append(SegmentId::new(2)).unwrap(); 559 + mgr.open_for_append(SegmentId::new(3)).unwrap(); 560 + 561 + assert_eq!(mgr.retention_epoch(), 0); 562 + mgr.delete_segment(SegmentId::new(1)).unwrap(); 563 + assert_eq!(mgr.retention_epoch(), 1); 564 + mgr.delete_segment(SegmentId::new(2)).unwrap(); 565 + assert_eq!(mgr.retention_epoch(), 2); 566 + } 567 + 568 + #[test] 569 + fn open_for_read_does_not_infer_sealed_from_index_file() { 570 + let mgr = setup_manager(64 * 1024); 571 + let fd = mgr.open_for_append(SegmentId::new(1)).unwrap(); 572 + let mut writer = 573 + SegmentWriter::new(mgr.io(), fd, SegmentId::new(1), EventSequence::new(1)).unwrap(); 574 + writer 575 + .append_event(mgr.io(), &test_event(1, b"sealed test")) 576 + .unwrap(); 577 + writer.sync(mgr.io()).unwrap(); 578 + 579 + let (index, _) = rebuild_from_segment(mgr.io(), fd, DEFAULT_INDEX_INTERVAL).unwrap(); 580 + mgr.seal_segment(SegmentId::new(1), &index).unwrap(); 581 + 582 + mgr.handles.write().remove(&SegmentId::new(1)); 583 + 584 + let _read_fd = mgr.open_for_read(SegmentId::new(1)).unwrap(); 585 + assert!(!mgr.is_sealed(SegmentId::new(1))); 586 + } 587 + 588 + #[test] 589 + fn open_for_read_unsealed_allows_append() { 590 + let mgr = setup_manager(1024); 591 + let _fd = mgr.open_for_append(SegmentId::new(1)).unwrap(); 592 + 593 + mgr.handles.write().remove(&SegmentId::new(1)); 594 + 595 + let _read_fd = mgr.open_for_read(SegmentId::new(1)).unwrap(); 596 + assert!(!mgr.is_sealed(SegmentId::new(1))); 597 + } 598 + 599 + #[test] 600 + fn shutdown_clears_handles() { 601 + let mgr = setup_manager(1024); 602 + mgr.open_for_append(SegmentId::new(1)).unwrap(); 603 + mgr.open_for_append(SegmentId::new(2)).unwrap(); 604 + 605 + mgr.shutdown(); 606 + assert!(mgr.handles.read().is_empty()); 607 + } 608 + 609 + #[test] 610 + #[should_panic(expected = "max_segment_size")] 611 + fn rejects_max_segment_size_too_small() { 612 + let sim = SimulatedIO::pristine(42); 613 + let _ = SegmentManager::new(sim, PathBuf::from("/segments"), 5); 614 + } 615 + 616 + #[test] 617 + fn prepare_rotation_truncates_stale_file() { 618 + let mgr = setup_manager(1024); 619 + let _fd0 = mgr.open_for_append(SegmentId::new(1)).unwrap(); 620 + 621 + let stale_path = mgr.segment_path(SegmentId::new(2)); 622 + let stale_fd = mgr 623 + .io() 624 + .open(&stale_path, OpenOptions::read_write()) 625 + .unwrap(); 626 + mgr.io().write_all_at(stale_fd, 0, &[0xDE; 4096]).unwrap(); 627 + mgr.io().sync(stale_fd).unwrap(); 628 + assert_eq!(mgr.io().file_size(stale_fd).unwrap(), 4096); 629 + mgr.io().close(stale_fd).unwrap(); 630 + 631 + let (next_id, next_fd) = mgr.prepare_rotation(SegmentId::new(1)).unwrap(); 632 + assert_eq!(next_id, SegmentId::new(2)); 633 + 
assert_eq!(mgr.io().file_size(next_fd).unwrap(), 0); 634 + } 635 + 636 + #[test] 637 + fn open_for_append_upgrades_read_only_handle() { 638 + let mgr = setup_manager(1024); 639 + let fd_append = mgr.open_for_append(SegmentId::new(1)).unwrap(); 640 + 641 + mgr.handles.write().remove(&SegmentId::new(1)); 642 + 643 + let fd_read = mgr.open_for_read(SegmentId::new(1)).unwrap(); 644 + assert_ne!(fd_read, fd_append); 645 + assert!(!mgr.handles.read().get(&SegmentId::new(1)).unwrap().writable); 646 + 647 + let fd_upgraded = mgr.open_for_append(SegmentId::new(1)).unwrap(); 648 + assert_ne!(fd_upgraded, fd_read); 649 + assert!(mgr.handles.read().get(&SegmentId::new(1)).unwrap().writable); 650 + } 651 + 652 + #[test] 653 + fn seal_uncached_segment_returns_error() { 654 + let mgr = setup_manager(1024); 655 + let index = SegmentIndex::new(); 656 + let result = mgr.seal_segment(SegmentId::new(99), &index); 657 + assert!(result.is_err()); 658 + } 659 + }
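Taken together, the rotation tests above trace the whole segment lifecycle: append into the active segment, prepare/commit (or rollback) a rotation, seal with a persisted index, then delete under retention. A minimal sketch of that flow in the same style as the tests, reusing the SimulatedIO harness and the eventlog re-exports from this diff; the function name is illustrative and the empty SegmentIndex stands in for a real rebuild_from_segment result:

use std::path::PathBuf;
use crate::eventlog::{SegmentId, SegmentIndex, SegmentManager, SegmentOffset};
use crate::sim::SimulatedIO;

fn rotation_lifecycle_sketch() -> std::io::Result<()> {
    let mgr = SegmentManager::new(SimulatedIO::pristine(7), PathBuf::from("/segments"), 1024)?;

    // Append into the active segment via a SegmentWriter until the threshold trips.
    let _fd = mgr.open_for_append(SegmentId::new(1))?;
    if mgr.should_rotate(SegmentOffset::new(1024)) {
        let (next_id, next_fd) = mgr.prepare_rotation(SegmentId::new(1))?;
        // commit_rotation on success; rollback_rotation if the writer handoff fails.
        mgr.commit_rotation(next_id, next_fd);
    }

    // Sealing persists the .tqi index and rejects further open_for_append calls.
    mgr.seal_segment(SegmentId::new(1), &SegmentIndex::new())?;

    // Retention: deletion bumps the epoch so concurrent readers can notice.
    mgr.delete_segment(SegmentId::new(1))?;
    assert_eq!(mgr.retention_epoch(), 1);
    Ok(())
}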
+489
crates/tranquil-store/src/eventlog/mod.rs
··· 1 + mod bridge; 2 + mod manager; 3 + mod notifier; 4 + mod payload; 5 + mod reader; 6 + mod segment_file; 7 + mod segment_index; 8 + mod types; 9 + mod writer; 10 + 11 + use std::collections::VecDeque; 12 + use std::io; 13 + use std::path::PathBuf; 14 + use std::sync::Arc; 15 + use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; 16 + use std::time::{Duration, Instant}; 17 + 18 + use parking_lot::Mutex; 19 + use tokio::sync::broadcast; 20 + use tracing::warn; 21 + use tranquil_db_traits::{RepoEventType, SequencedEvent}; 22 + use tranquil_types::Did; 23 + 24 + use crate::blockstore::BlocksSynced; 25 + use crate::fsync_order::PostBlockstoreHook; 26 + use crate::io::StorageIO; 27 + 28 + pub use bridge::{DeferredBroadcast, EventLogBridge}; 29 + pub use manager::SegmentManager; 30 + pub use notifier::EventLogNotifier; 31 + pub use payload::{ 32 + EventPayload, PayloadError, decode_payload, encode_payload, encode_payload_with_mutations, 33 + to_sequenced_event, validate_payload_size, 34 + }; 35 + pub use reader::{EventLogReader, RawEvent}; 36 + pub use segment_file::{ 37 + EVENT_HEADER_SIZE, EVENT_RECORD_OVERHEAD, ReadEventRecord, SEGMENT_FORMAT_VERSION, 38 + SEGMENT_HEADER_SIZE, SEGMENT_MAGIC, SegmentReader, SegmentWriter, ValidEvent, 39 + ValidateEventRecord, decode_event_record, encode_event_record, validate_event_record, 40 + }; 41 + pub use segment_index::{DEFAULT_INDEX_INTERVAL, SegmentIndex, rebuild_from_segment}; 42 + pub use types::{ 43 + DEFAULT_SEGMENT_SIZE, DidHash, EventLength, EventSequence, EventTypeTag, MAX_EVENT_PAYLOAD, 44 + SegmentId, SegmentOffset, TimestampMicros, 45 + }; 46 + pub use writer::{EventLogWriter, SyncResult}; 47 + 48 + const DEFAULT_BROADCAST_BUFFER: usize = 16384; 49 + 50 + pub struct EventWithMutations { 51 + pub event: SequencedEvent, 52 + pub mutation_set: Option<Vec<u8>>, 53 + } 54 + 55 + pub struct EventLogConfig { 56 + pub segments_dir: PathBuf, 57 + pub max_segment_size: u64, 58 + pub index_interval: usize, 59 + pub broadcast_buffer: usize, 60 + pub use_mmap: bool, 61 + } 62 + 63 + impl Default for EventLogConfig { 64 + fn default() -> Self { 65 + Self { 66 + segments_dir: PathBuf::from("eventlog"), 67 + max_segment_size: DEFAULT_SEGMENT_SIZE, 68 + index_interval: DEFAULT_INDEX_INTERVAL, 69 + broadcast_buffer: DEFAULT_BROADCAST_BUFFER, 70 + use_mmap: true, 71 + } 72 + } 73 + } 74 + 75 + pub struct EventLog<S: StorageIO> { 76 + writer: Mutex<EventLogWriter<S>>, 77 + reader: Arc<EventLogReader<S>>, 78 + manager: Arc<SegmentManager<S>>, 79 + broadcast_tx: broadcast::Sender<RawEvent>, 80 + synced_seq: AtomicU64, 81 + consecutive_sync_failures: AtomicU32, 82 + } 83 + 84 + impl<S: StorageIO> EventLog<S> { 85 + pub fn open(config: EventLogConfig, io: S) -> io::Result<Self> { 86 + let manager = Arc::new(SegmentManager::new( 87 + io, 88 + config.segments_dir, 89 + config.max_segment_size, 90 + )?); 91 + 92 + let writer = EventLogWriter::open(Arc::clone(&manager), config.index_interval)?; 93 + let synced = writer.synced_seq(); 94 + 95 + let reader = Arc::new(EventLogReader::new(Arc::clone(&manager), config.use_mmap)); 96 + reader.set_active_segment(writer.active_segment_id()); 97 + reader.seed_index(writer.active_segment_id(), writer.active_index_snapshot()); 98 + reader.refresh_segment_ranges()?; 99 + 100 + let (broadcast_tx, _) = broadcast::channel(config.broadcast_buffer); 101 + 102 + Ok(Self { 103 + writer: Mutex::new(writer), 104 + reader, 105 + manager, 106 + broadcast_tx, 107 + synced_seq: AtomicU64::new(synced.raw()), 108 + 
consecutive_sync_failures: AtomicU32::new(0), 109 + }) 110 + } 111 + 112 + pub fn append_event( 113 + &self, 114 + did: &Did, 115 + event_type: RepoEventType, 116 + event: &SequencedEvent, 117 + ) -> io::Result<EventSequence> { 118 + let payload = encode_payload(event); 119 + self.append_raw_payload(did, event_type, payload) 120 + } 121 + 122 + pub fn append_raw_payload( 123 + &self, 124 + did: &Did, 125 + event_type: RepoEventType, 126 + payload: Vec<u8>, 127 + ) -> io::Result<EventSequence> { 128 + let did_hash = DidHash::from_did(did.as_str()); 129 + let tag = repo_event_type_to_tag(event_type); 130 + validate_payload_size(&payload) 131 + .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; 132 + self.writer.lock().append(did_hash, tag, payload) 133 + } 134 + 135 + pub fn sync(&self) -> io::Result<SyncResult> { 136 + self.sync_and_broadcast() 137 + } 138 + 139 + pub fn append_and_sync( 140 + &self, 141 + did: &Did, 142 + event_type: RepoEventType, 143 + event: &SequencedEvent, 144 + ) -> io::Result<EventSequence> { 145 + let seq = self.append_event(did, event_type, event)?; 146 + self.sync_and_broadcast()?; 147 + Ok(seq) 148 + } 149 + 150 + pub fn append_batch( 151 + &self, 152 + events: Vec<(&Did, RepoEventType, &SequencedEvent)>, 153 + ) -> io::Result<Vec<EventSequence>> { 154 + let mut writer = self.writer.lock(); 155 + events 156 + .iter() 157 + .map(|(did, event_type, event)| { 158 + let did_hash = DidHash::from_did(did.as_str()); 159 + let tag = repo_event_type_to_tag(*event_type); 160 + let payload = encode_payload(event); 161 + validate_payload_size(&payload) 162 + .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; 163 + writer.append(did_hash, tag, payload) 164 + }) 165 + .collect() 166 + } 167 + 168 + pub fn sync_data(&self) -> io::Result<SyncResult> { 169 + let mut writer = self.writer.lock(); 170 + let result = writer.sync()?; 171 + self.synced_seq 172 + .store(result.synced_through.raw(), Ordering::Release); 173 + 174 + if let (Some(first), Some(last)) = 175 + (result.flushed_events.first(), result.flushed_events.last()) 176 + { 177 + self.reader.extend_active_range(first.seq, last.seq); 178 + } 179 + Ok(result) 180 + } 181 + 182 + pub fn broadcast_result(&self, result: &SyncResult) { 183 + result.flushed_events.iter().for_each(|e| { 184 + let _ = self.broadcast_tx.send(valid_event_to_raw(e)); 185 + }); 186 + } 187 + 188 + pub fn sync_and_broadcast(&self) -> io::Result<SyncResult> { 189 + let result = self.sync_data()?; 190 + self.broadcast_result(&result); 191 + Ok(result) 192 + } 193 + 194 + pub fn get_events_since( 195 + &self, 196 + cursor: EventSequence, 197 + limit: usize, 198 + ) -> io::Result<Vec<SequencedEvent>> { 199 + let raw_events = self.reader.read_events_from(cursor, limit)?; 200 + raw_events 201 + .iter() 202 + .map(|raw| { 203 + let payload = decode_payload(&raw.payload) 204 + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; 205 + to_sequenced_event(raw, &payload) 206 + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) 207 + }) 208 + .collect() 209 + } 210 + 211 + pub fn get_events_with_mutations_since( 212 + &self, 213 + cursor: EventSequence, 214 + limit: usize, 215 + ) -> io::Result<Vec<EventWithMutations>> { 216 + let raw_events = self.reader.read_events_from(cursor, limit)?; 217 + raw_events 218 + .iter() 219 + .map(|raw| { 220 + let payload = decode_payload(&raw.payload) 221 + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; 222 + let mutation_set = payload.mutation_set.clone(); 223 + let event 
= to_sequenced_event(raw, &payload) 224 + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; 225 + Ok(EventWithMutations { 226 + event, 227 + mutation_set, 228 + }) 229 + }) 230 + .collect() 231 + } 232 + 233 + pub fn get_event(&self, seq: EventSequence) -> io::Result<Option<SequencedEvent>> { 234 + self.reader.read_event_at(seq)?.map_or(Ok(None), |raw| { 235 + let payload = decode_payload(&raw.payload) 236 + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; 237 + let event = to_sequenced_event(&raw, &payload) 238 + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; 239 + Ok(Some(event)) 240 + }) 241 + } 242 + 243 + pub fn max_seq(&self) -> EventSequence { 244 + let raw = self.synced_seq.load(Ordering::Acquire); 245 + match raw { 246 + 0 => EventSequence::BEFORE_ALL, 247 + n => EventSequence::new(n), 248 + } 249 + } 250 + 251 + pub fn subscribe(&self) -> broadcast::Receiver<RawEvent> { 252 + self.broadcast_tx.subscribe() 253 + } 254 + 255 + pub fn maybe_rotate(&self) -> io::Result<bool> { 256 + let (sealed_id, new_active_id) = { 257 + let mut writer = self.writer.lock(); 258 + match writer.rotate_if_needed()? { 259 + None => return Ok(false), 260 + Some(sealed_id) => (sealed_id, writer.active_segment_id()), 261 + } 262 + }; 263 + self.reader.on_segment_rotated(sealed_id, new_active_id)?; 264 + Ok(true) 265 + } 266 + 267 + pub fn run_retention(&self, max_age: Duration) -> io::Result<usize> { 268 + let max_age_us = u64::try_from(max_age.as_micros()).unwrap_or(u64::MAX); 269 + let cutoff_us = TimestampMicros::now().raw().saturating_sub(max_age_us); 270 + let active_id = self.writer.lock().active_segment_id(); 271 + let segments = self.manager.list_segments()?; 272 + 273 + let deleted = segments 274 + .iter() 275 + .take_while(|&&id| id != active_id) 276 + .filter(|&&id| { 277 + self.reader 278 + .load_index(id) 279 + .ok() 280 + .and_then(|idx| idx.last_seq()) 281 + .and_then(|seq| { 282 + self.reader 283 + .read_event_at(seq) 284 + .ok() 285 + .flatten() 286 + .map(|e| e.timestamp.raw() < cutoff_us) 287 + }) 288 + .unwrap_or(false) 289 + }) 290 + .copied() 291 + .collect::<Vec<_>>(); 292 + 293 + deleted.iter().try_for_each(|&id| -> io::Result<()> { 294 + self.manager.delete_segment(id)?; 295 + self.reader.invalidate_index(id); 296 + self.reader.invalidate_mmap(id); 297 + Ok(()) 298 + })?; 299 + 300 + if !deleted.is_empty() { 301 + self.reader.refresh_segment_ranges()?; 302 + } 303 + 304 + Ok(deleted.len()) 305 + } 306 + 307 + pub fn segment_count(&self) -> usize { 308 + self.manager.list_segments().map_or(0, |s| s.len()) 309 + } 310 + 311 + pub fn disk_usage(&self) -> io::Result<u64> { 312 + let segments = self.manager.list_segments()?; 313 + segments.iter().try_fold(0u64, |acc, &id| { 314 + let fd = self.manager.open_for_read(id)?; 315 + let size = self.manager.io().file_size(fd)?; 316 + Ok(acc.saturating_add(size)) 317 + }) 318 + } 319 + 320 + pub fn shutdown(&self) -> io::Result<()> { 321 + self.writer.lock().shutdown() 322 + } 323 + 324 + pub fn subscriber(&self, start_seq: EventSequence) -> EventLogSubscriber<S> { 325 + EventLogSubscriber::new( 326 + self.broadcast_tx.subscribe(), 327 + Arc::clone(&self.reader), 328 + start_seq, 329 + ) 330 + } 331 + 332 + pub fn reader(&self) -> &EventLogReader<S> { 333 + &self.reader 334 + } 335 + 336 + pub fn manager(&self) -> &Arc<SegmentManager<S>> { 337 + &self.manager 338 + } 339 + 340 + pub fn consecutive_sync_failures(&self) -> u32 { 341 + self.consecutive_sync_failures.load(Ordering::Relaxed) 342 + } 343 + } 
344 + 345 + impl<S: StorageIO + Send + Sync> PostBlockstoreHook for EventLog<S> { 346 + fn on_blocks_synced(&self, _proof: &BlocksSynced) -> io::Result<()> { 347 + match self.sync_and_broadcast() { 348 + Ok(_) => { 349 + self.consecutive_sync_failures.store(0, Ordering::Relaxed); 350 + if let Err(e) = self.maybe_rotate() { 351 + warn!(error = %e, "eventlog rotation deferred"); 352 + } 353 + Ok(()) 354 + } 355 + Err(e) => { 356 + let count = self 357 + .consecutive_sync_failures 358 + .fetch_add(1, Ordering::Relaxed) 359 + .saturating_add(1); 360 + warn!( 361 + error = %e, 362 + consecutive_failures = count, 363 + "eventlog sync failed after blockstore commit" 364 + ); 365 + Err(e) 366 + } 367 + } 368 + } 369 + } 370 + 371 + pub struct EventLogSubscriber<S: StorageIO> { 372 + rx: broadcast::Receiver<RawEvent>, 373 + last_seen: EventSequence, 374 + reader: Arc<EventLogReader<S>>, 375 + backfill_buffer: VecDeque<RawEvent>, 376 + consecutive_lags: u32, 377 + last_lag_time: Option<Instant>, 378 + } 379 + 380 + const MAX_CONSECUTIVE_LAGS_BEFORE_WARN: u32 = 3; 381 + const LAG_WINDOW: Duration = Duration::from_secs(10); 382 + const BACKFILL_BATCH_SIZE: usize = 1024; 383 + 384 + impl<S: StorageIO> EventLogSubscriber<S> { 385 + pub fn new( 386 + rx: broadcast::Receiver<RawEvent>, 387 + reader: Arc<EventLogReader<S>>, 388 + start_seq: EventSequence, 389 + ) -> Self { 390 + Self { 391 + rx, 392 + last_seen: start_seq, 393 + reader, 394 + backfill_buffer: VecDeque::new(), 395 + consecutive_lags: 0, 396 + last_lag_time: None, 397 + } 398 + } 399 + 400 + pub async fn next(&mut self) -> Option<RawEvent> { 401 + loop { 402 + if let Some(event) = self.backfill_buffer.pop_front() { 403 + self.last_seen = event.seq; 404 + self.consecutive_lags = 0; 405 + return Some(event); 406 + } 407 + 408 + match self.rx.recv().await { 409 + Ok(event) if event.seq > self.last_seen => { 410 + self.last_seen = event.seq; 411 + self.consecutive_lags = 0; 412 + return Some(event); 413 + } 414 + Ok(_) => continue, 415 + Err(broadcast::error::RecvError::Lagged(n)) => { 416 + warn!( 417 + lagged = n, 418 + last_seen = %self.last_seen, 419 + "subscriber lagged, backfilling from disk" 420 + ); 421 + self.track_lag(); 422 + match self.fill_backfill_buffer() { 423 + Ok(()) => continue, 424 + Err(e) => { 425 + warn!(error = %e, "backfill failed"); 426 + return None; 427 + } 428 + } 429 + } 430 + Err(broadcast::error::RecvError::Closed) => return None, 431 + } 432 + } 433 + } 434 + 435 + fn fill_backfill_buffer(&mut self) -> io::Result<()> { 436 + let events = self 437 + .reader 438 + .read_events_from(self.last_seen, BACKFILL_BATCH_SIZE)?; 439 + events.into_iter().for_each(|event| { 440 + self.backfill_buffer.push_back(event); 441 + }); 442 + Ok(()) 443 + } 444 + 445 + fn track_lag(&mut self) { 446 + let now = Instant::now(); 447 + let in_window = self 448 + .last_lag_time 449 + .is_some_and(|t| now.duration_since(t) < LAG_WINDOW); 450 + 451 + if in_window { 452 + self.consecutive_lags = self.consecutive_lags.saturating_add(1); 453 + } else { 454 + self.consecutive_lags = 1; 455 + } 456 + self.last_lag_time = Some(now); 457 + 458 + if self.consecutive_lags >= MAX_CONSECUTIVE_LAGS_BEFORE_WARN { 459 + warn!( 460 + consecutive_lags = self.consecutive_lags, 461 + last_seen = %self.last_seen, 462 + "subscriber repeatedly falling behind" 463 + ); 464 + } 465 + } 466 + 467 + pub fn last_seen(&self) -> EventSequence { 468 + self.last_seen 469 + } 470 + } 471 + 472 + fn valid_event_to_raw(e: &ValidEvent) -> RawEvent { 473 + RawEvent { 474 + 
seq: e.seq, 475 + timestamp: e.timestamp, 476 + did_hash: e.did_hash, 477 + event_type: e.event_type, 478 + payload: bytes::Bytes::from(e.payload.clone()), 479 + } 480 + } 481 + 482 + fn repo_event_type_to_tag(event_type: RepoEventType) -> EventTypeTag { 483 + match event_type { 484 + RepoEventType::Commit => EventTypeTag::COMMIT, 485 + RepoEventType::Identity => EventTypeTag::IDENTITY, 486 + RepoEventType::Account => EventTypeTag::ACCOUNT, 487 + RepoEventType::Sync => EventTypeTag::SYNC, 488 + } 489 + }
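End to end, EventLog ties the writer, reader, and broadcast channel together behind one handle. A rough usage sketch, borrowing RealIO and the config shape from the bench harness in this diff (the directory path is illustrative; producers appear only in the comments):

use std::path::PathBuf;
use tranquil_store::RealIO;
use tranquil_store::eventlog::{EventLog, EventLogConfig, EventSequence};

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let log = EventLog::open(
        EventLogConfig {
            segments_dir: PathBuf::from("eventlog/segments"),
            ..EventLogConfig::default()
        },
        RealIO::new(),
    )?;

    // A subscriber tails the broadcast channel and transparently backfills
    // from disk (in BACKFILL_BATCH_SIZE chunks) whenever it lags.
    let mut sub = log.subscriber(EventSequence::BEFORE_ALL);

    // Elsewhere, producers call append_event(..) then sync_and_broadcast(),
    // or let the PostBlockstoreHook do both after a blockstore commit.
    while let Some(raw) = sub.next().await {
        println!("seq={} type={:?}", raw.seq.as_i64(), raw.event_type);
    }
    Ok(())
}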
+36
crates/tranquil-store/src/eventlog/notifier.rs
··· 1 + use std::sync::Arc; 2 + 3 + use async_trait::async_trait; 4 + use tranquil_db_traits::{DbError, RepoEventNotifier, RepoEventReceiver}; 5 + 6 + use super::{EventLog, EventLogSubscriber, EventSequence}; 7 + use crate::io::StorageIO; 8 + 9 + pub struct EventLogNotifier<S: StorageIO> { 10 + log: Arc<EventLog<S>>, 11 + } 12 + 13 + impl<S: StorageIO> EventLogNotifier<S> { 14 + pub fn new(log: Arc<EventLog<S>>) -> Self { 15 + Self { log } 16 + } 17 + } 18 + 19 + #[async_trait] 20 + impl<S: StorageIO + 'static> RepoEventNotifier for EventLogNotifier<S> { 21 + async fn subscribe(&self) -> Result<Box<dyn RepoEventReceiver>, DbError> { 22 + let subscriber = self.log.subscriber(EventSequence::BEFORE_ALL); 23 + Ok(Box::new(EventLogEventReceiver { subscriber })) 24 + } 25 + } 26 + 27 + struct EventLogEventReceiver<S: StorageIO> { 28 + subscriber: EventLogSubscriber<S>, 29 + } 30 + 31 + #[async_trait] 32 + impl<S: StorageIO + 'static> RepoEventReceiver for EventLogEventReceiver<S> { 33 + async fn recv(&mut self) -> Option<i64> { 34 + self.subscriber.next().await.map(|event| event.seq.as_i64()) 35 + } 36 + }
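The notifier adapts the log to the tranquil-db-traits interface, deliberately flattening events down to bare sequence numbers. A hypothetical consumer, sketched under the assumption that the `S: StorageIO + 'static` bound from the impl above is all a caller needs:

use std::sync::Arc;
use tranquil_db_traits::{RepoEventNotifier, RepoEventReceiver};
use crate::eventlog::{EventLog, EventLogNotifier};
use crate::io::StorageIO;

async fn tail_sequence_numbers<S: StorageIO + 'static>(log: Arc<EventLog<S>>) {
    let notifier = EventLogNotifier::new(log);
    // subscribe() hands back a boxed receiver; recv() yields Option<i64>.
    let Ok(mut rx) = notifier.subscribe().await else { return };
    while let Some(seq) = rx.recv().await {
        tracing::debug!(seq, "repo event observed");
    }
}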
+461
crates/tranquil-store/src/eventlog/payload.rs
··· 1 + use serde::{Deserialize, Serialize}; 2 + use tranquil_db_traits::{AccountStatus, SequenceNumber, SequencedEvent}; 3 + use tranquil_types::{CidLink, Did, Handle}; 4 + 5 + use crate::eventlog::reader::RawEvent; 6 + use crate::eventlog::types::MAX_EVENT_PAYLOAD; 7 + 8 + const PAYLOAD_VERSION: u8 = 1; 9 + const LARGE_PAYLOAD_WARNING_THRESHOLD: usize = 1024 * 1024; 10 + 11 + const CID_BYTE_LEN: usize = 36; 12 + 13 + #[derive(Debug, Clone, Serialize, Deserialize)] 14 + pub struct EventPayload { 15 + pub did: String, 16 + pub commit_cid: Option<Vec<u8>>, 17 + pub prev_cid: Option<Vec<u8>>, 18 + pub prev_data_cid: Option<Vec<u8>>, 19 + pub ops: Option<Vec<u8>>, 20 + pub blobs: Option<Vec<String>>, 21 + pub blocks_cids: Option<Vec<String>>, 22 + pub handle: Option<String>, 23 + pub active: Option<bool>, 24 + pub status: Option<u8>, 25 + pub rev: Option<String>, 26 + pub mutation_set: Option<Vec<u8>>, 27 + } 28 + 29 + #[derive(Debug, thiserror::Error)] 30 + pub enum PayloadError { 31 + #[error("payload too large: {size} bytes exceeds max {max}")] 32 + TooLarge { size: usize, max: usize }, 33 + #[error("deserialization failed: {0}")] 34 + DeserializeFailed(postcard::Error), 35 + #[error("unknown payload version: {0}")] 36 + UnknownVersion(u8), 37 + #[error("invalid DID in payload: {0}")] 38 + InvalidDid(String), 39 + #[error("invalid timestamp: {0}")] 40 + InvalidTimestamp(u64), 41 + #[error("invalid ops JSON in payload: {0}")] 42 + InvalidOps(serde_json::Error), 43 + #[error("invalid handle in payload: {0}")] 44 + InvalidHandle(String), 45 + #[error("invalid CID length: got {got}, expected {expected}")] 46 + InvalidCidLength { got: usize, expected: usize }, 47 + } 48 + 49 + fn cid_link_to_bytes(cid: &CidLink) -> Option<Vec<u8>> { 50 + let c = cid.to_cid()?; 51 + let raw = c.to_bytes(); 52 + (raw.len() == CID_BYTE_LEN).then_some(raw) 53 + } 54 + 55 + fn bytes_to_cid_link(bytes: &[u8]) -> Result<Option<CidLink>, PayloadError> { 56 + if bytes.len() != CID_BYTE_LEN { 57 + return Err(PayloadError::InvalidCidLength { 58 + got: bytes.len(), 59 + expected: CID_BYTE_LEN, 60 + }); 61 + } 62 + Ok(cid::Cid::read_bytes(bytes) 63 + .ok() 64 + .map(|c| CidLink::from_cid(&c))) 65 + } 66 + 67 + fn account_status_to_u8(status: &AccountStatus) -> u8 { 68 + match status { 69 + AccountStatus::Active => 0, 70 + AccountStatus::Takendown => 1, 71 + AccountStatus::Suspended => 2, 72 + AccountStatus::Deactivated => 3, 73 + AccountStatus::Deleted => 4, 74 + } 75 + } 76 + 77 + fn u8_to_account_status(tag: u8) -> Option<AccountStatus> { 78 + match tag { 79 + 0 => Some(AccountStatus::Active), 80 + 1 => Some(AccountStatus::Takendown), 81 + 2 => Some(AccountStatus::Suspended), 82 + 3 => Some(AccountStatus::Deactivated), 83 + 4 => Some(AccountStatus::Deleted), 84 + _ => None, 85 + } 86 + } 87 + 88 + pub fn encode_payload(event: &SequencedEvent) -> Vec<u8> { 89 + encode_payload_with_mutations(event, None) 90 + } 91 + 92 + pub fn encode_payload_with_mutations( 93 + event: &SequencedEvent, 94 + mutation_set: Option<&[u8]>, 95 + ) -> Vec<u8> { 96 + let ops_bytes = event 97 + .ops 98 + .as_ref() 99 + .map(|v| serde_json::to_vec(v).expect("serde_json::Value always serializes")); 100 + 101 + let payload = EventPayload { 102 + did: event.did.as_str().to_owned(), 103 + commit_cid: event.commit_cid.as_ref().and_then(cid_link_to_bytes), 104 + prev_cid: event.prev_cid.as_ref().and_then(cid_link_to_bytes), 105 + prev_data_cid: event.prev_data_cid.as_ref().and_then(cid_link_to_bytes), 106 + ops: ops_bytes, 107 + blobs: 
event.blobs.clone(), 108 + blocks_cids: event.blocks_cids.clone(), 109 + handle: event 110 + .handle 111 + .as_ref() 112 + .map(|h: &Handle| h.as_str().to_owned()), 113 + active: event.active, 114 + status: event.status.as_ref().map(account_status_to_u8), 115 + rev: event.rev.clone(), 116 + mutation_set: mutation_set.map(|b| b.to_vec()), 117 + }; 118 + 119 + let body = postcard::to_allocvec(&payload).expect("EventPayload serialization is infallible"); 120 + 121 + if body.len() > LARGE_PAYLOAD_WARNING_THRESHOLD { 122 + tracing::warn!( 123 + size = body.len(), 124 + did = %event.did, 125 + "unusually large event payload" 126 + ); 127 + } 128 + 129 + let mut buf = Vec::with_capacity(1 + body.len()); 130 + buf.push(PAYLOAD_VERSION); 131 + buf.extend_from_slice(&body); 132 + buf 133 + } 134 + 135 + pub fn decode_payload(bytes: &[u8]) -> Result<EventPayload, PayloadError> { 136 + let (&version, body) = bytes.split_first().ok_or(PayloadError::DeserializeFailed( 137 + postcard::Error::DeserializeUnexpectedEnd, 138 + ))?; 139 + 140 + if version != PAYLOAD_VERSION { 141 + return Err(PayloadError::UnknownVersion(version)); 142 + } 143 + 144 + postcard::from_bytes(body).map_err(PayloadError::DeserializeFailed) 145 + } 146 + 147 + pub fn validate_payload_size(payload: &[u8]) -> Result<(), PayloadError> { 148 + let max = MAX_EVENT_PAYLOAD as usize; 149 + if payload.len() > max { 150 + return Err(PayloadError::TooLarge { 151 + size: payload.len(), 152 + max, 153 + }); 154 + } 155 + Ok(()) 156 + } 157 + 158 + pub fn to_sequenced_event( 159 + raw: &RawEvent, 160 + payload: &EventPayload, 161 + ) -> Result<SequencedEvent, PayloadError> { 162 + let timestamp_secs = raw.timestamp.raw() / 1_000_000; 163 + let timestamp_secs_i64 = i64::try_from(timestamp_secs) 164 + .map_err(|_| PayloadError::InvalidTimestamp(raw.timestamp.raw()))?; 165 + let timestamp_subsec_us = 166 + u32::try_from(raw.timestamp.raw() % 1_000_000).expect("modulo 1M always fits u32"); 167 + 168 + let created_at = 169 + chrono::DateTime::from_timestamp(timestamp_secs_i64, timestamp_subsec_us * 1_000) 170 + .unwrap_or_default(); 171 + 172 + let did = Did::new(&payload.did).map_err(|_| PayloadError::InvalidDid(payload.did.clone()))?; 173 + 174 + let ops = payload 175 + .ops 176 + .as_ref() 177 + .map(|bytes| serde_json::from_slice(bytes)) 178 + .transpose() 179 + .map_err(PayloadError::InvalidOps)?; 180 + 181 + let handle = payload 182 + .handle 183 + .as_ref() 184 + .map(|h| Handle::new(h.as_str()).map_err(|_| PayloadError::InvalidHandle(h.clone()))) 185 + .transpose()?; 186 + 187 + Ok(SequencedEvent { 188 + seq: SequenceNumber::from_raw(raw.seq.as_i64()), 189 + did, 190 + created_at, 191 + event_type: raw.event_type.to_repo_event_type(), 192 + commit_cid: payload 193 + .commit_cid 194 + .as_deref() 195 + .map(bytes_to_cid_link) 196 + .transpose()? 197 + .flatten(), 198 + prev_cid: payload 199 + .prev_cid 200 + .as_deref() 201 + .map(bytes_to_cid_link) 202 + .transpose()? 203 + .flatten(), 204 + prev_data_cid: payload 205 + .prev_data_cid 206 + .as_deref() 207 + .map(bytes_to_cid_link) 208 + .transpose()? 
209 + .flatten(), 210 + ops, 211 + blobs: payload.blobs.clone(), 212 + blocks_cids: payload.blocks_cids.clone(), 213 + handle, 214 + active: payload.active, 215 + status: payload.status.and_then(u8_to_account_status), 216 + rev: payload.rev.clone(), 217 + }) 218 + } 219 + 220 + #[cfg(test)] 221 + mod tests { 222 + use super::*; 223 + use crate::eventlog::types::{DidHash, EventSequence, EventTypeTag, TimestampMicros}; 224 + use bytes::Bytes; 225 + use sha2::Digest; 226 + use tranquil_db_traits::RepoEventType; 227 + 228 + fn test_did() -> Did { 229 + Did::new("did:plc:testuser1234567890abcdef").unwrap() 230 + } 231 + 232 + fn test_cid_link() -> CidLink { 233 + let hash = sha2::Digest::finalize(sha2::Sha256::new()); 234 + let mh = multihash::Multihash::<64>::wrap(0x12, &hash).unwrap(); 235 + let c = cid::Cid::new_v1(0x71, mh); 236 + CidLink::from_cid(&c) 237 + } 238 + 239 + #[test] 240 + fn round_trip_minimal_payload() { 241 + let event = SequencedEvent { 242 + seq: SequenceNumber::from_raw(42), 243 + did: test_did(), 244 + created_at: chrono::Utc::now(), 245 + event_type: RepoEventType::Account, 246 + commit_cid: None, 247 + prev_cid: None, 248 + prev_data_cid: None, 249 + ops: None, 250 + blobs: None, 251 + blocks_cids: None, 252 + handle: None, 253 + active: Some(true), 254 + status: Some(AccountStatus::Active), 255 + rev: None, 256 + }; 257 + 258 + let encoded = encode_payload(&event); 259 + assert_eq!(encoded[0], PAYLOAD_VERSION); 260 + 261 + let decoded = decode_payload(&encoded).unwrap(); 262 + assert_eq!(decoded.did, event.did.as_str()); 263 + assert_eq!(decoded.active, Some(true)); 264 + assert_eq!(decoded.status, Some(0)); 265 + assert!(decoded.commit_cid.is_none()); 266 + } 267 + 268 + #[test] 269 + fn round_trip_full_commit_payload() { 270 + let cid = test_cid_link(); 271 + let ops = serde_json::json!([{"action": "create", "path": "app.bsky.feed.post/abc"}]); 272 + 273 + let event = SequencedEvent { 274 + seq: SequenceNumber::from_raw(100), 275 + did: test_did(), 276 + created_at: chrono::Utc::now(), 277 + event_type: RepoEventType::Commit, 278 + commit_cid: Some(cid.clone()), 279 + prev_cid: Some(cid.clone()), 280 + prev_data_cid: Some(cid.clone()), 281 + ops: Some(ops.clone()), 282 + blobs: Some(vec!["bafkreibtest".to_owned()]), 283 + blocks_cids: Some(vec!["bafyreiblock".to_owned()]), 284 + handle: Some(Handle::new("test.bsky.social").unwrap()), 285 + active: None, 286 + status: None, 287 + rev: Some("rev123".to_owned()), 288 + }; 289 + 290 + let encoded = encode_payload(&event); 291 + let decoded = decode_payload(&encoded).unwrap(); 292 + 293 + let raw = RawEvent { 294 + seq: EventSequence::new(100), 295 + timestamp: TimestampMicros::now(), 296 + did_hash: DidHash::from_did(event.did.as_str()), 297 + event_type: EventTypeTag::COMMIT, 298 + payload: Bytes::from(encoded), 299 + }; 300 + 301 + let reconstructed = to_sequenced_event(&raw, &decoded).unwrap(); 302 + assert_eq!(reconstructed.did.as_str(), event.did.as_str()); 303 + assert_eq!(reconstructed.commit_cid, event.commit_cid); 304 + assert_eq!(reconstructed.prev_cid, event.prev_cid); 305 + assert_eq!(reconstructed.prev_data_cid, event.prev_data_cid); 306 + assert_eq!(reconstructed.blobs, event.blobs); 307 + assert_eq!(reconstructed.blocks_cids, event.blocks_cids); 308 + assert_eq!( 309 + reconstructed.handle.as_ref().map(|h: &Handle| h.as_str()), 310 + event.handle.as_ref().map(|h: &Handle| h.as_str()) 311 + ); 312 + assert_eq!(reconstructed.rev, event.rev); 313 + assert_eq!(reconstructed.event_type, RepoEventType::Commit); 
314 + 315 + let reconstructed_ops = reconstructed.ops.unwrap(); 316 + assert_eq!(reconstructed_ops, ops); 317 + } 318 + 319 + #[test] 320 + fn unknown_version_rejected() { 321 + let mut encoded = encode_payload(&SequencedEvent { 322 + seq: SequenceNumber::from_raw(1), 323 + did: test_did(), 324 + created_at: chrono::Utc::now(), 325 + event_type: RepoEventType::Identity, 326 + commit_cid: None, 327 + prev_cid: None, 328 + prev_data_cid: None, 329 + ops: None, 330 + blobs: None, 331 + blocks_cids: None, 332 + handle: None, 333 + active: None, 334 + status: None, 335 + rev: None, 336 + }); 337 + 338 + encoded[0] = 99; 339 + match decode_payload(&encoded) { 340 + Err(PayloadError::UnknownVersion(99)) => {} 341 + other => panic!("expected UnknownVersion(99), got {other:?}"), 342 + } 343 + } 344 + 345 + #[test] 346 + fn empty_payload_rejected() { 347 + match decode_payload(&[]) { 348 + Err(PayloadError::DeserializeFailed(_)) => {} 349 + other => panic!("expected DeserializeFailed, got {other:?}"), 350 + } 351 + } 352 + 353 + #[test] 354 + fn validate_payload_size_accepts_within_limit() { 355 + let data = vec![0u8; MAX_EVENT_PAYLOAD as usize]; 356 + assert!(validate_payload_size(&data).is_ok()); 357 + } 358 + 359 + #[test] 360 + fn validate_payload_size_rejects_oversized() { 361 + let data = vec![0u8; MAX_EVENT_PAYLOAD as usize + 1]; 362 + match validate_payload_size(&data) { 363 + Err(PayloadError::TooLarge { size, max }) => { 364 + assert_eq!(size, MAX_EVENT_PAYLOAD as usize + 1); 365 + assert_eq!(max, MAX_EVENT_PAYLOAD as usize); 366 + } 367 + other => panic!("expected TooLarge, got {other:?}"), 368 + } 369 + } 370 + 371 + #[test] 372 + fn account_status_round_trip() { 373 + let statuses = [ 374 + AccountStatus::Active, 375 + AccountStatus::Takendown, 376 + AccountStatus::Suspended, 377 + AccountStatus::Deactivated, 378 + AccountStatus::Deleted, 379 + ]; 380 + 381 + statuses.iter().for_each(|status| { 382 + let tag = account_status_to_u8(status); 383 + let recovered = u8_to_account_status(tag).unwrap(); 384 + assert_eq!(&recovered, status); 385 + }); 386 + } 387 + 388 + #[test] 389 + fn invalid_account_status_returns_none() { 390 + assert!(u8_to_account_status(255).is_none()); 391 + } 392 + 393 + #[test] 394 + fn cid_bytes_round_trip() { 395 + let cid = test_cid_link(); 396 + let bytes = cid_link_to_bytes(&cid).unwrap(); 397 + assert_eq!(bytes.len(), CID_BYTE_LEN); 398 + let recovered = bytes_to_cid_link(&bytes).unwrap().unwrap(); 399 + assert_eq!(cid, recovered); 400 + } 401 + 402 + #[test] 403 + fn cid_bytes_wrong_length_rejected() { 404 + let short = vec![0u8; 10]; 405 + match bytes_to_cid_link(&short) { 406 + Err(PayloadError::InvalidCidLength { 407 + got: 10, 408 + expected: 36, 409 + }) => {} 410 + other => panic!("expected InvalidCidLength, got {other:?}"), 411 + } 412 + } 413 + 414 + #[test] 415 + fn event_type_tag_mapping() { 416 + assert_eq!( 417 + EventTypeTag::COMMIT.to_repo_event_type(), 418 + RepoEventType::Commit 419 + ); 420 + assert_eq!( 421 + EventTypeTag::IDENTITY.to_repo_event_type(), 422 + RepoEventType::Identity 423 + ); 424 + assert_eq!( 425 + EventTypeTag::ACCOUNT.to_repo_event_type(), 426 + RepoEventType::Account 427 + ); 428 + assert_eq!(EventTypeTag::SYNC.to_repo_event_type(), RepoEventType::Sync); 429 + } 430 + 431 + #[test] 432 + fn timestamp_microseconds_preserved() { 433 + let us = 1_700_000_000_123_456u64; 434 + let raw = RawEvent { 435 + seq: EventSequence::new(1), 436 + timestamp: TimestampMicros::new(us), 437 + did_hash: DidHash::from_did("did:plc:test"), 438 
+ event_type: EventTypeTag::COMMIT, 439 + payload: Bytes::new(), 440 + }; 441 + 442 + let payload = EventPayload { 443 + did: "did:plc:testuser1234567890abcdef".to_owned(), 444 + commit_cid: None, 445 + prev_cid: None, 446 + prev_data_cid: None, 447 + ops: None, 448 + blobs: None, 449 + blocks_cids: None, 450 + handle: None, 451 + active: None, 452 + status: None, 453 + rev: None, 454 + mutation_set: None, 455 + }; 456 + 457 + let event = to_sequenced_event(&raw, &payload).unwrap(); 458 + let recovered_us = u64::try_from(event.created_at.timestamp_micros()).unwrap(); 459 + assert_eq!(recovered_us, us); 460 + } 461 + }
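Payloads travel as a one-byte version tag followed by a postcard body, so a decoder can refuse formats it does not understand before touching the bytes. A small round-trip sketch over the exported helpers, mirroring the tests above (the function name is illustrative; `event` is any SequencedEvent):

use tranquil_db_traits::SequencedEvent;
use crate::eventlog::{PayloadError, decode_payload, encode_payload, validate_payload_size};

fn payload_round_trip(event: &SequencedEvent) -> Result<(), PayloadError> {
    let bytes = encode_payload(event);       // [PAYLOAD_VERSION][postcard body]
    validate_payload_size(&bytes)?;          // same check append_raw_payload applies
    let payload = decode_payload(&bytes)?;   // Err(UnknownVersion(_)) on a bumped tag
    assert_eq!(payload.did, event.did.as_str());
    Ok(())
}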
+1011
crates/tranquil-store/src/eventlog/reader.rs
··· 1 + use std::cell::Cell; 2 + use std::collections::HashMap; 3 + use std::io; 4 + use std::sync::Arc; 5 + 6 + use bytes::Bytes; 7 + use parking_lot::RwLock; 8 + use tracing::warn; 9 + 10 + use crate::io::{MappedFile, StorageIO}; 11 + 12 + use super::manager::SegmentManager; 13 + use super::segment_file::{ReadEventRecord, SEGMENT_HEADER_SIZE, decode_event_record}; 14 + use super::segment_index::{DEFAULT_INDEX_INTERVAL, SegmentIndex, rebuild_from_segment}; 15 + use super::types::{ 16 + DidHash, EventSequence, EventTypeTag, SegmentId, SegmentOffset, TimestampMicros, 17 + }; 18 + 19 + const FIRST_EVENT_OFFSET: SegmentOffset = SegmentOffset::new(SEGMENT_HEADER_SIZE as u64); 20 + 21 + #[derive(Debug, Clone)] 22 + pub struct RawEvent { 23 + pub seq: EventSequence, 24 + pub timestamp: TimestampMicros, 25 + pub did_hash: DidHash, 26 + pub event_type: EventTypeTag, 27 + pub payload: Bytes, 28 + } 29 + 30 + #[derive(Debug, Clone, Copy)] 31 + struct SegmentRange { 32 + id: SegmentId, 33 + first: EventSequence, 34 + last: EventSequence, 35 + } 36 + 37 + pub struct EventLogReader<S: StorageIO> { 38 + manager: Arc<SegmentManager<S>>, 39 + indexes: RwLock<HashMap<SegmentId, Arc<SegmentIndex>>>, 40 + ranges: RwLock<Vec<SegmentRange>>, 41 + mmaps: RwLock<HashMap<SegmentId, Arc<MappedFile>>>, 42 + active_segment: RwLock<Option<SegmentId>>, 43 + use_mmap: bool, 44 + } 45 + 46 + impl<S: StorageIO> EventLogReader<S> { 47 + pub fn new(manager: Arc<SegmentManager<S>>, use_mmap: bool) -> Self { 48 + Self { 49 + manager, 50 + indexes: RwLock::new(HashMap::new()), 51 + ranges: RwLock::new(Vec::new()), 52 + mmaps: RwLock::new(HashMap::new()), 53 + active_segment: RwLock::new(None), 54 + use_mmap, 55 + } 56 + } 57 + 58 + pub fn set_active_segment(&self, id: SegmentId) { 59 + *self.active_segment.write() = Some(id); 60 + } 61 + 62 + pub fn extend_active_range(&self, first_seq: EventSequence, last_seq: EventSequence) { 63 + let active_id = match *self.active_segment.read() { 64 + Some(id) => id, 65 + None => return, 66 + }; 67 + 68 + let mut ranges = self.ranges.write(); 69 + match ranges.last_mut() { 70 + Some(last_range) if last_range.id == active_id => { 71 + last_range.last = last_seq; 72 + } 73 + _ => { 74 + ranges.push(SegmentRange { 75 + id: active_id, 76 + first: first_seq, 77 + last: last_seq, 78 + }); 79 + } 80 + } 81 + } 82 + 83 + pub fn seed_index(&self, segment_id: SegmentId, index: SegmentIndex) { 84 + self.indexes.write().insert(segment_id, Arc::new(index)); 85 + } 86 + 87 + pub fn load_index(&self, segment_id: SegmentId) -> io::Result<Arc<SegmentIndex>> { 88 + if let Some(idx) = self.indexes.read().get(&segment_id) { 89 + return Ok(Arc::clone(idx)); 90 + } 91 + 92 + let index = match SegmentIndex::load( 93 + self.manager.io(), 94 + &self.manager.index_path(segment_id), 95 + ) { 96 + Ok(Some(idx)) => idx, 97 + Ok(None) => self.rebuild_index(segment_id)?, 98 + Err(e) => { 99 + warn!(segment = %segment_id, error = %e, "index load failed, rebuilding from segment scan"); 100 + self.rebuild_index(segment_id)? 
101 + } 102 + }; 103 + 104 + let arc = Arc::new(index); 105 + self.indexes.write().insert(segment_id, Arc::clone(&arc)); 106 + Ok(arc) 107 + } 108 + 109 + fn rebuild_index(&self, segment_id: SegmentId) -> io::Result<SegmentIndex> { 110 + let fd = self.manager.open_for_read(segment_id)?; 111 + let (idx, _) = rebuild_from_segment(self.manager.io(), fd, DEFAULT_INDEX_INTERVAL)?; 112 + let _ = idx.save(self.manager.io(), &self.manager.index_path(segment_id)); 113 + Ok(idx) 114 + } 115 + 116 + pub fn find_segment_for_seq(&self, target_seq: EventSequence) -> Option<SegmentId> { 117 + let ranges = self.ranges.read(); 118 + let idx = ranges.partition_point(|r| r.last < target_seq); 119 + ranges 120 + .get(idx) 121 + .and_then(|r| (r.first <= target_seq).then_some(r.id)) 122 + } 123 + 124 + pub fn refresh_segment_ranges(&self) -> io::Result<()> { 125 + let segment_ids = self.manager.list_segments()?; 126 + let active = *self.active_segment.read(); 127 + 128 + let new_ranges: Vec<SegmentRange> = segment_ids 129 + .iter() 130 + .filter_map(|&id| { 131 + let is_active = active.is_some_and(|a| a == id); 132 + let idx = match is_active { 133 + true => { 134 + if let Some(cached) = self.indexes.read().get(&id).cloned() { 135 + Ok(cached) 136 + } else { 137 + self.rebuild_index(id).map(|rebuilt| { 138 + let arc = Arc::new(rebuilt); 139 + self.indexes.write().insert(id, Arc::clone(&arc)); 140 + arc 141 + }) 142 + } 143 + } 144 + false => self.load_index(id), 145 + }; 146 + match idx { 147 + Ok(idx) => match (idx.first_seq(), idx.last_seq()) { 148 + (Some(first), Some(last)) => Some(SegmentRange { id, first, last }), 149 + _ => None, 150 + }, 151 + Err(e) => { 152 + warn!(segment = %id, error = %e, "failed to load index for range cache"); 153 + None 154 + } 155 + } 156 + }) 157 + .collect(); 158 + *self.ranges.write() = new_ranges; 159 + Ok(()) 160 + } 161 + 162 + fn is_mmap_eligible(&self, segment_id: SegmentId) -> bool { 163 + self.use_mmap 164 + && self 165 + .active_segment 166 + .read() 167 + .is_none_or(|active| active != segment_id) 168 + } 169 + 170 + fn get_mmap(&self, segment_id: SegmentId) -> io::Result<Arc<MappedFile>> { 171 + if let Some(m) = self.mmaps.read().get(&segment_id) { 172 + return Ok(Arc::clone(m)); 173 + } 174 + 175 + let fd = self.manager.open_for_read(segment_id)?; 176 + let mapped = self.manager.io().mmap_file(fd)?; 177 + let arc = Arc::new(mapped); 178 + self.mmaps.write().insert(segment_id, Arc::clone(&arc)); 179 + Ok(arc) 180 + } 181 + 182 + fn scan_events_from_offset( 183 + &self, 184 + segment_id: SegmentId, 185 + start_offset: SegmentOffset, 186 + start_seq: EventSequence, 187 + limit: usize, 188 + events: &mut Vec<RawEvent>, 189 + predicate: impl FnMut(&EventSequence) -> bool, 190 + ) -> io::Result<bool> { 191 + if self.is_mmap_eligible(segment_id) { 192 + self.scan_mmap( 193 + segment_id, 194 + start_offset, 195 + start_seq, 196 + limit, 197 + events, 198 + predicate, 199 + ) 200 + } else { 201 + let fd = self.manager.open_for_read(segment_id)?; 202 + let file_size = self.manager.io().file_size(fd)?; 203 + self.scan_direct( 204 + fd, 205 + file_size, 206 + start_offset, 207 + start_seq, 208 + limit, 209 + events, 210 + predicate, 211 + ) 212 + } 213 + } 214 + 215 + fn scan_mmap( 216 + &self, 217 + segment_id: SegmentId, 218 + start_offset: SegmentOffset, 219 + start_seq: EventSequence, 220 + limit: usize, 221 + events: &mut Vec<RawEvent>, 222 + mut predicate: impl FnMut(&EventSequence) -> bool, 223 + ) -> io::Result<bool> { 224 + let mmap = self.get_mmap(segment_id)?; 225 
+ let data: &[u8] = (*mmap).as_ref(); 226 + let file_size = data.len() as u64; 227 + let offset = Cell::new(start_offset); 228 + let collected = Cell::new(0usize); 229 + 230 + std::iter::from_fn(|| { 231 + let cur = offset.get(); 232 + (cur.raw() < file_size && collected.get() < limit) 233 + .then(|| decode_mmap_event(data, cur, file_size, segment_id)) 234 + }) 235 + .try_for_each(|result| -> io::Result<()> { 236 + match result? { 237 + MmapDecodeResult::Valid(event, next_offset) => { 238 + offset.set(next_offset); 239 + if event.seq > start_seq && predicate(&event.seq) { 240 + events.push(event); 241 + collected.set(collected.get() + 1); 242 + } 243 + } 244 + MmapDecodeResult::Corrupted 245 + | MmapDecodeResult::Truncated 246 + | MmapDecodeResult::EndOfSegment => { 247 + offset.set(SegmentOffset::new(file_size)); 248 + } 249 + } 250 + Ok(()) 251 + })?; 252 + Ok(collected.get() >= limit) 253 + } 254 + 255 + #[allow(clippy::too_many_arguments)] 256 + fn scan_direct( 257 + &self, 258 + fd: crate::io::FileId, 259 + file_size: u64, 260 + start_offset: SegmentOffset, 261 + start_seq: EventSequence, 262 + limit: usize, 263 + events: &mut Vec<RawEvent>, 264 + mut predicate: impl FnMut(&EventSequence) -> bool, 265 + ) -> io::Result<bool> { 266 + let offset = Cell::new(start_offset); 267 + let collected = Cell::new(0usize); 268 + 269 + std::iter::from_fn(|| { 270 + let cur = offset.get(); 271 + (cur.raw() < file_size && collected.get() < limit) 272 + .then(|| decode_event_record(self.manager.io(), fd, cur, file_size)) 273 + }) 274 + .try_for_each(|result| -> io::Result<()> { 275 + match result? { 276 + Some(ReadEventRecord::Valid { event, next_offset }) => { 277 + offset.set(next_offset); 278 + if event.seq > start_seq && predicate(&event.seq) { 279 + events.push(RawEvent { 280 + seq: event.seq, 281 + timestamp: event.timestamp, 282 + did_hash: event.did_hash, 283 + event_type: event.event_type, 284 + payload: Bytes::from(event.payload), 285 + }); 286 + collected.set(collected.get() + 1); 287 + } 288 + } 289 + Some(ReadEventRecord::Corrupted { .. } | ReadEventRecord::Truncated { .. 
}) 290 + | None => { 291 + offset.set(SegmentOffset::new(file_size)); 292 + } 293 + } 294 + Ok(()) 295 + })?; 296 + Ok(collected.get() >= limit) 297 + } 298 + 299 + pub fn read_events_from( 300 + &self, 301 + start_seq: EventSequence, 302 + limit: usize, 303 + ) -> io::Result<Vec<RawEvent>> { 304 + if limit == 0 { 305 + return Ok(Vec::new()); 306 + } 307 + 308 + let ranges = self.ranges.read().clone(); 309 + let start_idx = match start_seq { 310 + EventSequence::BEFORE_ALL => Some(0), 311 + seq => { 312 + let point = ranges.partition_point(|r| r.last <= seq); 313 + (point < ranges.len()).then_some(point) 314 + } 315 + }; 316 + 317 + let start_idx = match start_idx { 318 + Some(idx) => idx, 319 + None => return Ok(Vec::new()), 320 + }; 321 + 322 + let mut events = Vec::with_capacity(limit.min(1024)); 323 + 324 + ranges[start_idx..].iter().enumerate().try_fold( 325 + false, 326 + |limit_reached, (i, range)| -> io::Result<bool> { 327 + if limit_reached { 328 + return Ok(true); 329 + } 330 + 331 + let remaining = limit - events.len(); 332 + let is_first = i == 0; 333 + 334 + let (scan_offset, effective_seq) = match (is_first, start_seq) { 335 + (_, EventSequence::BEFORE_ALL) | (false, _) => { 336 + (FIRST_EVENT_OFFSET, EventSequence::BEFORE_ALL) 337 + } 338 + (true, seq) => { 339 + let index = self.load_index(range.id)?; 340 + (index.lookup(seq).unwrap_or(FIRST_EVENT_OFFSET), seq) 341 + } 342 + }; 343 + 344 + self.scan_events_from_offset( 345 + range.id, 346 + scan_offset, 347 + effective_seq, 348 + remaining, 349 + &mut events, 350 + |_| true, 351 + ) 352 + }, 353 + )?; 354 + 355 + Ok(events) 356 + } 357 + 358 + pub fn read_event_at(&self, seq: EventSequence) -> io::Result<Option<RawEvent>> { 359 + let segment_id = match self.find_segment_for_seq(seq) { 360 + Some(id) => id, 361 + None => return Ok(None), 362 + }; 363 + 364 + let index = self.load_index(segment_id)?; 365 + let scan_offset = index.lookup(seq).unwrap_or(FIRST_EVENT_OFFSET); 366 + 367 + let mut events = Vec::with_capacity(1); 368 + self.scan_events_from_offset( 369 + segment_id, 370 + scan_offset, 371 + seq.prev_or_before_all(), 372 + 1, 373 + &mut events, 374 + |s| *s == seq, 375 + )?; 376 + 377 + Ok(events.into_iter().next()) 378 + } 379 + 380 + pub fn on_segment_rotated( 381 + &self, 382 + sealed_id: SegmentId, 383 + new_active_id: SegmentId, 384 + ) -> io::Result<()> { 385 + self.invalidate_index(sealed_id); 386 + self.invalidate_mmap(sealed_id); 387 + self.set_active_segment(new_active_id); 388 + self.refresh_segment_ranges() 389 + } 390 + 391 + pub fn invalidate_mmap(&self, segment_id: SegmentId) { 392 + self.mmaps.write().remove(&segment_id); 393 + } 394 + 395 + pub fn invalidate_index(&self, segment_id: SegmentId) { 396 + self.indexes.write().remove(&segment_id); 397 + } 398 + } 399 + 400 + enum MmapDecodeResult { 401 + Valid(RawEvent, SegmentOffset), 402 + Corrupted, 403 + Truncated, 404 + EndOfSegment, 405 + } 406 + 407 + fn decode_mmap_event( 408 + data: &[u8], 409 + offset: SegmentOffset, 410 + file_size: u64, 411 + segment_id: SegmentId, 412 + ) -> io::Result<MmapDecodeResult> { 413 + use super::segment_file::EVENT_HEADER_SIZE; 414 + use super::types::MAX_EVENT_PAYLOAD; 415 + 416 + let raw = offset.raw(); 417 + if raw > file_size { 418 + warn!( 419 + segment = %segment_id, 420 + offset = raw, 421 + file_size, 422 + "decode offset past file size (corrupt index?)" 423 + ); 424 + return Ok(MmapDecodeResult::Corrupted); 425 + } 426 + let remaining = file_size - raw; 427 + if remaining == 0 { 428 + return 
Ok(MmapDecodeResult::EndOfSegment); 429 + } 430 + 431 + if remaining < EVENT_HEADER_SIZE as u64 { 432 + warn!( 433 + segment = %segment_id, 434 + offset = raw, 435 + remaining, 436 + "truncated record in sealed segment: not enough bytes for header" 437 + ); 438 + return Ok(MmapDecodeResult::Truncated); 439 + } 440 + 441 + let base = usize::try_from(raw).expect("file offset exceeds platform address space"); 442 + let header_slice = &data[base..base + EVENT_HEADER_SIZE]; 443 + 444 + let seq_raw = u64::from_le_bytes(header_slice[0..8].try_into().unwrap()); 445 + if seq_raw == 0 { 446 + warn!( 447 + segment = %segment_id, 448 + offset = raw, 449 + "corrupted record in sealed segment: seq == 0" 450 + ); 451 + return Ok(MmapDecodeResult::Corrupted); 452 + } 453 + let seq = EventSequence::new(seq_raw); 454 + 455 + let timestamp = 456 + TimestampMicros::new(u64::from_le_bytes(header_slice[8..16].try_into().unwrap())); 457 + let did_hash = DidHash::from_raw(u32::from_le_bytes(header_slice[16..20].try_into().unwrap())); 458 + let event_type = match EventTypeTag::from_raw(header_slice[20]) { 459 + Some(t) => t, 460 + None => { 461 + warn!( 462 + segment = %segment_id, 463 + offset = raw, 464 + tag = header_slice[20], 465 + "corrupted record in sealed segment: invalid event type" 466 + ); 467 + return Ok(MmapDecodeResult::Corrupted); 468 + } 469 + }; 470 + 471 + let payload_len = u32::from_le_bytes(header_slice[21..25].try_into().unwrap()); 472 + if payload_len > MAX_EVENT_PAYLOAD { 473 + warn!( 474 + segment = %segment_id, 475 + offset = raw, 476 + payload_len, 477 + "corrupted record in sealed segment: payload exceeds maximum" 478 + ); 479 + return Ok(MmapDecodeResult::Corrupted); 480 + } 481 + 482 + let record_size = super::segment_file::EVENT_RECORD_OVERHEAD as u64 + u64::from(payload_len); 483 + if record_size > remaining { 484 + warn!( 485 + segment = %segment_id, 486 + offset = raw, 487 + record_size, 488 + remaining, 489 + "truncated record in sealed segment: record extends past file end" 490 + ); 491 + return Ok(MmapDecodeResult::Truncated); 492 + } 493 + 494 + let payload_start = base + EVENT_HEADER_SIZE; 495 + let payload_end = payload_start + usize::try_from(payload_len).expect("payload_len fits usize"); 496 + 497 + let checksum_start = payload_end; 498 + let stored_checksum = 499 + u32::from_le_bytes(data[checksum_start..checksum_start + 4].try_into().unwrap()); 500 + 501 + let mut hasher = xxhash_rust::xxh3::Xxh3::new(); 502 + hasher.update(header_slice); 503 + hasher.update(&data[payload_start..payload_end]); 504 + let computed = hasher.digest() as u32; 505 + 506 + if stored_checksum != computed { 507 + warn!( 508 + segment = %segment_id, 509 + offset = raw, 510 + seq = %seq, 511 + stored = stored_checksum, 512 + computed, 513 + "corrupted record in sealed segment: checksum mismatch" 514 + ); 515 + return Ok(MmapDecodeResult::Corrupted); 516 + } 517 + 518 + let next_offset = offset.advance(record_size); 519 + Ok(MmapDecodeResult::Valid( 520 + RawEvent { 521 + seq, 522 + timestamp, 523 + did_hash, 524 + event_type, 525 + payload: Bytes::copy_from_slice(&data[payload_start..payload_end]), 526 + }, 527 + next_offset, 528 + )) 529 + } 530 + 531 + #[cfg(test)] 532 + mod tests { 533 + use super::*; 534 + use crate::eventlog::segment_file::EVENT_RECORD_OVERHEAD; 535 + use crate::eventlog::writer::EventLogWriter; 536 + use crate::sim::SimulatedIO; 537 + use std::path::PathBuf; 538 + 539 + fn setup_manager(max_segment_size: u64) -> Arc<SegmentManager<SimulatedIO>> { 540 + let sim = 
SimulatedIO::pristine(42); 541 + Arc::new(SegmentManager::new(sim, PathBuf::from("/segments"), max_segment_size).unwrap()) 542 + } 543 + 544 + fn setup_with_events( 545 + event_count: u64, 546 + payload_size: usize, 547 + max_segment_size: u64, 548 + ) -> ( 549 + Arc<SegmentManager<SimulatedIO>>, 550 + EventLogReader<SimulatedIO>, 551 + ) { 552 + let mgr = setup_manager(max_segment_size); 553 + { 554 + let mut writer = 555 + EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 556 + (1..=event_count).for_each(|i| { 557 + writer 558 + .append( 559 + DidHash::from_did(&format!("did:plc:user{i}")), 560 + EventTypeTag::COMMIT, 561 + vec![0xAA; payload_size], 562 + ) 563 + .unwrap(); 564 + }); 565 + writer.shutdown().unwrap(); 566 + } 567 + mgr.shutdown(); 568 + 569 + let reader = EventLogReader::new(Arc::clone(&mgr), false); 570 + reader.refresh_segment_ranges().unwrap(); 571 + (mgr, reader) 572 + } 573 + 574 + fn setup_multi_segment( 575 + events_per_segment: u64, 576 + num_segments: u64, 577 + payload_size: usize, 578 + ) -> ( 579 + Arc<SegmentManager<SimulatedIO>>, 580 + EventLogReader<SimulatedIO>, 581 + ) { 582 + let record_size = EVENT_RECORD_OVERHEAD + payload_size; 583 + let max_segment_size = 584 + (SEGMENT_HEADER_SIZE + record_size * events_per_segment as usize) as u64; 585 + 586 + let mgr = setup_manager(max_segment_size); 587 + { 588 + let mut writer = 589 + EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 590 + let total = events_per_segment * num_segments; 591 + (1..=total).for_each(|i| { 592 + writer 593 + .append( 594 + DidHash::from_did(&format!("did:plc:user{i}")), 595 + EventTypeTag::COMMIT, 596 + vec![i as u8; payload_size], 597 + ) 598 + .unwrap(); 599 + if i % events_per_segment == 0 && i < total { 600 + writer.sync().unwrap(); 601 + writer.rotate_if_needed().unwrap(); 602 + } 603 + }); 604 + writer.shutdown().unwrap(); 605 + } 606 + mgr.shutdown(); 607 + 608 + let reader = EventLogReader::new(Arc::clone(&mgr), false); 609 + reader.refresh_segment_ranges().unwrap(); 610 + (mgr, reader) 611 + } 612 + 613 + #[test] 614 + fn read_events_from_single_segment() { 615 + let (_, reader) = setup_with_events(10, 50, 64 * 1024); 616 + 617 + let events = reader 618 + .read_events_from(EventSequence::BEFORE_ALL, 100) 619 + .unwrap(); 620 + assert_eq!(events.len(), 10); 621 + events.iter().enumerate().for_each(|(i, e)| { 622 + assert_eq!(e.seq, EventSequence::new(i as u64 + 1)); 623 + assert_eq!(e.event_type, EventTypeTag::COMMIT); 624 + assert_eq!(e.payload.len(), 50); 625 + }); 626 + } 627 + 628 + #[test] 629 + fn read_events_from_cursor() { 630 + let (_, reader) = setup_with_events(10, 50, 64 * 1024); 631 + 632 + let events = reader.read_events_from(EventSequence::new(5), 100).unwrap(); 633 + assert_eq!(events.len(), 5); 634 + assert_eq!(events[0].seq, EventSequence::new(6)); 635 + assert_eq!(events[4].seq, EventSequence::new(10)); 636 + } 637 + 638 + #[test] 639 + fn read_events_respects_limit() { 640 + let (_, reader) = setup_with_events(10, 50, 64 * 1024); 641 + 642 + let events = reader 643 + .read_events_from(EventSequence::BEFORE_ALL, 3) 644 + .unwrap(); 645 + assert_eq!(events.len(), 3); 646 + assert_eq!(events[0].seq, EventSequence::new(1)); 647 + assert_eq!(events[2].seq, EventSequence::new(3)); 648 + } 649 + 650 + #[test] 651 + fn read_events_empty_on_zero_limit() { 652 + let (_, reader) = setup_with_events(5, 50, 64 * 1024); 653 + let events = reader 654 + .read_events_from(EventSequence::BEFORE_ALL, 0) 655 + .unwrap(); 656 + 
assert!(events.is_empty()); 657 + } 658 + 659 + #[test] 660 + fn read_event_at_existing() { 661 + let (_, reader) = setup_with_events(10, 50, 64 * 1024); 662 + 663 + let event = reader.read_event_at(EventSequence::new(5)).unwrap(); 664 + assert!(event.is_some()); 665 + let event = event.unwrap(); 666 + assert_eq!(event.seq, EventSequence::new(5)); 667 + assert_eq!(event.payload.len(), 50); 668 + } 669 + 670 + #[test] 671 + fn read_event_at_missing() { 672 + let (_, reader) = setup_with_events(5, 50, 64 * 1024); 673 + 674 + let event = reader.read_event_at(EventSequence::new(100)).unwrap(); 675 + assert!(event.is_none()); 676 + } 677 + 678 + #[test] 679 + fn cross_segment_read() { 680 + let (_, reader) = setup_multi_segment(3, 3, 50); 681 + 682 + let events = reader 683 + .read_events_from(EventSequence::BEFORE_ALL, 100) 684 + .unwrap(); 685 + assert_eq!(events.len(), 9); 686 + events.iter().enumerate().for_each(|(i, e)| { 687 + assert_eq!(e.seq, EventSequence::new(i as u64 + 1)); 688 + }); 689 + } 690 + 691 + #[test] 692 + fn cross_segment_cursor_resumption() { 693 + let (_, reader) = setup_multi_segment(3, 3, 50); 694 + 695 + let events = reader.read_events_from(EventSequence::new(4), 100).unwrap(); 696 + assert_eq!(events.len(), 5); 697 + assert_eq!(events[0].seq, EventSequence::new(5)); 698 + assert_eq!(events[4].seq, EventSequence::new(9)); 699 + } 700 + 701 + #[test] 702 + fn cross_segment_limit_respected() { 703 + let (_, reader) = setup_multi_segment(3, 3, 50); 704 + 705 + let events = reader 706 + .read_events_from(EventSequence::BEFORE_ALL, 5) 707 + .unwrap(); 708 + assert_eq!(events.len(), 5); 709 + assert_eq!(events[0].seq, EventSequence::new(1)); 710 + assert_eq!(events[4].seq, EventSequence::new(5)); 711 + } 712 + 713 + #[test] 714 + fn cross_segment_limit_at_boundary() { 715 + let (_, reader) = setup_multi_segment(3, 3, 50); 716 + 717 + let events = reader.read_events_from(EventSequence::new(2), 5).unwrap(); 718 + assert_eq!(events.len(), 5); 719 + assert_eq!(events[0].seq, EventSequence::new(3)); 720 + assert_eq!(events[4].seq, EventSequence::new(7)); 721 + } 722 + 723 + #[test] 724 + fn find_segment_for_seq_locates_correct_segment() { 725 + let (_, reader) = setup_multi_segment(3, 2, 50); 726 + 727 + assert_eq!( 728 + reader.find_segment_for_seq(EventSequence::new(1)), 729 + Some(SegmentId::new(1)) 730 + ); 731 + assert_eq!( 732 + reader.find_segment_for_seq(EventSequence::new(3)), 733 + Some(SegmentId::new(1)) 734 + ); 735 + assert_eq!( 736 + reader.find_segment_for_seq(EventSequence::new(4)), 737 + Some(SegmentId::new(2)) 738 + ); 739 + assert_eq!( 740 + reader.find_segment_for_seq(EventSequence::new(6)), 741 + Some(SegmentId::new(2)) 742 + ); 743 + assert_eq!(reader.find_segment_for_seq(EventSequence::new(100)), None); 744 + } 745 + 746 + #[test] 747 + fn index_caching_returns_same_arc() { 748 + let (_, reader) = setup_with_events(5, 50, 64 * 1024); 749 + 750 + let idx1 = reader.load_index(SegmentId::new(1)).unwrap(); 751 + let idx2 = reader.load_index(SegmentId::new(1)).unwrap(); 752 + assert!(Arc::ptr_eq(&idx1, &idx2)); 753 + } 754 + 755 + #[test] 756 + fn refresh_after_segment_deletion() { 757 + let (mgr, reader) = setup_multi_segment(3, 3, 50); 758 + 759 + assert_eq!( 760 + reader.find_segment_for_seq(EventSequence::new(1)), 761 + Some(SegmentId::new(1)) 762 + ); 763 + 764 + mgr.delete_segment(SegmentId::new(1)).unwrap(); 765 + reader.invalidate_index(SegmentId::new(1)); 766 + reader.invalidate_mmap(SegmentId::new(1)); 767 + reader.refresh_segment_ranges().unwrap(); 
768 + 769 + assert_eq!(reader.find_segment_for_seq(EventSequence::new(1)), None); 770 + assert_eq!( 771 + reader.find_segment_for_seq(EventSequence::new(4)), 772 + Some(SegmentId::new(2)) 773 + ); 774 + } 775 + 776 + #[test] 777 + fn mmap_read_matches_direct_read() { 778 + let (mgr, direct_reader) = setup_with_events(10, 50, 64 * 1024); 779 + 780 + let mmap_reader = EventLogReader::new(Arc::clone(&mgr), true); 781 + mmap_reader.refresh_segment_ranges().unwrap(); 782 + 783 + let direct_events = direct_reader 784 + .read_events_from(EventSequence::BEFORE_ALL, 100) 785 + .unwrap(); 786 + let mmap_events = mmap_reader 787 + .read_events_from(EventSequence::BEFORE_ALL, 100) 788 + .unwrap(); 789 + 790 + assert_eq!(direct_events.len(), mmap_events.len()); 791 + direct_events 792 + .iter() 793 + .zip(mmap_events.iter()) 794 + .for_each(|(d, m)| { 795 + assert_eq!(d.seq, m.seq); 796 + assert_eq!(d.timestamp, m.timestamp); 797 + assert_eq!(d.did_hash, m.did_hash); 798 + assert_eq!(d.event_type, m.event_type); 799 + assert_eq!(d.payload, m.payload); 800 + }); 801 + } 802 + 803 + #[test] 804 + fn read_event_at_first_and_last() { 805 + let (_, reader) = setup_with_events(20, 50, 64 * 1024); 806 + 807 + let first = reader 808 + .read_event_at(EventSequence::new(1)) 809 + .unwrap() 810 + .unwrap(); 811 + assert_eq!(first.seq, EventSequence::new(1)); 812 + 813 + let last = reader 814 + .read_event_at(EventSequence::new(20)) 815 + .unwrap() 816 + .unwrap(); 817 + assert_eq!(last.seq, EventSequence::new(20)); 818 + } 819 + 820 + #[test] 821 + fn empty_reader_returns_empty() { 822 + let mgr = setup_manager(64 * 1024); 823 + let reader = EventLogReader::new(Arc::clone(&mgr), false); 824 + reader.refresh_segment_ranges().unwrap(); 825 + 826 + let events = reader 827 + .read_events_from(EventSequence::BEFORE_ALL, 100) 828 + .unwrap(); 829 + assert!(events.is_empty()); 830 + 831 + let event = reader.read_event_at(EventSequence::new(1)).unwrap(); 832 + assert!(event.is_none()); 833 + } 834 + 835 + #[test] 836 + fn cursor_past_end_returns_empty() { 837 + let (_, reader) = setup_with_events(5, 50, 64 * 1024); 838 + let events = reader.read_events_from(EventSequence::new(5), 100).unwrap(); 839 + assert!(events.is_empty()); 840 + } 841 + 842 + #[test] 843 + fn cross_segment_read_event_at() { 844 + let (_, reader) = setup_multi_segment(3, 3, 50); 845 + 846 + (1..=9).for_each(|i| { 847 + let event = reader 848 + .read_event_at(EventSequence::new(i)) 849 + .unwrap() 850 + .unwrap(); 851 + assert_eq!(event.seq, EventSequence::new(i)); 852 + assert_eq!(event.payload[0], i as u8); 853 + }); 854 + } 855 + 856 + #[test] 857 + fn different_event_types_preserved() { 858 + let mgr = setup_manager(64 * 1024); 859 + { 860 + let mut writer = 861 + EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 862 + let types = [ 863 + EventTypeTag::COMMIT, 864 + EventTypeTag::IDENTITY, 865 + EventTypeTag::ACCOUNT, 866 + EventTypeTag::SYNC, 867 + ]; 868 + types.iter().enumerate().for_each(|(i, &et)| { 869 + writer 870 + .append( 871 + DidHash::from_did(&format!("did:plc:user{i}")), 872 + et, 873 + vec![0xAA; 32], 874 + ) 875 + .unwrap(); 876 + }); 877 + writer.shutdown().unwrap(); 878 + } 879 + mgr.shutdown(); 880 + 881 + let reader = EventLogReader::new(Arc::clone(&mgr), false); 882 + reader.refresh_segment_ranges().unwrap(); 883 + 884 + let events = reader 885 + .read_events_from(EventSequence::BEFORE_ALL, 100) 886 + .unwrap(); 887 + assert_eq!(events[0].event_type, EventTypeTag::COMMIT); 888 + 
assert_eq!(events[1].event_type, EventTypeTag::IDENTITY); 889 + assert_eq!(events[2].event_type, EventTypeTag::ACCOUNT); 890 + assert_eq!(events[3].event_type, EventTypeTag::SYNC); 891 + } 892 + 893 + #[test] 894 + fn active_segment_excludes_mmap() { 895 + let (mgr, _) = setup_with_events(10, 50, 64 * 1024); 896 + 897 + let reader = EventLogReader::new(Arc::clone(&mgr), true); 898 + reader.set_active_segment(SegmentId::new(1)); 899 + reader.refresh_segment_ranges().unwrap(); 900 + 901 + assert!(!reader.is_mmap_eligible(SegmentId::new(1))); 902 + assert!(reader.is_mmap_eligible(SegmentId::new(2))); 903 + } 904 + 905 + #[test] 906 + fn no_active_segment_mmaps_all() { 907 + let reader: EventLogReader<SimulatedIO> = 908 + EventLogReader::new(setup_manager(64 * 1024), true); 909 + 910 + assert!(reader.is_mmap_eligible(SegmentId::new(1))); 911 + assert!(reader.is_mmap_eligible(SegmentId::new(99))); 912 + } 913 + 914 + #[test] 915 + fn corrupt_index_offset_does_not_panic() { 916 + let mgr = setup_manager(64 * 1024); 917 + { 918 + let mut writer = 919 + EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 920 + (1..=5).for_each(|i| { 921 + writer 922 + .append( 923 + DidHash::from_did(&format!("did:plc:user{i}")), 924 + EventTypeTag::COMMIT, 925 + vec![0xAA; 50], 926 + ) 927 + .unwrap(); 928 + }); 929 + writer.shutdown().unwrap(); 930 + } 931 + mgr.shutdown(); 932 + 933 + let mut bad_index = SegmentIndex::new(); 934 + bad_index.record(EventSequence::new(1), SegmentOffset::new(999_999)); 935 + bad_index.record(EventSequence::new(5), SegmentOffset::new(999_999)); 936 + bad_index 937 + .save(mgr.io(), &mgr.index_path(SegmentId::new(1))) 938 + .unwrap(); 939 + 940 + let reader = EventLogReader::new(Arc::clone(&mgr), false); 941 + reader.refresh_segment_ranges().unwrap(); 942 + 943 + let events = reader 944 + .read_events_from(EventSequence::BEFORE_ALL, 100) 945 + .unwrap(); 946 + assert!(events.is_empty() || events.len() <= 5); 947 + 948 + let event = reader.read_event_at(EventSequence::new(3)).unwrap(); 949 + assert!(event.is_none() || event.is_some_and(|e| e.seq == EventSequence::new(3))); 950 + } 951 + 952 + #[test] 953 + fn corrupt_index_offset_mmap_does_not_panic() { 954 + let mgr = setup_manager(64 * 1024); 955 + { 956 + let mut writer = 957 + EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 958 + (1..=5).for_each(|i| { 959 + writer 960 + .append( 961 + DidHash::from_did(&format!("did:plc:user{i}")), 962 + EventTypeTag::COMMIT, 963 + vec![0xAA; 50], 964 + ) 965 + .unwrap(); 966 + }); 967 + writer.shutdown().unwrap(); 968 + } 969 + mgr.shutdown(); 970 + 971 + let mut bad_index = SegmentIndex::new(); 972 + bad_index.record(EventSequence::new(1), SegmentOffset::new(999_999)); 973 + bad_index.record(EventSequence::new(5), SegmentOffset::new(999_999)); 974 + bad_index 975 + .save(mgr.io(), &mgr.index_path(SegmentId::new(1))) 976 + .unwrap(); 977 + 978 + let reader = EventLogReader::new(Arc::clone(&mgr), true); 979 + reader.refresh_segment_ranges().unwrap(); 980 + 981 + let events = reader 982 + .read_events_from(EventSequence::BEFORE_ALL, 100) 983 + .unwrap(); 984 + assert!(events.is_empty() || events.len() <= 5); 985 + } 986 + 987 + #[test] 988 + fn on_segment_rotated_updates_state() { 989 + let (mgr, _direct_reader) = setup_multi_segment(3, 2, 50); 990 + 991 + let reader = EventLogReader::new(Arc::clone(&mgr), true); 992 + reader.refresh_segment_ranges().unwrap(); 993 + 994 + assert_eq!( 995 + reader.find_segment_for_seq(EventSequence::new(1)), 996 + 
Some(SegmentId::new(1)) 997 + ); 998 + 999 + reader 1000 + .on_segment_rotated(SegmentId::new(1), SegmentId::new(2)) 1001 + .unwrap(); 1002 + 1003 + assert!(!reader.is_mmap_eligible(SegmentId::new(2))); 1004 + assert!(reader.is_mmap_eligible(SegmentId::new(1))); 1005 + 1006 + let events = reader 1007 + .read_events_from(EventSequence::BEFORE_ALL, 100) 1008 + .unwrap(); 1009 + assert_eq!(events.len(), 6); 1010 + } 1011 + }
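
Note (not part of the diff): the sealed-segment mmap path above never trusts a record until its trailing checksum verifies. A minimal standalone sketch of that check, assuming `record` is exactly one framed record (25-byte header, payload, 4-byte checksum) and using the same xxh3-digest-truncated-to-u32 the writer stores; `verify_record` is an illustrative name, not an API in this crate:

fn verify_record(record: &[u8]) -> bool {
    const HEADER: usize = 25; // seq u64 + timestamp u64 + did_hash u32 + type u8 + payload_len u32
    if record.len() < HEADER + 4 {
        return false; // too short to hold even header + checksum
    }
    let payload_len = u32::from_le_bytes(record[21..25].try_into().unwrap()) as usize;
    if record.len() != HEADER + payload_len + 4 {
        return false; // framing does not match the declared payload length
    }
    let stored = u32::from_le_bytes(record[HEADER + payload_len..].try_into().unwrap());
    // One-shot xxh3 over header || payload equals the writer's streaming digest.
    let computed = xxhash_rust::xxh3::xxh3_64(&record[..HEADER + payload_len]) as u32;
    stored == computed
}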
+921
crates/tranquil-store/src/eventlog/segment_file.rs
··· 1 + use std::io; 2 + 3 + use crate::io::{FileId, StorageIO}; 4 + 5 + use super::types::{ 6 + DidHash, EventSequence, EventTypeTag, MAX_EVENT_PAYLOAD, SegmentId, SegmentOffset, 7 + TimestampMicros, 8 + }; 9 + 10 + pub const SEGMENT_MAGIC: [u8; 4] = *b"TQEV"; 11 + pub const SEGMENT_FORMAT_VERSION: u8 = 1; 12 + pub const SEGMENT_HEADER_SIZE: usize = 5; 13 + 14 + pub const EVENT_HEADER_SIZE: usize = 8 + 8 + 4 + 1 + 4; 15 + pub const EVENT_RECORD_OVERHEAD: usize = EVENT_HEADER_SIZE + 4; 16 + 17 + #[must_use] 18 + #[derive(Debug, Clone, PartialEq, Eq)] 19 + pub struct ValidEvent { 20 + pub seq: EventSequence, 21 + pub timestamp: TimestampMicros, 22 + pub did_hash: DidHash, 23 + pub event_type: EventTypeTag, 24 + pub payload: Vec<u8>, 25 + } 26 + 27 + fn event_record_checksum(header: &[u8; EVENT_HEADER_SIZE], payload: &[u8]) -> u32 { 28 + let mut hasher = xxhash_rust::xxh3::Xxh3::new(); 29 + hasher.update(header); 30 + hasher.update(payload); 31 + hasher.digest() as u32 32 + } 33 + 34 + fn encode_header(event: &ValidEvent, payload_len: u32) -> [u8; EVENT_HEADER_SIZE] { 35 + let mut header = [0u8; EVENT_HEADER_SIZE]; 36 + header[0..8].copy_from_slice(&event.seq.raw().to_le_bytes()); 37 + header[8..16].copy_from_slice(&event.timestamp.raw().to_le_bytes()); 38 + header[16..20].copy_from_slice(&event.did_hash.raw().to_le_bytes()); 39 + header[20] = event.event_type.raw(); 40 + header[21..25].copy_from_slice(&payload_len.to_le_bytes()); 41 + header 42 + } 43 + 44 + pub fn encode_event_record<S: StorageIO>( 45 + io: &S, 46 + fd: FileId, 47 + offset: SegmentOffset, 48 + event: &ValidEvent, 49 + ) -> io::Result<u64> { 50 + let payload_len = u32::try_from(event.payload.len()).map_err(|_| { 51 + io::Error::new( 52 + io::ErrorKind::InvalidInput, 53 + "event payload exceeds u32::MAX", 54 + ) 55 + })?; 56 + if payload_len > MAX_EVENT_PAYLOAD { 57 + return Err(io::Error::new( 58 + io::ErrorKind::InvalidInput, 59 + "event payload exceeds MAX_EVENT_PAYLOAD", 60 + )); 61 + } 62 + 63 + let header = encode_header(event, payload_len); 64 + let checksum = event_record_checksum(&header, &event.payload); 65 + let record_size = EVENT_RECORD_OVERHEAD as u64 + u64::from(payload_len); 66 + 67 + let base = offset.raw(); 68 + io.write_all_at(fd, base, &header)?; 69 + io.write_all_at(fd, base + EVENT_HEADER_SIZE as u64, &event.payload)?; 70 + io.write_all_at( 71 + fd, 72 + base + EVENT_HEADER_SIZE as u64 + u64::from(payload_len), 73 + &checksum.to_le_bytes(), 74 + )?; 75 + 76 + Ok(record_size) 77 + } 78 + 79 + #[must_use] 80 + #[derive(Debug)] 81 + pub enum ReadEventRecord { 82 + Valid { 83 + event: ValidEvent, 84 + next_offset: SegmentOffset, 85 + }, 86 + Corrupted { 87 + offset: SegmentOffset, 88 + }, 89 + Truncated { 90 + offset: SegmentOffset, 91 + }, 92 + } 93 + 94 + pub fn decode_event_record<S: StorageIO>( 95 + io: &S, 96 + fd: FileId, 97 + offset: SegmentOffset, 98 + file_size: u64, 99 + ) -> io::Result<Option<ReadEventRecord>> { 100 + let raw = offset.raw(); 101 + if raw > file_size { 102 + return Ok(Some(ReadEventRecord::Corrupted { offset })); 103 + } 104 + let remaining = file_size - raw; 105 + if remaining == 0 { 106 + return Ok(None); 107 + } 108 + 109 + if remaining < EVENT_HEADER_SIZE as u64 { 110 + return Ok(Some(ReadEventRecord::Truncated { offset })); 111 + } 112 + 113 + let mut header = [0u8; EVENT_HEADER_SIZE]; 114 + io.read_exact_at(fd, raw, &mut header)?; 115 + 116 + let seq_raw = u64::from_le_bytes(header[0..8].try_into().unwrap()); 117 + if seq_raw == 0 { 118 + return 
Ok(Some(ReadEventRecord::Corrupted { offset })); 119 + } 120 + let seq = EventSequence::new(seq_raw); 121 + 122 + let timestamp = TimestampMicros::new(u64::from_le_bytes(header[8..16].try_into().unwrap())); 123 + let did_hash = DidHash::from_raw(u32::from_le_bytes(header[16..20].try_into().unwrap())); 124 + let event_type_raw = header[20]; 125 + let event_type = match EventTypeTag::from_raw(event_type_raw) { 126 + Some(t) => t, 127 + None => return Ok(Some(ReadEventRecord::Corrupted { offset })), 128 + }; 129 + 130 + let payload_len = u32::from_le_bytes(header[21..25].try_into().unwrap()); 131 + if payload_len > MAX_EVENT_PAYLOAD { 132 + return Ok(Some(ReadEventRecord::Corrupted { offset })); 133 + } 134 + 135 + let record_size = EVENT_RECORD_OVERHEAD as u64 + u64::from(payload_len); 136 + if record_size > remaining { 137 + return Ok(Some(ReadEventRecord::Truncated { offset })); 138 + } 139 + 140 + let payload_offset = raw + EVENT_HEADER_SIZE as u64; 141 + let mut payload = vec![0u8; usize::try_from(payload_len).expect("payload_len fits usize")]; 142 + io.read_exact_at(fd, payload_offset, &mut payload)?; 143 + 144 + let mut checksum_bytes = [0u8; 4]; 145 + io.read_exact_at( 146 + fd, 147 + payload_offset + u64::from(payload_len), 148 + &mut checksum_bytes, 149 + )?; 150 + 151 + let stored_checksum = u32::from_le_bytes(checksum_bytes); 152 + let computed_checksum = event_record_checksum(&header, &payload); 153 + 154 + if stored_checksum != computed_checksum { 155 + return Ok(Some(ReadEventRecord::Corrupted { offset })); 156 + } 157 + 158 + let next_offset = offset.advance(record_size); 159 + 160 + Ok(Some(ReadEventRecord::Valid { 161 + event: ValidEvent { 162 + seq, 163 + timestamp, 164 + did_hash, 165 + event_type, 166 + payload, 167 + }, 168 + next_offset, 169 + })) 170 + } 171 + 172 + #[derive(Debug)] 173 + pub enum ValidateEventRecord { 174 + Valid { 175 + seq: EventSequence, 176 + next_offset: SegmentOffset, 177 + }, 178 + Corrupted, 179 + Truncated, 180 + } 181 + 182 + const CHECKSUM_CHUNK_SIZE: usize = 8 * 1024; 183 + 184 + pub fn validate_event_record<S: StorageIO>( 185 + io: &S, 186 + fd: FileId, 187 + offset: SegmentOffset, 188 + file_size: u64, 189 + ) -> io::Result<Option<ValidateEventRecord>> { 190 + let raw = offset.raw(); 191 + assert!( 192 + raw <= file_size, 193 + "validate offset {raw} past file size {file_size}" 194 + ); 195 + let remaining = file_size - raw; 196 + if remaining == 0 { 197 + return Ok(None); 198 + } 199 + 200 + if remaining < EVENT_HEADER_SIZE as u64 { 201 + return Ok(Some(ValidateEventRecord::Truncated)); 202 + } 203 + 204 + let mut header = [0u8; EVENT_HEADER_SIZE]; 205 + io.read_exact_at(fd, raw, &mut header)?; 206 + 207 + let seq_raw = u64::from_le_bytes(header[0..8].try_into().unwrap()); 208 + if seq_raw == 0 { 209 + return Ok(Some(ValidateEventRecord::Corrupted)); 210 + } 211 + let seq = EventSequence::new(seq_raw); 212 + 213 + let event_type_raw = header[20]; 214 + if EventTypeTag::from_raw(event_type_raw).is_none() { 215 + return Ok(Some(ValidateEventRecord::Corrupted)); 216 + } 217 + 218 + let payload_len = u32::from_le_bytes(header[21..25].try_into().unwrap()); 219 + if payload_len > MAX_EVENT_PAYLOAD { 220 + return Ok(Some(ValidateEventRecord::Corrupted)); 221 + } 222 + 223 + let record_size = EVENT_RECORD_OVERHEAD as u64 + u64::from(payload_len); 224 + if record_size > remaining { 225 + return Ok(Some(ValidateEventRecord::Truncated)); 226 + } 227 + 228 + let payload_offset = raw + EVENT_HEADER_SIZE as u64; 229 + 230 + let mut hasher = 
xxhash_rust::xxh3::Xxh3::new(); 231 + hasher.update(&header); 232 + let mut chunk = [0u8; CHECKSUM_CHUNK_SIZE]; 233 + (0..u64::from(payload_len)) 234 + .step_by(CHECKSUM_CHUNK_SIZE) 235 + .map(|chunk_start| { 236 + let to_read = 237 + ((u64::from(payload_len) - chunk_start) as usize).min(CHECKSUM_CHUNK_SIZE); 238 + (payload_offset + chunk_start, to_read) 239 + }) 240 + .try_for_each(|(pos, to_read)| { 241 + io.read_exact_at(fd, pos, &mut chunk[..to_read])?; 242 + hasher.update(&chunk[..to_read]); 243 + Ok::<_, io::Error>(()) 244 + })?; 245 + let computed_checksum = hasher.digest() as u32; 246 + 247 + let mut checksum_bytes = [0u8; 4]; 248 + io.read_exact_at( 249 + fd, 250 + payload_offset + u64::from(payload_len), 251 + &mut checksum_bytes, 252 + )?; 253 + let stored_checksum = u32::from_le_bytes(checksum_bytes); 254 + 255 + if stored_checksum != computed_checksum { 256 + return Ok(Some(ValidateEventRecord::Corrupted)); 257 + } 258 + 259 + let next_offset = offset.advance(record_size); 260 + Ok(Some(ValidateEventRecord::Valid { seq, next_offset })) 261 + } 262 + 263 + pub struct SegmentWriter { 264 + fd: FileId, 265 + segment_id: SegmentId, 266 + position: SegmentOffset, 267 + base_seq: EventSequence, 268 + last_seq: Option<EventSequence>, 269 + } 270 + 271 + impl SegmentWriter { 272 + pub fn new<S: StorageIO>( 273 + io: &S, 274 + fd: FileId, 275 + segment_id: SegmentId, 276 + base_seq: EventSequence, 277 + ) -> io::Result<Self> { 278 + let mut header = [0u8; SEGMENT_HEADER_SIZE]; 279 + header[..4].copy_from_slice(&SEGMENT_MAGIC); 280 + header[4] = SEGMENT_FORMAT_VERSION; 281 + io.write_all_at(fd, 0, &header)?; 282 + Ok(Self { 283 + fd, 284 + segment_id, 285 + position: SegmentOffset::new(SEGMENT_HEADER_SIZE as u64), 286 + base_seq, 287 + last_seq: None, 288 + }) 289 + } 290 + 291 + pub fn resume<S: StorageIO>( 292 + io: &S, 293 + fd: FileId, 294 + segment_id: SegmentId, 295 + position: SegmentOffset, 296 + base_seq: EventSequence, 297 + last_seq: Option<EventSequence>, 298 + ) -> Self { 299 + assert!( 300 + position.raw() >= SEGMENT_HEADER_SIZE as u64, 301 + "resume position {position:?} is before header end" 302 + ); 303 + #[cfg(debug_assertions)] 304 + { 305 + let mut magic = [0u8; 4]; 306 + io.read_exact_at(fd, 0, &mut magic) 307 + .expect("resume: failed to read segment header"); 308 + assert_eq!(magic, SEGMENT_MAGIC, "resume: bad segment magic"); 309 + } 310 + #[cfg(not(debug_assertions))] 311 + let _ = io; 312 + Self { 313 + fd, 314 + segment_id, 315 + position, 316 + base_seq, 317 + last_seq, 318 + } 319 + } 320 + 321 + pub fn append_event<S: StorageIO>( 322 + &mut self, 323 + io: &S, 324 + event: &ValidEvent, 325 + ) -> io::Result<SegmentOffset> { 326 + assert!( 327 + self.last_seq.is_none_or(|prev| event.seq > prev), 328 + "non-monotonic sequence: {} after {}", 329 + event.seq, 330 + self.last_seq.unwrap() 331 + ); 332 + let record_offset = self.position; 333 + let bytes_written = encode_event_record(io, self.fd, record_offset, event)?; 334 + self.position = self.position.advance(bytes_written); 335 + self.last_seq = Some(event.seq); 336 + Ok(record_offset) 337 + } 338 + 339 + pub fn sync<S: StorageIO>(&self, io: &S) -> io::Result<()> { 340 + io.sync(self.fd) 341 + } 342 + 343 + pub fn position(&self) -> SegmentOffset { 344 + self.position 345 + } 346 + 347 + pub fn segment_id(&self) -> SegmentId { 348 + self.segment_id 349 + } 350 + 351 + pub fn base_seq(&self) -> EventSequence { 352 + self.base_seq 353 + } 354 + 355 + pub fn fd(&self) -> FileId { 356 + self.fd 357 + } 358 + } 
359 + 360 + pub struct SegmentReader<'a, S: StorageIO> { 361 + io: &'a S, 362 + fd: FileId, 363 + position: SegmentOffset, 364 + file_size: u64, 365 + } 366 + 367 + impl<'a, S: StorageIO> SegmentReader<'a, S> { 368 + pub fn open(io: &'a S, fd: FileId) -> io::Result<Self> { 369 + let file_size = io.file_size(fd)?; 370 + if file_size < SEGMENT_HEADER_SIZE as u64 { 371 + return Err(io::Error::new( 372 + io::ErrorKind::InvalidData, 373 + "file too small for segment header", 374 + )); 375 + } 376 + 377 + let mut header = [0u8; SEGMENT_HEADER_SIZE]; 378 + io.read_exact_at(fd, 0, &mut header)?; 379 + 380 + if header[..SEGMENT_MAGIC.len()] != SEGMENT_MAGIC { 381 + return Err(io::Error::new( 382 + io::ErrorKind::InvalidData, 383 + "bad segment magic", 384 + )); 385 + } 386 + if header[SEGMENT_MAGIC.len()] != SEGMENT_FORMAT_VERSION { 387 + return Err(io::Error::new( 388 + io::ErrorKind::InvalidData, 389 + "unsupported segment format version", 390 + )); 391 + } 392 + 393 + Ok(Self { 394 + io, 395 + fd, 396 + position: SegmentOffset::new(SEGMENT_HEADER_SIZE as u64), 397 + file_size, 398 + }) 399 + } 400 + 401 + pub fn valid_prefix(self) -> io::Result<Vec<ValidEvent>> { 402 + self.map(|result| { 403 + result.map(|record| match record { 404 + ReadEventRecord::Valid { event, .. } => Some(event), 405 + ReadEventRecord::Corrupted { .. } | ReadEventRecord::Truncated { .. } => None, 406 + }) 407 + }) 408 + .scan((), |(), result| match result { 409 + Err(e) => Some(Err(e)), 410 + Ok(Some(event)) => Some(Ok(event)), 411 + Ok(None) => None, 412 + }) 413 + .collect() 414 + } 415 + 416 + pub fn fd(&self) -> FileId { 417 + self.fd 418 + } 419 + 420 + pub fn position(&self) -> SegmentOffset { 421 + self.position 422 + } 423 + 424 + pub fn file_size(&self) -> u64 { 425 + self.file_size 426 + } 427 + } 428 + 429 + impl<S: StorageIO> Iterator for SegmentReader<'_, S> { 430 + type Item = io::Result<ReadEventRecord>; 431 + 432 + fn next(&mut self) -> Option<Self::Item> { 433 + match decode_event_record(self.io, self.fd, self.position, self.file_size) { 434 + Err(e) => { 435 + self.position = SegmentOffset::new(self.file_size); 436 + Some(Err(e)) 437 + } 438 + Ok(None) => None, 439 + Ok(Some(record)) => { 440 + match &record { 441 + ReadEventRecord::Valid { next_offset, .. } => { 442 + self.position = *next_offset; 443 + } 444 + ReadEventRecord::Corrupted { .. } | ReadEventRecord::Truncated { .. 
} => { 445 + self.position = SegmentOffset::new(self.file_size); 446 + } 447 + } 448 + Some(Ok(record)) 449 + } 450 + } 451 + } 452 + } 453 + 454 + #[cfg(test)] 455 + mod tests { 456 + use super::*; 457 + use crate::OpenOptions; 458 + use crate::sim::SimulatedIO; 459 + use proptest::prelude::*; 460 + use std::path::Path; 461 + 462 + fn setup() -> (SimulatedIO, FileId) { 463 + let sim = SimulatedIO::pristine(42); 464 + let dir = Path::new("/test"); 465 + sim.mkdir(dir).unwrap(); 466 + sim.sync_dir(dir).unwrap(); 467 + let fd = sim 468 + .open(Path::new("/test/segment.tqe"), OpenOptions::read_write()) 469 + .unwrap(); 470 + (sim, fd) 471 + } 472 + 473 + fn test_did_hash(seed: u8) -> DidHash { 474 + DidHash::from_did(&format!("did:plc:test{seed}")) 475 + } 476 + 477 + fn test_event(seq: u64, payload: &[u8]) -> ValidEvent { 478 + ValidEvent { 479 + seq: EventSequence::new(seq), 480 + timestamp: TimestampMicros::new(seq * 1_000_000), 481 + did_hash: test_did_hash(seq as u8), 482 + event_type: EventTypeTag::COMMIT, 483 + payload: payload.to_vec(), 484 + } 485 + } 486 + 487 + #[test] 488 + fn write_and_read_single_event() { 489 + let (sim, fd) = setup(); 490 + let mut writer = 491 + SegmentWriter::new(&sim, fd, SegmentId::new(1), EventSequence::new(1)).unwrap(); 492 + 493 + let event = test_event(1, b"test event payload"); 494 + let offset = writer.append_event(&sim, &event).unwrap(); 495 + writer.sync(&sim).unwrap(); 496 + 497 + assert_eq!(offset, SegmentOffset::new(SEGMENT_HEADER_SIZE as u64)); 498 + 499 + let reader = SegmentReader::open(&sim, fd).unwrap(); 500 + let events = reader.valid_prefix().unwrap(); 501 + assert_eq!(events.len(), 1); 502 + assert_eq!(events[0], event); 503 + } 504 + 505 + #[test] 506 + fn write_and_read_multiple_events() { 507 + let (sim, fd) = setup(); 508 + let mut writer = 509 + SegmentWriter::new(&sim, fd, SegmentId::new(1), EventSequence::new(1)).unwrap(); 510 + 511 + let written: Vec<ValidEvent> = (1u64..=3) 512 + .map(|i| { 513 + let event = test_event(i, format!("event {i}").as_bytes()); 514 + writer.append_event(&sim, &event).unwrap(); 515 + event 516 + }) 517 + .collect(); 518 + writer.sync(&sim).unwrap(); 519 + 520 + let reader = SegmentReader::open(&sim, fd).unwrap(); 521 + let events = reader.valid_prefix().unwrap(); 522 + assert_eq!(events, written); 523 + } 524 + 525 + #[test] 526 + fn empty_segment_has_no_events() { 527 + let (sim, fd) = setup(); 528 + SegmentWriter::new(&sim, fd, SegmentId::new(1), EventSequence::new(1)).unwrap(); 529 + 530 + let reader = SegmentReader::open(&sim, fd).unwrap(); 531 + let events = reader.valid_prefix().unwrap(); 532 + assert!(events.is_empty()); 533 + } 534 + 535 + #[test] 536 + fn detects_truncated_event() { 537 + let (sim, fd) = setup(); 538 + let mut writer = 539 + SegmentWriter::new(&sim, fd, SegmentId::new(1), EventSequence::new(1)).unwrap(); 540 + writer 541 + .append_event(&sim, &test_event(1, b"complete event")) 542 + .unwrap(); 543 + writer.sync(&sim).unwrap(); 544 + 545 + sim.write_all_at(fd, writer.position().raw(), &[1, 2, 3, 4, 5]) 546 + .unwrap(); 547 + sim.sync(fd).unwrap(); 548 + 549 + let mut reader = SegmentReader::open(&sim, fd).unwrap(); 550 + let first = reader.next().unwrap().unwrap(); 551 + assert!(matches!(first, ReadEventRecord::Valid { .. })); 552 + 553 + let second = reader.next().unwrap().unwrap(); 554 + assert!(matches!(second, ReadEventRecord::Truncated { .. 
})); 555 + } 556 + 557 + #[test] 558 + fn checksum_detects_corruption() { 559 + let (sim, fd) = setup(); 560 + let mut writer = 561 + SegmentWriter::new(&sim, fd, SegmentId::new(1), EventSequence::new(1)).unwrap(); 562 + writer 563 + .append_event(&sim, &test_event(1, &vec![0xAA; 256])) 564 + .unwrap(); 565 + writer.sync(&sim).unwrap(); 566 + 567 + let corrupt_offset = SEGMENT_HEADER_SIZE as u64 + EVENT_HEADER_SIZE as u64 + 128; 568 + sim.write_all_at(fd, corrupt_offset, &[0x00]).unwrap(); 569 + 570 + let mut reader = SegmentReader::open(&sim, fd).unwrap(); 571 + let record = reader.next().unwrap().unwrap(); 572 + assert!(matches!(record, ReadEventRecord::Corrupted { .. })); 573 + } 574 + 575 + #[test] 576 + fn crash_before_sync_loses_events() { 577 + let (sim, fd) = setup(); 578 + let mut writer = 579 + SegmentWriter::new(&sim, fd, SegmentId::new(1), EventSequence::new(1)).unwrap(); 580 + writer 581 + .append_event(&sim, &test_event(1, b"synced")) 582 + .unwrap(); 583 + writer.sync(&sim).unwrap(); 584 + sim.sync_dir(Path::new("/test")).unwrap(); 585 + 586 + writer 587 + .append_event(&sim, &test_event(2, b"not synced")) 588 + .unwrap(); 589 + 590 + sim.crash(); 591 + 592 + let fd = sim 593 + .open(Path::new("/test/segment.tqe"), OpenOptions::read()) 594 + .unwrap(); 595 + let reader = SegmentReader::open(&sim, fd).unwrap(); 596 + let events = reader.valid_prefix().unwrap(); 597 + assert_eq!(events.len(), 1); 598 + assert_eq!(events[0].payload, b"synced"); 599 + } 600 + 601 + #[test] 602 + fn rejects_oversized_payload() { 603 + let (sim, fd) = setup(); 604 + let mut writer = 605 + SegmentWriter::new(&sim, fd, SegmentId::new(1), EventSequence::new(1)).unwrap(); 606 + let result = writer.append_event( 607 + &sim, 608 + &test_event(1, &vec![0u8; MAX_EVENT_PAYLOAD as usize + 1]), 609 + ); 610 + assert!(result.is_err()); 611 + } 612 + 613 + #[test] 614 + fn zero_length_payload_round_trips() { 615 + let (sim, fd) = setup(); 616 + let mut writer = 617 + SegmentWriter::new(&sim, fd, SegmentId::new(1), EventSequence::new(1)).unwrap(); 618 + let event = ValidEvent { 619 + seq: EventSequence::new(1), 620 + timestamp: TimestampMicros::new(1_000_000), 621 + did_hash: test_did_hash(1), 622 + event_type: EventTypeTag::IDENTITY, 623 + payload: vec![], 624 + }; 625 + writer.append_event(&sim, &event).unwrap(); 626 + writer.sync(&sim).unwrap(); 627 + 628 + let reader = SegmentReader::open(&sim, fd).unwrap(); 629 + let events = reader.valid_prefix().unwrap(); 630 + assert_eq!(events, vec![event]); 631 + } 632 + 633 + #[test] 634 + fn accepts_exact_max_payload() { 635 + let (sim, fd) = setup(); 636 + let mut writer = 637 + SegmentWriter::new(&sim, fd, SegmentId::new(1), EventSequence::new(1)).unwrap(); 638 + let result = writer.append_event( 639 + &sim, 640 + &test_event(1, &vec![0xBB; MAX_EVENT_PAYLOAD as usize]), 641 + ); 642 + assert!(result.is_ok()); 643 + } 644 + 645 + #[test] 646 + fn bad_magic_rejected() { 647 + let sim = SimulatedIO::pristine(42); 648 + let dir = Path::new("/test"); 649 + sim.mkdir(dir).unwrap(); 650 + sim.sync_dir(dir).unwrap(); 651 + let fd = sim 652 + .open(Path::new("/test/bad.tqe"), OpenOptions::read_write()) 653 + .unwrap(); 654 + sim.write_all_at(fd, 0, b"NOPE\x01").unwrap(); 655 + 656 + let result = SegmentReader::open(&sim, fd); 657 + assert!(result.is_err()); 658 + } 659 + 660 + #[test] 661 + fn encode_decode_round_trip_at_offset() { 662 + let (sim, fd) = setup(); 663 + 664 + sim.write_all_at(fd, 0, &[0u8; 100]).unwrap(); 665 + 666 + let offset = SegmentOffset::new(100); 667 
+ let event = ValidEvent { 668 + seq: EventSequence::new(42), 669 + timestamp: TimestampMicros::new(9_999_999), 670 + did_hash: test_did_hash(7), 671 + event_type: EventTypeTag::ACCOUNT, 672 + payload: b"round trip test data".to_vec(), 673 + }; 674 + let bytes_written = encode_event_record(&sim, fd, offset, &event).unwrap(); 675 + let expected_size = EVENT_RECORD_OVERHEAD as u64 + event.payload.len() as u64; 676 + assert_eq!(bytes_written, expected_size); 677 + 678 + let file_size = sim.file_size(fd).unwrap(); 679 + let record = decode_event_record(&sim, fd, offset, file_size) 680 + .unwrap() 681 + .unwrap(); 682 + match record { 683 + ReadEventRecord::Valid { event: decoded, .. } => assert_eq!(decoded, event), 684 + other => panic!("expected Valid, got {other:?}"), 685 + } 686 + } 687 + 688 + #[test] 689 + fn resume_writer_continues_at_position() { 690 + let (sim, fd) = setup(); 691 + let mut writer = 692 + SegmentWriter::new(&sim, fd, SegmentId::new(1), EventSequence::new(1)).unwrap(); 693 + writer.append_event(&sim, &test_event(1, b"first")).unwrap(); 694 + writer.sync(&sim).unwrap(); 695 + 696 + let resume_pos = writer.position(); 697 + let mut writer2 = SegmentWriter::resume( 698 + &sim, 699 + fd, 700 + SegmentId::new(1), 701 + resume_pos, 702 + EventSequence::new(1), 703 + Some(EventSequence::new(1)), 704 + ); 705 + writer2 706 + .append_event(&sim, &test_event(2, b"second")) 707 + .unwrap(); 708 + writer2.sync(&sim).unwrap(); 709 + 710 + let reader = SegmentReader::open(&sim, fd).unwrap(); 711 + let events = reader.valid_prefix().unwrap(); 712 + assert_eq!(events.len(), 2); 713 + assert_eq!(events[0].payload, b"first"); 714 + assert_eq!(events[1].payload, b"second"); 715 + } 716 + 717 + #[test] 718 + fn all_event_types_round_trip() { 719 + let (sim, fd) = setup(); 720 + let mut writer = 721 + SegmentWriter::new(&sim, fd, SegmentId::new(1), EventSequence::new(1)).unwrap(); 722 + 723 + let types = [ 724 + EventTypeTag::COMMIT, 725 + EventTypeTag::IDENTITY, 726 + EventTypeTag::ACCOUNT, 727 + EventTypeTag::SYNC, 728 + ]; 729 + 730 + types.iter().enumerate().for_each(|(i, &event_type)| { 731 + let event = ValidEvent { 732 + seq: EventSequence::new((i + 1) as u64), 733 + timestamp: TimestampMicros::new(1_000_000), 734 + did_hash: test_did_hash(i as u8), 735 + event_type, 736 + payload: b"payload".to_vec(), 737 + }; 738 + writer.append_event(&sim, &event).unwrap(); 739 + }); 740 + writer.sync(&sim).unwrap(); 741 + 742 + let reader = SegmentReader::open(&sim, fd).unwrap(); 743 + let events = reader.valid_prefix().unwrap(); 744 + assert_eq!(events.len(), 4); 745 + events 746 + .iter() 747 + .zip(types.iter()) 748 + .for_each(|(event, &expected_type)| { 749 + assert_eq!(event.event_type, expected_type); 750 + }); 751 + } 752 + 753 + #[test] 754 + fn seq_zero_detected_as_corrupted() { 755 + let (sim, fd) = setup(); 756 + SegmentWriter::new(&sim, fd, SegmentId::new(1), EventSequence::new(1)).unwrap(); 757 + 758 + let mut raw_header = [0u8; EVENT_HEADER_SIZE]; 759 + raw_header[0..8].copy_from_slice(&0u64.to_le_bytes()); 760 + raw_header[8..16].copy_from_slice(&1_000_000u64.to_le_bytes()); 761 + raw_header[16..20].copy_from_slice(&test_did_hash(1).raw().to_le_bytes()); 762 + raw_header[20] = EventTypeTag::COMMIT.raw(); 763 + raw_header[21..25].copy_from_slice(&5u32.to_le_bytes()); 764 + 765 + sim.write_all_at(fd, SEGMENT_HEADER_SIZE as u64, &raw_header) 766 + .unwrap(); 767 + sim.write_all_at( 768 + fd, 769 + SEGMENT_HEADER_SIZE as u64 + EVENT_HEADER_SIZE as u64, 770 + b"hello", 771 + ) 772 + 
.unwrap(); 773 + sim.write_all_at( 774 + fd, 775 + SEGMENT_HEADER_SIZE as u64 + EVENT_HEADER_SIZE as u64 + 5, 776 + &[0u8; 4], 777 + ) 778 + .unwrap(); 779 + 780 + let mut reader = SegmentReader::open(&sim, fd).unwrap(); 781 + let record = reader.next().unwrap().unwrap(); 782 + assert!(matches!(record, ReadEventRecord::Corrupted { .. })); 783 + } 784 + 785 + #[test] 786 + fn writer_accessors() { 787 + let (sim, fd) = setup(); 788 + let writer = 789 + SegmentWriter::new(&sim, fd, SegmentId::new(7), EventSequence::new(100)).unwrap(); 790 + assert_eq!(writer.segment_id(), SegmentId::new(7)); 791 + assert_eq!(writer.base_seq(), EventSequence::new(100)); 792 + assert_eq!( 793 + writer.position(), 794 + SegmentOffset::new(SEGMENT_HEADER_SIZE as u64) 795 + ); 796 + assert_eq!(writer.fd(), fd); 797 + } 798 + 799 + fn run_crash_recovery_seed(seed: u64) { 800 + let sim = SimulatedIO::new(seed, crate::FaultConfig::aggressive()); 801 + let dir = Path::new("/data"); 802 + let _ = sim.mkdir(dir); 803 + let _ = sim.sync_dir(dir); 804 + 805 + let written_count = 806 + if let Ok(fd) = sim.open(Path::new("/data/segment.tqe"), OpenOptions::read_write()) { 807 + if let Ok(mut writer) = 808 + SegmentWriter::new(&sim, fd, SegmentId::new(1), EventSequence::new(1)) 809 + { 810 + let count = (1u64..=20).fold(0u64, |count, i| { 811 + let event = ValidEvent { 812 + seq: EventSequence::new(i), 813 + timestamp: TimestampMicros::new(i * 1_000_000), 814 + did_hash: DidHash::from_did(&format!("did:plc:user{i}")), 815 + event_type: EventTypeTag::COMMIT, 816 + payload: vec![i as u8; ((i as usize) + 1) * 10], 817 + }; 818 + match writer.append_event(&sim, &event) { 819 + Ok(_) => count + 1, 820 + Err(_) => count, 821 + } 822 + }); 823 + let _ = writer.sync(&sim); 824 + count 825 + } else { 826 + 0 827 + } 828 + } else { 829 + 0 830 + }; 831 + let _ = sim.sync_dir(dir); 832 + 833 + sim.crash(); 834 + 835 + if let Ok(fd) = sim.open(Path::new("/data/segment.tqe"), OpenOptions::read()) 836 + && let Ok(reader) = SegmentReader::open(&sim, fd) 837 + { 838 + let recovered: Vec<_> = reader 839 + .map_while(|r| match r { 840 + Ok(ReadEventRecord::Valid { event, .. }) => Some(event), 841 + _ => None, 842 + }) 843 + .collect(); 844 + 845 + assert!( 846 + recovered.len() as u64 <= written_count, 847 + "recovered {} events but only wrote {written_count}", 848 + recovered.len() 849 + ); 850 + 851 + recovered.windows(2).enumerate().for_each(|(i, pair)| { 852 + assert!( 853 + pair[0].seq < pair[1].seq, 854 + "event {i} seq {} not less than event {} seq {}", 855 + pair[0].seq, 856 + i + 1, 857 + pair[1].seq, 858 + ); 859 + }); 860 + } 861 + } 862 + 863 + proptest! 
{ 864 + #![proptest_config(ProptestConfig::with_cases(2000))] 865 + 866 + #[test] 867 + fn sim_crash_recovery_aggressive_faults(seed in 0u64..u64::MAX) { 868 + run_crash_recovery_seed(seed); 869 + } 870 + } 871 + 872 + fn run_bit_flip_detection_seed(seed: u64) { 873 + let sim = SimulatedIO::pristine(seed); 874 + let dir = Path::new("/data"); 875 + sim.mkdir(dir).unwrap(); 876 + sim.sync_dir(dir).unwrap(); 877 + 878 + let fd = sim 879 + .open(Path::new("/data/segment.tqe"), OpenOptions::read_write()) 880 + .unwrap(); 881 + let mut writer = 882 + SegmentWriter::new(&sim, fd, SegmentId::new(1), EventSequence::new(1)).unwrap(); 883 + 884 + let data_len = ((seed % 256) as usize).max(1); 885 + let event = ValidEvent { 886 + seq: EventSequence::new(1), 887 + timestamp: TimestampMicros::new(1_000_000), 888 + did_hash: DidHash::from_did("did:plc:bitflip"), 889 + event_type: EventTypeTag::COMMIT, 890 + payload: vec![0xAA; data_len], 891 + }; 892 + writer.append_event(&sim, &event).unwrap(); 893 + writer.sync(&sim).unwrap(); 894 + 895 + let record_start = SEGMENT_HEADER_SIZE as u64; 896 + let record_end = record_start + EVENT_RECORD_OVERHEAD as u64 + data_len as u64; 897 + let flip_pos = record_start + (seed.wrapping_mul(7) % (record_end - record_start)); 898 + let flip_bit = (seed.wrapping_mul(13) % 8) as u8; 899 + 900 + let mut byte_buf = [0u8; 1]; 901 + sim.read_exact_at(fd, flip_pos, &mut byte_buf).unwrap(); 902 + byte_buf[0] ^= 1 << flip_bit; 903 + sim.write_all_at(fd, flip_pos, &byte_buf).unwrap(); 904 + 905 + let mut reader = SegmentReader::open(&sim, fd).unwrap(); 906 + let record = reader.next().unwrap().unwrap(); 907 + assert!( 908 + !matches!(record, ReadEventRecord::Valid { .. }), 909 + "bit flip at offset {flip_pos} bit {flip_bit} was not detected" 910 + ); 911 + } 912 + 913 + proptest! { 914 + #![proptest_config(ProptestConfig::with_cases(2000))] 915 + 916 + #[test] 917 + fn sim_bit_flip_detected_by_checksum(seed in 0u64..u64::MAX) { 918 + run_bit_flip_detection_seed(seed); 919 + } 920 + } 921 + }
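
For orientation, the framing segment_file.rs encodes (all integers little-endian) can be summarized as below; `record_size` is an illustrative helper whose constants mirror SEGMENT_HEADER_SIZE and EVENT_RECORD_OVERHEAD above:

// segment header: b"TQEV" magic (4 bytes) + format version (1 byte) = 5 bytes
// event record  : seq u64 | timestamp_micros u64 | did_hash u32
//                 | event_type u8 | payload_len u32   -> 25-byte header,
//                 followed by the payload bytes, then a 4-byte xxh3 checksum
fn record_size(payload_len: u32) -> u64 {
    const EVENT_RECORD_OVERHEAD: u64 = 25 + 4; // header + checksum
    EVENT_RECORD_OVERHEAD + u64::from(payload_len)
}
// The offset of record n+1 is offset(n) + record_size(payload_len(n)), which is
// exactly the SegmentOffset::advance step the writer and readers both take.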
+666
crates/tranquil-store/src/eventlog/segment_index.rs
··· 1 + use std::cell::Cell; 2 + use std::io; 3 + use std::path::Path; 4 + 5 + use serde::{Deserialize, Serialize}; 6 + 7 + use crate::io::{FileId, OpenOptions, StorageIO}; 8 + use crate::record::{RecordReader, RecordWriter}; 9 + 10 + use super::segment_file::{ 11 + SEGMENT_HEADER_SIZE, SEGMENT_MAGIC, ValidateEventRecord, validate_event_record, 12 + }; 13 + use super::types::{EventSequence, SegmentOffset}; 14 + 15 + pub const DEFAULT_INDEX_INTERVAL: usize = 256; 16 + const MAX_INDEX_ENTRIES: usize = 4 * 1024 * 1024; 17 + 18 + #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] 19 + struct IndexEntry { 20 + seq: EventSequence, 21 + offset: SegmentOffset, 22 + } 23 + 24 + #[derive(Debug, Clone, PartialEq, Eq)] 25 + pub struct SegmentIndex { 26 + entries: Vec<IndexEntry>, 27 + } 28 + 29 + impl SegmentIndex { 30 + pub fn new() -> Self { 31 + Self { 32 + entries: Vec::new(), 33 + } 34 + } 35 + 36 + pub fn record(&mut self, seq: EventSequence, offset: SegmentOffset) { 37 + debug_assert!( 38 + self.entries.last().is_none_or(|last| seq > last.seq), 39 + "index entries must be monotonically increasing" 40 + ); 41 + self.entries.push(IndexEntry { seq, offset }); 42 + } 43 + 44 + pub fn lookup(&self, target_seq: EventSequence) -> Option<SegmentOffset> { 45 + let idx = self.entries.partition_point(|e| e.seq <= target_seq); 46 + match idx { 47 + 0 => None, 48 + i => Some(self.entries[i - 1].offset), 49 + } 50 + } 51 + 52 + pub fn first_seq(&self) -> Option<EventSequence> { 53 + self.entries.first().map(|e| e.seq) 54 + } 55 + 56 + pub fn last_seq(&self) -> Option<EventSequence> { 57 + self.entries.last().map(|e| e.seq) 58 + } 59 + 60 + pub fn entry_count(&self) -> usize { 61 + self.entries.len() 62 + } 63 + 64 + pub fn save<S: StorageIO>(&self, io: &S, path: &Path) -> io::Result<()> { 65 + let tmp_path = path.with_extension("tqi.tmp"); 66 + let fd = io.open(&tmp_path, OpenOptions::read_write())?; 67 + 68 + let result = (|| { 69 + let serialized = postcard::to_allocvec(&self.entries) 70 + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; 71 + 72 + let mut writer = RecordWriter::new(io, fd)?; 73 + writer.append(&serialized)?; 74 + io.truncate(fd, writer.position())?; 75 + writer.sync()?; 76 + Ok(()) 77 + })(); 78 + 79 + if let Err(e) = result { 80 + let _ = io.close(fd); 81 + return Err(e); 82 + } 83 + io.close(fd)?; 84 + 85 + io.rename(&tmp_path, path)?; 86 + 87 + if let Some(parent) = path.parent() { 88 + io.sync_dir(parent)?; 89 + } 90 + 91 + Ok(()) 92 + } 93 + 94 + pub fn load<S: StorageIO>(io: &S, path: &Path) -> io::Result<Option<Self>> { 95 + let fd = match io.open(path, OpenOptions::read_only_existing()) { 96 + Ok(fd) => fd, 97 + Err(e) if e.kind() == io::ErrorKind::NotFound => return Ok(None), 98 + Err(e) => return Err(e), 99 + }; 100 + 101 + let reader = match RecordReader::open(io, fd) { 102 + Ok(r) => r, 103 + Err(e) => { 104 + let _ = io.close(fd); 105 + return Err(e); 106 + } 107 + }; 108 + let records = reader.valid_records(); 109 + io.close(fd)?; 110 + 111 + let payload = records.into_iter().next().ok_or_else(|| { 112 + io::Error::new(io::ErrorKind::InvalidData, "index file contains no records") 113 + })?; 114 + 115 + const MIN_POSTCARD_ENTRY_BYTES: usize = 2; 116 + if payload.len() / MIN_POSTCARD_ENTRY_BYTES > MAX_INDEX_ENTRIES { 117 + return Err(io::Error::new( 118 + io::ErrorKind::InvalidData, 119 + "index payload too large", 120 + )); 121 + } 122 + 123 + let entries: Vec<IndexEntry> = postcard::from_bytes(&payload) 124 + .map_err(|e| 
io::Error::new(io::ErrorKind::InvalidData, e))?; 125 + 126 + if entries.len() > MAX_INDEX_ENTRIES { 127 + return Err(io::Error::new( 128 + io::ErrorKind::InvalidData, 129 + "index contains too many entries", 130 + )); 131 + } 132 + 133 + let is_sorted = entries.windows(2).all(|pair| pair[0].seq < pair[1].seq); 134 + if !is_sorted { 135 + return Err(io::Error::new( 136 + io::ErrorKind::InvalidData, 137 + "index entries not monotonically sorted", 138 + )); 139 + } 140 + 141 + Ok(Some(Self { entries })) 142 + } 143 + } 144 + 145 + impl Default for SegmentIndex { 146 + fn default() -> Self { 147 + Self::new() 148 + } 149 + } 150 + 151 + struct ScanState { 152 + index: SegmentIndex, 153 + event_count: usize, 154 + last_seq: Option<EventSequence>, 155 + last_offset: Option<SegmentOffset>, 156 + } 157 + 158 + pub fn rebuild_from_segment<S: StorageIO>( 159 + io: &S, 160 + segment_fd: FileId, 161 + index_interval: usize, 162 + ) -> io::Result<(SegmentIndex, Option<EventSequence>)> { 163 + assert!(index_interval > 0, "index_interval must be positive"); 164 + let file_size = io.file_size(segment_fd)?; 165 + 166 + if file_size < SEGMENT_HEADER_SIZE as u64 { 167 + return Err(io::Error::new( 168 + io::ErrorKind::InvalidData, 169 + "file too small for segment header", 170 + )); 171 + } 172 + 173 + let mut header = [0u8; SEGMENT_HEADER_SIZE]; 174 + io.read_exact_at(segment_fd, 0, &mut header)?; 175 + if header[..SEGMENT_MAGIC.len()] != SEGMENT_MAGIC { 176 + return Err(io::Error::new( 177 + io::ErrorKind::InvalidData, 178 + "bad segment magic", 179 + )); 180 + } 181 + if header[SEGMENT_MAGIC.len()] != super::segment_file::SEGMENT_FORMAT_VERSION { 182 + return Err(io::Error::new( 183 + io::ErrorKind::InvalidData, 184 + "unsupported segment format version", 185 + )); 186 + } 187 + 188 + let current_offset = Cell::new(SegmentOffset::new(SEGMENT_HEADER_SIZE as u64)); 189 + let prev_seq: Cell<Option<EventSequence>> = Cell::new(None); 190 + 191 + let mut valid_events = std::iter::from_fn(|| { 192 + let offset = current_offset.get(); 193 + if offset.raw() >= file_size { 194 + return None; 195 + } 196 + match validate_event_record(io, segment_fd, offset, file_size) { 197 + Err(e) => Some(Err(e)), 198 + Ok(None) => None, 199 + Ok(Some(ValidateEventRecord::Valid { seq, next_offset })) => { 200 + if prev_seq.get().is_some_and(|prev| seq <= prev) { 201 + return None; 202 + } 203 + prev_seq.set(Some(seq)); 204 + current_offset.set(next_offset); 205 + Some(Ok((seq, offset))) 206 + } 207 + Ok(Some(ValidateEventRecord::Corrupted | ValidateEventRecord::Truncated)) => None, 208 + } 209 + }); 210 + 211 + let initial = ScanState { 212 + index: SegmentIndex::new(), 213 + event_count: 0, 214 + last_seq: None, 215 + last_offset: None, 216 + }; 217 + 218 + let state = valid_events.try_fold(initial, |mut state, record| -> io::Result<ScanState> { 219 + let (seq, record_offset) = record?; 220 + 221 + let should_index = state.event_count == 0 || state.event_count % index_interval == 0; 222 + if should_index { 223 + state.index.record(seq, record_offset); 224 + } 225 + 226 + state.event_count += 1; 227 + state.last_seq = Some(seq); 228 + state.last_offset = Some(record_offset); 229 + 230 + Ok(state) 231 + })?; 232 + 233 + let mut index = state.index; 234 + if let (Some(seq), Some(offset)) = (state.last_seq, state.last_offset) 235 + && index.last_seq() != Some(seq) 236 + { 237 + index.record(seq, offset); 238 + } 239 + 240 + let valid_end = current_offset.get().raw(); 241 + if valid_end < file_size { 242 + io.truncate(segment_fd, 
valid_end)?; 243 + io.sync(segment_fd)?; 244 + } 245 + 246 + Ok((index, state.last_seq)) 247 + } 248 + 249 + #[cfg(test)] 250 + mod tests { 251 + use super::*; 252 + use crate::OpenOptions; 253 + use crate::eventlog::segment_file::{ 254 + EVENT_HEADER_SIZE, SegmentWriter, ValidEvent, encode_event_record, 255 + }; 256 + use crate::eventlog::types::{ 257 + DidHash, EventSequence, EventTypeTag, SegmentId, SegmentOffset, TimestampMicros, 258 + }; 259 + use crate::sim::SimulatedIO; 260 + use std::path::Path; 261 + 262 + fn setup() -> (SimulatedIO, FileId) { 263 + let sim = SimulatedIO::pristine(42); 264 + let dir = Path::new("/test"); 265 + sim.mkdir(dir).unwrap(); 266 + sim.sync_dir(dir).unwrap(); 267 + let fd = sim 268 + .open(Path::new("/test/segment.tqe"), OpenOptions::read_write()) 269 + .unwrap(); 270 + (sim, fd) 271 + } 272 + 273 + fn test_event(seq: u64, payload: &[u8]) -> ValidEvent { 274 + ValidEvent { 275 + seq: EventSequence::new(seq), 276 + timestamp: TimestampMicros::new(seq * 1_000_000), 277 + did_hash: DidHash::from_did(&format!("did:plc:test{seq}")), 278 + event_type: EventTypeTag::COMMIT, 279 + payload: payload.to_vec(), 280 + } 281 + } 282 + 283 + fn write_n_events<S: StorageIO>( 284 + io: &S, 285 + fd: FileId, 286 + count: u64, 287 + ) -> Vec<(EventSequence, SegmentOffset)> { 288 + let mut writer = 289 + SegmentWriter::new(io, fd, SegmentId::new(0), EventSequence::new(1)).unwrap(); 290 + let offsets: Vec<_> = (1..=count) 291 + .map(|i| { 292 + let event = test_event(i, format!("payload-{i}").as_bytes()); 293 + let offset = writer.append_event(io, &event).unwrap(); 294 + (event.seq, offset) 295 + }) 296 + .collect(); 297 + writer.sync(io).unwrap(); 298 + offsets 299 + } 300 + 301 + #[test] 302 + fn empty_index() { 303 + let index = SegmentIndex::new(); 304 + assert_eq!(index.entry_count(), 0); 305 + assert_eq!(index.first_seq(), None); 306 + assert_eq!(index.last_seq(), None); 307 + assert_eq!(index.lookup(EventSequence::new(1)), None); 308 + } 309 + 310 + #[test] 311 + fn record_and_lookup_single_entry() { 312 + let mut index = SegmentIndex::new(); 313 + index.record(EventSequence::new(10), SegmentOffset::new(100)); 314 + 315 + assert_eq!(index.entry_count(), 1); 316 + assert_eq!(index.first_seq(), Some(EventSequence::new(10))); 317 + assert_eq!(index.last_seq(), Some(EventSequence::new(10))); 318 + 319 + assert_eq!( 320 + index.lookup(EventSequence::new(10)), 321 + Some(SegmentOffset::new(100)) 322 + ); 323 + assert_eq!( 324 + index.lookup(EventSequence::new(15)), 325 + Some(SegmentOffset::new(100)) 326 + ); 327 + assert_eq!(index.lookup(EventSequence::new(5)), None); 328 + } 329 + 330 + #[test] 331 + fn lookup_returns_floor_entry() { 332 + let mut index = SegmentIndex::new(); 333 + index.record(EventSequence::new(1), SegmentOffset::new(100)); 334 + index.record(EventSequence::new(100), SegmentOffset::new(5000)); 335 + index.record(EventSequence::new(200), SegmentOffset::new(10000)); 336 + 337 + assert_eq!( 338 + index.lookup(EventSequence::new(1)), 339 + Some(SegmentOffset::new(100)) 340 + ); 341 + assert_eq!( 342 + index.lookup(EventSequence::new(50)), 343 + Some(SegmentOffset::new(100)) 344 + ); 345 + assert_eq!( 346 + index.lookup(EventSequence::new(100)), 347 + Some(SegmentOffset::new(5000)) 348 + ); 349 + assert_eq!( 350 + index.lookup(EventSequence::new(150)), 351 + Some(SegmentOffset::new(5000)) 352 + ); 353 + assert_eq!( 354 + index.lookup(EventSequence::new(200)), 355 + Some(SegmentOffset::new(10000)) 356 + ); 357 + assert_eq!( 358 + 
index.lookup(EventSequence::new(999)), 359 + Some(SegmentOffset::new(10000)) 360 + ); 361 + } 362 + 363 + #[test] 364 + fn lookup_before_first_returns_none() { 365 + let mut index = SegmentIndex::new(); 366 + index.record(EventSequence::new(10), SegmentOffset::new(100)); 367 + index.record(EventSequence::new(20), SegmentOffset::new(200)); 368 + 369 + assert_eq!(index.lookup(EventSequence::new(5)), None); 370 + assert_eq!(index.lookup(EventSequence::new(9)), None); 371 + } 372 + 373 + #[test] 374 + fn save_and_load_round_trip() { 375 + let sim = SimulatedIO::pristine(42); 376 + let dir = Path::new("/test"); 377 + sim.mkdir(dir).unwrap(); 378 + sim.sync_dir(dir).unwrap(); 379 + 380 + let mut index = SegmentIndex::new(); 381 + index.record(EventSequence::new(1), SegmentOffset::new(5)); 382 + index.record(EventSequence::new(256), SegmentOffset::new(50000)); 383 + index.record(EventSequence::new(512), SegmentOffset::new(100000)); 384 + 385 + let path = Path::new("/test/00000001.tqi"); 386 + index.save(&sim, path).unwrap(); 387 + 388 + let loaded = SegmentIndex::load(&sim, path).unwrap().unwrap(); 389 + assert_eq!(loaded, index); 390 + } 391 + 392 + #[test] 393 + fn load_missing_file_returns_none() { 394 + let sim = SimulatedIO::pristine(42); 395 + let dir = Path::new("/test"); 396 + sim.mkdir(dir).unwrap(); 397 + sim.sync_dir(dir).unwrap(); 398 + 399 + let result = SegmentIndex::load(&sim, Path::new("/test/missing.tqi")).unwrap(); 400 + assert!(result.is_none()); 401 + } 402 + 403 + #[test] 404 + fn load_corrupt_file_returns_err() { 405 + let sim = SimulatedIO::pristine(42); 406 + let dir = Path::new("/test"); 407 + sim.mkdir(dir).unwrap(); 408 + sim.sync_dir(dir).unwrap(); 409 + 410 + let path = Path::new("/test/corrupt.tqi"); 411 + let fd = sim.open(path, OpenOptions::read_write()).unwrap(); 412 + sim.write_all_at(fd, 0, b"TQST\x02garbage_not_valid_postcard") 413 + .unwrap(); 414 + sim.sync(fd).unwrap(); 415 + sim.close(fd).unwrap(); 416 + 417 + let result = SegmentIndex::load(&sim, path); 418 + assert!(result.is_err()); 419 + } 420 + 421 + #[test] 422 + fn save_empty_index_round_trips() { 423 + let sim = SimulatedIO::pristine(42); 424 + let dir = Path::new("/test"); 425 + sim.mkdir(dir).unwrap(); 426 + sim.sync_dir(dir).unwrap(); 427 + 428 + let index = SegmentIndex::new(); 429 + let path = Path::new("/test/empty.tqi"); 430 + index.save(&sim, path).unwrap(); 431 + 432 + let loaded = SegmentIndex::load(&sim, path).unwrap().unwrap(); 433 + assert_eq!(loaded.entry_count(), 0); 434 + } 435 + 436 + #[test] 437 + fn rebuild_empty_segment() { 438 + let (sim, fd) = setup(); 439 + SegmentWriter::new(&sim, fd, SegmentId::new(0), EventSequence::new(1)).unwrap(); 440 + sim.sync(fd).unwrap(); 441 + 442 + let (index, last_seq) = rebuild_from_segment(&sim, fd, DEFAULT_INDEX_INTERVAL).unwrap(); 443 + assert_eq!(index.entry_count(), 0); 444 + assert_eq!(last_seq, None); 445 + } 446 + 447 + #[test] 448 + fn rebuild_single_event() { 449 + let (sim, fd) = setup(); 450 + let offsets = write_n_events(&sim, fd, 1); 451 + sim.sync(fd).unwrap(); 452 + 453 + let (index, last_seq) = rebuild_from_segment(&sim, fd, DEFAULT_INDEX_INTERVAL).unwrap(); 454 + assert_eq!(last_seq, Some(EventSequence::new(1))); 455 + assert_eq!(index.entry_count(), 1); 456 + assert_eq!(index.first_seq(), Some(EventSequence::new(1))); 457 + assert_eq!(index.lookup(EventSequence::new(1)), Some(offsets[0].1)); 458 + } 459 + 460 + #[test] 461 + fn rebuild_indexes_first_and_last() { 462 + let (sim, fd) = setup(); 463 + let offsets = 
write_n_events(&sim, fd, 10); 464 + sim.sync(fd).unwrap(); 465 + 466 + let (index, last_seq) = rebuild_from_segment(&sim, fd, DEFAULT_INDEX_INTERVAL).unwrap(); 467 + assert_eq!(last_seq, Some(EventSequence::new(10))); 468 + assert_eq!(index.entry_count(), 2); 469 + assert_eq!(index.first_seq(), Some(EventSequence::new(1))); 470 + assert_eq!(index.last_seq(), Some(EventSequence::new(10))); 471 + assert_eq!(index.lookup(EventSequence::new(1)), Some(offsets[0].1)); 472 + assert_eq!(index.lookup(EventSequence::new(10)), Some(offsets[9].1)); 473 + } 474 + 475 + #[test] 476 + fn rebuild_indexes_at_interval() { 477 + let (sim, fd) = setup(); 478 + let offsets = write_n_events(&sim, fd, 600); 479 + sim.sync(fd).unwrap(); 480 + 481 + let (index, last_seq) = rebuild_from_segment(&sim, fd, 256).unwrap(); 482 + assert_eq!(last_seq, Some(EventSequence::new(600))); 483 + assert_eq!(index.first_seq(), Some(EventSequence::new(1))); 484 + assert_eq!(index.last_seq(), Some(EventSequence::new(600))); 485 + assert_eq!(index.entry_count(), 4); 486 + assert_eq!(index.lookup(EventSequence::new(1)), Some(offsets[0].1)); 487 + assert_eq!(index.lookup(EventSequence::new(257)), Some(offsets[256].1)); 488 + assert_eq!(index.lookup(EventSequence::new(256)), Some(offsets[0].1)); 489 + assert_eq!(index.lookup(EventSequence::new(513)), Some(offsets[512].1)); 490 + assert_eq!(index.lookup(EventSequence::new(600)), Some(offsets[599].1)); 491 + } 492 + 493 + #[test] 494 + fn rebuild_truncates_corruption() { 495 + let (sim, fd) = setup(); 496 + write_n_events(&sim, fd, 5); 497 + sim.sync(fd).unwrap(); 498 + 499 + let file_size_before = sim.file_size(fd).unwrap(); 500 + 501 + sim.write_all_at(fd, file_size_before, b"garbage_trailing_data") 502 + .unwrap(); 503 + sim.sync(fd).unwrap(); 504 + let file_size_with_garbage = sim.file_size(fd).unwrap(); 505 + assert!(file_size_with_garbage > file_size_before); 506 + 507 + let (index, last_seq) = rebuild_from_segment(&sim, fd, DEFAULT_INDEX_INTERVAL).unwrap(); 508 + assert_eq!(last_seq, Some(EventSequence::new(5))); 509 + assert_eq!(index.first_seq(), Some(EventSequence::new(1))); 510 + 511 + let file_size_after = sim.file_size(fd).unwrap(); 512 + assert_eq!(file_size_after, file_size_before); 513 + } 514 + 515 + #[test] 516 + fn rebuild_truncates_partial_record() { 517 + let (sim, fd) = setup(); 518 + write_n_events(&sim, fd, 3); 519 + sim.sync(fd).unwrap(); 520 + 521 + let valid_end = sim.file_size(fd).unwrap(); 522 + 523 + let partial_header = [0u8; EVENT_HEADER_SIZE - 5]; 524 + sim.write_all_at(fd, valid_end, &partial_header).unwrap(); 525 + sim.sync(fd).unwrap(); 526 + 527 + let (_, last_seq) = rebuild_from_segment(&sim, fd, DEFAULT_INDEX_INTERVAL).unwrap(); 528 + assert_eq!(last_seq, Some(EventSequence::new(3))); 529 + assert_eq!(sim.file_size(fd).unwrap(), valid_end); 530 + } 531 + 532 + #[test] 533 + fn rebuild_truncates_at_non_monotonic_seq() { 534 + let (sim, fd) = setup(); 535 + let mut writer = 536 + SegmentWriter::new(&sim, fd, SegmentId::new(0), EventSequence::new(1)).unwrap(); 537 + 538 + let event1 = test_event(1, b"first"); 539 + let event2 = test_event(2, b"second"); 540 + writer.append_event(&sim, &event1).unwrap(); 541 + let offset_after_two = { 542 + writer.append_event(&sim, &event2).unwrap(); 543 + writer.position() 544 + }; 545 + writer.sync(&sim).unwrap(); 546 + 547 + let valid_size_before = offset_after_two.raw(); 548 + 549 + let regressed = ValidEvent { 550 + seq: EventSequence::new(1), 551 + timestamp: TimestampMicros::new(3_000_000), 552 + did_hash: 
DidHash::from_did("did:plc:test3"), 553 + event_type: EventTypeTag::COMMIT, 554 + payload: b"regressed".to_vec(), 555 + }; 556 + encode_event_record(&sim, fd, offset_after_two, &regressed).unwrap(); 557 + sim.sync(fd).unwrap(); 558 + 559 + let (index, last_seq) = rebuild_from_segment(&sim, fd, DEFAULT_INDEX_INTERVAL).unwrap(); 560 + assert_eq!(last_seq, Some(EventSequence::new(2))); 561 + assert_eq!(index.first_seq(), Some(EventSequence::new(1))); 562 + assert_eq!(index.last_seq(), Some(EventSequence::new(2))); 563 + assert_eq!(sim.file_size(fd).unwrap(), valid_size_before); 564 + } 565 + 566 + #[test] 567 + fn rebuild_interval_one_indexes_every_event() { 568 + let (sim, fd) = setup(); 569 + let offsets = write_n_events(&sim, fd, 10); 570 + sim.sync(fd).unwrap(); 571 + 572 + let (index, _) = rebuild_from_segment(&sim, fd, 1).unwrap(); 573 + assert_eq!(index.entry_count(), 10); 574 + 575 + offsets.iter().enumerate().for_each(|(i, (seq, offset))| { 576 + assert_eq!(index.lookup(*seq), Some(*offset), "event {i} lookup failed"); 577 + }); 578 + } 579 + 580 + #[test] 581 + fn rebuild_and_save_load_round_trip() { 582 + let sim = SimulatedIO::pristine(42); 583 + let dir = Path::new("/test"); 584 + sim.mkdir(dir).unwrap(); 585 + sim.sync_dir(dir).unwrap(); 586 + 587 + let fd = sim 588 + .open(Path::new("/test/segment.tqe"), OpenOptions::read_write()) 589 + .unwrap(); 590 + write_n_events(&sim, fd, 300); 591 + sim.sync(fd).unwrap(); 592 + 593 + let (index, last_seq) = rebuild_from_segment(&sim, fd, 256).unwrap(); 594 + assert_eq!(last_seq, Some(EventSequence::new(300))); 595 + 596 + let index_path = Path::new("/test/00000000.tqi"); 597 + index.save(&sim, index_path).unwrap(); 598 + 599 + let loaded = SegmentIndex::load(&sim, index_path).unwrap().unwrap(); 600 + assert_eq!(loaded, index); 601 + assert_eq!(loaded.entry_count(), index.entry_count()); 602 + assert_eq!(loaded.first_seq(), index.first_seq()); 603 + assert_eq!(loaded.last_seq(), index.last_seq()); 604 + } 605 + 606 + #[test] 607 + fn save_overwrites_stale_tmp() { 608 + let sim = SimulatedIO::pristine(42); 609 + let dir = Path::new("/test"); 610 + sim.mkdir(dir).unwrap(); 611 + sim.sync_dir(dir).unwrap(); 612 + 613 + let stale_tmp = Path::new("/test/00000000.tqi.tmp"); 614 + let stale_fd = sim.open(stale_tmp, OpenOptions::read_write()).unwrap(); 615 + sim.write_all_at(stale_fd, 0, b"stale_garbage_from_prior_crash_xxxxxxxxxx") 616 + .unwrap(); 617 + sim.sync(stale_fd).unwrap(); 618 + sim.close(stale_fd).unwrap(); 619 + 620 + let mut index = SegmentIndex::new(); 621 + index.record(EventSequence::new(1), SegmentOffset::new(5)); 622 + 623 + let path = Path::new("/test/00000000.tqi"); 624 + index.save(&sim, path).unwrap(); 625 + 626 + let loaded = SegmentIndex::load(&sim, path).unwrap().unwrap(); 627 + assert_eq!(loaded, index); 628 + } 629 + 630 + #[test] 631 + fn rebuild_bad_magic_returns_err() { 632 + let sim = SimulatedIO::pristine(42); 633 + let dir = Path::new("/test"); 634 + sim.mkdir(dir).unwrap(); 635 + sim.sync_dir(dir).unwrap(); 636 + 637 + let fd = sim 638 + .open(Path::new("/test/bad.tqe"), OpenOptions::read_write()) 639 + .unwrap(); 640 + sim.write_all_at(fd, 0, b"NOPE\x01").unwrap(); 641 + sim.sync(fd).unwrap(); 642 + 643 + let result = rebuild_from_segment(&sim, fd, DEFAULT_INDEX_INTERVAL); 644 + assert!(result.is_err()); 645 + } 646 + 647 + #[test] 648 + fn rebuild_no_truncation_when_clean() { 649 + let (sim, fd) = setup(); 650 + write_n_events(&sim, fd, 5); 651 + sim.sync(fd).unwrap(); 652 + 653 + let size_before = 
sim.file_size(fd).unwrap(); 654 + rebuild_from_segment(&sim, fd, DEFAULT_INDEX_INTERVAL).unwrap(); 655 + let size_after = sim.file_size(fd).unwrap(); 656 + assert_eq!(size_before, size_after); 657 + } 658 + 659 + #[test] 660 + fn lookup_at_before_all_returns_none() { 661 + let mut index = SegmentIndex::new(); 662 + index.record(EventSequence::new(1), SegmentOffset::new(5)); 663 + 664 + assert_eq!(index.lookup(EventSequence::BEFORE_ALL), None); 665 + } 666 + }
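The floor-lookup rule these tests pin down is the entire contract of the sparse index: lookup(n) returns the offset of the greatest recorded sequence at or below n, or None when n precedes every entry, so a reader seeks there and scans forward. A minimal sketch of that contract from the caller's side, assuming SegmentIndex, SegmentOffset, and EventSequence are exported from the eventlog module alongside the types the crash tests further down import:

use std::path::Path;

use tranquil_store::{SimulatedIO, StorageIO};
use tranquil_store::eventlog::{EventSequence, SegmentIndex, SegmentOffset};

fn main() -> std::io::Result<()> {
    let mut index = SegmentIndex::new();
    index.record(EventSequence::new(1), SegmentOffset::new(64));
    index.record(EventSequence::new(257), SegmentOffset::new(40_000));

    // Exact hit on an indexed sequence returns its own offset.
    assert_eq!(
        index.lookup(EventSequence::new(257)),
        Some(SegmentOffset::new(40_000))
    );
    // Between entries: floor to the previous indexed sequence, scan forward.
    assert_eq!(
        index.lookup(EventSequence::new(100)),
        Some(SegmentOffset::new(64))
    );
    // Before the first entry there is nowhere to start scanning.
    assert_eq!(index.lookup(EventSequence::BEFORE_ALL), None);

    // Persistence goes through the same StorageIO abstraction the tests use.
    let sim = SimulatedIO::pristine(7);
    sim.mkdir(Path::new("/idx"))?;
    sim.sync_dir(Path::new("/idx"))?;
    index.save(&sim, Path::new("/idx/00000001.tqi"))?;
    let loaded = SegmentIndex::load(&sim, Path::new("/idx/00000001.tqi"))?
        .expect("index file was just written");
    assert_eq!(loaded, index);
    Ok(())
}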
+522
crates/tranquil-store/src/eventlog/types.rs
··· 1 + use serde::{Deserialize, Serialize}; 2 + use tranquil_db_traits::SequenceNumber; 3 + 4 + pub const MAX_EVENT_PAYLOAD: u32 = 4 * 1024 * 1024; 5 + pub const DEFAULT_SEGMENT_SIZE: u64 = 64 * 1024 * 1024; 6 + 7 + #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] 8 + pub struct EventSequence(u64); 9 + 10 + impl EventSequence { 11 + pub const BEFORE_ALL: Self = Self(0); 12 + 13 + pub fn new(seq: u64) -> Self { 14 + assert!( 15 + seq > 0, 16 + "EventSequence must be positive; use BEFORE_ALL for cursor start" 17 + ); 18 + Self(seq) 19 + } 20 + 21 + pub fn raw(self) -> u64 { 22 + self.0 23 + } 24 + 25 + pub fn next(self) -> Self { 26 + Self(self.0.checked_add(1).expect("EventSequence overflow")) 27 + } 28 + 29 + pub fn prev_or_before_all(self) -> Self { 30 + match self.0 { 31 + 0 | 1 => Self::BEFORE_ALL, 32 + n => Self(n - 1), 33 + } 34 + } 35 + 36 + pub fn as_i64(self) -> i64 { 37 + i64::try_from(self.0).expect("EventSequence exceeds i64::MAX") 38 + } 39 + 40 + pub fn from_i64(n: i64) -> Option<Self> { 41 + match u64::try_from(n) { 42 + Ok(0) | Err(_) => None, 43 + Ok(v) => Some(Self(v)), 44 + } 45 + } 46 + 47 + pub fn cursor_from_i64(n: i64) -> Option<Self> { 48 + u64::try_from(n).ok().map(Self) 49 + } 50 + } 51 + 52 + impl From<EventSequence> for SequenceNumber { 53 + fn from(es: EventSequence) -> Self { 54 + SequenceNumber::from_raw(es.as_i64()) 55 + } 56 + } 57 + 58 + impl TryFrom<SequenceNumber> for EventSequence { 59 + type Error = &'static str; 60 + 61 + fn try_from(seq: SequenceNumber) -> Result<Self, Self::Error> { 62 + let raw = seq.as_i64(); 63 + match u64::try_from(raw) { 64 + Ok(0) => Err("SequenceNumber 0 maps to BEFORE_ALL, not a valid EventSequence"), 65 + Ok(v) => Ok(Self(v)), 66 + Err(_) => Err("negative SequenceNumber cannot convert to EventSequence"), 67 + } 68 + } 69 + } 70 + 71 + impl std::fmt::Display for EventSequence { 72 + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 73 + write!(f, "{}", self.0) 74 + } 75 + } 76 + 77 + #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] 78 + pub struct SegmentId(u32); 79 + 80 + impl SegmentId { 81 + pub fn new(id: u32) -> Self { 82 + Self(id) 83 + } 84 + 85 + pub fn raw(self) -> u32 { 86 + self.0 87 + } 88 + 89 + pub fn next(self) -> Self { 90 + Self(self.0.checked_add(1).expect("SegmentId overflow")) 91 + } 92 + } 93 + 94 + impl std::fmt::Display for SegmentId { 95 + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 96 + write!(f, "{:08}", self.0) 97 + } 98 + } 99 + 100 + #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] 101 + pub struct SegmentOffset(u64); 102 + 103 + impl SegmentOffset { 104 + pub const fn new(offset: u64) -> Self { 105 + Self(offset) 106 + } 107 + 108 + pub const fn raw(self) -> u64 { 109 + self.0 110 + } 111 + 112 + pub fn advance(self, delta: u64) -> Self { 113 + Self(self.0.checked_add(delta).expect("SegmentOffset overflow")) 114 + } 115 + } 116 + 117 + #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)] 118 + pub struct EventLength(u32); 119 + 120 + impl EventLength { 121 + pub fn new(length: u32) -> Self { 122 + assert!( 123 + length <= MAX_EVENT_PAYLOAD, 124 + "EventLength {length} exceeds MAX_EVENT_PAYLOAD {MAX_EVENT_PAYLOAD}" 125 + ); 126 + Self(length) 127 + } 128 + 129 + pub fn raw(self) -> u32 { 130 + self.0 131 + } 132 + 133 + pub fn as_u64(self) -> u64 { 134 + u64::from(self.0) 135 + } 136 + } 137 + 138 + 
impl<'de> Deserialize<'de> for EventLength { 139 + fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> { 140 + let raw = u32::deserialize(deserializer)?; 141 + if raw > MAX_EVENT_PAYLOAD { 142 + return Err(serde::de::Error::custom(format_args!( 143 + "EventLength {raw} exceeds MAX_EVENT_PAYLOAD {MAX_EVENT_PAYLOAD}" 144 + ))); 145 + } 146 + Ok(Self(raw)) 147 + } 148 + } 149 + 150 + #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] 151 + pub struct DidHash(u32); 152 + 153 + impl DidHash { 154 + pub fn from_did(did: &str) -> Self { 155 + Self(xxhash_rust::xxh3::xxh3_64(did.as_bytes()) as u32) 156 + } 157 + 158 + pub fn from_raw(hash: u32) -> Self { 159 + Self(hash) 160 + } 161 + 162 + pub fn raw(self) -> u32 { 163 + self.0 164 + } 165 + } 166 + 167 + #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)] 168 + pub struct EventTypeTag(u8); 169 + 170 + impl EventTypeTag { 171 + pub const COMMIT: Self = Self(1); 172 + pub const IDENTITY: Self = Self(2); 173 + pub const ACCOUNT: Self = Self(3); 174 + pub const SYNC: Self = Self(4); 175 + 176 + pub fn from_raw(tag: u8) -> Option<Self> { 177 + match tag { 178 + 1..=4 => Some(Self(tag)), 179 + _ => None, 180 + } 181 + } 182 + 183 + pub fn raw(self) -> u8 { 184 + self.0 185 + } 186 + 187 + pub fn to_repo_event_type(self) -> tranquil_db_traits::RepoEventType { 188 + match self { 189 + Self::COMMIT => tranquil_db_traits::RepoEventType::Commit, 190 + Self::IDENTITY => tranquil_db_traits::RepoEventType::Identity, 191 + Self::ACCOUNT => tranquil_db_traits::RepoEventType::Account, 192 + Self::SYNC => tranquil_db_traits::RepoEventType::Sync, 193 + _ => unreachable!("EventTypeTag invariant guarantees valid discriminant"), 194 + } 195 + } 196 + } 197 + 198 + impl<'de> Deserialize<'de> for EventTypeTag { 199 + fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> { 200 + let raw = u8::deserialize(deserializer)?; 201 + Self::from_raw(raw) 202 + .ok_or_else(|| serde::de::Error::custom(format_args!("invalid EventTypeTag: {raw}"))) 203 + } 204 + } 205 + 206 + impl std::fmt::Display for EventTypeTag { 207 + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 208 + match *self { 209 + Self::COMMIT => write!(f, "Commit"), 210 + Self::IDENTITY => write!(f, "Identity"), 211 + Self::ACCOUNT => write!(f, "Account"), 212 + Self::SYNC => write!(f, "Sync"), 213 + _ => unreachable!("EventTypeTag invariant violated: raw value {}", self.0), 214 + } 215 + } 216 + } 217 + 218 + #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] 219 + pub struct TimestampMicros(u64); 220 + 221 + impl TimestampMicros { 222 + pub fn new(us: u64) -> Self { 223 + Self(us) 224 + } 225 + 226 + pub fn raw(self) -> u64 { 227 + self.0 228 + } 229 + 230 + pub fn now() -> Self { 231 + let duration = std::time::SystemTime::now() 232 + .duration_since(std::time::UNIX_EPOCH) 233 + .expect("system clock before unix epoch"); 234 + Self( 235 + duration 236 + .as_secs() 237 + .saturating_mul(1_000_000) 238 + .saturating_add(u64::from(duration.subsec_micros())), 239 + ) 240 + } 241 + } 242 + 243 + #[cfg(test)] 244 + mod tests { 245 + use super::*; 246 + 247 + #[test] 248 + fn event_sequence_lifecycle() { 249 + let seq = EventSequence::new(1); 250 + assert_eq!(seq.raw(), 1); 251 + assert_eq!(seq.next(), EventSequence::new(2)); 252 + assert_eq!(seq.as_i64(), 1); 253 + } 254 + 255 + #[test] 256 + fn event_sequence_before_all() { 
257 + assert_eq!(EventSequence::BEFORE_ALL.raw(), 0); 258 + } 259 + 260 + #[test] 261 + fn event_sequence_prev_or_before_all() { 262 + assert_eq!( 263 + EventSequence::BEFORE_ALL.prev_or_before_all(), 264 + EventSequence::BEFORE_ALL 265 + ); 266 + assert_eq!( 267 + EventSequence::new(1).prev_or_before_all(), 268 + EventSequence::BEFORE_ALL 269 + ); 270 + assert_eq!( 271 + EventSequence::new(2).prev_or_before_all(), 272 + EventSequence::new(1) 273 + ); 274 + assert_eq!( 275 + EventSequence::new(100).prev_or_before_all(), 276 + EventSequence::new(99) 277 + ); 278 + } 279 + 280 + #[test] 281 + #[should_panic(expected = "EventSequence must be positive")] 282 + fn event_sequence_zero_panics() { 283 + EventSequence::new(0); 284 + } 285 + 286 + #[test] 287 + fn event_sequence_i64_round_trip() { 288 + let seq = EventSequence::new(42); 289 + let as_i64 = seq.as_i64(); 290 + assert_eq!(EventSequence::from_i64(as_i64), Some(seq)); 291 + } 292 + 293 + #[test] 294 + fn event_sequence_from_i64_rejects_zero_and_negative() { 295 + assert_eq!(EventSequence::from_i64(0), None); 296 + assert_eq!(EventSequence::from_i64(-1), None); 297 + } 298 + 299 + #[test] 300 + fn event_sequence_cursor_from_i64_allows_zero() { 301 + assert_eq!( 302 + EventSequence::cursor_from_i64(0), 303 + Some(EventSequence::BEFORE_ALL) 304 + ); 305 + assert_eq!( 306 + EventSequence::cursor_from_i64(1), 307 + Some(EventSequence::new(1)) 308 + ); 309 + assert_eq!(EventSequence::cursor_from_i64(-1), None); 310 + } 311 + 312 + #[test] 313 + #[should_panic(expected = "EventSequence overflow")] 314 + fn event_sequence_overflow_panics() { 315 + EventSequence::new(u64::MAX).next(); 316 + } 317 + 318 + #[test] 319 + fn segment_id_display_zero_padded() { 320 + assert_eq!(SegmentId::new(0).to_string(), "00000000"); 321 + assert_eq!(SegmentId::new(1).to_string(), "00000001"); 322 + assert_eq!(SegmentId::new(99999999).to_string(), "99999999"); 323 + } 324 + 325 + #[test] 326 + fn segment_id_next_increments() { 327 + assert_eq!(SegmentId::new(0).next(), SegmentId::new(1)); 328 + assert_eq!(SegmentId::new(99).next(), SegmentId::new(100)); 329 + } 330 + 331 + #[test] 332 + #[should_panic(expected = "SegmentId overflow")] 333 + fn segment_id_overflow_panics() { 334 + SegmentId::new(u32::MAX).next(); 335 + } 336 + 337 + #[test] 338 + fn segment_offset_advance() { 339 + let offset = SegmentOffset::new(100); 340 + assert_eq!(offset.advance(50), SegmentOffset::new(150)); 341 + } 342 + 343 + #[test] 344 + #[should_panic(expected = "SegmentOffset overflow")] 345 + fn segment_offset_overflow_panics() { 346 + SegmentOffset::new(u64::MAX).advance(1); 347 + } 348 + 349 + #[test] 350 + fn event_length_valid() { 351 + let len = EventLength::new(1024); 352 + assert_eq!(len.raw(), 1024); 353 + assert_eq!(len.as_u64(), 1024); 354 + } 355 + 356 + #[test] 357 + fn event_length_max_accepted() { 358 + let len = EventLength::new(MAX_EVENT_PAYLOAD); 359 + assert_eq!(len.raw(), MAX_EVENT_PAYLOAD); 360 + } 361 + 362 + #[test] 363 + #[should_panic(expected = "exceeds MAX_EVENT_PAYLOAD")] 364 + fn event_length_overflow_panics() { 365 + EventLength::new(MAX_EVENT_PAYLOAD + 1); 366 + } 367 + 368 + #[test] 369 + fn did_hash_deterministic() { 370 + let hash1 = DidHash::from_did("did:plc:abc123"); 371 + let hash2 = DidHash::from_did("did:plc:abc123"); 372 + assert_eq!(hash1, hash2); 373 + } 374 + 375 + #[test] 376 + fn did_hash_different_dids_differ() { 377 + let hash1 = DidHash::from_did("did:plc:abc123"); 378 + let hash2 = DidHash::from_did("did:plc:xyz789"); 379 + 
assert_ne!(hash1, hash2); 380 + } 381 + 382 + #[test] 383 + fn event_type_tag_known_variants() { 384 + assert_eq!(EventTypeTag::COMMIT.raw(), 1); 385 + assert_eq!(EventTypeTag::IDENTITY.raw(), 2); 386 + assert_eq!(EventTypeTag::ACCOUNT.raw(), 3); 387 + assert_eq!(EventTypeTag::SYNC.raw(), 4); 388 + } 389 + 390 + #[test] 391 + fn event_type_tag_from_raw_valid() { 392 + assert_eq!(EventTypeTag::from_raw(1), Some(EventTypeTag::COMMIT)); 393 + assert_eq!(EventTypeTag::from_raw(2), Some(EventTypeTag::IDENTITY)); 394 + assert_eq!(EventTypeTag::from_raw(3), Some(EventTypeTag::ACCOUNT)); 395 + assert_eq!(EventTypeTag::from_raw(4), Some(EventTypeTag::SYNC)); 396 + } 397 + 398 + #[test] 399 + fn event_type_tag_from_raw_invalid() { 400 + assert_eq!(EventTypeTag::from_raw(0), None); 401 + assert_eq!(EventTypeTag::from_raw(5), None); 402 + assert_eq!(EventTypeTag::from_raw(255), None); 403 + } 404 + 405 + #[test] 406 + fn event_type_tag_display() { 407 + assert_eq!(EventTypeTag::COMMIT.to_string(), "Commit"); 408 + assert_eq!(EventTypeTag::IDENTITY.to_string(), "Identity"); 409 + assert_eq!(EventTypeTag::ACCOUNT.to_string(), "Account"); 410 + assert_eq!(EventTypeTag::SYNC.to_string(), "Sync"); 411 + } 412 + 413 + #[test] 414 + fn timestamp_micros_round_trip() { 415 + let ts = TimestampMicros::new(1_700_000_000_000_000); 416 + assert_eq!(ts.raw(), 1_700_000_000_000_000); 417 + } 418 + 419 + #[test] 420 + fn timestamp_micros_now_is_reasonable() { 421 + let ts = TimestampMicros::now(); 422 + assert!(ts.raw() > 1_700_000_000_000_000); 423 + } 424 + 425 + #[test] 426 + fn postcard_round_trip_event_sequence() { 427 + let seq = EventSequence::new(42); 428 + let bytes = postcard::to_allocvec(&seq).unwrap(); 429 + let decoded: EventSequence = postcard::from_bytes(&bytes).unwrap(); 430 + assert_eq!(seq, decoded); 431 + } 432 + 433 + #[test] 434 + fn postcard_round_trip_segment_id() { 435 + let id = SegmentId::new(7); 436 + let bytes = postcard::to_allocvec(&id).unwrap(); 437 + let decoded: SegmentId = postcard::from_bytes(&bytes).unwrap(); 438 + assert_eq!(id, decoded); 439 + } 440 + 441 + #[test] 442 + fn postcard_round_trip_did_hash() { 443 + let hash = DidHash::from_did("did:plc:test"); 444 + let bytes = postcard::to_allocvec(&hash).unwrap(); 445 + let decoded: DidHash = postcard::from_bytes(&bytes).unwrap(); 446 + assert_eq!(hash, decoded); 447 + } 448 + 449 + #[test] 450 + fn postcard_round_trip_event_type_tag() { 451 + let tag = EventTypeTag::COMMIT; 452 + let bytes = postcard::to_allocvec(&tag).unwrap(); 453 + let decoded: EventTypeTag = postcard::from_bytes(&bytes).unwrap(); 454 + assert_eq!(tag, decoded); 455 + } 456 + 457 + #[test] 458 + fn postcard_round_trip_timestamp_micros() { 459 + let ts = TimestampMicros::new(1_700_000_000_000_000); 460 + let bytes = postcard::to_allocvec(&ts).unwrap(); 461 + let decoded: TimestampMicros = postcard::from_bytes(&bytes).unwrap(); 462 + assert_eq!(ts, decoded); 463 + } 464 + 465 + #[test] 466 + fn postcard_rejects_invalid_event_type_tag() { 467 + let bytes = postcard::to_allocvec(&0u8).unwrap(); 468 + assert!(postcard::from_bytes::<EventTypeTag>(&bytes).is_err()); 469 + 470 + let bytes = postcard::to_allocvec(&5u8).unwrap(); 471 + assert!(postcard::from_bytes::<EventTypeTag>(&bytes).is_err()); 472 + 473 + let bytes = postcard::to_allocvec(&255u8).unwrap(); 474 + assert!(postcard::from_bytes::<EventTypeTag>(&bytes).is_err()); 475 + } 476 + 477 + #[test] 478 + fn postcard_rejects_oversized_event_length() { 479 + let oversized = MAX_EVENT_PAYLOAD + 1; 480 + let bytes 
= postcard::to_allocvec(&oversized).unwrap(); 481 + assert!(postcard::from_bytes::<EventLength>(&bytes).is_err()); 482 + } 483 + 484 + #[test] 485 + fn event_sequence_to_sequence_number() { 486 + let es = EventSequence::new(42); 487 + let sn: SequenceNumber = es.into(); 488 + assert_eq!(sn.as_i64(), 42); 489 + } 490 + 491 + #[test] 492 + fn event_sequence_before_all_to_sequence_number() { 493 + let sn: SequenceNumber = EventSequence::BEFORE_ALL.into(); 494 + assert_eq!(sn, SequenceNumber::ZERO); 495 + } 496 + 497 + #[test] 498 + fn sequence_number_to_event_sequence() { 499 + let sn = SequenceNumber::from_raw(42); 500 + let es = EventSequence::try_from(sn).unwrap(); 501 + assert_eq!(es.raw(), 42); 502 + } 503 + 504 + #[test] 505 + fn sequence_number_zero_rejects_to_event_sequence() { 506 + let result = EventSequence::try_from(SequenceNumber::ZERO); 507 + assert!(result.is_err()); 508 + } 509 + 510 + #[test] 511 + fn sequence_number_negative_rejects_to_event_sequence() { 512 + let result = EventSequence::try_from(SequenceNumber::from_raw(-1)); 513 + assert!(result.is_err()); 514 + } 515 + 516 + #[test] 517 + fn postcard_accepts_max_event_length() { 518 + let bytes = postcard::to_allocvec(&MAX_EVENT_PAYLOAD).unwrap(); 519 + let decoded: EventLength = postcard::from_bytes(&bytes).unwrap(); 520 + assert_eq!(decoded.raw(), MAX_EVENT_PAYLOAD); 521 + } 522 + }
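One subtlety worth calling out in these types: 0 is a legal cursor (BEFORE_ALL, "give me everything") but never a legal event sequence, and the two i64 constructors diverge exactly at that boundary. A short sketch of the conversions, assuming only the APIs the tests above exercise:

use tranquil_db_traits::SequenceNumber;
use tranquil_store::eventlog::EventSequence;

fn main() {
    // Parsing a firehose-style cursor: 0 means "before everything".
    assert_eq!(
        EventSequence::cursor_from_i64(0),
        Some(EventSequence::BEFORE_ALL)
    );
    // Parsing an event's own sequence: 0 and negatives are rejected.
    assert_eq!(EventSequence::from_i64(0), None);
    assert_eq!(EventSequence::from_i64(7), Some(EventSequence::new(7)));

    // Round-tripping through the database-facing SequenceNumber keeps the
    // boundary intact: ZERO converts back as an error, not as BEFORE_ALL.
    let sn: SequenceNumber = EventSequence::new(7).into();
    assert_eq!(EventSequence::try_from(sn), Ok(EventSequence::new(7)));
    assert!(EventSequence::try_from(SequenceNumber::ZERO).is_err());
}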
+972
crates/tranquil-store/src/eventlog/writer.rs
··· 1 + use std::io; 2 + use std::sync::Arc; 3 + 4 + use tracing::warn; 5 + 6 + use crate::io::StorageIO; 7 + 8 + use super::manager::SegmentManager; 9 + use super::segment_file::{SEGMENT_HEADER_SIZE, SegmentWriter, ValidEvent}; 10 + use super::segment_index::{DEFAULT_INDEX_INTERVAL, SegmentIndex, rebuild_from_segment}; 11 + use super::types::{ 12 + DidHash, EventSequence, EventTypeTag, MAX_EVENT_PAYLOAD, SegmentId, SegmentOffset, 13 + TimestampMicros, 14 + }; 15 + 16 + #[derive(Debug)] 17 + pub struct SyncResult { 18 + pub synced_through: EventSequence, 19 + pub segment_id: SegmentId, 20 + pub position: SegmentOffset, 21 + pub flushed_events: Vec<ValidEvent>, 22 + } 23 + 24 + pub struct EventLogWriter<S: StorageIO> { 25 + manager: Arc<SegmentManager<S>>, 26 + active_writer: SegmentWriter, 27 + active_index: SegmentIndex, 28 + next_seq: EventSequence, 29 + synced_seq: EventSequence, 30 + index_interval: usize, 31 + event_count_in_segment: usize, 32 + last_event_offset: Option<SegmentOffset>, 33 + pending_events: Vec<ValidEvent>, 34 + } 35 + 36 + impl<S: StorageIO> EventLogWriter<S> { 37 + pub fn open(manager: Arc<SegmentManager<S>>, index_interval: usize) -> io::Result<Self> { 38 + assert!(index_interval > 0, "index_interval must be positive"); 39 + 40 + let segments = manager.list_segments()?; 41 + 42 + match segments.last() { 43 + None => Self::init_fresh( 44 + manager, 45 + SegmentId::new(1), 46 + EventSequence::new(1), 47 + index_interval, 48 + ), 49 + Some(&last_id) => Self::recover_active(manager, &segments, last_id, index_interval), 50 + } 51 + } 52 + 53 + fn init_fresh( 54 + manager: Arc<SegmentManager<S>>, 55 + segment_id: SegmentId, 56 + next_seq: EventSequence, 57 + index_interval: usize, 58 + ) -> io::Result<Self> { 59 + let fd = manager.open_for_append(segment_id)?; 60 + manager.io().truncate(fd, 0)?; 61 + let writer = SegmentWriter::new(manager.io(), fd, segment_id, next_seq)?; 62 + writer.sync(manager.io())?; 63 + manager.io().sync_dir(manager.segments_dir())?; 64 + 65 + Ok(Self { 66 + manager, 67 + active_writer: writer, 68 + active_index: SegmentIndex::new(), 69 + next_seq, 70 + synced_seq: next_seq.prev_or_before_all(), 71 + index_interval, 72 + event_count_in_segment: 0, 73 + last_event_offset: None, 74 + pending_events: Vec::new(), 75 + }) 76 + } 77 + 78 + fn recover_active( 79 + manager: Arc<SegmentManager<S>>, 80 + segments: &[SegmentId], 81 + active_id: SegmentId, 82 + index_interval: usize, 83 + ) -> io::Result<Self> { 84 + let fd = manager.open_for_append(active_id)?; 85 + 86 + let (index, last_seq_in_active) = match rebuild_from_segment( 87 + manager.io(), 88 + fd, 89 + index_interval, 90 + ) { 91 + Ok(result) => result, 92 + Err(rebuild_err) => { 93 + let file_size = manager.io().file_size(fd)?; 94 + if file_size <= SEGMENT_HEADER_SIZE as u64 { 95 + manager.io().truncate(fd, 0)?; 96 + let prev_segments = &segments[..segments.len().saturating_sub(1)]; 97 + let next_seq = find_last_seq_from_segments(&manager, prev_segments)? 
98 + .map_or(EventSequence::new(1), |s| s.next()); 99 + return Self::init_fresh( 100 + Arc::clone(&manager), 101 + active_id, 102 + next_seq, 103 + index_interval, 104 + ); 105 + } 106 + return Err(io::Error::new( 107 + io::ErrorKind::InvalidData, 108 + format!( 109 + "segment {active_id} rebuild failed ({file_size} bytes on disk): {rebuild_err}" 110 + ), 111 + )); 112 + } 113 + }; 114 + 115 + let position = SegmentOffset::new(manager.io().file_size(fd)?); 116 + 117 + let prev_segments = &segments[..segments.len().saturating_sub(1)]; 118 + 119 + let next_seq = match last_seq_in_active { 120 + Some(seq) => { 121 + if let Some(sealed_last) = find_last_seq_from_segments(&manager, prev_segments)? 122 + && seq <= sealed_last 123 + { 124 + return Err(io::Error::new( 125 + io::ErrorKind::InvalidData, 126 + format!( 127 + "active segment last seq ({seq}) must exceed sealed segments' \ 128 + last seq ({sealed_last}): cross-segment corruption detected" 129 + ), 130 + )); 131 + } 132 + seq.next() 133 + } 134 + None => find_last_seq_from_segments(&manager, prev_segments)? 135 + .map_or(EventSequence::new(1), |s| s.next()), 136 + }; 137 + 138 + let synced_seq = next_seq.prev_or_before_all(); 139 + 140 + let event_count_in_segment = match (index.first_seq(), index.last_seq()) { 141 + (Some(first), Some(last)) => { 142 + debug_assert!( 143 + first <= last, 144 + "index invariant violated: first_seq {first} > last_seq {last}" 145 + ); 146 + usize::try_from(last.raw() - first.raw() + 1).expect("event count exceeds usize") 147 + } 148 + _ => 0, 149 + }; 150 + 151 + let base_seq = index.first_seq().unwrap_or(next_seq); 152 + 153 + let last_event_offset = index.last_seq().and_then(|seq| index.lookup(seq)); 154 + 155 + let writer = SegmentWriter::resume( 156 + manager.io(), 157 + fd, 158 + active_id, 159 + position, 160 + base_seq, 161 + last_seq_in_active, 162 + ); 163 + 164 + if let Err(e) = manager.io().delete(&manager.index_path(active_id)) 165 + && e.kind() != io::ErrorKind::NotFound 166 + { 167 + warn!(segment = %active_id, error = %e, "failed to delete stale index"); 168 + } 169 + 170 + Ok(Self { 171 + manager, 172 + active_writer: writer, 173 + active_index: index, 174 + next_seq, 175 + synced_seq, 176 + index_interval, 177 + event_count_in_segment, 178 + last_event_offset, 179 + pending_events: Vec::new(), 180 + }) 181 + } 182 + 183 + pub fn append( 184 + &mut self, 185 + did_hash: DidHash, 186 + event_type: EventTypeTag, 187 + payload: Vec<u8>, 188 + ) -> io::Result<EventSequence> { 189 + let payload_len = u32::try_from(payload.len()) 190 + .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "payload exceeds u32::MAX"))?; 191 + if payload_len > MAX_EVENT_PAYLOAD { 192 + return Err(io::Error::new( 193 + io::ErrorKind::InvalidInput, 194 + format!( 195 + "payload length {payload_len} exceeds MAX_EVENT_PAYLOAD {MAX_EVENT_PAYLOAD}" 196 + ), 197 + )); 198 + } 199 + 200 + let seq = self.next_seq; 201 + let timestamp = TimestampMicros::now(); 202 + 203 + let event = ValidEvent { 204 + seq, 205 + timestamp, 206 + did_hash, 207 + event_type, 208 + payload, 209 + }; 210 + 211 + let offset = self.active_writer.append_event(self.manager.io(), &event)?; 212 + 213 + let should_index = self.event_count_in_segment == 0 214 + || self 215 + .event_count_in_segment 216 + .is_multiple_of(self.index_interval); 217 + if should_index { 218 + self.active_index.record(seq, offset); 219 + } 220 + 221 + self.event_count_in_segment = self 222 + .event_count_in_segment 223 + .checked_add(1) 224 + 
.expect("event_count_in_segment overflow"); 225 + self.last_event_offset = Some(offset); 226 + self.next_seq = seq.next(); 227 + self.pending_events.push(event); 228 + 229 + Ok(seq) 230 + } 231 + 232 + pub fn sync(&mut self) -> io::Result<SyncResult> { 233 + if !self.pending_events.is_empty() { 234 + self.active_writer.sync(self.manager.io())?; 235 + } 236 + 237 + let flushed = std::mem::take(&mut self.pending_events); 238 + self.synced_seq = flushed.last().map(|e| e.seq).unwrap_or(self.synced_seq); 239 + 240 + Ok(SyncResult { 241 + synced_through: self.synced_seq, 242 + segment_id: self.active_writer.segment_id(), 243 + position: self.active_writer.position(), 244 + flushed_events: flushed, 245 + }) 246 + } 247 + 248 + pub fn rotate_if_needed(&mut self) -> io::Result<Option<SegmentId>> { 249 + if !self.manager.should_rotate(self.active_writer.position()) { 250 + return Ok(None); 251 + } 252 + 253 + if !self.pending_events.is_empty() { 254 + return Ok(None); 255 + } 256 + 257 + let old_id = self.active_writer.segment_id(); 258 + 259 + self.ensure_last_event_indexed(); 260 + 261 + self.manager.seal_segment(old_id, &self.active_index)?; 262 + 263 + let (new_id, new_fd) = self.manager.prepare_rotation(old_id)?; 264 + 265 + match SegmentWriter::new::<S>(self.manager.io(), new_fd, new_id, self.next_seq) { 266 + Ok(writer) => { 267 + self.active_writer = writer; 268 + self.active_index = SegmentIndex::new(); 269 + self.event_count_in_segment = 0; 270 + self.last_event_offset = None; 271 + self.manager.commit_rotation(new_id, new_fd); 272 + Ok(Some(old_id)) 273 + } 274 + Err(e) => { 275 + self.manager.rollback_rotation(new_id, new_fd); 276 + Err(e) 277 + } 278 + } 279 + } 280 + 281 + pub fn checkpoint_index(&self) -> io::Result<()> { 282 + if self.active_index.entry_count() == 0 { 283 + return Ok(()); 284 + } 285 + let path = self.manager.index_path(self.active_writer.segment_id()); 286 + self.active_index.save(self.manager.io(), &path) 287 + } 288 + 289 + pub fn current_seq(&self) -> EventSequence { 290 + self.next_seq.prev_or_before_all() 291 + } 292 + 293 + pub fn synced_seq(&self) -> EventSequence { 294 + self.synced_seq 295 + } 296 + 297 + pub fn active_segment_id(&self) -> SegmentId { 298 + self.active_writer.segment_id() 299 + } 300 + 301 + pub fn active_index_snapshot(&self) -> SegmentIndex { 302 + self.active_index.clone() 303 + } 304 + 305 + pub fn position(&self) -> SegmentOffset { 306 + self.active_writer.position() 307 + } 308 + 309 + pub fn shutdown(&mut self) -> io::Result<()> { 310 + let _ = self.sync()?; 311 + self.ensure_last_event_indexed(); 312 + self.checkpoint_index() 313 + } 314 + 315 + fn ensure_last_event_indexed(&mut self) { 316 + let last_written = self.next_seq.prev_or_before_all(); 317 + let needs_final_index = self.last_event_offset.is_some() 318 + && (self.active_index.last_seq() != Some(last_written)); 319 + if let (true, Some(offset)) = (needs_final_index, self.last_event_offset) { 320 + self.active_index.record(last_written, offset); 321 + } 322 + } 323 + } 324 + 325 + fn find_last_seq_from_segments<S: StorageIO>( 326 + manager: &SegmentManager<S>, 327 + segments: &[SegmentId], 328 + ) -> io::Result<Option<EventSequence>> { 329 + segments.iter().rev().try_fold(None, |acc, &seg_id| { 330 + if acc.is_some() { 331 + return Ok(acc); 332 + } 333 + 334 + match SegmentIndex::load(manager.io(), &manager.index_path(seg_id)) { 335 + Ok(Some(idx)) => Ok(idx.last_seq()), 336 + Err(e) if e.kind() != io::ErrorKind::InvalidData => Err(e), 337 + _ => { 338 + let fd = 
manager.open_for_read(seg_id)?; 339 + let (_, last_seq) = rebuild_from_segment(manager.io(), fd, DEFAULT_INDEX_INTERVAL)?; 340 + Ok(last_seq) 341 + } 342 + } 343 + }) 344 + } 345 + 346 + #[cfg(test)] 347 + mod tests { 348 + use super::*; 349 + use crate::eventlog::segment_file::{EVENT_RECORD_OVERHEAD, SegmentReader}; 350 + use crate::eventlog::segment_index::DEFAULT_INDEX_INTERVAL; 351 + use crate::sim::SimulatedIO; 352 + use std::path::{Path, PathBuf}; 353 + 354 + fn setup_manager(max_segment_size: u64) -> Arc<SegmentManager<SimulatedIO>> { 355 + let sim = SimulatedIO::pristine(42); 356 + Arc::new(SegmentManager::new(sim, PathBuf::from("/segments"), max_segment_size).unwrap()) 357 + } 358 + 359 + fn append_test_event( 360 + writer: &mut EventLogWriter<SimulatedIO>, 361 + did_seed: &str, 362 + ) -> EventSequence { 363 + writer 364 + .append( 365 + DidHash::from_did(did_seed), 366 + EventTypeTag::COMMIT, 367 + format!("payload-{did_seed}").into_bytes(), 368 + ) 369 + .unwrap() 370 + } 371 + 372 + #[test] 373 + fn open_fresh_creates_segment() { 374 + let mgr = setup_manager(64 * 1024); 375 + let writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 376 + 377 + assert_eq!(writer.active_segment_id(), SegmentId::new(1)); 378 + assert_eq!(writer.current_seq(), EventSequence::BEFORE_ALL); 379 + assert_eq!(writer.synced_seq(), EventSequence::BEFORE_ALL); 380 + assert_eq!( 381 + writer.position(), 382 + SegmentOffset::new(SEGMENT_HEADER_SIZE as u64) 383 + ); 384 + 385 + let segments = mgr.list_segments().unwrap(); 386 + assert_eq!(segments, vec![SegmentId::new(1)]); 387 + } 388 + 389 + #[test] 390 + fn append_assigns_contiguous_sequences() { 391 + let mgr = setup_manager(64 * 1024); 392 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 393 + 394 + let seqs: Vec<EventSequence> = (1..=5) 395 + .map(|i| append_test_event(&mut writer, &format!("did:plc:user{i}"))) 396 + .collect(); 397 + 398 + assert_eq!(seqs, (1..=5).map(EventSequence::new).collect::<Vec<_>>()); 399 + assert_eq!(writer.current_seq(), EventSequence::new(5)); 400 + } 401 + 402 + #[test] 403 + fn sync_returns_flushed_events() { 404 + let mgr = setup_manager(64 * 1024); 405 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 406 + 407 + (1..=3).for_each(|i| { 408 + append_test_event(&mut writer, &format!("did:plc:user{i}")); 409 + }); 410 + 411 + let result = writer.sync().unwrap(); 412 + assert_eq!(result.synced_through, EventSequence::new(3)); 413 + assert_eq!(result.flushed_events.len(), 3); 414 + assert_eq!(result.segment_id, SegmentId::new(1)); 415 + 416 + result 417 + .flushed_events 418 + .iter() 419 + .enumerate() 420 + .for_each(|(i, event)| { 421 + assert_eq!(event.seq, EventSequence::new(i as u64 + 1)); 422 + }); 423 + 424 + assert_eq!(writer.synced_seq(), EventSequence::new(3)); 425 + } 426 + 427 + #[test] 428 + fn sync_without_pending_is_noop() { 429 + let mgr = setup_manager(64 * 1024); 430 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 431 + 432 + let result = writer.sync().unwrap(); 433 + assert_eq!(result.synced_through, EventSequence::BEFORE_ALL); 434 + assert!(result.flushed_events.is_empty()); 435 + } 436 + 437 + #[test] 438 + fn second_sync_returns_only_new_events() { 439 + let mgr = setup_manager(64 * 1024); 440 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 441 + 442 + (1..=3).for_each(|i| { 443 + append_test_event(&mut 
writer, &format!("did:plc:user{i}")); 444 + }); 445 + writer.sync().unwrap(); 446 + 447 + (4..=5).for_each(|i| { 448 + append_test_event(&mut writer, &format!("did:plc:user{i}")); 449 + }); 450 + let result = writer.sync().unwrap(); 451 + assert_eq!(result.synced_through, EventSequence::new(5)); 452 + assert_eq!(result.flushed_events.len(), 2); 453 + assert_eq!(result.flushed_events[0].seq, EventSequence::new(4)); 454 + assert_eq!(result.flushed_events[1].seq, EventSequence::new(5)); 455 + } 456 + 457 + #[test] 458 + fn recovery_preserves_synced_events() { 459 + let mgr = setup_manager(64 * 1024); 460 + 461 + { 462 + let mut writer = 463 + EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 464 + (1..=5).for_each(|i| { 465 + append_test_event(&mut writer, &format!("did:plc:user{i}")); 466 + }); 467 + writer.sync().unwrap(); 468 + } 469 + 470 + mgr.shutdown(); 471 + 472 + let writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 473 + assert_eq!(writer.current_seq(), EventSequence::new(5)); 474 + assert_eq!(writer.synced_seq(), EventSequence::new(5)); 475 + assert_eq!(writer.active_segment_id(), SegmentId::new(1)); 476 + 477 + let fd = mgr.open_for_read(SegmentId::new(1)).unwrap(); 478 + let events = SegmentReader::open(mgr.io(), fd) 479 + .unwrap() 480 + .valid_prefix() 481 + .unwrap(); 482 + assert_eq!(events.len(), 5); 483 + } 484 + 485 + #[test] 486 + fn recovery_loses_unsynced_events() { 487 + let mgr = setup_manager(64 * 1024); 488 + 489 + { 490 + let mut writer = 491 + EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 492 + (1..=3).for_each(|i| { 493 + append_test_event(&mut writer, &format!("did:plc:user{i}")); 494 + }); 495 + writer.sync().unwrap(); 496 + mgr.io().sync_dir(Path::new("/segments")).unwrap(); 497 + 498 + (4..=6).for_each(|i| { 499 + append_test_event(&mut writer, &format!("did:plc:user{i}")); 500 + }); 501 + } 502 + 503 + mgr.shutdown(); 504 + mgr.io().crash(); 505 + 506 + let writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 507 + assert_eq!(writer.current_seq(), EventSequence::new(3)); 508 + assert_eq!(writer.next_seq, EventSequence::new(4)); 509 + } 510 + 511 + #[test] 512 + fn rotation_creates_new_segment() { 513 + let payload_size = 100; 514 + let record_size = EVENT_RECORD_OVERHEAD + payload_size; 515 + let max_segment_size = SEGMENT_HEADER_SIZE + record_size * 3; 516 + 517 + let mgr = setup_manager(max_segment_size as u64); 518 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 519 + 520 + (1..=3).for_each(|i| { 521 + writer 522 + .append( 523 + DidHash::from_did(&format!("did:plc:user{i}")), 524 + EventTypeTag::COMMIT, 525 + vec![0xAA; payload_size], 526 + ) 527 + .unwrap(); 528 + }); 529 + writer.sync().unwrap(); 530 + assert!(writer.rotate_if_needed().unwrap().is_some()); 531 + 532 + assert_eq!(writer.active_segment_id(), SegmentId::new(2)); 533 + assert_eq!( 534 + writer.position(), 535 + SegmentOffset::new(SEGMENT_HEADER_SIZE as u64) 536 + ); 537 + 538 + let segments = mgr.list_segments().unwrap(); 539 + assert_eq!(segments, vec![SegmentId::new(1), SegmentId::new(2)]); 540 + } 541 + 542 + #[test] 543 + fn rotation_seals_old_segment() { 544 + let payload_size = 100; 545 + let record_size = EVENT_RECORD_OVERHEAD + payload_size; 546 + let max_segment_size = SEGMENT_HEADER_SIZE + record_size * 2; 547 + 548 + let mgr = setup_manager(max_segment_size as u64); 549 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), 
DEFAULT_INDEX_INTERVAL).unwrap(); 550 + 551 + (1..=2).for_each(|i| { 552 + writer 553 + .append( 554 + DidHash::from_did(&format!("did:plc:user{i}")), 555 + EventTypeTag::COMMIT, 556 + vec![0xBB; payload_size], 557 + ) 558 + .unwrap(); 559 + }); 560 + writer.sync().unwrap(); 561 + writer.rotate_if_needed().unwrap(); 562 + 563 + assert!(mgr.is_sealed(SegmentId::new(1))); 564 + 565 + let index = SegmentIndex::load(mgr.io(), &mgr.index_path(SegmentId::new(1))) 566 + .unwrap() 567 + .unwrap(); 568 + assert_eq!(index.first_seq(), Some(EventSequence::new(1))); 569 + assert_eq!(index.last_seq(), Some(EventSequence::new(2))); 570 + } 571 + 572 + #[test] 573 + fn sequences_continue_across_rotation() { 574 + let payload_size = 50; 575 + let record_size = EVENT_RECORD_OVERHEAD + payload_size; 576 + let max_segment_size = SEGMENT_HEADER_SIZE + record_size * 2; 577 + 578 + let mgr = setup_manager(max_segment_size as u64); 579 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 580 + 581 + (1..=2).for_each(|i| { 582 + writer 583 + .append( 584 + DidHash::from_did(&format!("did:plc:user{i}")), 585 + EventTypeTag::COMMIT, 586 + vec![0xCC; payload_size], 587 + ) 588 + .unwrap(); 589 + }); 590 + writer.sync().unwrap(); 591 + writer.rotate_if_needed().unwrap(); 592 + 593 + let seq = writer 594 + .append( 595 + DidHash::from_did("did:plc:user3"), 596 + EventTypeTag::COMMIT, 597 + vec![0xCC; payload_size], 598 + ) 599 + .unwrap(); 600 + assert_eq!(seq, EventSequence::new(3)); 601 + } 602 + 603 + #[test] 604 + fn recovery_after_rotation() { 605 + let payload_size = 50; 606 + let record_size = EVENT_RECORD_OVERHEAD + payload_size; 607 + let max_segment_size = SEGMENT_HEADER_SIZE + record_size * 2; 608 + 609 + let mgr = setup_manager(max_segment_size as u64); 610 + 611 + { 612 + let mut writer = 613 + EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 614 + (1..=2).for_each(|i| { 615 + writer 616 + .append( 617 + DidHash::from_did(&format!("did:plc:user{i}")), 618 + EventTypeTag::COMMIT, 619 + vec![0xDD; payload_size], 620 + ) 621 + .unwrap(); 622 + }); 623 + writer.sync().unwrap(); 624 + writer.rotate_if_needed().unwrap(); 625 + 626 + writer 627 + .append( 628 + DidHash::from_did("did:plc:user3"), 629 + EventTypeTag::COMMIT, 630 + vec![0xDD; payload_size], 631 + ) 632 + .unwrap(); 633 + writer.sync().unwrap(); 634 + } 635 + 636 + mgr.shutdown(); 637 + 638 + let writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 639 + assert_eq!(writer.active_segment_id(), SegmentId::new(2)); 640 + assert_eq!(writer.current_seq(), EventSequence::new(3)); 641 + assert_eq!(writer.next_seq, EventSequence::new(4)); 642 + } 643 + 644 + #[test] 645 + fn recovery_sealed_last_segment() { 646 + let payload_size = 50; 647 + let record_size = EVENT_RECORD_OVERHEAD + payload_size; 648 + let max_segment_size = SEGMENT_HEADER_SIZE + record_size * 2; 649 + 650 + let mgr = setup_manager(max_segment_size as u64); 651 + 652 + { 653 + let mut writer = 654 + EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 655 + (1..=2).for_each(|i| { 656 + writer 657 + .append( 658 + DidHash::from_did(&format!("did:plc:user{i}")), 659 + EventTypeTag::COMMIT, 660 + vec![0xEE; payload_size], 661 + ) 662 + .unwrap(); 663 + }); 664 + writer.sync().unwrap(); 665 + writer.rotate_if_needed().unwrap(); 666 + } 667 + 668 + mgr.shutdown(); 669 + mgr.io().crash(); 670 + 671 + let writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 
672 + assert_eq!(writer.next_seq, EventSequence::new(3)); 673 + } 674 + 675 + #[test] 676 + fn recovery_empty_active_after_rotation() { 677 + let payload_size = 50; 678 + let record_size = EVENT_RECORD_OVERHEAD + payload_size; 679 + let max_segment_size = SEGMENT_HEADER_SIZE + record_size * 2; 680 + 681 + let mgr = setup_manager(max_segment_size as u64); 682 + 683 + { 684 + let mut writer = 685 + EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 686 + (1..=2).for_each(|i| { 687 + writer 688 + .append( 689 + DidHash::from_did(&format!("did:plc:user{i}")), 690 + EventTypeTag::COMMIT, 691 + vec![0xEE; payload_size], 692 + ) 693 + .unwrap(); 694 + }); 695 + writer.sync().unwrap(); 696 + writer.rotate_if_needed().unwrap(); 697 + } 698 + 699 + mgr.shutdown(); 700 + 701 + let writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 702 + assert_eq!(writer.next_seq, EventSequence::new(3)); 703 + 704 + let fd = mgr.open_for_read(SegmentId::new(1)).unwrap(); 705 + let events = SegmentReader::open(mgr.io(), fd) 706 + .unwrap() 707 + .valid_prefix() 708 + .unwrap(); 709 + assert_eq!(events.len(), 2); 710 + } 711 + 712 + #[test] 713 + fn checkpoint_creates_index_file() { 714 + let mgr = setup_manager(64 * 1024); 715 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 716 + 717 + (1..=10).for_each(|i| { 718 + append_test_event(&mut writer, &format!("did:plc:user{i}")); 719 + }); 720 + writer.sync().unwrap(); 721 + 722 + writer.checkpoint_index().unwrap(); 723 + 724 + let wip = mgr.index_path(SegmentId::new(1)); 725 + let loaded = SegmentIndex::load(mgr.io(), &wip).unwrap(); 726 + assert!(loaded.is_some()); 727 + } 728 + 729 + #[test] 730 + fn checkpoint_empty_index_is_noop() { 731 + let mgr = setup_manager(64 * 1024); 732 + let writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 733 + 734 + writer.checkpoint_index().unwrap(); 735 + 736 + let wip = mgr.index_path(SegmentId::new(1)); 737 + let loaded = SegmentIndex::load(mgr.io(), &wip).unwrap(); 738 + assert!(loaded.is_none()); 739 + } 740 + 741 + #[test] 742 + fn current_seq_and_synced_seq_diverge_before_sync() { 743 + let mgr = setup_manager(64 * 1024); 744 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 745 + 746 + append_test_event(&mut writer, "did:plc:user1"); 747 + append_test_event(&mut writer, "did:plc:user2"); 748 + 749 + assert_eq!(writer.current_seq(), EventSequence::new(2)); 750 + assert_eq!(writer.synced_seq(), EventSequence::BEFORE_ALL); 751 + 752 + writer.sync().unwrap(); 753 + 754 + assert_eq!(writer.current_seq(), EventSequence::new(2)); 755 + assert_eq!(writer.synced_seq(), EventSequence::new(2)); 756 + } 757 + 758 + #[test] 759 + fn sparse_index_built_at_intervals() { 760 + let mgr = setup_manager(64 * 1024); 761 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), 4).unwrap(); 762 + 763 + (1..=10).for_each(|i| { 764 + append_test_event(&mut writer, &format!("did:plc:user{i}")); 765 + }); 766 + writer.sync().unwrap(); 767 + 768 + assert_eq!(writer.active_index.first_seq(), Some(EventSequence::new(1))); 769 + assert!(writer.active_index.entry_count() >= 3); 770 + assert!(writer.active_index.lookup(EventSequence::new(1)).is_some()); 771 + assert!(writer.active_index.lookup(EventSequence::new(5)).is_some()); 772 + } 773 + 774 + #[test] 775 + fn multi_rotation_and_recovery() { 776 + let payload_size = 30; 777 + let record_size = EVENT_RECORD_OVERHEAD + payload_size; 778 + 
let max_segment_size = SEGMENT_HEADER_SIZE + record_size * 3; 779 + 780 + let mgr = setup_manager(max_segment_size as u64); 781 + 782 + { 783 + let mut writer = 784 + EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 785 + (1..=9).for_each(|i| { 786 + writer 787 + .append( 788 + DidHash::from_did(&format!("did:plc:user{i}")), 789 + EventTypeTag::COMMIT, 790 + vec![i as u8; payload_size], 791 + ) 792 + .unwrap(); 793 + 794 + if i % 3 == 0 { 795 + writer.sync().unwrap(); 796 + writer.rotate_if_needed().unwrap(); 797 + } 798 + }); 799 + writer.sync().unwrap(); 800 + } 801 + 802 + mgr.shutdown(); 803 + 804 + let writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 805 + assert_eq!(writer.next_seq, EventSequence::new(10)); 806 + 807 + let segments = mgr.list_segments().unwrap(); 808 + assert!(segments.len() >= 3); 809 + } 810 + 811 + #[test] 812 + fn shutdown_syncs_and_checkpoints() { 813 + let mgr = setup_manager(64 * 1024); 814 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 815 + 816 + (1..=5).for_each(|i| { 817 + append_test_event(&mut writer, &format!("did:plc:user{i}")); 818 + }); 819 + 820 + assert_eq!(writer.synced_seq(), EventSequence::BEFORE_ALL); 821 + 822 + writer.shutdown().unwrap(); 823 + 824 + assert_eq!(writer.synced_seq(), EventSequence::new(5)); 825 + 826 + let wip = mgr.index_path(SegmentId::new(1)); 827 + assert!(SegmentIndex::load(mgr.io(), &wip).unwrap().is_some()); 828 + } 829 + 830 + #[test] 831 + fn rotation_indexes_last_event() { 832 + let payload_size = 50; 833 + let record_size = EVENT_RECORD_OVERHEAD + payload_size; 834 + let max_segment_size = SEGMENT_HEADER_SIZE + record_size * 5; 835 + 836 + let mgr = setup_manager(max_segment_size as u64); 837 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap(); 838 + 839 + (1..=5).for_each(|i| { 840 + writer 841 + .append( 842 + DidHash::from_did(&format!("did:plc:user{i}")), 843 + EventTypeTag::COMMIT, 844 + vec![0xFF; payload_size], 845 + ) 846 + .unwrap(); 847 + }); 848 + writer.sync().unwrap(); 849 + writer.rotate_if_needed().unwrap(); 850 + 851 + let index = SegmentIndex::load(mgr.io(), &mgr.index_path(SegmentId::new(1))) 852 + .unwrap() 853 + .unwrap(); 854 + 855 + assert_eq!(index.last_seq(), Some(EventSequence::new(5))); 856 + assert!(index.lookup(EventSequence::new(5)).is_some()); 857 + } 858 + 859 + #[test] 860 + fn open_idempotent_on_fresh() { 861 + let mgr = setup_manager(64 * 1024); 862 + 863 + { 864 + let _writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 865 + } 866 + mgr.shutdown(); 867 + 868 + let writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 869 + assert_eq!(writer.active_segment_id(), SegmentId::new(1)); 870 + assert_eq!(writer.current_seq(), EventSequence::BEFORE_ALL); 871 + } 872 + 873 + #[test] 874 + fn append_after_recovery_continues_sequence() { 875 + let mgr = setup_manager(64 * 1024); 876 + 877 + { 878 + let mut writer = 879 + EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 880 + (1..=3).for_each(|i| { 881 + append_test_event(&mut writer, &format!("did:plc:user{i}")); 882 + }); 883 + writer.sync().unwrap(); 884 + } 885 + 886 + mgr.shutdown(); 887 + 888 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 889 + let seq = append_test_event(&mut writer, "did:plc:user4"); 890 + assert_eq!(seq, EventSequence::new(4)); 891 + writer.sync().unwrap(); 892 + 893 + let 
fd = mgr.open_for_read(SegmentId::new(1)).unwrap(); 894 + let events = SegmentReader::open(mgr.io(), fd) 895 + .unwrap() 896 + .valid_prefix() 897 + .unwrap(); 898 + assert_eq!(events.len(), 4); 899 + assert_eq!(events[3].seq, EventSequence::new(4)); 900 + } 901 + 902 + #[test] 903 + fn recovery_falls_back_to_scan_when_index_corrupt() { 904 + let payload_size = 50; 905 + let record_size = EVENT_RECORD_OVERHEAD + payload_size; 906 + let max_segment_size = SEGMENT_HEADER_SIZE + record_size * 2; 907 + 908 + let mgr = setup_manager(max_segment_size as u64); 909 + 910 + { 911 + let mut writer = 912 + EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 913 + (1..=2).for_each(|i| { 914 + writer 915 + .append( 916 + DidHash::from_did(&format!("did:plc:user{i}")), 917 + EventTypeTag::COMMIT, 918 + vec![0xAA; payload_size], 919 + ) 920 + .unwrap(); 921 + }); 922 + writer.sync().unwrap(); 923 + writer.rotate_if_needed().unwrap(); 924 + 925 + (3..=4).for_each(|i| { 926 + writer 927 + .append( 928 + DidHash::from_did(&format!("did:plc:user{i}")), 929 + EventTypeTag::COMMIT, 930 + vec![0xAA; payload_size], 931 + ) 932 + .unwrap(); 933 + }); 934 + writer.sync().unwrap(); 935 + writer.rotate_if_needed().unwrap(); 936 + } 937 + 938 + mgr.shutdown(); 939 + 940 + let index_path = mgr.index_path(SegmentId::new(1)); 941 + let fd = mgr 942 + .io() 943 + .open(&index_path, crate::OpenOptions::read_write()) 944 + .unwrap(); 945 + mgr.io().write_all_at(fd, 0, b"CORRUPT_GARBAGE").unwrap(); 946 + mgr.io().sync(fd).unwrap(); 947 + mgr.io().close(fd).unwrap(); 948 + 949 + let index_path_2 = mgr.index_path(SegmentId::new(2)); 950 + let fd2 = mgr 951 + .io() 952 + .open(&index_path_2, crate::OpenOptions::read_write()) 953 + .unwrap(); 954 + mgr.io().write_all_at(fd2, 0, b"CORRUPT_GARBAGE").unwrap(); 955 + mgr.io().sync(fd2).unwrap(); 956 + mgr.io().close(fd2).unwrap(); 957 + 958 + let writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 959 + assert_eq!(writer.next_seq, EventSequence::new(5)); 960 + } 961 + 962 + #[test] 963 + fn rotation_not_needed_returns_false() { 964 + let mgr = setup_manager(64 * 1024); 965 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), DEFAULT_INDEX_INTERVAL).unwrap(); 966 + 967 + append_test_event(&mut writer, "did:plc:user1"); 968 + writer.sync().unwrap(); 969 + 970 + assert!(writer.rotate_if_needed().unwrap().is_none()); 971 + } 972 + }
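Taken together, the tests above trace one write path: append buffers an event and hands back its sequence, sync is the only point where synced_seq advances (and the only point durability is promised), and rotate_if_needed declines while anything is still pending. A sketch of that loop from a caller's perspective, using the same simulated-IO harness the tests construct; a real deployment would hand SegmentManager a RealIO instead:

use std::path::PathBuf;
use std::sync::Arc;

use tranquil_store::SimulatedIO;
use tranquil_store::eventlog::{DidHash, EventLogWriter, EventTypeTag, SegmentManager};

fn main() -> std::io::Result<()> {
    let sim = SimulatedIO::pristine(1);
    let mgr = Arc::new(SegmentManager::new(
        sim,
        PathBuf::from("/segments"),
        64 * 1024 * 1024, // max segment size before rotation kicks in
    )?);
    let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256)?;

    let seq = writer.append(
        DidHash::from_did("did:plc:example"),
        EventTypeTag::COMMIT,
        b"payload".to_vec(),
    )?;

    // Appended but not yet durable: current_seq runs ahead of synced_seq.
    assert_eq!(writer.current_seq(), seq);
    let result = writer.sync()?;
    assert_eq!(result.synced_through, seq);

    // Well under the 64 MiB threshold, so rotation is a no-op here.
    assert!(writer.rotate_if_needed()?.is_none());

    // Shutdown = final sync + index checkpoint.
    writer.shutdown()
}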
+7
crates/tranquil-store/src/fsync_order.rs
··· 1 + use std::io; 2 + 3 + use crate::blockstore::BlocksSynced; 4 + 5 + pub trait PostBlockstoreHook: Send + Sync { 6 + fn on_blocks_synced(&self, proof: &BlocksSynced) -> io::Result<()>; 7 + }
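PostBlockstoreHook appears to be the seam the module name advertises: the blockstore invokes the hook only after its own fsync completes, passing a BlocksSynced proof, so the callee can act on blocks it knows are durable (releasing sequenced events being the obvious consumer). That ordering reading is inferred from the module name and the proof parameter, not spelled out in this diff. A hypothetical implementor, treating BlocksSynced as an opaque token:

use std::io;
use std::sync::atomic::{AtomicU64, Ordering};

use tranquil_store::blockstore::BlocksSynced;
use tranquil_store::fsync_order::PostBlockstoreHook;

// Hypothetical hook: counts durable batches. A real hook might instead
// release sequenced events to firehose subscribers at this point.
struct CountSyncs {
    batches: AtomicU64,
}

impl PostBlockstoreHook for CountSyncs {
    fn on_blocks_synced(&self, _proof: &BlocksSynced) -> io::Result<()> {
        self.batches.fetch_add(1, Ordering::Relaxed);
        Ok(())
    }
}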
+3 -1
crates/tranquil-store/src/lib.rs
··· 1 1 pub mod blockstore; 2 + pub mod eventlog; 3 + pub mod fsync_order; 2 4 mod harness; 3 5 mod io; 4 6 mod record; ··· 16 18 FILE_MAGIC, FORMAT_VERSION, HEADER_SIZE, MAX_RECORD_PAYLOAD, RECORD_OVERHEAD, ReadRecord, 17 19 RecordReader, RecordWriter, 18 20 }; 19 - pub use sim::{FaultConfig, SimulatedIO}; 21 + pub use sim::{FaultConfig, OpRecord, SimulatedIO};
+642
crates/tranquil-store/tests/eventlog_crash.rs
··· 1 + use std::path::{Path, PathBuf}; 2 + use std::sync::Arc; 3 + 4 + use tranquil_store::eventlog::{ 5 + DidHash, EVENT_RECORD_OVERHEAD, EventLogWriter, EventSequence, EventTypeTag, 6 + SEGMENT_HEADER_SIZE, SegmentId, SegmentManager, SegmentReader, SegmentWriter, TimestampMicros, 7 + ValidEvent, rebuild_from_segment, 8 + }; 9 + use tranquil_store::{FaultConfig, OpenOptions, SimulatedIO, StorageIO}; 10 + 11 + fn setup_manager(sim: SimulatedIO, max_segment_size: u64) -> Arc<SegmentManager<SimulatedIO>> { 12 + Arc::new(SegmentManager::new(sim, PathBuf::from("/segments"), max_segment_size).unwrap()) 13 + } 14 + 15 + fn append_test_event(writer: &mut EventLogWriter<SimulatedIO>, seq_hint: u64) -> EventSequence { 16 + writer 17 + .append( 18 + DidHash::from_did(&format!("did:plc:crash{seq_hint}")), 19 + EventTypeTag::COMMIT, 20 + format!("payload-{seq_hint}").into_bytes(), 21 + ) 22 + .unwrap() 23 + } 24 + 25 + #[test] 26 + fn synced_events_survive_crash() { 27 + (0..500u64).for_each(|seed| { 28 + let sim = SimulatedIO::pristine(seed); 29 + let mgr = setup_manager(sim, 64 * 1024); 30 + 31 + let n = 10u64; 32 + { 33 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap(); 34 + (1..=n).for_each(|i| { 35 + append_test_event(&mut writer, i); 36 + }); 37 + writer.sync().unwrap(); 38 + mgr.io().sync_dir(Path::new("/segments")).unwrap(); 39 + } 40 + 41 + mgr.shutdown(); 42 + mgr.io().crash(); 43 + 44 + let writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap(); 45 + assert_eq!( 46 + writer.synced_seq(), 47 + EventSequence::new(n), 48 + "seed {seed}: expected all synced events to survive" 49 + ); 50 + 51 + let fd = mgr.open_for_read(SegmentId::new(1)).unwrap(); 52 + let events = SegmentReader::open(mgr.io(), fd) 53 + .unwrap() 54 + .valid_prefix() 55 + .unwrap(); 56 + assert_eq!(events.len(), n as usize, "seed {seed}"); 57 + 58 + events.iter().enumerate().for_each(|(i, e)| { 59 + assert_eq!(e.seq, EventSequence::new(i as u64 + 1)); 60 + }); 61 + }); 62 + } 63 + 64 + #[test] 65 + fn unsynced_events_lost_on_crash() { 66 + (0..500u64).for_each(|seed| { 67 + let sim = SimulatedIO::pristine(seed); 68 + let mgr = setup_manager(sim, 64 * 1024); 69 + 70 + let synced_count = 5u64; 71 + let unsynced_count = 5u64; 72 + { 73 + let mut writer = 74 + EventLogWriter::open(Arc::clone(&mgr), 256).unwrap(); 75 + (1..=synced_count).for_each(|i| { 76 + append_test_event(&mut writer, i); 77 + }); 78 + writer.sync().unwrap(); 79 + mgr.io().sync_dir(Path::new("/segments")).unwrap(); 80 + 81 + (synced_count + 1..=synced_count + unsynced_count).for_each(|i| { 82 + append_test_event(&mut writer, i); 83 + }); 84 + } 85 + 86 + mgr.shutdown(); 87 + mgr.io().crash(); 88 + 89 + let writer = 90 + EventLogWriter::open(Arc::clone(&mgr), 256).unwrap(); 91 + let recovered_count = writer.synced_seq().raw(); 92 + assert_eq!( 93 + recovered_count, synced_count, 94 + "seed {seed}: pristine IO should recover exactly {synced_count} synced events, got {recovered_count}" 95 + ); 96 + }); 97 + } 98 + 99 + #[test] 100 + fn sequence_monotonicity_after_recovery() { 101 + (0..500u64).for_each(|seed| { 102 + let sim = SimulatedIO::new(seed, FaultConfig::moderate()); 103 + let mgr = setup_manager(sim, 64 * 1024); 104 + 105 + let crash_point = (seed % 15) + 3; 106 + let write_result: Result<(), std::io::Error> = (|| { 107 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256)?; 108 + (1..=crash_point).try_for_each(|i| -> std::io::Result<()> { 109 + writer.append( 110 + DidHash::from_did(&format!("did:plc:mono{i}")), 111 
+ EventTypeTag::COMMIT, 112 + format!("data-{i}").into_bytes(), 113 + )?; 114 + if i % 3 == 0 { 115 + writer.sync()?; 116 + mgr.io().sync_dir(Path::new("/segments"))?; 117 + } 118 + Ok(()) 119 + })?; 120 + Ok(()) 121 + })(); 122 + let _ = write_result; 123 + 124 + mgr.shutdown(); 125 + mgr.io().crash(); 126 + 127 + let mgr_clone = Arc::clone(&mgr); 128 + let recovery_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { 129 + let mut writer = EventLogWriter::open(Arc::clone(&mgr_clone), 256)?; 130 + let new_seqs: Vec<EventSequence> = (0..5u64) 131 + .filter_map(|i| { 132 + writer 133 + .append( 134 + DidHash::from_did(&format!("did:plc:post{i}")), 135 + EventTypeTag::COMMIT, 136 + format!("post-recovery-{i}").into_bytes(), 137 + ) 138 + .ok() 139 + }) 140 + .collect(); 141 + Ok::<_, std::io::Error>(new_seqs) 142 + })); 143 + 144 + let Ok(Ok(new_seqs)) = recovery_result else { 145 + return; 146 + }; 147 + 148 + new_seqs.windows(2).for_each(|pair| { 149 + assert!( 150 + pair[1].raw() == pair[0].raw() + 1, 151 + "seed {seed}: non-contiguous seqs {} -> {}", 152 + pair[0], 153 + pair[1], 154 + ); 155 + }); 156 + 157 + if let Some(first_new) = new_seqs.first() { 158 + assert!(first_new.raw() > 0, "seed {seed}: new sequence starts at 0"); 159 + } 160 + }); 161 + } 162 + 163 + #[test] 164 + fn partial_event_truncated_on_recovery() { 165 + (0..500u64).for_each(|seed| { 166 + let sim = SimulatedIO::pristine(seed); 167 + let mgr = setup_manager(sim, 64 * 1024); 168 + 169 + let complete_count = 5u64; 170 + { 171 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap(); 172 + (1..=complete_count).for_each(|i| { 173 + append_test_event(&mut writer, i); 174 + }); 175 + writer.sync().unwrap(); 176 + mgr.io().sync_dir(Path::new("/segments")).unwrap(); 177 + } 178 + 179 + let fd = mgr.open_for_read(SegmentId::new(1)).unwrap(); 180 + let file_size = mgr.io().file_size(fd).unwrap(); 181 + let partial_bytes = ((seed % 20) + 1) as usize; 182 + let junk: Vec<u8> = (0..partial_bytes) 183 + .map(|i| (i as u8).wrapping_add(seed as u8)) 184 + .collect(); 185 + mgr.io().write_all_at(fd, file_size, &junk).unwrap(); 186 + mgr.io().sync(fd).unwrap(); 187 + 188 + mgr.shutdown(); 189 + mgr.io().crash(); 190 + 191 + let writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap(); 192 + assert_eq!( 193 + writer.synced_seq(), 194 + EventSequence::new(complete_count), 195 + "seed {seed}: partial write should be truncated, preserving {complete_count} events" 196 + ); 197 + }); 198 + } 199 + 200 + #[test] 201 + fn cross_segment_recovery() { 202 + (0..200u64).for_each(|seed| { 203 + let payload_size = 50; 204 + let record_size = EVENT_RECORD_OVERHEAD + payload_size; 205 + let events_per_segment = 3; 206 + let max_segment_size = (SEGMENT_HEADER_SIZE + record_size * events_per_segment) as u64; 207 + 208 + let sim = SimulatedIO::pristine(seed); 209 + let mgr = setup_manager(sim, max_segment_size); 210 + 211 + let sealed_events = 9u64; 212 + let trailing_unsynced = 2u64; 213 + let total_events = sealed_events + trailing_unsynced; 214 + { 215 + let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap(); 216 + (1..=total_events).for_each(|i| { 217 + writer 218 + .append( 219 + DidHash::from_did(&format!("did:plc:xseg{i}")), 220 + EventTypeTag::COMMIT, 221 + vec![i as u8; payload_size], 222 + ) 223 + .unwrap(); 224 + 225 + if i % events_per_segment as u64 == 0 && i <= sealed_events { 226 + writer.sync().unwrap(); 227 + writer.rotate_if_needed().unwrap(); 228 + } 229 + }); 230 + 
#[test]
fn partial_event_truncated_on_recovery() {
    (0..500u64).for_each(|seed| {
        let sim = SimulatedIO::pristine(seed);
        let mgr = setup_manager(sim, 64 * 1024);

        let complete_count = 5u64;
        {
            let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap();
            (1..=complete_count).for_each(|i| {
                append_test_event(&mut writer, i);
            });
            writer.sync().unwrap();
            mgr.io().sync_dir(Path::new("/segments")).unwrap();
        }

        let fd = mgr.open_for_read(SegmentId::new(1)).unwrap();
        let file_size = mgr.io().file_size(fd).unwrap();
        let partial_bytes = ((seed % 20) + 1) as usize;
        let junk: Vec<u8> = (0..partial_bytes)
            .map(|i| (i as u8).wrapping_add(seed as u8))
            .collect();
        mgr.io().write_all_at(fd, file_size, &junk).unwrap();
        mgr.io().sync(fd).unwrap();

        mgr.shutdown();
        mgr.io().crash();

        let writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap();
        assert_eq!(
            writer.synced_seq(),
            EventSequence::new(complete_count),
            "seed {seed}: partial write should be truncated, preserving {complete_count} events"
        );
    });
}

#[test]
fn cross_segment_recovery() {
    (0..200u64).for_each(|seed| {
        let payload_size = 50;
        let record_size = EVENT_RECORD_OVERHEAD + payload_size;
        let events_per_segment = 3;
        let max_segment_size = (SEGMENT_HEADER_SIZE + record_size * events_per_segment) as u64;

        let sim = SimulatedIO::pristine(seed);
        let mgr = setup_manager(sim, max_segment_size);

        let sealed_events = 9u64;
        let trailing_unsynced = 2u64;
        let total_events = sealed_events + trailing_unsynced;
        {
            let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap();
            (1..=total_events).for_each(|i| {
                writer
                    .append(
                        DidHash::from_did(&format!("did:plc:xseg{i}")),
                        EventTypeTag::COMMIT,
                        vec![i as u8; payload_size],
                    )
                    .unwrap();

                if i % events_per_segment as u64 == 0 && i <= sealed_events {
                    writer.sync().unwrap();
                    writer.rotate_if_needed().unwrap();
                }
            });

            mgr.io().sync_dir(Path::new("/segments")).unwrap();
        }

        mgr.shutdown();
        mgr.io().crash();

        let writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap();
        let recovered = writer.synced_seq().raw();

        let sealed_segments = mgr.list_segments().unwrap();
        let sealed_count = sealed_segments.len().saturating_sub(1);

        assert!(
            recovered >= (sealed_count as u64) * events_per_segment as u64,
            "seed {seed}: recovered {recovered} but expected at least {} sealed events",
            sealed_count * events_per_segment
        );

        sealed_segments[..sealed_count].iter().for_each(|&seg_id| {
            let fd = mgr.open_for_read(seg_id).unwrap();
            let events = SegmentReader::open(mgr.io(), fd)
                .unwrap()
                .valid_prefix()
                .unwrap();
            assert_eq!(
                events.len(),
                events_per_segment,
                "seed {seed}: sealed segment {seg_id} should have {events_per_segment} events"
            );
        });
    });
}
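// The two recovery tests above encode the torn-write rule: recovery keeps the
// longest valid prefix of a segment and discards any trailing partial record,
// and segments sealed via `rotate_if_needed()` must always be recovered whole.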
#[test]
fn corrupt_index_triggers_rebuild() {
    (0..200u64).for_each(|seed| {
        let payload_size = 50;
        let record_size = EVENT_RECORD_OVERHEAD + payload_size;
        let max_segment_size = (SEGMENT_HEADER_SIZE + record_size * 3) as u64;

        let sim = SimulatedIO::pristine(seed);
        let mgr = setup_manager(sim, max_segment_size);

        {
            let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap();
            (1..=6).for_each(|i| {
                writer
                    .append(
                        DidHash::from_did(&format!("did:plc:idx{i}")),
                        EventTypeTag::COMMIT,
                        vec![0xAA; payload_size],
                    )
                    .unwrap();
                if i % 3 == 0 {
                    writer.sync().unwrap();
                    writer.rotate_if_needed().unwrap();
                }
            });
            writer.sync().unwrap();
        }
        mgr.shutdown();

        let index_path = mgr.index_path(SegmentId::new(1));
        if let Ok(fd) = mgr.io().open(&index_path, OpenOptions::read_write()) {
            mgr.io()
                .write_all_at(fd, 0, b"CORRUPT_INDEX_GARBAGE_DATA_XYZ")
                .unwrap();
            mgr.io().sync(fd).unwrap();
            mgr.io().close(fd).unwrap();
        }

        let writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap();

        assert!(
            writer.synced_seq().raw() >= 6,
            "seed {seed}: recovery after corrupt index should find all events, got seq {}",
            writer.synced_seq()
        );
    });
}

#[test]
fn large_sealed_segment_index_rebuild_latency() {
    let payload_size = 1024;
    let event_count = 64_000u64;

    let sim = SimulatedIO::pristine(42);
    let mgr = setup_manager(sim, 256 * 1024 * 1024);

    {
        let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap();
        (1..=event_count).for_each(|i| {
            writer
                .append(
                    DidHash::from_did(&format!("did:plc:bench{i}")),
                    EventTypeTag::COMMIT,
                    vec![0xBB; payload_size],
                )
                .unwrap();
        });
        writer.sync().unwrap();
        writer.checkpoint_index().unwrap();
    }
    mgr.shutdown();

    let index_path = mgr.index_path(SegmentId::new(1));
    let _ = mgr.io().delete(&index_path);

    let fd = mgr.open_for_read(SegmentId::new(1)).unwrap();

    let start = std::time::Instant::now();
    let (index, last_seq) = rebuild_from_segment(mgr.io(), fd, 256).unwrap();
    let elapsed = start.elapsed();

    assert_eq!(last_seq, Some(EventSequence::new(event_count)));
    assert!(index.entry_count() > 0);

    assert!(
        elapsed.as_secs() < 2,
        "index rebuild took {elapsed:?}, exceeds 2s budget"
    );
}
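// Both index tests rely on the same design assumption: the per-segment index
// is a pure cache. Corrupting or deleting it may cost a full segment scan
// (bounded here at two seconds for a 64k-event segment under simulated IO),
// but must never lose events.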
#[test]
fn corrupt_metadata_triggers_scan() {
    let sim = SimulatedIO::pristine(42);
    let mgr = setup_manager(sim, 64 * 1024);

    {
        let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap();
        (1..=10).for_each(|i| {
            append_test_event(&mut writer, i);
        });
        writer.sync().unwrap();
        writer.checkpoint_index().unwrap();
    }
    mgr.shutdown();

    let index_path = mgr.index_path(SegmentId::new(1));
    if let Ok(fd) = mgr.io().open(&index_path, OpenOptions::read_write()) {
        mgr.io()
            .write_all_at(fd, 0, b"TOTALLY_CORRUPT_META")
            .unwrap();
        mgr.io().sync(fd).unwrap();
        mgr.io().close(fd).unwrap();
    }

    let writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap();
    assert_eq!(
        writer.synced_seq(),
        EventSequence::new(10),
        "recovery via segment scan should find all 10 events"
    );
}

#[test]
fn pristine_comparison_under_faults() {
    (0..500u64).for_each(|seed| {
        let event_count = 15u64;
        let sync_interval = 5u64;

        let pristine_sim = SimulatedIO::pristine(seed);
        let pristine_mgr = setup_manager(pristine_sim, 64 * 1024);

        {
            let mut writer = EventLogWriter::open(Arc::clone(&pristine_mgr), 256).unwrap();
            (1..=event_count).for_each(|i| {
                writer
                    .append(
                        DidHash::from_did(&format!("did:plc:prist{i}")),
                        EventTypeTag::COMMIT,
                        format!("pristine-{i}").into_bytes(),
                    )
                    .unwrap();
                if i % sync_interval == 0 {
                    writer.sync().unwrap();
                }
            });
            writer.sync().unwrap();
        }
        pristine_mgr.shutdown();

        let pristine_fd = pristine_mgr.open_for_read(SegmentId::new(1)).unwrap();
        let pristine_events = SegmentReader::open(pristine_mgr.io(), pristine_fd)
            .unwrap()
            .valid_prefix()
            .unwrap();

        let faulty_sim = SimulatedIO::new(seed, FaultConfig::moderate());
        let faulty_mgr = setup_manager(faulty_sim, 64 * 1024);

        let write_ok = (|| -> std::io::Result<()> {
            let mut writer = EventLogWriter::open(Arc::clone(&faulty_mgr), 256)?;
            (1..=event_count).try_for_each(|i| -> std::io::Result<()> {
                writer.append(
                    DidHash::from_did(&format!("did:plc:prist{i}")),
                    EventTypeTag::COMMIT,
                    format!("pristine-{i}").into_bytes(),
                )?;
                if i % sync_interval == 0 {
                    let _ = writer.sync();
                    let _ = faulty_mgr.io().sync_dir(Path::new("/segments"));
                }
                Ok(())
            })?;
            let _ = writer.sync();
            Ok(())
        })();
        let _ = write_ok;

        faulty_mgr.shutdown();
        faulty_mgr.io().crash();

        let faulty_clone = Arc::clone(&faulty_mgr);
        let recovery = std::panic::catch_unwind(std::panic::AssertUnwindSafe(
            || -> std::io::Result<Option<Vec<ValidEvent>>> {
                let recovered_writer = EventLogWriter::open(Arc::clone(&faulty_clone), 256)?;

                let recovered_seq = recovered_writer.synced_seq().raw();
                assert!(
                    recovered_seq <= event_count,
                    "seed {seed}: recovered {recovered_seq} > written {event_count}"
                );

                if recovered_seq == 0 {
                    return Ok(None);
                }

                let fd = faulty_clone.open_for_read(SegmentId::new(1))?;
                let events = SegmentReader::open(faulty_clone.io(), fd)?.valid_prefix()?;
                Ok(Some(events))
            },
        ));

        if let Ok(Ok(Some(recovered_events))) = recovery {
            let is_prefix = recovered_events
                .iter()
                .zip(pristine_events.iter())
                .all(|(r, p)| r.seq == p.seq && r.payload == p.payload);

            assert!(
                is_prefix,
                "seed {seed}: recovered events must be a prefix of pristine"
            );
        }
    });
}
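// Oracle technique used above and in the parameterized variant below: run the
// same workload against pristine and fault-injected IO with the same seed,
// then require that whatever the faulty run recovers is an exact prefix of
// the pristine run's events.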
#[test]
fn bit_flip_detected_by_checksum() {
    (0..1000u64).for_each(|seed| {
        let sim = SimulatedIO::pristine(seed);
        let dir = Path::new("/test");
        sim.mkdir(dir).unwrap();
        sim.sync_dir(dir).unwrap();

        let fd = sim
            .open(Path::new("/test/segment.tqe"), OpenOptions::read_write())
            .unwrap();
        let mut writer =
            SegmentWriter::new(&sim, fd, SegmentId::new(1), EventSequence::new(1)).unwrap();

        let data_len = ((seed % 256) as usize).max(1);
        let event = ValidEvent {
            seq: EventSequence::new(1),
            timestamp: TimestampMicros::new(1_000_000),
            did_hash: DidHash::from_did("did:plc:bitflip"),
            event_type: EventTypeTag::COMMIT,
            payload: vec![0xAA; data_len],
        };
        writer.append_event(&sim, &event).unwrap();
        writer.sync(&sim).unwrap();

        let record_start = SEGMENT_HEADER_SIZE as u64;
        let record_end = record_start + EVENT_RECORD_OVERHEAD as u64 + data_len as u64;
        let flip_pos = record_start + (seed.wrapping_mul(7) % (record_end - record_start));
        let flip_bit = (seed.wrapping_mul(13) % 8) as u8;

        let mut byte_buf = [0u8; 1];
        sim.read_exact_at(fd, flip_pos, &mut byte_buf).unwrap();
        byte_buf[0] ^= 1 << flip_bit;
        sim.write_all_at(fd, flip_pos, &byte_buf).unwrap();

        use tranquil_store::eventlog::ReadEventRecord;
        let mut reader = SegmentReader::open(&sim, fd).unwrap();
        let record = reader.next().unwrap().unwrap();
        assert!(
            !matches!(record, ReadEventRecord::Valid { .. }),
            "seed {seed}: bit flip at offset {flip_pos} bit {flip_bit} was not detected"
        );
    });
}

fn fault_configs() -> Vec<(&'static str, FaultConfig)> {
    vec![
        (
            "partial_writes_only",
            FaultConfig {
                partial_write_probability: 0.15,
                ..FaultConfig::none()
            },
        ),
        (
            "sync_failures_only",
            FaultConfig {
                sync_failure_probability: 0.10,
                dir_sync_failure_probability: 0.05,
                ..FaultConfig::none()
            },
        ),
        ("combined", FaultConfig::moderate()),
        (
            "bit_flips_only",
            FaultConfig {
                bit_flip_on_read_probability: 0.05,
                ..FaultConfig::none()
            },
        ),
    ]
}
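// Each named config isolates a single failure mode (torn writes, failed
// fsyncs, read-side bit flips) so a regression points at the specific fault
// handling path, while "combined" exercises their interaction.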
#[test]
fn pristine_comparison_parameterized_faults() {
    fault_configs().iter().for_each(|(config_name, config)| {
        (0..200u64).for_each(|seed| {
            let event_count = 10u64;

            let pristine_sim = SimulatedIO::pristine(seed);
            let pristine_mgr = setup_manager(pristine_sim, 64 * 1024);
            {
                let mut writer = EventLogWriter::open(Arc::clone(&pristine_mgr), 256).unwrap();
                (1..=event_count).for_each(|i| {
                    writer
                        .append(
                            DidHash::from_did(&format!("did:plc:param{i}")),
                            EventTypeTag::COMMIT,
                            format!("param-{i}").into_bytes(),
                        )
                        .unwrap();
                    if i % 4 == 0 {
                        writer.sync().unwrap();
                    }
                });
                writer.sync().unwrap();
            }
            pristine_mgr.shutdown();

            let pristine_fd = pristine_mgr.open_for_read(SegmentId::new(1)).unwrap();
            let pristine_events = SegmentReader::open(pristine_mgr.io(), pristine_fd)
                .unwrap()
                .valid_prefix()
                .unwrap();

            let faulty_sim = SimulatedIO::new(seed, *config);
            let faulty_mgr = setup_manager(faulty_sim, 64 * 1024);
            let _ = (|| -> std::io::Result<()> {
                let mut writer = EventLogWriter::open(Arc::clone(&faulty_mgr), 256)?;
                (1..=event_count).try_for_each(|i| -> std::io::Result<()> {
                    writer.append(
                        DidHash::from_did(&format!("did:plc:param{i}")),
                        EventTypeTag::COMMIT,
                        format!("param-{i}").into_bytes(),
                    )?;
                    if i % 4 == 0 {
                        let _ = writer.sync();
                        let _ = faulty_mgr.io().sync_dir(Path::new("/segments"));
                    }
                    Ok(())
                })?;
                let _ = writer.sync();
                Ok(())
            })();

            faulty_mgr.shutdown();
            faulty_mgr.io().crash();

            let faulty_clone = Arc::clone(&faulty_mgr);
            let recovery = std::panic::catch_unwind(std::panic::AssertUnwindSafe(
                || -> std::io::Result<Option<Vec<ValidEvent>>> {
                    let recovered_writer = EventLogWriter::open(Arc::clone(&faulty_clone), 256)?;

                    let recovered_seq = recovered_writer.synced_seq().raw();
                    assert!(
                        recovered_seq <= event_count,
                        "config={config_name} seed={seed}: recovered {recovered_seq} > written {event_count}"
                    );

                    if recovered_seq == 0 {
                        return Ok(None);
                    }

                    let fd = faulty_clone.open_for_read(SegmentId::new(1))?;
                    let events = SegmentReader::open(faulty_clone.io(), fd)?.valid_prefix()?;
                    Ok(Some(events))
                },
            ));

            if let Ok(Ok(Some(recovered_events))) = recovery {
                let is_prefix = recovered_events
                    .iter()
                    .zip(pristine_events.iter())
                    .all(|(r, p)| r.seq == p.seq && r.payload == p.payload);

                assert!(
                    is_prefix,
                    "config={config_name} seed={seed}: recovered is not prefix of pristine"
                );
            }
        });
    });
}
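// Note on reproducibility: every test above derives its workload and fault
// schedule from an explicit seed, so (assuming SimulatedIO draws all of its
// nondeterminism from that seed) any failure message names the exact seed
// needed to replay it.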
+677
crates/tranquil-store/tests/eventlog_properties.rs
···
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Duration;

use tranquil_store::eventlog::{
    DidHash, EVENT_RECORD_OVERHEAD, EventLog, EventLogConfig, EventLogReader, EventLogWriter,
    EventSequence, EventTypeTag, MAX_EVENT_PAYLOAD, PayloadError, RawEvent, SEGMENT_HEADER_SIZE,
    SegmentId, SegmentIndex, SegmentManager, SegmentReader, TimestampMicros, ValidEvent,
    decode_payload, encode_payload, to_sequenced_event, validate_payload_size,
};
use tranquil_store::{OpRecord, OpenOptions, SimulatedIO, StorageIO};

fn setup_manager(max_segment_size: u64) -> Arc<SegmentManager<SimulatedIO>> {
    let sim = SimulatedIO::pristine(42);
    Arc::new(SegmentManager::new(sim, PathBuf::from("/segments"), max_segment_size).unwrap())
}

fn append_test_event(writer: &mut EventLogWriter<SimulatedIO>, seq_hint: u64) -> EventSequence {
    writer
        .append(
            DidHash::from_did(&format!("did:plc:prop{seq_hint}")),
            EventTypeTag::COMMIT,
            format!("payload-{seq_hint}").into_bytes(),
        )
        .unwrap()
}

#[test]
fn sequence_assignment_is_contiguous() {
    let n = 100u64;
    let mgr = setup_manager(64 * 1024);
    let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap();

    let seqs: Vec<EventSequence> = (1..=n).map(|i| append_test_event(&mut writer, i)).collect();

    seqs.iter().enumerate().for_each(|(i, seq)| {
        assert_eq!(
            seq.raw(),
            i as u64 + 1,
            "event {i} should have seq {}",
            i + 1
        );
    });
}

#[test]
fn cursor_resumption_returns_correct_suffix() {
    let mgr = setup_manager(64 * 1024);

    {
        let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap();
        (1..=1000).for_each(|i| {
            append_test_event(&mut writer, i);
        });
        writer.shutdown().unwrap();
    }
    mgr.shutdown();

    let reader = EventLogReader::new(Arc::clone(&mgr), false);
    reader.refresh_segment_ranges().unwrap();

    let events = reader
        .read_events_from(EventSequence::new(500), 1000)
        .unwrap();
    assert_eq!(events.len(), 500);
    assert_eq!(events[0].seq, EventSequence::new(501));
    assert_eq!(events[499].seq, EventSequence::new(1000));

    events.windows(2).for_each(|pair| {
        assert_eq!(
            pair[1].seq.raw(),
            pair[0].seq.raw() + 1,
            "gap between {} and {}",
            pair[0].seq,
            pair[1].seq
        );
    });
}
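// `read_events_from(cursor, limit)` is exclusive on the cursor: it returns
// events with seq strictly greater than `cursor`, which is why the resumption
// test above lines up at seq 501. A hedged sketch of a polling consumer built
// on that contract (the helper is illustrative, not crate API):
#[allow(dead_code)]
fn drain_from(reader: &EventLogReader<SimulatedIO>, mut cursor: EventSequence) -> usize {
    let mut total = 0;
    loop {
        let batch = reader.read_events_from(cursor, 100).unwrap();
        let Some(last) = batch.last() else {
            return total; // caught up: nothing newer than the cursor
        };
        cursor = EventSequence::new(last.seq.raw()); // resume strictly after the last seq seen
        total += batch.len();
    }
}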
#[test]
fn cross_segment_read_is_seamless() {
    let payload_size = 50;
    let record_size = EVENT_RECORD_OVERHEAD + payload_size;
    let events_per_segment = 10;
    let max_segment_size = (SEGMENT_HEADER_SIZE + record_size * events_per_segment) as u64;
    let total_events = 100u64;

    let mgr = setup_manager(max_segment_size);

    {
        let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap();
        (1..=total_events).for_each(|i| {
            writer
                .append(
                    DidHash::from_did(&format!("did:plc:xseg{i}")),
                    EventTypeTag::COMMIT,
                    vec![i as u8; payload_size],
                )
                .unwrap();

            if i % events_per_segment as u64 == 0 && i < total_events {
                writer.sync().unwrap();
                writer.rotate_if_needed().unwrap();
            }
        });
        writer.shutdown().unwrap();
    }
    mgr.shutdown();

    let reader = EventLogReader::new(Arc::clone(&mgr), false);
    reader.refresh_segment_ranges().unwrap();

    let events = reader
        .read_events_from(EventSequence::BEFORE_ALL, total_events as usize + 10)
        .unwrap();

    assert_eq!(events.len(), total_events as usize);

    events.iter().enumerate().for_each(|(i, e)| {
        assert_eq!(
            e.seq,
            EventSequence::new(i as u64 + 1),
            "event at index {i} has wrong seq"
        );
    });

    let mut seen = std::collections::HashSet::new();
    events.iter().for_each(|e| {
        assert!(seen.insert(e.seq.raw()), "duplicate seq {}", e.seq);
    });
}

#[test]
fn retention_deletes_only_old_segments() {
    let payload_size = 50;
    let record_size = EVENT_RECORD_OVERHEAD + payload_size;
    let events_per_segment = 3;
    let max_segment_size = (SEGMENT_HEADER_SIZE + record_size * events_per_segment) as u64;

    let sim = SimulatedIO::pristine(42);
    let mgr =
        Arc::new(SegmentManager::new(sim, PathBuf::from("/segments"), max_segment_size).unwrap());

    let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap();

    (1..=15).for_each(|i| {
        writer
            .append(
                DidHash::from_did(&format!("did:plc:ret{i}")),
                EventTypeTag::COMMIT,
                vec![0xAA; payload_size],
            )
            .unwrap();

        if i % events_per_segment as u64 == 0 {
            writer.sync().unwrap();
            writer.rotate_if_needed().unwrap();
        }
    });
    writer.sync().unwrap();

    let segments_before = mgr.list_segments().unwrap();
    assert!(segments_before.len() >= 5);

    let segments_to_delete: Vec<_> = segments_before[..2].to_vec();
    segments_to_delete.iter().for_each(|&id| {
        mgr.delete_segment(id).unwrap();
    });

    let segments_after = mgr.list_segments().unwrap();
    assert_eq!(segments_after.len(), segments_before.len() - 2);

    segments_to_delete.iter().for_each(|id| {
        assert!(
            !segments_after.contains(id),
            "deleted segment {id} still present"
        );
    });

    segments_after.iter().for_each(|id| {
        assert!(
            !segments_to_delete.contains(id),
            "remaining segment {id} should not be among the deleted ones"
        );
    });
}

#[test]
fn did_hash_is_deterministic() {
    let dids = [
        "did:plc:abc123",
        "did:plc:xyz789",
        "did:web:example.com",
        "did:plc:aaaabbbbccccddddeeeeffffggg",
    ];

    dids.iter().for_each(|did| {
        let h1 = DidHash::from_did(did);
        let h2 = DidHash::from_did(did);
        assert_eq!(h1, h2, "DidHash not deterministic for {did}");
    });
}
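// `DidHash::from_did` must be stable across runs because the hash is what the
// on-disk record header carries; the encoded payload (see the round-trip test
// below) is what preserves the full DID string.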
#[test]
fn payload_round_trip() {
    use bytes::Bytes;
    use tranquil_db_traits::{AccountStatus, RepoEventType, SequenceNumber, SequencedEvent};
    use tranquil_types::{Did, Handle};

    let variants: Vec<(RepoEventType, EventTypeTag, SequencedEvent)> = vec![
        (
            RepoEventType::Commit,
            EventTypeTag::COMMIT,
            SequencedEvent {
                seq: SequenceNumber::from_raw(1),
                did: Did::new("did:plc:testuser1234567890abcdef").unwrap(),
                created_at: chrono::Utc::now(),
                event_type: RepoEventType::Commit,
                commit_cid: None,
                prev_cid: None,
                prev_data_cid: None,
                ops: Some(
                    serde_json::json!([{"action": "create", "path": "app.bsky.feed.post/abc"}]),
                ),
                blobs: Some(vec!["bafkreibtest".to_owned()]),
                blocks_cids: None,
                handle: None,
                active: None,
                status: None,
                rev: Some("rev1".to_owned()),
            },
        ),
        (
            RepoEventType::Identity,
            EventTypeTag::IDENTITY,
            SequencedEvent {
                seq: SequenceNumber::from_raw(2),
                did: Did::new("did:plc:testuser1234567890abcdef").unwrap(),
                created_at: chrono::Utc::now(),
                event_type: RepoEventType::Identity,
                commit_cid: None,
                prev_cid: None,
                prev_data_cid: None,
                ops: None,
                blobs: None,
                blocks_cids: None,
                handle: Some(Handle::new("test.bsky.social").unwrap()),
                active: None,
                status: None,
                rev: None,
            },
        ),
        (
            RepoEventType::Account,
            EventTypeTag::ACCOUNT,
            SequencedEvent {
                seq: SequenceNumber::from_raw(3),
                did: Did::new("did:plc:testuser1234567890abcdef").unwrap(),
                created_at: chrono::Utc::now(),
                event_type: RepoEventType::Account,
                commit_cid: None,
                prev_cid: None,
                prev_data_cid: None,
                ops: None,
                blobs: None,
                blocks_cids: None,
                handle: None,
                active: Some(true),
                status: Some(AccountStatus::Active),
                rev: None,
            },
        ),
        (
            RepoEventType::Sync,
            EventTypeTag::SYNC,
            SequencedEvent {
                seq: SequenceNumber::from_raw(4),
                did: Did::new("did:plc:testuser1234567890abcdef").unwrap(),
                created_at: chrono::Utc::now(),
                event_type: RepoEventType::Sync,
                commit_cid: None,
                prev_cid: None,
                prev_data_cid: None,
                ops: None,
                blobs: None,
                blocks_cids: None,
                handle: None,
                active: None,
                status: None,
                rev: None,
            },
        ),
    ];

    variants.iter().for_each(|(event_type, tag, event)| {
        let encoded = encode_payload(event);
        let decoded = decode_payload(&encoded).unwrap();

        let raw = RawEvent {
            seq: EventSequence::new(event.seq.as_i64() as u64),
            timestamp: TimestampMicros::now(),
            did_hash: DidHash::from_did(event.did.as_str()),
            event_type: *tag,
            payload: Bytes::from(encoded),
        };

        let reconstructed = to_sequenced_event(&raw, &decoded).unwrap();
        assert_eq!(reconstructed.did.as_str(), event.did.as_str());
        assert_eq!(reconstructed.event_type, *event_type);
        assert_eq!(reconstructed.rev, event.rev);
        assert_eq!(reconstructed.blobs, event.blobs);
        assert_eq!(reconstructed.active, event.active);
    });
}
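// Round-trip contract: `encode_payload`/`decode_payload` carry the semantic
// fields (DID, handle, status, ops, blobs, rev), while seq, timestamp, and
// the event-type tag travel in the fixed record header and are re-attached
// by `to_sequenced_event`.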
#[test]
fn max_payload_accepted() {
    let payload = vec![0xBB; MAX_EVENT_PAYLOAD as usize];
    assert!(validate_payload_size(&payload).is_ok());

    let sim = SimulatedIO::pristine(42);
    let dir = Path::new("/test");
    sim.mkdir(dir).unwrap();
    sim.sync_dir(dir).unwrap();

    let fd = sim
        .open(Path::new("/test/segment.tqe"), OpenOptions::read_write())
        .unwrap();
    let mut writer = tranquil_store::eventlog::SegmentWriter::new(
        &sim,
        fd,
        SegmentId::new(1),
        EventSequence::new(1),
    )
    .unwrap();

    let event = ValidEvent {
        seq: EventSequence::new(1),
        timestamp: TimestampMicros::new(1_000_000),
        did_hash: DidHash::from_did("did:plc:maxpayload"),
        event_type: EventTypeTag::COMMIT,
        payload: payload.clone(),
    };
    writer.append_event(&sim, &event).unwrap();
    writer.sync(&sim).unwrap();

    let reader = SegmentReader::open(&sim, fd).unwrap();
    let events = reader.valid_prefix().unwrap();
    assert_eq!(events.len(), 1);
    assert_eq!(events[0].payload.len(), MAX_EVENT_PAYLOAD as usize);
}

#[test]
fn oversized_payload_rejected() {
    let payload = vec![0xCC; MAX_EVENT_PAYLOAD as usize + 1];
    match validate_payload_size(&payload) {
        Err(PayloadError::TooLarge { size, max }) => {
            assert_eq!(size, MAX_EVENT_PAYLOAD as usize + 1);
            assert_eq!(max, MAX_EVENT_PAYLOAD as usize);
        }
        other => panic!("expected TooLarge, got {other:?}"),
    }
}

#[test]
fn retention_does_not_break_active_readers() {
    let payload_size = 50;
    let record_size = EVENT_RECORD_OVERHEAD + payload_size;
    let events_per_segment = 5;
    let max_segment_size = (SEGMENT_HEADER_SIZE + record_size * events_per_segment) as u64;

    let sim = SimulatedIO::pristine(42);
    let mgr =
        Arc::new(SegmentManager::new(sim, PathBuf::from("/segments"), max_segment_size).unwrap());

    {
        let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap();
        (1..=25).for_each(|i| {
            writer
                .append(
                    DidHash::from_did(&format!("did:plc:active{i}")),
                    EventTypeTag::COMMIT,
                    vec![i as u8; payload_size],
                )
                .unwrap();
            if i % events_per_segment as u64 == 0 {
                writer.sync().unwrap();
                writer.rotate_if_needed().unwrap();
            }
        });
        writer.sync().unwrap();
    }
    mgr.shutdown();

    let reader = EventLogReader::new(Arc::clone(&mgr), false);
    reader.refresh_segment_ranges().unwrap();

    let first_batch = reader
        .read_events_from(EventSequence::BEFORE_ALL, 10)
        .unwrap();
    assert_eq!(first_batch.len(), 10);

    mgr.delete_segment(SegmentId::new(1)).unwrap();
    reader.invalidate_index(SegmentId::new(1));
    reader.invalidate_mmap(SegmentId::new(1));
    reader.refresh_segment_ranges().unwrap();

    let later_events = reader.read_events_from(EventSequence::new(10), 20).unwrap();
    assert!(!later_events.is_empty());
    later_events.iter().for_each(|e| {
        assert!(e.seq.raw() > 10);
    });
}
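// Reader/retention interplay pinned down above: deleting a segment requires
// explicitly dropping the reader's cached index and mmap for that segment
// (`invalidate_index` / `invalidate_mmap`) before refreshing ranges; reads
// past the deleted range must keep working.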
#[tokio::test]
async fn subscriber_lag_recovery() {
    let sim = SimulatedIO::pristine(42);
    let config = EventLogConfig {
        segments_dir: PathBuf::from("/segments"),
        max_segment_size: 64 * 1024,
        index_interval: 256,
        broadcast_buffer: 4,
        use_mmap: false,
    };

    let event_log = EventLog::open(config, sim).unwrap();
    let mut subscriber = event_log.subscriber(EventSequence::BEFORE_ALL);

    let total_events = 20u64;
    (1..=total_events).for_each(|i| {
        event_log
            .append_and_sync(
                &tranquil_types::Did::new("did:plc:testuser1234567890abcdef").unwrap(),
                tranquil_db_traits::RepoEventType::Commit,
                &tranquil_db_traits::SequencedEvent {
                    seq: tranquil_db_traits::SequenceNumber::from_raw(i as i64),
                    did: tranquil_types::Did::new("did:plc:testuser1234567890abcdef").unwrap(),
                    created_at: chrono::Utc::now(),
                    event_type: tranquil_db_traits::RepoEventType::Commit,
                    commit_cid: None,
                    prev_cid: None,
                    prev_data_cid: None,
                    ops: None,
                    blobs: None,
                    blocks_cids: None,
                    handle: None,
                    active: None,
                    status: None,
                    rev: None,
                },
            )
            .unwrap();
    });

    let mut received_seqs: Vec<u64> = Vec::new();
    let timeout = tokio::time::timeout(Duration::from_secs(5), async {
        while let Some(event) = subscriber.next().await {
            received_seqs.push(event.seq.raw());
            if event.seq.raw() >= total_events {
                break;
            }
        }
    });

    timeout
        .await
        .expect("subscriber timed out before receiving all events");

    assert_eq!(
        received_seqs.len(),
        total_events as usize,
        "subscriber should receive all {total_events} events, got {}",
        received_seqs.len()
    );

    received_seqs.windows(2).for_each(|pair| {
        assert!(
            pair[1] > pair[0],
            "events must be in order: {} -> {}",
            pair[0],
            pair[1]
        );
    });

    let unique: std::collections::HashSet<u64> = received_seqs.iter().copied().collect();
    assert_eq!(
        unique.len(),
        received_seqs.len(),
        "no duplicate events allowed"
    );
}
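// With `broadcast_buffer: 4` and 20 events published before the first poll,
// the subscriber must fall behind the live channel; the assertions above pin
// down the catch-up behavior (presumably a re-read from segments): every
// event delivered exactly once, in order.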
#[test]
fn index_checkpoint_accelerates_recovery() {
    let event_count = 50_000u64;
    let sim = SimulatedIO::pristine(42);
    let mgr =
        Arc::new(SegmentManager::new(sim, PathBuf::from("/segments"), 256 * 1024 * 1024).unwrap());

    {
        let mut writer = EventLogWriter::open(Arc::clone(&mgr), 256).unwrap();
        (1..=event_count).for_each(|i| {
            writer
                .append(
                    DidHash::from_did(&format!("did:plc:chk{i}")),
                    EventTypeTag::COMMIT,
                    format!("ckpt-{i}").into_bytes(),
                )
                .unwrap();
        });
        writer.shutdown().unwrap();
    }
    mgr.shutdown();

    let index = SegmentIndex::load(mgr.io(), &mgr.index_path(SegmentId::new(1)))
        .unwrap()
        .unwrap();

    assert!(index.entry_count() > 0);
    assert_eq!(index.first_seq(), Some(EventSequence::new(1)));
    assert_eq!(index.last_seq(), Some(EventSequence::new(event_count)));

    let mid = EventSequence::new(event_count / 2);
    let offset = index.lookup(mid);
    assert!(offset.is_some(), "index should cover midpoint seq {mid}");

    let reader_with_index = EventLogReader::new(Arc::clone(&mgr), false);

    let reads_before = mgr
        .io()
        .op_log()
        .iter()
        .filter(|op| matches!(op, OpRecord::ReadAt { .. }))
        .count();

    reader_with_index.refresh_segment_ranges().unwrap();
    let mid_events = reader_with_index
        .read_events_from(EventSequence::new(event_count / 2), 10)
        .unwrap();
    assert_eq!(mid_events.len(), 10);

    let reads_with_index = mgr
        .io()
        .op_log()
        .iter()
        .filter(|op| matches!(op, OpRecord::ReadAt { .. }))
        .count()
        - reads_before;

    let _ = mgr.io().delete(&mgr.index_path(SegmentId::new(1)));

    let reader_without_index = EventLogReader::new(Arc::clone(&mgr), false);

    let reads_before = mgr
        .io()
        .op_log()
        .iter()
        .filter(|op| matches!(op, OpRecord::ReadAt { .. }))
        .count();

    reader_without_index.refresh_segment_ranges().unwrap();
    let mid_events_no_idx = reader_without_index
        .read_events_from(EventSequence::new(event_count / 2), 10)
        .unwrap();
    assert_eq!(mid_events_no_idx.len(), 10);

    let reads_without_index = mgr
        .io()
        .op_log()
        .iter()
        .filter(|op| matches!(op, OpRecord::ReadAt { .. }))
        .count()
        - reads_before;

    assert!(
        reads_with_index < reads_without_index,
        "read with index ({reads_with_index} reads) should require fewer reads than without ({reads_without_index} reads)"
    );
}

#[test]
fn fsync_ordering_blocks_before_events() {
    use tranquil_store::blockstore::{
        CID_SIZE, DataFileId, DataFileManager, DataFileReader, DataFileWriter,
    };

    fn test_cid(seed: u8) -> [u8; CID_SIZE] {
        let mut cid = [0u8; CID_SIZE];
        cid[0] = 0x01;
        cid[1] = 0x71;
        cid[2] = 0x12;
        cid[3] = 0x20;
        cid[4] = seed;
        cid
    }

    let sim = Arc::new(SimulatedIO::pristine(42));
    let data_dir = Path::new("/blocks");
    sim.mkdir(data_dir).unwrap();
    sim.sync_dir(data_dir).unwrap();
    let seg_dir = Path::new("/segments");

    let block_mgr =
        DataFileManager::with_default_max_size(Arc::clone(&sim), data_dir.to_path_buf());
    let event_mgr = Arc::new(
        SegmentManager::new(Arc::clone(&sim), PathBuf::from("/segments"), 64 * 1024).unwrap(),
    );

    let block_fd = block_mgr.open_for_append(DataFileId::new(0)).unwrap();
    let mut block_writer =
        DataFileWriter::new(block_mgr.io(), block_fd, DataFileId::new(0)).unwrap();
    let cid = test_cid(1);
    let _ = block_writer.append_block(&cid, &[0xAA; 128]).unwrap();
    block_writer.sync().unwrap();
    sim.sync_dir(data_dir).unwrap();

    {
        let mut event_writer = EventLogWriter::open(Arc::clone(&event_mgr), 256).unwrap();
        event_writer
            .append(
                DidHash::from_did("did:plc:fsyncorder"),
                EventTypeTag::COMMIT,
                b"event-before-sync".to_vec(),
            )
            .unwrap();
    }

    sim.crash();
    event_mgr.shutdown();

    let block_fd = sim
        .open(
            Path::new("/blocks/000000.tqb"),
            OpenOptions::read_only_existing(),
        )
        .unwrap();
    let block_reader = DataFileReader::open(&*sim, block_fd).unwrap();
    let recovered_blocks = block_reader.valid_blocks().unwrap();
    assert_eq!(
        recovered_blocks.len(),
        1,
        "blockstore was synced, block must survive crash"
    );
    assert_eq!(recovered_blocks[0].1, cid, "recovered block CID must match");

    let event_writer = EventLogWriter::open(Arc::clone(&event_mgr), 256).unwrap();
    assert_eq!(
        event_writer.synced_seq(),
        EventSequence::BEFORE_ALL,
        "crash between blockstore sync and eventlog sync must not persist the event (blocks exist, event does not = orphan, not inconsistency)"
    );

    drop(event_writer);

    {
        let mut event_writer = EventLogWriter::open(Arc::clone(&event_mgr), 256).unwrap();
        event_writer
            .append(
                DidHash::from_did("did:plc:fsyncorder"),
                EventTypeTag::COMMIT,
                b"event-with-sync".to_vec(),
            )
            .unwrap();
        event_writer.sync().unwrap();
        sim.sync_dir(seg_dir).unwrap();
    }

    event_mgr.shutdown();
    sim.crash();

    let event_writer = EventLogWriter::open(Arc::clone(&event_mgr), 256).unwrap();
    assert_eq!(
        event_writer.synced_seq(),
        EventSequence::new(1),
        "both stores synced, event must survive crash"
    );
}
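// Crash-ordering rule distilled from the final test: sync the blockstore
// before syncing the event log that references it. Under that ordering the
// only possible crash artifact is an orphaned block (harmless, collectable),
// never a durable event whose blocks are missing.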