Our Personal Data Server from scratch!
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

feat(tranquil-store): leak gate and metrics sampling for gauntlet

Lewis: May this revision serve well! <lu5a@proton.me>

+483
+236
crates/tranquil-store/src/gauntlet/leak.rs
··· 1 + use std::num::NonZeroU64; 2 + use std::time::Duration; 3 + 4 + use serde::{Deserialize, Serialize}; 5 + 6 + use super::metrics::{MetricName, MetricsSample}; 7 + 8 + #[derive(Debug, Clone, Copy, Serialize, Deserialize)] 9 + pub struct LeakGateConfig { 10 + pub warmup_ms: u64, 11 + pub window_ms: NonZeroU64, 12 + pub growth_limit_pct: f64, 13 + } 14 + 15 + #[derive(Debug, Clone, Copy)] 16 + pub struct LeakGateBuildError(pub &'static str); 17 + 18 + impl std::fmt::Display for LeakGateBuildError { 19 + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 20 + f.write_str(self.0) 21 + } 22 + } 23 + 24 + impl std::error::Error for LeakGateBuildError {} 25 + 26 + impl LeakGateConfig { 27 + pub fn try_new( 28 + warmup_ms: u64, 29 + window_ms: u64, 30 + growth_limit_pct: f64, 31 + ) -> Result<Self, LeakGateBuildError> { 32 + let window_ms = NonZeroU64::new(window_ms) 33 + .ok_or(LeakGateBuildError("leak gate window_ms must be > 0"))?; 34 + if !growth_limit_pct.is_finite() || growth_limit_pct < 0.0 { 35 + return Err(LeakGateBuildError( 36 + "leak gate growth_limit_pct must be finite and non-negative", 37 + )); 38 + } 39 + Ok(Self { 40 + warmup_ms, 41 + window_ms, 42 + growth_limit_pct, 43 + }) 44 + } 45 + 46 + pub fn standard() -> Self { 47 + Self::try_new(60 * 60 * 1_000, 4 * 60 * 60 * 1_000, 5.0) 48 + .expect("standard leak gate config is valid") 49 + } 50 + 51 + pub fn short_for_tests() -> Self { 52 + Self::try_new(60_000, 4 * 60_000, 5.0).expect("short_for_tests leak gate config is valid") 53 + } 54 + 55 + pub fn warmup(&self) -> Duration { 56 + Duration::from_millis(self.warmup_ms) 57 + } 58 + 59 + pub fn window(&self) -> Duration { 60 + Duration::from_millis(self.window_ms.get()) 61 + } 62 + } 63 + 64 + #[derive(Debug, Clone, Serialize, Deserialize)] 65 + pub struct LeakViolation { 66 + pub metric: String, 67 + pub start_ms: u64, 68 + pub end_ms: u64, 69 + pub start_value: u64, 70 + pub end_value: u64, 71 + pub growth_pct: f64, 72 + pub limit_pct: f64, 73 + } 74 + 75 + pub fn evaluate(samples: &[MetricsSample], cfg: LeakGateConfig) -> Vec<LeakViolation> { 76 + if samples.len() < 2 { 77 + return Vec::new(); 78 + } 79 + MetricName::ALL 80 + .iter() 81 + .flat_map(|&m| evaluate_metric(samples, m, cfg)) 82 + .collect() 83 + } 84 + 85 + fn evaluate_metric( 86 + samples: &[MetricsSample], 87 + metric: MetricName, 88 + cfg: LeakGateConfig, 89 + ) -> Option<LeakViolation> { 90 + let post_warmup: Vec<&MetricsSample> = samples 91 + .iter() 92 + .filter(|s| s.elapsed_ms >= cfg.warmup_ms) 93 + .collect(); 94 + if post_warmup.len() < 2 { 95 + return None; 96 + } 97 + 98 + let min_delta = metric.min_absolute_delta(); 99 + let window = cfg.window_ms.get(); 100 + let mut worst: Option<LeakViolation> = None; 101 + for (i, start) in post_warmup.iter().enumerate() { 102 + let Some(start_v) = start.metric(metric) else { 103 + continue; 104 + }; 105 + if start_v == 0 { 106 + continue; 107 + } 108 + let deadline = start.elapsed_ms.saturating_add(window); 109 + for end in post_warmup.iter().skip(i + 1) { 110 + if end.elapsed_ms > deadline { 111 + break; 112 + } 113 + let Some(end_v) = end.metric(metric) else { 114 + continue; 115 + }; 116 + if end_v <= start_v { 117 + continue; 118 + } 119 + let delta = end_v - start_v; 120 + if delta < min_delta { 121 + continue; 122 + } 123 + let growth = (delta as f64 / start_v as f64) * 100.0; 124 + if growth <= cfg.growth_limit_pct { 125 + continue; 126 + } 127 + let candidate = LeakViolation { 128 + metric: metric.as_str().to_string(), 129 + start_ms: start.elapsed_ms, 130 + end_ms: end.elapsed_ms, 131 + start_value: start_v, 132 + end_value: end_v, 133 + growth_pct: growth, 134 + limit_pct: cfg.growth_limit_pct, 135 + }; 136 + match &worst { 137 + Some(w) if w.growth_pct >= candidate.growth_pct => {} 138 + _ => worst = Some(candidate), 139 + } 140 + } 141 + } 142 + worst 143 + } 144 + 145 + #[cfg(test)] 146 + mod tests { 147 + use super::*; 148 + 149 + const GIB: u64 = 1024 * 1024 * 1024; 150 + 151 + fn sample(elapsed_ms: u64, rss: u64) -> MetricsSample { 152 + MetricsSample { 153 + elapsed_ms, 154 + rss_bytes: Some(rss), 155 + fd_count: Some(10), 156 + data_dir_bytes: 0, 157 + index_dir_bytes: 0, 158 + segments_dir_bytes: 0, 159 + data_file_count: Some(0), 160 + segment_count: Some(0), 161 + block_index_entries: 0, 162 + hint_file_bytes: 0, 163 + } 164 + } 165 + 166 + #[test] 167 + fn flat_metrics_no_violation() { 168 + let cfg = LeakGateConfig::short_for_tests(); 169 + let series: Vec<MetricsSample> = (0..20) 170 + .map(|i| sample(60_000 + i * 60_000, GIB)) 171 + .collect(); 172 + assert!(evaluate(&series, cfg).is_empty()); 173 + } 174 + 175 + #[test] 176 + fn growing_rss_flagged() { 177 + let cfg = LeakGateConfig::short_for_tests(); 178 + let series: Vec<MetricsSample> = (0..20) 179 + .map(|i| sample(60_000 + i * 60_000, GIB + i * 64 * 1024 * 1024)) 180 + .collect(); 181 + let v = evaluate(&series, cfg); 182 + assert!(!v.is_empty()); 183 + assert_eq!(v[0].metric, "rss_bytes"); 184 + assert!(v[0].growth_pct > 5.0); 185 + } 186 + 187 + #[test] 188 + fn warmup_samples_ignored() { 189 + let cfg = LeakGateConfig::short_for_tests(); 190 + let mut series: Vec<MetricsSample> = Vec::new(); 191 + series.push(sample(10_000, 1)); 192 + series.push(sample(30_000, GIB)); 193 + (0..10).for_each(|i| { 194 + series.push(sample(60_000 + i * 60_000, GIB)); 195 + }); 196 + assert!(evaluate(&series, cfg).is_empty()); 197 + } 198 + 199 + #[test] 200 + fn window_bound_honored() { 201 + let cfg = LeakGateConfig::try_new(0, 2 * 60_000, 5.0).unwrap(); 202 + let series = vec![sample(0, GIB), sample(200_000, 2 * GIB)]; 203 + assert!( 204 + evaluate(&series, cfg).is_empty(), 205 + "200s gap exceeds 120s window, growth must not be flagged" 206 + ); 207 + } 208 + 209 + #[test] 210 + fn small_absolute_delta_not_flagged() { 211 + let cfg = LeakGateConfig::short_for_tests(); 212 + let series: Vec<MetricsSample> = (0..10) 213 + .map(|i| sample(60_000 + i * 60_000, GIB + i * 1024)) 214 + .collect(); 215 + assert!( 216 + evaluate(&series, cfg).is_empty(), 217 + "kilobyte growth is below the RSS absolute-delta floor" 218 + ); 219 + } 220 + 221 + #[test] 222 + fn missing_metric_samples_skipped() { 223 + let cfg = LeakGateConfig::short_for_tests(); 224 + let mut series: Vec<MetricsSample> = (0..10) 225 + .map(|i| sample(60_000 + i * 60_000, GIB)) 226 + .collect(); 227 + series[3].rss_bytes = None; 228 + series[7].rss_bytes = None; 229 + assert!(evaluate(&series, cfg).is_empty()); 230 + } 231 + 232 + #[test] 233 + fn zero_window_rejected_at_construction() { 234 + assert!(LeakGateConfig::try_new(0, 0, 5.0).is_err()); 235 + } 236 + }
+247
crates/tranquil-store/src/gauntlet/metrics.rs
··· 1 + use std::path::Path; 2 + use std::sync::Arc; 3 + use std::time::Duration; 4 + 5 + use serde::{Deserialize, Serialize}; 6 + use tracing::warn; 7 + 8 + use super::runner::{EventLogState, Harness}; 9 + use crate::blockstore::TranquilBlockStore; 10 + use crate::io::StorageIO; 11 + 12 + #[derive(Debug, Clone, Copy, Serialize, Deserialize)] 13 + pub struct MetricsSample { 14 + pub elapsed_ms: u64, 15 + pub rss_bytes: Option<u64>, 16 + pub fd_count: Option<u64>, 17 + pub data_dir_bytes: u64, 18 + pub index_dir_bytes: u64, 19 + pub segments_dir_bytes: u64, 20 + pub data_file_count: Option<u64>, 21 + pub segment_count: Option<u64>, 22 + pub block_index_entries: u64, 23 + pub hint_file_bytes: u64, 24 + } 25 + 26 + impl MetricsSample { 27 + pub fn metric(&self, name: MetricName) -> Option<u64> { 28 + match name { 29 + MetricName::RssBytes => self.rss_bytes, 30 + MetricName::FdCount => self.fd_count, 31 + MetricName::DataDirBytes => Some(self.data_dir_bytes), 32 + MetricName::IndexDirBytes => Some(self.index_dir_bytes), 33 + MetricName::SegmentsDirBytes => Some(self.segments_dir_bytes), 34 + MetricName::DataFileCount => self.data_file_count, 35 + MetricName::SegmentCount => self.segment_count, 36 + MetricName::BlockIndexEntries => Some(self.block_index_entries), 37 + MetricName::HintFileBytes => Some(self.hint_file_bytes), 38 + } 39 + } 40 + } 41 + 42 + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] 43 + pub enum MetricName { 44 + RssBytes, 45 + FdCount, 46 + DataDirBytes, 47 + IndexDirBytes, 48 + SegmentsDirBytes, 49 + DataFileCount, 50 + SegmentCount, 51 + BlockIndexEntries, 52 + HintFileBytes, 53 + } 54 + 55 + impl MetricName { 56 + pub const ALL: &'static [MetricName] = &[ 57 + Self::RssBytes, 58 + Self::FdCount, 59 + Self::DataDirBytes, 60 + Self::IndexDirBytes, 61 + Self::SegmentsDirBytes, 62 + Self::DataFileCount, 63 + Self::SegmentCount, 64 + Self::BlockIndexEntries, 65 + Self::HintFileBytes, 66 + ]; 67 + 68 + pub const fn as_str(self) -> &'static str { 69 + match self { 70 + Self::RssBytes => "rss_bytes", 71 + Self::FdCount => "fd_count", 72 + Self::DataDirBytes => "data_dir_bytes", 73 + Self::IndexDirBytes => "index_dir_bytes", 74 + Self::SegmentsDirBytes => "segments_dir_bytes", 75 + Self::DataFileCount => "data_file_count", 76 + Self::SegmentCount => "segment_count", 77 + Self::BlockIndexEntries => "block_index_entries", 78 + Self::HintFileBytes => "hint_file_bytes", 79 + } 80 + } 81 + 82 + pub const fn min_absolute_delta(self) -> u64 { 83 + match self { 84 + Self::RssBytes => 16 * 1024 * 1024, 85 + Self::FdCount => 16, 86 + Self::DataDirBytes => 16 * 1024 * 1024, 87 + Self::IndexDirBytes => 1024 * 1024, 88 + Self::SegmentsDirBytes => 16 * 1024 * 1024, 89 + Self::DataFileCount => 16, 90 + Self::SegmentCount => 4, 91 + Self::BlockIndexEntries => 1024, 92 + Self::HintFileBytes => 1024 * 1024, 93 + } 94 + } 95 + } 96 + 97 + pub fn sample_harness<S: StorageIO + Send + Sync + 'static>( 98 + harness: &Harness<S>, 99 + elapsed: Duration, 100 + ) -> MetricsSample { 101 + MetricsSample { 102 + elapsed_ms: u64::try_from(elapsed.as_millis()).unwrap_or(u64::MAX), 103 + rss_bytes: read_rss(), 104 + fd_count: count_open_fds(), 105 + data_dir_bytes: dir_bytes(harness.store.data_dir()), 106 + index_dir_bytes: dir_bytes(harness.store.block_index().index_dir()), 107 + segments_dir_bytes: harness 108 + .eventlog 109 + .as_ref() 110 + .map(|el| dir_bytes(&el.segments_dir)) 111 + .unwrap_or(0), 112 + data_file_count: data_file_count(&harness.store), 113 + segment_count: harness.eventlog.as_ref().and_then(segment_count), 114 + block_index_entries: harness.store.block_index().approximate_block_count(), 115 + hint_file_bytes: hint_bytes(harness.store.data_dir()), 116 + } 117 + } 118 + 119 + fn data_file_count<S: StorageIO + Send + Sync + 'static>( 120 + store: &Arc<TranquilBlockStore<S>>, 121 + ) -> Option<u64> { 122 + match store.list_data_files() { 123 + Ok(v) => Some(v.len() as u64), 124 + Err(e) => { 125 + warn!(error = %e, "gauntlet metrics: list_data_files failed"); 126 + None 127 + } 128 + } 129 + } 130 + 131 + fn segment_count<S: StorageIO + Send + Sync + 'static>(el: &EventLogState<S>) -> Option<u64> { 132 + match el.manager.list_segments() { 133 + Ok(v) => Some(v.len() as u64), 134 + Err(e) => { 135 + warn!(error = %e, "gauntlet metrics: list_segments failed"); 136 + None 137 + } 138 + } 139 + } 140 + 141 + fn dir_bytes(path: &Path) -> u64 { 142 + let Ok(entries) = std::fs::read_dir(path) else { 143 + return 0; 144 + }; 145 + entries 146 + .filter_map(Result::ok) 147 + .map(|entry| match entry.file_type() { 148 + Ok(ft) if ft.is_dir() => dir_bytes(&entry.path()), 149 + Ok(_) => entry.metadata().map(|m| m.len()).unwrap_or(0), 150 + Err(_) => 0, 151 + }) 152 + .sum() 153 + } 154 + 155 + fn hint_bytes(data_dir: &Path) -> u64 { 156 + let Ok(entries) = std::fs::read_dir(data_dir) else { 157 + return 0; 158 + }; 159 + entries 160 + .filter_map(Result::ok) 161 + .filter(|entry| { 162 + entry 163 + .path() 164 + .extension() 165 + .and_then(|e| e.to_str()) 166 + .map(|e| e == "tqh") 167 + .unwrap_or(false) 168 + }) 169 + .map(|entry| entry.metadata().map(|m| m.len()).unwrap_or(0)) 170 + .sum() 171 + } 172 + 173 + #[cfg(target_os = "linux")] 174 + fn read_rss() -> Option<u64> { 175 + let status = match std::fs::read_to_string("/proc/self/status") { 176 + Ok(s) => s, 177 + Err(e) => { 178 + warn!(error = %e, "gauntlet metrics: read /proc/self/status failed"); 179 + return None; 180 + } 181 + }; 182 + let parsed = status.lines().find_map(|line| { 183 + let rest = line.strip_prefix("VmRSS:")?; 184 + let kb: u64 = rest.split_whitespace().next()?.parse().ok()?; 185 + Some(kb * 1024) 186 + }); 187 + if parsed.is_none() { 188 + warn!("gauntlet metrics: VmRSS line missing from /proc/self/status"); 189 + } 190 + parsed 191 + } 192 + 193 + #[cfg(not(target_os = "linux"))] 194 + fn read_rss() -> Option<u64> { 195 + None 196 + } 197 + 198 + #[cfg(target_os = "linux")] 199 + fn count_open_fds() -> Option<u64> { 200 + match std::fs::read_dir("/proc/self/fd") { 201 + Ok(entries) => Some(entries.filter_map(Result::ok).count() as u64), 202 + Err(e) => { 203 + warn!(error = %e, "gauntlet metrics: read /proc/self/fd failed"); 204 + None 205 + } 206 + } 207 + } 208 + 209 + #[cfg(not(target_os = "linux"))] 210 + fn count_open_fds() -> Option<u64> { 211 + None 212 + } 213 + 214 + #[cfg(test)] 215 + mod tests { 216 + use super::*; 217 + 218 + #[test] 219 + fn metric_names_roundtrip_strings() { 220 + MetricName::ALL.iter().for_each(|m| { 221 + let s = m.as_str(); 222 + assert!(!s.is_empty()); 223 + }); 224 + } 225 + 226 + #[test] 227 + #[cfg_attr(not(target_os = "linux"), ignore = "linux /proc only")] 228 + fn rss_reads_nonzero() { 229 + let rss = read_rss().expect("rss"); 230 + assert!(rss > 0, "rss should be positive, got {rss}"); 231 + } 232 + 233 + #[test] 234 + #[cfg_attr(not(target_os = "linux"), ignore = "linux /proc only")] 235 + fn fd_count_reads_nonzero() { 236 + let fd = count_open_fds().expect("fd"); 237 + assert!(fd > 0); 238 + } 239 + 240 + #[test] 241 + fn dir_bytes_sums_entries() { 242 + let dir = tempfile::TempDir::new().unwrap(); 243 + std::fs::write(dir.path().join("a"), b"1234").unwrap(); 244 + std::fs::write(dir.path().join("b"), b"5678").unwrap(); 245 + assert_eq!(dir_bytes(dir.path()), 8); 246 + } 247 + }