A public mirror for the whole atmosphere hubble.microcosm.blue
27
fork

Configure Feed

Select the types of activity you want to include in your feed.

more dictionary stuff and start intern test

phil c5a940d8 92687e0c

+227 -22
+48
space-efficiency-check/examples/sweep.rs
··· 238 238 } 239 239 } 240 240 241 + // ── 32k block deep dive: zstd level × dictionary ─────────── 242 + for &level in &[1, 3, 6, 9] { 243 + for &dict_kb in &[16, 64, 256] { 244 + let dict = dict_kb * 1024; 245 + for &train_kb in &[256, 1024] { 246 + let mult = (train_kb / dict_kb) as i32; 247 + if mult < 1 { continue; } 248 + configs.push(SweepConfig { 249 + block_size: 32 * 1024, 250 + zstd_level: level, 251 + dict_bytes: dict as i32, 252 + train_mult: Some(mult), 253 + ..base 254 + }); 255 + } 256 + // also test default training 257 + configs.push(SweepConfig { 258 + block_size: 32 * 1024, 259 + zstd_level: level, 260 + dict_bytes: dict as i32, 261 + train_mult: None, 262 + ..base 263 + }); 264 + } 265 + } 266 + 267 + // ── 32k block: high-training dict at z6/z9 ───────────────── 268 + for &level in &[6, 9] { 269 + // 16k dict, 128x training (2M) 270 + configs.push(SweepConfig { 271 + block_size: 32 * 1024, 272 + zstd_level: level, 273 + dict_bytes: 16_384, 274 + train_mult: Some(128), 275 + ..base 276 + }); 277 + // 32k dict: default, 16x, 64x, 128x 278 + for &tm in &[-1, 16, 64, 128] { 279 + configs.push(SweepConfig { 280 + block_size: 32 * 1024, 281 + zstd_level: level, 282 + dict_bytes: 32_768, 283 + train_mult: if tm < 0 { None } else { Some(tm) }, 284 + ..base 285 + }); 286 + } 287 + } 288 + 241 289 // ── Subcompactions ───────────────────────────────────────── 242 290 for &sub in &[1, 4, 10] { 243 291 configs.push(SweepConfig {
+97
space-efficiency-check/index.html
··· 73 73 <div id="chart-dict-cost"></div> 74 74 </div> 75 75 <div class="chart-container"> 76 + <h2>32k blocks: zstd level × dictionary <label class="zero-toggle"><input type="checkbox" data-chart="chart-32k"> y from 0</label></h2> 77 + <div id="chart-32k"></div> 78 + </div> 79 + </div> 80 + 81 + <div class="chart-row"> 82 + <div class="chart-container"> 76 83 <h2>key restart interval <label class="zero-toggle"><input type="checkbox" data-chart="chart-restart"> y from 0</label></h2> 77 84 <div id="chart-restart"></div> 78 85 </div> ··· 268 275 65536,3,8,65536,16,true,10,17718370930,16897,51.0 269 276 65536,3,8,262144,1,true,10,17676607265,16857,27.5 270 277 65536,3,8,262144,4,true,10,17806456807,16981,65.2 278 + 32768,1,8,16384,,true,10,18145390797,17304,28.8 279 + 32768,1,8,16384,64,true,10,18162942166,17321,28.7 280 + 32768,1,8,65536,,true,10,18113630667,17274,25.4 281 + 32768,1,8,65536,4,true,10,18252364775,17406,25.5 282 + 32768,1,8,65536,16,true,10,18191268324,17348,27.3 283 + 32768,1,8,262144,,true,10,18433074184,17579,25.8 284 + 32768,1,8,262144,1,true,10,18941076049,18063,30.4 285 + 32768,1,8,262144,4,true,10,18630362351,17767,32.4 286 + 32768,3,8,16384,,true,10,18092158457,17254,43.3 287 + 32768,3,8,65536,,true,10,18175762988,17333,50.1 288 + 32768,3,8,262144,,true,10,18237799128,17392,58.5 289 + 32768,6,8,16384,,true,10,17815254359,16989,67.7 290 + 32768,6,8,16384,64,true,10,17700896109,16880,71.0 291 + 32768,6,8,65536,,true,10,17903486595,17074,74.4 292 + 32768,6,8,65536,4,true,10,17835381202,17009,75.6 293 + 32768,6,8,65536,16,true,10,17792150430,16967,79.5 294 + 32768,6,8,262144,,true,10,17617765307,16801,82.0 295 + 32768,6,8,262144,1,true,10,17722853142,16901,85.7 296 + 32768,6,8,262144,4,true,10,17519172262,16707,89.4 297 + 32768,9,8,16384,,true,10,17542440248,16729,93.6 298 + 32768,9,8,16384,64,true,10,17402812286,16596,96.1 299 + 32768,9,8,65536,,true,10,17621780999,16805,103.5 300 + 32768,9,8,65536,4,true,10,17560386951,16746,106.3 301 + 32768,9,8,65536,16,true,10,17513473140,16702,110.7 302 + 32768,9,8,262144,,true,10,17288349711,16487,114.8 303 + 32768,9,8,262144,1,true,10,17736615600,16914,134.2 304 + 32768,9,8,262144,4,true,10,17193958194,16397,126.5 305 + 32768,6,8,16384,128,true,10,17685606257,16866,73.3 306 + 32768,6,8,32768,,true,10,17851433056,17024,69.4 307 + 32768,6,8,32768,16,true,10,17756182902,16933,72.2 308 + 32768,6,8,32768,64,true,10,17701788554,16881,77.0 309 + 32768,6,8,32768,128,true,10,17677204659,16858,80.8 310 + 32768,9,8,16384,128,true,10,17377098336,16572,100.3 311 + 32768,9,8,32768,,true,10,17573743202,16759,97.5 312 + 32768,9,8,32768,16,true,10,17464617794,16655,101.3 313 + 32768,9,8,32768,64,true,10,17397585233,16591,109.2 314 + 32768,9,8,32768,128,true,10,17363679655,16559,114.3 271 315 `; 272 316 273 317 // ── parse ───────────────────────────────────────────────────────── ··· 713 757 })); 714 758 } 715 759 760 + // ── chart: 32k block deep dive ──────────────────────────────────── 761 + function chart32k() { 762 + const bs = 32768; 763 + const subset = data.filter(d => 764 + d.block_size === bs && d.zstd_level > 0 && d.restart_interval === 8 && 765 + d.opt_filters && d.subcompactions === 10 766 + ); 767 + 768 + function dictLabel(d) { 769 + if (d.dict_bytes === 0) return 'none'; 770 + const dk = d.dict_bytes >= 1024 ? (d.dict_bytes / 1024) + 'k' : d.dict_bytes; 771 + const tm = d.train_mult === null ? 'def' : d.train_mult + 'x'; 772 + return `${dk}/${tm}`; 773 + } 774 + 775 + // build globally sorted x-axis: none first, then by dict size, then training 776 + const allLabels = [...new Set(subset.map(dictLabel))]; 777 + allLabels.sort((a, b) => { 778 + if (a === 'none') return -1; 779 + if (b === 'none') return 1; 780 + const parseDk = s => parseInt(s) * (s.includes('k/') ? 1024 : 1); 781 + const parseTm = s => { const m = s.match(/\/(\d+)x$/); return m ? parseInt(m[1]) : -1; }; 782 + return parseDk(a) - parseDk(b) || parseTm(a) - parseTm(b); 783 + }); 784 + 785 + const levels = [...new Set(subset.map(d => d.zstd_level))].sort((a, b) => a - b); 786 + const traces = []; 787 + 788 + for (const level of levels) { 789 + const rows = subset.filter(d => d.zstd_level === level); 790 + rows.sort((a, b) => allLabels.indexOf(dictLabel(a)) - allLabels.indexOf(dictLabel(b))); 791 + 792 + traces.push({ 793 + x: rows.map(dictLabel), 794 + y: rows.map(gib), 795 + name: levelName(level), 796 + mode: 'lines+markers', 797 + line: { color: levelColors[level] || '#888' }, 798 + marker: { size: 7 }, 799 + text: rows.map(d => `${d.compact_secs}s`), 800 + hovertemplate: `32k, ${levelName(level)}, dict=%{x}<br>%{y:.2f} GiB<br>%{text}<extra></extra>`, 801 + }); 802 + } 803 + 804 + Plotly.newPlot('chart-32k', traces, plotLayout({ 805 + xaxis: { title: 'Dictionary Config (size/training)', type: 'category', 806 + categoryorder: 'array', categoryarray: allLabels }, 807 + yaxis: { title: 'Size (GiB)' }, 808 + margin: { t: 30, b: 70, l: 60, r: 20 }, 809 + })); 810 + } 811 + 716 812 // ── chart 4: restart interval ───────────────────────────────────── 717 813 function chartRestart() { 718 814 const traces = []; ··· 914 1010 chartDict(); 915 1011 chartDictBenefit(); 916 1012 chartDictCost(); 1013 + chart32k(); 917 1014 chartRestart(); 918 1015 chartSubcomp(); 919 1016 setupFilters();
+20 -10
space-efficiency-check/src/main.rs
··· 31 31 /// sample fraction of DIDs to import (0.0–1.0, default: all) 32 32 #[arg(long)] 33 33 sample: Option<f64>, 34 + 35 + /// store sequential u64s in place of dids 36 + #[arg(long, action)] 37 + intern: bool, 34 38 } 35 39 36 40 fn open_db(path: &Path) -> Result<DB, rocksdb::Error> { 41 + let block_size = 32_768; 42 + let zstd_level = 6; 43 + let restart_interval = 8; // oops sweep base is wrong here (use 16 default) 44 + let dict_bytes = 16_384; 45 + let train_mult = 64; 46 + let opt_filters = true; 47 + let subcompactions = 10; 48 + 37 49 let mut opts = Options::default(); 38 50 opts.create_if_missing(true); 39 51 40 52 // compress lower levels more 41 53 opts.set_compression_type(DBCompressionType::Zstd); 42 54 43 - const D: i32 = 0; 44 - 45 55 opts.set_compression_options( 46 56 -14, // window_bits (default, -14 means use zstd default) 47 - 3, // level (default 3) 57 + zstd_level, // level (default 3) 48 58 0, // strategy (0 = default) 49 - D, // max_dict_bytes (0 = no dictionary) (tried 16k) 59 + dict_bytes, // max_dict_bytes (0 = no dictionary) (tried 16k) 50 60 ); 51 - // opts.set_zstd_max_train_bytes(16 * D); 61 + opts.set_zstd_max_train_bytes(dict_bytes * train_mult); 52 62 53 63 // skip filters on bottom-most layer 54 64 // TODO: verify how much this does 55 - opts.set_optimize_filters_for_hits(true); 65 + opts.set_optimize_filters_for_hits(opt_filters); 56 66 57 67 // keep this for now while running on the full set since it speeds things up a lot 58 68 // but probably drop it if we sample SSTs and don't need that? 59 69 // (or maybe actually measure how much it affects total compression ratio? expecting minimal) 60 - opts.set_max_subcompactions(10); // default 1, probably do 3–4 in prod 70 + opts.set_max_subcompactions(subcompactions); // default 1, probably do 3–4 in prod 61 71 62 72 // larger blocks for better compression (good scan, worse point read) 63 73 let mut block_opts = rocksdb::BlockBasedOptions::default(); 64 - block_opts.set_block_size(32 * 2_usize.pow(10)); // N * KB blocks 65 - block_opts.set_block_restart_interval(8); 74 + block_opts.set_block_size(block_size); // N * KB blocks 75 + block_opts.set_block_restart_interval(restart_interval); 66 76 opts.set_block_based_table_factory(&block_opts); 67 77 68 78 DB::open(&opts, path) ··· 81 91 return Ok(()) 82 92 } 83 93 84 - let stats = run_workers(&args.car_dir, db.clone(), args.workers, args.mem_limit_mb, args.sample).await?; 94 + let stats = run_workers(&args.car_dir, db.clone(), args.workers, args.mem_limit_mb, args.sample, args.intern).await?; 85 95 86 96 let repos = stats.repos.load(Ordering::Relaxed); 87 97 let empty = stats.empty_repos.load(Ordering::Relaxed);
+25 -12
space-efficiency-check/src/work.rs
··· 47 47 workers: usize, 48 48 mem_limit_mb: usize, 49 49 sample: Option<f64>, 50 + intern: bool, 50 51 ) -> Result<Stats, ProcessError> { 51 52 let stats = Arc::new(Stats::default()); 52 53 let (tx, rx) = async_channel::bounded(1024); 53 54 let start = Instant::now(); 55 + 56 + let intern = intern.then_some(Arc::new(AtomicU64::new(0))); 54 57 55 58 let mut set = JoinSet::new(); 56 59 for _ in 0..workers { 57 60 let rx = rx.clone(); 58 61 let db = db.clone(); 59 62 let stats = stats.clone(); 60 - set.spawn(worker(rx, db, stats, mem_limit_mb, sample)); 63 + let intern = intern.clone(); 64 + set.spawn(worker(rx, db, stats, mem_limit_mb, intern)); 61 65 } 62 66 63 67 let mut file_count = 0; ··· 67 71 if !meta.is_file() { 68 72 continue; 69 73 } 74 + 75 + // skip this DID if it doesn't fall within the sample fraction 76 + if let Some(frac) = sample { 77 + if rand::random::<f64>() >= frac { 78 + continue 79 + } 80 + } 81 + 70 82 stats.car_bytes.fetch_add(meta.len(), Ordering::Relaxed); 71 83 file_count += 1; 72 84 tx.send(entry.path()) ··· 97 109 db: Arc<DB>, 98 110 stats: Arc<Stats>, 99 111 mem_limit_mb: usize, 100 - sample: Option<f64>, 112 + intern: Option<Arc<AtomicU64>>, 101 113 ) { 102 114 while let Ok(path) = rx.recv().await { 103 - let p = process_car(&path, mem_limit_mb, db.clone(), &stats, sample); 115 + let p = process_car(&path, mem_limit_mb, db.clone(), &stats, intern.clone()); 104 116 match tokio::time::timeout(Duration::from_secs(30), p).await { 105 117 Ok(Ok(_)) => {} 106 118 Ok(Err(e)) => { ··· 120 132 mem_limit_mb: usize, 121 133 db: Arc<DB>, 122 134 stats: &Stats, 123 - sample: Option<f64>, 135 + intern: Option<Arc<AtomicU64>>, 124 136 ) -> Result<(), ProcessError> { 125 137 let file = tokio::fs::File::open(path).await?; 126 138 let reader = BufReader::new(file); ··· 135 147 let did = car.commit.did.clone(); 136 148 let cid = car.commit.data.to_bytes(); 137 149 138 - // skip this DID if it doesn't fall within the sample fraction 139 - if let Some(frac) = sample { 140 - if rand::random::<f64>() >= frac { 141 - return Ok(()); 142 - } 143 - } 150 + let mut prefix = if let Some(intern) = intern { 151 + let id = intern.fetch_add(1, Ordering::Relaxed); 152 + id.to_be_bytes().to_vec() 153 + } else { 154 + did.as_bytes().to_vec() 155 + }; 144 156 145 157 if false { 158 + // for now we're not writing account keys 146 159 let mut account_key = Vec::with_capacity(1 + did.len()); 147 160 account_key.push(PREFIX_ACCOUNT); 148 161 account_key.extend_from_slice(&did.as_bytes()); ··· 166 179 167 180 let mut batch = WriteBatch::default(); 168 181 for output in &chunk { 169 - let key = format!("{did}/{}", output.key); 170 - batch.put(key.as_bytes(), &output.data); 182 + prefix.extend_from_slice(output.key.as_bytes()); 183 + batch.put(&prefix, &output.data); 171 184 } 172 185 let db = db.clone(); 173 186 tokio::task::spawn_blocking(move || {
+37
space-efficiency-check/sweep-results.csv
··· 150 150 65536,3,8,65536,16,true,10,17718370930,16897,51.0 151 151 65536,3,8,262144,1,true,10,17676607265,16857,27.5 152 152 65536,3,8,262144,4,true,10,17806456807,16981,65.2 153 + 32768,1,8,16384,,true,10,18145390797,17304,28.8 154 + 32768,1,8,16384,64,true,10,18162942166,17321,28.7 155 + 32768,1,8,65536,,true,10,18113630667,17274,25.4 156 + 32768,1,8,65536,4,true,10,18252364775,17406,25.5 157 + 32768,1,8,65536,16,true,10,18191268324,17348,27.3 158 + 32768,1,8,262144,,true,10,18433074184,17579,25.8 159 + 32768,1,8,262144,1,true,10,18941076049,18063,30.4 160 + 32768,1,8,262144,4,true,10,18630362351,17767,32.4 161 + 32768,3,8,16384,,true,10,18092158457,17254,43.3 162 + 32768,3,8,65536,,true,10,18175762988,17333,50.1 163 + 32768,3,8,262144,,true,10,18237799128,17392,58.5 164 + 32768,6,8,16384,,true,10,17815254359,16989,67.7 165 + 32768,6,8,16384,64,true,10,17700896109,16880,71.0 166 + 32768,6,8,65536,,true,10,17903486595,17074,74.4 167 + 32768,6,8,65536,4,true,10,17835381202,17009,75.6 168 + 32768,6,8,65536,16,true,10,17792150430,16967,79.5 169 + 32768,6,8,262144,,true,10,17617765307,16801,82.0 170 + 32768,6,8,262144,1,true,10,17722853142,16901,85.7 171 + 32768,6,8,262144,4,true,10,17519172262,16707,89.4 172 + 32768,9,8,16384,,true,10,17542440248,16729,93.6 173 + 32768,9,8,16384,64,true,10,17402812286,16596,96.1 174 + 32768,9,8,65536,,true,10,17621780999,16805,103.5 175 + 32768,9,8,65536,4,true,10,17560386951,16746,106.3 176 + 32768,9,8,65536,16,true,10,17513473140,16702,110.7 177 + 32768,9,8,262144,,true,10,17288349711,16487,114.8 178 + 32768,9,8,262144,1,true,10,17736615600,16914,134.2 179 + 32768,9,8,262144,4,true,10,17193958194,16397,126.5 180 + 32768,6,8,16384,128,true,10,17685606257,16866,73.3 181 + 32768,6,8,32768,,true,10,17851433056,17024,69.4 182 + 32768,6,8,32768,16,true,10,17756182902,16933,72.2 183 + 32768,6,8,32768,64,true,10,17701788554,16881,77.0 184 + 32768,6,8,32768,128,true,10,17677204659,16858,80.8 185 + 32768,9,8,16384,128,true,10,17377098336,16572,100.3 186 + 32768,9,8,32768,,true,10,17573743202,16759,97.5 187 + 32768,9,8,32768,16,true,10,17464617794,16655,101.3 188 + 32768,9,8,32768,64,true,10,17397585233,16591,109.2 189 + 32768,9,8,32768,128,true,10,17363679655,16559,114.3