an efficient binary archive format
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

fix benchmarks

zach 6b33d888 fa52fab4

+120 -57
+3 -3
Cargo.toml
··· 14 14 [dependencies] 15 15 crc32fast = "1.5.0" 16 16 memmap2 = "0.9.9" 17 - zerocopy = { version = "0.8.38", features = ["std", "derive"] } 18 - zstd = "0.13.3" 17 + zerocopy = { version = "0.8", features = ["std", "derive"] } 18 + zstd = "0.13" 19 19 clap = { version = "4.5", features = ["derive"], optional = true } 20 - fs2 = "0.4.3" 20 + fs2 = "0.4" 21 21 22 22 [features] 23 23 default = ["cli"]
+74 -34
bench/bench.py
··· 64 64 ) 65 65 66 66 67 - def benchmark_bindle_uncompressed(src_dir: Path, archive_path: Path) -> tuple[float, int, float]: 67 + def benchmark_bindle_uncompressed(bindle_bin: Path, src_dir: Path, archive_path: Path) -> tuple[float, int, float]: 68 68 """Benchmark bindle without compression.""" 69 69 # Pack 70 70 start = time.perf_counter() 71 71 subprocess.run( 72 - ["cargo", "run", "--release", "--", "pack", str(archive_path), str(src_dir)], 73 - cwd=Path(__file__).parent.parent, 74 - capture_output=True, 72 + [str(bindle_bin), "pack", str(archive_path), str(src_dir)], 73 + stdout=subprocess.DEVNULL, 74 + stderr=subprocess.DEVNULL, 75 75 check=True, 76 76 ) 77 77 pack_time = time.perf_counter() - start ··· 79 79 size = archive_path.stat().st_size 80 80 81 81 # Unpack 82 - extract_dir = archive_path.parent / "extract_bindle_none" 82 + extract_dir = archive_path.parent / f"extract_{archive_path.stem}" 83 83 extract_dir.mkdir(exist_ok=True) 84 84 start = time.perf_counter() 85 85 subprocess.run( 86 - ["cargo", "run", "--release", "--", "unpack", str(archive_path), str(extract_dir)], 87 - cwd=Path(__file__).parent.parent, 88 - capture_output=True, 86 + [str(bindle_bin), "unpack", str(archive_path), str(extract_dir)], 87 + stdout=subprocess.DEVNULL, 88 + stderr=subprocess.DEVNULL, 89 89 check=True, 90 90 ) 91 91 unpack_time = time.perf_counter() - start ··· 93 93 return pack_time, size, unpack_time 94 94 95 95 96 - def benchmark_bindle_compressed(src_dir: Path, archive_path: Path) -> tuple[float, int, float]: 96 + def benchmark_bindle_compressed(bindle_bin: Path, src_dir: Path, archive_path: Path) -> tuple[float, int, float]: 97 97 """Benchmark bindle with zstd compression.""" 98 98 # Pack 99 99 start = time.perf_counter() 100 100 subprocess.run( 101 - ["cargo", "run", "--release", "--", "pack", str(archive_path), str(src_dir), "--compress"], 102 - cwd=Path(__file__).parent.parent, 103 - capture_output=True, 101 + [str(bindle_bin), "pack", str(archive_path), str(src_dir), "--compress"], 102 + stdout=subprocess.DEVNULL, 103 + stderr=subprocess.DEVNULL, 104 104 check=True, 105 105 ) 106 106 pack_time = time.perf_counter() - start ··· 108 108 size = archive_path.stat().st_size 109 109 110 110 # Unpack 111 - extract_dir = archive_path.parent / "extract_bindle_zstd" 111 + extract_dir = archive_path.parent / f"extract_{archive_path.stem}" 112 112 extract_dir.mkdir(exist_ok=True) 113 113 start = time.perf_counter() 114 114 subprocess.run( 115 - ["cargo", "run", "--release", "--", "unpack", str(archive_path), str(extract_dir)], 116 - cwd=Path(__file__).parent.parent, 117 - capture_output=True, 115 + [str(bindle_bin), "unpack", str(archive_path), str(extract_dir)], 116 + stdout=subprocess.DEVNULL, 117 + stderr=subprocess.DEVNULL, 118 118 check=True, 119 119 ) 120 120 unpack_time = time.perf_counter() - start ··· 136 136 size = archive_path.stat().st_size 137 137 138 138 # Extract 139 - extract_dir = archive_path.parent / "extract_tar" 139 + extract_dir = archive_path.parent / f"extract_{archive_path.stem}" 140 140 extract_dir.mkdir(exist_ok=True) 141 141 start = time.perf_counter() 142 142 subprocess.run( ··· 163 163 size = archive_path.stat().st_size 164 164 165 165 # Extract 166 - extract_dir = archive_path.parent / "extract_tar_gz" 166 + extract_dir = archive_path.parent / f"extract_{archive_path.stem}" 167 167 extract_dir.mkdir(exist_ok=True) 168 168 start = time.perf_counter() 169 169 subprocess.run( ··· 190 190 size = archive_path.stat().st_size 191 191 192 192 # Extract 193 - extract_dir = archive_path.parent / "extract_zip" 193 + extract_dir = archive_path.parent / f"extract_{archive_path.stem}" 194 194 extract_dir.mkdir(exist_ok=True) 195 195 start = time.perf_counter() 196 196 subprocess.run( 197 - ["unzip", "-q", str(archive_path), "-d", str(extract_dir)], 197 + ["unzip", "-o", "-q", str(archive_path), "-d", str(extract_dir)], 198 198 capture_output=True, 199 199 check=True, 200 200 ) ··· 204 204 205 205 206 206 def main(): 207 + project_root = Path(__file__).parent.parent 208 + 207 209 print("Building bindle in release mode...") 208 210 subprocess.run( 209 211 ["cargo", "build", "--release", "--features", "cli"], 210 - cwd=Path(__file__).parent.parent, 212 + cwd=project_root, 211 213 capture_output=True, 212 214 check=True, 213 215 ) 214 216 217 + # Get the built binary path 218 + bindle_bin = project_root / "target" / "release" / "bindle" 219 + if not bindle_bin.exists(): 220 + raise FileNotFoundError(f"Built binary not found at {bindle_bin}") 221 + 215 222 with tempfile.TemporaryDirectory() as tmpdir: 216 223 tmpdir = Path(tmpdir) 217 224 225 + # Ensure directories exist and warm up filesystem 226 + test_data = tmpdir / "test_data" 227 + test_data.mkdir(parents=True, exist_ok=True) 228 + 229 + # Warm up: write and delete a small file to initialize filesystem 230 + warmup_file = tmpdir / "warmup" 231 + warmup_file.write_bytes(b"warmup" * 1000) 232 + warmup_file.unlink() 233 + 218 234 # Create test data 219 235 print("Creating test dataset...") 220 - test_data = tmpdir / "test_data" 221 236 create_test_data(test_data) 222 237 238 + # Warm up: read all test files to initialize filesystem caches 239 + for f in test_data.rglob("*"): 240 + if f.is_file(): 241 + _ = f.read_bytes() 242 + 223 243 # Calculate total size 224 244 total_size = sum(f.stat().st_size for f in test_data.rglob("*") if f.is_file()) 225 245 file_count = len(list(test_data.rglob("*"))) ··· 227 247 print(f"Test dataset: {file_count} files, {format_size(total_size)}\n") 228 248 229 249 benchmarks = [ 230 - ("bindle (uncompressed)", lambda: benchmark_bindle_uncompressed( 231 - test_data, tmpdir / "test.bndl" 250 + ("bindle (uncompressed)", lambda run: benchmark_bindle_uncompressed( 251 + bindle_bin, test_data, tmpdir / f"test_{run}.bndl" 232 252 )), 233 - ("bindle (zstd)", lambda: benchmark_bindle_compressed( 234 - test_data, tmpdir / "test_zstd.bndl" 253 + ("bindle (zstd)", lambda run: benchmark_bindle_compressed( 254 + bindle_bin, test_data, tmpdir / f"test_zstd_{run}.bndl" 235 255 )), 236 - ("tar", lambda: benchmark_tar( 237 - test_data, tmpdir / "test.tar" 256 + ("tar", lambda run: benchmark_tar( 257 + test_data, tmpdir / f"test_{run}.tar" 238 258 )), 239 - ("tar.gz", lambda: benchmark_tar_gz( 240 - test_data, tmpdir / "test.tar.gz" 259 + ("tar.gz", lambda run: benchmark_tar_gz( 260 + test_data, tmpdir / f"test_{run}.tar.gz" 241 261 )), 242 - ("zip", lambda: benchmark_zip( 243 - test_data, tmpdir / "test.zip" 262 + ("zip", lambda run: benchmark_zip( 263 + test_data, tmpdir / f"test_{run}.zip" 244 264 )), 245 265 ] 246 266 247 267 results = [] 268 + num_runs = 4 # Run each test 4 times, discard first, average remaining 3 269 + 248 270 for name, bench_fn in benchmarks: 249 271 print(f"Benchmarking {name}...", flush=True) 250 272 try: 251 - pack_time, size, unpack_time = bench_fn() 252 - results.append((name, pack_time, size, unpack_time)) 273 + pack_times = [] 274 + unpack_times = [] 275 + size = 0 276 + 277 + for run in range(num_runs): 278 + pack_time, run_size, unpack_time = bench_fn(run) 279 + pack_times.append(pack_time) 280 + unpack_times.append(unpack_time) 281 + size = run_size 282 + 283 + # Discard first run, average the rest 284 + avg_pack = sum(pack_times[1:]) / (num_runs - 1) 285 + avg_unpack = sum(unpack_times[1:]) / (num_runs - 1) 286 + 287 + results.append((name, avg_pack, size, avg_unpack)) 288 + except subprocess.CalledProcessError as e: 289 + print(f" ERROR: Command failed with exit code {e.returncode}") 290 + if e.stderr: 291 + print(f" stderr: {e.stderr.decode()}") 292 + results.append((name, 0, 0, 0)) 253 293 except Exception as e: 254 294 print(f" ERROR: {e}") 255 295 results.append((name, 0, 0, 0))
+21 -7
src/bindle.rs
··· 448 448 /// Creates subdirectories as needed to match the stored paths. 449 449 pub fn unpack<P: AsRef<Path>>(&self, dest: P) -> io::Result<()> { 450 450 let dest_path = dest.as_ref(); 451 - if let Some(parent) = dest_path.parent() { 452 - std::fs::create_dir_all(parent)?; 451 + std::fs::create_dir_all(dest_path)?; 452 + 453 + // Collect all unique parent directories 454 + let mut dirs = std::collections::HashSet::new(); 455 + for (name, _) in &self.index { 456 + if let Some(parent) = Path::new(name).parent() { 457 + // Only add non-empty parent paths 458 + if parent != Path::new("") { 459 + dirs.insert(dest_path.join(parent)); 460 + } 461 + } 462 + } 463 + 464 + // Create all directories upfront (sorted for parent-first order) 465 + if !dirs.is_empty() { 466 + let mut dirs: Vec<_> = dirs.into_iter().collect(); 467 + dirs.sort(); 468 + for dir in dirs { 469 + std::fs::create_dir_all(&dir)?; 470 + } 453 471 } 454 472 455 473 // Sort entries by physical offset for sequential reads (better cache locality) 456 474 let mut entries: Vec<_> = self.index.iter().collect(); 457 475 entries.sort_by_key(|(_, entry)| entry.offset()); 458 476 477 + // Extract files without per-file directory checks 459 478 for (name, _) in entries { 460 479 let file_path = dest_path.join(name); 461 - if let Some(parent) = file_path.parent() { 462 - std::fs::create_dir_all(parent)?; 463 - } 464 - // Use streaming I/O instead of loading entire file into memory 465 480 let mut reader = self.reader(name)?; 466 481 let mut file = File::create(&file_path)?; 467 482 io::copy(&mut reader, &mut file)?; ··· 482 497 } 483 498 let compress = self.should_auto_compress(compress, 0); 484 499 let start_offset = self.data_end; 485 - // Only clone file handle if needed for compression 486 500 let encoder = if compress { 487 501 let f = self.file.try_clone()?; 488 502 Some(zstd::Encoder::new(f, 3)?)
+22 -13
src/writer.rs
··· 58 58 self.uncompressed_size += data.len() as u64; 59 59 self.crc32_hasher.update(data); 60 60 61 - if let Some(encoder) = &mut self.encoder { 62 - encoder.write_all(data)?; 63 - } else { 64 - self.bindle.file.write_all(data)?; 61 + match &mut self.encoder { 62 + Some(encoder) => { 63 + // Compressed: write to zstd encoder 64 + encoder.write_all(data)?; 65 + } 66 + None => { 67 + // Uncompressed: write directly to file 68 + self.bindle.file.write_all(data)?; 69 + } 65 70 } 66 71 67 72 Ok(()) ··· 72 77 return Ok(()); 73 78 } 74 79 75 - let (compression_type, current_pos) = if let Some(encoder) = self.encoder.take() { 76 - let mut f = encoder.finish()?; 77 - let pos = f.stream_position()?; 78 - // Sync the main file handle to match the encoder's position 79 - self.bindle.file.seek(SeekFrom::Start(pos))?; 80 - (1, pos) 81 - } else { 82 - let pos = self.bindle.file.stream_position()?; 83 - (0, pos) 80 + let (compression_type, current_pos) = match self.encoder.take() { 81 + Some(encoder) => { 82 + // Compressed: finish encoder and sync position 83 + let mut f = encoder.finish()?; 84 + let pos = f.stream_position()?; 85 + self.bindle.file.seek(SeekFrom::Start(pos))?; 86 + (1, pos) 87 + } 88 + None => { 89 + // Uncompressed: already wrote directly to file, just get position 90 + let pos = self.bindle.file.stream_position()?; 91 + (0, pos) 92 + } 84 93 }; 85 94 86 95 let compressed_size = current_pos - self.start_offset;