an efficient binary archive format
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

wip

zach fa52fab4 3d7a0332

+381 -58
+9 -1
README.md
··· 29 29 30 30 ## C API 31 31 32 - The library includes C bindings for use from other languages: 32 + The library includes C bindings: 33 33 34 34 ```c 35 35 #include "bindle.h" ··· 47 47 free(data); 48 48 bindle_close(bindle); 49 49 ``` 50 + 51 + Run: 52 + 53 + ```sh 54 + make build 55 + ``` 56 + 57 + This builds `libbindle` and copies it into the root of the repository. 50 58 51 59 ## CLI 52 60
+277
bench/bench.py
··· 1 + #!/usr/bin/env -S uv run 2 + # /// script 3 + # requires-python = ">=3.11" 4 + # dependencies = [] 5 + # /// 6 + """ 7 + Benchmark comparing bindle vs tar/tar.gz/zip for archive operations. 8 + 9 + Measures: 10 + - Archive creation time 11 + - Archive size 12 + - Extraction/read time 13 + """ 14 + 15 + import subprocess 16 + import tempfile 17 + import time 18 + from pathlib import Path 19 + 20 + 21 + def format_size(bytes: int) -> str: 22 + """Format bytes as human readable string.""" 23 + for unit in ["B", "KB", "MB", "GB"]: 24 + if bytes < 1024: 25 + return f"{bytes:.1f} {unit}" 26 + bytes //= 1024 27 + return f"{bytes:.1f} TB" 28 + 29 + 30 + def format_time(seconds: float) -> str: 31 + """Format seconds as human readable string.""" 32 + if seconds < 0.001: 33 + return f"{seconds * 1_000_000:.1f} µs" 34 + elif seconds < 1: 35 + return f"{seconds * 1000:.1f} ms" 36 + else: 37 + return f"{seconds:.3f} s" 38 + 39 + 40 + def create_test_data(base_dir: Path) -> None: 41 + """Create a variety of test files.""" 42 + base_dir.mkdir(parents=True, exist_ok=True) 43 + 44 + # Small text files (highly compressible) 45 + for i in range(100): 46 + (base_dir / f"text_{i}.txt").write_text( 47 + f"This is test file {i}\n" * 100 48 + ) 49 + 50 + # Medium files with repetitive data 51 + for i in range(10): 52 + (base_dir / f"medium_{i}.dat").write_bytes( 53 + bytes([i % 256] * 100_000) 54 + ) 55 + 56 + # Large file with repetitive data 57 + (base_dir / "large.dat").write_bytes(b"X" * 10_000_000) 58 + 59 + # Binary-like data (less compressible) 60 + import random 61 + random.seed(42) 62 + (base_dir / "random.bin").write_bytes( 63 + bytes(random.randint(0, 255) for _ in range(1_000_000)) 64 + ) 65 + 66 + 67 + def benchmark_bindle_uncompressed(src_dir: Path, archive_path: Path) -> tuple[float, int, float]: 68 + """Benchmark bindle without compression.""" 69 + # Pack 70 + start = time.perf_counter() 71 + subprocess.run( 72 + ["cargo", "run", "--release", "--", "pack", 
str(archive_path), str(src_dir)], 73 + cwd=Path(__file__).parent.parent, 74 + capture_output=True, 75 + check=True, 76 + ) 77 + pack_time = time.perf_counter() - start 78 + 79 + size = archive_path.stat().st_size 80 + 81 + # Unpack 82 + extract_dir = archive_path.parent / "extract_bindle_none" 83 + extract_dir.mkdir(exist_ok=True) 84 + start = time.perf_counter() 85 + subprocess.run( 86 + ["cargo", "run", "--release", "--", "unpack", str(archive_path), str(extract_dir)], 87 + cwd=Path(__file__).parent.parent, 88 + capture_output=True, 89 + check=True, 90 + ) 91 + unpack_time = time.perf_counter() - start 92 + 93 + return pack_time, size, unpack_time 94 + 95 + 96 + def benchmark_bindle_compressed(src_dir: Path, archive_path: Path) -> tuple[float, int, float]: 97 + """Benchmark bindle with zstd compression.""" 98 + # Pack 99 + start = time.perf_counter() 100 + subprocess.run( 101 + ["cargo", "run", "--release", "--", "pack", str(archive_path), str(src_dir), "--compress"], 102 + cwd=Path(__file__).parent.parent, 103 + capture_output=True, 104 + check=True, 105 + ) 106 + pack_time = time.perf_counter() - start 107 + 108 + size = archive_path.stat().st_size 109 + 110 + # Unpack 111 + extract_dir = archive_path.parent / "extract_bindle_zstd" 112 + extract_dir.mkdir(exist_ok=True) 113 + start = time.perf_counter() 114 + subprocess.run( 115 + ["cargo", "run", "--release", "--", "unpack", str(archive_path), str(extract_dir)], 116 + cwd=Path(__file__).parent.parent, 117 + capture_output=True, 118 + check=True, 119 + ) 120 + unpack_time = time.perf_counter() - start 121 + 122 + return pack_time, size, unpack_time 123 + 124 + 125 + def benchmark_tar(src_dir: Path, archive_path: Path) -> tuple[float, int, float]: 126 + """Benchmark tar (uncompressed) using CLI.""" 127 + # Create 128 + start = time.perf_counter() 129 + subprocess.run( 130 + ["tar", "-cf", str(archive_path), "-C", str(src_dir), "."], 131 + capture_output=True, 132 + check=True, 133 + ) 134 + pack_time = 
time.perf_counter() - start 135 + 136 + size = archive_path.stat().st_size 137 + 138 + # Extract 139 + extract_dir = archive_path.parent / "extract_tar" 140 + extract_dir.mkdir(exist_ok=True) 141 + start = time.perf_counter() 142 + subprocess.run( 143 + ["tar", "-xf", str(archive_path), "-C", str(extract_dir)], 144 + capture_output=True, 145 + check=True, 146 + ) 147 + unpack_time = time.perf_counter() - start 148 + 149 + return pack_time, size, unpack_time 150 + 151 + 152 + def benchmark_tar_gz(src_dir: Path, archive_path: Path) -> tuple[float, int, float]: 153 + """Benchmark tar.gz using CLI.""" 154 + # Create 155 + start = time.perf_counter() 156 + subprocess.run( 157 + ["tar", "-czf", str(archive_path), "-C", str(src_dir), "."], 158 + capture_output=True, 159 + check=True, 160 + ) 161 + pack_time = time.perf_counter() - start 162 + 163 + size = archive_path.stat().st_size 164 + 165 + # Extract 166 + extract_dir = archive_path.parent / "extract_tar_gz" 167 + extract_dir.mkdir(exist_ok=True) 168 + start = time.perf_counter() 169 + subprocess.run( 170 + ["tar", "-xzf", str(archive_path), "-C", str(extract_dir)], 171 + capture_output=True, 172 + check=True, 173 + ) 174 + unpack_time = time.perf_counter() - start 175 + 176 + return pack_time, size, unpack_time 177 + 178 + 179 + def benchmark_zip(src_dir: Path, archive_path: Path) -> tuple[float, int, float]: 180 + """Benchmark zip using CLI.""" 181 + # Create - zip requires being in the directory or using find 182 + start = time.perf_counter() 183 + subprocess.run( 184 + ["sh", "-c", f"cd {src_dir} && zip -r -q {archive_path} ."], 185 + capture_output=True, 186 + check=True, 187 + ) 188 + pack_time = time.perf_counter() - start 189 + 190 + size = archive_path.stat().st_size 191 + 192 + # Extract 193 + extract_dir = archive_path.parent / "extract_zip" 194 + extract_dir.mkdir(exist_ok=True) 195 + start = time.perf_counter() 196 + subprocess.run( 197 + ["unzip", "-q", str(archive_path), "-d", str(extract_dir)], 198 + 
capture_output=True, 199 + check=True, 200 + ) 201 + unpack_time = time.perf_counter() - start 202 + 203 + return pack_time, size, unpack_time 204 + 205 + 206 + def main(): 207 + print("Building bindle in release mode...") 208 + subprocess.run( 209 + ["cargo", "build", "--release", "--features", "cli"], 210 + cwd=Path(__file__).parent.parent, 211 + capture_output=True, 212 + check=True, 213 + ) 214 + 215 + with tempfile.TemporaryDirectory() as tmpdir: 216 + tmpdir = Path(tmpdir) 217 + 218 + # Create test data 219 + print("Creating test dataset...") 220 + test_data = tmpdir / "test_data" 221 + create_test_data(test_data) 222 + 223 + # Calculate total size 224 + total_size = sum(f.stat().st_size for f in test_data.rglob("*") if f.is_file()) 225 + file_count = len(list(test_data.rglob("*"))) 226 + 227 + print(f"Test dataset: {file_count} files, {format_size(total_size)}\n") 228 + 229 + benchmarks = [ 230 + ("bindle (uncompressed)", lambda: benchmark_bindle_uncompressed( 231 + test_data, tmpdir / "test.bndl" 232 + )), 233 + ("bindle (zstd)", lambda: benchmark_bindle_compressed( 234 + test_data, tmpdir / "test_zstd.bndl" 235 + )), 236 + ("tar", lambda: benchmark_tar( 237 + test_data, tmpdir / "test.tar" 238 + )), 239 + ("tar.gz", lambda: benchmark_tar_gz( 240 + test_data, tmpdir / "test.tar.gz" 241 + )), 242 + ("zip", lambda: benchmark_zip( 243 + test_data, tmpdir / "test.zip" 244 + )), 245 + ] 246 + 247 + results = [] 248 + for name, bench_fn in benchmarks: 249 + print(f"Benchmarking {name}...", flush=True) 250 + try: 251 + pack_time, size, unpack_time = bench_fn() 252 + results.append((name, pack_time, size, unpack_time)) 253 + except Exception as e: 254 + print(f" ERROR: {e}") 255 + results.append((name, 0, 0, 0)) 256 + 257 + # Print results 258 + print("\n" + "=" * 90) 259 + print(f"{'Format':<22} {'Pack Time':<15} {'Size':<15} {'Unpack Time':<15} {'Ratio':>10}") 260 + print("=" * 90) 261 + 262 + for name, pack_time, size, unpack_time in results: 263 + if size > 0: 
264 + ratio = (size / total_size) * 100 265 + print( 266 + f"{name:<22} {format_time(pack_time):<15} " 267 + f"{format_size(size):<15} {format_time(unpack_time):<15} " 268 + f"{ratio:>9.1f}%" 269 + ) 270 + else: 271 + print(f"{name:<22} {'FAILED'}") 272 + 273 + print("=" * 90) 274 + 275 + 276 + if __name__ == "__main__": 277 + main()
+48 -27
src/bin/bindle.rs
··· 1 1 use clap::{Parser, Subcommand}; 2 - use std::io::{self, Write}; 2 + use std::io::{self}; 3 3 use std::path::PathBuf; 4 4 use std::process; 5 5 ··· 32 32 33 33 /// Name of the entry inside the archive 34 34 name: String, 35 - /// Path to the local file to read from 36 - file_path: PathBuf, 35 + /// Path to the local file to read from (reads from stdin if omitted) 36 + file_path: Option<PathBuf>, 37 37 /// Use zstd compression 38 38 #[arg(short, long)] 39 39 compress: bool, 40 + /// Pass data directly as an argument 41 + #[arg(short, long, conflicts_with = "file_path")] 42 + data: Option<String>, 40 43 /// Run vacuum after adding 41 44 #[arg(long)] 42 45 vacuum: bool, ··· 118 121 } 119 122 }; 120 123 124 + let init_load = |path: PathBuf| match Bindle::load(&path) { 125 + Ok(bindle) => bindle, 126 + Err(e) => { 127 + eprintln!("ERROR unable to open {}: {}", path.display(), e); 128 + process::exit(1); 129 + } 130 + }; 131 + 121 132 match command { 122 133 Commands::List { bindle_file } => { 123 134 println!( ··· 128 139 if !bindle_file.exists() { 129 140 return Ok(()); 130 141 } 131 - let b = init(bindle_file); 142 + let b = init_load(bindle_file); 132 143 133 144 for (name, entry) in b.index().iter() { 134 145 let size = entry.uncompressed_size(); ··· 147 158 Commands::Add { 148 159 name, 149 160 file_path, 161 + data: data_arg, 150 162 compress, 151 163 bindle_file, 152 164 vacuum, 153 165 } => { 154 166 let mut b = init(bindle_file.clone()); 155 - let data = std::fs::read(&file_path)?; 167 + let compress_mode = if compress { 168 + Compress::Zstd 169 + } else { 170 + Compress::None 171 + }; 172 + 173 + // Determine data source and method: --data flag, file path, or stdin 174 + let size = if let Some(d) = data_arg { 175 + // Direct data from argument 176 + let bytes = d.into_bytes(); 177 + let len = bytes.len(); 178 + b.add(&name, &bytes, compress_mode)?; 179 + len 180 + } else if let Some(path) = file_path { 181 + // Use add_file to avoid loading entire file 
into memory 182 + b.add_file(&name, &path, compress_mode)?; 183 + std::fs::metadata(&path)?.len() as usize 184 + } else { 185 + // Stream from stdin using writer 186 + let mut writer = b.writer(&name, compress_mode)?; 187 + let size = io::copy(&mut io::stdin(), &mut writer)?; 188 + writer.close()?; 189 + size as usize 190 + }; 156 191 157 - b.add( 158 - &name, 159 - &data, 160 - if compress { 161 - Compress::Zstd 162 - } else { 163 - Compress::None 164 - }, 165 - )?; 166 192 println!( 167 193 "ADD '{}' -> {} ({} bytes)", 168 194 name, 169 195 bindle_file.display(), 170 - data.len() 196 + size 171 197 ); 172 198 b.save()?; 173 199 ··· 180 206 } 181 207 182 208 Commands::Cat { name, bindle_file } => { 183 - let b = init(bindle_file.clone()); 184 - match b.read(&name) { 185 - Some(data) => { 186 - io::stdout().write_all(&data)?; 187 - } 188 - None => { 189 - return Err(io::Error::new( 190 - io::ErrorKind::NotFound, 191 - format!("ERROR '{}' not found in {}", name, bindle_file.display()), 192 - )); 209 + let b = init_load(bindle_file.clone()); 210 + match b.read_to(name.as_str(), io::stdout()) { 211 + Ok(_n) => {} 212 + Err(e) => { 213 + return Err(io::Error::new(io::ErrorKind::NotFound, e)); 193 214 } 194 215 } 195 216 } ··· 253 274 dest_dir, 254 275 } => { 255 276 println!("UNPACK {} -> {}", bindle_file.display(), dest_dir.display()); 256 - let b = init(bindle_file); 277 + let b = init_load(bindle_file); 257 278 b.unpack(dest_dir)?; 258 279 println!("OK"); 259 280 } 260 281 261 282 Commands::Vacuum { bindle_file } => { 262 283 println!("VACUUM {}", bindle_file.display()); 263 - let mut b = init(bindle_file); 284 + let mut b = init_load(bindle_file); 264 285 b.vacuum()?; 265 286 println!("OK"); 266 287 }
+47 -30
src/bindle.rs
··· 4 4 use std::borrow::Cow; 5 5 use std::collections::BTreeMap; 6 6 use std::fs::{File, OpenOptions}; 7 - use std::io::{self, Read, Seek, SeekFrom, Write}; 7 + use std::io::{self, BufWriter, Read, Seek, SeekFrom, Write}; 8 8 use std::path::{Path, PathBuf}; 9 9 use zerocopy::{FromBytes, IntoBytes}; 10 10 ··· 194 194 self.file.seek(SeekFrom::Start(self.data_end))?; 195 195 let index_start = self.data_end; 196 196 197 - for (name, entry) in &self.index { 198 - self.file.write_all(entry.as_bytes())?; 199 - self.file.write_all(name.as_bytes())?; 200 - let pad = pad::<BNDL_ALIGN, usize>(ENTRY_SIZE + name.len()); 201 - if pad > 0 { 202 - write_padding(&mut self.file, pad)?; 197 + // Use buffered writer to batch index writes 198 + { 199 + let mut writer = BufWriter::new(&mut self.file); 200 + for (name, entry) in &self.index { 201 + writer.write_all(entry.as_bytes())?; 202 + writer.write_all(name.as_bytes())?; 203 + let pad = pad::<BNDL_ALIGN, usize>(ENTRY_SIZE + name.len()); 204 + if pad > 0 { 205 + write_padding(&mut writer, pad)?; 206 + } 203 207 } 204 - } 205 208 206 - let footer = Footer::new(index_start, self.index.len() as u32, FOOTER_MAGIC); 207 - self.file.write_all(footer.as_bytes())?; 209 + let footer = Footer::new(index_start, self.index.len() as u32, FOOTER_MAGIC); 210 + writer.write_all(footer.as_bytes())?; 211 + writer.flush()?; 212 + } // Drop writer here to release borrow 208 213 209 214 // Truncate file to current position to remove any old data 210 215 let current_pos = self.file.stream_position()?; 211 216 self.file.set_len(current_pos)?; 212 - self.file.flush()?; 213 217 214 - self.mmap = Some(unsafe { Mmap::map(&self.file)? }); 218 + let mmap = unsafe { Mmap::map(&self.file)? }; 219 + self.mmap = Some(mmap); 215 220 self.file.lock_shared()?; 216 221 Ok(()) 217 222 } ··· 431 436 let name = current 432 437 .strip_prefix(base) 433 438 .map_err(|e| io::Error::new(io::ErrorKind::Other, e))? 
434 - .to_string_lossy(); 435 - let mut data = Vec::new(); 436 - File::open(current)?.read_to_end(&mut data)?; 437 - self.add(&name, &data, compress)?; 439 + .to_str() 440 + .unwrap_or_default(); 441 + self.add_file(&name, current, compress)?; 438 442 } 439 443 Ok(()) 440 444 } ··· 447 451 if let Some(parent) = dest_path.parent() { 448 452 std::fs::create_dir_all(parent)?; 449 453 } 450 - for (name, _) in &self.index { 451 - if let Some(data) = self.read(name) { 452 - let file_path = dest_path.join(name); 453 - if let Some(parent) = file_path.parent() { 454 - std::fs::create_dir_all(parent)?; 455 - } 456 - std::fs::write(file_path, data)?; 454 + 455 + // Sort entries by physical offset for sequential reads (better cache locality) 456 + let mut entries: Vec<_> = self.index.iter().collect(); 457 + entries.sort_by_key(|(_, entry)| entry.offset()); 458 + 459 + for (name, _) in entries { 460 + let file_path = dest_path.join(name); 461 + if let Some(parent) = file_path.parent() { 462 + std::fs::create_dir_all(parent)?; 457 463 } 464 + // Use streaming I/O instead of loading entire file into memory 465 + let mut reader = self.reader(name)?; 466 + let mut file = File::create(&file_path)?; 467 + io::copy(&mut reader, &mut file)?; 468 + reader.verify_crc32()?; 458 469 } 459 470 Ok(()) 460 471 } ··· 464 475 /// The writer must be closed and then [`save()`](Bindle::save) must be called to commit the entry. 
465 476 pub fn writer<'a>(&'a mut self, name: &str, compress: Compress) -> io::Result<Writer<'a>> { 466 477 self.file.lock_exclusive()?; 467 - self.file.seek(SeekFrom::Start(self.data_end))?; 478 + // Only seek if not already at the correct position 479 + let current_pos = self.file.stream_position()?; 480 + if current_pos != self.data_end { 481 + self.file.seek(SeekFrom::Start(self.data_end))?; 482 + } 468 483 let compress = self.should_auto_compress(compress, 0); 469 - let f = self.file.try_clone()?; 470 484 let start_offset = self.data_end; 485 + // Only clone file handle if needed for compression 486 + let encoder = if compress { 487 + let f = self.file.try_clone()?; 488 + Some(zstd::Encoder::new(f, 3)?) 489 + } else { 490 + None 491 + }; 471 492 Ok(Writer { 472 493 name: name.to_string(), 473 494 bindle: self, 474 - encoder: if compress { 475 - Some(zstd::Encoder::new(f, 3)?) 476 - } else { 477 - None 478 - }, 495 + encoder, 479 496 start_offset, 480 497 uncompressed_size: 0, 481 498 crc32_hasher: Hasher::new(),