···11+extern crate repo_stream;
22+use repo_stream::{DriverBuilder, Step};
33+use std::collections::HashSet;
44+use std::path::Path;
55+66+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
77+88+use mimalloc::MiMalloc;
99+#[global_allocator]
1010+static GLOBAL: MiMalloc = MiMalloc;
1111+1212+const EMPTY_CAR: &[u8] = include_bytes!("../car-samples/empty.car");
1313+const TINY_CAR: &[u8] = include_bytes!("../car-samples/tiny.car");
1414+const LITTLE_CAR: &[u8] = include_bytes!("../car-samples/little.car");
1515+const MIDSIZE_CAR: &[u8] = include_bytes!("../car-samples/midsize.car");
1616+1717+/// Walk every record and collect unique collection prefixes via HashSet dedup.
1818+async fn collect_naive(bytes: &[u8]) -> Vec<String> {
1919+ let mut mem_car = DriverBuilder::new()
2020+ .with_mem_limit_mb(100)
2121+ .load_car(bytes)
2222+ .await
2323+ .unwrap();
2424+2525+ let mut seen = HashSet::new();
2626+ let mut collections = vec![];
2727+ loop {
2828+ match mem_car.next_chunk(256).unwrap() {
2929+ Step::End(_) => break,
3030+ Step::Value(outputs) => {
3131+ for output in outputs {
3232+ let collection = output.key.split_once('/').unwrap().0.to_string();
3333+ if seen.insert(collection.clone()) {
3434+ collections.push(collection);
3535+ }
3636+ }
3737+ }
3838+ }
3939+ }
4040+ collections
4141+}
4242+4343+/// Seek past each collection using a sentinel that sorts strictly after any valid key
4444+/// in the collection. Atproto rkeys are capped at 512 chars; 513 tildes exceeds that
4545+/// maximum, so `collection/<513 tildes>` can never equal an actual record key and
4646+/// is guaranteed to be greater than `collection/<512 tildes>` (the max valid key).
4747+async fn collect_seeking(bytes: &[u8]) -> Vec<String> {
4848+ // 513 > max rkey length (512), so this is strictly greater than any valid key
4949+ let tilde_max = "~".repeat(513);
5050+ let mut mem_car = DriverBuilder::new()
5151+ .with_mem_limit_mb(100)
5252+ .load_car(bytes)
5353+ .await
5454+ .unwrap();
5555+5656+ let mut collections = vec![];
5757+ loop {
5858+ match mem_car.next().unwrap() {
5959+ Step::End(_) => break,
6060+ Step::Value(output) => {
6161+ let collection = output.key.split_once('/').unwrap().0.to_string();
6262+ collections.push(collection.clone());
6363+ mem_car.seek(&format!("{collection}/{tilde_max}")).unwrap();
6464+ }
6565+ }
6666+ }
6767+ collections
6868+}
6969+7070+async fn collect_naive_file(path: &Path) -> Vec<String> {
7171+ let reader = tokio::io::BufReader::new(tokio::fs::File::open(path).await.unwrap());
7272+ let mut mem_car = DriverBuilder::new()
7373+ .with_mem_limit_mb(1024)
7474+ .load_car(reader)
7575+ .await
7676+ .unwrap();
7777+7878+ let mut seen = HashSet::new();
7979+ let mut collections = vec![];
8080+ loop {
8181+ match mem_car.next_chunk(256).unwrap() {
8282+ Step::End(_) => break,
8383+ Step::Value(outputs) => {
8484+ for output in outputs {
8585+ let collection = output.key.split_once('/').unwrap().0.to_string();
8686+ if seen.insert(collection.clone()) {
8787+ collections.push(collection);
8888+ }
8989+ }
9090+ }
9191+ }
9292+ }
9393+ collections
9494+}
9595+9696+async fn collect_seeking_file(path: &Path) -> Vec<String> {
9797+ let tilde_max = "~".repeat(513);
9898+ let reader = tokio::io::BufReader::new(tokio::fs::File::open(path).await.unwrap());
9999+ let mut mem_car = DriverBuilder::new()
100100+ .with_mem_limit_mb(1024)
101101+ .load_car(reader)
102102+ .await
103103+ .unwrap();
104104+105105+ let mut collections = vec![];
106106+ loop {
107107+ match mem_car.next().unwrap() {
108108+ Step::End(_) => break,
109109+ Step::Value(output) => {
110110+ let collection = output.key.split_once('/').unwrap().0.to_string();
111111+ collections.push(collection.clone());
112112+ mem_car.seek(&format!("{collection}/{tilde_max}")).unwrap();
113113+ }
114114+ }
115115+ }
116116+ collections
117117+}
118118+119119+pub fn criterion_benchmark(c: &mut Criterion) {
120120+ let rt = tokio::runtime::Builder::new_multi_thread()
121121+ .enable_all()
122122+ .build()
123123+ .expect("Creating runtime failed");
124124+125125+ let cars = [
126126+ ("empty", EMPTY_CAR),
127127+ ("tiny", TINY_CAR),
128128+ ("little", LITTLE_CAR),
129129+ ("midsize", MIDSIZE_CAR),
130130+ ];
131131+132132+ let mut group = c.benchmark_group("collections");
133133+134134+ for (name, bytes) in cars {
135135+ // Sanity-check: both approaches must return the same collections
136136+ let naive = rt.block_on(collect_naive(bytes));
137137+ let mut seeking = rt.block_on(collect_seeking(bytes));
138138+ seeking.sort();
139139+ let mut naive_sorted = naive.clone();
140140+ naive_sorted.sort();
141141+ assert_eq!(naive_sorted, seeking, "approaches disagree for {name}");
142142+ println!("{name}: {naive_sorted:?}");
143143+144144+ group.bench_with_input(BenchmarkId::new("naive", name), bytes, |b, bytes| {
145145+ b.to_async(&rt).iter(async || collect_naive(bytes).await)
146146+ });
147147+ group.bench_with_input(BenchmarkId::new("seeking", name), bytes, |b, bytes| {
148148+ b.to_async(&rt).iter(async || collect_seeking(bytes).await)
149149+ });
150150+ }
151151+152152+ group.finish();
153153+154154+ if let Ok(huge_car) = std::env::var("HUGE_CAR") {
155155+ let path: std::path::PathBuf = huge_car.into();
156156+157157+ // Sanity-check the huge car too
158158+ let naive = rt.block_on(collect_naive_file(&path));
159159+ let mut seeking = rt.block_on(collect_seeking_file(&path));
160160+ seeking.sort();
161161+ let mut naive_sorted = naive.clone();
162162+ naive_sorted.sort();
163163+ assert_eq!(naive_sorted, seeking, "approaches disagree for huge-car");
164164+ println!("huge: {naive_sorted:?}");
165165+166166+ let mut group = c.benchmark_group("collections-huge");
167167+168168+ group.bench_with_input(BenchmarkId::new("naive", "huge"), &path, |b, path| {
169169+ b.to_async(&rt)
170170+ .iter(async || collect_naive_file(path).await)
171171+ });
172172+ group.bench_with_input(BenchmarkId::new("seeking", "huge"), &path, |b, path| {
173173+ b.to_async(&rt)
174174+ .iter(async || collect_seeking_file(path).await)
175175+ });
176176+177177+ group.finish();
178178+ }
179179+}
180180+181181+criterion_group!(benches, criterion_benchmark);
182182+criterion_main!(benches);
+5-3
src/mem.rs
···2828 ///
2929 /// The partial state is returned so the caller can decide what to do
3030 /// (e.g. resume with disk storage via `PartialCar::finish_loading`).
3131+ ///
3232+ /// boxed because it's big, to avoid making normal load errors heavy
3133 #[error("partially loaded car")]
3232- MemoryLimitReached(PartialCar<R>),
3434+ MemoryLimitReached(Box<PartialCar<R>>),
3335}
34363537/// A partially memory-loaded CAR file that hit the memory limit mid-stream.
···130132 mem_blocks.insert(cid.into(), maybe_processed);
131133 if mem_size >= max_size {
132134 log::debug!("blocks loaded before memory limit: {block_count}");
133133- return Err(LoadError::MemoryLimitReached(PartialCar {
135135+ return Err(LoadError::MemoryLimitReached(Box::new(PartialCar {
134136 car,
135137 root,
136138 process,
137139 max_size,
138140 blocks: mem_blocks,
139141 commit,
140140- }));
142142+ })));
141143 }
142144 }
143145