···3535 - [x] #commit and #sync
3636- [x] make sure blocking db calls are in `spawn_blocking`!!
3737- [x] db queries
3838-- [~] configuration
3939- - [~] copy applicable from tap
4040- - [ ] copy applicable from collectiondir
3838+- [x] implement `com.atproto.sync.listRepos`
4139- [x] sync1.1!!!
4240 - [x] verify #commit event
4341 - [x] verify #sync event
4442 - [x] inductive proof for #commits
4343+- [~] configuration
4444+ - [~] copy applicable from tap
4545+ - [ ] copy applicable from collectiondir
4546- [~] actually firehose-index!!
4647 - [x] extract collections-added/removed directly from CAR slice
4748 - [ ] (spend some time on tests here)
4849 - [x] do the thing (write them to the db)
4950 - [ ] swap in repo-stream
5151+- [~] metrics
5252+ - [x] basic metrics
5353+ - [x] serve prom-style
5454+ - [ ] copy applicable ones from collectiondir
5555+- [~] prefix-merge walker (limit by total collections to be merged?)
5656+ - [x] add an all-collections index
5757+- [ ] swap in repo-stream for backfill
5858+ - [ ] with memory limit
5959+ - [ ] with a global concurrency limit for big repos
6060+ - [ ] with disk spilling for huge repo
6161+ - [ ] with queueing resync for large repos if resources are taken??
6262+ - [ ] (self-reminder: get_repo should be rare in lightrail)
5063- [ ] lenient sync1.1
5164 - [ ] *don't* allow non-validating commits that look like sync1.1
5265 - [ ] ratchet by PDS host: be lenient if we have never seen a sync1.1-looking commit, always strict after we see one.
···5770- [ ] "deep crawl" mode for relays that listHosts -> listRepos on host instead of relying on relay listRepos
5871- [ ] special did:web behaviour to keep reusing a stale resolution on failure
5972- [ ] filter dids from inactive accounts
6060-- [ ] implement `com.atproto.sync.listRepos`
6173- [ ] multi-collection parallel walk/merge
6262-- [ ] add collection-list index
6363-- [ ] prefix-merge walker (limit by total collections to be merged?)
6464-- [ ] do we need
6565-- [ ] metrics
6666- - [ ] copy applicable ones from collectiondir
6767- - [ ] serve prom-style
6874- [ ] admin view of backfill state etc
6975- [ ] vanity stats for optimizations, like how many in-flight repos were saved from resync due to high-water-mark firehose cursor persistence
7076- [ ] account status convergence: if we receive commits from apparently-inactive accounts, should we check upstream status to make sure we're not stale?
···106112## state/db models
107113108114taking [inspiration from tap](https://github.com/bluesky-social/indigo/blob/main/cmd/tap/models/models.go) here!
115115+116116+TODO: fix outdated prefixes here
109117110118```
111119main index:
···152160 "repoPrev"||<did> => <rev:string>||<prevData:cid>
153161154162 note: kept separate and small because it very frequently updates!
163163+164164+165165+all-collections list:
166166+167167+ "col"||<collection> => ()
155168156169157170resync queue:
+196
src/storage/collection_list.rs
···11+//! Global collection list — every NSID ever seen across all repos.
22+//!
33+//! Key: `"col"<nsid>` — presence indicates the collection exists (or once existed).
44+//! Value: empty.
55+//!
66+//! Entries are written on every collection birth and are never deleted, so the
77+//! set can retain NSIDs after all repos have removed their last record. This is
88+//! acceptable: the index is a superset of live collections, not an exact set.
99+//!
1010+//! Writes are blind overwrites — no read-before-write. With only a few thousand
1111+//! collections in the network the key space is tiny and compaction handles the
1212+//! redundant writes cheaply.
1313+1414+use jacquard_common::types::nsid::Nsid;
1515+1616+use crate::storage::{DbRef, PREFIX_COLLECTION_LIST, StorageResult, error::StorageError};
1717+1818+// ---------------------------------------------------------------------------
1919+// Key encoding
2020+// ---------------------------------------------------------------------------
2121+2222+fn key(collection: Nsid<'_>) -> Vec<u8> {
2323+ let s = collection.as_str();
2424+ let mut k = Vec::with_capacity(PREFIX_COLLECTION_LIST.len() + s.len());
2525+ k.extend_from_slice(&PREFIX_COLLECTION_LIST);
2626+ k.extend_from_slice(s.as_bytes());
2727+ k
2828+}
2929+3030+fn parse_nsid(raw_key: &[u8]) -> StorageResult<Nsid<'static>> {
3131+ let prefix_len = PREFIX_COLLECTION_LIST.len();
3232+ let key_str = String::from_utf8_lossy(raw_key).to_string();
3333+ let nsid_bytes = raw_key.get(prefix_len..).ok_or(StorageError::Corrupt {
3434+ key: key_str.clone(),
3535+ reason: "collection list key shorter than prefix",
3636+ })?;
3737+ let nsid_str = std::str::from_utf8(nsid_bytes).map_err(|_| StorageError::Corrupt {
3838+ key: key_str.clone(),
3939+ reason: "non-UTF-8 NSID in collection list key",
4040+ })?;
4141+ Nsid::new_owned(nsid_str).map_err(|_| StorageError::Corrupt {
4242+ key: key_str,
4343+ reason: "invalid NSID in collection list key",
4444+ })
4545+}
4646+4747+// ---------------------------------------------------------------------------
4848+// Write
4949+// ---------------------------------------------------------------------------
5050+5151+/// Add `collection` to the global list within an existing batch.
5252+///
5353+/// Blindly overwrites any existing entry — no read needed.
5454+pub fn insert_into(batch: &mut fjall::OwnedWriteBatch, db: &DbRef, collection: Nsid<'_>) {
5555+ batch.insert(&db.ks, key(collection), b"");
5656+}
5757+5858+// ---------------------------------------------------------------------------
5959+// Read
6060+// ---------------------------------------------------------------------------
6161+6262+/// Iterate over all known collections, starting at `cursor` (inclusive).
6363+///
6464+/// Returns at most `limit` NSIDs sorted lexicographically. `next` is the first
6565+/// NSID of the following page, or `None` if this is the last page.
6666+pub fn scan(
6767+ db: &DbRef,
6868+ cursor: Option<Nsid<'_>>,
6969+ limit: usize,
7070+) -> StorageResult<(Vec<Nsid<'static>>, Option<Nsid<'static>>)> {
7171+ let start_key: Vec<u8> = {
7272+ let mut k = PREFIX_COLLECTION_LIST.to_vec();
7373+ if let Some(ref nsid) = cursor {
7474+ k.extend_from_slice(nsid.as_str().as_bytes());
7575+ }
7676+ k
7777+ };
7878+7979+ let mut ranger = db.ks.range(start_key..);
8080+ let mut collections = Vec::with_capacity(limit);
8181+8282+ for guard in ranger.by_ref() {
8383+ let (k, _v) = guard.into_inner()?;
8484+ if !k.starts_with(&PREFIX_COLLECTION_LIST) {
8585+ break;
8686+ }
8787+ collections.push(parse_nsid(&k)?);
8888+ if collections.len() >= limit {
8989+ break;
9090+ }
9191+ }
9292+9393+ let next = loop {
9494+ let Some(guard) = ranger.next() else {
9595+ break None;
9696+ };
9797+ let key = guard.key()?;
9898+ if !key.starts_with(&PREFIX_COLLECTION_LIST) {
9999+ break None;
100100+ }
101101+ match parse_nsid(&key) {
102102+ Ok(nsid) => break Some(nsid),
103103+ Err(_) => continue,
104104+ }
105105+ };
106106+107107+ Ok((collections, next))
108108+}
109109+110110+// ---------------------------------------------------------------------------
111111+// Tests
112112+// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;
    use crate::storage::open_temporary;

    /// Parse a literal into an owned NSID, panicking if it is rejected.
    fn nsid(s: &str) -> Nsid<'static> {
        Nsid::new_owned(s.to_owned()).unwrap()
    }

    /// Insert `names`, in the given order, through a single batch and commit it.
    fn seed(db: &DbRef, names: &[&str]) {
        let mut batch = db.database.batch();
        for name in names {
            insert_into(&mut batch, db, nsid(name));
        }
        batch.commit().unwrap();
    }

    #[test]
    fn insert_and_scan_single() {
        let db = open_temporary().unwrap();
        seed(&db, &["app.bsky.feed.post"]);

        let (cols, next) = scan(&db, None, 100).unwrap();
        assert_eq!(cols, vec![nsid("app.bsky.feed.post")]);
        assert!(next.is_none());
    }

    #[test]
    fn scan_is_sorted() {
        let db = open_temporary().unwrap();
        // Deliberately inserted out of order.
        seed(
            &db,
            &[
                "app.bsky.graph.follow",
                "app.bsky.actor.profile",
                "app.bsky.feed.post",
            ],
        );

        let (cols, _) = scan(&db, None, 100).unwrap();
        let expected = vec![
            nsid("app.bsky.actor.profile"),
            nsid("app.bsky.feed.post"),
            nsid("app.bsky.graph.follow"),
        ];
        assert_eq!(cols, expected);
    }

    #[test]
    fn blind_overwrite_is_idempotent() {
        let db = open_temporary().unwrap();
        // Three separate committed batches writing the same key.
        for _ in 0..3 {
            seed(&db, &["app.bsky.feed.post"]);
        }
        let (cols, _) = scan(&db, None, 100).unwrap();
        assert_eq!(cols, vec![nsid("app.bsky.feed.post")]);
    }

    #[test]
    fn scan_pagination() {
        let db = open_temporary().unwrap();
        seed(
            &db,
            &[
                "app.bsky.actor.profile",
                "app.bsky.feed.like",
                "app.bsky.feed.post",
                "app.bsky.graph.follow",
            ],
        );

        let (page1, next) = scan(&db, None, 2).unwrap();
        assert_eq!(
            page1,
            vec![nsid("app.bsky.actor.profile"), nsid("app.bsky.feed.like")]
        );
        let cursor = next.unwrap();
        assert_eq!(cursor, nsid("app.bsky.feed.post"));

        let (page2, next2) = scan(&db, Some(cursor), 2).unwrap();
        assert_eq!(
            page2,
            vec![nsid("app.bsky.feed.post"), nsid("app.bsky.graph.follow")]
        );
        assert!(next2.is_none());
    }
}
+3
src/storage/mod.rs
···11pub mod backfill_progress;
22pub mod collection_index;
33+pub mod collection_list;
34pub mod error;
45pub mod firehose_cursor;
56pub mod repo;
···1920/// Fixed-length (3 byte) key prefix per data type
2021type KeyPrefix = [u8; 3];
21222323+/// Global collection list (collection → ()). See [`collection_list`].
2424+pub(crate) const PREFIX_COLLECTION_LIST: KeyPrefix = *b"col";
2225/// Main collection index (collection → did). See [`collection_index`].
2326pub(crate) const PREFIX_RBC: KeyPrefix = *b"rbc";
2427/// Reversed collection index (did → collection). See [`collection_index`].
+3
src/storage/repo.rs
···346346 Vec<(Did<'static>, RepoInfo, Option<RepoPrev>)>,
347347 Option<Did<'static>>,
348348)> {
349349+ // TODO: use fjall prefix_range
350350+ // TODO: probably snapshot so we get a consistent account view?
351351+349352 let prefix_len = PREFIX_REPO.len();
350353351354 let start_key: Vec<u8> = {
+4-2
src/sync/firehose/commit_event.rs
···494494 }
495495496496 // All checks passed — atomically update the chain tip and the collection
497497- // index (born → insert, died → remove).
497497+ // index (born → insert, died → remove). Also record each born collection
498498+ // in the global collection list (blind overwrite, never deleted).
498499 let mut batch = db.database.batch();
499500 storage::repo::put_prev_into(
500501 &mut batch,
···506507 },
507508 );
508509 for coll in born {
509509- storage::collection_index::insert_into(&mut batch, db, did.clone(), coll);
510510+ storage::collection_index::insert_into(&mut batch, db, did.clone(), coll.clone());
511511+ storage::collection_list::insert_into(&mut batch, db, coll);
510512 }
511513 for coll in died {
512514 storage::collection_index::remove_into(&mut batch, db, did.clone(), coll);