very fast AT Protocol indexer with flexible filtering, XRPC queries, a cursor-backed event stream, and more, built on fjall
rust fjall at-protocol atproto indexer
60
fork

Configure Feed

Select the types of activity you want to include in your feed.

[lib,api,db] implement getHostStatus, listHosts and add db migrations

dawn 0f356524 6624ccf7

+332 -82
+2
README.md
··· 310 310 the following are implemented currently: 311 311 - `com.atproto.repo.getRecord` 312 312 - `com.atproto.repo.listRecords` 313 + - `com.atproto.sync.getHostStatus` 314 + - `com.atproto.sync.listHosts` 313 315 314 316 ### systems.gaze.hydrant.* 315 317
+29
src/api/xrpc/get_host_status.rs
··· 1 + use jacquard_api::com_atproto::sync::get_host_status::{ 2 + GetHostStatusError, GetHostStatusOutput, GetHostStatusRequest, GetHostStatusResponse, 3 + }; 4 + use jacquard_common::CowStr; 5 + 6 + use super::*; 7 + 8 + pub async fn handle( 9 + State(hydrant): State<Hydrant>, 10 + ExtractXrpc(req): ExtractXrpc<GetHostStatusRequest>, 11 + ) -> XrpcResult<Json<GetHostStatusOutput<'static>>, GetHostStatusError<'static>> { 12 + let nsid = GetHostStatusResponse::NSID; 13 + 14 + let Some(host) = hydrant 15 + .get_host_status(&req.hostname) 16 + .await 17 + .map_err(|e| internal_error(nsid, e))? 18 + else { 19 + return Err(bad_request(nsid, "host does not exist")); 20 + }; 21 + 22 + Ok(Json(GetHostStatusOutput { 23 + account_count: None, 24 + hostname: CowStr::Owned(host.name), 25 + seq: Some(host.seq), 26 + status: None, 27 + extra_data: None, 28 + })) 29 + }
+34
src/api/xrpc/list_hosts.rs
··· 1 + use jacquard_api::com_atproto::sync::list_hosts::{ 2 + Host, ListHostsOutput, ListHostsRequest, ListHostsResponse, 3 + }; 4 + use jacquard_common::CowStr; 5 + 6 + use super::*; 7 + 8 + pub async fn handle( 9 + State(hydrant): State<Hydrant>, 10 + ExtractXrpc(req): ExtractXrpc<ListHostsRequest>, 11 + ) -> XrpcResult<Json<ListHostsOutput<'static>>> { 12 + let nsid = ListHostsResponse::NSID; 13 + let limit = req.limit.unwrap_or(200).clamp(1, 1000) as usize; 14 + 15 + let (hosts, cursor) = hydrant 16 + .list_hosts(req.cursor.as_deref(), limit) 17 + .await 18 + .map_err(|e| internal_error(nsid, e))?; 19 + 20 + Ok(Json(ListHostsOutput { 21 + cursor: cursor.map(CowStr::Owned), 22 + hosts: hosts 23 + .into_iter() 24 + .map(|h| Host { 25 + hostname: CowStr::Owned(h.name), 26 + seq: Some(h.seq), 27 + status: None, 28 + account_count: None, 29 + extra_data: None, 30 + }) 31 + .collect(), 32 + extra_data: None, 33 + })) 34 + }
+6
src/api/xrpc/mod.rs
··· 9 9 get_record::{GetRecordError, GetRecordOutput, GetRecordRequest}, 10 10 list_records::{ListRecordsOutput, ListRecordsRequest, Record as RepoRecord}, 11 11 }; 12 + use jacquard_api::com_atproto::sync::get_host_status::GetHostStatusRequest; 13 + use jacquard_api::com_atproto::sync::list_hosts::ListHostsRequest; 12 14 use jacquard_common::types::ident::AtIdentifier; 13 15 use jacquard_common::xrpc::XrpcResp; 14 16 use jacquard_common::xrpc::{XrpcEndpoint, XrpcMethod}; ··· 23 25 24 26 mod count_records; 25 27 mod describe_repo; 28 + mod get_host_status; 26 29 mod get_record; 30 + mod list_hosts; 27 31 mod list_records; 28 32 29 33 pub fn router() -> Router<Hydrant> { ··· 32 36 .route(ListRecordsRequest::PATH, get(list_records::handle)) 33 37 .route(CountRecords::PATH, get(count_records::handle)) 34 38 .route(DescribeRepo::PATH, get(describe_repo::handle)) 39 + .route(GetHostStatusRequest::PATH, get(get_host_status::handle)) 40 + .route(ListHostsRequest::PATH, get(list_hosts::handle)) 35 41 } 36 42 37 43 #[derive(Debug)]
+10 -9
src/control/firehose.rs
··· 1 1 use std::sync::Arc; 2 2 use std::sync::atomic::Ordering; 3 3 4 - use miette::{IntoDiagnostic, Result}; 4 + use miette::{Context, IntoDiagnostic, Result}; 5 5 use tokio::sync::watch; 6 6 use tracing::{error, info}; 7 7 use url::Url; ··· 98 98 99 99 /// reset the stored cursor for the given relay URL. 100 100 /// 101 - /// clears the `firehose_cursor|{url}` entry from the cursors keyspace and zeroes the 102 - /// in-memory cursor. the next connection will tail live events from the current head. 101 + /// clears the `firehose_cursor|{host}` entry from the cursors keyspace and zeroes 102 + /// the in-memory cursor. the next connection will tail live events from the current head. 103 103 pub async fn reset_cursor(&self, url: &str) -> Result<()> { 104 + let relay_url = Url::parse(url) 105 + .into_diagnostic() 106 + .wrap_err_with(|| format!("invalid relay url: {url:?}"))?; 107 + let key = keys::firehose_cursor_key_from_url(&relay_url); 104 108 let db = self.state.db.clone(); 105 - let key = keys::firehose_cursor_key(url); 106 109 tokio::task::spawn_blocking(move || db.cursors.remove(key).into_diagnostic()) 107 110 .await 108 111 .into_diagnostic()??; 109 112 110 - if let Ok(relay_url) = Url::parse(url) { 111 - self.state.relay_cursors.peek_with(&relay_url, |_, c| { 112 - c.store(0, Ordering::SeqCst); 113 - }); 114 - } 113 + self.state.relay_cursors.peek_with(&relay_url, |_, c| { 114 + c.store(0, Ordering::SeqCst); 115 + }); 115 116 Ok(()) 116 117 } 117 118
+96 -2
src/control/mod.rs
··· 8 8 pub use filter::{FilterControl, FilterPatch, FilterSnapshot}; 9 9 pub use firehose::{FirehoseHandle, FirehoseSourceInfo}; 10 10 pub use repos::{ListedRecord, Record, RecordList, RepoHandle, RepoInfo, ReposControl}; 11 + use smol_str::{SmolStr, ToSmolStr}; 11 12 12 13 use std::collections::BTreeMap; 13 14 use std::future::Future; ··· 17 18 use std::task::{Context, Poll}; 18 19 19 20 use futures::{FutureExt, Stream}; 20 - use miette::{IntoDiagnostic, Result}; 21 + use miette::{IntoDiagnostic, Result, WrapErr}; 21 22 use tokio::sync::{mpsc, watch}; 22 23 use tracing::{debug, error, info}; 23 24 24 25 use crate::backfill::BackfillWorker; 25 26 use crate::config::{Config, SignatureVerification}; 26 27 use crate::db::{ 27 - self, filter as db_filter, load_persisted_crawler_sources, load_persisted_firehose_sources, 28 + self, filter as db_filter, keys, load_persisted_crawler_sources, 29 + load_persisted_firehose_sources, 28 30 }; 29 31 use crate::filter::FilterMode; 30 32 use crate::ingest::worker::FirehoseWorker; ··· 34 36 use crawler::{CrawlerShared, spawn_crawler_producer}; 35 37 use firehose::{FirehoseShared, spawn_firehose_ingestor}; 36 38 use stream::event_stream_thread; 39 + 40 + /// information about a host hydrant is consuming from. 41 + pub struct Host { 42 + pub name: SmolStr, 43 + pub seq: i64, 44 + } 37 45 38 46 /// an event emitted by the hydrant event stream. 39 47 /// ··· 655 663 pub fn serve_debug(&self, port: u16) -> impl Future<Output = Result<()>> { 656 664 let state = self.state.clone(); 657 665 async move { crate::api::serve_debug(state, port).await } 666 + } 667 + 668 + /// get the status of a (firehose) host we are consuming from. 669 + /// 670 + /// returns the seq we are on for this host. 
671 + pub async fn get_host_status(&self, hostname: &str) -> Result<Option<Host>> { 672 + let db = self.state.db.clone(); 673 + let hostname = hostname.to_smolstr(); 674 + 675 + tokio::task::spawn_blocking(move || { 676 + let key = keys::firehose_cursor_key(&hostname); 677 + let Some(seq) = db.cursors.get(&key).into_diagnostic()? else { 678 + return Ok(None); 679 + }; 680 + let seq = i64::from_be_bytes( 681 + seq.as_ref() 682 + .try_into() 683 + .into_diagnostic() 684 + .wrap_err("cursor value is not 8 bytes")?, 685 + ); 686 + 687 + Ok(Some(Host { 688 + name: hostname.into(), 689 + seq, 690 + })) 691 + }) 692 + .await 693 + .into_diagnostic()? 694 + } 695 + 696 + /// enumerates all hosts hydrant is consuming from. 697 + /// 698 + /// returns hosts enumerated in this pagination and the cursor to paginate from. 699 + pub async fn list_hosts( 700 + &self, 701 + cursor: Option<&str>, 702 + limit: usize, 703 + ) -> Result<(Vec<Host>, Option<SmolStr>)> { 704 + let db = self.state.db.clone(); 705 + let cursor = cursor.map(str::to_string); 706 + 707 + tokio::task::spawn_blocking(move || { 708 + let prefix_end = { 709 + let mut end = keys::FIREHOSE_CURSOR_PREFIX.to_vec(); 710 + *end.last_mut().unwrap() += 1; 711 + end 712 + }; 713 + let start_bound = match cursor.as_deref() { 714 + Some(host) => std::ops::Bound::Excluded(keys::firehose_cursor_key(host)), 715 + None => std::ops::Bound::Included(keys::FIREHOSE_CURSOR_PREFIX.to_vec()), 716 + }; 717 + 718 + // fetch one extra item to detect whether there is a next page 719 + let mut hosts: Vec<Host> = Vec::with_capacity(limit + 1); 720 + for item in db 721 + .cursors 722 + .range((start_bound, std::ops::Bound::Excluded(prefix_end))) 723 + .take(limit + 1) 724 + { 725 + let (k, v) = item.into_inner().into_diagnostic()?; 726 + let hostname = std::str::from_utf8(&k[keys::FIREHOSE_CURSOR_PREFIX.len()..]) 727 + .into_diagnostic() 728 + .wrap_err("firehose cursor key contains non-utf8 hostname")?; 729 + let seq = i64::from_be_bytes( 
730 + v.as_ref() 731 + .try_into() 732 + .into_diagnostic() 733 + .wrap_err("cursor value is not 8 bytes")?, 734 + ); 735 + hosts.push(Host { 736 + name: hostname.into(), 737 + seq, 738 + }); 739 + } 740 + 741 + let next_cursor = if hosts.len() > limit { 742 + hosts.pop(); 743 + hosts.last().map(|h| h.name.clone()) 744 + } else { 745 + None 746 + }; 747 + 748 + Ok((hosts, next_cursor)) 749 + }) 750 + .await 751 + .into_diagnostic()? 658 752 } 659 753 } 660 754
+19 -16
src/db/keys.rs src/db/keys/mod.rs
··· 3 3 4 4 use crate::db::types::{DbRkey, DbTid, TrimmedDid}; 5 5 6 + pub mod v1; 7 + 8 + pub use v1::{firehose_cursor_key, firehose_cursor_key_from_url}; 9 + 6 10 /// separator used for composite keys 7 11 pub const SEP: u8 = b'|'; 8 12 9 - pub const CURSOR_KEY: &[u8] = b"firehose_cursor"; 13 + pub const EVENT_WATERMARK_PREFIX: &[u8] = b"ewm|"; 10 14 11 - pub const EVENT_WATERMARK_PREFIX: &[u8] = b"ewm|"; 15 + /// THIS SHOULD ALWAYS BE STABLE. DO NOT CHANGE 16 + pub const VERSIONING_KEY: &[u8] = b"db_version"; 12 17 13 18 // key format: {DID} 14 19 pub fn repo_key<'a>(did: &'a Did) -> Vec<u8> { ··· 160 165 TrimmedDid::try_from(&key[CRAWLER_RETRY_PREFIX.len()..]) 161 166 } 162 167 168 + pub const CRAWLER_CURSOR_PREFIX: &[u8] = b"crawler_cursor|"; 169 + 163 170 pub fn crawler_cursor_key(relay: &str) -> Vec<u8> { 164 - let mut key = b"crawler_cursor|".to_vec(); 171 + let mut key = CRAWLER_CURSOR_PREFIX.to_vec(); 165 172 key.extend_from_slice(relay.as_bytes()); 166 173 key 167 174 } 168 175 169 - pub fn by_collection_cursor_key(url: &str, collection: &str) -> Vec<u8> { 170 - let mut key = b"by_collection_cursor|".to_vec(); 171 - key.extend_from_slice(url.as_bytes()); 172 - key.push(SEP); 173 - key.extend_from_slice(collection.as_bytes()); 174 - key 175 - } 176 + pub const BY_COLLECTION_CURSOR_PREFIX: &[u8] = b"by_collection_cursor|"; 176 177 177 178 /// prefix for all by-collection cursors belonging to a given index URL. 
178 179 pub fn by_collection_cursor_prefix(url: &str) -> Vec<u8> { 179 - let mut prefix = b"by_collection_cursor|".to_vec(); 180 + let mut prefix = BY_COLLECTION_CURSOR_PREFIX.to_vec(); 180 181 prefix.extend_from_slice(url.as_bytes()); 181 182 prefix.push(SEP); 182 183 prefix 184 + } 185 + 186 + pub fn by_collection_cursor_key(url: &str, collection: &str) -> Vec<u8> { 187 + let mut key = by_collection_cursor_prefix(url); 188 + key.extend_from_slice(collection.as_bytes()); 189 + key 183 190 } 184 191 185 192 pub const CRAWLER_SOURCE_PREFIX: &[u8] = b"src|"; ··· 191 198 key 192 199 } 193 200 194 - pub fn firehose_cursor_key(relay: &str) -> Vec<u8> { 195 - let mut key = b"firehose_cursor|".to_vec(); 196 - key.extend_from_slice(relay.as_bytes()); 197 - key 198 - } 201 + pub const FIREHOSE_CURSOR_PREFIX: &[u8] = b"firehose_cursor|"; 199 202 200 203 pub const FIREHOSE_SOURCE_PREFIX: &[u8] = b"firehose|"; 201 204
+14
src/db/keys/v1.rs
··· 1 + use url::Url; 2 + 3 + use super::FIREHOSE_CURSOR_PREFIX; 4 + 5 + /// firehose cursor key for schema v1: `firehose_cursor|{host}` 6 + pub fn firehose_cursor_key(host: &str) -> Vec<u8> { 7 + let mut key = FIREHOSE_CURSOR_PREFIX.to_vec(); 8 + key.extend_from_slice(host.as_bytes()); 9 + key 10 + } 11 + 12 + pub fn firehose_cursor_key_from_url(url: &Url) -> Vec<u8> { 13 + firehose_cursor_key(url.host_str().unwrap_or("")) 14 + }
+46
src/db/migration/mod.rs
··· 1 + use fjall::OwnedWriteBatch; 2 + use miette::{Context, IntoDiagnostic, Result}; 3 + 4 + use crate::db::Db; 5 + use crate::db::keys::VERSIONING_KEY; 6 + 7 + mod v1; 8 + 9 + type MigrationFn = fn(&Db, &mut OwnedWriteBatch) -> Result<()>; 10 + 11 + /// ordered list of migrations. migration at index `i` upgrades the schema from version `i` to `i+1`. 12 + const MIGRATIONS: &[(&str, MigrationFn)] = 13 + &[("stable_firehose_cursors", v1::stable_firehose_cursors)]; 14 + 15 + fn read_version(db: &Db) -> Result<u64> { 16 + db.counts 17 + .get(VERSIONING_KEY) 18 + .into_diagnostic()? 19 + .map(|r| { 20 + r.as_ref() 21 + .try_into() 22 + .into_diagnostic() 23 + .wrap_err("db version key expected to be 8 bytes") 24 + .map(u64::from_be_bytes) 25 + }) 26 + .transpose() 27 + .map(|v| v.unwrap_or(0)) 28 + } 29 + 30 + /// run all pending database migrations in order. 31 + /// 32 + /// each migration and its version bump are committed atomically. safe to run on a fresh 33 + /// database (all migrations are no-ops when no relevant data exists). called during [`Db::open`]. 34 + pub(super) fn run(db: &Db) -> Result<()> { 35 + let version = read_version(db)? as usize; 36 + for (i, (name, migration)) in MIGRATIONS.iter().enumerate().skip(version) { 37 + tracing::info!("db: running migration {name} (v{i} -> v{})", i + 1); 38 + let mut batch = db.inner.batch(); 39 + migration(db, &mut batch)?; 40 + let new_version = (i + 1) as u64; 41 + batch.insert(&db.counts, VERSIONING_KEY, new_version.to_be_bytes()); 42 + batch.commit().into_diagnostic()?; 43 + tracing::info!("db: migration {name} complete"); 44 + } 45 + Ok(()) 46 + }
+40
src/db/migration/v1.rs
··· 1 + use std::str::FromStr; 2 + 3 + use fjall::OwnedWriteBatch; 4 + use miette::{Context, IntoDiagnostic, Result}; 5 + use url::Url; 6 + 7 + use crate::db::{Db, keys}; 8 + 9 + /// migrates firehose cursors from `firehose_cursor|{url}` to `firehose_cursor|{host}`. 10 + pub(super) fn stable_firehose_cursors(db: &Db, batch: &mut OwnedWriteBatch) -> Result<()> { 11 + let entries: Vec<(Vec<u8>, Vec<u8>)> = db 12 + .cursors 13 + .prefix(keys::FIREHOSE_CURSOR_PREFIX) 14 + .map(|item| { 15 + let (k, v) = item.into_inner().into_diagnostic()?; 16 + Ok((k.to_vec(), v.to_vec())) 17 + }) 18 + .collect::<Result<_>>()?; 19 + 20 + for (old_key, value) in entries { 21 + let suffix = &old_key[keys::FIREHOSE_CURSOR_PREFIX.len()..]; 22 + // old-format: suffix is a full URL containing "://" (e.g. "wss://bsky.network") 23 + // new-format (v1): suffix is just a hostname, no "://" 24 + if !suffix.windows(3).any(|w| w == b"://") { 25 + continue; // already in new format 26 + } 27 + let url_str = std::str::from_utf8(suffix) 28 + .into_diagnostic() 29 + .wrap_err("firehose cursor key contains non-utf8 url")?; 30 + let url = Url::from_str(url_str) 31 + .into_diagnostic() 32 + .wrap_err_with(|| format!("firehose cursor key contains invalid url {url_str:?}"))?; 33 + 34 + let new_key = keys::v1::firehose_cursor_key_from_url(&url); 35 + batch.insert(&db.cursors, &new_key, &value); 36 + batch.remove(&db.cursors, &old_key); 37 + } 38 + 39 + Ok(()) 40 + }
+36 -52
src/db/mod.rs
··· 24 24 pub mod ephemeral; 25 25 pub mod filter; 26 26 pub mod keys; 27 + pub mod migration; 27 28 pub mod types; 28 29 29 30 use tokio::sync::broadcast; ··· 376 377 // when adding new keyspaces, make sure to add them to the /stats endpoint 377 378 // and also update any relevant /debug/* endpoints 378 379 380 + let (event_tx, _) = broadcast::channel(10000); 381 + 382 + let this = Self { 383 + inner: db, 384 + path: cfg.database_path.clone(), 385 + repos, 386 + records, 387 + blocks, 388 + cursors, 389 + pending, 390 + resync, 391 + resync_buffer, 392 + events, 393 + counts, 394 + filter, 395 + crawler, 396 + #[cfg(feature = "backlinks")] 397 + backlinks, 398 + event_tx, 399 + counts_map: HashMap::new(), 400 + next_event_id: Arc::new(AtomicU64::new(0)), 401 + }; 402 + 403 + migration::run(&this)?; 404 + 379 405 let mut last_id = 0; 380 - if let Some(guard) = events.iter().next_back() { 406 + if let Some(guard) = this.events.iter().next_back() { 381 407 let k = guard.key().into_diagnostic()?; 382 408 last_id = u64::from_be_bytes( 383 409 k.as_ref() ··· 386 412 .wrap_err("expected to be id (8 bytes)")?, 387 413 ); 388 414 } 415 + // relaxed is fine since we are just initializing the db 416 + this.next_event_id 417 + .store(last_id + 1, std::sync::atomic::Ordering::Relaxed); 389 418 390 419 // load counts into memory 391 - let counts_map = HashMap::new(); 392 - for guard in counts.prefix(keys::COUNT_KS_PREFIX) { 420 + for guard in this.counts.prefix(keys::COUNT_KS_PREFIX) { 393 421 let (k, v) = guard.into_inner().into_diagnostic()?; 394 422 let name = std::str::from_utf8(&k[keys::COUNT_KS_PREFIX.len()..]) 395 423 .into_diagnostic() 396 424 .wrap_err("expected valid utf8 for ks count key")?; 397 - let _ = counts_map.insert_sync( 425 + let _ = this.counts_map.insert_sync( 398 426 SmolStr::new(name), 399 427 u64::from_be_bytes(v.as_ref().try_into().unwrap()), 400 428 ); 401 429 } 402 - // ensure critical counts are initialized 403 - for ks_name in ["repos", "pending", 
"resync"] { 404 - let _ = counts_map 405 - .entry_sync(SmolStr::new(ks_name)) 406 - .or_insert_with(|| { 407 - let ks = match ks_name { 408 - "repos" => &repos, 409 - "pending" => &pending, 410 - "resync" => &resync, 411 - _ => unreachable!(), 412 - }; 413 - ks.iter().count() as u64 414 - }); 415 - } 416 430 417 - let (event_tx, _) = broadcast::channel(10000); 418 - 419 - Ok(Self { 420 - inner: db, 421 - path: cfg.database_path.clone(), 422 - repos, 423 - records, 424 - blocks, 425 - cursors, 426 - pending, 427 - resync, 428 - resync_buffer, 429 - events, 430 - counts, 431 - filter, 432 - crawler, 433 - #[cfg(feature = "backlinks")] 434 - backlinks, 435 - event_tx, 436 - counts_map, 437 - next_event_id: Arc::new(AtomicU64::new(last_id + 1)), 438 - }) 431 + Ok(this) 439 432 } 440 433 441 434 pub fn train_dict(&self, ks_name: &str) -> Result<()> { ··· 751 744 pub fn set_firehose_cursor(db: &Db, relay: &Url, cursor: i64) -> Result<()> { 752 745 db.cursors 753 746 .insert( 754 - keys::firehose_cursor_key(relay.as_str()), 747 + keys::firehose_cursor_key_from_url(relay), 755 748 cursor.to_be_bytes(), 756 749 ) 757 750 .into_diagnostic() 758 751 } 759 752 760 753 pub async fn get_firehose_cursor(db: &Db, relay: &Url) -> Result<Option<i64>> { 761 - let per_relay_key = keys::firehose_cursor_key(relay.as_str()); 762 - if let Some(v) = Db::get(db.cursors.clone(), per_relay_key).await? { 763 - return Ok(Some(i64::from_be_bytes( 764 - v.as_ref() 765 - .try_into() 766 - .into_diagnostic() 767 - .wrap_err("cursor is not 8 bytes")?, 768 - ))); 769 - } 770 - 771 - Db::get(db.cursors.clone(), keys::CURSOR_KEY) 754 + let key = keys::firehose_cursor_key_from_url(relay); 755 + Db::get(db.cursors.clone(), key) 772 756 .await? 773 757 .map(|v| { 774 758 Ok(i64::from_be_bytes(
-3
src/state.rs
··· 44 44 let filter = new_handle(filter_config); 45 45 46 46 let relay_cursors = scc::HashIndex::new(); 47 - for url in &config.relays { 48 - let _ = relay_cursors.insert_sync(url.clone(), AtomicI64::new(0)); 49 - } 50 47 51 48 let (crawler_enabled, _) = watch::channel(crawler_default); 52 49 let (firehose_enabled, _) = watch::channel(config.enable_firehose);