very fast at protocol indexer with flexible filtering, xrpc queries, cursor-backed event stream, and more, built on fjall
rust fjall at-protocol atproto indexer
59
fork

Configure Feed

Select the types of activity you want to include in your feed.

[relay] ratelimits, ratelimit tiers and management

dawn d73d9a15 abace9d6

+1126 -232
+1 -1
AGENTS.md
··· 143 143 <!-- gitnexus:start --> 144 144 # GitNexus — Code Intelligence 145 145 146 - This project is indexed by GitNexus as **hydrant** (655 symbols, 1810 relationships, 55 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely. 146 + This project is indexed by GitNexus as **hydrant** (1339 symbols, 3645 relationships, 113 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely. 147 147 148 148 > If any GitNexus tool warns the index is stale, run `npx gitnexus analyze` in terminal first. 149 149
+1
Cargo.lock
··· 1491 1491 "miette", 1492 1492 "mimalloc", 1493 1493 "multibase", 1494 + "parking_lot", 1494 1495 "rand 0.10.0", 1495 1496 "reqwest", 1496 1497 "rmp-serde",
+1
Cargo.toml
··· 58 58 tokio-tungstenite = { version = "0.28.0", features = ["rustls-tls-native-roots"] } 59 59 multibase = "0.9.2" 60 60 sha2 = "0.10.9" 61 + parking_lot = "0.12.5" 61 62 62 63 [dev-dependencies] 63 64 tempfile = "3.26.0"
+15 -9
README.md
··· 2 2 3 3 -> [hydrant](#hydrant)</br> 4 4 -> [vs tap](#vs-tap) | [stream](#stream-behavior) | [multi-relay](#multiple-relay-support) | [crawler sources](#crawler-sources)</br> 5 - -> [configuration](#configuration)</br> 5 + -> [configuration](#configuration) | [build features](#build-features)</br> 6 6 -> [rest api](#rest-api) | [filter](#filter-management) | [ingestion](#ingestion-control) | [crawler](#crawler-management) | [firehose](#firehose-management) | [repos](#repository-management)</br> 7 7 -> [xrpc api](#data-access-xrpc) | [backlinks](#bluemicrocosmlinks) | [identity](#bluemicrocosmidentity) | [atproto](#comatproto) | [custom](#systemsgazehydrant) 8 8 ··· 142 142 | `CRAWLER_MAX_PENDING_REPOS` | `2000` | max pending repos for crawler. | 143 143 | `CRAWLER_RESUME_PENDING_REPOS` | `1000` | resume threshold for crawler pending repos. | 144 144 145 + ## build features 146 + 147 + <small>[<- back to toc](#table-of-contents)</small> 148 + 149 + `hydrant` has several optional compile-time features: 150 + 151 + | feature | default | description | 152 + | :--- | :--- | :--- | 153 + | `indexer` | yes | enables the indexing logic. | 154 + | `relay` | no | enables relay functionality. | 155 + | `backlinks` | no | enables the backlinks indexer and XRPC endpoints (`blue.microcosm.links.*`). | 156 + 145 157 ## REST api 146 158 147 159 <small>[<- back to toc](#table-of-contents)</small> ··· 329 341 - `com.atproto.sync.getRepoStatus` 330 342 - `com.atproto.sync.listRepos` 331 343 - `com.atproto.sync.getLatestCommit` 344 + - `com.atproto.sync.requestCrawl` (adds the host to firehose sources in relay mode) 345 + - `com.atproto.sync.subscribeRepos` (WebSocket firehose stream, requires `relay` feature) 332 346 333 347 ### systems.gaze.hydrant.* 334 348 ··· 397 411 | `source` | no | filter by source collection (same format as `getBacklinks`). | 398 412 399 413 returns `{ count }`. 
400 - 401 - ### blue.microcosm.identity.* 402 - 403 - <small>[<- back to toc](#table-of-contents)</small> 404 - 405 - #### blue.microcosm.identity.resolveMiniDoc 406 - 407 - see [here](https://slingshot.microcosm.blue/#tag/slingshot-specific-queries/GET/xrpc/blue.microcosm.identity.resolveMiniDoc) for this XRPC's documentation.
+2 -1
src/api/filter.rs
··· 1 1 use crate::control::{FilterPatch, Hydrant}; 2 - use crate::filter::{FilterMode, SetUpdate}; 2 + use crate::filter::FilterMode; 3 + use crate::patch::SetUpdate; 3 4 use axum::{ 4 5 Json, Router, 5 6 extract::State,
+2
src/api/mod.rs
··· 11 11 mod filter; 12 12 mod firehose; 13 13 mod ingestion; 14 + mod pds; 14 15 mod repos; 15 16 mod stats; 16 17 #[cfg(feature = "indexer")] ··· 27 28 let app = app 28 29 .merge(xrpc::router()) 29 30 .merge(filter::router()) 31 + .merge(pds::router()) 30 32 .merge(repos::router()) 31 33 .merge(ingestion::router()) 32 34 .merge(crawler::router())
+74
src/api/pds.rs
··· 1 + use std::collections::HashMap; 2 + 3 + use axum::{ 4 + Json, Router, 5 + extract::State, 6 + http::StatusCode, 7 + routing::{delete, get, put}, 8 + }; 9 + use serde::{Deserialize, Serialize}; 10 + 11 + use crate::control::{Hydrant, PdsTierAssignment, PdsTierDefinition}; 12 + 13 + pub fn router() -> Router<Hydrant> { 14 + Router::new() 15 + .route("/pds/tiers", get(list_tiers)) 16 + .route("/pds/tiers", put(set_tier)) 17 + .route("/pds/tiers", delete(remove_tier)) 18 + .route("/pds/rate-tiers", get(list_rate_tiers)) 19 + } 20 + 21 + /// combined response: tier assignments + available tier definitions. 22 + #[derive(Serialize)] 23 + pub struct TiersResponse { 24 + pub assignments: Vec<PdsTierAssignment>, 25 + pub rate_tiers: HashMap<String, PdsTierDefinition>, 26 + } 27 + 28 + pub async fn list_tiers(State(hydrant): State<Hydrant>) -> Json<TiersResponse> { 29 + Json(TiersResponse { 30 + assignments: hydrant.pds.list_assignments().await, 31 + rate_tiers: hydrant.pds.list_rate_tiers(), 32 + }) 33 + } 34 + 35 + pub async fn list_rate_tiers( 36 + State(hydrant): State<Hydrant>, 37 + ) -> Json<HashMap<String, PdsTierDefinition>> { 38 + Json(hydrant.pds.list_rate_tiers()) 39 + } 40 + 41 + #[derive(Deserialize)] 42 + pub struct SetTierBody { 43 + pub host: String, 44 + pub tier: String, 45 + } 46 + 47 + pub async fn set_tier( 48 + State(hydrant): State<Hydrant>, 49 + Json(body): Json<SetTierBody>, 50 + ) -> Result<StatusCode, (StatusCode, String)> { 51 + hydrant 52 + .pds 53 + .set_tier(body.host, body.tier) 54 + .await 55 + .map(|_| StatusCode::OK) 56 + .map_err(|e| (StatusCode::BAD_REQUEST, e.to_string())) 57 + } 58 + 59 + #[derive(Deserialize)] 60 + pub struct RemoveTierBody { 61 + pub host: String, 62 + } 63 + 64 + pub async fn remove_tier( 65 + State(hydrant): State<Hydrant>, 66 + Json(body): Json<RemoveTierBody>, 67 + ) -> Result<StatusCode, (StatusCode, String)> { 68 + hydrant 69 + .pds 70 + .remove_tier(body.host) 71 + .await 72 + .map(|_| StatusCode::OK) 73 
+ .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e.to_string())) 74 + }
+47 -24
src/api/xrpc/mod.rs
··· 1 - use crate::api::xrpc::count_records::CountRecords; 2 - use crate::api::xrpc::describe_repo::DescribeRepo; 3 1 use crate::control::Hydrant; 4 2 use axum::extract::FromRequest; 5 3 use axum::response::IntoResponse; 6 4 use axum::routing::get; 7 5 use axum::{Json, Router, extract::State, http::StatusCode}; 8 - use jacquard_api::com_atproto::repo::{ 9 - describe_repo::DescribeRepoRequest as AtprotoDescribeRepoRequest, 10 - get_record::{GetRecordError, GetRecordOutput, GetRecordRequest}, 11 - list_records::{ListRecordsOutput, ListRecordsRequest, Record as RepoRecord}, 12 - }; 13 6 use jacquard_api::com_atproto::sync::get_host_status::GetHostStatusRequest; 14 7 use jacquard_api::com_atproto::sync::get_latest_commit::GetLatestCommitRequest; 15 - use jacquard_api::com_atproto::sync::get_repo::GetRepoRequest; 16 8 use jacquard_api::com_atproto::sync::get_repo_status::GetRepoStatusRequest; 17 9 use jacquard_api::com_atproto::sync::list_hosts::ListHostsRequest; 18 10 use jacquard_api::com_atproto::sync::list_repos::ListReposRequest; ··· 27 19 use serde::{Deserialize, Serialize}; 28 20 use smol_str::ToSmolStr; 29 21 use std::fmt::Display; 22 + use std::result::Result; 23 + #[cfg(feature = "indexer")] 24 + use { 25 + crate::api::xrpc::count_records::CountRecords, 26 + crate::api::xrpc::describe_repo::DescribeRepo, 27 + jacquard_api::com_atproto::repo::{ 28 + describe_repo::DescribeRepoRequest as AtprotoDescribeRepoRequest, 29 + get_record::{GetRecordError, GetRecordOutput, GetRecordRequest}, 30 + list_records::{ListRecordsOutput, ListRecordsRequest, Record as RepoRecord}, 31 + }, 32 + jacquard_api::com_atproto::sync::get_repo::GetRepoRequest, 33 + }; 30 34 #[cfg(feature = "relay")] 31 35 use { 36 + jacquard_api::com_atproto::sync::request_crawl::RequestCrawlRequest, 32 37 jacquard_api::com_atproto::sync::subscribe_repos::SubscribeReposEndpoint, 33 38 jacquard_common::xrpc::SubscriptionEndpoint, 34 39 }; 35 40 41 + mod get_host_status; 42 + mod get_latest_commit; 43 + 
mod get_repo_status; 44 + mod list_hosts; 45 + mod list_repos; 46 + 47 + #[cfg(feature = "indexer")] 36 48 mod com_atproto_describe_repo; 49 + #[cfg(feature = "indexer")] 37 50 mod count_records; 51 + #[cfg(feature = "indexer")] 38 52 mod describe_repo; 39 - mod get_host_status; 40 - mod get_latest_commit; 53 + #[cfg(feature = "indexer")] 41 54 mod get_record; 55 + #[cfg(feature = "indexer")] 42 56 mod get_repo; 43 - mod get_repo_status; 44 - mod list_hosts; 57 + #[cfg(feature = "indexer")] 45 58 mod list_records; 46 - mod list_repos; 59 + 60 + #[cfg(feature = "relay")] 61 + mod request_crawl; 47 62 #[cfg(feature = "relay")] 48 63 mod subscribe_repos; 49 64 50 65 pub fn router() -> Router<Hydrant> { 51 66 let r = Router::new() 67 + .route(GetHostStatusRequest::PATH, get(get_host_status::handle)) 68 + .route(ListHostsRequest::PATH, get(list_hosts::handle)) 69 + .route(GetLatestCommitRequest::PATH, get(get_latest_commit::handle)) 70 + .route(GetRepoStatusRequest::PATH, get(get_repo_status::handle)) 71 + .route(ListReposRequest::PATH, get(list_repos::handle)); 72 + 73 + #[cfg(feature = "indexer")] 74 + let r = r 52 75 .route(GetRecordRequest::PATH, get(get_record::handle)) 53 76 .route(ListRecordsRequest::PATH, get(list_records::handle)) 54 77 .route(CountRecords::PATH, get(count_records::handle)) 78 + .route(GetRepoRequest::PATH, get(get_repo::handle)) 55 79 .route(DescribeRepo::PATH, get(describe_repo::handle)) 56 80 .route( 57 81 AtprotoDescribeRepoRequest::PATH, 58 82 get(com_atproto_describe_repo::handle), 59 - ) 60 - .route(GetHostStatusRequest::PATH, get(get_host_status::handle)) 61 - .route(ListHostsRequest::PATH, get(list_hosts::handle)) 62 - .route(GetLatestCommitRequest::PATH, get(get_latest_commit::handle)) 63 - .route(GetRepoRequest::PATH, get(get_repo::handle)) 64 - .route(GetRepoStatusRequest::PATH, get(get_repo_status::handle)) 65 - .route(ListReposRequest::PATH, get(list_repos::handle)); 83 + ); 66 84 67 85 #[cfg(feature = "relay")] 68 - let r = 
r.route( 69 - SubscribeReposEndpoint::PATH, 70 - axum::routing::get(subscribe_repos::handle), 71 - ); 86 + let r = r 87 + .route( 88 + SubscribeReposEndpoint::PATH, 89 + axum::routing::get(subscribe_repos::handle), 90 + ) 91 + .route( 92 + RequestCrawlRequest::PATH, 93 + axum::routing::get(subscribe_repos::handle), 94 + ); 72 95 73 96 r 74 97 }
+24
src/api/xrpc/request_crawl.rs
··· 1 + use jacquard_api::com_atproto::sync::request_crawl::{ 2 + RequestCrawlError, RequestCrawlRequest, RequestCrawlResponse, 3 + }; 4 + use url::Url; 5 + 6 + use super::*; 7 + 8 + pub async fn handle( 9 + State(hydrant): State<Hydrant>, 10 + ExtractXrpc(req): ExtractXrpc<RequestCrawlRequest>, 11 + ) -> XrpcResult<StatusCode, RequestCrawlError<'static>> { 12 + let nsid = RequestCrawlResponse::NSID; 13 + 14 + let url_str = format!("wss://{}/", req.hostname); 15 + let url = Url::parse(&url_str).map_err(|e| bad_request(nsid, e))?; 16 + 17 + hydrant 18 + .firehose 19 + .add_source(url, true) 20 + .await 21 + .map_err(|e| internal_error(nsid, e))?; 22 + 23 + Ok(StatusCode::OK) 24 + }
+102
src/config.rs
··· 1 1 use miette::Result; 2 2 use serde::{Deserialize, Serialize}; 3 3 use smol_str::ToSmolStr; 4 + use std::collections::HashMap; 4 5 use std::fmt; 5 6 use std::path::PathBuf; 6 7 use std::str::FromStr; 7 8 use std::time::Duration; 8 9 use url::Url; 9 10 11 + /// rate limit parameters for a named tier of PDS connections. 12 + /// 13 + /// the per-second limit is `max(per_second_base, accounts * per_second_account_mul)`, 14 + /// giving a floor at `per_second_base` that scales up with the PDS's active account count. 15 + #[derive(Debug, Clone, Copy)] 16 + pub struct RateTier { 17 + /// floor for the per-second limit, regardless of account count. 18 + pub per_second_base: u64, 19 + /// per-second events allowed per active account on this PDS. 20 + pub per_second_account_mul: f64, 21 + /// per-hour limit. 22 + pub per_hour: u64, 23 + /// per-day limit. 24 + pub per_day: u64, 25 + } 26 + 27 + impl RateTier { 28 + /// built-in "trusted" tier: high limits for well-behaved PDS operators. 29 + pub fn trusted() -> Self { 30 + Self { 31 + per_second_base: 5000, 32 + per_second_account_mul: 10.0, 33 + per_hour: 5000 * 3600, 34 + per_day: 5000 * 86400, 35 + } 36 + } 37 + 38 + /// built-in "default" tier: conservative limits for unknown PDS operators. 39 + pub fn default_tier() -> Self { 40 + Self { 41 + per_second_base: 50, 42 + per_second_account_mul: 0.5, 43 + per_hour: 1000 * 3600, 44 + per_day: 1000 * 86400, 45 + } 46 + } 47 + 48 + /// parse `base/mul/hourly/daily` format used by `HYDRANT_RATE_TIERS`. 49 + fn parse(s: &str) -> Option<Self> { 50 + let parts: Vec<&str> = s.split('/').collect(); 51 + if parts.len() != 4 { 52 + return None; 53 + } 54 + Some(Self { 55 + per_second_base: parts[0].parse().ok()?, 56 + per_second_account_mul: parts[1].parse().ok()?, 57 + per_hour: parts[2].parse().ok()?, 58 + per_day: parts[3].parse().ok()?, 59 + }) 60 + } 61 + } 62 + 10 63 /// this is for internal use only, please don't use this macro. 
11 64 #[doc(hidden)] 12 65 #[macro_export] ··· 309 362 /// set via `HYDRANT_ENABLE_BACKLINKS=true`. 310 363 pub enable_backlinks: bool, 311 364 365 + /// list of trusted PDS/relay hosts to pre-assign to the "trusted" rate tier at startup. 366 + /// set via `HYDRANT_TRUSTED_HOSTS` as a comma-separated list of hostnames. 367 + /// hosts not present in this list use the "default" tier unless assigned via the API. 368 + pub trusted_hosts: Vec<String>, 369 + /// named rate tier definitions for PDS rate limiting. 370 + /// 371 + /// built-in tiers ("default" and "trusted") are always present and may be overridden. 372 + /// set via `HYDRANT_RATE_TIERS` as a comma-separated list of `name:base/mul/hourly/daily` entries, 373 + /// e.g. `trusted:5000/10.0/18000000/432000000,custom:100/1.0/7200000/172800000`. 374 + pub rate_tiers: HashMap<String, RateTier>, 375 + 312 376 /// db internals, tune only if you know what you're doing. 313 377 /// 314 378 /// size of the fjall block cache in MB. set via `HYDRANT_CACHE_SIZE`. ··· 388 452 filter_collections: None, 389 453 filter_excludes: None, 390 454 enable_backlinks: false, 455 + trusted_hosts: vec![], 456 + rate_tiers: { 457 + let mut m = HashMap::new(); 458 + m.insert("default".to_string(), RateTier::default_tier()); 459 + m.insert("trusted".to_string(), RateTier::trusted()); 460 + m 461 + }, 391 462 cache_size: 256, 392 463 data_compression: Compression::Lz4, 393 464 journal_compression: Compression::Lz4, ··· 549 620 550 621 let enable_backlinks: bool = cfg!("ENABLE_BACKLINKS", defaults.enable_backlinks); 551 622 623 + // start with built-in tiers, then layer in any env-defined overrides. 624 + // format: HYDRANT_RATE_TIERS=name:base/mul/hourly/daily,... 
625 + let mut rate_tiers = defaults.rate_tiers.clone(); 626 + if let Ok(s) = std::env::var("HYDRANT_RATE_TIERS") { 627 + for entry in s.split(',') { 628 + let entry = entry.trim(); 629 + if let Some((name, spec)) = entry.split_once(':') { 630 + match RateTier::parse(spec) { 631 + Some(tier) => { 632 + rate_tiers.insert(name.trim().to_string(), tier); 633 + } 634 + None => tracing::warn!( 635 + "ignoring invalid rate tier '{name}': expected base/mul/hourly/daily format" 636 + ), 637 + } 638 + } 639 + } 640 + } 641 + 642 + let trusted_hosts = std::env::var("HYDRANT_TRUSTED_HOSTS") 643 + .ok() 644 + .map(|s| { 645 + s.split(',') 646 + .map(|s| s.trim().to_string()) 647 + .filter(|s| !s.is_empty()) 648 + .collect() 649 + }) 650 + .unwrap_or_else(|| defaults.trusted_hosts.clone()); 651 + 552 652 let default_mode = CrawlerMode::default_for(full_network); 553 653 let crawler_sources = match std::env::var("HYDRANT_CRAWLER_URLS") { 554 654 Ok(s) => s ··· 593 693 filter_collections, 594 694 filter_excludes, 595 695 enable_backlinks, 696 + trusted_hosts, 697 + rate_tiers, 596 698 cache_size, 597 699 data_compression, 598 700 journal_compression,
+21 -14
src/control/crawler.rs
··· 149 149 150 150 /// delete all cursor entries associated with the given URL. 151 151 pub async fn reset_cursor(&self, url: &str) -> Result<()> { 152 - let db = self.state.db.clone(); 152 + let state = self.state.clone(); 153 153 let point_keys = [keys::crawler_cursor_key(url)]; 154 154 let by_collection_prefix = keys::by_collection_cursor_prefix(url); 155 155 tokio::task::spawn_blocking(move || { 156 - let mut batch = db.inner.batch(); 156 + let mut batch = state.db.inner.batch(); 157 157 for k in point_keys { 158 - batch.remove(&db.cursors, k); 158 + batch.remove(&state.db.cursors, k); 159 159 } 160 - for entry in db.cursors.prefix(&by_collection_prefix) { 160 + for entry in state.db.cursors.prefix(&by_collection_prefix) { 161 161 let k = entry.key().into_diagnostic()?; 162 - batch.remove(&db.cursors, k); 162 + batch.remove(&state.db.cursors, k); 163 163 } 164 - batch.commit().into_diagnostic() 164 + batch.commit().into_diagnostic()?; 165 + state.db.persist() 165 166 }) 166 167 .await 167 168 .into_diagnostic()??; ··· 198 199 miette::bail!("crawler not yet started: call Hydrant::run() first"); 199 200 }; 200 201 201 - let db = self.state.db.clone(); 202 + let state = self.state.clone(); 202 203 let key = keys::crawler_source_key(source.url.as_str()); 203 204 let val = rmp_serde::to_vec(&source.mode).into_diagnostic()?; 204 - tokio::task::spawn_blocking(move || db.crawler.insert(key, val).into_diagnostic()) 205 - .await 206 - .into_diagnostic()??; 205 + tokio::task::spawn_blocking(move || { 206 + state.db.crawler.insert(key, val).into_diagnostic()?; 207 + state.db.persist() 208 + }) 209 + .await 210 + .into_diagnostic()??; 207 211 208 212 let enabled_rx = self.state.crawler_enabled.subscribe(); 209 213 let handle = spawn_crawler_producer( ··· 249 253 250 254 // remove from DB if it was a persisted source 251 255 if self.persisted.remove_async(url).await.is_some() { 252 - let db = self.state.db.clone(); 256 + let state = self.state.clone(); 253 257 let key = 
keys::crawler_source_key(url.as_str()); 254 - tokio::task::spawn_blocking(move || db.crawler.remove(key).into_diagnostic()) 255 - .await 256 - .into_diagnostic()??; 258 + tokio::task::spawn_blocking(move || { 259 + state.db.crawler.remove(key).into_diagnostic()?; 260 + state.db.persist() 261 + }) 262 + .await 263 + .into_diagnostic()??; 257 264 } 258 265 259 266 Ok(true)
+4 -1
src/control/filter.rs
··· 4 4 use miette::{IntoDiagnostic, Result}; 5 5 6 6 use crate::db::filter as db_filter; 7 - use crate::filter::{FilterMode, SetUpdate}; 7 + use crate::filter::FilterMode; 8 + use crate::patch::SetUpdate; 8 9 use crate::state::AppState; 9 10 10 11 /// a point-in-time snapshot of the filter configuration. returned by all [`FilterControl`] methods. ··· 273 274 let filter_ks = self.state.db.filter.clone(); 274 275 let inner = self.state.db.inner.clone(); 275 276 let filter_handle = self.state.filter.clone(); 277 + let state = self.state.clone(); 276 278 let mode = self.mode; 277 279 let signals = self.signals; 278 280 let collections = self.collections; ··· 282 284 let mut batch = inner.batch(); 283 285 db_filter::apply_patch(&mut batch, &filter_ks, mode, signals, collections, excludes)?; 284 286 batch.commit().into_diagnostic()?; 287 + state.db.persist()?; 285 288 db_filter::load(&filter_ks) 286 289 }) 287 290 .await
+84 -78
src/control/firehose.rs
··· 1 1 use std::sync::Arc; 2 - use std::sync::atomic::Ordering; 3 2 4 - use miette::{Context, IntoDiagnostic, Result}; 3 + use miette::{IntoDiagnostic, Result}; 5 4 use tokio::sync::watch; 6 5 use tracing::{error, info}; 7 6 use url::Url; ··· 62 61 state.filter.clone(), 63 62 enabled, 64 63 shared.verify_signatures, 65 - ); 64 + ) 65 + .await; 66 66 67 67 let relay_for_log = relay_url.clone(); 68 68 let abort = tokio::spawn(async move { ··· 88 88 } 89 89 90 90 impl FirehoseHandle { 91 - /// enable the firehose. no-op if already enabled. 91 + pub(super) fn new(state: Arc<AppState>) -> Self { 92 + Self { 93 + state, 94 + shared: Arc::new(std::sync::OnceLock::new()), 95 + tasks: Arc::new(scc::HashMap::new()), 96 + persisted: Arc::new(scc::HashSet::new()), 97 + } 98 + } 99 + 100 + /// enable firehose ingestion, no-op if already enabled. 92 101 pub fn enable(&self) { 93 102 self.state.firehose_enabled.send_replace(true); 94 103 } 95 - /// disable the firehose. the current message finishes processing before the connection closes. 104 + /// disable firehose ingestion, in-flight messages complete before pausing. 96 105 pub fn disable(&self) { 97 106 self.state.firehose_enabled.send_replace(false); 98 107 } 99 - /// returns the current enabled state of the firehose. 108 + /// returns the current enabled state of firehose ingestion. 100 109 pub fn is_enabled(&self) -> bool { 101 110 *self.state.firehose_enabled.borrow() 102 111 } 103 112 104 - /// reset the stored cursor for the given relay URL. 105 - /// 106 - /// clears the `firehose_cursor|{host}|{scheme}` entry from the cursors keyspace and zeroes 107 - /// the in-memory cursor. the next connection will tail live events from the current head. 
108 - pub async fn reset_cursor(&self, url: &str) -> Result<()> { 109 - let relay_url = Url::parse(url) 110 - .into_diagnostic() 111 - .wrap_err_with(|| format!("invalid relay url: {url:?}"))?; 112 - let key = keys::firehose_cursor_key_from_url(&relay_url); 113 - let db = self.state.db.clone(); 114 - tokio::task::spawn_blocking(move || db.cursors.remove(key).into_diagnostic()) 115 - .await 116 - .into_diagnostic()??; 117 - 118 - self.state.firehose_cursors.peek_with(&relay_url, |_, c| { 119 - c.store(0, Ordering::SeqCst); 120 - }); 121 - Ok(()) 122 - } 123 - 124 - /// return info on all currently active firehose sources. 113 + /// list all currently active firehose sources. 125 114 pub async fn list_sources(&self) -> Vec<FirehoseSourceInfo> { 126 - let mut sources = Vec::new(); 115 + let mut out = Vec::new(); 127 116 self.tasks 128 - .iter_async(|url, handle| { 129 - sources.push(FirehoseSourceInfo { 117 + .any_async(|url, handle| { 118 + out.push(FirehoseSourceInfo { 130 119 url: url.clone(), 131 120 persisted: self.persisted.contains_sync(url), 132 121 is_pds: handle.is_pds, 133 122 }); 134 - true 123 + false 135 124 }) 136 125 .await; 137 - sources 126 + out 138 127 } 139 128 140 - /// add a new firehose relay at runtime. 141 - /// 142 - /// the URL is persisted to the database and will be re-spawned on restart. if a relay with 143 - /// the same URL already exists it is replaced: the running task is stopped and a new one 144 - /// is started. any cursor state for that URL is preserved. 129 + /// add a new firehose source at runtime, persisting it to the database. 145 130 /// 146 - /// returns an error if called before [`Hydrant::run`]. 131 + /// if a source with the same URL already exists, it is replaced: the 132 + /// running task is stopped and a new one is started with the new `is_pds` 133 + /// setting. existing cursor state for the URL is preserved. 
147 134 pub async fn add_source(&self, url: Url, is_pds: bool) -> Result<()> { 148 - let Some(shared) = self.shared.get() else { 149 - miette::bail!("firehose not yet started: call Hydrant::run() first"); 150 - }; 135 + let shared = self 136 + .shared 137 + .get() 138 + .ok_or_else(|| miette::miette!("firehose worker not started"))?; 151 139 152 - let db = self.state.db.clone(); 140 + // persist to db first 153 141 let key = keys::firehose_source_key(url.as_str()); 154 - let value = rmp_serde::to_vec(&crate::db::FirehoseSourceMeta { is_pds }) 155 - .map_err(|e| miette::miette!("failed to serialize firehose source meta: {e}"))?; 156 - tokio::task::spawn_blocking(move || db.crawler.insert(key, value).into_diagnostic()) 157 - .await 158 - .into_diagnostic()??; 142 + tokio::task::spawn_blocking({ 143 + let state = self.state.clone(); 144 + move || { 145 + let mut batch = state.db.inner.batch(); 146 + let value = rmp_serde::to_vec(&db::FirehoseSourceMeta { is_pds }).map_err(|e| { 147 + miette::miette!("failed to serialize firehose source meta: {e}") 148 + })?; 149 + batch.insert(&state.db.crawler, key, &value); 150 + batch.commit().into_diagnostic()?; 151 + state.db.persist() 152 + } 153 + }) 154 + .await 155 + .into_diagnostic()??; 156 + 157 + let _ = self.persisted.insert_async(url.clone()).await; 159 158 160 159 let enabled_rx = self.state.firehose_enabled.subscribe(); 161 160 let handle = spawn_firehose_ingestor(&url, is_pds, &self.state, shared, enabled_rx).await?; 161 + self.tasks.upsert_async(url, handle).await; 162 162 163 - let _ = self.persisted.insert_async(url.clone()).await; 164 - match self.tasks.entry_async(url).await { 165 - scc::hash_map::Entry::Vacant(e) => { 166 - e.insert_entry(handle); 167 - } 168 - scc::hash_map::Entry::Occupied(mut e) => { 169 - *e.get_mut() = handle; 170 - } 171 - } 172 163 Ok(()) 173 164 } 174 165 175 - /// remove a firehose relay at runtime by URL. 176 - /// 177 - /// aborts the running ingestor task. 
if the source was added via the API it is removed from 178 - /// the database and will not reappear on restart. `RELAY_HOSTS` sources are only stopped for 179 - /// the current session; they reappear on the next restart. 166 + /// remove a firehose source at runtime. 180 167 /// 181 - /// returns `true` if the relay was found and removed, `false` if it was not running. 182 - /// returns an error if called before [`Hydrant::run`]. 168 + /// returns `true` if the source was found and removed, `false` otherwise. 169 + /// if the source was added via the API, it is removed from the database; 170 + /// if it came from the static config, only the running task is stopped. 183 171 pub async fn remove_source(&self, url: &Url) -> Result<bool> { 184 - if self.shared.get().is_none() { 185 - miette::bail!("firehose not yet started: call Hydrant::run() first"); 172 + if self.persisted.contains_async(url).await { 173 + let url_str = url.to_string(); 174 + tokio::task::spawn_blocking({ 175 + let state = self.state.clone(); 176 + move || { 177 + state 178 + .db 179 + .crawler 180 + .remove(keys::firehose_source_key(&url_str)) 181 + .into_diagnostic()?; 182 + state.db.persist() 183 + } 184 + }) 185 + .await 186 + .into_diagnostic()??; 187 + self.persisted.remove_async(url).await; 186 188 } 187 189 188 - if self.tasks.remove_async(url).await.is_none() { 189 - return Ok(false); 190 - } 190 + Ok(self.tasks.remove_async(url).await.is_some()) 191 + } 191 192 192 - // remove from relay_cursors (persist thread will stop tracking it) 193 - self.state.firehose_cursors.remove_async(url).await; 193 + /// reset the stored firehose cursor for a given URL. 
194 + pub async fn reset_cursor(&self, url: &str) -> Result<()> { 195 + let url = Url::parse(url).into_diagnostic()?; 196 + let key = keys::firehose_cursor_key_from_url(&url); 197 + tokio::task::spawn_blocking({ 198 + let state = self.state.clone(); 199 + move || { 200 + state.db.cursors.remove(key).into_diagnostic()?; 201 + state.db.persist() 202 + } 203 + }) 204 + .await 205 + .into_diagnostic()??; 194 206 195 - if self.persisted.remove_async(url).await.is_some() { 196 - let db = self.state.db.clone(); 197 - let key = keys::firehose_source_key(url.as_str()); 198 - tokio::task::spawn_blocking(move || db.crawler.remove(key).into_diagnostic()) 199 - .await 200 - .into_diagnostic()??; 201 - } 207 + self.state.firehose_cursors.remove_async(&url).await; 202 208 203 - Ok(true) 209 + Ok(()) 204 210 } 205 211 }
+33 -33
src/control/mod.rs
··· 3 3 pub(crate) mod crawler; 4 4 pub(crate) mod filter; 5 5 pub(crate) mod firehose; 6 + pub(crate) mod pds; 6 7 pub(crate) mod repos; 7 8 pub(crate) mod stream; 8 9 9 10 pub use crawler::{CrawlerHandle, CrawlerSourceInfo}; 10 11 pub use filter::{FilterControl, FilterPatch, FilterSnapshot}; 11 12 pub use firehose::{FirehoseHandle, FirehoseSourceInfo}; 13 + pub use pds::{PdsControl, PdsTierAssignment, PdsTierDefinition}; 12 14 pub use repos::{ListedRecord, Record, RecordList, RepoHandle, RepoInfo, ReposControl}; 13 15 use smol_str::{SmolStr, ToSmolStr}; 14 16 ··· 87 89 pub firehose: FirehoseHandle, 88 90 pub backfill: BackfillHandle, 89 91 pub filter: FilterControl, 92 + pub pds: PdsControl, 90 93 pub repos: ReposControl, 91 94 pub db: DbControl, 92 95 #[cfg(feature = "backlinks")] ··· 121 124 let signals = config 122 125 .filter_signals 123 126 .clone() 124 - .map(crate::filter::SetUpdate::Set); 127 + .map(crate::patch::SetUpdate::Set); 125 128 let collections = config 126 129 .filter_collections 127 130 .clone() 128 - .map(crate::filter::SetUpdate::Set); 131 + .map(crate::patch::SetUpdate::Set); 129 132 let excludes = config 130 133 .filter_excludes 131 134 .clone() 132 - .map(crate::filter::SetUpdate::Set); 135 + .map(crate::patch::SetUpdate::Set); 133 136 134 137 tokio::task::spawn_blocking(move || { 135 138 let mut batch = inner.batch(); ··· 174 177 tasks: Arc::new(scc::HashMap::new()), 175 178 persisted: Arc::new(scc::HashSet::new()), 176 179 }, 177 - firehose: FirehoseHandle { 178 - state: state.clone(), 179 - shared: Arc::new(std::sync::OnceLock::new()), 180 - tasks: Arc::new(scc::HashMap::new()), 181 - persisted: Arc::new(scc::HashSet::new()), 182 - }, 180 + firehose: FirehoseHandle::new(state.clone()), 183 181 backfill: BackfillHandle(state.clone()), 184 182 filter: FilterControl(state.clone()), 183 + pds: pds::PdsControl(state.clone()), 185 184 repos: ReposControl(state.clone()), 186 185 db: DbControl(state.clone()), 187 186 #[cfg(feature = 
"backlinks")] ··· 436 435 // 11. spawn crawler infrastructure 437 436 #[cfg(feature = "indexer")] 438 437 { 439 - use crate::crawler::throttle::Throttler; 440 438 use crate::crawler::{ 441 439 CrawlerStats, CrawlerWorker, InFlight, RetryProducer, SignalChecker, 442 440 }; 441 + use crate::util::throttle::Throttler; 443 442 444 443 let http = reqwest::Client::builder() 445 444 .user_agent(concat!( ··· 450 449 .gzip(true) 451 450 .build() 452 451 .expect("that reqwest will build"); 453 - let pds_throttler = Throttler::new(); 452 + let pds_throttler = state.throttler.clone(); 454 453 let in_flight = InFlight::new(); 455 454 let stats = CrawlerStats::new( 456 455 state.clone(), ··· 714 713 /// 715 714 /// sizes are in bytes, reported per keyspace. 716 715 pub async fn stats(&self) -> Result<StatsResponse> { 717 - let db = self.state.db.clone(); 716 + let state = self.state.clone(); 718 717 719 718 let mut counts: BTreeMap<&'static str, u64> = futures::future::join_all( 720 719 [ ··· 729 728 ] 730 729 .into_iter() 731 730 .map(|name| { 732 - let db = db.clone(); 733 - async move { (name, db.get_count(name).await) } 731 + let state = state.clone(); 732 + async move { (name, state.db.get_count(name).await) } 734 733 }), 735 734 ) 736 735 .await 737 736 .into_iter() 738 737 .collect(); 739 738 740 - counts.insert("events", db.events.approximate_len() as u64); 739 + counts.insert("events", state.db.events.approximate_len() as u64); 741 740 742 741 let sizes = tokio::task::spawn_blocking(move || { 743 742 let mut s = BTreeMap::new(); 744 - s.insert("repos", db.repos.disk_space()); 745 - s.insert("records", db.records.disk_space()); 746 - s.insert("blocks", db.blocks.disk_space()); 747 - s.insert("cursors", db.cursors.disk_space()); 748 - s.insert("pending", db.pending.disk_space()); 749 - s.insert("resync", db.resync.disk_space()); 750 - s.insert("resync_buffer", db.resync_buffer.disk_space()); 751 - s.insert("events", db.events.disk_space()); 752 - s.insert("counts", 
db.counts.disk_space()); 753 - s.insert("filter", db.filter.disk_space()); 754 - s.insert("crawler", db.crawler.disk_space()); 743 + s.insert("repos", state.db.repos.disk_space()); 744 + s.insert("records", state.db.records.disk_space()); 745 + s.insert("blocks", state.db.blocks.disk_space()); 746 + s.insert("cursors", state.db.cursors.disk_space()); 747 + s.insert("pending", state.db.pending.disk_space()); 748 + s.insert("resync", state.db.resync.disk_space()); 749 + s.insert("resync_buffer", state.db.resync_buffer.disk_space()); 750 + s.insert("events", state.db.events.disk_space()); 751 + s.insert("counts", state.db.counts.disk_space()); 752 + s.insert("filter", state.db.filter.disk_space()); 753 + s.insert("crawler", state.db.crawler.disk_space()); 755 754 s 756 755 }) 757 756 .await ··· 788 787 /// 789 788 /// returns the seq we are on for this host. 790 789 pub async fn get_host_status(&self, hostname: &str) -> Result<Option<Host>> { 791 - let db = self.state.db.clone(); 790 + let state = self.state.clone(); 792 791 let hostname = hostname.to_smolstr(); 793 792 794 793 tokio::task::spawn_blocking(move || { 795 794 let key = keys::firehose_cursor_key(&hostname); 796 - let Some(seq) = db.cursors.get(&key).into_diagnostic()? else { 795 + let Some(seq) = state.db.cursors.get(&key).into_diagnostic()? 
else { 797 796 return Ok(None); 798 797 }; 799 798 let seq = i64::from_be_bytes( ··· 820 819 cursor: Option<&str>, 821 820 limit: usize, 822 821 ) -> Result<(Vec<Host>, Option<SmolStr>)> { 823 - let db = self.state.db.clone(); 822 + let state = self.state.clone(); 824 823 let cursor = cursor.map(str::to_string); 825 824 826 825 tokio::task::spawn_blocking(move || { ··· 836 835 837 836 // fetch one extra item to detect whether there is a next page 838 837 let mut hosts: Vec<Host> = Vec::with_capacity(limit + 1); 839 - for item in db 838 + for item in state 839 + .db 840 840 .cursors 841 841 .range((start_bound, std::ops::Bound::Excluded(prefix_end))) 842 842 .take(limit + 1) ··· 972 972 state 973 973 .with_ingestion_paused(async || { 974 974 let train = |name: &'static str| { 975 - let db = state.db.clone(); 976 - tokio::task::spawn_blocking(move || db.train_dict(name)) 977 - .map(|res| res.into_diagnostic().flatten()) 975 + let state = state.clone(); 976 + tokio::task::spawn_blocking(move || state.db.train_dict(name)) 977 + .map(|res: Result<_, _>| res.into_diagnostic().flatten()) 978 978 }; 979 979 tokio::try_join!(train("repos"), train("blocks"), train("events")).map(|_| ()) 980 980 })
+113
src/control/pds.rs
··· 1 + use std::collections::HashMap; 2 + use std::sync::Arc; 3 + 4 + use miette::{IntoDiagnostic, Result}; 5 + use serde::Serialize; 6 + use smol_str::SmolStr; 7 + 8 + use crate::config::RateTier; 9 + use crate::db::pds_tiers as db_pds; 10 + use crate::state::AppState; 11 + 12 + /// a single PDS-to-tier assignment. 13 + #[derive(Debug, Clone, Serialize)] 14 + pub struct PdsTierAssignment { 15 + pub host: String, 16 + pub tier: String, 17 + } 18 + 19 + /// a rate tier definition, as returned by the API. 20 + #[derive(Debug, Clone, Serialize)] 21 + pub struct PdsTierDefinition { 22 + pub per_second_base: u64, 23 + pub per_second_account_mul: f64, 24 + pub per_hour: u64, 25 + pub per_day: u64, 26 + } 27 + 28 + impl From<RateTier> for PdsTierDefinition { 29 + fn from(t: RateTier) -> Self { 30 + Self { 31 + per_second_base: t.per_second_base, 32 + per_second_account_mul: t.per_second_account_mul, 33 + per_hour: t.per_hour, 34 + per_day: t.per_day, 35 + } 36 + } 37 + } 38 + 39 + /// runtime control over pds related behaviour (eg. ratelimits). 40 + #[derive(Clone)] 41 + pub struct PdsControl(pub(super) Arc<AppState>); 42 + 43 + impl PdsControl { 44 + /// list all current per-PDS tier assignments. 45 + pub async fn list_assignments(&self) -> Vec<PdsTierAssignment> { 46 + let snapshot = self.0.pds_tiers.load(); 47 + snapshot 48 + .iter() 49 + .map(|(host, tier)| PdsTierAssignment { 50 + host: host.clone(), 51 + tier: tier.to_string(), 52 + }) 53 + .collect() 54 + } 55 + 56 + /// list all configured rate tier definitions. 57 + pub fn list_rate_tiers(&self) -> HashMap<String, PdsTierDefinition> { 58 + self.0 59 + .rate_tiers 60 + .iter() 61 + .map(|(name, tier)| (name.clone(), PdsTierDefinition::from(*tier))) 62 + .collect() 63 + } 64 + 65 + /// assign `host` to `tier`, persisting the change to the database. 66 + /// returns an error if `tier` is not a known tier name. 
67 + pub async fn set_tier(&self, host: String, tier: String) -> Result<()> { 68 + if !self.0.rate_tiers.contains_key(&tier) { 69 + miette::bail!( 70 + "unknown tier '{tier}'; known tiers: {:?}", 71 + self.0.rate_tiers.keys().collect::<Vec<_>>() 72 + ); 73 + } 74 + 75 + let state = self.0.clone(); 76 + let host_clone = host.clone(); 77 + let tier_clone = tier.clone(); 78 + tokio::task::spawn_blocking(move || { 79 + let mut batch = state.db.inner.batch(); 80 + db_pds::set(&mut batch, &state.db.filter, &host_clone, &tier_clone); 81 + batch.commit().into_diagnostic()?; 82 + state.db.persist() 83 + }) 84 + .await 85 + .into_diagnostic()??; 86 + 87 + let mut snapshot = (**self.0.pds_tiers.load()).clone(); 88 + snapshot.insert(host, SmolStr::new(&tier)); 89 + self.0.pds_tiers.store(Arc::new(snapshot)); 90 + 91 + Ok(()) 92 + } 93 + 94 + /// remove any explicit tier assignment for `host`, reverting it to the default tier. 95 + pub async fn remove_tier(&self, host: String) -> Result<()> { 96 + let state = self.0.clone(); 97 + let host_clone = host.clone(); 98 + tokio::task::spawn_blocking(move || { 99 + let mut batch = state.db.inner.batch(); 100 + db_pds::remove(&mut batch, &state.db.filter, &host_clone); 101 + batch.commit().into_diagnostic()?; 102 + state.db.persist() 103 + }) 104 + .await 105 + .into_diagnostic()??; 106 + 107 + let mut snapshot = (**self.0.pds_tiers.load()).clone(); 108 + snapshot.remove(&host); 109 + self.0.pds_tiers.store(Arc::new(snapshot)); 110 + 111 + Ok(()) 112 + } 113 + }
+12 -6
src/control/repos.rs
··· 86 86 std::ops::Bound::Unbounded 87 87 }; 88 88 89 - let db = self.0.db.clone(); 89 + let state = self.0.clone(); 90 90 self.0 91 91 .db 92 92 .repos ··· 96 96 let repo_state = crate::db::deser_repo_state(&v)?.into_static(); 97 97 let did = TrimmedDid::try_from(k.as_ref())?.to_did(); 98 98 let metadata_key = keys::repo_metadata_key(&did); 99 - let metadata = db 99 + let metadata = state 100 + .db 100 101 .repo_metadata 101 102 .get(&metadata_key) 102 103 .into_diagnostic()? ··· 122 123 }; 123 124 124 125 let repos = self.0.db.repos.clone(); 125 - let db = self.0.db.clone(); 126 + let state = self.0.clone(); 126 127 self.0 127 128 .db 128 129 .pending ··· 144 145 let repo_state = crate::db::deser_repo_state(bytes.as_ref())?; 145 146 let did = TrimmedDid::try_from(did_key.as_ref())?.to_did(); 146 147 let metadata_key = keys::repo_metadata_key(&did); 147 - let metadata = db 148 + let metadata = state 149 + .db 148 150 .repo_metadata 149 151 .get(&metadata_key) 150 152 .into_diagnostic()? ··· 169 171 }; 170 172 171 173 let repos = self.0.db.repos.clone(); 172 - let db = self.0.db.clone(); 174 + let state = self.0.clone(); 173 175 self.0 174 176 .db 175 177 .resync ··· 184 186 let repo_state = crate::db::deser_repo_state(bytes.as_ref())?; 185 187 let did = TrimmedDid::try_from(did_key.as_ref())?.to_did(); 186 188 let metadata_key = keys::repo_metadata_key(&did); 187 - let metadata = db 189 + let metadata = state 190 + .db 188 191 .repo_metadata 189 192 .get(&metadata_key) 190 193 .into_diagnostic()? 
··· 304 307 } 305 308 306 309 batch.commit().into_diagnostic()?; 310 + state.db.persist()?; 307 311 Ok::<_, miette::Report>((queued, transitions)) 308 312 }) 309 313 .await ··· 369 373 } 370 374 371 375 batch.commit().into_diagnostic()?; 376 + state.db.persist()?; 372 377 Ok::<_, miette::Report>((added, queued, transitions)) 373 378 }) 374 379 .await ··· 438 443 } 439 444 440 445 batch.commit().into_diagnostic()?; 446 + state.db.persist()?; 441 447 Ok::<_, miette::Report>((untracked, gauge_decrements)) 442 448 }) 443 449 .await
+3 -3
src/crawler/list_repos.rs
··· 1 - use crate::crawler::throttle::{OrFailure, ThrottleHandle, Throttler}; 2 1 use crate::db::keys::crawler_cursor_key; 3 2 use crate::db::{Db, keys}; 4 3 use crate::state::AppState; 4 + use crate::util::throttle::{OrFailure, ThrottleHandle, Throttler}; 5 5 use crate::util::{ 6 6 ErrorForStatus, RetryOutcome, RetryWithBackoff, WatchEnabledExt, parse_retry_after, 7 7 }; ··· 653 653 } 654 654 655 655 async fn process_queue(&self) -> Result<Option<Duration>> { 656 - let db = self.checker.state.db.clone(); 656 + let state = self.checker.state.clone(); 657 657 658 658 struct ScanResult { 659 659 ready: Vec<Did<'static>>, ··· 675 675 let mut next_wake: Option<Duration> = None; 676 676 let mut had_more = false; 677 677 678 - for guard in db.crawler.prefix(keys::CRAWLER_RETRY_PREFIX) { 678 + for guard in state.db.crawler.prefix(keys::CRAWLER_RETRY_PREFIX) { 679 679 let (key, val) = guard.into_inner().into_diagnostic()?; 680 680 let state: RetryState = rmp_serde::from_slice(&val).into_diagnostic()?; 681 681 let did = keys::crawler_retry_parse_key(&key)?.to_did();
+1 -2
src/crawler/mod.rs
··· 14 14 15 15 mod by_collection; 16 16 mod list_repos; 17 - pub mod throttle; 18 17 mod worker; 19 18 20 - use throttle::Throttler; 19 + use crate::util::throttle::Throttler; 21 20 22 21 pub(crate) use by_collection::ByCollectionProducer; 23 22 pub(crate) use list_repos::{ListReposProducer, RetryProducer, SignalChecker};
+114 -5
src/crawler/throttle.rs src/util/throttle.rs
··· 1 + use crate::config::RateTier; 2 + use parking_lot::Mutex; 1 3 use scc::HashMap; 2 4 use std::future::Future; 3 5 use std::sync::Arc; 4 6 use std::sync::atomic::{AtomicI64, AtomicUsize, Ordering}; 5 - use std::time::Duration; 7 + use std::time::{Duration, Instant}; 6 8 use tokio::sync::{Notify, Semaphore, SemaphorePermit}; 7 9 use url::Url; 8 10 ··· 10 12 /// ref pds allows 10 requests per second... so 10 should be fine 11 13 const PER_PDS_CONCURRENCY: usize = 10; 12 14 15 + // per second, hour and day 16 + const DURATIONS: [Duration; 3] = [ 17 + Duration::from_secs(1), 18 + Duration::from_secs(3600), 19 + Duration::from_secs(86400), 20 + ]; 21 + 13 22 #[derive(Clone)] 14 23 pub struct Throttler { 15 24 states: Arc<HashMap<Url, Arc<State>>>, ··· 54 63 /// let tasks exit naturally, deferring to the background retry loop. 55 64 failure_notify: Notify, 56 65 semaphore: Semaphore, 66 + rate_limiter: RateLimiter, 57 67 } 58 68 59 69 impl State { ··· 64 74 consecutive_timeouts: AtomicUsize::new(0), 65 75 failure_notify: Notify::new(), 66 76 semaphore: Semaphore::new(PER_PDS_CONCURRENCY), 77 + rate_limiter: RateLimiter::new(), 67 78 } 68 79 } 69 80 } ··· 92 103 /// called on a 429 response. `retry_after_secs` comes from the `Retry-After` 93 104 /// header if present; falls back to 60s. uses `fetch_max` so concurrent callers 94 105 /// don't race each other back to a shorter window. 95 - /// 96 - /// deliberately does NOT notify waiters — 429s are soft and tasks should exit 97 - /// naturally via the `Retry` result rather than being cancelled. 98 106 pub fn record_ratelimit(&self, retry_after_secs: Option<u64>) { 99 107 let secs = retry_after_secs.unwrap_or(60) as i64; 100 108 let until = chrono::Utc::now().timestamp() + secs; ··· 155 163 } 156 164 157 165 /// resolves when this PDS gets a hard failure notification. 158 - /// used by `or_throttle` and the semaphore acquire select to cancel in-flight work. 
159 166 pub async fn wait_for_failure(&self) { 160 167 loop { 161 168 let notified = self.state.failure_notify.notified(); ··· 163 170 return; 164 171 } 165 172 notified.await; 173 + } 174 + } 175 + 176 + /// waits until the rate tier's limits allow more events for this PDS. 177 + /// sleeps precisely until the most restrictive window opens rather than polling. 178 + pub async fn wait_for_allow(&self, num_accounts: u64, tier: &RateTier) { 179 + let limits = limits_for(num_accounts, tier); 180 + while let Some(wait) = self.state.rate_limiter.try_acquire(limits) { 181 + tokio::time::sleep(wait).await; 182 + } 183 + } 184 + } 185 + 186 + fn limits_for(num_accounts: u64, tier: &RateTier) -> [u64; 3] { 187 + let per_sec = tier 188 + .per_second_base 189 + .max((num_accounts as f64 * tier.per_second_account_mul) as u64); 190 + [per_sec, tier.per_hour, tier.per_day] 191 + } 192 + 193 + struct WindowState { 194 + count: u64, 195 + prev_count: u64, 196 + window_start: Instant, 197 + } 198 + 199 + impl WindowState { 200 + fn new() -> Self { 201 + Self { 202 + count: 0, 203 + prev_count: 0, 204 + window_start: Instant::now(), 205 + } 206 + } 207 + 208 + fn rotate(&mut self, dur: Duration) { 209 + let elapsed = self.window_start.elapsed(); 210 + if elapsed >= dur { 211 + let n = (elapsed.as_nanos() / dur.as_nanos()).max(1) as u32; 212 + self.prev_count = if n == 1 { self.count } else { 0 }; 213 + self.count = 0; 214 + self.window_start += dur * n; 215 + } 216 + } 217 + 218 + /// returns how long to sleep before this window would allow one more event. 219 + /// Duration::ZERO means allow now. 
    /// returns how long to sleep before this window would allow one more event.
    /// Duration::ZERO means allow now.
    ///
    /// sliding-window estimate: the previous window's count contributes
    /// proportionally to how much of it still overlaps the trailing window
    /// (`weight` = fraction of `dur` remaining), so the limit decays smoothly
    /// instead of resetting on each rotation boundary.
    fn wait_needed(&self, dur: Duration, limit: u64) -> Duration {
        let elapsed = self.window_start.elapsed();
        let remaining = dur.saturating_sub(elapsed);
        // fraction of the previous window still inside the trailing window
        let weight = remaining.as_secs_f64() / dur.as_secs_f64();
        let effective = self.count as f64 + self.prev_count as f64 * weight;

        if effective < limit as f64 {
            return Duration::ZERO;
        }

        if self.prev_count == 0 || self.count as f64 >= limit as f64 {
            // must wait for a full window rotation
            // (no decaying prev contribution to wait out, or the current
            // window alone already saturates the limit; +1ms guards rounding
            // so we land strictly past the rotation point)
            remaining + Duration::from_millis(1)
        } else {
            // solve `count + prev_count * weight' == limit` for the remaining
            // time weight' corresponds to, and sleep the difference; the
            // +500µs epsilon absorbs float error so we don't wake a hair early
            let secs = remaining.as_secs_f64()
                - dur.as_secs_f64() * (limit as f64 - self.count as f64) / self.prev_count as f64;
            Duration::from_secs_f64(secs.max(0.0)) + Duration::from_micros(500)
        }
    }
}

/// per-PDS sliding-window rate limiter over the second/hour/day windows
/// in `DURATIONS`; one `WindowState` per window, advanced lazily on access.
struct RateLimiter {
    // parking_lot::Mutex — uncontended path never touches the kernel
    windows: Mutex<[WindowState; 3]>,
}

impl RateLimiter {
    fn new() -> Self {
        Self {
            windows: Mutex::new([WindowState::new(), WindowState::new(), WindowState::new()]),
        }
    }

    /// returns None if the slot was acquired, or Some(sleep_for) if limited.
    ///
    /// `limits` pairs positionally with `DURATIONS` (per-second, per-hour,
    /// per-day). an event is counted against ALL windows only when every
    /// window allows it; otherwise nothing is consumed and the caller should
    /// sleep for the returned duration (the most restrictive window's wait).
    fn try_acquire(&self, limits: [u64; 3]) -> Option<Duration> {
        let mut windows = self.windows.lock();

        // lazily rotate each window before measuring, so stale windows don't
        // report counts from long-elapsed periods
        windows
            .iter_mut()
            .zip(DURATIONS)
            .for_each(|(w, d)| w.rotate(d));

        let max_wait = windows
            .iter()
            .zip(DURATIONS)
            .zip(limits)
            .map(|((w, dur), limit)| w.wait_needed(dur, limit))
            .max()
            .unwrap_or(Duration::ZERO);

        if max_wait.is_zero() {
            windows.iter_mut().for_each(|w| w.count += 1);
            None
        } else {
            Some(max_wait)
        }
    }
}
+20 -11
src/crawler/worker.rs
··· 137 137 138 138 // filter already-known repos, build and commit the write batch, then return 139 139 // the surviving guards so they are dropped on the async side after commit. 140 - let db = self.state.db.clone(); 140 + let app_state = self.state.clone(); 141 141 let surviving = tokio::time::timeout( 142 142 BLOCKING_TASK_TIMEOUT, 143 143 tokio::task::spawn_blocking(move || -> Result<Vec<InFlightGuard>> { 144 144 let mut rng: SmallRng = rand::make_rng(); 145 - let mut batch = db.inner.batch(); 145 + let mut batch = app_state.db.inner.batch(); 146 146 let mut surviving = Vec::new(); 147 147 for guard in guards { 148 148 let did_key = keys::repo_key(&*guard); 149 149 let metadata_key = keys::repo_metadata_key(&*guard); 150 - if db.repos.contains_key(&did_key).into_diagnostic()? { 150 + if app_state 151 + .db 152 + .repos 153 + .contains_key(&did_key) 154 + .into_diagnostic()? 155 + { 151 156 continue; 152 157 } 153 158 let state = RepoState::backfilling(); 154 159 let metadata = RepoMetadata::backfilling(rng.next_u64()); 155 - batch.insert(&db.repos, &did_key, ser_repo_state(&state)?); 160 + batch.insert(&app_state.db.repos, &did_key, ser_repo_state(&state)?); 156 161 batch.insert( 157 - &db.repo_metadata, 162 + &app_state.db.repo_metadata, 158 163 &metadata_key, 159 164 crate::db::ser_repo_metadata(&metadata)?, 160 165 ); 161 - batch.insert(&db.pending, keys::pending_key(metadata.index_id), &did_key); 166 + batch.insert( 167 + &app_state.db.pending, 168 + keys::pending_key(metadata.index_id), 169 + &did_key, 170 + ); 162 171 // clear any stale retry entry, this DID is confirmed and being enqueued 163 - batch.remove(&db.crawler, keys::crawler_retry_key(&*guard)); 172 + batch.remove(&app_state.db.crawler, keys::crawler_retry_key(&*guard)); 164 173 trace!(did = %*guard, "enqueuing repo"); 165 174 surviving.push(guard); 166 175 } 167 176 if let Some(cursor) = cursor_update { 168 - batch.insert(&db.cursors, cursor.key, cursor.value); 177 + 
batch.insert(&app_state.db.cursors, cursor.key, cursor.value); 169 178 } 170 179 // todo: repo state overwrites here are acceptable? 171 180 batch.commit().into_diagnostic()?; ··· 202 211 } 203 212 204 213 async fn commit_cursor(&self, cursor: CursorUpdate) -> Result<()> { 205 - let db = self.state.db.clone(); 214 + let state = self.state.clone(); 206 215 tokio::time::timeout( 207 216 BLOCKING_TASK_TIMEOUT, 208 217 tokio::task::spawn_blocking(move || { 209 - let mut batch = db.inner.batch(); 210 - batch.insert(&db.cursors, cursor.key, cursor.value); 218 + let mut batch = state.db.inner.batch(); 219 + batch.insert(&state.db.cursors, cursor.key, cursor.value); 211 220 batch.commit().into_diagnostic() 212 221 }), 213 222 )
+8 -13
src/db/filter.rs
··· 1 1 use fjall::{Keyspace, OwnedWriteBatch}; 2 - use jacquard_common::IntoStatic; 3 - use jacquard_common::types::nsid::Nsid; 4 2 use jacquard_common::types::string::Did; 5 3 use miette::{IntoDiagnostic, Result}; 6 4 7 5 use crate::db::types::TrimmedDid; 8 - use crate::filter::{FilterConfig, FilterMode, SetUpdate}; 6 + use crate::filter::{FilterConfig, FilterMode}; 7 + use crate::patch::SetUpdate; 9 8 10 9 pub const MODE_KEY: &[u8] = b"m"; 11 10 pub const SIGNAL_PREFIX: u8 = b's'; ··· 113 112 for guard in ks.prefix(signal_prefix) { 114 113 let (k, _) = guard.into_inner().into_diagnostic()?; 115 114 let val = std::str::from_utf8(&k[signal_prefix.len()..]).into_diagnostic()?; 116 - config.signals.push(Nsid::new(val)?.into_static()); 115 + config.signals.push(val.into()); 117 116 } 118 117 119 118 let col_prefix = [COLLECTION_PREFIX, SEP]; 120 119 for guard in ks.prefix(col_prefix) { 121 120 let (k, _) = guard.into_inner().into_diagnostic()?; 122 121 let val = std::str::from_utf8(&k[col_prefix.len()..]).into_diagnostic()?; 123 - config.collections.push(Nsid::new(val)?.into_static()); 122 + config.collections.push(val.into()); 124 123 } 125 124 126 125 Ok(config) ··· 144 143 145 144 #[cfg(test)] 146 145 mod tests { 146 + use smol_str::SmolStr; 147 + 147 148 use super::*; 148 149 149 150 #[test] ··· 198 199 199 200 let config = load(&ks)?; 200 201 assert_eq!(config.mode, FilterMode::Filter); 201 - assert_eq!( 202 - config.signals, 203 - vec![Nsid::new("a.b.c").unwrap().into_static()] 204 - ); 205 - assert_eq!( 206 - config.collections, 207 - vec![Nsid::new("d.e.f").unwrap().into_static()] 208 - ); 202 + assert_eq!(config.signals, vec![SmolStr::new("a.b.c")]); 203 + assert_eq!(config.collections, vec![SmolStr::new("d.e.f")]); 209 204 210 205 let excludes = read_set(&ks, EXCLUDE_PREFIX)?; 211 206 assert_eq!(excludes, vec!["did:plc:yk4q3id7id6p5z3bypvshc64"]);
+4
src/db/keys/mod.rs
··· 231 231 key 232 232 } 233 233 234 + pub fn pds_account_count_key(host: &str) -> String { 235 + format!("p|{host}") 236 + } 237 + 234 238 #[cfg(feature = "relay")] 235 239 /// key format: {SEQ} (u64 big-endian), mirroring event_key 236 240 pub fn relay_event_key(seq: u64) -> [u8; 8] {
+1 -1
src/db/mod.rs
··· 29 29 pub mod filter; 30 30 pub mod keys; 31 31 pub mod migration; 32 + pub mod pds_tiers; 32 33 pub mod types; 33 34 34 35 use tokio::sync::broadcast; ··· 38 39 KeyspaceCreateOptions::default() 39 40 } 40 41 41 - #[derive(Clone)] 42 42 pub struct Db { 43 43 pub inner: Arc<Database>, 44 44 pub path: std::path::PathBuf,
+33
src/db/pds_tiers.rs
··· 1 + use fjall::{Keyspace, OwnedWriteBatch}; 2 + use miette::{IntoDiagnostic, Result}; 3 + use smol_str::SmolStr; 4 + 5 + pub const PDS_TIER_PREFIX: &[u8] = b"pt|"; 6 + 7 + // `pt|{host}` -> tier name 8 + pub fn pds_tier_key(host: &str) -> Vec<u8> { 9 + let mut key = Vec::with_capacity(PDS_TIER_PREFIX.len() + host.len()); 10 + key.extend_from_slice(PDS_TIER_PREFIX); 11 + key.extend_from_slice(host.as_bytes()); 12 + key 13 + } 14 + 15 + /// load all PDS tier assignments from the filter keyspace 16 + pub fn load(ks: &Keyspace) -> Result<Vec<(SmolStr, SmolStr)>> { 17 + let mut out = Vec::new(); 18 + for guard in ks.prefix(PDS_TIER_PREFIX) { 19 + let (k, v) = guard.into_inner().into_diagnostic()?; 20 + let host = std::str::from_utf8(&k[PDS_TIER_PREFIX.len()..]).into_diagnostic()?; 21 + let tier = std::str::from_utf8(&v).into_diagnostic()?; 22 + out.push((SmolStr::new(host), SmolStr::new(tier))); 23 + } 24 + Ok(out) 25 + } 26 + 27 + pub fn set(batch: &mut OwnedWriteBatch, ks: &Keyspace, host: &str, tier: &str) { 28 + batch.insert(ks, pds_tier_key(host), tier.as_bytes()); 29 + } 30 + 31 + pub fn remove(batch: &mut OwnedWriteBatch, ks: &Keyspace, host: &str) { 32 + batch.remove(ks, pds_tier_key(host)); 33 + }
+3 -13
src/filter.rs
··· 1 - use jacquard_common::types::nsid::Nsid; 2 1 use serde::{Deserialize, Serialize}; 2 + use smol_str::SmolStr; 3 3 use std::sync::Arc; 4 4 5 5 pub(crate) type FilterHandle = Arc<arc_swap::ArcSwap<FilterConfig>>; ··· 8 8 Arc::new(arc_swap::ArcSwap::new(Arc::new(config))) 9 9 } 10 10 11 - /// apply a bool patch or set replacement for a single set update. 12 - #[derive(Debug, Clone, Serialize, Deserialize)] 13 - #[serde(untagged)] 14 - pub(crate) enum SetUpdate { 15 - /// replace the entire set with this list 16 - Set(Vec<String>), 17 - /// patch: true = add, false = remove 18 - Patch(std::collections::HashMap<String, bool>), 19 - } 20 - 21 11 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] 22 12 #[serde(rename_all = "snake_case")] 23 13 pub enum FilterMode { ··· 28 18 #[derive(Debug, Clone, Serialize)] 29 19 pub(crate) struct FilterConfig { 30 20 pub mode: FilterMode, 31 - pub signals: Vec<Nsid<'static>>, 32 - pub collections: Vec<Nsid<'static>>, 21 + pub signals: Vec<SmolStr>, 22 + pub collections: Vec<SmolStr>, 33 23 } 34 24 35 25 impl FilterConfig {
+17 -2
src/ingest/firehose.rs
··· 3 3 use crate::ingest::{BufferTx, IngestMessage}; 4 4 use crate::state::AppState; 5 5 use crate::util::WatchEnabledExt; 6 + use crate::util::throttle::ThrottleHandle; 6 7 use jacquard_common::IntoStatic; 7 8 use jacquard_common::types::did::Did; 8 9 use miette::{IntoDiagnostic, Result}; ··· 21 22 filter: FilterHandle, 22 23 enabled: watch::Receiver<bool>, 23 24 _verify_signatures: bool, 25 + throttle: ThrottleHandle, 24 26 } 25 27 26 28 impl FirehoseIngestor { 27 - pub fn new( 29 + pub async fn new( 28 30 state: Arc<AppState>, 29 31 buffer_tx: BufferTx, 30 32 relay_host: Url, ··· 33 35 enabled: watch::Receiver<bool>, 34 36 verify_signatures: bool, 35 37 ) -> Self { 38 + let throttle = state.throttler.get_handle(&relay_host).await; 36 39 Self { 37 40 state, 38 41 buffer_tx, ··· 41 44 filter, 42 45 enabled, 43 46 _verify_signatures: verify_signatures, 47 + throttle, 44 48 } 45 49 } 46 50 47 51 #[tracing::instrument(skip(self), fields(relay = %self.relay_host))] 48 52 pub async fn run(mut self) -> Result<()> { 53 + // extract host as owned String to avoid borrow conflicts with &self inside the loop 54 + let host = self.relay_host.host_str().unwrap_or("").to_string(); 55 + let count_key = crate::db::keys::pds_account_count_key(&host); 56 + 49 57 loop { 50 58 self.enabled.wait_enabled("firehose").await; 51 59 ··· 87 95 } 88 96 }; 89 97 match decode_frame(&bytes) { 90 - Ok(msg) => self.handle_message(msg).await, 98 + Ok(msg) => { 99 + if self.is_pds { 100 + let accounts = self.state.db.get_count(&count_key).await; 101 + let tier = self.state.pds_tier_for(&host); 102 + self.throttle.wait_for_allow(accounts, &tier).await; 103 + } 104 + self.handle_message(msg).await 105 + }, 91 106 Err(e) => { 92 107 match e { 93 108 // dont disconnect on unknown op or type
+33 -14
src/ingest/relay.rs
··· 274 274 } 275 275 SubscribeReposMessage::Account(account) => { 276 276 debug!("processing account"); 277 - Self::handle_account(ctx, &mut repo_state, &msg.firehose, *account) 277 + Self::handle_account(ctx, &mut repo_state, &msg.firehose, *account, msg.is_pds) 278 278 } 279 279 _ => Ok(()), 280 280 } ··· 472 472 fn handle_account( 473 473 ctx: &mut WorkerContext, 474 474 repo_state: &mut RepoState, 475 - #[allow(unused_variables)] firehose: &Url, 475 + firehose: &Url, 476 476 #[allow(unused_mut)] mut account: Account<'static>, 477 + is_pds: bool, 477 478 ) -> Result<()> { 478 479 let event_ms = account.time.0.timestamp_millis(); 479 480 if repo_state.last_message_time.is_some_and(|t| event_ms <= t) { ··· 483 484 484 485 repo_state.advance_message_time(event_ms); 485 486 487 + // always capture was_active for count tracking, not just in indexer mode 488 + let was_active = repo_state.active; 486 489 #[cfg(feature = "indexer")] 487 - let (was_active, was_status) = (repo_state.active, repo_state.status.clone()); 490 + let was_status = repo_state.status.clone(); 488 491 489 492 repo_state.active = account.active; 490 493 if !account.active { ··· 512 515 }; 513 516 } 514 517 518 + // update per-PDS active account count on transitions 519 + if is_pds { 520 + if let Some(host) = firehose.host_str() { 521 + let count_key = keys::pds_account_count_key(host); 522 + if !was_active && repo_state.active { 523 + ctx.state.db.update_count(&count_key, 1); 524 + } else if was_active && !repo_state.active { 525 + ctx.state.db.update_count(&count_key, -1); 526 + } 527 + } 528 + } 529 + 515 530 let repo_key = keys::repo_key(&account.did); 516 531 517 532 #[cfg(feature = "indexer")] ··· 558 573 repo_state: &mut RepoState, 559 574 source_host: &str, 560 575 ) -> Result<AuthorityOutcome> { 561 - let expected = pds_host(repo_state.pds.as_deref()); 576 + let pds_host = |pds: &str| { 577 + Url::parse(pds) 578 + .ok() 579 + .and_then(|u| u.host_str().map(SmolStr::new)) 580 + }; 581 + 582 
+ let expected = repo_state.pds.as_deref().and_then(pds_host); 562 583 if expected.as_deref() == Some(source_host) { 563 584 return Ok(AuthorityOutcome::Authorized); 564 585 } 565 586 566 587 // try again once 567 588 self.refresh_doc(did, repo_state)?; 568 - let Some(expected) = pds_host(repo_state.pds.as_deref()) else { 589 + let Some(expected) = repo_state.pds.as_deref().and_then(pds_host) else { 569 590 miette::bail!("can't get pds host???"); 570 591 }; 571 592 ··· 837 858 838 859 db.update_count("repos", 1); 839 860 861 + // track initial active state for per-PDS rate limiting 862 + if msg.is_pds && repo_state.active { 863 + if let Some(host) = msg.firehose.host_str() { 864 + db.update_count(&keys::pds_account_count_key(host), 1); 865 + } 866 + } 867 + 840 868 Ok(Some(repo_state)) 841 869 } 842 870 ··· 861 889 /// host did not match even after doc resolution. 862 890 WrongHost { expected: SmolStr }, 863 891 } 864 - 865 - fn pds_host(pds: Option<&str>) -> Option<SmolStr> { 866 - // todo: add faster host parsing since we only need that 867 - pds.and_then(|pds| Url::parse(pds).ok()).map(|u| { 868 - u.host_str() 869 - .map(SmolStr::new) 870 - .expect("that there is host in pds url") 871 - }) 872 - }
+1
src/lib.rs
··· 18 18 pub(crate) mod ingest; 19 19 #[cfg(feature = "indexer")] 20 20 pub(crate) mod ops; 21 + pub(crate) mod patch; 21 22 pub(crate) mod resolver; 22 23 pub(crate) mod state; 23 24 pub(crate) mod util;
+11
src/patch.rs
use serde::{Deserialize, Serialize};

/// apply a bool patch or set replacement for a single set update.
///
/// NOTE: `#[serde(untagged)]` deserializes by trying variants in declaration
/// order — a JSON array becomes `Set`, a JSON object becomes `Patch`. the
/// variant order is therefore load-bearing; keep it when editing.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub(crate) enum SetUpdate {
    /// replace the entire set with this list
    Set(Vec<String>),
    /// patch: true = add, false = remove
    Patch(std::collections::HashMap<String, bool>),
}
+45 -1
src/state.rs
··· 1 + use std::collections::HashMap; 2 + use std::sync::Arc; 1 3 use std::sync::atomic::AtomicI64; 2 4 use std::time::Duration; 3 5 6 + use arc_swap::ArcSwap; 4 7 use miette::Result; 8 + use smol_str::SmolStr; 5 9 use tokio::sync::{Notify, watch}; 6 10 use url::Url; 7 11 8 12 use crate::{ 9 - config::Config, 13 + config::{Config, RateTier}, 10 14 db::Db, 11 15 filter::{FilterHandle, new_handle}, 12 16 resolver::Resolver, 17 + util::throttle::Throttler, 13 18 }; 14 19 20 + /// pds hostname -> tier name. updated atomically via ArcSwap. 21 + pub(crate) type PdsTierHandle = Arc<ArcSwap<HashMap<String, SmolStr>>>; 22 + 15 23 pub struct AppState { 16 24 pub db: Db, 17 25 pub resolver: Resolver, 18 26 pub(crate) filter: FilterHandle, 27 + pub(crate) pds_tiers: PdsTierHandle, 28 + pub(crate) rate_tiers: HashMap<String, RateTier>, 19 29 pub firehose_cursors: scc::HashIndex<Url, AtomicI64>, 20 30 pub backfill_notify: Notify, 21 31 pub crawler_enabled: watch::Sender<bool>, 22 32 pub firehose_enabled: watch::Sender<bool>, 23 33 pub backfill_enabled: watch::Sender<bool>, 24 34 pub ephemeral_ttl: Duration, 35 + pub throttler: Throttler, 25 36 } 26 37 27 38 impl AppState { ··· 41 52 42 53 let filter = new_handle(filter_config); 43 54 55 + // load persisted per-PDS tier assignments from the filter keyspace. 56 + // trusted_hosts from config are merged in as defaults (not persisted here; they seed 57 + // only if the host has no existing assignment in the DB). 
58 + let mut tier_map: HashMap<String, SmolStr> = crate::db::pds_tiers::load(&db.filter) 59 + .unwrap_or_default() 60 + .into_iter() 61 + .map(|(host, tier)| (host.to_string(), tier)) 62 + .collect(); 63 + for host in &config.trusted_hosts { 64 + tier_map 65 + .entry(host.clone()) 66 + .or_insert_with(|| SmolStr::new("trusted")); 67 + } 68 + let pds_tiers = Arc::new(ArcSwap::new(Arc::new(tier_map))); 69 + 44 70 let relay_cursors = scc::HashIndex::new(); 45 71 46 72 let (crawler_enabled, _) = watch::channel(crawler_default); ··· 51 77 db, 52 78 resolver, 53 79 filter, 80 + pds_tiers, 81 + rate_tiers: config.rate_tiers.clone(), 54 82 firehose_cursors: relay_cursors, 55 83 backfill_notify: Notify::new(), 56 84 crawler_enabled, 57 85 firehose_enabled, 58 86 backfill_enabled, 59 87 ephemeral_ttl: config.ephemeral_ttl.clone(), 88 + throttler: Throttler::new(), 60 89 }) 61 90 } 62 91 63 92 pub fn notify_backfill(&self) { 64 93 self.backfill_notify.notify_one(); 94 + } 95 + 96 + /// returns the rate tier for the given PDS hostname. 97 + /// falls back to the "default" tier if no assignment exists or the assigned tier is unknown. 98 + pub fn pds_tier_for(&self, host: &str) -> RateTier { 99 + let default = self 100 + .rate_tiers 101 + .get("default") 102 + .copied() 103 + .unwrap_or_else(RateTier::default_tier); 104 + let snapshot = self.pds_tiers.load(); 105 + snapshot 106 + .get(host) 107 + .and_then(|name| self.rate_tiers.get(name.as_str()).copied()) 108 + .unwrap_or(default) 65 109 } 66 110 67 111 /// pauses the crawler, firehose, and backfill worker, runs `f`, then restores their prior state.
+2
src/util.rs src/util/mod.rs
··· 10 10 11 11 use crate::{db::types::DidKey, types::RepoStatus}; 12 12 13 + pub mod throttle; 14 + 13 15 /// outcome of [`RetryWithBackoff::retry`] when the operation does not succeed. 14 16 pub enum RetryOutcome<E> { 15 17 /// ratelimited after exhausting all retries
+294
tests/api.nu
··· 250 250 print "firehose source tests passed!" 251 251 } 252 252 253 + def test-pds-tiers [url: string, pid: int] { 254 + print "=== test: pds tier management ===" 255 + 256 + # initial state: no assignments, built-in rate tiers present 257 + print " GET /pds/tiers (expect empty assignments, built-in rate_tiers)..." 258 + let initial = (http get $"($url)/pds/tiers") 259 + if ($initial.assignments | length) != 0 { 260 + fail $"expected empty assignments, got ($initial.assignments | length)" $pid 261 + } 262 + if not ("default" in $initial.rate_tiers) { 263 + fail "expected 'default' tier in rate_tiers" $pid 264 + } 265 + if not ("trusted" in $initial.rate_tiers) { 266 + fail "expected 'trusted' tier in rate_tiers" $pid 267 + } 268 + print " ok: empty assignments and built-in rate tiers present" 269 + 270 + # GET /pds/rate-tiers returns the same definitions with the right fields 271 + print " GET /pds/rate-tiers (check structure)..." 272 + let rate_tiers = (http get $"($url)/pds/rate-tiers") 273 + for tier_name in ["default", "trusted"] { 274 + let tier = ($rate_tiers | get $tier_name) 275 + for field in ["per_second_base", "per_second_account_mul", "per_hour", "per_day"] { 276 + if not ($field in $tier) { 277 + fail $"($tier_name) tier missing field ($field)" $pid 278 + } 279 + } 280 + } 281 + # trusted tier must have higher per-second limit than default 282 + if ($rate_tiers.trusted.per_second_base) <= ($rate_tiers.default.per_second_base) { 283 + fail $"expected trusted.per_second_base > default, got ($rate_tiers.trusted.per_second_base) vs ($rate_tiers.default.per_second_base)" $pid 284 + } 285 + print " ok: rate tier definitions have correct fields and expected ordering" 286 + 287 + # assign a host to the trusted tier 288 + print " PUT /pds/tiers (assign to trusted)..." 
289 + http put -f -e -t application/json $"($url)/pds/tiers" { 290 + host: "pds.example.com", 291 + tier: "trusted" 292 + } | assert-status 200 "PUT /pds/tiers" $pid 293 + let after_assign = (http get $"($url)/pds/tiers") 294 + if ($after_assign.assignments | length) != 1 { 295 + fail $"expected 1 assignment, got ($after_assign.assignments | length)" $pid 296 + } 297 + let a = ($after_assign.assignments | first) 298 + if $a.host != "pds.example.com" { 299 + fail $"expected host=pds.example.com, got ($a.host)" $pid 300 + } 301 + if $a.tier != "trusted" { 302 + fail $"expected tier=trusted, got ($a.tier)" $pid 303 + } 304 + print $" ok: assignment created host=($a.host), tier=($a.tier)" 305 + 306 + # re-assigning the same host to a different tier updates without creating a duplicate 307 + print " PUT /pds/tiers (re-assign to default)..." 308 + http put -f -e -t application/json $"($url)/pds/tiers" { 309 + host: "pds.example.com", 310 + tier: "default" 311 + } | assert-status 200 "PUT /pds/tiers re-assign" $pid 312 + let after_reassign = (http get $"($url)/pds/tiers") 313 + if ($after_reassign.assignments | length) != 1 { 314 + fail $"expected 1 assignment after re-assign, got ($after_reassign.assignments | length)" $pid 315 + } 316 + if ($after_reassign.assignments | first).tier != "default" { 317 + fail $"expected tier=default after re-assign, got (($after_reassign.assignments | first).tier)" $pid 318 + } 319 + print " ok: re-assign updates tier without creating a duplicate" 320 + 321 + # assigning an unknown tier name is rejected with 400 322 + print " PUT /pds/tiers (unknown tier, expect 400)..." 
323 + http put -f -e -t application/json $"($url)/pds/tiers" { 324 + host: "pds.example.com", 325 + tier: "nonexistent" 326 + } | assert-status 400 "PUT /pds/tiers unknown tier" $pid 327 + let after_bad = (http get $"($url)/pds/tiers") 328 + if ($after_bad.assignments | length) != 1 { 329 + fail "expected assignment count unchanged after rejected request" $pid 330 + } 331 + if ($after_bad.assignments | first).tier != "default" { 332 + fail "expected tier unchanged after rejected request" $pid 333 + } 334 + print " ok: unknown tier name rejected with 400, existing assignment unchanged" 335 + 336 + # add a second host to verify multi-assignment listing works 337 + print " PUT /pds/tiers (second host)..." 338 + http put -f -e -t application/json $"($url)/pds/tiers" { 339 + host: "other.example.com", 340 + tier: "trusted" 341 + } | assert-status 200 "PUT /pds/tiers second host" $pid 342 + let after_second = (http get $"($url)/pds/tiers") 343 + if ($after_second.assignments | length) != 2 { 344 + fail $"expected 2 assignments, got ($after_second.assignments | length)" $pid 345 + } 346 + print " ok: two distinct hosts listed independently" 347 + 348 + # remove the first host 349 + print " DELETE /pds/tiers (first host)..." 
350 +     http delete -f -e -t application/json $"($url)/pds/tiers" --data {
351 +         host: "pds.example.com"
352 +     } | assert-status 200 "DELETE /pds/tiers" $pid
353 +     let after_del = (http get $"($url)/pds/tiers")
354 +     if ($after_del.assignments | length) != 1 {
355 +         fail $"expected 1 assignment after delete, got ($after_del.assignments | length)" $pid
356 +     }
357 +     if ($after_del.assignments | first).host != "other.example.com" {
358 +         fail "expected only other.example.com to remain after delete" $pid
359 +     }
360 +     print "  ok: correct host removed, other assignment intact"
361 + 
362 +     # remove the second host
363 +     http delete -f -e -t application/json $"($url)/pds/tiers" --data {
364 +         host: "other.example.com"
365 +     } | assert-status 200 "DELETE /pds/tiers second" $pid
366 + 
367 +     # deleting a non-existent host is idempotent (returns 200, not an error)
368 +     print "  DELETE /pds/tiers (non-existent, expect 200)..."
369 +     http delete -f -e -t application/json $"($url)/pds/tiers" --data {
370 +         host: "pds.example.com"
371 +     } | assert-status 200 "DELETE /pds/tiers non-existent" $pid
372 +     let after_idempotent = (http get $"($url)/pds/tiers")
373 +     if ($after_idempotent.assignments | length) != 0 {
374 +         fail "expected empty assignments after cleanup" $pid
375 +     }
376 +     print "  ok: delete of non-existent host is idempotent"
377 + 
378 +     print "pds tier management tests passed!"
379 + }
380 + 
381 + # verify that tier assignments are written to the database and survive a restart.
382 + def test-pds-tier-persistence [binary: string, db_path: string, port: int] {
383 +     print "=== test: pds tier assignments persist across restart ==="
384 + 
385 +     let url = $"http://localhost:($port)"
386 + 
387 +     let instance = (with-env { HYDRANT_CRAWLER_URLS: "", HYDRANT_RELAY_HOSTS: "" } {
388 +         start-hydrant $binary $db_path $port
389 +     })
390 +     if not (wait-for-api $url) {
391 +         fail "hydrant did not start" $instance.pid
392 +     }
393 + 
394 +     print "  assigning host to trusted tier..."
395 +     http put -f -e -t application/json $"($url)/pds/tiers" {
396 +         host: "persist.example.com",
397 +         tier: "trusted"
398 +     } | assert-status 200 "PUT /pds/tiers persist" $instance.pid
399 + 
400 +     let before = (http get $"($url)/pds/tiers")
401 +     if ($before.assignments | length) != 1 {
402 +         fail "assignment was not created" $instance.pid
403 +     }
404 + 
405 +     print "  restarting hydrant..."
406 +     kill $instance.pid
407 +     sleep 2sec
408 + 
409 +     let instance2 = (with-env { HYDRANT_CRAWLER_URLS: "", HYDRANT_RELAY_HOSTS: "" } {
410 +         start-hydrant $binary $db_path $port
411 +     })
412 +     if not (wait-for-api $url) {
413 +         fail "hydrant did not restart" $instance2.pid
414 +     }
415 + 
416 +     print "  checking assignment survived restart..."
417 +     let after = (http get $"($url)/pds/tiers")
418 +     if ($after.assignments | length) != 1 {
419 +         fail $"expected 1 assignment after restart, got ($after.assignments | length)" $instance2.pid
420 +     }
421 +     let a = ($after.assignments | first)
422 +     if $a.host != "persist.example.com" {
423 +         fail $"expected host=persist.example.com after restart, got ($a.host)" $instance2.pid
424 +     }
425 +     if $a.tier != "trusted" {
426 +         fail $"expected tier=trusted after restart, got ($a.tier)" $instance2.pid
427 +     }
428 +     print "  ok: tier assignment persisted across restart"
429 + 
430 +     kill $instance2.pid
431 +     print "pds tier persistence test passed!"
432 + }
433 + 
434 + # verify that HYDRANT_TRUSTED_HOSTS pre-assigns hosts to the trusted tier at startup.
435 + def test-pds-trusted-hosts [binary: string, db_path: string, port: int] {
436 +     print "=== test: HYDRANT_TRUSTED_HOSTS pre-assigns tier at startup ==="
437 + 
438 +     let url = $"http://localhost:($port)"
439 +     let host_a = "alpha.example.com"
440 +     let host_b = "beta.example.com"
441 + 
442 +     let instance = (with-env {
443 +         HYDRANT_CRAWLER_URLS: "",
444 +         HYDRANT_RELAY_HOSTS: "",
445 +         HYDRANT_TRUSTED_HOSTS: $"($host_a),($host_b)"
446 +     } {
447 +         start-hydrant $binary $db_path $port
448 +     })
449 +     if not (wait-for-api $url) {
450 +         fail "hydrant did not start" $instance.pid
451 +     }
452 + 
453 +     print "  checking pre-assigned trusted hosts..."
454 +     let tiers = (http get $"($url)/pds/tiers")
455 +     let assignments = $tiers.assignments
456 + 
457 +     for host in [$host_a, $host_b] {
458 +         let match = ($assignments | where host == $host)
459 +         if ($match | length) != 1 {
460 +             fail $"expected assignment for ($host) from HYDRANT_TRUSTED_HOSTS, got ($assignments)" $instance.pid
461 +         }
462 +         if ($match | first).tier != "trusted" {
463 +             fail $"expected tier=trusted for ($host), got (($match | first).tier)" $instance.pid
464 +         }
465 +     }
466 +     print $"  ok: ($host_a) and ($host_b) pre-assigned to trusted tier"
467 + 
468 +     kill $instance.pid
469 +     print "trusted hosts startup test passed!"
470 + }
471 + 
472 + # verify that a custom tier defined via HYDRANT_RATE_TIERS is visible and assignable.
473 + def test-pds-custom-rate-tier [binary: string, db_path: string, port: int] {
474 +     print "=== test: custom rate tier via HYDRANT_RATE_TIERS ==="
475 + 
476 +     let url = $"http://localhost:($port)"
477 + 
478 +     # custom:100/1.0/360000/8640000 — base=100, mul=1.0, hourly=360000, daily=8640000
479 +     let instance = (with-env {
480 +         HYDRANT_CRAWLER_URLS: "",
481 +         HYDRANT_RELAY_HOSTS: "",
482 +         HYDRANT_RATE_TIERS: "custom:100/1.0/360000/8640000"
483 +     } {
484 +         start-hydrant $binary $db_path $port
485 +     })
486 +     if not (wait-for-api $url) {
487 +         fail "hydrant did not start" $instance.pid
488 +     }
489 + 
490 +     # custom tier should appear alongside the built-in tiers
491 +     print "  checking custom tier is listed in /pds/rate-tiers..."
492 +     let rate_tiers = (http get $"($url)/pds/rate-tiers")
493 +     if not ("custom" in $rate_tiers) {
494 +         fail "expected 'custom' tier in rate_tiers" $instance.pid
495 +     }
496 +     if not ("default" in $rate_tiers) {
497 +         fail "built-in 'default' tier should still be present alongside custom tier" $instance.pid
498 +     }
499 +     let custom = ($rate_tiers | get custom)
500 +     if $custom.per_second_base != 100 {
501 +         fail $"expected custom.per_second_base=100, got ($custom.per_second_base)" $instance.pid
502 +     }
503 +     if $custom.per_hour != 360000 {
504 +         fail $"expected custom.per_hour=360000, got ($custom.per_hour)" $instance.pid
505 +     }
506 +     print $"  ok: custom tier listed with correct parameters"
507 + 
508 +     # a host can be assigned to the custom tier
509 +     print "  assigning host to custom tier..."
510 + http put -f -e -t application/json $"($url)/pds/tiers" { 511 + host: "custom.example.com", 512 + tier: "custom" 513 + } | assert-status 200 "PUT /pds/tiers custom tier" $instance.pid 514 + let after = (http get $"($url)/pds/tiers") 515 + let match = ($after.assignments | where host == "custom.example.com") 516 + if ($match | length) != 1 { 517 + fail "expected assignment for custom.example.com" $instance.pid 518 + } 519 + if ($match | first).tier != "custom" { 520 + fail $"expected tier=custom, got (($match | first).tier)" $instance.pid 521 + } 522 + print " ok: host assigned to custom tier successfully" 523 + 524 + kill $instance.pid 525 + print "custom rate tier test passed!" 526 + } 527 + 253 528 def main [] { 254 529 let port = resolve-test-port 3007 255 530 let url = $"http://localhost:($port)" ··· 268 543 269 544 test-crawler-sources $url $instance.pid 270 545 test-firehose-sources $url $instance.pid 546 + test-pds-tiers $url $instance.pid 271 547 272 548 kill $instance.pid 273 549 sleep 2sec ··· 281 557 let db_config = (mktemp -d -t hydrant_api.XXXXXX) 282 558 print $"db: ($db_config)" 283 559 test-config-source-not-persisted $binary $db_config $port 560 + 561 + sleep 1sec 562 + 563 + let db_pds_persist = (mktemp -d -t hydrant_api.XXXXXX) 564 + print $"db: ($db_pds_persist)" 565 + test-pds-tier-persistence $binary $db_pds_persist $port 566 + 567 + sleep 1sec 568 + 569 + let db_pds_trusted = (mktemp -d -t hydrant_api.XXXXXX) 570 + print $"db: ($db_pds_trusted)" 571 + test-pds-trusted-hosts $binary $db_pds_trusted $port 572 + 573 + sleep 1sec 574 + 575 + let db_pds_custom = (mktemp -d -t hydrant_api.XXXXXX) 576 + print $"db: ($db_pds_custom)" 577 + test-pds-custom-rate-tier $binary $db_pds_custom $port 284 578 285 579 print "" 286 580 print "all api tests passed!"