···238238| `ENABLE_CRAWLER` | `true` if full network or crawler sources are configured, `false` otherwise | whether to actively query the network for unknown repositories. |
239239| `CRAWLER_MAX_PENDING_REPOS` | `2000` | max pending repos for crawler. |
240240| `CRAWLER_RESUME_PENDING_REPOS` | `1000` | resume threshold for crawler pending repos. |
241241+| `NEW_HOST_LIMIT` | `50` | in relay mode, the maximum number of new (previously unknown) hosts that can be added via `com.atproto.sync.requestCrawl` per UTC day. |
241242| `RATE_TIERS` | | comma-separated list of named rate tier definitions in `name:base/mul/hourly/daily[/account_limit]` format (e.g. `trusted:5000/10.0/18000000/432000000/10000000`). the optional account limit prevents new accounts from being created on this PDS once reached. built-in tiers (`default`, `trusted`) are always present and can be overridden. |
242243| `TIER_RULES` | | comma-separated ordered list of glob rules in `pattern:tier_name` format (e.g. `*.bsky.network:trusted`). rules are evaluated in order; first match wins. explicit API assignments via `PUT /pds/tiers` take precedence over rules; the `default` tier is the final fallback. uses standard glob wildcards (`*`, `?`) matched against the PDS hostname. |
243244
···11use jacquard_api::com_atproto::sync::request_crawl::{
22 RequestCrawlError, RequestCrawlRequest, RequestCrawlResponse,
33};
44+use miette::IntoDiagnostic;
45use url::Url;
5667use super::*;
···2122 "host is banned".into(),
2223 ))),
2324 });
2525+ }
2626+2727+ // enforce daily new pds limit on unknown hosts
2828+ if !hydrant.firehose.is_source_known(&url) {
2929+ let (allowed, to_persist) = hydrant.state.pds_daily_limit.try_increment();
3030+ if !allowed {
3131+ return Err(rate_limited(
3232+ nsid,
3333+ "daily limit for new PDS sources reached",
3434+ ));
3535+ }
3636+3737+ // persist the new count before returning so a crash cannot reset the counter
3838+ // and allow the budget to be replayed.
3939+ if let Some((day, count)) = to_persist {
4040+ let state = hydrant.state.clone();
4141+ tokio::task::spawn_blocking(move || {
4242+ crate::db::save_pds_daily_adds(&state.db, day, count)
4343+ })
4444+ .await
4545+ .into_diagnostic()
4646+ .flatten()
4747+ .map_err(|e| internal_error(nsid, e))?;
4848+ }
2449 }
25502651 hydrant
+13
src/config.rs
···375375 /// set via `HYDRANT_ONLY_INDEX_LINKS=true`.
376376 pub only_index_links: bool,
377377378378+ /// maximum number of new PDS sources that may be added (via seeding or API) in a single
379379+ /// UTC calendar day. `None` means unlimited.
380380+ /// set via `HYDRANT_NEW_HOST_LIMIT`.
381381+ pub new_host_limit: Option<u64>,
382382+378383 /// base URL(s) of relay or aggregator services to seed firehose PDS sources from at startup.
379384 ///
380385 /// hydrant calls `com.atproto.sync.listHosts` on each URL and adds the returned PDSes
···495500 filter_excludes: None,
496501 enable_backlinks: false,
497502 only_index_links: false,
503503+ new_host_limit: Some(50),
498504 tier_rules: vec![],
499505 tier_policy: {
500506 let mut tiers = HashMap::new();
···671677672678 let enable_backlinks: bool = cfg!("ENABLE_BACKLINKS", defaults.enable_backlinks);
673679 let only_index_links: bool = cfg!("ONLY_INDEX_LINKS", defaults.only_index_links);
680680+ let max_pds_added_per_day: Option<u64> = std::env::var("HYDRANT_NEW_HOST_LIMIT")
681681+ .ok()
682682+ .and_then(|s| s.parse().ok());
674683675684 // start with built-in tier definitions, then layer in any env-defined overrides.
676685 // format: HYDRANT_RATE_TIERS=name:base/mul/hourly/daily,...
···785794 filter_excludes,
786795 enable_backlinks,
787796 only_index_links,
797797+ new_host_limit: max_pds_added_per_day,
788798 tier_policy,
789799 tier_rules,
790800 cache_size,
···914924 .collect::<Vec<_>>()
915925 )
916926 )?;
927927+ }
928928+ if let Some(limit) = self.new_host_limit {
929929+ config_line!(f, "max pds/day", limit)?;
917930 }
918931 Ok(())
919932 }
-4
src/control/crawler.rs
···3232pub struct CrawlerSourceInfo {
3333 pub url: Url,
3434 pub mode: crate::config::CrawlerMode,
3535- /// whether this source is persisted in the database (i.e. it was dynamically added
3636- /// and will survive restarts). config-sourced entries have `persisted: false`.
3737- pub persisted: bool,
3835}
39364037pub(super) fn spawn_crawler_producer(
···179176 sources.push(CrawlerSourceInfo {
180177 url: url.clone(),
181178 mode: h.mode,
182182- persisted: self.persisted.contains_sync(url),
183179 });
184180 true
185181 })
+12-9
src/control/firehose.rs
···3434#[derive(Debug, Clone, serde::Serialize)]
3535pub struct FirehoseSourceInfo {
3636 pub url: Url,
3737- /// true if added via the API and persisted to the database; false for `RELAY_HOSTS` sources.
3838- pub persisted: bool,
3937 /// true when this is a direct PDS connection; enables host authority enforcement.
4038 pub is_pds: bool,
4139}
···4846 pub(super) shared: Arc<std::sync::OnceLock<FirehoseShared>>,
4947 /// per-relay running tasks, keyed by url.
5048 pub(super) tasks: Arc<scc::HashMap<Url, FirehoseIngestorHandle>>,
5151- /// set of urls persisted in the database (dynamically added sources).
5252- pub(super) persisted: Arc<scc::HashSet<Url>>,
4949+ /// set of known source urls, includes API-added (db-persisted) and static config sources.
5050+ pub(super) known_sources: Arc<scc::HashSet<Url>>,
5351 /// ids assigned to spawned tasks
5452 next_task_id: Arc<AtomicUsize>,
5553}
···6058 state,
6159 shared: Arc::new(std::sync::OnceLock::new()),
6260 tasks: Arc::new(scc::HashMap::new()),
6363- persisted: Arc::new(scc::HashSet::new()),
6161+ known_sources: Arc::new(scc::HashSet::new()),
6462 next_task_id: Arc::new(AtomicUsize::new(0)),
6563 }
6664 }
···153151 *self.state.firehose_enabled.borrow()
154152 }
155153154154+ /// returns `true` if this URL is already a known firehose source — either currently
155155+ /// running or persisted (e.g. the host is offline but was previously added).
156156+ pub fn is_source_known(&self, url: &Url) -> bool {
157157+ self.known_sources.contains_sync(url)
158158+ }
159159+156160 /// list all currently active firehose sources.
157161 pub async fn list_sources(&self) -> Vec<FirehoseSourceInfo> {
158162 let mut out = Vec::new();
···160164 .any_async(|url, handle| {
161165 out.push(FirehoseSourceInfo {
162166 url: url.clone(),
163163- persisted: self.persisted.contains_sync(url),
164167 is_pds: handle.is_pds,
165168 });
166169 false
···197200 .await
198201 .into_diagnostic()??;
199202200200- let _ = self.persisted.insert_async(url.clone()).await;
203203+ let _ = self.known_sources.insert_async(url.clone()).await;
201204202205 // reset failure state so the fresh task gets a clean slate.
203206 // if the previous task exited after max failures, the failure counter
···217220 /// if the source was added via the API, it is removed from the database;
218221 /// if it came from the static config, only the running task is stopped.
219222 pub async fn remove_source(&self, url: &Url) -> Result<bool> {
220220- if self.persisted.contains_async(url).await {
223223+ if self.known_sources.contains_async(url).await {
221224 let url_str = url.to_string();
222225 tokio::task::spawn_blocking({
223226 let state = self.state.clone();
···232235 })
233236 .await
234237 .into_diagnostic()??;
235235- self.persisted.remove_async(url).await;
238238+ self.known_sources.remove_async(url).await;
236239 }
237240238241 Ok(self.tasks.remove_async(url).await.is_some())
+8-1
src/control/mod.rs
···414414 "starting firehose ingestor(s)"
415415 );
416416 for source in &relay_hosts {
417417+ let _ = firehose
418418+ .known_sources
419419+ .insert_async(source.url.clone())
420420+ .await;
417421 firehose
418422 .spawn_firehose_ingestor(source, fire_shared, true)
419423 .await?;
···428432 .into_diagnostic()??;
429433430434 for source in &persisted_sources {
431431- let _ = firehose.persisted.insert_async(source.url.clone()).await;
435435+ let _ = firehose
436436+ .known_sources
437437+ .insert_async(source.url.clone())
438438+ .await;
432439 if firehose.tasks.contains_async(&source.url).await {
433440 continue;
434441 }
+5
src/db/keys/mod.rs
pub fn pds_account_count_key(host: &str) -> String {
    // key layout: the literal prefix "p|" immediately followed by the host.
    let mut key = String::with_capacity(host.len() + 2);
    key.push_str("p|");
    key.push_str(host);
    key
}
103103+104104+/// key for the persisted daily-PDS-add counter in the cursors keyspace.
105105+/// value layout: [day: u64 BE][count: u64 BE] = 16 bytes.
106106+#[cfg(feature = "relay")]
107107+pub const PDS_DAILY_ADDS_KEY: &[u8] = b"pds_daily_adds";
+30
src/db/mod.rs
···836836 batch.commit().into_diagnostic()
837837}
/// load the persisted `(day, count)` pair for the daily PDS add counter.
///
/// returns `Ok(None)` when no entry has been stored yet.
///
/// # Errors
///
/// fails if reading the cursors keyspace fails, or if the stored value is not exactly
/// 16 bytes laid out as `[day: u64 BE][count: u64 BE]`. a malformed value is reported as
/// an error rather than being treated as "no entry".
#[cfg(feature = "relay")]
pub fn load_pds_daily_adds(db: &Db) -> Result<Option<(u64, u64)>> {
    let Some(val) = db.cursors.get(keys::PDS_DAILY_ADDS_KEY).into_diagnostic()? else {
        return Ok(None);
    };
    // reject both short and over-long values up front so every malformed payload gets
    // the same descriptive error instead of a bare slice-conversion failure.
    if val.len() != 16 {
        miette::bail!("malformed pds daily limit value");
    }
    let day = u64::from_be_bytes(val[..8].try_into().into_diagnostic()?);
    let count = u64::from_be_bytes(val[8..16].try_into().into_diagnostic()?);
    Ok(Some((day, count)))
}

/// write the daily PDS add counter into the cursors keyspace under
/// `keys::PDS_DAILY_ADDS_KEY`.
///
/// the stored value is 16 bytes: `day` as a big-endian u64 followed by `count` as a
/// big-endian u64.
#[cfg(feature = "relay")]
pub fn save_pds_daily_adds(db: &Db, day: u64, count: u64) -> Result<()> {
    let mut encoded = [0u8; 16];
    {
        // first half holds the day index, second half the accepted count.
        let (day_bytes, count_bytes) = encoded.split_at_mut(8);
        day_bytes.copy_from_slice(&day.to_be_bytes());
        count_bytes.copy_from_slice(&count.to_be_bytes());
    }
    let written = db.cursors.insert(keys::PDS_DAILY_ADDS_KEY, encoded);
    written.into_diagnostic()
}
868868+839869pub fn load_persisted_firehose_sources(
840870 db: &crate::db::Db,
841871) -> Result<Vec<crate::config::FirehoseSource>> {
+2
src/lib.rs
···33/// hydrant main api, includes the Hydrant type for programmatic control.
44pub mod control;
55pub(crate) mod filter;
66+#[cfg(feature = "relay")]
77+pub(crate) mod pds_daily_limit;
68pub(crate) mod pds_meta;
79pub mod types;
810
+73
src/pds_daily_limit.rs
use std::sync::{Mutex, PoisonError};
use std::time::{SystemTime, UNIX_EPOCH};

/// per-UTC-day counter for PDS additions via `requestCrawl`.
///
/// the in-memory state is initialised from the database on startup (see
/// [`crate::db::load_pds_daily_adds`]). the counter resets automatically when the UTC day
/// rolls over.
///
/// the day index and the count are guarded by a single lock so that the rollover reset,
/// the limit check and the increment happen as one step. with two independent atomics a
/// concurrent rollover could wipe accepted increments, or an "undo" decrement could land
/// after a reset and wrap the count around.
pub(crate) struct PdsDailyLimit {
    /// maximum accepted additions per UTC day; `None` disables the limit.
    limit: Option<u64>,
    /// the current day and how many additions were accepted on it.
    window: Mutex<DayWindow>,
}

/// the `(day, count)` pair tracked by [`PdsDailyLimit`].
struct DayWindow {
    /// UTC day index (unix seconds / 86400).
    day: u64,
    /// requestCrawl calls accepted on `day`.
    count: u64,
}

impl PdsDailyLimit {
    /// construct from the previously-persisted `(day, count)` pair loaded from the database.
    /// if the stored day doesn't match today the count is treated as 0.
    pub(crate) fn new(limit: Option<u64>, stored: Option<(u64, u64)>) -> Self {
        let today = utc_day();
        let count = match stored {
            Some((day, count)) if day == today => count,
            _ => 0,
        };
        Self {
            limit,
            window: Mutex::new(DayWindow { day: today, count }),
        }
    }

    /// attempt to consume a daily slot.
    ///
    /// returns `(allowed, to_persist)`:
    /// - `allowed`: whether the request is permitted.
    /// - `to_persist`: when `Some((day, new_count))`, the caller must persist these values to
    ///   the database before returning success, so that a process crash cannot reset the
    ///   counter and allow the budget to be replayed. `None` when no limit is configured or
    ///   when the request was rejected.
    ///
    /// when the UTC day rolls over the counter resets and a fresh quota starts.
    pub(crate) fn try_increment(&self) -> (bool, Option<(u64, u64)>) {
        let Some(limit) = self.limit else {
            return (true, None);
        };

        let today = utc_day();
        // the guarded data is two plain integers that are always left consistent, so a
        // poisoned lock (another thread panicked while holding it) is safe to keep using.
        let mut window = self.window.lock().unwrap_or_else(PoisonError::into_inner);

        if window.day != today {
            window.day = today;
            window.count = 0;
        }

        if window.count >= limit {
            // rejected requests never touch the count, so it cannot drift.
            return (false, None);
        }

        // cannot overflow: count < limit <= u64::MAX.
        window.count += 1;
        (true, Some((window.day, window.count)))
    }
}

/// current UTC day index: whole days elapsed since the unix epoch.
/// a system clock set before the epoch is treated as day 0.
fn utc_day() -> u64 {
    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs()
        / 86400
}