[lib,api] implement runtime crawler source control

+37 -6

README.md

··· 69 69 70 70 each source maintains its own cursor so restarts resume mid-pass. 71 71 72 + sources can also be added and removed at runtime via the `/crawler/sources` API 73 + (see below). dynamically added sources are persisted to the database and survive 74 + restarts. `CRAWLER_URLS` sources are startup-only: they are not written to the 75 + database and will always reappear after a restart regardless of runtime changes. 76 + 72 77 ## configuration 73 78 74 79 `hydrant` is configured via environment variables. all variables are prefixed ··· 117 122 - returns `{ "crawler": bool, "firehose": bool, "backfill": bool }`. 118 123 - `PATCH /ingestion`: enable or disable ingestion components at runtime without 119 124 restarting. 120 - - body: `{ "crawler"?: bool, "firehose"?: bool, "backfill"?: bool }` — only 121 - provided fields are updated. 125 + - body: `{ "crawler"?: bool, "firehose"?: bool, "backfill"?: bool }`. only provided fields are updated. 122 126 - when disabled, each component finishes its current task before pausing (e.g. 123 127 the backfill worker completes any in-flight repo syncs, the firehose 124 128 finishes processing the current message). they resume immediately when 125 129 re-enabled. 126 130 131 + #### crawler source management 132 + 133 + - `GET /crawler/sources`: list all currently active crawler sources. 134 + - returns a JSON array of `{ "url": string, "mode": "relay" | "by_collection", "persisted": bool }`. 135 + - `persisted: true` means the source was added via the API and is stored in the 136 + database, so it will survive a restart. `persisted: false` means the source 137 + came from `CRAWLER_URLS` and is not written to the database. 138 + - `POST /crawler/sources`: add a crawler source at runtime. 139 + - body: `{ "url": string, "mode": "relay" | "by_collection" }`. 140 + - the source is written to the database before the producer task is started, so 141 + it is safe to add sources and then immediately restart without losing them. 
142 + - if a source with the same URL already exists (whether from `CRAWLER_URLS` or 143 + a previous `POST`), it is replaced: the running task is stopped and a new one 144 + is started with the new mode. any cursor state for that URL is preserved. 145 + - returns `201 Created` on success. 146 + - `DELETE /crawler/sources`: remove a crawler source at runtime. 147 + - body: `{ "url": string }`. 148 + - the producer task is stopped immediately. 149 + - if the source was added via the API (`persisted: true`), it is removed from 150 + the database and will not reappear on restart. if it came from `CRAWLER_URLS` 151 + (`persisted: false`), only the running task is stopped; the source will 152 + reappear on the next restart, since `CRAWLER_URLS` is re-applied at startup 153 + (unless you also remove it from your configuration). 154 + - cursor state is not cleared. use `DELETE /cursors` separately if you want 155 + the source to restart from the beginning when re-added. 156 + - returns `200 OK` if the source was found and removed, `404 Not Found` otherwise. 157 + 127 158 #### database operations 128 159 129 160 - `POST /db/train`: train zstd compression dictionaries for the `repos`, ··· 159 190 160 191 each set field accepts one of two forms: 161 192 162 - - **replace**: an array replaces the entire set — `["did:plc:abc", "did:web:example.org"]` 163 - - **patch**: an object maps items to `true` (add) or `false` (remove) — `{"did:plc:abc": true, "did:web:example.org": false}` 193 + - **replace**: an array replaces the entire set, e.g. `["did:plc:abc", "did:web:example.org"]` 194 + - **patch**: an object maps items to `true` (add) or `false` (remove), e.g. 
`{"did:plc:abc": true, "did:web:example.org": false}` 164 195 165 196 #### NSID patterns 166 197 167 198 `signals` and `collections` support an optional `.*` suffix to match an entire namespace: 168 199 169 - - `app.bsky.feed.post` — exact match only 170 - - `app.bsky.feed.*` — matches any collection under `app.bsky.feed` 200 + - `app.bsky.feed.post`: exact match only 201 + - `app.bsky.feed.*`: matches any collection under `app.bsky.feed` 171 202 172 203 ### repository management 173 204

+64

src/api/crawler.rs

··· 1 + use axum::{ 2 + Json, Router, 3 + extract::State, 4 + http::StatusCode, 5 + routing::{delete, get, post}, 6 + }; 7 + use serde::Deserialize; 8 + use url::Url; 9 + 10 + use crate::config::{CrawlerMode, CrawlerSource}; 11 + use crate::control::{CrawlerSourceInfo, Hydrant}; 12 + 13 + pub fn router() -> Router<Hydrant> { 14 + Router::new() 15 + .route("/crawler/sources", get(list_sources)) 16 + .route("/crawler/sources", post(add_source)) 17 + .route("/crawler/sources", delete(remove_source)) 18 + } 19 + 20 + pub async fn list_sources(State(hydrant): State<Hydrant>) -> Json<Vec<CrawlerSourceInfo>> { 21 + Json(hydrant.crawler.list_sources().await) 22 + } 23 + 24 + #[derive(Deserialize)] 25 + pub struct AddSourceRequest { 26 + pub url: Url, 27 + pub mode: CrawlerMode, 28 + } 29 + 30 + pub async fn add_source( 31 + State(hydrant): State<Hydrant>, 32 + Json(body): Json<AddSourceRequest>, 33 + ) -> Result<StatusCode, (StatusCode, String)> { 34 + hydrant 35 + .crawler 36 + .add_source(CrawlerSource { 37 + url: body.url, 38 + mode: body.mode, 39 + }) 40 + .await 41 + .map(|_| StatusCode::CREATED) 42 + .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e.to_string())) 43 + } 44 + 45 + #[derive(Deserialize)] 46 + pub struct RemoveSourceRequest { 47 + pub url: Url, 48 + } 49 + 50 + pub async fn remove_source( 51 + State(hydrant): State<Hydrant>, 52 + Json(body): Json<RemoveSourceRequest>, 53 + ) -> Result<StatusCode, (StatusCode, String)> { 54 + hydrant 55 + .crawler 56 + .remove_source(&body.url) 57 + .await 58 + .map(|found| { 59 + found 60 + .then_some(StatusCode::OK) 61 + .unwrap_or(StatusCode::NOT_FOUND) 62 + }) 63 + .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e.to_string())) 64 + }

+2

src/api/mod.rs

··· 5 5 use tower_http::cors::CorsLayer; 6 6 use tower_http::trace::TraceLayer; 7 7 8 + mod crawler; 8 9 mod db; 9 10 mod debug; 10 11 mod filter; ··· 24 25 .merge(filter::router()) 25 26 .merge(repos::router()) 26 27 .merge(ingestion::router()) 28 + .merge(crawler::router()) 27 29 .merge(db::router()); 28 30 29 31 #[cfg(feature = "backlinks")]

+24 -5

src/config.rs

··· 1 1 use miette::Result; 2 + use serde::{Deserialize, Serialize}; 3 + use smol_str::ToSmolStr; 2 4 use std::fmt; 3 5 use std::path::PathBuf; 4 6 use std::str::FromStr; ··· 42 44 43 45 impl CrawlerMode { 44 46 fn default_for(full_network: bool) -> Self { 45 - if full_network { 46 - Self::Relay 47 - } else { 48 - Self::ByCollection 49 - } 47 + full_network 48 + .then_some(Self::Relay) 49 + .unwrap_or(Self::ByCollection) 50 + } 51 + } 52 + 53 + impl Serialize for CrawlerMode { 54 + fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> 55 + where 56 + S: serde::Serializer, 57 + { 58 + serializer.serialize_str(&self.to_smolstr()) 59 + } 60 + } 61 + 62 + impl<'de> Deserialize<'de> for CrawlerMode { 63 + fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> 64 + where 65 + D: serde::Deserializer<'de>, 66 + { 67 + let s = String::deserialize(deserializer)?; 68 + FromStr::from_str(&s).map_err(serde::de::Error::custom) 50 69 } 51 70 } 52 71

+291 -77

src/control.rs

··· 144 144 let state = Arc::new(state); 145 145 146 146 Ok(Self { 147 - crawler: CrawlerHandle(state.clone()), 147 + crawler: CrawlerHandle { 148 + state: state.clone(), 149 + shared: Arc::new(std::sync::OnceLock::new()), 150 + tasks: Arc::new(scc::HashMap::new()), 151 + persisted: Arc::new(scc::HashSet::new()), 152 + }, 148 153 firehose: FirehoseHandle(state.clone()), 149 154 backfill: BackfillHandle(state.clone()), 150 155 filter: FilterControl(state.clone()), ··· 175 180 pub fn run(&self) -> Result<impl Future<Output = Result<()>>> { 176 181 let state = self.state.clone(); 177 182 let config = self.config.clone(); 183 + let crawler = self.crawler.clone(); 178 184 179 185 if self.started.swap(true, Ordering::SeqCst) { 180 186 miette::bail!("Hydrant::run() called more than once"); ··· 329 335 } 330 336 } 331 337 332 - // 11. spawn crawler components 333 - if !config.crawler_sources.is_empty() { 334 - use crate::config::CrawlerMode; 338 + // 11. spawn crawler infrastructure (always, to support dynamic source management) 339 + { 335 340 use crate::crawler::throttle::Throttler; 336 341 use crate::crawler::{ 337 - ByCollectionProducer, CrawlerStats, CrawlerWorker, InFlight, RelayProducer, 338 - RetryProducer, SignalChecker, 342 + CrawlerStats, CrawlerWorker, InFlight, RetryProducer, SignalChecker, 339 343 }; 340 - use std::time::Duration; 341 - use tracing::Instrument; 342 344 343 345 let http = reqwest::Client::builder() 344 346 .user_agent(concat!( ··· 402 404 .run(), 403 405 ); 404 406 405 - let crawler_rx = state.crawler_enabled.subscribe(); 406 - for source in config.crawler_sources.iter().cloned() { 407 - let http = http.clone(); 408 - let state = state.clone(); 409 - let in_flight = in_flight.clone(); 410 - let tx = tx.clone(); 411 - let stats = stats.clone(); 412 - let enabled = crawler_rx.clone(); 413 - match source.mode { 414 - CrawlerMode::Relay => { 415 - info!(relay = %source.url, enabled = *state.crawler_enabled.borrow(), "starting relay crawler"); 416 
- let span = tracing::info_span!("crawl", url = %source.url); 417 - tokio::spawn( 418 - RelayProducer { 419 - relay_url: source.url, 420 - checker: checker.clone(), 421 - in_flight, 422 - tx, 423 - enabled, 424 - stats, 425 - } 426 - .run() 427 - .instrument(span), 428 - ); 429 - } 430 - CrawlerMode::ByCollection => { 431 - info!( 432 - host = source.url.host_str(), 433 - enabled = *state.crawler_enabled.borrow(), 434 - "starting by-collection crawler" 435 - ); 436 - let span = 437 - tracing::info_span!("by_collection", host = source.url.host_str()); 438 - tokio::spawn( 439 - async move { 440 - loop { 441 - let producer = ByCollectionProducer { 442 - index_url: source.url.clone(), 443 - http: http.clone(), 444 - state: state.clone(), 445 - in_flight: in_flight.clone(), 446 - tx: tx.clone(), 447 - enabled: enabled.clone(), 448 - stats: stats.clone(), 449 - }; 450 - if let Err(e) = producer.run().await { 451 - error!(err = ?e, "by-collection crawler fatal error, restarting in 30s"); 452 - tokio::time::sleep(Duration::from_secs(30)).await; 453 - } 454 - } 455 - } 456 - .instrument(span), 457 - ); 458 - } 407 + // set shared objects so CrawlerHandle methods can use them 408 + crawler 409 + .shared 410 + .set(CrawlerShared { 411 + http, 412 + checker, 413 + in_flight, 414 + tx, 415 + stats, 416 + }) 417 + .ok() 418 + .expect("crawler shared already set"); 419 + let shared = crawler.shared.get().unwrap(); 420 + 421 + // spawn initial sources from config 422 + for source in config.crawler_sources.iter() { 423 + let enabled_rx = state.crawler_enabled.subscribe(); 424 + let handle = spawn_crawler_producer( 425 + source, 426 + &shared.http, 427 + &state, 428 + &shared.checker, 429 + &shared.in_flight, 430 + &shared.tx, 431 + &shared.stats, 432 + enabled_rx, 433 + ); 434 + let _ = crawler.tasks.insert_async(source.url.clone(), handle).await; 435 + } 436 + 437 + // load and spawn any sources persisted in the database 438 + let db = state.db.clone(); 439 + let persisted_sources 
= 440 + tokio::task::spawn_blocking(move || load_persisted_crawler_sources(&db)) 441 + .await 442 + .into_diagnostic()??; 443 + 444 + for source in &persisted_sources { 445 + let _ = crawler.persisted.insert_async(source.url.clone()).await; 446 + if crawler.tasks.contains_async(&source.url).await { 447 + continue; 459 448 } 449 + let enabled_rx = state.crawler_enabled.subscribe(); 450 + let handle = spawn_crawler_producer( 451 + source, 452 + &shared.http, 453 + &state, 454 + &shared.checker, 455 + &shared.in_flight, 456 + &shared.tx, 457 + &shared.stats, 458 + enabled_rx, 459 + ); 460 + let _ = crawler.tasks.insert_async(source.url.clone(), handle).await; 460 461 } 461 462 } 462 463 ··· 625 626 } 626 627 } 627 628 628 - // --- event stream --- 629 - 630 629 /// a stream of [`Event`]s. returned by [`Hydrant::subscribe`]. 631 630 /// 632 631 /// implements [`futures::Stream`] and can be used with `StreamExt::next`, ··· 642 641 } 643 642 } 644 643 645 - // --- stats --- 646 - 647 644 /// database statistics returned by [`Hydrant::stats`]. 648 645 #[derive(serde::Serialize)] 649 646 pub struct StatsResponse { ··· 653 650 pub sizes: BTreeMap<&'static str, u64>, 654 651 } 655 652 656 - // --- ingestion handles --- 653 + struct ProducerHandle { 654 + mode: crate::config::CrawlerMode, 655 + abort: tokio::task::AbortHandle, 656 + } 657 + 658 + impl Drop for ProducerHandle { 659 + fn drop(&mut self) { 660 + self.abort.abort(); 661 + } 662 + } 663 + 664 + struct CrawlerShared { 665 + http: reqwest::Client, 666 + checker: crate::crawler::SignalChecker, 667 + in_flight: crate::crawler::InFlight, 668 + tx: mpsc::Sender<crate::crawler::CrawlerBatch>, 669 + stats: crate::crawler::CrawlerStats, 670 + } 671 + 672 + /// a snapshot of a single crawler source's runtime state. 
673 + #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] 674 + pub struct CrawlerSourceInfo { 675 + pub url: Url, 676 + pub mode: crate::config::CrawlerMode, 677 + /// whether this source is persisted in the database (i.e. it was dynamically added 678 + /// and will survive restarts). config-sourced entries have `persisted: false`. 679 + pub persisted: bool, 680 + } 681 + 682 + fn spawn_crawler_producer( 683 + source: &crate::config::CrawlerSource, 684 + http: &reqwest::Client, 685 + state: &Arc<AppState>, 686 + checker: &crate::crawler::SignalChecker, 687 + in_flight: &crate::crawler::InFlight, 688 + tx: &mpsc::Sender<crate::crawler::CrawlerBatch>, 689 + stats: &crate::crawler::CrawlerStats, 690 + enabled: watch::Receiver<bool>, 691 + ) -> ProducerHandle { 692 + use crate::config::CrawlerMode; 693 + use crate::crawler::{ByCollectionProducer, RelayProducer}; 694 + use std::time::Duration; 695 + use tracing::Instrument; 696 + 697 + let abort = match source.mode { 698 + CrawlerMode::Relay => { 699 + info!(relay = %source.url, enabled = *state.crawler_enabled.borrow(), "starting relay crawler"); 700 + let span = tracing::info_span!("crawl", url = %source.url); 701 + tokio::spawn( 702 + RelayProducer { 703 + relay_url: source.url.clone(), 704 + checker: checker.clone(), 705 + in_flight: in_flight.clone(), 706 + tx: tx.clone(), 707 + enabled, 708 + stats: stats.clone(), 709 + } 710 + .run() 711 + .instrument(span), 712 + ) 713 + .abort_handle() 714 + } 715 + CrawlerMode::ByCollection => { 716 + info!( 717 + host = source.url.host_str(), 718 + enabled = *state.crawler_enabled.borrow(), 719 + "starting by-collection crawler" 720 + ); 721 + let span = tracing::info_span!("by_collection", host = source.url.host_str()); 722 + let http = http.clone(); 723 + let state = state.clone(); 724 + let in_flight = in_flight.clone(); 725 + let tx = tx.clone(); 726 + let stats = stats.clone(); 727 + let url = source.url.clone(); 728 + tokio::spawn( 729 + async move { 730 + 
loop { 731 + let producer = ByCollectionProducer { 732 + index_url: url.clone(), 733 + http: http.clone(), 734 + state: state.clone(), 735 + in_flight: in_flight.clone(), 736 + tx: tx.clone(), 737 + enabled: enabled.clone(), 738 + stats: stats.clone(), 739 + }; 740 + if let Err(e) = producer.run().await { 741 + error!(err = ?e, "by-collection crawler fatal error, restarting in 30s"); 742 + tokio::time::sleep(Duration::from_secs(30)).await; 743 + } 744 + } 745 + } 746 + .instrument(span), 747 + ) 748 + .abort_handle() 749 + } 750 + }; 751 + ProducerHandle { 752 + mode: source.mode, 753 + abort, 754 + } 755 + } 756 + 757 + /// load all crawler sources persisted in the database. 758 + fn load_persisted_crawler_sources(db: &crate::db::Db) -> Result<Vec<crate::config::CrawlerSource>> { 759 + use crate::db::keys::CRAWLER_SOURCE_PREFIX; 760 + 761 + let mut sources = Vec::new(); 762 + for entry in db.crawler.prefix(CRAWLER_SOURCE_PREFIX) { 763 + let (key, val) = entry.into_inner().into_diagnostic()?; 764 + let url_bytes = &key[CRAWLER_SOURCE_PREFIX.len()..]; 765 + let url_str = std::str::from_utf8(url_bytes).into_diagnostic()?; 766 + let url = Url::parse(url_str).into_diagnostic()?; 767 + let mode: crate::config::CrawlerMode = rmp_serde::from_slice(&val).into_diagnostic()?; 768 + sources.push(crate::config::CrawlerSource { url, mode }); 769 + } 770 + Ok(sources) 771 + } 657 772 658 773 /// runtime control over the crawler component. 659 774 /// ··· 665 780 /// disabling the crawler does not affect in-progress repo checks. each one completes 666 781 /// its current PDS request before pausing. 667 782 #[derive(Clone)] 668 - pub struct CrawlerHandle(Arc<AppState>); 783 + pub struct CrawlerHandle { 784 + state: Arc<AppState>, 785 + /// set once by [`Hydrant::run`]; `None` means run() has not been called yet. 786 + shared: Arc<std::sync::OnceLock<CrawlerShared>>, 787 + /// per-source running tasks, keyed by url. 
788 + tasks: Arc<scc::HashMap<Url, ProducerHandle>>, 789 + /// set of urls persisted in the database (dynamically added sources). 790 + persisted: Arc<scc::HashSet<Url>>, 791 + } 669 792 670 793 impl CrawlerHandle { 671 794 /// enable the crawler (enables all configured producers). no-op if already enabled. 672 795 pub fn enable(&self) { 673 - self.0.crawler_enabled.send_replace(true); 796 + self.state.crawler_enabled.send_replace(true); 674 797 } 675 798 /// disable the crawler (disables all configured producers). 676 799 /// in-progress repo checks finish before the crawler pauses. 677 800 pub fn disable(&self) { 678 - self.0.crawler_enabled.send_replace(false); 801 + self.state.crawler_enabled.send_replace(false); 679 802 } 680 803 /// returns the current enabled state of the crawler. 681 804 pub fn is_enabled(&self) -> bool { 682 - *self.0.crawler_enabled.borrow() 805 + *self.state.crawler_enabled.borrow() 683 806 } 684 807 685 808 /// delete all cursor entries associated with the given URL. 686 809 pub async fn reset_cursor(&self, url: &str) -> Result<()> { 687 - let db = self.0.db.clone(); 810 + let db = self.state.db.clone(); 688 811 let point_keys = [keys::crawler_cursor_key(url)]; 689 812 let by_collection_prefix = keys::by_collection_cursor_prefix(url); 690 813 tokio::task::spawn_blocking(move || { ··· 693 816 batch.remove(&db.cursors, k); 694 817 } 695 818 for entry in db.cursors.prefix(&by_collection_prefix) { 696 - let (k, _) = entry.into_inner().into_diagnostic()?; 819 + let k = entry.key().into_diagnostic()?; 697 820 batch.remove(&db.cursors, k); 698 821 } 699 822 batch.commit().into_diagnostic() ··· 702 825 .into_diagnostic()??; 703 826 Ok(()) 704 827 } 828 + 829 + /// return info on all currently active crawler sources. 830 + /// 831 + /// returns an empty list if called before [`Hydrant::run`]. 
832 + pub async fn list_sources(&self) -> Vec<CrawlerSourceInfo> { 833 + let mut sources = Vec::new(); 834 + self.tasks 835 + .iter_async(|url, h| { 836 + sources.push(CrawlerSourceInfo { 837 + url: url.clone(), 838 + mode: h.mode, 839 + persisted: self.persisted.contains_sync(url), 840 + }); 841 + true 842 + }) 843 + .await; 844 + sources 845 + } 846 + 847 + /// add a new crawler source at runtime. 848 + /// 849 + /// the source is persisted to the database and will be re-spawned on restart. 850 + /// if a source with the same URL already exists, it is replaced (the old task is 851 + /// aborted and a new one is started with the new mode). 852 + /// 853 + /// returns an error if called before [`Hydrant::run`]. 854 + pub async fn add_source(&self, source: crate::config::CrawlerSource) -> Result<()> { 855 + let Some(shared) = self.shared.get() else { 856 + miette::bail!("crawler not yet started: call Hydrant::run() first"); 857 + }; 858 + 859 + let db = self.state.db.clone(); 860 + let key = keys::crawler_source_key(source.url.as_str()); 861 + let val = rmp_serde::to_vec(&source.mode).into_diagnostic()?; 862 + tokio::task::spawn_blocking(move || db.crawler.insert(key, val).into_diagnostic()) 863 + .await 864 + .into_diagnostic()??; 865 + 866 + let enabled_rx = self.state.crawler_enabled.subscribe(); 867 + let handle = spawn_crawler_producer( 868 + &source, 869 + &shared.http, 870 + &self.state, 871 + &shared.checker, 872 + &shared.in_flight, 873 + &shared.tx, 874 + &shared.stats, 875 + enabled_rx, 876 + ); 877 + 878 + let _ = self.persisted.insert_async(source.url.clone()).await; 879 + match self.tasks.entry_async(source.url).await { 880 + scc::hash_map::Entry::Vacant(e) => { 881 + e.insert_entry(handle); 882 + } 883 + scc::hash_map::Entry::Occupied(mut e) => { 884 + *e.get_mut() = handle; 885 + } 886 + } 887 + Ok(()) 888 + } 889 + 890 + /// remove a crawler source at runtime by URL. 
891 + /// 892 + /// aborts the running producer task and removes the source from the database if it 893 + /// was dynamically added. config-sourced entries are aborted but not persisted, so 894 + /// they will reappear on restart. 895 + /// 896 + /// returns `true` if a source with the given URL was found and removed. 897 + /// returns an error if called before [`Hydrant::run`]. 898 + pub async fn remove_source(&self, url: &Url) -> Result<bool> { 899 + if self.shared.get().is_none() { 900 + miette::bail!("crawler not yet started: call Hydrant::run() first"); 901 + } 902 + 903 + // dropping the ProducerHandle aborts the task via Drop 904 + if self.tasks.remove_async(url).await.is_none() { 905 + return Ok(false); 906 + } 907 + 908 + // remove from DB if it was a persisted source 909 + if self.persisted.remove_async(url).await.is_some() { 910 + let db = self.state.db.clone(); 911 + let key = keys::crawler_source_key(url.as_str()); 912 + tokio::task::spawn_blocking(move || db.crawler.remove(key).into_diagnostic()) 913 + .await 914 + .into_diagnostic()??; 915 + } 916 + 917 + Ok(true) 918 + } 705 919 } 706 920 707 921 /// runtime control over the firehose ingestor component. ··· 709 923 pub struct FirehoseHandle(Arc<AppState>); 710 924 711 925 impl FirehoseHandle { 712 - /// enable the firehose. no-op if already enabled. 926 + /// enable the firehose, no-op if already enabled. 713 927 pub fn enable(&self) { 714 928 self.0.firehose_enabled.send_replace(true); 715 929 } 716 - /// disable the firehose. the current message finishes processing before the connection closes. 930 + /// disable the firehose, the current message finishes processing before the connection closes. 717 931 pub fn disable(&self) { 718 932 self.0.firehose_enabled.send_replace(false); 719 933 } ··· 732 946 pub struct BackfillHandle(Arc<AppState>); 733 947 734 948 impl BackfillHandle { 735 - /// enable the backfill worker. no-op if already enabled. 
949 + /// enable the backfill worker, no-op if already enabled. 736 950 pub fn enable(&self) { 737 951 self.0.backfill_enabled.send_replace(true); 738 952 } 739 - /// disable the backfill worker. in-flight repos complete before pausing. 953 + /// disable the backfill worker, in-flight repos complete before pausing. 740 954 pub fn disable(&self) { 741 955 self.0.backfill_enabled.send_replace(false); 742 956 }

+1 -1

src/crawler/mod.rs

··· 19 19 20 20 pub(crate) use by_collection::ByCollectionProducer; 21 21 pub(crate) use relay::{RelayProducer, RetryProducer, SignalChecker}; 22 - pub(crate) use worker::CrawlerWorker; 22 + pub(crate) use worker::{CrawlerBatch, CrawlerWorker}; 23 23 24 24 // -- InFlight ------------------------------------------------------------ 25 25

+9

src/db/keys.rs

··· 178 178 prefix 179 179 } 180 180 181 + pub const CRAWLER_SOURCE_PREFIX: &[u8] = b"src|"; 182 + 183 + pub fn crawler_source_key(url: &str) -> Vec<u8> { 184 + let mut key = Vec::with_capacity(CRAWLER_SOURCE_PREFIX.len() + url.len()); 185 + key.extend_from_slice(CRAWLER_SOURCE_PREFIX); 186 + key.extend_from_slice(url.as_bytes()); 187 + key 188 + } 189 + 181 190 pub fn firehose_cursor_key(relay: &str) -> Vec<u8> { 182 191 let mut key = b"firehose_cursor|".to_vec(); 183 192 key.extend_from_slice(relay.as_bytes());

+248

tests/api_test.nu

#!/usr/bin/env nu
use common.nu *

# print a failure message, kill any hydrant instances whose pids were passed,
# and exit with status 1.
#
# every caller that has already started an instance must pass its pid here,
# otherwise the process is left running and keeps the test port occupied.
def fail [msg: string, ...pids: int] {
    print $" FAILED: ($msg)"
    for pid in $pids {
        # best effort: the process may already be gone
        try { kill $pid }
    }
    exit 1
}

# exercise the full add / replace / remove cycle of `/crawler/sources` against
# an already running instance that was started without any configured sources.
def test-crawler-sources [url: string, pid: int] {
    print "=== test: crawler sources ==="

    # initial state: no sources
    print " GET /crawler/sources (expect empty)..."
    let initial = (http get $"($url)/crawler/sources")
    if ($initial | length) != 0 {
        fail $"expected empty list, got ($initial | length) entries" $pid
    }
    print " ok: starts empty"

    # add a relay source
    print " POST /crawler/sources (relay)..."
    let resp_add = (http post -f -e -t application/json $"($url)/crawler/sources" {
        url: "https://bsky.network",
        mode: "relay"
    })
    if $resp_add.status != 201 {
        fail $"expected 201, got ($resp_add.status)" $pid
    }
    print " ok: 201 Created"

    # verify the source appears with correct fields
    print " GET /crawler/sources (expect 1 entry)..."
    let sources = (http get $"($url)/crawler/sources")
    if ($sources | length) != 1 {
        fail $"expected 1 source, got ($sources | length)" $pid
    }
    let s = ($sources | first)
    if $s.mode != "relay" {
        fail $"expected mode=relay, got ($s.mode)" $pid
    }
    if not $s.persisted {
        fail "expected persisted=true for dynamically added source" $pid
    }
    print $" ok: 1 source — url=($s.url), mode=($s.mode), persisted=($s.persisted)"

    # posting the same URL with a different mode replaces the existing entry
    print " POST /crawler/sources (should override)..."
    let resp_replace = (http post -f -e -t application/json $"($url)/crawler/sources" {
        url: "https://bsky.network",
        mode: "by_collection"
    })
    if $resp_replace.status != 201 {
        fail $"expected 201, got ($resp_replace.status)" $pid
    }
    let after_replace = (http get $"($url)/crawler/sources")
    if ($after_replace | length) != 1 {
        fail $"expected 1 source after override, got ($after_replace | length)" $pid
    }
    if ($after_replace | first).mode != "by_collection" {
        fail "expected mode to be updated to by_collection after override" $pid
    }
    print " ok: duplicate add replaced existing entry (mode updated)"

    # remove the source
    print " DELETE /crawler/sources..."
    let resp_del = (http delete -f -e -t application/json $"($url)/crawler/sources" --data {
        url: "https://bsky.network"
    })
    if $resp_del.status != 200 {
        fail $"expected 200, got ($resp_del.status)" $pid
    }
    let after_del = (http get $"($url)/crawler/sources")
    if ($after_del | length) != 0 {
        fail "expected empty list after delete" $pid
    }
    print " ok: source removed"

    # deleting a non-existent source returns 404
    print " DELETE /crawler/sources (should be 404)..."
    let resp_del_missing = (http delete -f -e -t application/json $"($url)/crawler/sources" --data {
        url: "https://bsky.network"
    })
    if $resp_del_missing.status != 404 {
        fail $"expected 404, got ($resp_del_missing.status)" $pid
    }
    print " ok: 404 for non-existent source"

    print "crawler source tests passed!"
}

# verify that dynamically added sources are written to the database and survive a restart.
def test-source-persistence [binary: string, db_path: string, port: int] {
    print "=== test: dynamically added sources persist across restart ==="

    let url = $"http://localhost:($port)"

    let instance = (with-env { HYDRANT_CRAWLER_URLS: "" } {
        start-hydrant $binary $db_path $port
    })
    if not (wait-for-api $url) {
        # the process was spawned even though the api never came up: kill it
        fail "hydrant did not start" $instance.pid
    }

    print " adding source..."
    # -f -e so an error response is reported through `fail` (which kills the
    # instance) instead of aborting the script with the process still running
    let resp_add = (http post -f -e -t application/json $"($url)/crawler/sources" {
        url: "https://lightrail.microcosm.blue",
        mode: "by_collection"
    })
    if $resp_add.status != 201 {
        fail $"expected 201, got ($resp_add.status)" $instance.pid
    }

    let before = (http get $"($url)/crawler/sources")
    if ($before | length) != 1 {
        fail "source was not added" $instance.pid
    }

    # restart hydrant against the same database
    print " restarting hydrant..."
    kill $instance.pid
    sleep 2sec

    let instance2 = (with-env { HYDRANT_CRAWLER_URLS: "" } {
        start-hydrant $binary $db_path $port
    })
    if not (wait-for-api $url) {
        fail "hydrant did not restart" $instance2.pid
    }

    print " checking source survived restart..."
    let after = (http get $"($url)/crawler/sources")
    if ($after | length) != 1 {
        fail $"expected 1 source after restart, got ($after | length)" $instance2.pid
    }
    let s = ($after | first)
    if not $s.persisted {
        fail "expected persisted=true after restart" $instance2.pid
    }
    if $s.mode != "by_collection" {
        fail $"expected mode=by_collection after restart, got ($s.mode)" $instance2.pid
    }
    print " ok: persisted source survived restart"

    kill $instance2.pid
    print "source persistence test passed!"
}

# verify that CRAWLER_URLS sources are not written to the database (persisted=false),
# can be stopped at runtime, but reappear on the next restart because the env var
# is re-applied at startup.
def test-config-source-not-persisted [binary: string, db_path: string, port: int] {
    print "=== test: CRAWLER_URLS sources are not persisted ==="

    let url = $"http://localhost:($port)"
    let crawler_url = "https://lightrail.microcosm.blue"

    let instance = (with-env { HYDRANT_CRAWLER_URLS: $"by_collection::($crawler_url)" } {
        start-hydrant $binary $db_path $port
    })
    if not (wait-for-api $url) {
        # the process was spawned even though the api never came up: kill it
        fail "hydrant did not start" $instance.pid
    }

    # config source should appear, but with persisted=false
    print " checking config source appears with persisted=false..."
    let sources = (http get $"($url)/crawler/sources")
    if ($sources | length) != 1 {
        fail $"expected 1 source, got ($sources | length)" $instance.pid
    }
    if ($sources | first).persisted {
        fail "expected persisted=false for a CRAWLER_URLS source" $instance.pid
    }
    print " ok: config source has persisted=false"

    # the task can be stopped at runtime
    print " deleting config source at runtime..."
    let resp = (http delete -f -e -t application/json $"($url)/crawler/sources" --data {
        url: $crawler_url
    })
    if $resp.status != 200 {
        fail $"expected 200, got ($resp.status)" $instance.pid
    }
    let after_del = (http get $"($url)/crawler/sources")
    if ($after_del | length) != 0 {
        fail "expected source to be gone after runtime delete" $instance.pid
    }
    print " ok: config source removed at runtime"

    # after a restart with the same CRAWLER_URLS, the config source reappears
    print " restarting with same CRAWLER_URLS..."
    kill $instance.pid
    sleep 2sec

    let instance2 = (with-env { HYDRANT_CRAWLER_URLS: $"by_collection::($crawler_url)" } {
        start-hydrant $binary $db_path $port
    })
    if not (wait-for-api $url) {
        fail "hydrant did not restart" $instance2.pid
    }

    let after_restart = (http get $"($url)/crawler/sources")
    if ($after_restart | length) != 1 {
        fail $"expected config source to reappear after restart, got ($after_restart | length)" $instance2.pid
    }
    if ($after_restart | first).persisted {
        fail "expected persisted=false after restart" $instance2.pid
    }
    print " ok: config source reappears on restart (not persisted to DB)"

    kill $instance2.pid
    print "config source persistence test passed!"
}

# build hydrant once, then run each scenario against its own fresh temporary
# database, reusing the same port for every instance.
def main [] {
    let port = 3007
    let url = $"http://localhost:($port)"

    let binary = build-hydrant

    let db = (mktemp -d -t hydrant_api_test.XXXXXX)
    print $"db: ($db)"

    let instance = (with-env { HYDRANT_CRAWLER_URLS: "" } {
        start-hydrant $binary $db $port
    })
    if not (wait-for-api $url) {
        fail "hydrant did not start" $instance.pid
    }

    test-crawler-sources $url $instance.pid

    kill $instance.pid
    sleep 2sec

    let db_persist = (mktemp -d -t hydrant_api_test.XXXXXX)
    print $"db: ($db_persist)"
    test-source-persistence $binary $db_persist $port

    sleep 1sec

    let db_config = (mktemp -d -t hydrant_api_test.XXXXXX)
    print $"db: ($db_config)"
    test-config-source-not-persisted $binary $db_config $port

    print ""
    print "all api tests passed!"
}

Configure Feed

Configure Feed