···11-use std::collections::BTreeMap;
22-use std::future::Future;
33-use std::pin::Pin;
44-use std::sync::Arc;
55-use std::sync::atomic::{AtomicBool, Ordering};
66-use std::task::{Context, Poll};
77-88-use chrono::{DateTime, Utc};
99-use futures::{FutureExt, Stream};
1010-use jacquard_common::cowstr::ToCowStr;
1111-use jacquard_common::types::cid::{ATP_CID_HASH, Cid, IpldCid};
1212-use jacquard_common::types::ident::AtIdentifier;
1313-use jacquard_common::types::nsid::Nsid;
1414-use jacquard_common::types::string::{Did, Handle, Rkey};
1515-use jacquard_common::types::tid::Tid;
1616-use jacquard_common::{CowStr, Data, IntoStatic, RawData};
1717-use jacquard_repo::DAG_CBOR_CID_CODEC;
1818-use miette::{IntoDiagnostic, Result};
1919-use rand::Rng;
2020-use sha2::{Digest, Sha256};
2121-use smol_str::ToSmolStr;
2222-use tokio::sync::{mpsc, watch};
2323-use tracing::{debug, error, info};
2424-use url::Url;
2525-2626-use crate::backfill::BackfillWorker;
2727-use crate::config::{Config, SignatureVerification};
2828-use crate::db::types::DbRkey;
2929-use crate::db::{
3030- self, filter as db_filter, keys, load_persisted_crawler_sources,
3131- load_persisted_firehose_sources, ser_repo_state,
3232-};
3333-use crate::filter::{FilterMode, SetUpdate};
3434-use crate::ingest::{firehose::FirehoseIngestor, worker::FirehoseWorker};
3535-use crate::state::AppState;
3636-use crate::types::{
3737- BroadcastEvent, GaugeState, MarshallableEvt, RecordEvt, RepoState, RepoStatus, StoredData,
3838- StoredEvent,
3939-};
/// an event emitted by the hydrant event stream.
///
/// this is an alias for [`MarshallableEvt`] with a `'static` lifetime, and it is the
/// item type yielded by [`EventStream`].
///
/// three variants are possible depending on the `type` field:
/// - `"record"`: a repo record was created, updated, or deleted. carries a [`RecordEvt`].
/// - `"identity"`: a DID's handle or PDS changed. carries an [`IdentityEvt`]. ephemeral, not replayable.
/// - `"account"`: a repo's active/inactive status changed. carries an [`AccountEvt`]. ephemeral, not replayable.
///
/// the `id` field is a monotonically increasing sequence number usable as a cursor for [`Hydrant::subscribe`].
pub type Event = MarshallableEvt<'static>;
/// the top-level handle to a hydrant instance.
///
/// `Hydrant` is cheaply cloneable. all sub-handles share the same underlying state.
/// construct it via [`Hydrant::new`] or [`Hydrant::from_env`], configure the filter
/// and repos as needed, then call [`Hydrant::run`] to start all background components.
///
/// # example
///
/// ```rust,no_run
/// use hydrant::control::Hydrant;
///
/// #[tokio::main]
/// async fn main() -> miette::Result<()> {
///     let hydrant = Hydrant::from_env().await?;
///
///     tokio::select! {
///         r = hydrant.run()? => r,
///         r = hydrant.serve(3000) => r,
///     }
/// }
/// ```
#[derive(Clone)]
pub struct Hydrant {
    /// runtime control over the crawler component and its sources.
    pub crawler: CrawlerHandle,
    /// handle for the firehose ingestor tasks.
    pub firehose: FirehoseHandle,
    /// control handle for the backfill component.
    pub backfill: BackfillHandle,
    /// control handle for the filter.
    pub filter: FilterControl,
    /// control handle for repos.
    pub repos: ReposControl,
    /// control handle for the database.
    pub db: DbControl,
    /// control handle for backlinks; only present with the `backlinks` feature.
    #[cfg(feature = "backlinks")]
    pub backlinks: crate::backlinks::BacklinksControl,
    /// the application state shared by every sub-handle.
    pub(crate) state: Arc<AppState>,
    /// the configuration this instance was built from; read again by [`Hydrant::run`].
    config: Arc<Config>,
    /// set by the first call to [`Hydrant::run`]; any later call returns an error.
    started: Arc<AtomicBool>,
    // NOTE(review): presumably reserved so the struct cannot be built with a literal
    // outside this module - confirm (the private fields above already prevent that).
    _priv: (),
}
8787-8888-impl Hydrant {
8989- /// open the database and configure hydrant from `config`.
9090- ///
9191- /// this sets up the database, applies any filter configuration from `config`, and
9292- /// initializes all sub-handles. no background tasks are started yet: call
9393- /// [`run`](Self::run) to start all components and drive the instance.
9494- pub async fn new(config: Config) -> Result<Self> {
9595- info!("{config}");
9696-9797- // 1. open database and construct AppState
9898- let state = AppState::new(&config)?;
9999-100100- // 2. apply any filter config from env variables
101101- if config.full_network
102102- || config.filter_signals.is_some()
103103- || config.filter_collections.is_some()
104104- || config.filter_excludes.is_some()
105105- {
106106- let filter_ks = state.db.filter.clone();
107107- let inner = state.db.inner.clone();
108108- let mode = config.full_network.then_some(FilterMode::Full);
109109- let signals = config.filter_signals.clone().map(SetUpdate::Set);
110110- let collections = config.filter_collections.clone().map(SetUpdate::Set);
111111- let excludes = config.filter_excludes.clone().map(SetUpdate::Set);
112112-113113- tokio::task::spawn_blocking(move || {
114114- let mut batch = inner.batch();
115115- db_filter::apply_patch(
116116- &mut batch,
117117- &filter_ks,
118118- mode,
119119- signals,
120120- collections,
121121- excludes,
122122- )?;
123123- batch.commit().into_diagnostic()
124124- })
125125- .await
126126- .into_diagnostic()??;
127127-128128- // 3. reload the live filter into the hot-path arc-swap
129129- let new_filter = tokio::task::spawn_blocking({
130130- let filter_ks = state.db.filter.clone();
131131- move || db_filter::load(&filter_ks)
132132- })
133133- .await
134134- .into_diagnostic()??;
135135- state.filter.store(Arc::new(new_filter));
136136- }
137137-138138- // 4. set crawler enabled state from config, evaluated against the post-patch filter
139139- let post_patch_crawler = match config.enable_crawler {
140140- Some(b) => b,
141141- None => {
142142- state.filter.load().mode == FilterMode::Full || !config.crawler_sources.is_empty()
143143- }
144144- };
145145- state.crawler_enabled.send_replace(post_patch_crawler);
146146-147147- let state = Arc::new(state);
148148-149149- Ok(Self {
150150- crawler: CrawlerHandle {
151151- state: state.clone(),
152152- shared: Arc::new(std::sync::OnceLock::new()),
153153- tasks: Arc::new(scc::HashMap::new()),
154154- persisted: Arc::new(scc::HashSet::new()),
155155- },
156156- firehose: FirehoseHandle {
157157- state: state.clone(),
158158- shared: Arc::new(std::sync::OnceLock::new()),
159159- tasks: Arc::new(scc::HashMap::new()),
160160- persisted: Arc::new(scc::HashSet::new()),
161161- },
162162- backfill: BackfillHandle(state.clone()),
163163- filter: FilterControl(state.clone()),
164164- repos: ReposControl(state.clone()),
165165- db: DbControl(state.clone()),
166166- #[cfg(feature = "backlinks")]
167167- backlinks: crate::backlinks::BacklinksControl(state.clone()),
168168- state,
169169- config: Arc::new(config),
170170- started: Arc::new(AtomicBool::new(false)),
171171- _priv: (),
172172- })
173173- }
174174-175175- /// reads config from environment variables and calls [`Hydrant::new`].
176176- pub async fn from_env() -> Result<Self> {
177177- Self::new(Config::from_env()?).await
178178- }
    /// start all background components and return a future that resolves when any
    /// fatal component exits.
    ///
    /// starts the backfill worker, firehose ingestors, crawler, and worker thread.
    /// resolves with `Ok(())` if a fatal component exits cleanly, or `Err(e)` if it
    /// fails. intended for use in `tokio::select!` alongside [`serve`](Self::serve).
    ///
    /// the "already started" flag is set when this method is called, but all spawning
    /// happens inside the returned future, so nothing starts until that future is polled.
    /// in the code below the firehose worker thread is the only component that reports
    /// into the fatal channel; the future can also resolve early with `Err` if one of
    /// the setup steps fails.
    ///
    /// returns an error if called more than once on the same `Hydrant` instance.
    pub fn run(&self) -> Result<impl Future<Output = Result<()>>> {
        let state = self.state.clone();
        let config = self.config.clone();
        let crawler = self.crawler.clone();
        let firehose = self.firehose.clone();

        // `swap` returns the previous value: `true` means an earlier call already claimed the run
        if self.started.swap(true, Ordering::SeqCst) {
            miette::bail!("Hydrant::run() called more than once");
        }

        let fut = async move {
            // internal buffered channel between ingestors / backfill and the firehose worker
            let (buffer_tx, buffer_rx) = mpsc::unbounded_channel();

            // 5. spawn the backfill worker
            tokio::spawn({
                let state = state.clone();
                BackfillWorker::new(
                    state.clone(),
                    buffer_tx.clone(),
                    config.repo_fetch_timeout,
                    config.backfill_concurrency_limit,
                    // backfill verifies signatures in both `Full` and `BackfillOnly` modes
                    matches!(
                        config.verify_signatures,
                        SignatureVerification::Full | SignatureVerification::BackfillOnly
                    ),
                    config.ephemeral,
                    state.backfill_enabled.subscribe(),
                )
                .run()
            });

            // 6. re-queue any repos that lost their backfill state, then start the retry worker
            // a failure to join the blocking task is propagated; an error returned by
            // `queue_gone_backfills` itself is only logged
            if let Err(e) = tokio::task::spawn_blocking({
                let state = state.clone();
                move || crate::backfill::manager::queue_gone_backfills(&state)
            })
            .await
            .into_diagnostic()?
            {
                error!(err = %e, "failed to queue gone backfills");
                db::check_poisoned_report(&e);
            }

            std::thread::spawn({
                let state = state.clone();
                move || crate::backfill::manager::retry_worker(state)
            });

            // 7. ephemeral GC thread
            if config.ephemeral {
                let state = state.clone();
                std::thread::Builder::new()
                    .name("ephemeral-gc".into())
                    .spawn(move || crate::db::ephemeral::ephemeral_ttl_worker(state))
                    .into_diagnostic()?;
            }

            // 8. cursor / counts persist thread
            // loops forever on a dedicated thread; every failure below is logged and the
            // loop carries on with the next step
            std::thread::spawn({
                let state = state.clone();
                let persist_interval = config.cursor_save_interval;
                move || loop {
                    std::thread::sleep(persist_interval);

                    state.relay_cursors.iter_sync(|relay, cursor| {
                        let seq = cursor.load(Ordering::SeqCst);
                        // cursors that are still 0 (or negative) are not written
                        if seq > 0 {
                            if let Err(e) = db::set_firehose_cursor(&state.db, relay, seq) {
                                error!(relay = %relay, err = %e, "failed to save cursor");
                                db::check_poisoned_report(&e);
                            }
                        }
                        // returning `true` continues the iteration
                        true
                    });

                    if let Err(e) = db::persist_counts(&state.db) {
                        error!(err = %e, "failed to persist counts");
                        db::check_poisoned_report(&e);
                    }

                    if let Err(e) = state.db.persist() {
                        error!(err = %e, "db persist failed");
                        db::check_poisoned_report(&e);
                    }
                }
            });

            // 9. events/sec stats ticker
            tokio::spawn({
                let state = state.clone();
                let mut last_id = state.db.next_event_id.load(Ordering::Relaxed);
                let mut last_time = std::time::Instant::now();
                let mut interval = tokio::time::interval(std::time::Duration::from_secs(60));
                async move {
                    loop {
                        interval.tick().await;

                        let current_id = state.db.next_event_id.load(Ordering::Relaxed);
                        let current_time = std::time::Instant::now();
                        let delta = current_id.saturating_sub(last_id);

                        // `last_id` / `last_time` are left untouched on idle ticks, so the
                        // next report covers the whole span since the previous report
                        if delta == 0 {
                            debug!("no new events in 60s");
                            continue;
                        }

                        let elapsed = current_time.duration_since(last_time).as_secs_f64();
                        let rate = if elapsed > 0.0 {
                            delta as f64 / elapsed
                        } else {
                            0.0
                        };
                        info!("{rate:.2} events/s ({delta} events in {elapsed:.1}s)");

                        last_id = current_id;
                        last_time = current_time;
                    }
                }
            });

            // fatal components publish their exit result here; `None` is the initial value
            let (fatal_tx_inner, mut fatal_rx) = watch::channel(None);
            let fatal_tx = Arc::new(fatal_tx_inner);

            info!(
                crawler_enabled = *state.crawler_enabled.borrow(),
                firehose_enabled = *state.firehose_enabled.borrow(),
                filter_mode = ?state.filter.load().mode,
                "starting ingestion"
            );

            // 10. set shared and spawn firehose ingestors
            // within this function the `started` guard ensures `set` is reached at most once
            firehose
                .shared
                .set(FirehoseShared {
                    buffer_tx: buffer_tx.clone(),
                    // ingestors verify signatures only in `Full` mode
                    verify_signatures: matches!(
                        config.verify_signatures,
                        SignatureVerification::Full
                    ),
                })
                .ok()
                .expect("firehose shared already set");
            let fire_shared = firehose.shared.get().unwrap();

            let relay_hosts = config.relays.clone();
            if !relay_hosts.is_empty() {
                info!(
                    relay_count = relay_hosts.len(),
                    hosts = relay_hosts
                        .iter()
                        .map(|h| h.as_str())
                        .collect::<Vec<_>>()
                        .join(", "),
                    "starting firehose ingestor(s)"
                );
                for relay_url in &relay_hosts {
                    let enabled_rx = state.firehose_enabled.subscribe();
                    let handle =
                        spawn_firehose_ingestor(relay_url, &state, fire_shared, enabled_rx).await?;
                    let _ = firehose.tasks.insert_async(relay_url.clone(), handle).await;
                }
            }

            let persisted_relay_urls = tokio::task::spawn_blocking({
                let state = state.clone();
                move || load_persisted_firehose_sources(&state.db)
            })
            .await
            .into_diagnostic()??;

            // every persisted relay is recorded as persisted, but an ingestor is only
            // spawned for it when the config did not already start one for the same url
            for relay_url in &persisted_relay_urls {
                let _ = firehose.persisted.insert_async(relay_url.clone()).await;
                if firehose.tasks.contains_async(relay_url).await {
                    continue;
                }
                let enabled_rx = state.firehose_enabled.subscribe();
                let handle =
                    spawn_firehose_ingestor(relay_url, &state, fire_shared, enabled_rx).await?;
                let _ = firehose.tasks.insert_async(relay_url.clone(), handle).await;
            }

            // 11. spawn crawler infrastructure (always, to support dynamic source management)
            {
                use crate::crawler::throttle::Throttler;
                use crate::crawler::{
                    CrawlerStats, CrawlerWorker, InFlight, RetryProducer, SignalChecker,
                };

                let http = reqwest::Client::builder()
                    .user_agent(concat!(
                        env!("CARGO_PKG_NAME"),
                        "/",
                        env!("CARGO_PKG_VERSION")
                    ))
                    .gzip(true)
                    .build()
                    .expect("that reqwest will build");
                let pds_throttler = Throttler::new();
                let in_flight = InFlight::new();
                let stats = CrawlerStats::new(
                    state.clone(),
                    config
                        .crawler_sources
                        .iter()
                        .map(|s| s.url.clone())
                        .collect(),
                    pds_throttler.clone(),
                );
                let checker = SignalChecker {
                    http: http.clone(),
                    state: state.clone(),
                    throttler: pds_throttler,
                };

                info!(
                    max_pending = config.crawler_max_pending_repos,
                    resume_pending = config.crawler_resume_pending_repos,
                    enabled = *state.crawler_enabled.borrow(),
                    "starting crawler worker"
                );
                let (worker, tx) = CrawlerWorker::new(
                    state.clone(),
                    config.crawler_max_pending_repos,
                    config.crawler_resume_pending_repos,
                    stats.clone(),
                );
                // the crawler worker is not expected to return: if it does, the whole
                // process is aborted rather than reported through the fatal channel
                tokio::spawn(async move {
                    worker.run().await;
                    error!("crawler worker exited unexpectedly, aborting");
                    std::process::abort();
                });

                // same policy for the crawler stats task: any exit (panic or return) aborts
                let ticker = tokio::spawn(stats.clone().task());
                tokio::spawn(async move {
                    match ticker.await {
                        Err(e) => error!(err = ?e, "stats ticker panicked, aborting"),
                        Ok(()) => error!("stats ticker exited unexpectedly, aborting"),
                    }
                    std::process::abort();
                });

                tokio::spawn(
                    RetryProducer {
                        checker: checker.clone(),
                        in_flight: in_flight.clone(),
                        tx: tx.clone(),
                    }
                    .run(),
                );

                // set shared objects so CrawlerHandle methods can use them
                crawler
                    .shared
                    .set(CrawlerShared {
                        http,
                        checker,
                        in_flight,
                        tx,
                        stats,
                    })
                    .ok()
                    .expect("crawler shared already set");
                let shared = crawler.shared.get().unwrap();

                // spawn initial sources from config
                for source in config.crawler_sources.iter() {
                    let enabled_rx = state.crawler_enabled.subscribe();
                    let handle = spawn_crawler_producer(
                        source,
                        &shared.http,
                        &state,
                        &shared.checker,
                        &shared.in_flight,
                        &shared.tx,
                        &shared.stats,
                        enabled_rx,
                    );
                    let _ = crawler.tasks.insert_async(source.url.clone(), handle).await;
                }

                let persisted_sources = tokio::task::spawn_blocking({
                    let state = state.clone();
                    move || load_persisted_crawler_sources(&state.db)
                })
                .await
                .into_diagnostic()??;

                // mirror of the firehose logic above: mark as persisted, and skip spawning
                // when a config source with the same url is already running
                for source in &persisted_sources {
                    let _ = crawler.persisted.insert_async(source.url.clone()).await;
                    if crawler.tasks.contains_async(&source.url).await {
                        continue;
                    }
                    let enabled_rx = state.crawler_enabled.subscribe();
                    let handle = spawn_crawler_producer(
                        source,
                        &shared.http,
                        &state,
                        &shared.checker,
                        &shared.in_flight,
                        &shared.tx,
                        &shared.stats,
                        enabled_rx,
                    );
                    let _ = crawler.tasks.insert_async(source.url.clone(), handle).await;
                }
            }

            // 12. spawn the firehose worker on a blocking thread (fatal task)
            let handle = tokio::runtime::Handle::current();
            let firehose_worker = std::thread::spawn({
                let state = state.clone();
                move || {
                    FirehoseWorker::new(
                        state,
                        buffer_rx,
                        matches!(config.verify_signatures, SignatureVerification::Full),
                        config.ephemeral,
                        config.firehose_workers,
                    )
                    .run(handle)
                }
            });

            {
                let tx = Arc::clone(&fatal_tx);
                // joining the thread blocks, so the join runs on the blocking pool; the
                // outcome is flattened into one result and published as a string error
                tokio::spawn(
                    tokio::task::spawn_blocking(move || {
                        firehose_worker
                            .join()
                            .map_err(|e| miette::miette!("buffer processor died: {e:?}"))
                    })
                    .map(move |r| {
                        let result = r.into_diagnostic().flatten().flatten();
                        let _ = tx.send(Some(result.map_err(|e| e.to_string())));
                    }),
                );
            }

            // drop the local fatal_tx so the watch channel is only kept alive by the
            // spawned tasks. when all fatal tasks exit (and drop their tx clones),
            // fatal_rx.changed() returns Err and we return Ok(()).
            drop(fatal_tx);

            loop {
                match fatal_rx.changed().await {
                    Ok(()) => {
                        // a `None` value carries no exit result, so keep waiting
                        if let Some(result) = fatal_rx.borrow().clone() {
                            return result.map_err(|s| miette::miette!("{s}"));
                        }
                    }
                    // all fatal_tx clones dropped: all tasks finished cleanly
                    Err(_) => return Ok(()),
                }
            }
        };
        Ok(fut)
    }
546546-547547- /// subscribe to the ordered event stream.
548548- ///
549549- /// returns an [`EventStream`] that implements [`futures::Stream`].
550550- ///
551551- /// - if `cursor` is `None`, streaming starts from the current head (live tail only).
552552- /// - if `cursor` is `Some(id)`, all persisted `record` events from that ID onward are
553553- /// replayed first, then live events follow seamlessly.
554554- ///
555555- /// `identity` and `account` events are ephemeral and are never replayed from a cursor -
556556- /// only live occurrences are delivered. use [`ReposControl::get`] to fetch current
557557- /// identity/account state for a specific DID.
558558- ///
559559- /// multiple concurrent subscribers each receive a full independent copy of the stream.
560560- /// the stream ends when the `EventStream` is dropped.
561561- pub fn subscribe(&self, cursor: Option<u64>) -> EventStream {
562562- let (tx, rx) = mpsc::channel(500);
563563- let state = self.state.clone();
564564- let runtime = tokio::runtime::Handle::current();
565565-566566- std::thread::Builder::new()
567567- .name("hydrant-stream".into())
568568- .spawn(move || {
569569- let _g = runtime.enter();
570570- event_stream_thread(state, tx, cursor);
571571- })
572572- .expect("failed to spawn stream thread");
573573-574574- EventStream(rx)
575575- }
576576-577577- /// return database counts and on-disk sizes for all keyspaces.
578578- ///
579579- /// counts include: `repos`, `pending`, `resync`, `records`, `blocks`, `events`,
580580- /// `error_ratelimited`, `error_transport`, `error_generic`.
581581- ///
582582- /// sizes are in bytes, reported per keyspace.
583583- pub async fn stats(&self) -> Result<StatsResponse> {
584584- let db = self.state.db.clone();
585585-586586- let mut counts: BTreeMap<&'static str, u64> = futures::future::join_all(
587587- [
588588- "repos",
589589- "pending",
590590- "resync",
591591- "records",
592592- "blocks",
593593- "error_ratelimited",
594594- "error_transport",
595595- "error_generic",
596596- ]
597597- .into_iter()
598598- .map(|name| {
599599- let db = db.clone();
600600- async move { (name, db.get_count(name).await) }
601601- }),
602602- )
603603- .await
604604- .into_iter()
605605- .collect();
606606-607607- counts.insert("events", db.events.approximate_len() as u64);
608608-609609- let sizes = tokio::task::spawn_blocking(move || {
610610- let mut s = BTreeMap::new();
611611- s.insert("repos", db.repos.disk_space());
612612- s.insert("records", db.records.disk_space());
613613- s.insert("blocks", db.blocks.disk_space());
614614- s.insert("cursors", db.cursors.disk_space());
615615- s.insert("pending", db.pending.disk_space());
616616- s.insert("resync", db.resync.disk_space());
617617- s.insert("resync_buffer", db.resync_buffer.disk_space());
618618- s.insert("events", db.events.disk_space());
619619- s.insert("counts", db.counts.disk_space());
620620- s.insert("filter", db.filter.disk_space());
621621- s.insert("crawler", db.crawler.disk_space());
622622- s
623623- })
624624- .await
625625- .into_diagnostic()?;
626626-627627- Ok(StatsResponse { counts, sizes })
628628- }
629629-630630- /// returns a future that runs the HTTP management API server on `0.0.0.0:{port}`.
631631- ///
632632- /// the server exposes all management endpoints (`/filter`, `/repos`, `/ingestion`,
633633- /// `/stream`, `/stats`, `/db/*`, `/xrpc/*`). it runs indefinitely and resolves
634634- /// only on error.
635635- ///
636636- /// intended for `tokio::spawn` or inclusion in a `select!` / task list. the clone
637637- /// of `self` is deferred until the future is first polled.
638638- ///
639639- /// to disable the HTTP API entirely, simply don't call this method.
640640- pub fn serve(&self, port: u16) -> impl Future<Output = Result<()>> {
641641- let hydrant = self.clone();
642642- async move { crate::api::serve(hydrant, port).await }
643643- }
644644-645645- /// returns a future that runs the debug HTTP API server on `127.0.0.1:{port}`.
646646- ///
647647- /// exposes internal inspection endpoints (`/debug/get`, `/debug/iter`, etc.)
648648- /// that are not safe to expose publicly. binds only to loopback.
649649- pub fn serve_debug(&self, port: u16) -> impl Future<Output = Result<()>> {
650650- let state = self.state.clone();
651651- async move { crate::api::serve_debug(state, port).await }
652652- }
653653-}
654654-655655-impl axum::extract::FromRef<Hydrant> for Arc<AppState> {
656656- fn from_ref(h: &Hydrant) -> Self {
657657- h.state.clone()
658658- }
659659-}
/// a stream of [`Event`]s. returned by [`Hydrant::subscribe`].
///
/// implements [`futures::Stream`] and can be used with `StreamExt::next`,
/// `while let Some(evt) = stream.next().await`, `forward`, etc.
/// the stream terminates when the underlying channel closes (i.e. hydrant shuts down).
///
/// wraps the receiving half of the bounded channel created in [`Hydrant::subscribe`].
pub struct EventStream(mpsc::Receiver<Event>);
667667-668668-impl Stream for EventStream {
669669- type Item = Event;
670670-671671- fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
672672- self.0.poll_recv(cx)
673673- }
674674-}
/// database statistics returned by [`Hydrant::stats`].
///
/// both maps are ordered by key, so the serialized output has a stable field order.
#[derive(serde::Serialize)]
pub struct StatsResponse {
    /// record counts per logical category (repos, records, events, error kinds, etc.)
    pub counts: BTreeMap<&'static str, u64>,
    /// on-disk size in bytes per keyspace
    pub sizes: BTreeMap<&'static str, u64>,
}
/// handle to one running crawler producer task.
///
/// dropping the handle aborts the task, so removing or replacing an entry in the
/// task map is enough to stop the producer.
struct ProducerHandle {
    /// the mode the producer was started with.
    mode: crate::config::CrawlerMode,
    /// abort handle of the spawned producer task.
    abort: tokio::task::AbortHandle,
}

impl Drop for ProducerHandle {
    /// abort the producer task when the handle goes away.
    fn drop(&mut self) {
        self.abort.abort();
    }
}
/// objects shared by every crawler producer.
///
/// built once inside [`Hydrant::run`] and stored in the crawler handle so that
/// sources added later can be spawned with the same objects.
struct CrawlerShared {
    /// http client handed to the producers.
    http: reqwest::Client,
    /// signal checker handed to the producers.
    checker: crate::crawler::SignalChecker,
    /// in-flight tracker handed to the producers.
    in_flight: crate::crawler::InFlight,
    /// sending half of the channel returned by `CrawlerWorker::new`.
    tx: mpsc::Sender<crate::crawler::CrawlerBatch>,
    /// crawler statistics handle.
    stats: crate::crawler::CrawlerStats,
}
/// a snapshot of a single crawler source's runtime state.
///
/// produced by [`CrawlerHandle::list_sources`].
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CrawlerSourceInfo {
    /// the url identifying the source.
    pub url: Url,
    /// the mode the source's producer is running in.
    pub mode: crate::config::CrawlerMode,
    /// whether this source is persisted in the database (i.e. it was dynamically added
    /// and will survive restarts). config-sourced entries have `persisted: false`.
    pub persisted: bool,
}
713713-714714-fn spawn_crawler_producer(
715715- source: &crate::config::CrawlerSource,
716716- http: &reqwest::Client,
717717- state: &Arc<AppState>,
718718- checker: &crate::crawler::SignalChecker,
719719- in_flight: &crate::crawler::InFlight,
720720- tx: &mpsc::Sender<crate::crawler::CrawlerBatch>,
721721- stats: &crate::crawler::CrawlerStats,
722722- enabled: watch::Receiver<bool>,
723723-) -> ProducerHandle {
724724- use crate::config::CrawlerMode;
725725- use crate::crawler::{ByCollectionProducer, RelayProducer};
726726- use std::time::Duration;
727727- use tracing::Instrument;
728728-729729- let abort = match source.mode {
730730- CrawlerMode::Relay => {
731731- info!(relay = %source.url, enabled = *state.crawler_enabled.borrow(), "starting relay crawler");
732732- let span = tracing::info_span!("crawl", url = %source.url);
733733- tokio::spawn(
734734- RelayProducer {
735735- relay_url: source.url.clone(),
736736- checker: checker.clone(),
737737- in_flight: in_flight.clone(),
738738- tx: tx.clone(),
739739- enabled,
740740- stats: stats.clone(),
741741- }
742742- .run()
743743- .instrument(span),
744744- )
745745- .abort_handle()
746746- }
747747- CrawlerMode::ByCollection => {
748748- info!(
749749- host = source.url.host_str(),
750750- enabled = *state.crawler_enabled.borrow(),
751751- "starting by-collection crawler"
752752- );
753753- let span = tracing::info_span!("by_collection", host = source.url.host_str());
754754- let http = http.clone();
755755- let state = state.clone();
756756- let in_flight = in_flight.clone();
757757- let tx = tx.clone();
758758- let stats = stats.clone();
759759- let url = source.url.clone();
760760- tokio::spawn(
761761- async move {
762762- loop {
763763- let producer = ByCollectionProducer {
764764- index_url: url.clone(),
765765- http: http.clone(),
766766- state: state.clone(),
767767- in_flight: in_flight.clone(),
768768- tx: tx.clone(),
769769- enabled: enabled.clone(),
770770- stats: stats.clone(),
771771- };
772772- if let Err(e) = producer.run().await {
773773- error!(err = ?e, "by-collection crawler fatal error, restarting in 30s");
774774- tokio::time::sleep(Duration::from_secs(30)).await;
775775- }
776776- }
777777- }
778778- .instrument(span),
779779- )
780780- .abort_handle()
781781- }
782782- };
783783- ProducerHandle {
784784- mode: source.mode,
785785- abort,
786786- }
787787-}
788788-789789-async fn spawn_firehose_ingestor(
790790- relay_url: &Url,
791791- state: &Arc<AppState>,
792792- shared: &FirehoseShared,
793793- enabled: watch::Receiver<bool>,
794794-) -> Result<FirehoseIngestorHandle> {
795795- use std::sync::atomic::AtomicI64;
796796-797797- let start = db::get_firehose_cursor(&state.db, relay_url).await?;
798798- // insert into relay_cursors if not already present; existing in-memory cursor takes precedence
799799- let _ = state
800800- .relay_cursors
801801- .insert_async(relay_url.clone(), AtomicI64::new(start.unwrap_or(0)))
802802- .await;
803803-804804- info!(relay = %relay_url, cursor = ?start, "starting firehose ingestor");
805805-806806- let ingestor = FirehoseIngestor::new(
807807- state.clone(),
808808- shared.buffer_tx.clone(),
809809- relay_url.clone(),
810810- state.filter.clone(),
811811- enabled,
812812- shared.verify_signatures,
813813- );
814814-815815- let relay_for_log = relay_url.clone();
816816- let abort = tokio::spawn(async move {
817817- if let Err(e) = ingestor.run().await {
818818- error!(relay = %relay_for_log, err = %e, "firehose ingestor exited with error");
819819- }
820820- })
821821- .abort_handle();
822822-823823- Ok(FirehoseIngestorHandle { abort })
824824-}
/// runtime control over the crawler component.
///
/// the crawler walks `com.atproto.sync.listRepos` on each configured relay to discover
/// repositories that have never emitted a firehose event. in `filter` mode it also
/// checks each discovered repo against the configured signal collections before
/// enqueuing it for backfill.
///
/// disabling the crawler does not affect in-progress repo checks. each one completes
/// its current PDS request before pausing.
///
/// cloning is cheap: every field is behind an `Arc`, so clones share the same state.
#[derive(Clone)]
pub struct CrawlerHandle {
    /// shared application state; holds the crawler's enabled flag and the database.
    state: Arc<AppState>,
    /// set once by [`Hydrant::run`]; while unset, run() has not been called yet.
    shared: Arc<std::sync::OnceLock<CrawlerShared>>,
    /// per-source running tasks, keyed by url.
    tasks: Arc<scc::HashMap<Url, ProducerHandle>>,
    /// set of urls persisted in the database (dynamically added sources).
    persisted: Arc<scc::HashSet<Url>>,
}
845845-846846-impl CrawlerHandle {
847847- /// enable the crawler (enables all configured producers). no-op if already enabled.
848848- pub fn enable(&self) {
849849- self.state.crawler_enabled.send_replace(true);
850850- }
851851- /// disable the crawler (disables all configured producers).
852852- /// in-progress repo checks finish before the crawler pauses.
853853- pub fn disable(&self) {
854854- self.state.crawler_enabled.send_replace(false);
855855- }
856856- /// returns the current enabled state of the crawler.
857857- pub fn is_enabled(&self) -> bool {
858858- *self.state.crawler_enabled.borrow()
859859- }
860860-861861- /// delete all cursor entries associated with the given URL.
862862- pub async fn reset_cursor(&self, url: &str) -> Result<()> {
863863- let db = self.state.db.clone();
864864- let point_keys = [keys::crawler_cursor_key(url)];
865865- let by_collection_prefix = keys::by_collection_cursor_prefix(url);
866866- tokio::task::spawn_blocking(move || {
867867- let mut batch = db.inner.batch();
868868- for k in point_keys {
869869- batch.remove(&db.cursors, k);
870870- }
871871- for entry in db.cursors.prefix(&by_collection_prefix) {
872872- let k = entry.key().into_diagnostic()?;
873873- batch.remove(&db.cursors, k);
874874- }
875875- batch.commit().into_diagnostic()
876876- })
877877- .await
878878- .into_diagnostic()??;
879879- Ok(())
880880- }
881881-882882- /// return info on all currently active crawler sources.
883883- ///
884884- /// returns an empty list if called before [`Hydrant::run`].
885885- pub async fn list_sources(&self) -> Vec<CrawlerSourceInfo> {
886886- let mut sources = Vec::new();
887887- self.tasks
888888- .iter_async(|url, h| {
889889- sources.push(CrawlerSourceInfo {
890890- url: url.clone(),
891891- mode: h.mode,
892892- persisted: self.persisted.contains_sync(url),
893893- });
894894- true
895895- })
896896- .await;
897897- sources
898898- }
899899-900900- /// add a new crawler source at runtime.
901901- ///
902902- /// the source is persisted to the database and will be re-spawned on restart.
903903- /// if a source with the same URL already exists, it is replaced (the old task is
904904- /// aborted and a new one is started with the new mode).
905905- ///
906906- /// returns an error if called before [`Hydrant::run`].
907907- pub async fn add_source(&self, source: crate::config::CrawlerSource) -> Result<()> {
908908- let Some(shared) = self.shared.get() else {
909909- miette::bail!("crawler not yet started: call Hydrant::run() first");
910910- };
911911-912912- let db = self.state.db.clone();
913913- let key = keys::crawler_source_key(source.url.as_str());
914914- let val = rmp_serde::to_vec(&source.mode).into_diagnostic()?;
915915- tokio::task::spawn_blocking(move || db.crawler.insert(key, val).into_diagnostic())
916916- .await
917917- .into_diagnostic()??;
918918-919919- let enabled_rx = self.state.crawler_enabled.subscribe();
920920- let handle = spawn_crawler_producer(
921921- &source,
922922- &shared.http,
923923- &self.state,
924924- &shared.checker,
925925- &shared.in_flight,
926926- &shared.tx,
927927- &shared.stats,
928928- enabled_rx,
929929- );
930930-931931- let _ = self.persisted.insert_async(source.url.clone()).await;
932932- match self.tasks.entry_async(source.url).await {
933933- scc::hash_map::Entry::Vacant(e) => {
934934- e.insert_entry(handle);
935935- }
936936- scc::hash_map::Entry::Occupied(mut e) => {
937937- *e.get_mut() = handle;
938938- }
939939- }
940940- Ok(())
941941- }
942942-943943- /// remove a crawler source at runtime by URL.
944944- ///
945945- /// aborts the running producer task and removes the source from the database if it
946946- /// was dynamically added. config-sourced entries are aborted but not persisted, so
947947- /// they will reappear on restart.
948948- ///
949949- /// returns `true` if a source with the given URL was found and removed.
950950- /// returns an error if called before [`Hydrant::run`].
951951- pub async fn remove_source(&self, url: &Url) -> Result<bool> {
952952- if self.shared.get().is_none() {
953953- miette::bail!("crawler not yet started: call Hydrant::run() first");
954954- }
955955-956956- // dropping the ProducerHandle aborts the task via Drop
957957- if self.tasks.remove_async(url).await.is_none() {
958958- return Ok(false);
959959- }
960960-961961- // remove from DB if it was a persisted source
962962- if self.persisted.remove_async(url).await.is_some() {
963963- let db = self.state.db.clone();
964964- let key = keys::crawler_source_key(url.as_str());
965965- tokio::task::spawn_blocking(move || db.crawler.remove(key).into_diagnostic())
966966- .await
967967- .into_diagnostic()??;
968968- }
969969-970970- Ok(true)
971971- }
972972-}
973973-974974-struct FirehoseIngestorHandle {
975975- abort: tokio::task::AbortHandle,
976976-}
977977-978978-impl Drop for FirehoseIngestorHandle {
979979- fn drop(&mut self) {
980980- self.abort.abort();
981981- }
982982-}
983983-984984-struct FirehoseShared {
985985- buffer_tx: crate::ingest::BufferTx,
986986- verify_signatures: bool,
987987-}
988988-989989-/// a snapshot of a single firehose relay's runtime state.
990990-#[derive(Debug, Clone, serde::Serialize)]
991991-pub struct FirehoseSourceInfo {
992992- pub url: Url,
993993- /// true if added via the API and persisted to the database; false for `RELAY_HOSTS` sources.
994994- pub persisted: bool,
995995-}
996996-997997-/// runtime control over the firehose ingestor component.
998998-#[derive(Clone)]
999999-pub struct FirehoseHandle {
10001000- state: Arc<AppState>,
10011001- /// set once by [`Hydrant::run`]; `None` means run() has not been called yet.
10021002- shared: Arc<std::sync::OnceLock<FirehoseShared>>,
10031003- /// per-relay running tasks, keyed by url.
10041004- tasks: Arc<scc::HashMap<Url, FirehoseIngestorHandle>>,
10051005- /// set of urls persisted in the database (dynamically added sources).
10061006- persisted: Arc<scc::HashSet<Url>>,
10071007-}
10081008-10091009-impl FirehoseHandle {
10101010- /// enable the firehose. no-op if already enabled.
10111011- pub fn enable(&self) {
10121012- self.state.firehose_enabled.send_replace(true);
10131013- }
10141014- /// disable the firehose. the current message finishes processing before the connection closes.
10151015- pub fn disable(&self) {
10161016- self.state.firehose_enabled.send_replace(false);
10171017- }
10181018- /// returns the current enabled state of the firehose.
10191019- pub fn is_enabled(&self) -> bool {
10201020- *self.state.firehose_enabled.borrow()
10211021- }
10221022-10231023- /// reset the stored cursor for the given relay URL.
10241024- ///
10251025- /// clears the `firehose_cursor|{url}` entry from the cursors keyspace and zeroes the
10261026- /// in-memory cursor. the next connection will tail live events from the current head.
10271027- pub async fn reset_cursor(&self, url: &str) -> Result<()> {
10281028- let db = self.state.db.clone();
10291029- let key = keys::firehose_cursor_key(url);
10301030- tokio::task::spawn_blocking(move || db.cursors.remove(key).into_diagnostic())
10311031- .await
10321032- .into_diagnostic()??;
10331033-10341034- if let Ok(relay_url) = Url::parse(url) {
10351035- self.state.relay_cursors.peek_with(&relay_url, |_, c| {
10361036- c.store(0, std::sync::atomic::Ordering::SeqCst);
10371037- });
10381038- }
10391039- Ok(())
10401040- }
10411041-10421042- /// return info on all currently active firehose sources.
10431043- pub async fn list_sources(&self) -> Vec<FirehoseSourceInfo> {
10441044- let mut sources = Vec::new();
10451045- self.tasks
10461046- .iter_async(|url, _| {
10471047- sources.push(FirehoseSourceInfo {
10481048- url: url.clone(),
10491049- persisted: self.persisted.contains_sync(url),
10501050- });
10511051- true
10521052- })
10531053- .await;
10541054- sources
10551055- }
10561056-10571057- /// add a new firehose relay at runtime.
10581058- ///
10591059- /// the URL is persisted to the database and will be re-spawned on restart. if a relay with
10601060- /// the same URL already exists it is replaced: the running task is stopped and a new one
10611061- /// is started. any cursor state for that URL is preserved.
10621062- ///
10631063- /// returns an error if called before [`Hydrant::run`].
10641064- pub async fn add_source(&self, url: Url) -> Result<()> {
10651065- let Some(shared) = self.shared.get() else {
10661066- miette::bail!("firehose not yet started: call Hydrant::run() first");
10671067- };
10681068-10691069- let db = self.state.db.clone();
10701070- let key = keys::firehose_source_key(url.as_str());
10711071- tokio::task::spawn_blocking(move || db.crawler.insert(key, b"").into_diagnostic())
10721072- .await
10731073- .into_diagnostic()??;
10741074-10751075- let enabled_rx = self.state.firehose_enabled.subscribe();
10761076- let handle = spawn_firehose_ingestor(&url, &self.state, shared, enabled_rx).await?;
10771077-10781078- let _ = self.persisted.insert_async(url.clone()).await;
10791079- match self.tasks.entry_async(url).await {
10801080- scc::hash_map::Entry::Vacant(e) => {
10811081- e.insert_entry(handle);
10821082- }
10831083- scc::hash_map::Entry::Occupied(mut e) => {
10841084- *e.get_mut() = handle;
10851085- }
10861086- }
10871087- Ok(())
10881088- }
10891089-10901090- /// remove a firehose relay at runtime by URL.
10911091- ///
10921092- /// aborts the running ingestor task. if the source was added via the API it is removed from
10931093- /// the database and will not reappear on restart. `RELAY_HOSTS` sources are only stopped for
10941094- /// the current session; they reappear on the next restart.
10951095- ///
10961096- /// returns `true` if the relay was found and removed, `false` if it was not running.
10971097- /// returns an error if called before [`Hydrant::run`].
10981098- pub async fn remove_source(&self, url: &Url) -> Result<bool> {
10991099- if self.shared.get().is_none() {
11001100- miette::bail!("firehose not yet started: call Hydrant::run() first");
11011101- }
11021102-11031103- if self.tasks.remove_async(url).await.is_none() {
11041104- return Ok(false);
11051105- }
11061106-11071107- // remove from relay_cursors (persist thread will stop tracking it)
11081108- self.state.relay_cursors.remove_async(url).await;
11091109-11101110- if self.persisted.remove_async(url).await.is_some() {
11111111- let db = self.state.db.clone();
11121112- let key = keys::firehose_source_key(url.as_str());
11131113- tokio::task::spawn_blocking(move || db.crawler.remove(key).into_diagnostic())
11141114- .await
11151115- .into_diagnostic()??;
11161116- }
11171117-11181118- Ok(true)
11191119- }
11201120-}
11211121-11221122-/// runtime control over the backfill worker component.
11231123-///
11241124-/// the backfill worker fetches full repo CAR files from each repo's PDS for any
11251125-/// repository in the pending queue, parses the MST, and inserts all matching records
11261126-/// into the database. concurrency is bounded by `HYDRANT_BACKFILL_CONCURRENCY_LIMIT`.
11271127-#[derive(Clone)]
11281128-pub struct BackfillHandle(Arc<AppState>);
11291129-11301130-impl BackfillHandle {
11311131- /// enable the backfill worker, no-op if already enabled.
11321132- pub fn enable(&self) {
11331133- self.0.backfill_enabled.send_replace(true);
11341134- }
11351135- /// disable the backfill worker, in-flight repos complete before pausing.
11361136- pub fn disable(&self) {
11371137- self.0.backfill_enabled.send_replace(false);
11381138- }
11391139- /// returns the current enabled state of the backfill worker.
11401140- pub fn is_enabled(&self) -> bool {
11411141- *self.0.backfill_enabled.borrow()
11421142- }
11431143-}
11441144-11451145-/// a point-in-time snapshot of the filter configuration. returned by all [`FilterControl`] methods.
11461146-///
11471147-/// because the filter is stored in the database and loaded on demand, this snapshot
11481148-/// may be stale if another caller modifies the filter concurrently. for the authoritative
11491149-/// live config use [`FilterControl::get`].
11501150-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
11511151-pub struct FilterSnapshot {
11521152- pub mode: FilterMode,
11531153- pub signals: Vec<String>,
11541154- pub collections: Vec<String>,
11551155- pub excludes: Vec<String>,
11561156-}
11571157-11581158-/// runtime control over the indexing filter.
11591159-///
11601160-/// the filter has two orthogonal axes:
11611161-///
11621162-/// **mode** controls discovery:
11631163-/// - [`FilterMode::Filter`]: only indexes repos whose firehose commits touch a collection
11641164-/// matching a configured `signal`. explicit [`ReposControl::track`] always works regardless.
11651165-/// - [`FilterMode::Full`]: indexes the entire network. `signals` are ignored for discovery
11661166-/// but `collections` and `excludes` still apply.
11671167-///
11681168-/// **sets** are each independently configurable:
11691169-/// - `signals`: NSID patterns that trigger auto-discovery in `filter` mode (e.g. `app.bsky.feed.post`, `app.bsky.graph.*`)
11701170-/// - `collections`: NSID patterns that filter which records are *stored*. empty means store all.
11711171-/// - `excludes`: DIDs that are always skipped regardless of mode.
11721172-///
11731173-/// NSID patterns support an optional `.*` suffix to match an entire namespace.
11741174-/// all mutations are persisted to the database and take effect immediately.
11751175-#[derive(Clone)]
11761176-pub struct FilterControl(Arc<AppState>);
11771177-11781178-impl FilterControl {
11791179- /// return the current filter configuration from the database.
11801180- pub async fn get(&self) -> Result<FilterSnapshot> {
11811181- let filter_ks = self.0.db.filter.clone();
11821182- tokio::task::spawn_blocking(move || {
11831183- let hot = db_filter::load(&filter_ks)?;
11841184- let excludes = db_filter::read_set(&filter_ks, db_filter::EXCLUDE_PREFIX)?;
11851185- Ok(FilterSnapshot {
11861186- mode: hot.mode,
11871187- signals: hot.signals.iter().map(|s| s.to_string()).collect(),
11881188- collections: hot.collections.iter().map(|s| s.to_string()).collect(),
11891189- excludes,
11901190- })
11911191- })
11921192- .await
11931193- .into_diagnostic()?
11941194- }
11951195-11961196- /// set the indexing mode. see [`FilterControl`] for mode semantics.
11971197- pub fn set_mode(&self, mode: FilterMode) -> FilterPatch {
11981198- FilterPatch::new(self).set_mode(mode)
11991199- }
12001200-12011201- /// replace the entire signals set. existing signals are removed.
12021202- pub fn set_signals(&self, signals: impl IntoIterator<Item = impl Into<String>>) -> FilterPatch {
12031203- FilterPatch::new(self).set_signals(signals)
12041204- }
12051205-12061206- /// add multiple signals without disturbing existing ones.
12071207- pub fn append_signals(
12081208- &self,
12091209- signals: impl IntoIterator<Item = impl Into<String>>,
12101210- ) -> FilterPatch {
12111211- FilterPatch::new(self).append_signals(signals)
12121212- }
12131213-12141214- /// add a single signal. no-op if already present.
12151215- pub fn add_signal(&self, signal: impl Into<String>) -> FilterPatch {
12161216- FilterPatch::new(self).add_signal(signal)
12171217- }
12181218-12191219- /// remove a single signal. no-op if not present.
12201220- pub fn remove_signal(&self, signal: impl Into<String>) -> FilterPatch {
12211221- FilterPatch::new(self).remove_signal(signal)
12221222- }
12231223-12241224- /// replace the entire collections set. pass an empty iterator to store all collections.
12251225- pub fn set_collections(
12261226- &self,
12271227- collections: impl IntoIterator<Item = impl Into<String>>,
12281228- ) -> FilterPatch {
12291229- FilterPatch::new(self).set_collections(collections)
12301230- }
12311231-12321232- /// add multiple collections without disturbing existing ones.
12331233- pub fn append_collections(
12341234- &self,
12351235- collections: impl IntoIterator<Item = impl Into<String>>,
12361236- ) -> FilterPatch {
12371237- FilterPatch::new(self).append_collections(collections)
12381238- }
12391239-12401240- /// add a single collection filter. no-op if already present.
12411241- pub fn add_collection(&self, collection: impl Into<String>) -> FilterPatch {
12421242- FilterPatch::new(self).add_collection(collection)
12431243- }
12441244-12451245- /// remove a single collection filter. no-op if not present.
12461246- pub fn remove_collection(&self, collection: impl Into<String>) -> FilterPatch {
12471247- FilterPatch::new(self).remove_collection(collection)
12481248- }
12491249-12501250- /// replace the entire excludes set.
12511251- pub fn set_excludes(
12521252- &self,
12531253- excludes: impl IntoIterator<Item = impl Into<String>>,
12541254- ) -> FilterPatch {
12551255- FilterPatch::new(self).set_excludes(excludes)
12561256- }
12571257-12581258- /// add multiple DIDs to the excludes set without disturbing existing ones.
12591259- pub fn append_excludes(
12601260- &self,
12611261- excludes: impl IntoIterator<Item = impl Into<String>>,
12621262- ) -> FilterPatch {
12631263- FilterPatch::new(self).append_excludes(excludes)
12641264- }
12651265-12661266- /// add a single DID to the excludes set. no-op if already excluded.
12671267- pub fn add_exclude(&self, did: impl Into<String>) -> FilterPatch {
12681268- FilterPatch::new(self).add_exclude(did)
12691269- }
12701270-12711271- /// remove a single DID from the excludes set. no-op if not present.
12721272- pub fn remove_exclude(&self, did: impl Into<String>) -> FilterPatch {
12731273- FilterPatch::new(self).remove_exclude(did)
12741274- }
12751275-}
12761276-12771277-/// a staged set of filter mutations. all methods accumulate changes without touching
12781278-/// the database. call [`FilterPatch::apply`] to commit the entire patch atomically.
12791279-///
12801280-/// obtain an instance by calling any mutation method on [`FilterControl`], or via
12811281-/// [`FilterPatch::new`] to start from a blank patch.
12821282-pub struct FilterPatch {
12831283- state: Arc<AppState>,
12841284- /// if set, replaces the current indexing mode.
12851285- pub mode: Option<FilterMode>,
12861286- /// if set, replaces or patches the signals set.
12871287- pub signals: Option<SetUpdate>,
12881288- /// if set, replaces or patches the collections set.
12891289- pub collections: Option<SetUpdate>,
12901290- /// if set, replaces or patches the excludes set.
12911291- pub excludes: Option<SetUpdate>,
12921292-}
12931293-12941294-impl FilterPatch {
12951295- /// create a new blank patch associated with the given [`FilterControl`].
12961296- pub fn new(control: &FilterControl) -> Self {
12971297- Self {
12981298- state: control.0.clone(),
12991299- mode: None,
13001300- signals: None,
13011301- collections: None,
13021302- excludes: None,
13031303- }
13041304- }
13051305-13061306- /// set the indexing mode. see [`FilterControl`] for mode semantics.
13071307- pub fn set_mode(mut self, mode: FilterMode) -> Self {
13081308- self.mode = Some(mode);
13091309- self
13101310- }
13111311-13121312- /// replace the entire signals set. existing signals are removed.
13131313- pub fn set_signals(mut self, signals: impl IntoIterator<Item = impl Into<String>>) -> Self {
13141314- self.signals = Some(SetUpdate::Set(
13151315- signals.into_iter().map(Into::into).collect(),
13161316- ));
13171317- self
13181318- }
13191319-13201320- /// add multiple signals without disturbing existing ones.
13211321- pub fn append_signals(mut self, signals: impl IntoIterator<Item = impl Into<String>>) -> Self {
13221322- self.signals = Some(SetUpdate::Patch(
13231323- signals.into_iter().map(|s| (s.into(), true)).collect(),
13241324- ));
13251325- self
13261326- }
13271327-13281328- /// add a single signal. no-op if already present.
13291329- pub fn add_signal(mut self, signal: impl Into<String>) -> Self {
13301330- self.signals = Some(SetUpdate::Patch([(signal.into(), true)].into()));
13311331- self
13321332- }
13331333-13341334- /// remove a single signal. no-op if not present.
13351335- pub fn remove_signal(mut self, signal: impl Into<String>) -> Self {
13361336- self.signals = Some(SetUpdate::Patch([(signal.into(), false)].into()));
13371337- self
13381338- }
13391339-13401340- /// replace the entire collections set. pass an empty iterator to store all collections.
13411341- pub fn set_collections(
13421342- mut self,
13431343- collections: impl IntoIterator<Item = impl Into<String>>,
13441344- ) -> Self {
13451345- self.collections = Some(SetUpdate::Set(
13461346- collections.into_iter().map(Into::into).collect(),
13471347- ));
13481348- self
13491349- }
13501350-13511351- /// add multiple collections without disturbing existing ones.
13521352- pub fn append_collections(
13531353- mut self,
13541354- collections: impl IntoIterator<Item = impl Into<String>>,
13551355- ) -> Self {
13561356- self.collections = Some(SetUpdate::Patch(
13571357- collections.into_iter().map(|c| (c.into(), true)).collect(),
13581358- ));
13591359- self
13601360- }
13611361-13621362- /// add a single collection filter. no-op if already present.
13631363- pub fn add_collection(mut self, collection: impl Into<String>) -> Self {
13641364- self.collections = Some(SetUpdate::Patch([(collection.into(), true)].into()));
13651365- self
13661366- }
13671367-13681368- /// remove a single collection filter. no-op if not present.
13691369- pub fn remove_collection(mut self, collection: impl Into<String>) -> Self {
13701370- self.collections = Some(SetUpdate::Patch([(collection.into(), false)].into()));
13711371- self
13721372- }
13731373-13741374- /// replace the entire excludes set.
13751375- pub fn set_excludes(mut self, excludes: impl IntoIterator<Item = impl Into<String>>) -> Self {
13761376- self.excludes = Some(SetUpdate::Set(
13771377- excludes.into_iter().map(Into::into).collect(),
13781378- ));
13791379- self
13801380- }
13811381-13821382- /// add multiple DIDs to the excludes set without disturbing existing ones.
13831383- pub fn append_excludes(
13841384- mut self,
13851385- excludes: impl IntoIterator<Item = impl Into<String>>,
13861386- ) -> Self {
13871387- self.excludes = Some(SetUpdate::Patch(
13881388- excludes.into_iter().map(|d| (d.into(), true)).collect(),
13891389- ));
13901390- self
13911391- }
13921392-13931393- /// add a single DID to the excludes set. no-op if already excluded.
13941394- pub fn add_exclude(mut self, did: impl Into<String>) -> Self {
13951395- self.excludes = Some(SetUpdate::Patch([(did.into(), true)].into()));
13961396- self
13971397- }
13981398-13991399- /// remove a single DID from the excludes set. no-op if not present.
14001400- pub fn remove_exclude(mut self, did: impl Into<String>) -> Self {
14011401- self.excludes = Some(SetUpdate::Patch([(did.into(), false)].into()));
14021402- self
14031403- }
14041404-14051405- /// commit the patch atomically to the database and update the in-memory filter.
14061406- /// returns the updated [`FilterSnapshot`].
14071407- pub async fn apply(self) -> Result<FilterSnapshot> {
14081408- let filter_ks = self.state.db.filter.clone();
14091409- let inner = self.state.db.inner.clone();
14101410- let filter_handle = self.state.filter.clone();
14111411- let mode = self.mode;
14121412- let signals = self.signals;
14131413- let collections = self.collections;
14141414- let excludes = self.excludes;
14151415-14161416- let new_filter = tokio::task::spawn_blocking(move || {
14171417- let mut batch = inner.batch();
14181418- db_filter::apply_patch(&mut batch, &filter_ks, mode, signals, collections, excludes)?;
14191419- batch.commit().into_diagnostic()?;
14201420- db_filter::load(&filter_ks)
14211421- })
14221422- .await
14231423- .into_diagnostic()??;
14241424-14251425- let exclude_list = {
14261426- let filter_ks = self.state.db.filter.clone();
14271427- tokio::task::spawn_blocking(move || {
14281428- db_filter::read_set(&filter_ks, db_filter::EXCLUDE_PREFIX)
14291429- })
14301430- .await
14311431- .into_diagnostic()??
14321432- };
14331433-14341434- let snapshot = FilterSnapshot {
14351435- mode: new_filter.mode,
14361436- signals: new_filter.signals.iter().map(|s| s.to_string()).collect(),
14371437- collections: new_filter
14381438- .collections
14391439- .iter()
14401440- .map(|s| s.to_string())
14411441- .collect(),
14421442- excludes: exclude_list,
14431443- };
14441444-14451445- filter_handle.store(Arc::new(new_filter));
14461446- Ok(snapshot)
14471447- }
14481448-}
14491449-14501450-/// information about a tracked or known repository. returned by [`ReposControl`] methods.
14511451-#[derive(Debug, Clone, serde::Serialize)]
14521452-pub struct RepoInfo {
14531453- /// the DID of the repository.
14541454- pub did: Did<'static>,
14551455- /// the status of the repository.
14561456- #[serde(serialize_with = "crate::util::repo_status_serialize_str")]
14571457- pub status: RepoStatus,
14581458- /// whether this repository is tracked or not.
14591459- /// untracked repositories are not updated and they stay frozen.
14601460- pub tracked: bool,
14611461- /// the revision of the root commit of this repository.
14621462- #[serde(skip_serializing_if = "Option::is_none")]
14631463- pub rev: Option<Tid>,
14641464- /// the CID of the root commit of this repository.
14651465- #[serde(serialize_with = "crate::util::opt_cid_serialize_str")]
14661466- #[serde(skip_serializing_if = "Option::is_none")]
14671467- pub data: Option<IpldCid>,
14681468- /// the handle for the DID of this repository.
14691469- #[serde(skip_serializing_if = "Option::is_none")]
14701470- pub handle: Option<Handle<'static>>,
14711471- /// the URL for the PDS in which this repository is hosted on.
14721472- #[serde(skip_serializing_if = "Option::is_none")]
14731473- pub pds: Option<Url>,
14741474- /// ATProto signing key of this repository.
14751475- #[serde(skip_serializing_if = "Option::is_none")]
14761476- pub signing_key: Option<String>,
14771477- /// when this repository was last touched (status update, commit ingested, etc.).
14781478- #[serde(skip_serializing_if = "Option::is_none")]
14791479- pub last_updated_at: Option<DateTime<Utc>>,
14801480- /// the time of the last message gotten from the firehose for this repository.
14811481- /// this is equal to the `time` field.
14821482- #[serde(skip_serializing_if = "Option::is_none")]
14831483- pub last_message_at: Option<DateTime<Utc>>,
14841484-}
14851485-14861486-/// control over which repositories are tracked and access to their state.
14871487-///
14881488-/// in `filter` mode, a repo is only indexed if it either matches a signal or is
14891489-/// explicitly tracked via [`ReposControl::track`]. in `full` mode all repos are indexed
14901490-/// and tracking is implicit.
14911491-///
14921492-/// tracking a DID that hydrant has never seen enqueues an immediate backfill.
14931493-/// tracking a DID that hydrant already knows about (but has marked untracked)
14941494-/// re-enqueues it for backfill.
14951495-#[derive(Clone)]
14961496-pub struct ReposControl(Arc<AppState>);
14971497-14981498-impl ReposControl {
14991499- /// gets a handle for a repository to allow acting upon it.
15001500- pub fn get<'i>(&self, did: &Did<'i>) -> Result<RepoHandle<'i>> {
15011501- Ok(RepoHandle {
15021502- state: self.0.clone(),
15031503- did: did.clone(),
15041504- })
15051505- }
15061506-15071507- /// same as [`ReposControl::get`] but allows you to pass in an identifier that can be
15081508- /// either a handle or a DID.
15091509- pub async fn resolve(&self, repo: &AtIdentifier<'_>) -> Result<RepoHandle<'static>> {
15101510- let did = self.0.resolver.resolve_did(repo).await?;
15111511- Ok(RepoHandle {
15121512- state: self.0.clone(),
15131513- did,
15141514- })
15151515- }
15161516-15171517- /// fetch the current state of a single repository. returns `None` if hydrant
15181518- /// has never seen this DID.
15191519- pub async fn info(&self, did: &Did<'_>) -> Result<Option<RepoInfo>> {
15201520- self.get(did)?.info().await
15211521- }
15221522-15231523- /// explicitly track one or more repositories, enqueuing them for backfill if needed.
15241524- ///
15251525- /// - if a DID is new, a fresh [`RepoState`] is created and backfill is queued.
15261526- /// - if a DID is already known but untracked, it is marked tracked and re-enqueued.
15271527- /// - if a DID is already tracked, this is a no-op.
15281528- pub async fn track(&self, dids: impl IntoIterator<Item = Did<'_>>) -> Result<()> {
15291529- let dids: Vec<Did<'static>> = dids.into_iter().map(|d| d.into_static()).collect();
15301530- let state = self.0.clone();
15311531-15321532- let (new_count, transitions) = tokio::task::spawn_blocking(move || {
15331533- let db = &state.db;
15341534- let mut batch = db.inner.batch();
15351535- let mut added = 0i64;
15361536- let mut transitions: Vec<(GaugeState, GaugeState)> = Vec::new();
15371537- let mut rng = rand::rng();
15381538-15391539- for did in &dids {
15401540- let did_key = keys::repo_key(did);
15411541- let repo_bytes = db.repos.get(&did_key).into_diagnostic()?;
15421542- let existing = repo_bytes
15431543- .as_deref()
15441544- .map(db::deser_repo_state)
15451545- .transpose()?;
15461546-15471547- if let Some(mut repo_state) = existing {
15481548- if !repo_state.tracked {
15491549- let resync = db.resync.get(&did_key).into_diagnostic()?;
15501550- let old = db::Db::repo_gauge_state(&repo_state, resync.as_deref());
15511551- repo_state.tracked = true;
15521552- batch.insert(&db.repos, &did_key, ser_repo_state(&repo_state)?);
15531553- batch.insert(
15541554- &db.pending,
15551555- keys::pending_key(repo_state.index_id),
15561556- &did_key,
15571557- );
15581558- batch.remove(&db.resync, &did_key);
15591559- transitions.push((old, GaugeState::Pending));
15601560- }
15611561- } else {
15621562- let repo_state = RepoState::backfilling(rng.next_u64());
15631563- batch.insert(&db.repos, &did_key, ser_repo_state(&repo_state)?);
15641564- batch.insert(
15651565- &db.pending,
15661566- keys::pending_key(repo_state.index_id),
15671567- &did_key,
15681568- );
15691569- added += 1;
15701570- transitions.push((GaugeState::Synced, GaugeState::Pending));
15711571- }
15721572- }
15731573-15741574- batch.commit().into_diagnostic()?;
15751575- Ok::<_, miette::Report>((added, transitions))
15761576- })
15771577- .await
15781578- .into_diagnostic()??;
15791579-15801580- if new_count > 0 {
15811581- self.0.db.update_count_async("repos", new_count).await;
15821582- }
15831583- for (old, new) in transitions {
15841584- self.0.db.update_gauge_diff_async(&old, &new).await;
15851585- }
15861586- self.0.notify_backfill();
15871587- Ok(())
15881588- }
15891589-15901590- /// stop tracking one or more repositories. hydrant will stop processing new events
15911591- /// for them and remove them from the pending/resync queues, but existing indexed
15921592- /// records are **not** deleted.
15931593- pub async fn untrack(&self, dids: impl IntoIterator<Item = Did<'_>>) -> Result<()> {
15941594- let dids: Vec<Did<'static>> = dids.into_iter().map(|d| d.into_static()).collect();
15951595- let state = self.0.clone();
15961596-15971597- let gauge_decrements = tokio::task::spawn_blocking(move || {
15981598- let db = &state.db;
15991599- let mut batch = db.inner.batch();
16001600- let mut gauge_decrements = Vec::new();
16011601-16021602- for did in &dids {
16031603- let did_key = keys::repo_key(did);
16041604- let repo_bytes = db.repos.get(&did_key).into_diagnostic()?;
16051605- let existing = repo_bytes
16061606- .as_deref()
16071607- .map(db::deser_repo_state)
16081608- .transpose()?;
16091609-16101610- if let Some(repo_state) = existing {
16111611- if repo_state.tracked {
16121612- let resync = db.resync.get(&did_key).into_diagnostic()?;
16131613- let old = db::Db::repo_gauge_state(&repo_state, resync.as_deref());
16141614- let mut repo_state = repo_state.into_static();
16151615- repo_state.tracked = false;
16161616- batch.insert(&db.repos, &did_key, ser_repo_state(&repo_state)?);
16171617- batch.remove(&db.pending, keys::pending_key(repo_state.index_id));
16181618- batch.remove(&db.resync, &did_key);
16191619- if old != GaugeState::Synced {
16201620- gauge_decrements.push(old);
16211621- }
16221622- }
16231623- }
16241624- }
16251625-16261626- batch.commit().into_diagnostic()?;
16271627- Ok::<_, miette::Report>(gauge_decrements)
16281628- })
16291629- .await
16301630- .into_diagnostic()??;
16311631-16321632- for gauge in gauge_decrements {
16331633- self.0
16341634- .db
16351635- .update_gauge_diff_async(&gauge, &GaugeState::Synced)
16361636- .await;
16371637- }
16381638- Ok(())
16391639- }
16401640-}
16411641-16421642-pub(crate) fn repo_state_to_info(did: Did<'static>, s: RepoState<'_>) -> RepoInfo {
16431643- RepoInfo {
16441644- did,
16451645- status: s.status,
16461646- tracked: s.tracked,
16471647- rev: s.rev.map(|r| r.to_tid()),
16481648- data: s.data,
16491649- handle: s.handle.map(|h| h.into_static()),
16501650- pds: s.pds.and_then(|p| p.parse().ok()),
16511651- signing_key: s.signing_key.map(|k| k.encode()),
16521652- last_updated_at: DateTime::from_timestamp_secs(s.last_updated_at),
16531653- last_message_at: s.last_message_time.and_then(DateTime::from_timestamp_secs),
16541654- }
16551655-}
16561656-16571657-/// control over database maintenance operations.
16581658-///
16591659-/// all methods pause the crawler, firehose, and backfill worker for the duration
16601660-/// of the operation and restore their prior state on completion, whether or not
16611661-/// the operation succeeds.
16621662-#[derive(Clone)]
16631663-pub struct DbControl(Arc<AppState>);
16641664-16651665-impl DbControl {
16661666- /// trigger a major compaction of all keyspaces in parallel.
16671667- ///
16681668- /// compaction reclaims disk space from deleted/updated keys and improves
16691669- /// read performance. can take several minutes on large datasets.
16701670- pub async fn compact(&self) -> Result<()> {
16711671- let state = self.0.clone();
16721672- state
16731673- .with_ingestion_paused(async || state.db.compact().await)
16741674- .await
16751675- }
16761676-16771677- /// train zstd compression dictionaries for the `repos`, `blocks`, and `events` keyspaces.
16781678- ///
16791679- /// dictionaries are written to `dict_{name}.bin` files next to the database.
16801680- /// a restart is required to apply them. training samples data blocks from the
16811681- /// existing database, so the database must have a reasonable amount of data first.
16821682- pub async fn train_dicts(&self) -> Result<()> {
16831683- let state = self.0.clone();
16841684- state
16851685- .with_ingestion_paused(async || {
16861686- let train = |name: &'static str| {
16871687- let db = state.db.clone();
16881688- tokio::task::spawn_blocking(move || db.train_dict(name))
16891689- .map(|res| res.into_diagnostic().flatten())
16901690- };
16911691- tokio::try_join!(train("repos"), train("blocks"), train("events")).map(|_| ())
16921692- })
16931693- .await
16941694- }
16951695-}
/// a single record read from a repository, as returned by [`RepoHandle::get_record`].
pub struct Record {
    /// the repository the record was read from.
    pub did: Did<'static>,
    /// content identifier stored for the record.
    pub cid: Cid<'static>,
    /// the record body, decoded from its dag-cbor block.
    pub value: Data<'static>,
}
/// one entry of a [`RecordList`], as produced by [`RepoHandle::list_records`].
pub struct ListedRecord {
    /// record key within the listed collection.
    pub rkey: Rkey<'static>,
    /// content identifier stored for the record.
    pub cid: Cid<'static>,
    /// the record body decoded from its dag-cbor block; `Data::Null` when the
    /// block could not be decoded.
    pub value: Data<'static>,
}
/// one page of records returned by [`RepoHandle::list_records`].
pub struct RecordList {
    /// the records on this page, in iteration order.
    pub records: Vec<ListedRecord>,
    /// cursor to pass to the next [`RepoHandle::list_records`] call; `None` when
    /// the listing did not hit the requested limit.
    pub cursor: Option<Rkey<'static>>,
}
/// handle to access data related to this repository.
///
/// every accessor reads from the database on the blocking thread pool.
#[derive(Clone)]
pub struct RepoHandle<'i> {
    // shared application state; gives access to the database keyspaces
    state: Arc<AppState>,
    /// the DID of the repository this handle refers to.
    pub did: Did<'i>,
}
17201720-17211721-impl<'i> RepoHandle<'i> {
17221722- pub async fn info(&self) -> Result<Option<RepoInfo>> {
17231723- let did_key = keys::repo_key(&self.did);
17241724- let state = self.state.clone();
17251725- let did = self.did.clone().into_static();
17261726-17271727- tokio::task::spawn_blocking(move || {
17281728- let bytes = state.db.repos.get(&did_key).into_diagnostic()?;
17291729- let state = bytes.as_deref().map(db::deser_repo_state).transpose()?;
17301730- Ok(state.map(|s| repo_state_to_info(did, s)))
17311731- })
17321732- .await
17331733- .into_diagnostic()?
17341734- }
17351735-17361736- pub async fn get_record(&self, collection: &str, rkey: &str) -> Result<Option<Record>> {
17371737- let did = self.did.clone().into_static();
17381738- let db_key = keys::record_key(&did, collection, &DbRkey::new(rkey));
17391739-17401740- let collection = collection.to_smolstr();
17411741- let state = self.state.clone();
17421742- tokio::task::spawn_blocking(move || {
17431743- use miette::WrapErr;
17441744-17451745- let cid_bytes = state.db.records.get(db_key).into_diagnostic()?;
17461746- let Some(cid_bytes) = cid_bytes else {
17471747- return Ok(None);
17481748- };
17491749-17501750- // lookup block using col|cid key
17511751- let block_key = keys::block_key(&collection, &cid_bytes);
17521752- let Some(block_bytes) = state.db.blocks.get(block_key).into_diagnostic()? else {
17531753- miette::bail!("block {cid_bytes:?} not found, this is a bug!!");
17541754- };
17551755-17561756- let value = serde_ipld_dagcbor::from_slice::<Data>(&block_bytes)
17571757- .into_diagnostic()
17581758- .wrap_err("cant parse block")?
17591759- .into_static();
17601760- let cid = Cid::new(&cid_bytes)
17611761- .into_diagnostic()
17621762- .wrap_err("cant parse block cid")?;
17631763- let cid = Cid::Str(cid.to_cowstr().into_static());
17641764-17651765- Ok(Some(Record { did, cid, value }))
17661766- })
17671767- .await
17681768- .into_diagnostic()?
17691769- }
17701770-17711771- pub async fn list_records(
17721772- &self,
17731773- collection: &str,
17741774- limit: usize,
17751775- reverse: bool,
17761776- cursor: Option<&str>,
17771777- ) -> Result<RecordList> {
17781778- let did = self.did.clone().into_static();
17791779-17801780- let state = self.state.clone();
17811781- let prefix = keys::record_prefix_collection(&did, collection);
17821782- let collection = collection.to_smolstr();
17831783- let cursor = cursor.map(|c| c.to_smolstr());
17841784-17851785- tokio::task::spawn_blocking(move || {
17861786- let mut results = Vec::new();
17871787- let mut next_cursor = None;
17881788-17891789- let iter: Box<dyn Iterator<Item = _>> = if !reverse {
17901790- let mut end_prefix = prefix.clone();
17911791- if let Some(last) = end_prefix.last_mut() {
17921792- *last += 1;
17931793- }
17941794-17951795- let end_key = if let Some(cursor) = &cursor {
17961796- let mut k = prefix.clone();
17971797- k.extend_from_slice(cursor.as_bytes());
17981798- k
17991799- } else {
18001800- end_prefix
18011801- };
18021802-18031803- Box::new(
18041804- state
18051805- .db
18061806- .records
18071807- .range(prefix.as_slice()..end_key.as_slice())
18081808- .rev(),
18091809- )
18101810- } else {
18111811- let start_key = if let Some(cursor) = &cursor {
18121812- let mut k = prefix.clone();
18131813- k.extend_from_slice(cursor.as_bytes());
18141814- k.push(0);
18151815- k
18161816- } else {
18171817- prefix.clone()
18181818- };
18191819-18201820- Box::new(state.db.records.range(start_key.as_slice()..))
18211821- };
18221822-18231823- for item in iter {
18241824- let (key, cid_bytes) = item.into_inner().into_diagnostic()?;
18251825-18261826- if !key.starts_with(prefix.as_slice()) {
18271827- break;
18281828- }
18291829-18301830- let rkey = keys::parse_rkey(&key[prefix.len()..])?;
18311831- if results.len() >= limit {
18321832- next_cursor = Some(rkey);
18331833- break;
18341834- }
18351835-18361836- // look up using col|cid key built from collection and binary cid bytes
18371837- if let Ok(Some(block_bytes)) = state
18381838- .db
18391839- .blocks
18401840- .get(&keys::block_key(collection.as_str(), &cid_bytes))
18411841- {
18421842- let value: Data =
18431843- serde_ipld_dagcbor::from_slice(&block_bytes).unwrap_or(Data::Null);
18441844- let cid = Cid::new(&cid_bytes).into_diagnostic()?;
18451845- let cid = Cid::Str(cid.to_cowstr().into_static());
18461846- results.push(ListedRecord {
18471847- rkey: Rkey::new_cow(CowStr::Owned(rkey.to_smolstr()))
18481848- .expect("that rkey is validated"),
18491849- cid,
18501850- value: value.into_static(),
18511851- });
18521852- }
18531853- }
18541854- Result::<_, miette::Report>::Ok((results, next_cursor))
18551855- })
18561856- .await
18571857- .into_diagnostic()?
18581858- .map(|(records, next_cursor)| RecordList {
18591859- records,
18601860- cursor: next_cursor.map(|rkey| {
18611861- Rkey::new_cow(CowStr::Owned(rkey.to_smolstr())).expect("that rkey is validated")
18621862- }),
18631863- })
18641864- }
18651865-18661866- pub async fn count_records(&self, collection: &str) -> Result<u64> {
18671867- let did = self.did.clone().into_static();
18681868- let state = self.state.clone();
18691869- let collection = collection.to_string();
18701870- tokio::task::spawn_blocking(move || db::get_record_count(&state.db, &did, &collection))
18711871- .await
18721872- .into_diagnostic()?
18731873- }
18741874-}
18751875-18761876-fn event_stream_thread(state: Arc<AppState>, tx: mpsc::Sender<Event>, cursor: Option<u64>) {
18771877- let db = &state.db;
18781878- let mut event_rx = db.event_tx.subscribe();
18791879- let ks = db.events.clone();
18801880- let mut current_id = match cursor {
18811881- Some(c) => c.saturating_sub(1),
18821882- None => db.next_event_id.load(Ordering::SeqCst).saturating_sub(1),
18831883- };
18841884-18851885- loop {
18861886- // catch up from db
18871887- loop {
18881888- let mut found = false;
18891889- for item in ks.range(keys::event_key(current_id + 1)..) {
18901890- let (k, v) = match item.into_inner() {
18911891- Ok(kv) => kv,
18921892- Err(e) => {
18931893- error!(err = %e, "failed to read event from db");
18941894- break;
18951895- }
18961896- };
18971897-18981898- let id = match k.as_ref().try_into().map(u64::from_be_bytes) {
18991899- Ok(id) => id,
19001900- Err(_) => {
19011901- error!("failed to parse event id");
19021902- continue;
19031903- }
19041904- };
19051905- current_id = id;
19061906-19071907- let stored: StoredEvent = match rmp_serde::from_slice(&v) {
19081908- Ok(e) => e,
19091909- Err(e) => {
19101910- error!(err = %e, "failed to deserialize stored event");
19111911- continue;
19121912- }
19131913- };
19141914-19151915- let Some(evt) = stored_to_event(&state, id, stored) else {
19161916- continue;
19171917- };
19181918-19191919- if tx.blocking_send(evt).is_err() {
19201920- return; // receiver dropped
19211921- }
19221922- found = true;
19231923- }
19241924- if !found {
19251925- break;
19261926- }
19271927- }
19281928-19291929- // wait for live events
19301930- match event_rx.blocking_recv() {
19311931- Ok(BroadcastEvent::Persisted(_)) => {} // re-run catch-up
19321932- Ok(BroadcastEvent::Ephemeral(evt)) => {
19331933- if tx.blocking_send(*evt).is_err() {
19341934- return;
19351935- }
19361936- }
19371937- Err(tokio::sync::broadcast::error::RecvError::Lagged(_)) => {}
19381938- Err(tokio::sync::broadcast::error::RecvError::Closed) => break,
19391939- }
19401940- }
19411941-}
19421942-19431943-fn stored_to_event(state: &AppState, id: u64, stored: StoredEvent<'_>) -> Option<Event> {
19441944- let StoredEvent {
19451945- live,
19461946- did,
19471947- rev,
19481948- collection,
19491949- rkey,
19501950- action,
19511951- data,
19521952- } = stored;
19531953-19541954- let record = match data {
19551955- StoredData::Ptr(cid) => {
19561956- let block = state
19571957- .db
19581958- .blocks
19591959- .get(&keys::block_key(collection.as_str(), &cid.to_bytes()));
19601960- match block {
19611961- Ok(Some(bytes)) => match serde_ipld_dagcbor::from_slice::<RawData>(&bytes) {
19621962- Ok(val) => Some((cid, serde_json::to_value(val).ok()?)),
19631963- Err(e) => {
19641964- error!(err = %e, "cant parse block");
19651965- return None;
19661966- }
19671967- },
19681968- Ok(None) => {
19691969- error!("block not found, this is a bug");
19701970- return None;
19711971- }
19721972- Err(e) => {
19731973- error!(err = %e, "cant get block");
19741974- db::check_poisoned(&e);
19751975- return None;
19761976- }
19771977- }
19781978- }
19791979- StoredData::Block(block) => {
19801980- let digest = Sha256::digest(&block);
19811981- let hash =
19821982- cid::multihash::Multihash::wrap(ATP_CID_HASH, &digest).expect("valid sha256 hash");
19831983- let cid = IpldCid::new_v1(DAG_CBOR_CID_CODEC, hash);
19841984- match serde_ipld_dagcbor::from_slice::<RawData>(&block) {
19851985- Ok(val) => Some((cid, serde_json::to_value(val).ok()?)),
19861986- Err(e) => {
19871987- error!(err = %e, "cant parse block");
19881988- return None;
19891989- }
19901990- }
19911991- }
19921992- StoredData::Nothing => None,
19931993- };
19941994-19951995- let (cid, record) = record
19961996- .map(|(c, r)| (Some(c), Some(r)))
19971997- .unwrap_or((None, None));
19981998-19991999- Some(MarshallableEvt {
20002000- id,
20012001- kind: crate::types::EventType::Record,
20022002- record: Some(RecordEvt {
20032003- live,
20042004- did: did.to_did(),
20052005- rev: rev.to_tid(),
20062006- collection: Nsid::new_cow(collection.clone().into_static())
20072007- .expect("that collection is already validated"),
20082008- rkey: Rkey::new_cow(rkey.to_cowstr().into_static())
20092009- .expect("that rkey is already validated"),
20102010- action: CowStr::Borrowed(action.as_str()),
20112011- record,
20122012- cid,
20132013- }),
20142014- identity: None,
20152015- account: None,
20162016- })
20172017-}
+261
src/control/crawler.rs
···11+use std::sync::Arc;
22+33+use miette::{IntoDiagnostic, Result};
44+use tokio::sync::{mpsc, watch};
55+use tracing::{error, info};
66+use url::Url;
77+88+use crate::db::keys;
99+use crate::state::AppState;
/// handle to one spawned crawler producer task.
///
/// dropping the handle aborts the task (see the `Drop` impl).
pub(super) struct ProducerHandle {
    // the mode the producer was spawned with; reported by `list_sources`
    mode: crate::config::CrawlerMode,
    // abort handle of the spawned tokio task
    abort: tokio::task::AbortHandle,
}
impl Drop for ProducerHandle {
    // abort the producer task as soon as its handle goes away, so removing or
    // replacing a map entry is enough to stop the task
    fn drop(&mut self) {
        self.abort.abort();
    }
}
/// resources shared by all crawler producers; borrowed by `spawn_crawler_producer`
/// each time a producer is started.
pub(super) struct CrawlerShared {
    // http client handed to by-collection producers
    pub(super) http: reqwest::Client,
    // signal checker handed to relay producers
    pub(super) checker: crate::crawler::SignalChecker,
    // in-flight tracker shared by every producer
    // NOTE(review): presumably de-duplicates repos currently being checked — confirm
    pub(super) in_flight: crate::crawler::InFlight,
    // sending side of the channel producers push `CrawlerBatch` values into
    pub(super) tx: mpsc::Sender<crate::crawler::CrawlerBatch>,
    // stats handle shared by every producer
    pub(super) stats: crate::crawler::CrawlerStats,
}
/// a snapshot of a single crawler source's runtime state, as returned by
/// [`CrawlerHandle::list_sources`].
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CrawlerSourceInfo {
    /// the url the source crawls; sources are keyed by this url.
    pub url: Url,
    /// the mode the source's producer is running in.
    pub mode: crate::config::CrawlerMode,
    /// whether this source is persisted in the database (i.e. it was dynamically added
    /// and will survive restarts). config-sourced entries have `persisted: false`.
    pub persisted: bool,
}
3939+4040+pub(super) fn spawn_crawler_producer(
4141+ source: &crate::config::CrawlerSource,
4242+ http: &reqwest::Client,
4343+ state: &Arc<AppState>,
4444+ checker: &crate::crawler::SignalChecker,
4545+ in_flight: &crate::crawler::InFlight,
4646+ tx: &mpsc::Sender<crate::crawler::CrawlerBatch>,
4747+ stats: &crate::crawler::CrawlerStats,
4848+ enabled: watch::Receiver<bool>,
4949+) -> ProducerHandle {
5050+ use crate::config::CrawlerMode;
5151+ use crate::crawler::{ByCollectionProducer, RelayProducer};
5252+ use std::time::Duration;
5353+ use tracing::Instrument;
5454+5555+ let abort = match source.mode {
5656+ CrawlerMode::Relay => {
5757+ info!(relay = %source.url, enabled = *state.crawler_enabled.borrow(), "starting relay crawler");
5858+ let span = tracing::info_span!("crawl", url = %source.url);
5959+ tokio::spawn(
6060+ RelayProducer {
6161+ relay_url: source.url.clone(),
6262+ checker: checker.clone(),
6363+ in_flight: in_flight.clone(),
6464+ tx: tx.clone(),
6565+ enabled,
6666+ stats: stats.clone(),
6767+ }
6868+ .run()
6969+ .instrument(span),
7070+ )
7171+ .abort_handle()
7272+ }
7373+ CrawlerMode::ByCollection => {
7474+ info!(
7575+ host = source.url.host_str(),
7676+ enabled = *state.crawler_enabled.borrow(),
7777+ "starting by-collection crawler"
7878+ );
7979+ let span = tracing::info_span!("by_collection", host = source.url.host_str());
8080+ let http = http.clone();
8181+ let state = state.clone();
8282+ let in_flight = in_flight.clone();
8383+ let tx = tx.clone();
8484+ let stats = stats.clone();
8585+ let url = source.url.clone();
8686+ tokio::spawn(
8787+ async move {
8888+ loop {
8989+ let producer = ByCollectionProducer {
9090+ index_url: url.clone(),
9191+ http: http.clone(),
9292+ state: state.clone(),
9393+ in_flight: in_flight.clone(),
9494+ tx: tx.clone(),
9595+ enabled: enabled.clone(),
9696+ stats: stats.clone(),
9797+ };
9898+ if let Err(e) = producer.run().await {
9999+ error!(err = ?e, "by-collection crawler fatal error, restarting in 30s");
100100+ tokio::time::sleep(Duration::from_secs(30)).await;
101101+ }
102102+ }
103103+ }
104104+ .instrument(span),
105105+ )
106106+ .abort_handle()
107107+ }
108108+ };
109109+ ProducerHandle {
110110+ mode: source.mode,
111111+ abort,
112112+ }
113113+}
/// runtime control over the crawler component.
///
/// the crawler walks `com.atproto.sync.listRepos` on each configured relay to discover
/// repositories that have never emitted a firehose event. in `filter` mode it also
/// checks each discovered repo against the configured signal collections before
/// enqueuing it for backfill.
///
/// disabling the crawler does not affect in-progress repo checks. each one completes
/// its current PDS request before pausing.
///
/// all fields are behind `Arc`, so clones share the same state.
#[derive(Clone)]
pub struct CrawlerHandle {
    /// shared application state (database and the crawler's enabled flag).
    pub(super) state: Arc<AppState>,
    /// set once by [`Hydrant::run`]; while it is unset, run() has not been called yet
    /// and adding or removing sources fails.
    pub(super) shared: Arc<std::sync::OnceLock<CrawlerShared>>,
    /// per-source running tasks, keyed by url. removing or replacing an entry
    /// aborts its task.
    pub(super) tasks: Arc<scc::HashMap<Url, ProducerHandle>>,
    /// set of urls persisted in the database (dynamically added sources).
    pub(super) persisted: Arc<scc::HashSet<Url>>,
}
134134+135135+impl CrawlerHandle {
136136+ /// enable the crawler (enables all configured producers). no-op if already enabled.
137137+ pub fn enable(&self) {
138138+ self.state.crawler_enabled.send_replace(true);
139139+ }
140140+ /// disable the crawler (disables all configured producers).
141141+ /// in-progress repo checks finish before the crawler pauses.
142142+ pub fn disable(&self) {
143143+ self.state.crawler_enabled.send_replace(false);
144144+ }
145145+ /// returns the current enabled state of the crawler.
146146+ pub fn is_enabled(&self) -> bool {
147147+ *self.state.crawler_enabled.borrow()
148148+ }
149149+150150+ /// delete all cursor entries associated with the given URL.
151151+ pub async fn reset_cursor(&self, url: &str) -> Result<()> {
152152+ let db = self.state.db.clone();
153153+ let point_keys = [keys::crawler_cursor_key(url)];
154154+ let by_collection_prefix = keys::by_collection_cursor_prefix(url);
155155+ tokio::task::spawn_blocking(move || {
156156+ let mut batch = db.inner.batch();
157157+ for k in point_keys {
158158+ batch.remove(&db.cursors, k);
159159+ }
160160+ for entry in db.cursors.prefix(&by_collection_prefix) {
161161+ let k = entry.key().into_diagnostic()?;
162162+ batch.remove(&db.cursors, k);
163163+ }
164164+ batch.commit().into_diagnostic()
165165+ })
166166+ .await
167167+ .into_diagnostic()??;
168168+ Ok(())
169169+ }
170170+171171+ /// return info on all currently active crawler sources.
172172+ ///
173173+ /// returns an empty list if called before [`Hydrant::run`].
174174+ pub async fn list_sources(&self) -> Vec<CrawlerSourceInfo> {
175175+ let mut sources = Vec::new();
176176+ self.tasks
177177+ .iter_async(|url, h| {
178178+ sources.push(CrawlerSourceInfo {
179179+ url: url.clone(),
180180+ mode: h.mode,
181181+ persisted: self.persisted.contains_sync(url),
182182+ });
183183+ true
184184+ })
185185+ .await;
186186+ sources
187187+ }
188188+189189+ /// add a new crawler source at runtime.
190190+ ///
191191+ /// the source is persisted to the database and will be re-spawned on restart.
192192+ /// if a source with the same URL already exists, it is replaced (the old task is
193193+ /// aborted and a new one is started with the new mode).
194194+ ///
195195+ /// returns an error if called before [`Hydrant::run`].
196196+ pub async fn add_source(&self, source: crate::config::CrawlerSource) -> Result<()> {
197197+ let Some(shared) = self.shared.get() else {
198198+ miette::bail!("crawler not yet started: call Hydrant::run() first");
199199+ };
200200+201201+ let db = self.state.db.clone();
202202+ let key = keys::crawler_source_key(source.url.as_str());
203203+ let val = rmp_serde::to_vec(&source.mode).into_diagnostic()?;
204204+ tokio::task::spawn_blocking(move || db.crawler.insert(key, val).into_diagnostic())
205205+ .await
206206+ .into_diagnostic()??;
207207+208208+ let enabled_rx = self.state.crawler_enabled.subscribe();
209209+ let handle = spawn_crawler_producer(
210210+ &source,
211211+ &shared.http,
212212+ &self.state,
213213+ &shared.checker,
214214+ &shared.in_flight,
215215+ &shared.tx,
216216+ &shared.stats,
217217+ enabled_rx,
218218+ );
219219+220220+ let _ = self.persisted.insert_async(source.url.clone()).await;
221221+ match self.tasks.entry_async(source.url).await {
222222+ scc::hash_map::Entry::Vacant(e) => {
223223+ e.insert_entry(handle);
224224+ }
225225+ scc::hash_map::Entry::Occupied(mut e) => {
226226+ *e.get_mut() = handle;
227227+ }
228228+ }
229229+ Ok(())
230230+ }
231231+232232+ /// remove a crawler source at runtime by URL.
233233+ ///
234234+ /// aborts the running producer task and removes the source from the database if it
235235+ /// was dynamically added. config-sourced entries are aborted but not persisted, so
236236+ /// they will reappear on restart.
237237+ ///
238238+ /// returns `true` if a source with the given URL was found and removed.
239239+ /// returns an error if called before [`Hydrant::run`].
240240+ pub async fn remove_source(&self, url: &Url) -> Result<bool> {
241241+ if self.shared.get().is_none() {
242242+ miette::bail!("crawler not yet started: call Hydrant::run() first");
243243+ }
244244+245245+ // dropping the ProducerHandle aborts the task via Drop
246246+ if self.tasks.remove_async(url).await.is_none() {
247247+ return Ok(false);
248248+ }
249249+250250+ // remove from DB if it was a persisted source
251251+ if self.persisted.remove_async(url).await.is_some() {
252252+ let db = self.state.db.clone();
253253+ let key = keys::crawler_source_key(url.as_str());
254254+ tokio::task::spawn_blocking(move || db.crawler.remove(key).into_diagnostic())
255255+ .await
256256+ .into_diagnostic()??;
257257+ }
258258+259259+ Ok(true)
260260+ }
261261+}
+312
src/control/filter.rs
···11+use std::sync::Arc;
22+33+use miette::{IntoDiagnostic, Result};
44+55+use crate::db::filter as db_filter;
66+use crate::filter::{FilterMode, SetUpdate};
77+use crate::state::AppState;
/// a point-in-time snapshot of the filter configuration. returned by
/// [`FilterControl::get`] and [`FilterPatch::apply`].
///
/// because the filter is stored in the database and loaded on demand, this snapshot
/// may be stale if another caller modifies the filter concurrently. for the authoritative
/// live config use [`FilterControl::get`].
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct FilterSnapshot {
    /// the indexing mode.
    pub mode: FilterMode,
    /// the configured signals, rendered as strings.
    pub signals: Vec<String>,
    /// the configured collections, rendered as strings.
    pub collections: Vec<String>,
    /// the contents of the excludes set.
    pub excludes: Vec<String>,
}
/// runtime control over the indexing filter.
///
/// the filter has two orthogonal axes:
///
/// **mode** controls discovery:
/// - [`FilterMode::Filter`]: only indexes repos whose firehose commits touch a collection
///   matching a configured `signal`. explicit [`ReposControl::track`] always works regardless.
/// - [`FilterMode::Full`]: indexes the entire network. `signals` are ignored for discovery
///   but `collections` and `excludes` still apply.
///
/// **sets** are each independently configurable:
/// - `signals`: NSID patterns that trigger auto-discovery in `filter` mode (e.g. `app.bsky.feed.post`, `app.bsky.graph.*`)
/// - `collections`: NSID patterns that filter which records are *stored*. empty means store all.
/// - `excludes`: DIDs that are always skipped regardless of mode.
///
/// NSID patterns support an optional `.*` suffix to match an entire namespace.
///
/// the mutation methods do not change anything by themselves: each returns a
/// [`FilterPatch`], and nothing is written until [`FilterPatch::apply`] is awaited.
/// applying a patch persists it to the database and replaces the in-memory filter.
#[derive(Clone)]
pub struct FilterControl(pub(super) Arc<AppState>);
4141+4242+impl FilterControl {
4343+ /// return the current filter configuration from the database.
4444+ pub async fn get(&self) -> Result<FilterSnapshot> {
4545+ let filter_ks = self.0.db.filter.clone();
4646+ tokio::task::spawn_blocking(move || {
4747+ let hot = db_filter::load(&filter_ks)?;
4848+ let excludes = db_filter::read_set(&filter_ks, db_filter::EXCLUDE_PREFIX)?;
4949+ Ok(FilterSnapshot {
5050+ mode: hot.mode,
5151+ signals: hot.signals.iter().map(|s| s.to_string()).collect(),
5252+ collections: hot.collections.iter().map(|s| s.to_string()).collect(),
5353+ excludes,
5454+ })
5555+ })
5656+ .await
5757+ .into_diagnostic()?
5858+ }
5959+6060+ /// set the indexing mode. see [`FilterControl`] for mode semantics.
6161+ pub fn set_mode(&self, mode: FilterMode) -> FilterPatch {
6262+ FilterPatch::new(self).set_mode(mode)
6363+ }
6464+6565+ /// replace the entire signals set. existing signals are removed.
6666+ pub fn set_signals(&self, signals: impl IntoIterator<Item = impl Into<String>>) -> FilterPatch {
6767+ FilterPatch::new(self).set_signals(signals)
6868+ }
6969+7070+ /// add multiple signals without disturbing existing ones.
7171+ pub fn append_signals(
7272+ &self,
7373+ signals: impl IntoIterator<Item = impl Into<String>>,
7474+ ) -> FilterPatch {
7575+ FilterPatch::new(self).append_signals(signals)
7676+ }
7777+7878+ /// add a single signal. no-op if already present.
7979+ pub fn add_signal(&self, signal: impl Into<String>) -> FilterPatch {
8080+ FilterPatch::new(self).add_signal(signal)
8181+ }
8282+8383+ /// remove a single signal. no-op if not present.
8484+ pub fn remove_signal(&self, signal: impl Into<String>) -> FilterPatch {
8585+ FilterPatch::new(self).remove_signal(signal)
8686+ }
8787+8888+ /// replace the entire collections set. pass an empty iterator to store all collections.
8989+ pub fn set_collections(
9090+ &self,
9191+ collections: impl IntoIterator<Item = impl Into<String>>,
9292+ ) -> FilterPatch {
9393+ FilterPatch::new(self).set_collections(collections)
9494+ }
9595+9696+ /// add multiple collections without disturbing existing ones.
9797+ pub fn append_collections(
9898+ &self,
9999+ collections: impl IntoIterator<Item = impl Into<String>>,
100100+ ) -> FilterPatch {
101101+ FilterPatch::new(self).append_collections(collections)
102102+ }
103103+104104+ /// add a single collection filter. no-op if already present.
105105+ pub fn add_collection(&self, collection: impl Into<String>) -> FilterPatch {
106106+ FilterPatch::new(self).add_collection(collection)
107107+ }
108108+109109+ /// remove a single collection filter. no-op if not present.
110110+ pub fn remove_collection(&self, collection: impl Into<String>) -> FilterPatch {
111111+ FilterPatch::new(self).remove_collection(collection)
112112+ }
113113+114114+ /// replace the entire excludes set.
115115+ pub fn set_excludes(
116116+ &self,
117117+ excludes: impl IntoIterator<Item = impl Into<String>>,
118118+ ) -> FilterPatch {
119119+ FilterPatch::new(self).set_excludes(excludes)
120120+ }
121121+122122+ /// add multiple DIDs to the excludes set without disturbing existing ones.
123123+ pub fn append_excludes(
124124+ &self,
125125+ excludes: impl IntoIterator<Item = impl Into<String>>,
126126+ ) -> FilterPatch {
127127+ FilterPatch::new(self).append_excludes(excludes)
128128+ }
129129+130130+ /// add a single DID to the excludes set. no-op if already excluded.
131131+ pub fn add_exclude(&self, did: impl Into<String>) -> FilterPatch {
132132+ FilterPatch::new(self).add_exclude(did)
133133+ }
134134+135135+ /// remove a single DID from the excludes set. no-op if not present.
136136+ pub fn remove_exclude(&self, did: impl Into<String>) -> FilterPatch {
137137+ FilterPatch::new(self).remove_exclude(did)
138138+ }
139139+}
140140+141141+/// a staged set of filter mutations. all methods accumulate changes without touching
142142+/// the database. call [`FilterPatch::apply`] to commit the entire patch atomically.
143143+///
144144+/// obtain an instance by calling any mutation method on [`FilterControl`], or via
145145+/// [`FilterPatch::new`] to start from a blank patch.
146146+pub struct FilterPatch {
147147+ state: Arc<AppState>,
148148+ /// if set, replaces the current indexing mode.
149149+ pub mode: Option<FilterMode>,
150150+ /// if set, replaces or patches the signals set.
151151+ pub signals: Option<SetUpdate>,
152152+ /// if set, replaces or patches the collections set.
153153+ pub collections: Option<SetUpdate>,
154154+ /// if set, replaces or patches the excludes set.
155155+ pub excludes: Option<SetUpdate>,
156156+}
157157+158158+impl FilterPatch {
159159+ /// create a new blank patch associated with the given [`FilterControl`].
160160+ pub fn new(control: &FilterControl) -> Self {
161161+ Self {
162162+ state: control.0.clone(),
163163+ mode: None,
164164+ signals: None,
165165+ collections: None,
166166+ excludes: None,
167167+ }
168168+ }
169169+170170+ /// set the indexing mode. see [`FilterControl`] for mode semantics.
171171+ pub fn set_mode(mut self, mode: FilterMode) -> Self {
172172+ self.mode = Some(mode);
173173+ self
174174+ }
175175+176176+ /// replace the entire signals set. existing signals are removed.
177177+ pub fn set_signals(mut self, signals: impl IntoIterator<Item = impl Into<String>>) -> Self {
178178+ self.signals = Some(SetUpdate::Set(
179179+ signals.into_iter().map(Into::into).collect(),
180180+ ));
181181+ self
182182+ }
183183+184184+ /// add multiple signals without disturbing existing ones.
185185+ pub fn append_signals(mut self, signals: impl IntoIterator<Item = impl Into<String>>) -> Self {
186186+ self.signals = Some(SetUpdate::Patch(
187187+ signals.into_iter().map(|s| (s.into(), true)).collect(),
188188+ ));
189189+ self
190190+ }
191191+192192+ /// add a single signal. no-op if already present.
193193+ pub fn add_signal(mut self, signal: impl Into<String>) -> Self {
194194+ self.signals = Some(SetUpdate::Patch([(signal.into(), true)].into()));
195195+ self
196196+ }
197197+198198+ /// remove a single signal. no-op if not present.
199199+ pub fn remove_signal(mut self, signal: impl Into<String>) -> Self {
200200+ self.signals = Some(SetUpdate::Patch([(signal.into(), false)].into()));
201201+ self
202202+ }
203203+204204+ /// replace the entire collections set. pass an empty iterator to store all collections.
205205+ pub fn set_collections(
206206+ mut self,
207207+ collections: impl IntoIterator<Item = impl Into<String>>,
208208+ ) -> Self {
209209+ self.collections = Some(SetUpdate::Set(
210210+ collections.into_iter().map(Into::into).collect(),
211211+ ));
212212+ self
213213+ }
214214+215215+ /// add multiple collections without disturbing existing ones.
216216+ pub fn append_collections(
217217+ mut self,
218218+ collections: impl IntoIterator<Item = impl Into<String>>,
219219+ ) -> Self {
220220+ self.collections = Some(SetUpdate::Patch(
221221+ collections.into_iter().map(|c| (c.into(), true)).collect(),
222222+ ));
223223+ self
224224+ }
225225+226226+ /// add a single collection filter. no-op if already present.
227227+ pub fn add_collection(mut self, collection: impl Into<String>) -> Self {
228228+ self.collections = Some(SetUpdate::Patch([(collection.into(), true)].into()));
229229+ self
230230+ }
231231+232232+ /// remove a single collection filter. no-op if not present.
233233+ pub fn remove_collection(mut self, collection: impl Into<String>) -> Self {
234234+ self.collections = Some(SetUpdate::Patch([(collection.into(), false)].into()));
235235+ self
236236+ }
237237+238238+ /// replace the entire excludes set.
239239+ pub fn set_excludes(mut self, excludes: impl IntoIterator<Item = impl Into<String>>) -> Self {
240240+ self.excludes = Some(SetUpdate::Set(
241241+ excludes.into_iter().map(Into::into).collect(),
242242+ ));
243243+ self
244244+ }
245245+246246+ /// add multiple DIDs to the excludes set without disturbing existing ones.
247247+ pub fn append_excludes(
248248+ mut self,
249249+ excludes: impl IntoIterator<Item = impl Into<String>>,
250250+ ) -> Self {
251251+ self.excludes = Some(SetUpdate::Patch(
252252+ excludes.into_iter().map(|d| (d.into(), true)).collect(),
253253+ ));
254254+ self
255255+ }
256256+257257+ /// add a single DID to the excludes set. no-op if already excluded.
258258+ pub fn add_exclude(mut self, did: impl Into<String>) -> Self {
259259+ self.excludes = Some(SetUpdate::Patch([(did.into(), true)].into()));
260260+ self
261261+ }
262262+263263+ /// remove a single DID from the excludes set. no-op if not present.
264264+ pub fn remove_exclude(mut self, did: impl Into<String>) -> Self {
265265+ self.excludes = Some(SetUpdate::Patch([(did.into(), false)].into()));
266266+ self
267267+ }
268268+269269+ /// commit the patch atomically to the database and update the in-memory filter.
270270+ /// returns the updated [`FilterSnapshot`].
271271+ pub async fn apply(self) -> Result<FilterSnapshot> {
272272+ let filter_ks = self.state.db.filter.clone();
273273+ let inner = self.state.db.inner.clone();
274274+ let filter_handle = self.state.filter.clone();
275275+ let mode = self.mode;
276276+ let signals = self.signals;
277277+ let collections = self.collections;
278278+ let excludes = self.excludes;
279279+280280+ let new_filter = tokio::task::spawn_blocking(move || {
281281+ let mut batch = inner.batch();
282282+ db_filter::apply_patch(&mut batch, &filter_ks, mode, signals, collections, excludes)?;
283283+ batch.commit().into_diagnostic()?;
284284+ db_filter::load(&filter_ks)
285285+ })
286286+ .await
287287+ .into_diagnostic()??;
288288+289289+ let exclude_list = {
290290+ let filter_ks = self.state.db.filter.clone();
291291+ tokio::task::spawn_blocking(move || {
292292+ db_filter::read_set(&filter_ks, db_filter::EXCLUDE_PREFIX)
293293+ })
294294+ .await
295295+ .into_diagnostic()??
296296+ };
297297+298298+ let snapshot = FilterSnapshot {
299299+ mode: new_filter.mode,
300300+ signals: new_filter.signals.iter().map(|s| s.to_string()).collect(),
301301+ collections: new_filter
302302+ .collections
303303+ .iter()
304304+ .map(|s| s.to_string())
305305+ .collect(),
306306+ excludes: exclude_list,
307307+ };
308308+309309+ filter_handle.store(Arc::new(new_filter));
310310+ Ok(snapshot)
311311+ }
312312+}
+196
src/control/firehose.rs
···11+use std::sync::Arc;
22+use std::sync::atomic::Ordering;
33+44+use miette::{IntoDiagnostic, Result};
55+use tokio::sync::watch;
66+use tracing::{error, info};
77+use url::Url;
88+99+use crate::db::{self, keys};
1010+use crate::ingest::{BufferTx, firehose::FirehoseIngestor};
1111+use crate::state::AppState;
1212+1313+pub(super) struct FirehoseIngestorHandle {
1414+ abort: tokio::task::AbortHandle,
1515+}
1616+1717+impl Drop for FirehoseIngestorHandle {
1818+ fn drop(&mut self) {
1919+ self.abort.abort();
2020+ }
2121+}
2222+2323+pub(super) struct FirehoseShared {
2424+ pub(super) buffer_tx: BufferTx,
2525+ pub(super) verify_signatures: bool,
2626+}
2727+2828+/// a snapshot of a single firehose relay's runtime state.
2929+#[derive(Debug, Clone, serde::Serialize)]
3030+pub struct FirehoseSourceInfo {
3131+ pub url: Url,
3232+ /// true if added via the API and persisted to the database; false for `RELAY_HOSTS` sources.
3333+ pub persisted: bool,
3434+}
3535+3636+pub(super) async fn spawn_firehose_ingestor(
3737+ relay_url: &Url,
3838+ state: &Arc<AppState>,
3939+ shared: &FirehoseShared,
4040+ enabled: watch::Receiver<bool>,
4141+) -> Result<FirehoseIngestorHandle> {
4242+ use std::sync::atomic::AtomicI64;
4343+4444+ let start = db::get_firehose_cursor(&state.db, relay_url).await?;
4545+ // insert into relay_cursors if not already present; existing in-memory cursor takes precedence
4646+ let _ = state
4747+ .relay_cursors
4848+ .insert_async(relay_url.clone(), AtomicI64::new(start.unwrap_or(0)))
4949+ .await;
5050+5151+ info!(relay = %relay_url, cursor = ?start, "starting firehose ingestor");
5252+5353+ let ingestor = FirehoseIngestor::new(
5454+ state.clone(),
5555+ shared.buffer_tx.clone(),
5656+ relay_url.clone(),
5757+ state.filter.clone(),
5858+ enabled,
5959+ shared.verify_signatures,
6060+ );
6161+6262+ let relay_for_log = relay_url.clone();
6363+ let abort = tokio::spawn(async move {
6464+ if let Err(e) = ingestor.run().await {
6565+ error!(relay = %relay_for_log, err = %e, "firehose ingestor exited with error");
6666+ }
6767+ })
6868+ .abort_handle();
6969+7070+ Ok(FirehoseIngestorHandle { abort })
7171+}
7272+7373+/// runtime control over the firehose ingestor component.
7474+#[derive(Clone)]
7575+pub struct FirehoseHandle {
7676+ pub(super) state: Arc<AppState>,
7777+ /// set once by [`Hydrant::run`]; `None` means run() has not been called yet.
7878+ pub(super) shared: Arc<std::sync::OnceLock<FirehoseShared>>,
7979+ /// per-relay running tasks, keyed by url.
8080+ pub(super) tasks: Arc<scc::HashMap<Url, FirehoseIngestorHandle>>,
8181+ /// set of urls persisted in the database (dynamically added sources).
8282+ pub(super) persisted: Arc<scc::HashSet<Url>>,
8383+}
8484+8585+impl FirehoseHandle {
8686+ /// enable the firehose. no-op if already enabled.
8787+ pub fn enable(&self) {
8888+ self.state.firehose_enabled.send_replace(true);
8989+ }
9090+ /// disable the firehose. the current message finishes processing before the connection closes.
9191+ pub fn disable(&self) {
9292+ self.state.firehose_enabled.send_replace(false);
9393+ }
9494+ /// returns the current enabled state of the firehose.
9595+ pub fn is_enabled(&self) -> bool {
9696+ *self.state.firehose_enabled.borrow()
9797+ }
9898+9999+ /// reset the stored cursor for the given relay URL.
100100+ ///
101101+ /// clears the `firehose_cursor|{url}` entry from the cursors keyspace and zeroes the
102102+ /// in-memory cursor. the next connection will tail live events from the current head.
103103+ pub async fn reset_cursor(&self, url: &str) -> Result<()> {
104104+ let db = self.state.db.clone();
105105+ let key = keys::firehose_cursor_key(url);
106106+ tokio::task::spawn_blocking(move || db.cursors.remove(key).into_diagnostic())
107107+ .await
108108+ .into_diagnostic()??;
109109+110110+ if let Ok(relay_url) = Url::parse(url) {
111111+ self.state.relay_cursors.peek_with(&relay_url, |_, c| {
112112+ c.store(0, Ordering::SeqCst);
113113+ });
114114+ }
115115+ Ok(())
116116+ }
117117+118118+ /// return info on all currently active firehose sources.
119119+ pub async fn list_sources(&self) -> Vec<FirehoseSourceInfo> {
120120+ let mut sources = Vec::new();
121121+ self.tasks
122122+ .iter_async(|url, _| {
123123+ sources.push(FirehoseSourceInfo {
124124+ url: url.clone(),
125125+ persisted: self.persisted.contains_sync(url),
126126+ });
127127+ true
128128+ })
129129+ .await;
130130+ sources
131131+ }
132132+133133+ /// add a new firehose relay at runtime.
134134+ ///
135135+ /// the URL is persisted to the database and will be re-spawned on restart. if a relay with
136136+ /// the same URL already exists it is replaced: the running task is stopped and a new one
137137+ /// is started. any cursor state for that URL is preserved.
138138+ ///
139139+ /// returns an error if called before [`Hydrant::run`].
140140+ pub async fn add_source(&self, url: Url) -> Result<()> {
141141+ let Some(shared) = self.shared.get() else {
142142+ miette::bail!("firehose not yet started: call Hydrant::run() first");
143143+ };
144144+145145+ let db = self.state.db.clone();
146146+ let key = keys::firehose_source_key(url.as_str());
147147+ tokio::task::spawn_blocking(move || db.crawler.insert(key, b"").into_diagnostic())
148148+ .await
149149+ .into_diagnostic()??;
150150+151151+ let enabled_rx = self.state.firehose_enabled.subscribe();
152152+ let handle = spawn_firehose_ingestor(&url, &self.state, shared, enabled_rx).await?;
153153+154154+ let _ = self.persisted.insert_async(url.clone()).await;
155155+ match self.tasks.entry_async(url).await {
156156+ scc::hash_map::Entry::Vacant(e) => {
157157+ e.insert_entry(handle);
158158+ }
159159+ scc::hash_map::Entry::Occupied(mut e) => {
160160+ *e.get_mut() = handle;
161161+ }
162162+ }
163163+ Ok(())
164164+ }
165165+166166+ /// remove a firehose relay at runtime by URL.
167167+ ///
168168+ /// aborts the running ingestor task. if the source was added via the API it is removed from
169169+ /// the database and will not reappear on restart. `RELAY_HOSTS` sources are only stopped for
170170+ /// the current session; they reappear on the next restart.
171171+ ///
172172+ /// returns `true` if the relay was found and removed, `false` if it was not running.
173173+ /// returns an error if called before [`Hydrant::run`].
174174+ pub async fn remove_source(&self, url: &Url) -> Result<bool> {
175175+ if self.shared.get().is_none() {
176176+ miette::bail!("firehose not yet started: call Hydrant::run() first");
177177+ }
178178+179179+ if self.tasks.remove_async(url).await.is_none() {
180180+ return Ok(false);
181181+ }
182182+183183+ // remove from relay_cursors (persist thread will stop tracking it)
184184+ self.state.relay_cursors.remove_async(url).await;
185185+186186+ if self.persisted.remove_async(url).await.is_some() {
187187+ let db = self.state.db.clone();
188188+ let key = keys::firehose_source_key(url.as_str());
189189+ tokio::task::spawn_blocking(move || db.crawler.remove(key).into_diagnostic())
190190+ .await
191191+ .into_diagnostic()??;
192192+ }
193193+194194+ Ok(true)
195195+ }
196196+}
+753
src/control/mod.rs
···11+mod crawler;
22+mod filter;
33+mod firehose;
44+mod repos;
55+mod stream;
66+77+pub use crawler::{CrawlerHandle, CrawlerSourceInfo};
88+pub use filter::{FilterControl, FilterPatch, FilterSnapshot};
99+pub use firehose::{FirehoseHandle, FirehoseSourceInfo};
1010+pub(crate) use repos::repo_state_to_info;
1111+pub use repos::{ListedRecord, Record, RecordList, RepoHandle, RepoInfo, ReposControl};
1212+1313+use std::collections::BTreeMap;
1414+use std::future::Future;
1515+use std::pin::Pin;
1616+use std::sync::Arc;
1717+use std::sync::atomic::{AtomicBool, Ordering};
1818+use std::task::{Context, Poll};
1919+2020+use futures::{FutureExt, Stream};
2121+use miette::{IntoDiagnostic, Result};
2222+use tokio::sync::{mpsc, watch};
2323+use tracing::{debug, error, info};
2424+2525+use crate::backfill::BackfillWorker;
2626+use crate::config::{Config, SignatureVerification};
2727+use crate::db::{
2828+ self, filter as db_filter, load_persisted_crawler_sources, load_persisted_firehose_sources,
2929+};
3030+use crate::filter::FilterMode;
3131+use crate::ingest::worker::FirehoseWorker;
3232+use crate::state::AppState;
3333+use crate::types::MarshallableEvt;
3434+3535+use crawler::{CrawlerShared, spawn_crawler_producer};
3636+use firehose::{FirehoseShared, spawn_firehose_ingestor};
3737+use stream::event_stream_thread;
3838+3939+/// an event emitted by the hydrant event stream.
4040+///
4141+/// three variants are possible depending on the `type` field:
4242+/// - `"record"`: a repo record was created, updated, or deleted. carries a [`RecordEvt`].
4343+/// - `"identity"`: a DID's handle or PDS changed. carries an [`IdentityEvt`]. ephemeral, not replayable.
4444+/// - `"account"`: a repo's active/inactive status changed. carries an [`AccountEvt`]. ephemeral, not replayable.
4545+///
4646+/// the `id` field is a monotonically increasing sequence number usable as a cursor for [`Hydrant::subscribe`].
4747+pub type Event = MarshallableEvt<'static>;
4848+4949+/// the top-level handle to a hydrant instance.
5050+///
5151+/// `Hydrant` is cheaply cloneable. all sub-handles share the same underlying state.
5252+/// construct it via [`Hydrant::new`] or [`Hydrant::from_env`], configure the filter
5353+/// and repos as needed, then call [`Hydrant::run`] to start all background components.
5454+///
5555+/// # example
5656+///
5757+/// ```rust,no_run
5858+/// use hydrant::control::Hydrant;
5959+///
6060+/// #[tokio::main]
6161+/// async fn main() -> miette::Result<()> {
6262+/// let hydrant = Hydrant::from_env().await?;
6363+///
6464+/// tokio::select! {
6565+/// r = hydrant.run()? => r,
6666+/// r = hydrant.serve(3000) => r,
6767+/// }
6868+/// }
6969+/// ```
7070+#[derive(Clone)]
7171+pub struct Hydrant {
7272+ pub crawler: CrawlerHandle,
7373+ pub firehose: FirehoseHandle,
7474+ pub backfill: BackfillHandle,
7575+ pub filter: FilterControl,
7676+ pub repos: ReposControl,
7777+ pub db: DbControl,
7878+ #[cfg(feature = "backlinks")]
7979+ pub backlinks: crate::backlinks::BacklinksControl,
8080+ pub(crate) state: Arc<AppState>,
8181+ config: Arc<Config>,
8282+ started: Arc<AtomicBool>,
8383+ _priv: (),
8484+}
8585+8686+impl Hydrant {
8787+ /// open the database and configure hydrant from `config`.
8888+ ///
8989+ /// this sets up the database, applies any filter configuration from `config`, and
9090+ /// initializes all sub-handles. no background tasks are started yet: call
9191+ /// [`run`](Self::run) to start all components and drive the instance.
9292+ pub async fn new(config: Config) -> Result<Self> {
9393+ info!("{config}");
9494+9595+ // 1. open database and construct AppState
9696+ let state = AppState::new(&config)?;
9797+9898+ // 2. apply any filter config from env variables
9999+ if config.full_network
100100+ || config.filter_signals.is_some()
101101+ || config.filter_collections.is_some()
102102+ || config.filter_excludes.is_some()
103103+ {
104104+ let filter_ks = state.db.filter.clone();
105105+ let inner = state.db.inner.clone();
106106+ let mode = config.full_network.then_some(FilterMode::Full);
107107+ let signals = config
108108+ .filter_signals
109109+ .clone()
110110+ .map(crate::filter::SetUpdate::Set);
111111+ let collections = config
112112+ .filter_collections
113113+ .clone()
114114+ .map(crate::filter::SetUpdate::Set);
115115+ let excludes = config
116116+ .filter_excludes
117117+ .clone()
118118+ .map(crate::filter::SetUpdate::Set);
119119+120120+ tokio::task::spawn_blocking(move || {
121121+ let mut batch = inner.batch();
122122+ db_filter::apply_patch(
123123+ &mut batch,
124124+ &filter_ks,
125125+ mode,
126126+ signals,
127127+ collections,
128128+ excludes,
129129+ )?;
130130+ batch.commit().into_diagnostic()
131131+ })
132132+ .await
133133+ .into_diagnostic()??;
134134+135135+ // 3. reload the live filter into the hot-path arc-swap
136136+ let new_filter = tokio::task::spawn_blocking({
137137+ let filter_ks = state.db.filter.clone();
138138+ move || db_filter::load(&filter_ks)
139139+ })
140140+ .await
141141+ .into_diagnostic()??;
142142+ state.filter.store(Arc::new(new_filter));
143143+ }
144144+145145+ // 4. set crawler enabled state from config, evaluated against the post-patch filter
146146+ let post_patch_crawler = match config.enable_crawler {
147147+ Some(b) => b,
148148+ None => {
149149+ state.filter.load().mode == FilterMode::Full || !config.crawler_sources.is_empty()
150150+ }
151151+ };
152152+ state.crawler_enabled.send_replace(post_patch_crawler);
153153+154154+ let state = Arc::new(state);
155155+156156+ Ok(Self {
157157+ crawler: CrawlerHandle {
158158+ state: state.clone(),
159159+ shared: Arc::new(std::sync::OnceLock::new()),
160160+ tasks: Arc::new(scc::HashMap::new()),
161161+ persisted: Arc::new(scc::HashSet::new()),
162162+ },
163163+ firehose: FirehoseHandle {
164164+ state: state.clone(),
165165+ shared: Arc::new(std::sync::OnceLock::new()),
166166+ tasks: Arc::new(scc::HashMap::new()),
167167+ persisted: Arc::new(scc::HashSet::new()),
168168+ },
169169+ backfill: BackfillHandle(state.clone()),
170170+ filter: FilterControl(state.clone()),
171171+ repos: ReposControl(state.clone()),
172172+ db: DbControl(state.clone()),
173173+ #[cfg(feature = "backlinks")]
174174+ backlinks: crate::backlinks::BacklinksControl(state.clone()),
175175+ state,
176176+ config: Arc::new(config),
177177+ started: Arc::new(AtomicBool::new(false)),
178178+ _priv: (),
179179+ })
180180+ }
181181+182182+ /// reads config from environment variables and calls [`Hydrant::new`].
183183+ pub async fn from_env() -> Result<Self> {
184184+ Self::new(Config::from_env()?).await
185185+ }
186186+187187+ /// start all background components and return a future that resolves when any
188188+ /// fatal component exits.
189189+ ///
190190+ /// starts the backfill worker, firehose ingestors, crawler, and worker thread.
191191+ /// resolves with `Ok(())` if a fatal component exits cleanly, or `Err(e)` if it
192192+ /// fails. intended for use in `tokio::select!` alongside [`serve`](Self::serve).
193193+ ///
194194+ /// returns an error if called more than once on the same `Hydrant` instance.
195195+ pub fn run(&self) -> Result<impl Future<Output = Result<()>>> {
196196+ let state = self.state.clone();
197197+ let config = self.config.clone();
198198+ let crawler = self.crawler.clone();
199199+ let firehose = self.firehose.clone();
200200+201201+ if self.started.swap(true, Ordering::SeqCst) {
202202+ miette::bail!("Hydrant::run() called more than once");
203203+ }
204204+205205+ let fut = async move {
206206+ // internal buffered channel between ingestors / backfill and the firehose worker
207207+ let (buffer_tx, buffer_rx) = mpsc::unbounded_channel();
208208+209209+ // 5. spawn the backfill worker
210210+ tokio::spawn({
211211+ let state = state.clone();
212212+ BackfillWorker::new(
213213+ state.clone(),
214214+ buffer_tx.clone(),
215215+ config.repo_fetch_timeout,
216216+ config.backfill_concurrency_limit,
217217+ matches!(
218218+ config.verify_signatures,
219219+ SignatureVerification::Full | SignatureVerification::BackfillOnly
220220+ ),
221221+ config.ephemeral,
222222+ state.backfill_enabled.subscribe(),
223223+ )
224224+ .run()
225225+ });
226226+227227+ // 6. re-queue any repos that lost their backfill state, then start the retry worker
228228+ if let Err(e) = tokio::task::spawn_blocking({
229229+ let state = state.clone();
230230+ move || crate::backfill::manager::queue_gone_backfills(&state)
231231+ })
232232+ .await
233233+ .into_diagnostic()?
234234+ {
235235+ error!(err = %e, "failed to queue gone backfills");
236236+ db::check_poisoned_report(&e);
237237+ }
238238+239239+ std::thread::spawn({
240240+ let state = state.clone();
241241+ move || crate::backfill::manager::retry_worker(state)
242242+ });
243243+244244+ // 7. ephemeral GC thread
245245+ if config.ephemeral {
246246+ let state = state.clone();
247247+ std::thread::Builder::new()
248248+ .name("ephemeral-gc".into())
249249+ .spawn(move || crate::db::ephemeral::ephemeral_ttl_worker(state))
250250+ .into_diagnostic()?;
251251+ }
252252+253253+ // 8. cursor / counts persist thread
254254+ std::thread::spawn({
255255+ let state = state.clone();
256256+ let persist_interval = config.cursor_save_interval;
257257+ move || loop {
258258+ std::thread::sleep(persist_interval);
259259+260260+ state.relay_cursors.iter_sync(|relay, cursor| {
261261+ let seq = cursor.load(Ordering::SeqCst);
262262+ if seq > 0 {
263263+ if let Err(e) = db::set_firehose_cursor(&state.db, relay, seq) {
264264+ error!(relay = %relay, err = %e, "failed to save cursor");
265265+ db::check_poisoned_report(&e);
266266+ }
267267+ }
268268+ true
269269+ });
270270+271271+ if let Err(e) = db::persist_counts(&state.db) {
272272+ error!(err = %e, "failed to persist counts");
273273+ db::check_poisoned_report(&e);
274274+ }
275275+276276+ if let Err(e) = state.db.persist() {
277277+ error!(err = %e, "db persist failed");
278278+ db::check_poisoned_report(&e);
279279+ }
280280+ }
281281+ });
282282+283283+ // 9. events/sec stats ticker
284284+ tokio::spawn({
285285+ let state = state.clone();
286286+ let mut last_id = state.db.next_event_id.load(Ordering::Relaxed);
287287+ let mut last_time = std::time::Instant::now();
288288+ let mut interval = tokio::time::interval(std::time::Duration::from_secs(60));
289289+ async move {
290290+ loop {
291291+ interval.tick().await;
292292+293293+ let current_id = state.db.next_event_id.load(Ordering::Relaxed);
294294+ let current_time = std::time::Instant::now();
295295+ let delta = current_id.saturating_sub(last_id);
296296+297297+ if delta == 0 {
298298+ debug!("no new events in 60s");
299299+ continue;
300300+ }
301301+302302+ let elapsed = current_time.duration_since(last_time).as_secs_f64();
303303+ let rate = if elapsed > 0.0 {
304304+ delta as f64 / elapsed
305305+ } else {
306306+ 0.0
307307+ };
308308+ info!("{rate:.2} events/s ({delta} events in {elapsed:.1}s)");
309309+310310+ last_id = current_id;
311311+ last_time = current_time;
312312+ }
313313+ }
314314+ });
315315+316316+ let (fatal_tx_inner, mut fatal_rx) = watch::channel(None);
317317+ let fatal_tx = Arc::new(fatal_tx_inner);
318318+319319+ info!(
320320+ crawler_enabled = *state.crawler_enabled.borrow(),
321321+ firehose_enabled = *state.firehose_enabled.borrow(),
322322+ filter_mode = ?state.filter.load().mode,
323323+ "starting ingestion"
324324+ );
325325+326326+ // 10. set shared and spawn firehose ingestors
327327+ firehose
328328+ .shared
329329+ .set(FirehoseShared {
330330+ buffer_tx: buffer_tx.clone(),
331331+ verify_signatures: matches!(
332332+ config.verify_signatures,
333333+ SignatureVerification::Full
334334+ ),
335335+ })
336336+ .ok()
337337+ .expect("firehose shared already set");
338338+ let fire_shared = firehose.shared.get().unwrap();
339339+340340+ let relay_hosts = config.relays.clone();
341341+ if !relay_hosts.is_empty() {
342342+ info!(
343343+ relay_count = relay_hosts.len(),
344344+ hosts = relay_hosts
345345+ .iter()
346346+ .map(|h| h.as_str())
347347+ .collect::<Vec<_>>()
348348+ .join(", "),
349349+ "starting firehose ingestor(s)"
350350+ );
351351+ for relay_url in &relay_hosts {
352352+ let enabled_rx = state.firehose_enabled.subscribe();
353353+ let handle =
354354+ spawn_firehose_ingestor(relay_url, &state, fire_shared, enabled_rx).await?;
355355+ let _ = firehose.tasks.insert_async(relay_url.clone(), handle).await;
356356+ }
357357+ }
358358+359359+ let persisted_relay_urls = tokio::task::spawn_blocking({
360360+ let state = state.clone();
361361+ move || load_persisted_firehose_sources(&state.db)
362362+ })
363363+ .await
364364+ .into_diagnostic()??;
365365+366366+ for relay_url in &persisted_relay_urls {
367367+ let _ = firehose.persisted.insert_async(relay_url.clone()).await;
368368+ if firehose.tasks.contains_async(relay_url).await {
369369+ continue;
370370+ }
371371+ let enabled_rx = state.firehose_enabled.subscribe();
372372+ let handle =
373373+ spawn_firehose_ingestor(relay_url, &state, fire_shared, enabled_rx).await?;
374374+ let _ = firehose.tasks.insert_async(relay_url.clone(), handle).await;
375375+ }
376376+377377+ // 11. spawn crawler infrastructure (always, to support dynamic source management)
378378+ {
379379+ use crate::crawler::throttle::Throttler;
380380+ use crate::crawler::{
381381+ CrawlerStats, CrawlerWorker, InFlight, RetryProducer, SignalChecker,
382382+ };
383383+384384+ let http = reqwest::Client::builder()
385385+ .user_agent(concat!(
386386+ env!("CARGO_PKG_NAME"),
387387+ "/",
388388+ env!("CARGO_PKG_VERSION")
389389+ ))
390390+ .gzip(true)
391391+ .build()
392392+ .expect("that reqwest will build");
393393+ let pds_throttler = Throttler::new();
394394+ let in_flight = InFlight::new();
395395+ let stats = CrawlerStats::new(
396396+ state.clone(),
397397+ config
398398+ .crawler_sources
399399+ .iter()
400400+ .map(|s| s.url.clone())
401401+ .collect(),
402402+ pds_throttler.clone(),
403403+ );
404404+ let checker = SignalChecker {
405405+ http: http.clone(),
406406+ state: state.clone(),
407407+ throttler: pds_throttler,
408408+ };
409409+410410+ info!(
411411+ max_pending = config.crawler_max_pending_repos,
412412+ resume_pending = config.crawler_resume_pending_repos,
413413+ enabled = *state.crawler_enabled.borrow(),
414414+ "starting crawler worker"
415415+ );
416416+ let (worker, tx) = CrawlerWorker::new(
417417+ state.clone(),
418418+ config.crawler_max_pending_repos,
419419+ config.crawler_resume_pending_repos,
420420+ stats.clone(),
421421+ );
422422+ tokio::spawn(async move {
423423+ worker.run().await;
424424+ error!("crawler worker exited unexpectedly, aborting");
425425+ std::process::abort();
426426+ });
427427+428428+ let ticker = tokio::spawn(stats.clone().task());
429429+ tokio::spawn(async move {
430430+ match ticker.await {
431431+ Err(e) => error!(err = ?e, "stats ticker panicked, aborting"),
432432+ Ok(()) => error!("stats ticker exited unexpectedly, aborting"),
433433+ }
434434+ std::process::abort();
435435+ });
436436+437437+ tokio::spawn(
438438+ RetryProducer {
439439+ checker: checker.clone(),
440440+ in_flight: in_flight.clone(),
441441+ tx: tx.clone(),
442442+ }
443443+ .run(),
444444+ );
445445+446446+ // set shared objects so CrawlerHandle methods can use them
447447+ crawler
448448+ .shared
449449+ .set(CrawlerShared {
450450+ http,
451451+ checker,
452452+ in_flight,
453453+ tx,
454454+ stats,
455455+ })
456456+ .ok()
457457+ .expect("crawler shared already set");
458458+ let shared = crawler.shared.get().unwrap();
459459+460460+ // spawn initial sources from config
461461+ for source in config.crawler_sources.iter() {
462462+ let enabled_rx = state.crawler_enabled.subscribe();
463463+ let handle = spawn_crawler_producer(
464464+ source,
465465+ &shared.http,
466466+ &state,
467467+ &shared.checker,
468468+ &shared.in_flight,
469469+ &shared.tx,
470470+ &shared.stats,
471471+ enabled_rx,
472472+ );
473473+ let _ = crawler.tasks.insert_async(source.url.clone(), handle).await;
474474+ }
475475+476476+ let persisted_sources = tokio::task::spawn_blocking({
477477+ let state = state.clone();
478478+ move || load_persisted_crawler_sources(&state.db)
479479+ })
480480+ .await
481481+ .into_diagnostic()??;
482482+483483+ for source in &persisted_sources {
484484+ let _ = crawler.persisted.insert_async(source.url.clone()).await;
485485+ if crawler.tasks.contains_async(&source.url).await {
486486+ continue;
487487+ }
488488+ let enabled_rx = state.crawler_enabled.subscribe();
489489+ let handle = spawn_crawler_producer(
490490+ source,
491491+ &shared.http,
492492+ &state,
493493+ &shared.checker,
494494+ &shared.in_flight,
495495+ &shared.tx,
496496+ &shared.stats,
497497+ enabled_rx,
498498+ );
499499+ let _ = crawler.tasks.insert_async(source.url.clone(), handle).await;
500500+ }
501501+ }
502502+503503+ // 12. spawn the firehose worker on a blocking thread (fatal task)
504504+ let handle = tokio::runtime::Handle::current();
505505+ let firehose_worker = std::thread::spawn({
506506+ let state = state.clone();
507507+ move || {
508508+ FirehoseWorker::new(
509509+ state,
510510+ buffer_rx,
511511+ matches!(config.verify_signatures, SignatureVerification::Full),
512512+ config.ephemeral,
513513+ config.firehose_workers,
514514+ )
515515+ .run(handle)
516516+ }
517517+ });
518518+519519+ {
520520+ let tx = Arc::clone(&fatal_tx);
521521+ tokio::spawn(
522522+ tokio::task::spawn_blocking(move || {
523523+ firehose_worker
524524+ .join()
525525+ .map_err(|e| miette::miette!("buffer processor died: {e:?}"))
526526+ })
527527+ .map(move |r| {
528528+ let result = r.into_diagnostic().flatten().flatten();
529529+ let _ = tx.send(Some(result.map_err(|e| e.to_string())));
530530+ }),
531531+ );
532532+ }
533533+534534+ // drop the local fatal_tx so the watch channel is only kept alive by the
535535+ // spawned tasks. when all fatal tasks exit (and drop their tx clones),
536536+ // fatal_rx.changed() returns Err and we return Ok(()).
537537+ drop(fatal_tx);
538538+539539+ loop {
540540+ match fatal_rx.changed().await {
541541+ Ok(()) => {
542542+ if let Some(result) = fatal_rx.borrow().clone() {
543543+ return result.map_err(|s| miette::miette!("{s}"));
544544+ }
545545+ }
546546+ // all fatal_tx clones dropped: all tasks finished cleanly
547547+ Err(_) => return Ok(()),
548548+ }
549549+ }
550550+ };
551551+ Ok(fut)
552552+ }
553553+554554+ /// subscribe to the ordered event stream.
555555+ ///
556556+ /// returns an [`EventStream`] that implements [`futures::Stream`].
557557+ ///
558558+ /// - if `cursor` is `None`, streaming starts from the current head (live tail only).
559559+ /// - if `cursor` is `Some(id)`, all persisted `record` events from that ID onward are
560560+ /// replayed first, then live events follow seamlessly.
561561+ ///
562562+ /// `identity` and `account` events are ephemeral and are never replayed from a cursor -
563563+ /// only live occurrences are delivered. use [`ReposControl::get`] to fetch current
564564+ /// identity/account state for a specific DID.
565565+ ///
566566+ /// multiple concurrent subscribers each receive a full independent copy of the stream.
567567+ /// the stream ends when the `EventStream` is dropped.
568568+ pub fn subscribe(&self, cursor: Option<u64>) -> EventStream {
569569+ let (tx, rx) = mpsc::channel(500);
570570+ let state = self.state.clone();
571571+ let runtime = tokio::runtime::Handle::current();
572572+573573+ std::thread::Builder::new()
574574+ .name("hydrant-stream".into())
575575+ .spawn(move || {
576576+ let _g = runtime.enter();
577577+ event_stream_thread(state, tx, cursor);
578578+ })
579579+ .expect("failed to spawn stream thread");
580580+581581+ EventStream(rx)
582582+ }
583583+584584+ /// return database counts and on-disk sizes for all keyspaces.
585585+ ///
586586+ /// counts include: `repos`, `pending`, `resync`, `records`, `blocks`, `events`,
587587+ /// `error_ratelimited`, `error_transport`, `error_generic`.
588588+ ///
589589+ /// sizes are in bytes, reported per keyspace.
590590+ pub async fn stats(&self) -> Result<StatsResponse> {
591591+ let db = self.state.db.clone();
592592+593593+ let mut counts: BTreeMap<&'static str, u64> = futures::future::join_all(
594594+ [
595595+ "repos",
596596+ "pending",
597597+ "resync",
598598+ "records",
599599+ "blocks",
600600+ "error_ratelimited",
601601+ "error_transport",
602602+ "error_generic",
603603+ ]
604604+ .into_iter()
605605+ .map(|name| {
606606+ let db = db.clone();
607607+ async move { (name, db.get_count(name).await) }
608608+ }),
609609+ )
610610+ .await
611611+ .into_iter()
612612+ .collect();
613613+614614+ counts.insert("events", db.events.approximate_len() as u64);
615615+616616+ let sizes = tokio::task::spawn_blocking(move || {
617617+ let mut s = BTreeMap::new();
618618+ s.insert("repos", db.repos.disk_space());
619619+ s.insert("records", db.records.disk_space());
620620+ s.insert("blocks", db.blocks.disk_space());
621621+ s.insert("cursors", db.cursors.disk_space());
622622+ s.insert("pending", db.pending.disk_space());
623623+ s.insert("resync", db.resync.disk_space());
624624+ s.insert("resync_buffer", db.resync_buffer.disk_space());
625625+ s.insert("events", db.events.disk_space());
626626+ s.insert("counts", db.counts.disk_space());
627627+ s.insert("filter", db.filter.disk_space());
628628+ s.insert("crawler", db.crawler.disk_space());
629629+ s
630630+ })
631631+ .await
632632+ .into_diagnostic()?;
633633+634634+ Ok(StatsResponse { counts, sizes })
635635+ }
636636+637637+ /// returns a future that runs the HTTP management API server on `0.0.0.0:{port}`.
638638+ ///
639639+ /// the server exposes all management endpoints (`/filter`, `/repos`, `/ingestion`,
640640+ /// `/stream`, `/stats`, `/db/*`, `/xrpc/*`). it runs indefinitely and resolves
641641+ /// only on error.
642642+ ///
643643+ /// intended for `tokio::spawn` or inclusion in a `select!` / task list. the clone
644644+ /// of `self` is deferred until the future is first polled.
645645+ ///
646646+ /// to disable the HTTP API entirely, simply don't call this method.
647647+ pub fn serve(&self, port: u16) -> impl Future<Output = Result<()>> {
648648+ let hydrant = self.clone();
649649+ async move { crate::api::serve(hydrant, port).await }
650650+ }
651651+652652+ /// returns a future that runs the debug HTTP API server on `127.0.0.1:{port}`.
653653+ ///
654654+ /// exposes internal inspection endpoints (`/debug/get`, `/debug/iter`, etc.)
655655+ /// that are not safe to expose publicly. binds only to loopback.
656656+ pub fn serve_debug(&self, port: u16) -> impl Future<Output = Result<()>> {
657657+ let state = self.state.clone();
658658+ async move { crate::api::serve_debug(state, port).await }
659659+ }
660660+}
661661+662662+impl axum::extract::FromRef<Hydrant> for Arc<AppState> {
663663+ fn from_ref(h: &Hydrant) -> Self {
664664+ h.state.clone()
665665+ }
666666+}
667667+668668+/// a stream of [`Event`]s. returned by [`Hydrant::subscribe`].
669669+///
670670+/// implements [`futures::Stream`] and can be used with `StreamExt::next`,
671671+/// `while let Some(evt) = stream.next().await`, `forward`, etc.
672672+/// the stream terminates when the underlying channel closes (i.e. hydrant shuts down).
673673+pub struct EventStream(mpsc::Receiver<Event>);
674674+675675+impl Stream for EventStream {
676676+ type Item = Event;
677677+678678+ fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
679679+ self.0.poll_recv(cx)
680680+ }
681681+}
682682+683683+/// database statistics returned by [`Hydrant::stats`].
684684+#[derive(serde::Serialize)]
685685+pub struct StatsResponse {
686686+ /// record counts per logical category (repos, records, events, error kinds, etc.)
687687+ pub counts: BTreeMap<&'static str, u64>,
688688+ /// on-disk size in bytes per keyspace
689689+ pub sizes: BTreeMap<&'static str, u64>,
690690+}
691691+692692+/// runtime control over the backfill worker component.
693693+///
694694+/// the backfill worker fetches full repo CAR files from each repo's PDS for any
695695+/// repository in the pending queue, parses the MST, and inserts all matching records
696696+/// into the database. concurrency is bounded by `HYDRANT_BACKFILL_CONCURRENCY_LIMIT`.
697697+#[derive(Clone)]
698698+pub struct BackfillHandle(Arc<AppState>);
699699+700700+impl BackfillHandle {
701701+ /// enable the backfill worker, no-op if already enabled.
702702+ pub fn enable(&self) {
703703+ self.0.backfill_enabled.send_replace(true);
704704+ }
705705+ /// disable the backfill worker, in-flight repos complete before pausing.
706706+ pub fn disable(&self) {
707707+ self.0.backfill_enabled.send_replace(false);
708708+ }
709709+ /// returns the current enabled state of the backfill worker.
710710+ pub fn is_enabled(&self) -> bool {
711711+ *self.0.backfill_enabled.borrow()
712712+ }
713713+}
714714+715715+/// control over database maintenance operations.
716716+///
717717+/// all methods pause the crawler, firehose, and backfill worker for the duration
718718+/// of the operation and restore their prior state on completion, whether or not
719719+/// the operation succeeds.
720720+#[derive(Clone)]
721721+pub struct DbControl(Arc<AppState>);
722722+723723+impl DbControl {
724724+ /// trigger a major compaction of all keyspaces in parallel.
725725+ ///
726726+ /// compaction reclaims disk space from deleted/updated keys and improves
727727+ /// read performance. can take several minutes on large datasets.
728728+ pub async fn compact(&self) -> Result<()> {
729729+ let state = self.0.clone();
730730+ state
731731+ .with_ingestion_paused(async || state.db.compact().await)
732732+ .await
733733+ }
734734+735735+ /// train zstd compression dictionaries for the `repos`, `blocks`, and `events` keyspaces.
736736+ ///
737737+ /// dictionaries are written to `dict_{name}.bin` files next to the database.
738738+ /// a restart is required to apply them. training samples data blocks from the
739739+ /// existing database, so the database must have a reasonable amount of data first.
740740+ pub async fn train_dicts(&self) -> Result<()> {
741741+ let state = self.0.clone();
742742+ state
743743+ .with_ingestion_paused(async || {
744744+ let train = |name: &'static str| {
745745+ let db = state.db.clone();
746746+ tokio::task::spawn_blocking(move || db.train_dict(name))
747747+ .map(|res| res.into_diagnostic().flatten())
748748+ };
749749+ tokio::try_join!(train("repos"), train("blocks"), train("events")).map(|_| ())
750750+ })
751751+ .await
752752+ }
753753+}
+404
src/control/repos.rs
···11+use std::sync::Arc;
22+33+use chrono::{DateTime, Utc};
44+use jacquard_common::cowstr::ToCowStr;
55+use jacquard_common::types::cid::{Cid, IpldCid};
66+use jacquard_common::types::ident::AtIdentifier;
77+use jacquard_common::types::string::{Did, Handle, Rkey};
88+use jacquard_common::types::tid::Tid;
99+use jacquard_common::{CowStr, Data, IntoStatic};
1010+use miette::{IntoDiagnostic, Result};
1111+use rand::Rng;
1212+use smol_str::ToSmolStr;
1313+use url::Url;
1414+1515+use crate::db::types::DbRkey;
1616+use crate::db::{self, keys, ser_repo_state};
1717+use crate::state::AppState;
1818+use crate::types::{GaugeState, RepoState, RepoStatus};
1919+2020+/// information about a tracked or known repository. returned by [`ReposControl`] methods.
2121+#[derive(Debug, Clone, serde::Serialize)]
2222+pub struct RepoInfo {
2323+ /// the DID of the repository.
2424+ pub did: Did<'static>,
2525+ /// the status of the repository.
2626+ #[serde(serialize_with = "crate::util::repo_status_serialize_str")]
2727+ pub status: RepoStatus,
2828+ /// whether this repository is tracked or not.
2929+ /// untracked repositories are not updated and they stay frozen.
3030+ pub tracked: bool,
3131+ /// the revision of the root commit of this repository.
3232+ #[serde(skip_serializing_if = "Option::is_none")]
3333+ pub rev: Option<Tid>,
3434+ /// the CID of the root commit of this repository.
3535+ #[serde(serialize_with = "crate::util::opt_cid_serialize_str")]
3636+ #[serde(skip_serializing_if = "Option::is_none")]
3737+ pub data: Option<IpldCid>,
3838+ /// the handle for the DID of this repository.
3939+ #[serde(skip_serializing_if = "Option::is_none")]
4040+ pub handle: Option<Handle<'static>>,
4141+ /// the URL for the PDS in which this repository is hosted on.
4242+ #[serde(skip_serializing_if = "Option::is_none")]
4343+ pub pds: Option<Url>,
4444+ /// ATProto signing key of this repository.
4545+ #[serde(skip_serializing_if = "Option::is_none")]
4646+ pub signing_key: Option<String>,
4747+ /// when this repository was last touched (status update, commit ingested, etc.).
4848+ #[serde(skip_serializing_if = "Option::is_none")]
4949+ pub last_updated_at: Option<DateTime<Utc>>,
5050+ /// the time of the last message gotten from the firehose for this repository.
5151+ /// this is equal to the `time` field.
5252+ #[serde(skip_serializing_if = "Option::is_none")]
5353+ pub last_message_at: Option<DateTime<Utc>>,
5454+}
5555+5656+/// control over which repositories are tracked and access to their state.
5757+///
5858+/// in `filter` mode, a repo is only indexed if it either matches a signal or is
5959+/// explicitly tracked via [`ReposControl::track`]. in `full` mode all repos are indexed
6060+/// and tracking is implicit.
6161+///
6262+/// tracking a DID that hydrant has never seen enqueues an immediate backfill.
6363+/// tracking a DID that hydrant already knows about (but has marked untracked)
6464+/// re-enqueues it for backfill.
6565+#[derive(Clone)]
6666+pub struct ReposControl(pub(super) Arc<AppState>);
6767+6868+impl ReposControl {
6969+ /// gets a handle for a repository to allow acting upon it.
7070+ pub fn get<'i>(&self, did: &Did<'i>) -> Result<RepoHandle<'i>> {
7171+ Ok(RepoHandle {
7272+ state: self.0.clone(),
7373+ did: did.clone(),
7474+ })
7575+ }
7676+7777+ /// same as [`ReposControl::get`] but allows you to pass in an identifier that can be
7878+ /// either a handle or a DID.
7979+ pub async fn resolve(&self, repo: &AtIdentifier<'_>) -> Result<RepoHandle<'static>> {
8080+ let did = self.0.resolver.resolve_did(repo).await?;
8181+ Ok(RepoHandle {
8282+ state: self.0.clone(),
8383+ did,
8484+ })
8585+ }
8686+8787+ /// fetch the current state of a single repository. returns `None` if hydrant
8888+ /// has never seen this DID.
8989+ pub async fn info(&self, did: &Did<'_>) -> Result<Option<RepoInfo>> {
9090+ self.get(did)?.info().await
9191+ }
9292+9393+ /// explicitly track one or more repositories, enqueuing them for backfill if needed.
9494+ ///
9595+ /// - if a DID is new, a fresh [`RepoState`] is created and backfill is queued.
9696+ /// - if a DID is already known but untracked, it is marked tracked and re-enqueued.
9797+ /// - if a DID is already tracked, this is a no-op.
9898+ pub async fn track(&self, dids: impl IntoIterator<Item = Did<'_>>) -> Result<()> {
9999+ let dids: Vec<Did<'static>> = dids.into_iter().map(|d| d.into_static()).collect();
100100+ let state = self.0.clone();
101101+102102+ let (new_count, transitions) = tokio::task::spawn_blocking(move || {
103103+ let db = &state.db;
104104+ let mut batch = db.inner.batch();
105105+ let mut added = 0i64;
106106+ let mut transitions: Vec<(GaugeState, GaugeState)> = Vec::new();
107107+ let mut rng = rand::rng();
108108+109109+ for did in &dids {
110110+ let did_key = keys::repo_key(did);
111111+ let repo_bytes = db.repos.get(&did_key).into_diagnostic()?;
112112+ let existing = repo_bytes
113113+ .as_deref()
114114+ .map(db::deser_repo_state)
115115+ .transpose()?;
116116+117117+ if let Some(mut repo_state) = existing {
118118+ if !repo_state.tracked {
119119+ let resync = db.resync.get(&did_key).into_diagnostic()?;
120120+ let old = db::Db::repo_gauge_state(&repo_state, resync.as_deref());
121121+ repo_state.tracked = true;
122122+ batch.insert(&db.repos, &did_key, ser_repo_state(&repo_state)?);
123123+ batch.insert(
124124+ &db.pending,
125125+ keys::pending_key(repo_state.index_id),
126126+ &did_key,
127127+ );
128128+ batch.remove(&db.resync, &did_key);
129129+ transitions.push((old, GaugeState::Pending));
130130+ }
131131+ } else {
132132+ let repo_state = RepoState::backfilling(rng.next_u64());
133133+ batch.insert(&db.repos, &did_key, ser_repo_state(&repo_state)?);
134134+ batch.insert(
135135+ &db.pending,
136136+ keys::pending_key(repo_state.index_id),
137137+ &did_key,
138138+ );
139139+ added += 1;
140140+ transitions.push((GaugeState::Synced, GaugeState::Pending));
141141+ }
142142+ }
143143+144144+ batch.commit().into_diagnostic()?;
145145+ Ok::<_, miette::Report>((added, transitions))
146146+ })
147147+ .await
148148+ .into_diagnostic()??;
149149+150150+ if new_count > 0 {
151151+ self.0.db.update_count_async("repos", new_count).await;
152152+ }
153153+ for (old, new) in transitions {
154154+ self.0.db.update_gauge_diff_async(&old, &new).await;
155155+ }
156156+ self.0.notify_backfill();
157157+ Ok(())
158158+ }
159159+160160+ /// stop tracking one or more repositories. hydrant will stop processing new events
161161+ /// for them and remove them from the pending/resync queues, but existing indexed
162162+ /// records are **not** deleted.
163163+ pub async fn untrack(&self, dids: impl IntoIterator<Item = Did<'_>>) -> Result<()> {
164164+ let dids: Vec<Did<'static>> = dids.into_iter().map(|d| d.into_static()).collect();
165165+ let state = self.0.clone();
166166+167167+ let gauge_decrements = tokio::task::spawn_blocking(move || {
168168+ let db = &state.db;
169169+ let mut batch = db.inner.batch();
170170+ let mut gauge_decrements = Vec::new();
171171+172172+ for did in &dids {
173173+ let did_key = keys::repo_key(did);
174174+ let repo_bytes = db.repos.get(&did_key).into_diagnostic()?;
175175+ let existing = repo_bytes
176176+ .as_deref()
177177+ .map(db::deser_repo_state)
178178+ .transpose()?;
179179+180180+ if let Some(repo_state) = existing {
181181+ if repo_state.tracked {
182182+ let resync = db.resync.get(&did_key).into_diagnostic()?;
183183+ let old = db::Db::repo_gauge_state(&repo_state, resync.as_deref());
184184+ let mut repo_state = repo_state.into_static();
185185+ repo_state.tracked = false;
186186+ batch.insert(&db.repos, &did_key, ser_repo_state(&repo_state)?);
187187+ batch.remove(&db.pending, keys::pending_key(repo_state.index_id));
188188+ batch.remove(&db.resync, &did_key);
189189+ if old != GaugeState::Synced {
190190+ gauge_decrements.push(old);
191191+ }
192192+ }
193193+ }
194194+ }
195195+196196+ batch.commit().into_diagnostic()?;
197197+ Ok::<_, miette::Report>(gauge_decrements)
198198+ })
199199+ .await
200200+ .into_diagnostic()??;
201201+202202+ for gauge in gauge_decrements {
203203+ self.0
204204+ .db
205205+ .update_gauge_diff_async(&gauge, &GaugeState::Synced)
206206+ .await;
207207+ }
208208+ Ok(())
209209+ }
210210+}
211211+212212+pub(crate) fn repo_state_to_info(did: Did<'static>, s: RepoState<'_>) -> RepoInfo {
213213+ RepoInfo {
214214+ did,
215215+ status: s.status,
216216+ tracked: s.tracked,
217217+ rev: s.rev.map(|r| r.to_tid()),
218218+ data: s.data,
219219+ handle: s.handle.map(|h| h.into_static()),
220220+ pds: s.pds.and_then(|p| p.parse().ok()),
221221+ signing_key: s.signing_key.map(|k| k.encode()),
222222+ last_updated_at: DateTime::from_timestamp_secs(s.last_updated_at),
223223+ last_message_at: s.last_message_time.and_then(DateTime::from_timestamp_secs),
224224+ }
225225+}
226226+227227+pub struct Record {
228228+ pub did: Did<'static>,
229229+ pub cid: Cid<'static>,
230230+ pub value: Data<'static>,
231231+}
232232+233233+pub struct ListedRecord {
234234+ pub rkey: Rkey<'static>,
235235+ pub cid: Cid<'static>,
236236+ pub value: Data<'static>,
237237+}
238238+239239+pub struct RecordList {
240240+ pub records: Vec<ListedRecord>,
241241+ pub cursor: Option<Rkey<'static>>,
242242+}
243243+244244+/// handle to access data related to this repository.
245245+#[derive(Clone)]
246246+pub struct RepoHandle<'i> {
247247+ state: Arc<AppState>,
248248+ pub did: Did<'i>,
249249+}
250250+251251+impl<'i> RepoHandle<'i> {
252252+ pub async fn info(&self) -> Result<Option<RepoInfo>> {
253253+ let did_key = keys::repo_key(&self.did);
254254+ let state = self.state.clone();
255255+ let did = self.did.clone().into_static();
256256+257257+ tokio::task::spawn_blocking(move || {
258258+ let bytes = state.db.repos.get(&did_key).into_diagnostic()?;
259259+ let state = bytes.as_deref().map(db::deser_repo_state).transpose()?;
260260+ Ok(state.map(|s| repo_state_to_info(did, s)))
261261+ })
262262+ .await
263263+ .into_diagnostic()?
264264+ }
265265+266266+ pub async fn get_record(&self, collection: &str, rkey: &str) -> Result<Option<Record>> {
267267+ let did = self.did.clone().into_static();
268268+ let db_key = keys::record_key(&did, collection, &DbRkey::new(rkey));
269269+270270+ let collection = collection.to_smolstr();
271271+ let state = self.state.clone();
272272+ tokio::task::spawn_blocking(move || {
273273+ use miette::WrapErr;
274274+275275+ let cid_bytes = state.db.records.get(db_key).into_diagnostic()?;
276276+ let Some(cid_bytes) = cid_bytes else {
277277+ return Ok(None);
278278+ };
279279+280280+ // lookup block using col|cid key
281281+ let block_key = keys::block_key(&collection, &cid_bytes);
282282+ let Some(block_bytes) = state.db.blocks.get(block_key).into_diagnostic()? else {
283283+ miette::bail!("block {cid_bytes:?} not found, this is a bug!!");
284284+ };
285285+286286+ let value = serde_ipld_dagcbor::from_slice::<Data>(&block_bytes)
287287+ .into_diagnostic()
288288+ .wrap_err("cant parse block")?
289289+ .into_static();
290290+ let cid = Cid::new(&cid_bytes)
291291+ .into_diagnostic()
292292+ .wrap_err("cant parse block cid")?;
293293+ let cid = Cid::Str(cid.to_cowstr().into_static());
294294+295295+ Ok(Some(Record { did, cid, value }))
296296+ })
297297+ .await
298298+ .into_diagnostic()?
299299+ }
300300+301301+ pub async fn list_records(
302302+ &self,
303303+ collection: &str,
304304+ limit: usize,
305305+ reverse: bool,
306306+ cursor: Option<&str>,
307307+ ) -> Result<RecordList> {
308308+ let did = self.did.clone().into_static();
309309+310310+ let state = self.state.clone();
311311+ let prefix = keys::record_prefix_collection(&did, collection);
312312+ let collection = collection.to_smolstr();
313313+ let cursor = cursor.map(|c| c.to_smolstr());
314314+315315+ tokio::task::spawn_blocking(move || {
316316+ let mut results = Vec::new();
317317+ let mut next_cursor = None;
318318+319319+ let iter: Box<dyn Iterator<Item = _>> = if !reverse {
320320+ let mut end_prefix = prefix.clone();
321321+ if let Some(last) = end_prefix.last_mut() {
322322+ *last += 1;
323323+ }
324324+325325+ let end_key = if let Some(cursor) = &cursor {
326326+ let mut k = prefix.clone();
327327+ k.extend_from_slice(cursor.as_bytes());
328328+ k
329329+ } else {
330330+ end_prefix
331331+ };
332332+333333+ Box::new(
334334+ state
335335+ .db
336336+ .records
337337+ .range(prefix.as_slice()..end_key.as_slice())
338338+ .rev(),
339339+ )
340340+ } else {
341341+ let start_key = if let Some(cursor) = &cursor {
342342+ let mut k = prefix.clone();
343343+ k.extend_from_slice(cursor.as_bytes());
344344+ k.push(0);
345345+ k
346346+ } else {
347347+ prefix.clone()
348348+ };
349349+350350+ Box::new(state.db.records.range(start_key.as_slice()..))
351351+ };
352352+353353+ for item in iter {
354354+ let (key, cid_bytes) = item.into_inner().into_diagnostic()?;
355355+356356+ if !key.starts_with(prefix.as_slice()) {
357357+ break;
358358+ }
359359+360360+ let rkey = keys::parse_rkey(&key[prefix.len()..])?;
361361+ if results.len() >= limit {
362362+ next_cursor = Some(rkey);
363363+ break;
364364+ }
365365+366366+ // look up using col|cid key built from collection and binary cid bytes
367367+ if let Ok(Some(block_bytes)) = state
368368+ .db
369369+ .blocks
370370+ .get(&keys::block_key(collection.as_str(), &cid_bytes))
371371+ {
372372+ let value: Data =
373373+ serde_ipld_dagcbor::from_slice(&block_bytes).unwrap_or(Data::Null);
374374+ let cid = Cid::new(&cid_bytes).into_diagnostic()?;
375375+ let cid = Cid::Str(cid.to_cowstr().into_static());
376376+ results.push(ListedRecord {
377377+ rkey: Rkey::new_cow(CowStr::Owned(rkey.to_smolstr()))
378378+ .expect("that rkey is validated"),
379379+ cid,
380380+ value: value.into_static(),
381381+ });
382382+ }
383383+ }
384384+ Result::<_, miette::Report>::Ok((results, next_cursor))
385385+ })
386386+ .await
387387+ .into_diagnostic()?
388388+ .map(|(records, next_cursor)| RecordList {
389389+ records,
390390+ cursor: next_cursor.map(|rkey| {
391391+ Rkey::new_cow(CowStr::Owned(rkey.to_smolstr())).expect("that rkey is validated")
392392+ }),
393393+ })
394394+ }
395395+396396+ pub async fn count_records(&self, collection: &str) -> Result<u64> {
397397+ let did = self.did.clone().into_static();
398398+ let state = self.state.clone();
399399+ let collection = collection.to_string();
400400+ tokio::task::spawn_blocking(move || db::get_record_count(&state.db, &did, &collection))
401401+ .await
402402+ .into_diagnostic()?
403403+ }
404404+}
+164
src/control/stream.rs
···11+use std::sync::Arc;
22+use std::sync::atomic::Ordering;
33+44+use jacquard_common::types::cid::{ATP_CID_HASH, IpldCid};
55+use jacquard_common::types::nsid::Nsid;
66+use jacquard_common::types::string::Rkey;
77+use jacquard_common::{CowStr, IntoStatic, RawData};
88+use jacquard_repo::DAG_CBOR_CID_CODEC;
99+use sha2::{Digest, Sha256};
1010+use tokio::sync::mpsc;
1111+use tracing::error;
1212+1313+use crate::db::{self, keys};
1414+use crate::state::AppState;
1515+use crate::types::{BroadcastEvent, MarshallableEvt, RecordEvt, StoredData, StoredEvent};
1616+1717+use super::Event;
1818+1919+pub(super) fn event_stream_thread(
2020+ state: Arc<AppState>,
2121+ tx: mpsc::Sender<Event>,
2222+ cursor: Option<u64>,
2323+) {
2424+ let db = &state.db;
2525+ let mut event_rx = db.event_tx.subscribe();
2626+ let ks = db.events.clone();
2727+ let mut current_id = match cursor {
2828+ Some(c) => c.saturating_sub(1),
2929+ None => db.next_event_id.load(Ordering::SeqCst).saturating_sub(1),
3030+ };
3131+3232+ loop {
3333+ // catch up from db
3434+ loop {
3535+ let mut found = false;
3636+ for item in ks.range(keys::event_key(current_id + 1)..) {
3737+ let (k, v) = match item.into_inner() {
3838+ Ok(kv) => kv,
3939+ Err(e) => {
4040+ error!(err = %e, "failed to read event from db");
4141+ break;
4242+ }
4343+ };
4444+4545+ let id = match k.as_ref().try_into().map(u64::from_be_bytes) {
4646+ Ok(id) => id,
4747+ Err(_) => {
4848+ error!("failed to parse event id");
4949+ continue;
5050+ }
5151+ };
5252+ current_id = id;
5353+5454+ let stored: StoredEvent = match rmp_serde::from_slice(&v) {
5555+ Ok(e) => e,
5656+ Err(e) => {
5757+ error!(err = %e, "failed to deserialize stored event");
5858+ continue;
5959+ }
6060+ };
6161+6262+ let Some(evt) = stored_to_event(&state, id, stored) else {
6363+ continue;
6464+ };
6565+6666+ if tx.blocking_send(evt).is_err() {
6767+ return; // receiver dropped
6868+ }
6969+ found = true;
7070+ }
7171+ if !found {
7272+ break;
7373+ }
7474+ }
7575+7676+ // wait for live events
7777+ match event_rx.blocking_recv() {
7878+ Ok(BroadcastEvent::Persisted(_)) => {} // re-run catch-up
7979+ Ok(BroadcastEvent::Ephemeral(evt)) => {
8080+ if tx.blocking_send(*evt).is_err() {
8181+ return;
8282+ }
8383+ }
8484+ Err(tokio::sync::broadcast::error::RecvError::Lagged(_)) => {}
8585+ Err(tokio::sync::broadcast::error::RecvError::Closed) => break,
8686+ }
8787+ }
8888+}
8989+9090+fn stored_to_event(state: &AppState, id: u64, stored: StoredEvent<'_>) -> Option<Event> {
9191+ let StoredEvent {
9292+ live,
9393+ did,
9494+ rev,
9595+ collection,
9696+ rkey,
9797+ action,
9898+ data,
9999+ } = stored;
100100+101101+ let record = match data {
102102+ StoredData::Ptr(cid) => {
103103+ let block = state
104104+ .db
105105+ .blocks
106106+ .get(&keys::block_key(collection.as_str(), &cid.to_bytes()));
107107+ match block {
108108+ Ok(Some(bytes)) => match serde_ipld_dagcbor::from_slice::<RawData>(&bytes) {
109109+ Ok(val) => Some((cid, serde_json::to_value(val).ok()?)),
110110+ Err(e) => {
111111+ error!(err = %e, "cant parse block");
112112+ return None;
113113+ }
114114+ },
115115+ Ok(None) => {
116116+ error!("block not found, this is a bug");
117117+ return None;
118118+ }
119119+ Err(e) => {
120120+ error!(err = %e, "cant get block");
121121+ db::check_poisoned(&e);
122122+ return None;
123123+ }
124124+ }
125125+ }
126126+ StoredData::Block(block) => {
127127+ let digest = Sha256::digest(&block);
128128+ let hash =
129129+ cid::multihash::Multihash::wrap(ATP_CID_HASH, &digest).expect("valid sha256 hash");
130130+ let cid = IpldCid::new_v1(DAG_CBOR_CID_CODEC, hash);
131131+ match serde_ipld_dagcbor::from_slice::<RawData>(&block) {
132132+ Ok(val) => Some((cid, serde_json::to_value(val).ok()?)),
133133+ Err(e) => {
134134+ error!(err = %e, "cant parse block");
135135+ return None;
136136+ }
137137+ }
138138+ }
139139+ StoredData::Nothing => None,
140140+ };
141141+142142+ let (cid, record) = record
143143+ .map(|(c, r)| (Some(c), Some(r)))
144144+ .unwrap_or((None, None));
145145+146146+ Some(MarshallableEvt {
147147+ id,
148148+ kind: crate::types::EventType::Record,
149149+ record: Some(RecordEvt {
150150+ live,
151151+ did: did.to_did(),
152152+ rev: rev.to_tid(),
153153+ collection: Nsid::new_cow(collection.clone().into_static())
154154+ .expect("that collection is already validated"),
155155+ rkey: Rkey::new_cow(CowStr::Owned(rkey.to_smolstr()))
156156+ .expect("that rkey is already validated"),
157157+ action: CowStr::Borrowed(action.as_str()),
158158+ record,
159159+ cid,
160160+ }),
161161+ identity: None,
162162+ account: None,
163163+ })
164164+}