A very fast AT Protocol indexer with flexible filtering, XRPC queries, a cursor-backed event stream, and more, built on fjall
rust fjall at-protocol atproto indexer
58
fork

Configure Feed

Select the types of activity you want to include in your feed.

[lib] split into multiple modules

dawn 5d9cdd75 71a42799

+2090 -2017
-2017
src/control.rs
··· 1 - use std::collections::BTreeMap; 2 - use std::future::Future; 3 - use std::pin::Pin; 4 - use std::sync::Arc; 5 - use std::sync::atomic::{AtomicBool, Ordering}; 6 - use std::task::{Context, Poll}; 7 - 8 - use chrono::{DateTime, Utc}; 9 - use futures::{FutureExt, Stream}; 10 - use jacquard_common::cowstr::ToCowStr; 11 - use jacquard_common::types::cid::{ATP_CID_HASH, Cid, IpldCid}; 12 - use jacquard_common::types::ident::AtIdentifier; 13 - use jacquard_common::types::nsid::Nsid; 14 - use jacquard_common::types::string::{Did, Handle, Rkey}; 15 - use jacquard_common::types::tid::Tid; 16 - use jacquard_common::{CowStr, Data, IntoStatic, RawData}; 17 - use jacquard_repo::DAG_CBOR_CID_CODEC; 18 - use miette::{IntoDiagnostic, Result}; 19 - use rand::Rng; 20 - use sha2::{Digest, Sha256}; 21 - use smol_str::ToSmolStr; 22 - use tokio::sync::{mpsc, watch}; 23 - use tracing::{debug, error, info}; 24 - use url::Url; 25 - 26 - use crate::backfill::BackfillWorker; 27 - use crate::config::{Config, SignatureVerification}; 28 - use crate::db::types::DbRkey; 29 - use crate::db::{ 30 - self, filter as db_filter, keys, load_persisted_crawler_sources, 31 - load_persisted_firehose_sources, ser_repo_state, 32 - }; 33 - use crate::filter::{FilterMode, SetUpdate}; 34 - use crate::ingest::{firehose::FirehoseIngestor, worker::FirehoseWorker}; 35 - use crate::state::AppState; 36 - use crate::types::{ 37 - BroadcastEvent, GaugeState, MarshallableEvt, RecordEvt, RepoState, RepoStatus, StoredData, 38 - StoredEvent, 39 - }; 40 - 41 - /// an event emitted by the hydrant event stream. 42 - /// 43 - /// three variants are possible depending on the `type` field: 44 - /// - `"record"`: a repo record was created, updated, or deleted. carries a [`RecordEvt`]. 45 - /// - `"identity"`: a DID's handle or PDS changed. carries an [`IdentityEvt`]. ephemeral, not replayable. 46 - /// - `"account"`: a repo's active/inactive status changed. carries an [`AccountEvt`]. ephemeral, not replayable. 
47 - /// 48 - /// the `id` field is a monotonically increasing sequence number usable as a cursor for [`Hydrant::subscribe`]. 49 - pub type Event = MarshallableEvt<'static>; 50 - 51 - /// the top-level handle to a hydrant instance. 52 - /// 53 - /// `Hydrant` is cheaply cloneable. all sub-handles share the same underlying state. 54 - /// construct it via [`Hydrant::new`] or [`Hydrant::from_env`], configure the filter 55 - /// and repos as needed, then call [`Hydrant::run`] to start all background components. 56 - /// 57 - /// # example 58 - /// 59 - /// ```rust,no_run 60 - /// use hydrant::control::Hydrant; 61 - /// 62 - /// #[tokio::main] 63 - /// async fn main() -> miette::Result<()> { 64 - /// let hydrant = Hydrant::from_env().await?; 65 - /// 66 - /// tokio::select! { 67 - /// r = hydrant.run()? => r, 68 - /// r = hydrant.serve(3000) => r, 69 - /// } 70 - /// } 71 - /// ``` 72 - #[derive(Clone)] 73 - pub struct Hydrant { 74 - pub crawler: CrawlerHandle, 75 - pub firehose: FirehoseHandle, 76 - pub backfill: BackfillHandle, 77 - pub filter: FilterControl, 78 - pub repos: ReposControl, 79 - pub db: DbControl, 80 - #[cfg(feature = "backlinks")] 81 - pub backlinks: crate::backlinks::BacklinksControl, 82 - pub(crate) state: Arc<AppState>, 83 - config: Arc<Config>, 84 - started: Arc<AtomicBool>, 85 - _priv: (), 86 - } 87 - 88 - impl Hydrant { 89 - /// open the database and configure hydrant from `config`. 90 - /// 91 - /// this sets up the database, applies any filter configuration from `config`, and 92 - /// initializes all sub-handles. no background tasks are started yet: call 93 - /// [`run`](Self::run) to start all components and drive the instance. 94 - pub async fn new(config: Config) -> Result<Self> { 95 - info!("{config}"); 96 - 97 - // 1. open database and construct AppState 98 - let state = AppState::new(&config)?; 99 - 100 - // 2. 
apply any filter config from env variables 101 - if config.full_network 102 - || config.filter_signals.is_some() 103 - || config.filter_collections.is_some() 104 - || config.filter_excludes.is_some() 105 - { 106 - let filter_ks = state.db.filter.clone(); 107 - let inner = state.db.inner.clone(); 108 - let mode = config.full_network.then_some(FilterMode::Full); 109 - let signals = config.filter_signals.clone().map(SetUpdate::Set); 110 - let collections = config.filter_collections.clone().map(SetUpdate::Set); 111 - let excludes = config.filter_excludes.clone().map(SetUpdate::Set); 112 - 113 - tokio::task::spawn_blocking(move || { 114 - let mut batch = inner.batch(); 115 - db_filter::apply_patch( 116 - &mut batch, 117 - &filter_ks, 118 - mode, 119 - signals, 120 - collections, 121 - excludes, 122 - )?; 123 - batch.commit().into_diagnostic() 124 - }) 125 - .await 126 - .into_diagnostic()??; 127 - 128 - // 3. reload the live filter into the hot-path arc-swap 129 - let new_filter = tokio::task::spawn_blocking({ 130 - let filter_ks = state.db.filter.clone(); 131 - move || db_filter::load(&filter_ks) 132 - }) 133 - .await 134 - .into_diagnostic()??; 135 - state.filter.store(Arc::new(new_filter)); 136 - } 137 - 138 - // 4. 
set crawler enabled state from config, evaluated against the post-patch filter 139 - let post_patch_crawler = match config.enable_crawler { 140 - Some(b) => b, 141 - None => { 142 - state.filter.load().mode == FilterMode::Full || !config.crawler_sources.is_empty() 143 - } 144 - }; 145 - state.crawler_enabled.send_replace(post_patch_crawler); 146 - 147 - let state = Arc::new(state); 148 - 149 - Ok(Self { 150 - crawler: CrawlerHandle { 151 - state: state.clone(), 152 - shared: Arc::new(std::sync::OnceLock::new()), 153 - tasks: Arc::new(scc::HashMap::new()), 154 - persisted: Arc::new(scc::HashSet::new()), 155 - }, 156 - firehose: FirehoseHandle { 157 - state: state.clone(), 158 - shared: Arc::new(std::sync::OnceLock::new()), 159 - tasks: Arc::new(scc::HashMap::new()), 160 - persisted: Arc::new(scc::HashSet::new()), 161 - }, 162 - backfill: BackfillHandle(state.clone()), 163 - filter: FilterControl(state.clone()), 164 - repos: ReposControl(state.clone()), 165 - db: DbControl(state.clone()), 166 - #[cfg(feature = "backlinks")] 167 - backlinks: crate::backlinks::BacklinksControl(state.clone()), 168 - state, 169 - config: Arc::new(config), 170 - started: Arc::new(AtomicBool::new(false)), 171 - _priv: (), 172 - }) 173 - } 174 - 175 - /// reads config from environment variables and calls [`Hydrant::new`]. 176 - pub async fn from_env() -> Result<Self> { 177 - Self::new(Config::from_env()?).await 178 - } 179 - 180 - /// start all background components and return a future that resolves when any 181 - /// fatal component exits. 182 - /// 183 - /// starts the backfill worker, firehose ingestors, crawler, and worker thread. 184 - /// resolves with `Ok(())` if a fatal component exits cleanly, or `Err(e)` if it 185 - /// fails. intended for use in `tokio::select!` alongside [`serve`](Self::serve). 186 - /// 187 - /// returns an error if called more than once on the same `Hydrant` instance. 
188 - pub fn run(&self) -> Result<impl Future<Output = Result<()>>> { 189 - let state = self.state.clone(); 190 - let config = self.config.clone(); 191 - let crawler = self.crawler.clone(); 192 - let firehose = self.firehose.clone(); 193 - 194 - if self.started.swap(true, Ordering::SeqCst) { 195 - miette::bail!("Hydrant::run() called more than once"); 196 - } 197 - 198 - let fut = async move { 199 - // internal buffered channel between ingestors / backfill and the firehose worker 200 - let (buffer_tx, buffer_rx) = mpsc::unbounded_channel(); 201 - 202 - // 5. spawn the backfill worker 203 - tokio::spawn({ 204 - let state = state.clone(); 205 - BackfillWorker::new( 206 - state.clone(), 207 - buffer_tx.clone(), 208 - config.repo_fetch_timeout, 209 - config.backfill_concurrency_limit, 210 - matches!( 211 - config.verify_signatures, 212 - SignatureVerification::Full | SignatureVerification::BackfillOnly 213 - ), 214 - config.ephemeral, 215 - state.backfill_enabled.subscribe(), 216 - ) 217 - .run() 218 - }); 219 - 220 - // 6. re-queue any repos that lost their backfill state, then start the retry worker 221 - if let Err(e) = tokio::task::spawn_blocking({ 222 - let state = state.clone(); 223 - move || crate::backfill::manager::queue_gone_backfills(&state) 224 - }) 225 - .await 226 - .into_diagnostic()? 227 - { 228 - error!(err = %e, "failed to queue gone backfills"); 229 - db::check_poisoned_report(&e); 230 - } 231 - 232 - std::thread::spawn({ 233 - let state = state.clone(); 234 - move || crate::backfill::manager::retry_worker(state) 235 - }); 236 - 237 - // 7. ephemeral GC thread 238 - if config.ephemeral { 239 - let state = state.clone(); 240 - std::thread::Builder::new() 241 - .name("ephemeral-gc".into()) 242 - .spawn(move || crate::db::ephemeral::ephemeral_ttl_worker(state)) 243 - .into_diagnostic()?; 244 - } 245 - 246 - // 8. 
cursor / counts persist thread 247 - std::thread::spawn({ 248 - let state = state.clone(); 249 - let persist_interval = config.cursor_save_interval; 250 - move || loop { 251 - std::thread::sleep(persist_interval); 252 - 253 - state.relay_cursors.iter_sync(|relay, cursor| { 254 - let seq = cursor.load(Ordering::SeqCst); 255 - if seq > 0 { 256 - if let Err(e) = db::set_firehose_cursor(&state.db, relay, seq) { 257 - error!(relay = %relay, err = %e, "failed to save cursor"); 258 - db::check_poisoned_report(&e); 259 - } 260 - } 261 - true 262 - }); 263 - 264 - if let Err(e) = db::persist_counts(&state.db) { 265 - error!(err = %e, "failed to persist counts"); 266 - db::check_poisoned_report(&e); 267 - } 268 - 269 - if let Err(e) = state.db.persist() { 270 - error!(err = %e, "db persist failed"); 271 - db::check_poisoned_report(&e); 272 - } 273 - } 274 - }); 275 - 276 - // 9. events/sec stats ticker 277 - tokio::spawn({ 278 - let state = state.clone(); 279 - let mut last_id = state.db.next_event_id.load(Ordering::Relaxed); 280 - let mut last_time = std::time::Instant::now(); 281 - let mut interval = tokio::time::interval(std::time::Duration::from_secs(60)); 282 - async move { 283 - loop { 284 - interval.tick().await; 285 - 286 - let current_id = state.db.next_event_id.load(Ordering::Relaxed); 287 - let current_time = std::time::Instant::now(); 288 - let delta = current_id.saturating_sub(last_id); 289 - 290 - if delta == 0 { 291 - debug!("no new events in 60s"); 292 - continue; 293 - } 294 - 295 - let elapsed = current_time.duration_since(last_time).as_secs_f64(); 296 - let rate = if elapsed > 0.0 { 297 - delta as f64 / elapsed 298 - } else { 299 - 0.0 300 - }; 301 - info!("{rate:.2} events/s ({delta} events in {elapsed:.1}s)"); 302 - 303 - last_id = current_id; 304 - last_time = current_time; 305 - } 306 - } 307 - }); 308 - 309 - let (fatal_tx_inner, mut fatal_rx) = watch::channel(None); 310 - let fatal_tx = Arc::new(fatal_tx_inner); 311 - 312 - info!( 313 - 
crawler_enabled = *state.crawler_enabled.borrow(), 314 - firehose_enabled = *state.firehose_enabled.borrow(), 315 - filter_mode = ?state.filter.load().mode, 316 - "starting ingestion" 317 - ); 318 - 319 - // 10. set shared and spawn firehose ingestors 320 - firehose 321 - .shared 322 - .set(FirehoseShared { 323 - buffer_tx: buffer_tx.clone(), 324 - verify_signatures: matches!( 325 - config.verify_signatures, 326 - SignatureVerification::Full 327 - ), 328 - }) 329 - .ok() 330 - .expect("firehose shared already set"); 331 - let fire_shared = firehose.shared.get().unwrap(); 332 - 333 - let relay_hosts = config.relays.clone(); 334 - if !relay_hosts.is_empty() { 335 - info!( 336 - relay_count = relay_hosts.len(), 337 - hosts = relay_hosts 338 - .iter() 339 - .map(|h| h.as_str()) 340 - .collect::<Vec<_>>() 341 - .join(", "), 342 - "starting firehose ingestor(s)" 343 - ); 344 - for relay_url in &relay_hosts { 345 - let enabled_rx = state.firehose_enabled.subscribe(); 346 - let handle = 347 - spawn_firehose_ingestor(relay_url, &state, fire_shared, enabled_rx).await?; 348 - let _ = firehose.tasks.insert_async(relay_url.clone(), handle).await; 349 - } 350 - } 351 - 352 - let persisted_relay_urls = tokio::task::spawn_blocking({ 353 - let state = state.clone(); 354 - move || load_persisted_firehose_sources(&state.db) 355 - }) 356 - .await 357 - .into_diagnostic()??; 358 - 359 - for relay_url in &persisted_relay_urls { 360 - let _ = firehose.persisted.insert_async(relay_url.clone()).await; 361 - if firehose.tasks.contains_async(relay_url).await { 362 - continue; 363 - } 364 - let enabled_rx = state.firehose_enabled.subscribe(); 365 - let handle = 366 - spawn_firehose_ingestor(relay_url, &state, fire_shared, enabled_rx).await?; 367 - let _ = firehose.tasks.insert_async(relay_url.clone(), handle).await; 368 - } 369 - 370 - // 11. 
spawn crawler infrastructure (always, to support dynamic source management) 371 - { 372 - use crate::crawler::throttle::Throttler; 373 - use crate::crawler::{ 374 - CrawlerStats, CrawlerWorker, InFlight, RetryProducer, SignalChecker, 375 - }; 376 - 377 - let http = reqwest::Client::builder() 378 - .user_agent(concat!( 379 - env!("CARGO_PKG_NAME"), 380 - "/", 381 - env!("CARGO_PKG_VERSION") 382 - )) 383 - .gzip(true) 384 - .build() 385 - .expect("that reqwest will build"); 386 - let pds_throttler = Throttler::new(); 387 - let in_flight = InFlight::new(); 388 - let stats = CrawlerStats::new( 389 - state.clone(), 390 - config 391 - .crawler_sources 392 - .iter() 393 - .map(|s| s.url.clone()) 394 - .collect(), 395 - pds_throttler.clone(), 396 - ); 397 - let checker = SignalChecker { 398 - http: http.clone(), 399 - state: state.clone(), 400 - throttler: pds_throttler, 401 - }; 402 - 403 - info!( 404 - max_pending = config.crawler_max_pending_repos, 405 - resume_pending = config.crawler_resume_pending_repos, 406 - enabled = *state.crawler_enabled.borrow(), 407 - "starting crawler worker" 408 - ); 409 - let (worker, tx) = CrawlerWorker::new( 410 - state.clone(), 411 - config.crawler_max_pending_repos, 412 - config.crawler_resume_pending_repos, 413 - stats.clone(), 414 - ); 415 - tokio::spawn(async move { 416 - worker.run().await; 417 - error!("crawler worker exited unexpectedly, aborting"); 418 - std::process::abort(); 419 - }); 420 - 421 - let ticker = tokio::spawn(stats.clone().task()); 422 - tokio::spawn(async move { 423 - match ticker.await { 424 - Err(e) => error!(err = ?e, "stats ticker panicked, aborting"), 425 - Ok(()) => error!("stats ticker exited unexpectedly, aborting"), 426 - } 427 - std::process::abort(); 428 - }); 429 - 430 - tokio::spawn( 431 - RetryProducer { 432 - checker: checker.clone(), 433 - in_flight: in_flight.clone(), 434 - tx: tx.clone(), 435 - } 436 - .run(), 437 - ); 438 - 439 - // set shared objects so CrawlerHandle methods can use them 440 - 
crawler 441 - .shared 442 - .set(CrawlerShared { 443 - http, 444 - checker, 445 - in_flight, 446 - tx, 447 - stats, 448 - }) 449 - .ok() 450 - .expect("crawler shared already set"); 451 - let shared = crawler.shared.get().unwrap(); 452 - 453 - // spawn initial sources from config 454 - for source in config.crawler_sources.iter() { 455 - let enabled_rx = state.crawler_enabled.subscribe(); 456 - let handle = spawn_crawler_producer( 457 - source, 458 - &shared.http, 459 - &state, 460 - &shared.checker, 461 - &shared.in_flight, 462 - &shared.tx, 463 - &shared.stats, 464 - enabled_rx, 465 - ); 466 - let _ = crawler.tasks.insert_async(source.url.clone(), handle).await; 467 - } 468 - 469 - let persisted_sources = tokio::task::spawn_blocking({ 470 - let state = state.clone(); 471 - move || load_persisted_crawler_sources(&state.db) 472 - }) 473 - .await 474 - .into_diagnostic()??; 475 - 476 - for source in &persisted_sources { 477 - let _ = crawler.persisted.insert_async(source.url.clone()).await; 478 - if crawler.tasks.contains_async(&source.url).await { 479 - continue; 480 - } 481 - let enabled_rx = state.crawler_enabled.subscribe(); 482 - let handle = spawn_crawler_producer( 483 - source, 484 - &shared.http, 485 - &state, 486 - &shared.checker, 487 - &shared.in_flight, 488 - &shared.tx, 489 - &shared.stats, 490 - enabled_rx, 491 - ); 492 - let _ = crawler.tasks.insert_async(source.url.clone(), handle).await; 493 - } 494 - } 495 - 496 - // 12. 
spawn the firehose worker on a blocking thread (fatal task) 497 - let handle = tokio::runtime::Handle::current(); 498 - let firehose_worker = std::thread::spawn({ 499 - let state = state.clone(); 500 - move || { 501 - FirehoseWorker::new( 502 - state, 503 - buffer_rx, 504 - matches!(config.verify_signatures, SignatureVerification::Full), 505 - config.ephemeral, 506 - config.firehose_workers, 507 - ) 508 - .run(handle) 509 - } 510 - }); 511 - 512 - { 513 - let tx = Arc::clone(&fatal_tx); 514 - tokio::spawn( 515 - tokio::task::spawn_blocking(move || { 516 - firehose_worker 517 - .join() 518 - .map_err(|e| miette::miette!("buffer processor died: {e:?}")) 519 - }) 520 - .map(move |r| { 521 - let result = r.into_diagnostic().flatten().flatten(); 522 - let _ = tx.send(Some(result.map_err(|e| e.to_string()))); 523 - }), 524 - ); 525 - } 526 - 527 - // drop the local fatal_tx so the watch channel is only kept alive by the 528 - // spawned tasks. when all fatal tasks exit (and drop their tx clones), 529 - // fatal_rx.changed() returns Err and we return Ok(()). 530 - drop(fatal_tx); 531 - 532 - loop { 533 - match fatal_rx.changed().await { 534 - Ok(()) => { 535 - if let Some(result) = fatal_rx.borrow().clone() { 536 - return result.map_err(|s| miette::miette!("{s}")); 537 - } 538 - } 539 - // all fatal_tx clones dropped: all tasks finished cleanly 540 - Err(_) => return Ok(()), 541 - } 542 - } 543 - }; 544 - Ok(fut) 545 - } 546 - 547 - /// subscribe to the ordered event stream. 548 - /// 549 - /// returns an [`EventStream`] that implements [`futures::Stream`]. 550 - /// 551 - /// - if `cursor` is `None`, streaming starts from the current head (live tail only). 552 - /// - if `cursor` is `Some(id)`, all persisted `record` events from that ID onward are 553 - /// replayed first, then live events follow seamlessly. 554 - /// 555 - /// `identity` and `account` events are ephemeral and are never replayed from a cursor - 556 - /// only live occurrences are delivered. 
use [`ReposControl::get`] to fetch current 557 - /// identity/account state for a specific DID. 558 - /// 559 - /// multiple concurrent subscribers each receive a full independent copy of the stream. 560 - /// the stream ends when the `EventStream` is dropped. 561 - pub fn subscribe(&self, cursor: Option<u64>) -> EventStream { 562 - let (tx, rx) = mpsc::channel(500); 563 - let state = self.state.clone(); 564 - let runtime = tokio::runtime::Handle::current(); 565 - 566 - std::thread::Builder::new() 567 - .name("hydrant-stream".into()) 568 - .spawn(move || { 569 - let _g = runtime.enter(); 570 - event_stream_thread(state, tx, cursor); 571 - }) 572 - .expect("failed to spawn stream thread"); 573 - 574 - EventStream(rx) 575 - } 576 - 577 - /// return database counts and on-disk sizes for all keyspaces. 578 - /// 579 - /// counts include: `repos`, `pending`, `resync`, `records`, `blocks`, `events`, 580 - /// `error_ratelimited`, `error_transport`, `error_generic`. 581 - /// 582 - /// sizes are in bytes, reported per keyspace. 
583 - pub async fn stats(&self) -> Result<StatsResponse> { 584 - let db = self.state.db.clone(); 585 - 586 - let mut counts: BTreeMap<&'static str, u64> = futures::future::join_all( 587 - [ 588 - "repos", 589 - "pending", 590 - "resync", 591 - "records", 592 - "blocks", 593 - "error_ratelimited", 594 - "error_transport", 595 - "error_generic", 596 - ] 597 - .into_iter() 598 - .map(|name| { 599 - let db = db.clone(); 600 - async move { (name, db.get_count(name).await) } 601 - }), 602 - ) 603 - .await 604 - .into_iter() 605 - .collect(); 606 - 607 - counts.insert("events", db.events.approximate_len() as u64); 608 - 609 - let sizes = tokio::task::spawn_blocking(move || { 610 - let mut s = BTreeMap::new(); 611 - s.insert("repos", db.repos.disk_space()); 612 - s.insert("records", db.records.disk_space()); 613 - s.insert("blocks", db.blocks.disk_space()); 614 - s.insert("cursors", db.cursors.disk_space()); 615 - s.insert("pending", db.pending.disk_space()); 616 - s.insert("resync", db.resync.disk_space()); 617 - s.insert("resync_buffer", db.resync_buffer.disk_space()); 618 - s.insert("events", db.events.disk_space()); 619 - s.insert("counts", db.counts.disk_space()); 620 - s.insert("filter", db.filter.disk_space()); 621 - s.insert("crawler", db.crawler.disk_space()); 622 - s 623 - }) 624 - .await 625 - .into_diagnostic()?; 626 - 627 - Ok(StatsResponse { counts, sizes }) 628 - } 629 - 630 - /// returns a future that runs the HTTP management API server on `0.0.0.0:{port}`. 631 - /// 632 - /// the server exposes all management endpoints (`/filter`, `/repos`, `/ingestion`, 633 - /// `/stream`, `/stats`, `/db/*`, `/xrpc/*`). it runs indefinitely and resolves 634 - /// only on error. 635 - /// 636 - /// intended for `tokio::spawn` or inclusion in a `select!` / task list. the clone 637 - /// of `self` is deferred until the future is first polled. 638 - /// 639 - /// to disable the HTTP API entirely, simply don't call this method. 
640 - pub fn serve(&self, port: u16) -> impl Future<Output = Result<()>> { 641 - let hydrant = self.clone(); 642 - async move { crate::api::serve(hydrant, port).await } 643 - } 644 - 645 - /// returns a future that runs the debug HTTP API server on `127.0.0.1:{port}`. 646 - /// 647 - /// exposes internal inspection endpoints (`/debug/get`, `/debug/iter`, etc.) 648 - /// that are not safe to expose publicly. binds only to loopback. 649 - pub fn serve_debug(&self, port: u16) -> impl Future<Output = Result<()>> { 650 - let state = self.state.clone(); 651 - async move { crate::api::serve_debug(state, port).await } 652 - } 653 - } 654 - 655 - impl axum::extract::FromRef<Hydrant> for Arc<AppState> { 656 - fn from_ref(h: &Hydrant) -> Self { 657 - h.state.clone() 658 - } 659 - } 660 - 661 - /// a stream of [`Event`]s. returned by [`Hydrant::subscribe`]. 662 - /// 663 - /// implements [`futures::Stream`] and can be used with `StreamExt::next`, 664 - /// `while let Some(evt) = stream.next().await`, `forward`, etc. 665 - /// the stream terminates when the underlying channel closes (i.e. hydrant shuts down). 666 - pub struct EventStream(mpsc::Receiver<Event>); 667 - 668 - impl Stream for EventStream { 669 - type Item = Event; 670 - 671 - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> { 672 - self.0.poll_recv(cx) 673 - } 674 - } 675 - 676 - /// database statistics returned by [`Hydrant::stats`]. 677 - #[derive(serde::Serialize)] 678 - pub struct StatsResponse { 679 - /// record counts per logical category (repos, records, events, error kinds, etc.) 
680 - pub counts: BTreeMap<&'static str, u64>, 681 - /// on-disk size in bytes per keyspace 682 - pub sizes: BTreeMap<&'static str, u64>, 683 - } 684 - 685 - struct ProducerHandle { 686 - mode: crate::config::CrawlerMode, 687 - abort: tokio::task::AbortHandle, 688 - } 689 - 690 - impl Drop for ProducerHandle { 691 - fn drop(&mut self) { 692 - self.abort.abort(); 693 - } 694 - } 695 - 696 - struct CrawlerShared { 697 - http: reqwest::Client, 698 - checker: crate::crawler::SignalChecker, 699 - in_flight: crate::crawler::InFlight, 700 - tx: mpsc::Sender<crate::crawler::CrawlerBatch>, 701 - stats: crate::crawler::CrawlerStats, 702 - } 703 - 704 - /// a snapshot of a single crawler source's runtime state. 705 - #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] 706 - pub struct CrawlerSourceInfo { 707 - pub url: Url, 708 - pub mode: crate::config::CrawlerMode, 709 - /// whether this source is persisted in the database (i.e. it was dynamically added 710 - /// and will survive restarts). config-sourced entries have `persisted: false`. 
711 - pub persisted: bool, 712 - } 713 - 714 - fn spawn_crawler_producer( 715 - source: &crate::config::CrawlerSource, 716 - http: &reqwest::Client, 717 - state: &Arc<AppState>, 718 - checker: &crate::crawler::SignalChecker, 719 - in_flight: &crate::crawler::InFlight, 720 - tx: &mpsc::Sender<crate::crawler::CrawlerBatch>, 721 - stats: &crate::crawler::CrawlerStats, 722 - enabled: watch::Receiver<bool>, 723 - ) -> ProducerHandle { 724 - use crate::config::CrawlerMode; 725 - use crate::crawler::{ByCollectionProducer, RelayProducer}; 726 - use std::time::Duration; 727 - use tracing::Instrument; 728 - 729 - let abort = match source.mode { 730 - CrawlerMode::Relay => { 731 - info!(relay = %source.url, enabled = *state.crawler_enabled.borrow(), "starting relay crawler"); 732 - let span = tracing::info_span!("crawl", url = %source.url); 733 - tokio::spawn( 734 - RelayProducer { 735 - relay_url: source.url.clone(), 736 - checker: checker.clone(), 737 - in_flight: in_flight.clone(), 738 - tx: tx.clone(), 739 - enabled, 740 - stats: stats.clone(), 741 - } 742 - .run() 743 - .instrument(span), 744 - ) 745 - .abort_handle() 746 - } 747 - CrawlerMode::ByCollection => { 748 - info!( 749 - host = source.url.host_str(), 750 - enabled = *state.crawler_enabled.borrow(), 751 - "starting by-collection crawler" 752 - ); 753 - let span = tracing::info_span!("by_collection", host = source.url.host_str()); 754 - let http = http.clone(); 755 - let state = state.clone(); 756 - let in_flight = in_flight.clone(); 757 - let tx = tx.clone(); 758 - let stats = stats.clone(); 759 - let url = source.url.clone(); 760 - tokio::spawn( 761 - async move { 762 - loop { 763 - let producer = ByCollectionProducer { 764 - index_url: url.clone(), 765 - http: http.clone(), 766 - state: state.clone(), 767 - in_flight: in_flight.clone(), 768 - tx: tx.clone(), 769 - enabled: enabled.clone(), 770 - stats: stats.clone(), 771 - }; 772 - if let Err(e) = producer.run().await { 773 - error!(err = ?e, "by-collection 
crawler fatal error, restarting in 30s"); 774 - tokio::time::sleep(Duration::from_secs(30)).await; 775 - } 776 - } 777 - } 778 - .instrument(span), 779 - ) 780 - .abort_handle() 781 - } 782 - }; 783 - ProducerHandle { 784 - mode: source.mode, 785 - abort, 786 - } 787 - } 788 - 789 - async fn spawn_firehose_ingestor( 790 - relay_url: &Url, 791 - state: &Arc<AppState>, 792 - shared: &FirehoseShared, 793 - enabled: watch::Receiver<bool>, 794 - ) -> Result<FirehoseIngestorHandle> { 795 - use std::sync::atomic::AtomicI64; 796 - 797 - let start = db::get_firehose_cursor(&state.db, relay_url).await?; 798 - // insert into relay_cursors if not already present; existing in-memory cursor takes precedence 799 - let _ = state 800 - .relay_cursors 801 - .insert_async(relay_url.clone(), AtomicI64::new(start.unwrap_or(0))) 802 - .await; 803 - 804 - info!(relay = %relay_url, cursor = ?start, "starting firehose ingestor"); 805 - 806 - let ingestor = FirehoseIngestor::new( 807 - state.clone(), 808 - shared.buffer_tx.clone(), 809 - relay_url.clone(), 810 - state.filter.clone(), 811 - enabled, 812 - shared.verify_signatures, 813 - ); 814 - 815 - let relay_for_log = relay_url.clone(); 816 - let abort = tokio::spawn(async move { 817 - if let Err(e) = ingestor.run().await { 818 - error!(relay = %relay_for_log, err = %e, "firehose ingestor exited with error"); 819 - } 820 - }) 821 - .abort_handle(); 822 - 823 - Ok(FirehoseIngestorHandle { abort }) 824 - } 825 - 826 - /// runtime control over the crawler component. 827 - /// 828 - /// the crawler walks `com.atproto.sync.listRepos` on each configured relay to discover 829 - /// repositories that have never emitted a firehose event. in `filter` mode it also 830 - /// checks each discovered repo against the configured signal collections before 831 - /// enqueuing it for backfill. 832 - /// 833 - /// disabling the crawler does not affect in-progress repo checks. each one completes 834 - /// its current PDS request before pausing. 
835 - #[derive(Clone)] 836 - pub struct CrawlerHandle { 837 - state: Arc<AppState>, 838 - /// set once by [`Hydrant::run`]; `None` means run() has not been called yet. 839 - shared: Arc<std::sync::OnceLock<CrawlerShared>>, 840 - /// per-source running tasks, keyed by url. 841 - tasks: Arc<scc::HashMap<Url, ProducerHandle>>, 842 - /// set of urls persisted in the database (dynamically added sources). 843 - persisted: Arc<scc::HashSet<Url>>, 844 - } 845 - 846 - impl CrawlerHandle { 847 - /// enable the crawler (enables all configured producers). no-op if already enabled. 848 - pub fn enable(&self) { 849 - self.state.crawler_enabled.send_replace(true); 850 - } 851 - /// disable the crawler (disables all configured producers). 852 - /// in-progress repo checks finish before the crawler pauses. 853 - pub fn disable(&self) { 854 - self.state.crawler_enabled.send_replace(false); 855 - } 856 - /// returns the current enabled state of the crawler. 857 - pub fn is_enabled(&self) -> bool { 858 - *self.state.crawler_enabled.borrow() 859 - } 860 - 861 - /// delete all cursor entries associated with the given URL. 862 - pub async fn reset_cursor(&self, url: &str) -> Result<()> { 863 - let db = self.state.db.clone(); 864 - let point_keys = [keys::crawler_cursor_key(url)]; 865 - let by_collection_prefix = keys::by_collection_cursor_prefix(url); 866 - tokio::task::spawn_blocking(move || { 867 - let mut batch = db.inner.batch(); 868 - for k in point_keys { 869 - batch.remove(&db.cursors, k); 870 - } 871 - for entry in db.cursors.prefix(&by_collection_prefix) { 872 - let k = entry.key().into_diagnostic()?; 873 - batch.remove(&db.cursors, k); 874 - } 875 - batch.commit().into_diagnostic() 876 - }) 877 - .await 878 - .into_diagnostic()??; 879 - Ok(()) 880 - } 881 - 882 - /// return info on all currently active crawler sources. 883 - /// 884 - /// returns an empty list if called before [`Hydrant::run`]. 
885 - pub async fn list_sources(&self) -> Vec<CrawlerSourceInfo> { 886 - let mut sources = Vec::new(); 887 - self.tasks 888 - .iter_async(|url, h| { 889 - sources.push(CrawlerSourceInfo { 890 - url: url.clone(), 891 - mode: h.mode, 892 - persisted: self.persisted.contains_sync(url), 893 - }); 894 - true 895 - }) 896 - .await; 897 - sources 898 - } 899 - 900 - /// add a new crawler source at runtime. 901 - /// 902 - /// the source is persisted to the database and will be re-spawned on restart. 903 - /// if a source with the same URL already exists, it is replaced (the old task is 904 - /// aborted and a new one is started with the new mode). 905 - /// 906 - /// returns an error if called before [`Hydrant::run`]. 907 - pub async fn add_source(&self, source: crate::config::CrawlerSource) -> Result<()> { 908 - let Some(shared) = self.shared.get() else { 909 - miette::bail!("crawler not yet started: call Hydrant::run() first"); 910 - }; 911 - 912 - let db = self.state.db.clone(); 913 - let key = keys::crawler_source_key(source.url.as_str()); 914 - let val = rmp_serde::to_vec(&source.mode).into_diagnostic()?; 915 - tokio::task::spawn_blocking(move || db.crawler.insert(key, val).into_diagnostic()) 916 - .await 917 - .into_diagnostic()??; 918 - 919 - let enabled_rx = self.state.crawler_enabled.subscribe(); 920 - let handle = spawn_crawler_producer( 921 - &source, 922 - &shared.http, 923 - &self.state, 924 - &shared.checker, 925 - &shared.in_flight, 926 - &shared.tx, 927 - &shared.stats, 928 - enabled_rx, 929 - ); 930 - 931 - let _ = self.persisted.insert_async(source.url.clone()).await; 932 - match self.tasks.entry_async(source.url).await { 933 - scc::hash_map::Entry::Vacant(e) => { 934 - e.insert_entry(handle); 935 - } 936 - scc::hash_map::Entry::Occupied(mut e) => { 937 - *e.get_mut() = handle; 938 - } 939 - } 940 - Ok(()) 941 - } 942 - 943 - /// remove a crawler source at runtime by URL. 
944 - /// 945 - /// aborts the running producer task and removes the source from the database if it 946 - /// was dynamically added. config-sourced entries are aborted but not persisted, so 947 - /// they will reappear on restart. 948 - /// 949 - /// returns `true` if a source with the given URL was found and removed. 950 - /// returns an error if called before [`Hydrant::run`]. 951 - pub async fn remove_source(&self, url: &Url) -> Result<bool> { 952 - if self.shared.get().is_none() { 953 - miette::bail!("crawler not yet started: call Hydrant::run() first"); 954 - } 955 - 956 - // dropping the ProducerHandle aborts the task via Drop 957 - if self.tasks.remove_async(url).await.is_none() { 958 - return Ok(false); 959 - } 960 - 961 - // remove from DB if it was a persisted source 962 - if self.persisted.remove_async(url).await.is_some() { 963 - let db = self.state.db.clone(); 964 - let key = keys::crawler_source_key(url.as_str()); 965 - tokio::task::spawn_blocking(move || db.crawler.remove(key).into_diagnostic()) 966 - .await 967 - .into_diagnostic()??; 968 - } 969 - 970 - Ok(true) 971 - } 972 - } 973 - 974 - struct FirehoseIngestorHandle { 975 - abort: tokio::task::AbortHandle, 976 - } 977 - 978 - impl Drop for FirehoseIngestorHandle { 979 - fn drop(&mut self) { 980 - self.abort.abort(); 981 - } 982 - } 983 - 984 - struct FirehoseShared { 985 - buffer_tx: crate::ingest::BufferTx, 986 - verify_signatures: bool, 987 - } 988 - 989 - /// a snapshot of a single firehose relay's runtime state. 990 - #[derive(Debug, Clone, serde::Serialize)] 991 - pub struct FirehoseSourceInfo { 992 - pub url: Url, 993 - /// true if added via the API and persisted to the database; false for `RELAY_HOSTS` sources. 994 - pub persisted: bool, 995 - } 996 - 997 - /// runtime control over the firehose ingestor component. 998 - #[derive(Clone)] 999 - pub struct FirehoseHandle { 1000 - state: Arc<AppState>, 1001 - /// set once by [`Hydrant::run`]; `None` means run() has not been called yet. 
1002 - shared: Arc<std::sync::OnceLock<FirehoseShared>>, 1003 - /// per-relay running tasks, keyed by url. 1004 - tasks: Arc<scc::HashMap<Url, FirehoseIngestorHandle>>, 1005 - /// set of urls persisted in the database (dynamically added sources). 1006 - persisted: Arc<scc::HashSet<Url>>, 1007 - } 1008 - 1009 - impl FirehoseHandle { 1010 - /// enable the firehose. no-op if already enabled. 1011 - pub fn enable(&self) { 1012 - self.state.firehose_enabled.send_replace(true); 1013 - } 1014 - /// disable the firehose. the current message finishes processing before the connection closes. 1015 - pub fn disable(&self) { 1016 - self.state.firehose_enabled.send_replace(false); 1017 - } 1018 - /// returns the current enabled state of the firehose. 1019 - pub fn is_enabled(&self) -> bool { 1020 - *self.state.firehose_enabled.borrow() 1021 - } 1022 - 1023 - /// reset the stored cursor for the given relay URL. 1024 - /// 1025 - /// clears the `firehose_cursor|{url}` entry from the cursors keyspace and zeroes the 1026 - /// in-memory cursor. the next connection will tail live events from the current head. 1027 - pub async fn reset_cursor(&self, url: &str) -> Result<()> { 1028 - let db = self.state.db.clone(); 1029 - let key = keys::firehose_cursor_key(url); 1030 - tokio::task::spawn_blocking(move || db.cursors.remove(key).into_diagnostic()) 1031 - .await 1032 - .into_diagnostic()??; 1033 - 1034 - if let Ok(relay_url) = Url::parse(url) { 1035 - self.state.relay_cursors.peek_with(&relay_url, |_, c| { 1036 - c.store(0, std::sync::atomic::Ordering::SeqCst); 1037 - }); 1038 - } 1039 - Ok(()) 1040 - } 1041 - 1042 - /// return info on all currently active firehose sources. 
1043 - pub async fn list_sources(&self) -> Vec<FirehoseSourceInfo> { 1044 - let mut sources = Vec::new(); 1045 - self.tasks 1046 - .iter_async(|url, _| { 1047 - sources.push(FirehoseSourceInfo { 1048 - url: url.clone(), 1049 - persisted: self.persisted.contains_sync(url), 1050 - }); 1051 - true 1052 - }) 1053 - .await; 1054 - sources 1055 - } 1056 - 1057 - /// add a new firehose relay at runtime. 1058 - /// 1059 - /// the URL is persisted to the database and will be re-spawned on restart. if a relay with 1060 - /// the same URL already exists it is replaced: the running task is stopped and a new one 1061 - /// is started. any cursor state for that URL is preserved. 1062 - /// 1063 - /// returns an error if called before [`Hydrant::run`]. 1064 - pub async fn add_source(&self, url: Url) -> Result<()> { 1065 - let Some(shared) = self.shared.get() else { 1066 - miette::bail!("firehose not yet started: call Hydrant::run() first"); 1067 - }; 1068 - 1069 - let db = self.state.db.clone(); 1070 - let key = keys::firehose_source_key(url.as_str()); 1071 - tokio::task::spawn_blocking(move || db.crawler.insert(key, b"").into_diagnostic()) 1072 - .await 1073 - .into_diagnostic()??; 1074 - 1075 - let enabled_rx = self.state.firehose_enabled.subscribe(); 1076 - let handle = spawn_firehose_ingestor(&url, &self.state, shared, enabled_rx).await?; 1077 - 1078 - let _ = self.persisted.insert_async(url.clone()).await; 1079 - match self.tasks.entry_async(url).await { 1080 - scc::hash_map::Entry::Vacant(e) => { 1081 - e.insert_entry(handle); 1082 - } 1083 - scc::hash_map::Entry::Occupied(mut e) => { 1084 - *e.get_mut() = handle; 1085 - } 1086 - } 1087 - Ok(()) 1088 - } 1089 - 1090 - /// remove a firehose relay at runtime by URL. 1091 - /// 1092 - /// aborts the running ingestor task. if the source was added via the API it is removed from 1093 - /// the database and will not reappear on restart. 
`RELAY_HOSTS` sources are only stopped for 1094 - /// the current session; they reappear on the next restart. 1095 - /// 1096 - /// returns `true` if the relay was found and removed, `false` if it was not running. 1097 - /// returns an error if called before [`Hydrant::run`]. 1098 - pub async fn remove_source(&self, url: &Url) -> Result<bool> { 1099 - if self.shared.get().is_none() { 1100 - miette::bail!("firehose not yet started: call Hydrant::run() first"); 1101 - } 1102 - 1103 - if self.tasks.remove_async(url).await.is_none() { 1104 - return Ok(false); 1105 - } 1106 - 1107 - // remove from relay_cursors (persist thread will stop tracking it) 1108 - self.state.relay_cursors.remove_async(url).await; 1109 - 1110 - if self.persisted.remove_async(url).await.is_some() { 1111 - let db = self.state.db.clone(); 1112 - let key = keys::firehose_source_key(url.as_str()); 1113 - tokio::task::spawn_blocking(move || db.crawler.remove(key).into_diagnostic()) 1114 - .await 1115 - .into_diagnostic()??; 1116 - } 1117 - 1118 - Ok(true) 1119 - } 1120 - } 1121 - 1122 - /// runtime control over the backfill worker component. 1123 - /// 1124 - /// the backfill worker fetches full repo CAR files from each repo's PDS for any 1125 - /// repository in the pending queue, parses the MST, and inserts all matching records 1126 - /// into the database. concurrency is bounded by `HYDRANT_BACKFILL_CONCURRENCY_LIMIT`. 1127 - #[derive(Clone)] 1128 - pub struct BackfillHandle(Arc<AppState>); 1129 - 1130 - impl BackfillHandle { 1131 - /// enable the backfill worker, no-op if already enabled. 1132 - pub fn enable(&self) { 1133 - self.0.backfill_enabled.send_replace(true); 1134 - } 1135 - /// disable the backfill worker, in-flight repos complete before pausing. 1136 - pub fn disable(&self) { 1137 - self.0.backfill_enabled.send_replace(false); 1138 - } 1139 - /// returns the current enabled state of the backfill worker. 
1140 - pub fn is_enabled(&self) -> bool { 1141 - *self.0.backfill_enabled.borrow() 1142 - } 1143 - } 1144 - 1145 - /// a point-in-time snapshot of the filter configuration. returned by all [`FilterControl`] methods. 1146 - /// 1147 - /// because the filter is stored in the database and loaded on demand, this snapshot 1148 - /// may be stale if another caller modifies the filter concurrently. for the authoritative 1149 - /// live config use [`FilterControl::get`]. 1150 - #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] 1151 - pub struct FilterSnapshot { 1152 - pub mode: FilterMode, 1153 - pub signals: Vec<String>, 1154 - pub collections: Vec<String>, 1155 - pub excludes: Vec<String>, 1156 - } 1157 - 1158 - /// runtime control over the indexing filter. 1159 - /// 1160 - /// the filter has two orthogonal axes: 1161 - /// 1162 - /// **mode** controls discovery: 1163 - /// - [`FilterMode::Filter`]: only indexes repos whose firehose commits touch a collection 1164 - /// matching a configured `signal`. explicit [`ReposControl::track`] always works regardless. 1165 - /// - [`FilterMode::Full`]: indexes the entire network. `signals` are ignored for discovery 1166 - /// but `collections` and `excludes` still apply. 1167 - /// 1168 - /// **sets** are each independently configurable: 1169 - /// - `signals`: NSID patterns that trigger auto-discovery in `filter` mode (e.g. `app.bsky.feed.post`, `app.bsky.graph.*`) 1170 - /// - `collections`: NSID patterns that filter which records are *stored*. empty means store all. 1171 - /// - `excludes`: DIDs that are always skipped regardless of mode. 1172 - /// 1173 - /// NSID patterns support an optional `.*` suffix to match an entire namespace. 1174 - /// all mutations are persisted to the database and take effect immediately. 1175 - #[derive(Clone)] 1176 - pub struct FilterControl(Arc<AppState>); 1177 - 1178 - impl FilterControl { 1179 - /// return the current filter configuration from the database. 
1180 - pub async fn get(&self) -> Result<FilterSnapshot> { 1181 - let filter_ks = self.0.db.filter.clone(); 1182 - tokio::task::spawn_blocking(move || { 1183 - let hot = db_filter::load(&filter_ks)?; 1184 - let excludes = db_filter::read_set(&filter_ks, db_filter::EXCLUDE_PREFIX)?; 1185 - Ok(FilterSnapshot { 1186 - mode: hot.mode, 1187 - signals: hot.signals.iter().map(|s| s.to_string()).collect(), 1188 - collections: hot.collections.iter().map(|s| s.to_string()).collect(), 1189 - excludes, 1190 - }) 1191 - }) 1192 - .await 1193 - .into_diagnostic()? 1194 - } 1195 - 1196 - /// set the indexing mode. see [`FilterControl`] for mode semantics. 1197 - pub fn set_mode(&self, mode: FilterMode) -> FilterPatch { 1198 - FilterPatch::new(self).set_mode(mode) 1199 - } 1200 - 1201 - /// replace the entire signals set. existing signals are removed. 1202 - pub fn set_signals(&self, signals: impl IntoIterator<Item = impl Into<String>>) -> FilterPatch { 1203 - FilterPatch::new(self).set_signals(signals) 1204 - } 1205 - 1206 - /// add multiple signals without disturbing existing ones. 1207 - pub fn append_signals( 1208 - &self, 1209 - signals: impl IntoIterator<Item = impl Into<String>>, 1210 - ) -> FilterPatch { 1211 - FilterPatch::new(self).append_signals(signals) 1212 - } 1213 - 1214 - /// add a single signal. no-op if already present. 1215 - pub fn add_signal(&self, signal: impl Into<String>) -> FilterPatch { 1216 - FilterPatch::new(self).add_signal(signal) 1217 - } 1218 - 1219 - /// remove a single signal. no-op if not present. 1220 - pub fn remove_signal(&self, signal: impl Into<String>) -> FilterPatch { 1221 - FilterPatch::new(self).remove_signal(signal) 1222 - } 1223 - 1224 - /// replace the entire collections set. pass an empty iterator to store all collections. 
pub fn set_collections(
    &self,
    collections: impl IntoIterator<Item = impl Into<String>>,
) -> FilterPatch {
    FilterPatch::new(self).set_collections(collections)
}

/// add multiple collections without disturbing existing ones.
pub fn append_collections(
    &self,
    collections: impl IntoIterator<Item = impl Into<String>>,
) -> FilterPatch {
    FilterPatch::new(self).append_collections(collections)
}

/// add a single collection filter. no-op if already present.
pub fn add_collection(&self, collection: impl Into<String>) -> FilterPatch {
    FilterPatch::new(self).add_collection(collection)
}

/// remove a single collection filter. no-op if not present.
pub fn remove_collection(&self, collection: impl Into<String>) -> FilterPatch {
    FilterPatch::new(self).remove_collection(collection)
}

/// replace the entire excludes set.
pub fn set_excludes(
    &self,
    excludes: impl IntoIterator<Item = impl Into<String>>,
) -> FilterPatch {
    FilterPatch::new(self).set_excludes(excludes)
}

/// add multiple DIDs to the excludes set without disturbing existing ones.
pub fn append_excludes(
    &self,
    excludes: impl IntoIterator<Item = impl Into<String>>,
) -> FilterPatch {
    FilterPatch::new(self).append_excludes(excludes)
}

/// add a single DID to the excludes set. no-op if already excluded.
pub fn add_exclude(&self, did: impl Into<String>) -> FilterPatch {
    FilterPatch::new(self).add_exclude(did)
}

/// remove a single DID from the excludes set. no-op if not present.
pub fn remove_exclude(&self, did: impl Into<String>) -> FilterPatch {
    FilterPatch::new(self).remove_exclude(did)
}
}

/// a staged set of filter mutations. all methods accumulate changes without touching
/// the database. call [`FilterPatch::apply`] to commit the entire patch atomically.
///
/// chained calls that target the same set are merged: `add_*`, `remove_*` and `append_*`
/// fold into whatever is already staged for that set (a later entry for the same item
/// wins), while `set_*` discards what was staged and replaces the whole set.
///
/// obtain an instance by calling any mutation method on [`FilterControl`], or via
/// [`FilterPatch::new`] to start from a blank patch.
pub struct FilterPatch {
    state: Arc<AppState>,
    /// if set, replaces the current indexing mode.
    pub mode: Option<FilterMode>,
    /// if set, replaces or patches the signals set.
    pub signals: Option<SetUpdate>,
    /// if set, replaces or patches the collections set.
    pub collections: Option<SetUpdate>,
    /// if set, replaces or patches the excludes set.
    pub excludes: Option<SetUpdate>,
}

impl FilterPatch {
    /// create a new blank patch associated with the given [`FilterControl`].
    pub fn new(control: &FilterControl) -> Self {
        Self {
            state: control.0.clone(),
            mode: None,
            signals: None,
            collections: None,
            excludes: None,
        }
    }

    /// fold `changes` (item, `true` = add / `false` = remove) into `slot` instead of
    /// overwriting what an earlier call on the same patch staged there.
    ///
    /// - nothing staged: the changes become a new [`SetUpdate::Patch`].
    /// - a patch staged: the changes are added to it.
    /// - a full replacement staged: the changes are applied to the replacement set itself.
    fn stage(slot: &mut Option<SetUpdate>, changes: impl IntoIterator<Item = (String, bool)>) {
        match slot {
            Some(SetUpdate::Patch(staged)) => staged.extend(changes),
            Some(SetUpdate::Set(staged)) => {
                for (item, present) in changes {
                    // drop any existing copy first so an add never duplicates the item
                    staged.retain(|existing| *existing != item);
                    if present {
                        staged.extend([item]);
                    }
                }
            }
            None => *slot = Some(SetUpdate::Patch(changes.into_iter().collect())),
        }
    }

    /// set the indexing mode. see [`FilterControl`] for mode semantics.
    pub fn set_mode(mut self, mode: FilterMode) -> Self {
        self.mode = Some(mode);
        self
    }

    /// replace the entire signals set. existing signals are removed, as is anything
    /// previously staged for signals on this patch.
    pub fn set_signals(mut self, signals: impl IntoIterator<Item = impl Into<String>>) -> Self {
        self.signals = Some(SetUpdate::Set(
            signals.into_iter().map(Into::into).collect(),
        ));
        self
    }

    /// add multiple signals without disturbing existing ones.
    pub fn append_signals(mut self, signals: impl IntoIterator<Item = impl Into<String>>) -> Self {
        Self::stage(
            &mut self.signals,
            signals.into_iter().map(|s| (s.into(), true)),
        );
        self
    }

    /// add a single signal. no-op if already present.
    pub fn add_signal(mut self, signal: impl Into<String>) -> Self {
        Self::stage(&mut self.signals, [(signal.into(), true)]);
        self
    }

    /// remove a single signal. no-op if not present.
    pub fn remove_signal(mut self, signal: impl Into<String>) -> Self {
        Self::stage(&mut self.signals, [(signal.into(), false)]);
        self
    }

    /// replace the entire collections set. pass an empty iterator to store all collections.
    pub fn set_collections(
        mut self,
        collections: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.collections = Some(SetUpdate::Set(
            collections.into_iter().map(Into::into).collect(),
        ));
        self
    }

    /// add multiple collections without disturbing existing ones.
    pub fn append_collections(
        mut self,
        collections: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        Self::stage(
            &mut self.collections,
            collections.into_iter().map(|c| (c.into(), true)),
        );
        self
    }

    /// add a single collection filter. no-op if already present.
    pub fn add_collection(mut self, collection: impl Into<String>) -> Self {
        Self::stage(&mut self.collections, [(collection.into(), true)]);
        self
    }

    /// remove a single collection filter. no-op if not present.
    pub fn remove_collection(mut self, collection: impl Into<String>) -> Self {
        Self::stage(&mut self.collections, [(collection.into(), false)]);
        self
    }

    /// replace the entire excludes set.
    pub fn set_excludes(mut self, excludes: impl IntoIterator<Item = impl Into<String>>) -> Self {
        self.excludes = Some(SetUpdate::Set(
            excludes.into_iter().map(Into::into).collect(),
        ));
        self
    }

    /// add multiple DIDs to the excludes set without disturbing existing ones.
    pub fn append_excludes(
        mut self,
        excludes: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        Self::stage(
            &mut self.excludes,
            excludes.into_iter().map(|d| (d.into(), true)),
        );
        self
    }

    /// add a single DID to the excludes set. no-op if already excluded.
    pub fn add_exclude(mut self, did: impl Into<String>) -> Self {
        Self::stage(&mut self.excludes, [(did.into(), true)]);
        self
    }

    /// remove a single DID from the excludes set. no-op if not present.
    pub fn remove_exclude(mut self, did: impl Into<String>) -> Self {
        Self::stage(&mut self.excludes, [(did.into(), false)]);
        self
    }

    /// commit the patch atomically to the database and update the in-memory filter.
    /// returns the updated [`FilterSnapshot`].
    ///
    /// the whole patch goes into one batch. after the commit the filter and the excludes
    /// set are read back from the database, and the in-memory filter is swapped only once
    /// both reads succeeded.
    pub async fn apply(self) -> Result<FilterSnapshot> {
        let Self {
            state,
            mode,
            signals,
            collections,
            excludes,
        } = self;
        let filter_ks = state.db.filter.clone();
        let inner = state.db.inner.clone();

        // commit and read back in a single blocking task
        let (new_filter, exclude_list) = tokio::task::spawn_blocking(move || {
            let mut batch = inner.batch();
            db_filter::apply_patch(&mut batch, &filter_ks, mode, signals, collections, excludes)?;
            batch.commit().into_diagnostic()?;
            let reloaded = db_filter::load(&filter_ks)?;
            let exclude_list = db_filter::read_set(&filter_ks, db_filter::EXCLUDE_PREFIX)?;
            Ok::<_, miette::Report>((reloaded, exclude_list))
        })
        .await
        .into_diagnostic()??;

        let snapshot = FilterSnapshot {
            mode: new_filter.mode,
            signals: new_filter.signals.iter().map(|s| s.to_string()).collect(),
            collections: new_filter
                .collections
                .iter()
                .map(|s| s.to_string())
                .collect(),
            excludes: exclude_list,
        };

        state.filter.store(Arc::new(new_filter));
        Ok(snapshot)
    }
}

/// information about a tracked or known repository. returned by [`ReposControl`] methods.
#[derive(Debug, Clone, serde::Serialize)]
pub struct RepoInfo {
    /// the DID of the repository.
    pub did: Did<'static>,
    /// the status of the repository.
    #[serde(serialize_with = "crate::util::repo_status_serialize_str")]
    pub status: RepoStatus,
    /// whether this repository is tracked or not.
    /// untracked repositories are not updated and they stay frozen.
    pub tracked: bool,
    /// the revision of the root commit of this repository.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub rev: Option<Tid>,
    /// the CID of the root commit of this repository.
    #[serde(serialize_with = "crate::util::opt_cid_serialize_str")]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub data: Option<IpldCid>,
    /// the handle for the DID of this repository.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub handle: Option<Handle<'static>>,
    /// the URL of the PDS on which this repository is hosted.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub pds: Option<Url>,
    /// ATProto signing key of this repository.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub signing_key: Option<String>,
    /// when this repository was last touched (status update, commit ingested, etc.).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub last_updated_at: Option<DateTime<Utc>>,
    /// the time of the last message gotten from the firehose for this repository.
    /// this is equal to the `time` field.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub last_message_at: Option<DateTime<Utc>>,
}

/// control over which repositories are tracked and access to their state.
///
/// in `filter` mode, a repo is only indexed if it either matches a signal or is
/// explicitly tracked via [`ReposControl::track`]. in `full` mode all repos are indexed
/// and tracking is implicit.
///
/// tracking a DID that hydrant has never seen enqueues an immediate backfill.
/// tracking a DID that hydrant already knows about (but has marked untracked)
/// re-enqueues it for backfill.
#[derive(Clone)]
pub struct ReposControl(Arc<AppState>);

impl ReposControl {
    /// gets a handle for a repository to allow acting upon it.
1500 - pub fn get<'i>(&self, did: &Did<'i>) -> Result<RepoHandle<'i>> { 1501 - Ok(RepoHandle { 1502 - state: self.0.clone(), 1503 - did: did.clone(), 1504 - }) 1505 - } 1506 - 1507 - /// same as [`ReposControl::get`] but allows you to pass in an identifier that can be 1508 - /// either a handle or a DID. 1509 - pub async fn resolve(&self, repo: &AtIdentifier<'_>) -> Result<RepoHandle<'static>> { 1510 - let did = self.0.resolver.resolve_did(repo).await?; 1511 - Ok(RepoHandle { 1512 - state: self.0.clone(), 1513 - did, 1514 - }) 1515 - } 1516 - 1517 - /// fetch the current state of a single repository. returns `None` if hydrant 1518 - /// has never seen this DID. 1519 - pub async fn info(&self, did: &Did<'_>) -> Result<Option<RepoInfo>> { 1520 - self.get(did)?.info().await 1521 - } 1522 - 1523 - /// explicitly track one or more repositories, enqueuing them for backfill if needed. 1524 - /// 1525 - /// - if a DID is new, a fresh [`RepoState`] is created and backfill is queued. 1526 - /// - if a DID is already known but untracked, it is marked tracked and re-enqueued. 1527 - /// - if a DID is already tracked, this is a no-op. 
1528 - pub async fn track(&self, dids: impl IntoIterator<Item = Did<'_>>) -> Result<()> { 1529 - let dids: Vec<Did<'static>> = dids.into_iter().map(|d| d.into_static()).collect(); 1530 - let state = self.0.clone(); 1531 - 1532 - let (new_count, transitions) = tokio::task::spawn_blocking(move || { 1533 - let db = &state.db; 1534 - let mut batch = db.inner.batch(); 1535 - let mut added = 0i64; 1536 - let mut transitions: Vec<(GaugeState, GaugeState)> = Vec::new(); 1537 - let mut rng = rand::rng(); 1538 - 1539 - for did in &dids { 1540 - let did_key = keys::repo_key(did); 1541 - let repo_bytes = db.repos.get(&did_key).into_diagnostic()?; 1542 - let existing = repo_bytes 1543 - .as_deref() 1544 - .map(db::deser_repo_state) 1545 - .transpose()?; 1546 - 1547 - if let Some(mut repo_state) = existing { 1548 - if !repo_state.tracked { 1549 - let resync = db.resync.get(&did_key).into_diagnostic()?; 1550 - let old = db::Db::repo_gauge_state(&repo_state, resync.as_deref()); 1551 - repo_state.tracked = true; 1552 - batch.insert(&db.repos, &did_key, ser_repo_state(&repo_state)?); 1553 - batch.insert( 1554 - &db.pending, 1555 - keys::pending_key(repo_state.index_id), 1556 - &did_key, 1557 - ); 1558 - batch.remove(&db.resync, &did_key); 1559 - transitions.push((old, GaugeState::Pending)); 1560 - } 1561 - } else { 1562 - let repo_state = RepoState::backfilling(rng.next_u64()); 1563 - batch.insert(&db.repos, &did_key, ser_repo_state(&repo_state)?); 1564 - batch.insert( 1565 - &db.pending, 1566 - keys::pending_key(repo_state.index_id), 1567 - &did_key, 1568 - ); 1569 - added += 1; 1570 - transitions.push((GaugeState::Synced, GaugeState::Pending)); 1571 - } 1572 - } 1573 - 1574 - batch.commit().into_diagnostic()?; 1575 - Ok::<_, miette::Report>((added, transitions)) 1576 - }) 1577 - .await 1578 - .into_diagnostic()??; 1579 - 1580 - if new_count > 0 { 1581 - self.0.db.update_count_async("repos", new_count).await; 1582 - } 1583 - for (old, new) in transitions { 1584 - 
self.0.db.update_gauge_diff_async(&old, &new).await; 1585 - } 1586 - self.0.notify_backfill(); 1587 - Ok(()) 1588 - } 1589 - 1590 - /// stop tracking one or more repositories. hydrant will stop processing new events 1591 - /// for them and remove them from the pending/resync queues, but existing indexed 1592 - /// records are **not** deleted. 1593 - pub async fn untrack(&self, dids: impl IntoIterator<Item = Did<'_>>) -> Result<()> { 1594 - let dids: Vec<Did<'static>> = dids.into_iter().map(|d| d.into_static()).collect(); 1595 - let state = self.0.clone(); 1596 - 1597 - let gauge_decrements = tokio::task::spawn_blocking(move || { 1598 - let db = &state.db; 1599 - let mut batch = db.inner.batch(); 1600 - let mut gauge_decrements = Vec::new(); 1601 - 1602 - for did in &dids { 1603 - let did_key = keys::repo_key(did); 1604 - let repo_bytes = db.repos.get(&did_key).into_diagnostic()?; 1605 - let existing = repo_bytes 1606 - .as_deref() 1607 - .map(db::deser_repo_state) 1608 - .transpose()?; 1609 - 1610 - if let Some(repo_state) = existing { 1611 - if repo_state.tracked { 1612 - let resync = db.resync.get(&did_key).into_diagnostic()?; 1613 - let old = db::Db::repo_gauge_state(&repo_state, resync.as_deref()); 1614 - let mut repo_state = repo_state.into_static(); 1615 - repo_state.tracked = false; 1616 - batch.insert(&db.repos, &did_key, ser_repo_state(&repo_state)?); 1617 - batch.remove(&db.pending, keys::pending_key(repo_state.index_id)); 1618 - batch.remove(&db.resync, &did_key); 1619 - if old != GaugeState::Synced { 1620 - gauge_decrements.push(old); 1621 - } 1622 - } 1623 - } 1624 - } 1625 - 1626 - batch.commit().into_diagnostic()?; 1627 - Ok::<_, miette::Report>(gauge_decrements) 1628 - }) 1629 - .await 1630 - .into_diagnostic()??; 1631 - 1632 - for gauge in gauge_decrements { 1633 - self.0 1634 - .db 1635 - .update_gauge_diff_async(&gauge, &GaugeState::Synced) 1636 - .await; 1637 - } 1638 - Ok(()) 1639 - } 1640 - } 1641 - 1642 - pub(crate) fn repo_state_to_info(did: 
Did<'static>, s: RepoState<'_>) -> RepoInfo { 1643 - RepoInfo { 1644 - did, 1645 - status: s.status, 1646 - tracked: s.tracked, 1647 - rev: s.rev.map(|r| r.to_tid()), 1648 - data: s.data, 1649 - handle: s.handle.map(|h| h.into_static()), 1650 - pds: s.pds.and_then(|p| p.parse().ok()), 1651 - signing_key: s.signing_key.map(|k| k.encode()), 1652 - last_updated_at: DateTime::from_timestamp_secs(s.last_updated_at), 1653 - last_message_at: s.last_message_time.and_then(DateTime::from_timestamp_secs), 1654 - } 1655 - } 1656 - 1657 - /// control over database maintenance operations. 1658 - /// 1659 - /// all methods pause the crawler, firehose, and backfill worker for the duration 1660 - /// of the operation and restore their prior state on completion, whether or not 1661 - /// the operation succeeds. 1662 - #[derive(Clone)] 1663 - pub struct DbControl(Arc<AppState>); 1664 - 1665 - impl DbControl { 1666 - /// trigger a major compaction of all keyspaces in parallel. 1667 - /// 1668 - /// compaction reclaims disk space from deleted/updated keys and improves 1669 - /// read performance. can take several minutes on large datasets. 1670 - pub async fn compact(&self) -> Result<()> { 1671 - let state = self.0.clone(); 1672 - state 1673 - .with_ingestion_paused(async || state.db.compact().await) 1674 - .await 1675 - } 1676 - 1677 - /// train zstd compression dictionaries for the `repos`, `blocks`, and `events` keyspaces. 1678 - /// 1679 - /// dictionaries are written to `dict_{name}.bin` files next to the database. 1680 - /// a restart is required to apply them. training samples data blocks from the 1681 - /// existing database, so the database must have a reasonable amount of data first. 
pub async fn train_dicts(&self) -> Result<()> {
    let state = self.0.clone();
    state
        .with_ingestion_paused(async || {
            // one blocking training job per keyspace; all three are awaited together
            let train = |name: &'static str| {
                let db = state.db.clone();
                tokio::task::spawn_blocking(move || db.train_dict(name))
                    .map(|res| res.into_diagnostic().flatten())
            };
            tokio::try_join!(train("repos"), train("blocks"), train("events")).map(|_| ())
        })
        .await
}
}

/// a single record together with the repository it belongs to.
pub struct Record {
    /// the DID of the repository the record was read from.
    pub did: Did<'static>,
    /// the CID stored for the record.
    pub cid: Cid<'static>,
    /// the decoded record value.
    pub value: Data<'static>,
}

/// one entry of a [`RecordList`].
pub struct ListedRecord {
    /// the record key within the listed collection.
    pub rkey: Rkey<'static>,
    /// the CID stored for the record.
    pub cid: Cid<'static>,
    /// the decoded record value.
    pub value: Data<'static>,
}

/// one page of records returned by [`RepoHandle::list_records`].
pub struct RecordList {
    /// the records of this page, in iteration order.
    pub records: Vec<ListedRecord>,
    /// rkey to pass as `cursor` to fetch the next page; `None` when no further
    /// records were found.
    pub cursor: Option<Rkey<'static>>,
}

/// handle to access data related to this repository.
#[derive(Clone)]
pub struct RepoHandle<'i> {
    state: Arc<AppState>,
    pub did: Did<'i>,
}

impl<'i> RepoHandle<'i> {
    /// fetch the stored state of this repository. returns `None` when the repos keyspace
    /// has no entry for the DID.
    pub async fn info(&self) -> Result<Option<RepoInfo>> {
        let did_key = keys::repo_key(&self.did);
        let state = self.state.clone();
        let did = self.did.clone().into_static();

        tokio::task::spawn_blocking(move || {
            let bytes = state.db.repos.get(&did_key).into_diagnostic()?;
            let state = bytes.as_deref().map(db::deser_repo_state).transpose()?;
            Ok(state.map(|s| repo_state_to_info(did, s)))
        })
        .await
        .into_diagnostic()?
    }

    /// fetch a single record by collection and rkey.
    ///
    /// returns `Ok(None)` when the record index has no entry for the key. an index entry
    /// whose block is missing is reported as an error, as is a block or CID that fails
    /// to parse.
    pub async fn get_record(&self, collection: &str, rkey: &str) -> Result<Option<Record>> {
        let did = self.did.clone().into_static();
        let db_key = keys::record_key(&did, collection, &DbRkey::new(rkey));

        let collection = collection.to_smolstr();
        let state = self.state.clone();
        tokio::task::spawn_blocking(move || {
            use miette::WrapErr;

            let cid_bytes = state.db.records.get(db_key).into_diagnostic()?;
            let Some(cid_bytes) = cid_bytes else {
                return Ok(None);
            };

            // lookup block using col|cid key
            let block_key = keys::block_key(&collection, &cid_bytes);
            let Some(block_bytes) = state.db.blocks.get(block_key).into_diagnostic()? else {
                miette::bail!("block {cid_bytes:?} not found, this is a bug!!");
            };

            let value = serde_ipld_dagcbor::from_slice::<Data>(&block_bytes)
                .into_diagnostic()
                .wrap_err("cant parse block")?
                .into_static();
            let cid = Cid::new(&cid_bytes)
                .into_diagnostic()
                .wrap_err("cant parse block cid")?;
            let cid = Cid::Str(cid.to_cowstr().into_static());

            Ok(Some(Record { did, cid, value }))
        })
        .await
        .into_diagnostic()?
    }

    /// list up to `limit` records of `collection`, one page at a time.
    ///
    /// with `reverse == false` keys are walked from the end of the collection's key range
    /// towards its start; with `reverse == true` from the start towards the end.
    ///
    /// `cursor` is exclusive: iteration resumes strictly after the given rkey in the
    /// chosen direction. the returned [`RecordList::cursor`] is the last rkey this call
    /// consumed, and is only set when at least one more key exists beyond the page, so
    /// feeding it back continues without skipping or repeating a record. with
    /// `limit == 0` nothing is consumed and no cursor is returned.
    ///
    /// records whose block cannot be read are left out of the page; a block that fails
    /// to decode is returned with a `Data::Null` value.
    pub async fn list_records(
        &self,
        collection: &str,
        limit: usize,
        reverse: bool,
        cursor: Option<&str>,
    ) -> Result<RecordList> {
        let did = self.did.clone().into_static();

        let state = self.state.clone();
        let prefix = keys::record_prefix_collection(&did, collection);
        let collection = collection.to_smolstr();
        let cursor = cursor.map(|c| c.to_smolstr());

        tokio::task::spawn_blocking(move || {
            let mut results = Vec::new();
            let mut next_cursor = None;
            // rkey of the most recent key consumed from the iterator
            let mut last_rkey = None;

            // NOTE(review): the cursor is appended to the prefix as raw bytes, while
            // record keys are built through `DbRkey` — confirm both encodings agree.
            let iter: Box<dyn Iterator<Item = _>> = if !reverse {
                // exclusive upper bound: either the cursor key or the end of the prefix range
                let mut end_prefix = prefix.clone();
                if let Some(last) = end_prefix.last_mut() {
                    *last += 1;
                }

                let end_key = if let Some(cursor) = &cursor {
                    let mut k = prefix.clone();
                    k.extend_from_slice(cursor.as_bytes());
                    k
                } else {
                    end_prefix
                };

                Box::new(
                    state
                        .db
                        .records
                        .range(prefix.as_slice()..end_key.as_slice())
                        .rev(),
                )
            } else {
                let start_key = if let Some(cursor) = &cursor {
                    let mut k = prefix.clone();
                    k.extend_from_slice(cursor.as_bytes());
                    // the trailing zero byte makes the start bound sort after the cursor key
                    k.push(0);
                    k
                } else {
                    prefix.clone()
                };

                Box::new(state.db.records.range(start_key.as_slice()..))
            };

            for item in iter {
                let (key, cid_bytes) = item.into_inner().into_diagnostic()?;

                // the open-ended range can run past this collection's keys
                if !key.starts_with(prefix.as_slice()) {
                    break;
                }

                let rkey = keys::parse_rkey(&key[prefix.len()..])?;
                if results.len() >= limit {
                    // another key exists beyond the page. the cursor bounds above are
                    // exclusive, so hand out the last consumed rkey, not this unreturned
                    // one (which the next page would otherwise skip).
                    next_cursor = last_rkey.take();
                    break;
                }

                // look up using col|cid key built from collection and binary cid bytes
                if let Ok(Some(block_bytes)) = state
                    .db
                    .blocks
                    .get(&keys::block_key(collection.as_str(), &cid_bytes))
                {
                    let value: Data =
                        serde_ipld_dagcbor::from_slice(&block_bytes).unwrap_or(Data::Null);
                    let cid = Cid::new(&cid_bytes).into_diagnostic()?;
                    let cid = Cid::Str(cid.to_cowstr().into_static());
                    results.push(ListedRecord {
                        rkey: Rkey::new_cow(CowStr::Owned(rkey.to_smolstr()))
                            .expect("that rkey is validated"),
                        cid,
                        value: value.into_static(),
                    });
                }
                last_rkey = Some(rkey);
            }
            Result::<_, miette::Report>::Ok((results, next_cursor))
        })
        .await
        .into_diagnostic()?
        .map(|(records, next_cursor)| RecordList {
            records,
            cursor: next_cursor.map(|rkey| {
                Rkey::new_cow(CowStr::Owned(rkey.to_smolstr())).expect("that rkey is validated")
            }),
        })
    }

    /// return the record count that the database reports for `collection` in this repository.
    pub async fn count_records(&self, collection: &str) -> Result<u64> {
        let did = self.did.clone().into_static();
        let state = self.state.clone();
        let collection = collection.to_string();
        tokio::task::spawn_blocking(move || db::get_record_count(&state.db, &did, &collection))
            .await
            .into_diagnostic()?
    }
}

/// blocking worker behind the event stream: replays stored events, then follows live ones.
///
/// - `cursor`: when `Some(c)` the replay starts at event id `c` (id 1 when `c` is 0); when
///   `None` it starts at the database's next event id, so only new events are delivered.
/// - each pass reads the events keyspace from the last delivered id onwards and repeats
///   until a pass delivers nothing; then the thread blocks on the broadcast channel.
/// - a persisted or lagged notification triggers another catch-up pass; ephemeral events
///   are forwarded directly.
///
/// returns when the receiving side of `tx` is dropped or the broadcast channel is closed.
/// uses blocking send/recv calls throughout.
fn event_stream_thread(state: Arc<AppState>, tx: mpsc::Sender<Event>, cursor: Option<u64>) {
    let db = &state.db;
    let mut event_rx = db.event_tx.subscribe();
    let ks = db.events.clone();
    let mut current_id = match cursor {
        Some(c) => c.saturating_sub(1),
        None => db.next_event_id.load(Ordering::SeqCst).saturating_sub(1),
    };

    loop {
        // catch up from db
        loop {
            let mut found = false;
            for item in ks.range(keys::event_key(current_id + 1)..) {
                let (k, v) = match item.into_inner() {
                    Ok(kv) => kv,
                    Err(e) => {
                        error!(err = %e, "failed to read event from db");
                        break;
                    }
                };

                let id = match k.as_ref().try_into().map(u64::from_be_bytes) {
                    Ok(id) => id,
                    Err(_) => {
                        error!("failed to parse event id");
                        continue;
                    }
                };
                // advance even when the event below turns out to be undeliverable
                current_id = id;

                let stored: StoredEvent = match rmp_serde::from_slice(&v) {
                    Ok(e) => e,
                    Err(e) => {
                        error!(err = %e, "failed to deserialize stored event");
                        continue;
                    }
                };

                let Some(evt) = stored_to_event(&state, id, stored) else {
                    continue;
                };

                if tx.blocking_send(evt).is_err() {
                    return; // receiver dropped
                }
                found = true;
            }
            if !found {
                break;
            }
        }

        // wait for live events
        match event_rx.blocking_recv() {
            Ok(BroadcastEvent::Persisted(_)) => {} // re-run catch-up
            Ok(BroadcastEvent::Ephemeral(evt)) => {
                if tx.blocking_send(*evt).is_err() {
                    return;
                }
            }
            // missed notifications are recovered by the next catch-up pass over the db
            Err(tokio::sync::broadcast::error::RecvError::Lagged(_)) => {}
            Err(tokio::sync::broadcast::error::RecvError::Closed) => break,
        }
    }
}

/// turn a [`StoredEvent`] read from the events keyspace into a record [`Event`] with id `id`.
///
/// the record body comes from the blocks keyspace (`StoredData::Ptr`), from the bytes
/// stored inline with the event (`StoredData::Block`, whose CID is recomputed from a
/// sha256 of those bytes), or is absent (`StoredData::Nothing`).
///
/// returns `None` when the block is missing, cannot be read, cannot be decoded, or cannot
/// be converted to JSON; every case except the JSON conversion is logged.
fn stored_to_event(state: &AppState, id: u64, stored: StoredEvent<'_>) -> Option<Event> {
    let StoredEvent {
        live,
        did,
        rev,
        collection,
        rkey,
        action,
        data,
    } = stored;

    let record = match data {
        StoredData::Ptr(cid) => {
            let block = state
                .db
                .blocks
                .get(&keys::block_key(collection.as_str(), &cid.to_bytes()));
            match block {
                Ok(Some(bytes)) => match serde_ipld_dagcbor::from_slice::<RawData>(&bytes) {
                    Ok(val) => Some((cid, serde_json::to_value(val).ok()?)),
                    Err(e) => {
                        error!(err = %e, "cant parse block");
                        return None;
                    }
                },
                Ok(None) => {
                    error!("block not found, this is a bug");
                    return None;
                }
                Err(e) => {
                    error!(err = %e, "cant get block");
                    db::check_poisoned(&e);
                    return None;
                }
            }
        }
        StoredData::Block(block) => {
            // inline blocks carry no CID, so derive it from the block bytes
            let digest = Sha256::digest(&block);
            let hash =
                cid::multihash::Multihash::wrap(ATP_CID_HASH, &digest).expect("valid sha256 hash");
            let cid = IpldCid::new_v1(DAG_CBOR_CID_CODEC, hash);
            match serde_ipld_dagcbor::from_slice::<RawData>(&block) {
                Ok(val) => Some((cid, serde_json::to_value(val).ok()?)),
                Err(e) => {
                    error!(err = %e, "cant parse block");
                    return None;
                }
            }
        }
        StoredData::Nothing => None,
    };

    let (cid, record) = record
        .map(|(c, r)| (Some(c), Some(r)))
        .unwrap_or((None, None));

    Some(MarshallableEvt {
        id,
        kind: crate::types::EventType::Record,
        record: Some(RecordEvt {
            live,
            did: did.to_did(),
            rev: rev.to_tid(),
            collection: Nsid::new_cow(collection.clone().into_static())
                .expect("that collection is already validated"),
            rkey: Rkey::new_cow(rkey.to_cowstr().into_static())
                .expect("that rkey is already validated"),
            action: CowStr::Borrowed(action.as_str()),
            record,
            cid,
        }),
        identity: None,
        account: None,
    })
}
+261
src/control/crawler.rs
··· 1 + use std::sync::Arc; 2 + 3 + use miette::{IntoDiagnostic, Result}; 4 + use tokio::sync::{mpsc, watch}; 5 + use tracing::{error, info}; 6 + use url::Url; 7 + 8 + use crate::db::keys; 9 + use crate::state::AppState; 10 + 11 + pub(super) struct ProducerHandle { 12 + mode: crate::config::CrawlerMode, 13 + abort: tokio::task::AbortHandle, 14 + } 15 + 16 + impl Drop for ProducerHandle { 17 + fn drop(&mut self) { 18 + self.abort.abort(); 19 + } 20 + } 21 + 22 + pub(super) struct CrawlerShared { 23 + pub(super) http: reqwest::Client, 24 + pub(super) checker: crate::crawler::SignalChecker, 25 + pub(super) in_flight: crate::crawler::InFlight, 26 + pub(super) tx: mpsc::Sender<crate::crawler::CrawlerBatch>, 27 + pub(super) stats: crate::crawler::CrawlerStats, 28 + } 29 + 30 + /// a snapshot of a single crawler source's runtime state. 31 + #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] 32 + pub struct CrawlerSourceInfo { 33 + pub url: Url, 34 + pub mode: crate::config::CrawlerMode, 35 + /// whether this source is persisted in the database (i.e. it was dynamically added 36 + /// and will survive restarts). config-sourced entries have `persisted: false`. 
37 + pub persisted: bool, 38 + } 39 + 40 + pub(super) fn spawn_crawler_producer( 41 + source: &crate::config::CrawlerSource, 42 + http: &reqwest::Client, 43 + state: &Arc<AppState>, 44 + checker: &crate::crawler::SignalChecker, 45 + in_flight: &crate::crawler::InFlight, 46 + tx: &mpsc::Sender<crate::crawler::CrawlerBatch>, 47 + stats: &crate::crawler::CrawlerStats, 48 + enabled: watch::Receiver<bool>, 49 + ) -> ProducerHandle { 50 + use crate::config::CrawlerMode; 51 + use crate::crawler::{ByCollectionProducer, RelayProducer}; 52 + use std::time::Duration; 53 + use tracing::Instrument; 54 + 55 + let abort = match source.mode { 56 + CrawlerMode::Relay => { 57 + info!(relay = %source.url, enabled = *state.crawler_enabled.borrow(), "starting relay crawler"); 58 + let span = tracing::info_span!("crawl", url = %source.url); 59 + tokio::spawn( 60 + RelayProducer { 61 + relay_url: source.url.clone(), 62 + checker: checker.clone(), 63 + in_flight: in_flight.clone(), 64 + tx: tx.clone(), 65 + enabled, 66 + stats: stats.clone(), 67 + } 68 + .run() 69 + .instrument(span), 70 + ) 71 + .abort_handle() 72 + } 73 + CrawlerMode::ByCollection => { 74 + info!( 75 + host = source.url.host_str(), 76 + enabled = *state.crawler_enabled.borrow(), 77 + "starting by-collection crawler" 78 + ); 79 + let span = tracing::info_span!("by_collection", host = source.url.host_str()); 80 + let http = http.clone(); 81 + let state = state.clone(); 82 + let in_flight = in_flight.clone(); 83 + let tx = tx.clone(); 84 + let stats = stats.clone(); 85 + let url = source.url.clone(); 86 + tokio::spawn( 87 + async move { 88 + loop { 89 + let producer = ByCollectionProducer { 90 + index_url: url.clone(), 91 + http: http.clone(), 92 + state: state.clone(), 93 + in_flight: in_flight.clone(), 94 + tx: tx.clone(), 95 + enabled: enabled.clone(), 96 + stats: stats.clone(), 97 + }; 98 + if let Err(e) = producer.run().await { 99 + error!(err = ?e, "by-collection crawler fatal error, restarting in 30s"); 100 + 
tokio::time::sleep(Duration::from_secs(30)).await; 101 + } 102 + } 103 + } 104 + .instrument(span), 105 + ) 106 + .abort_handle() 107 + } 108 + }; 109 + ProducerHandle { 110 + mode: source.mode, 111 + abort, 112 + } 113 + } 114 + 115 + /// runtime control over the crawler component. 116 + /// 117 + /// the crawler walks `com.atproto.sync.listRepos` on each configured relay to discover 118 + /// repositories that have never emitted a firehose event. in `filter` mode it also 119 + /// checks each discovered repo against the configured signal collections before 120 + /// enqueuing it for backfill. 121 + /// 122 + /// disabling the crawler does not affect in-progress repo checks. each one completes 123 + /// its current PDS request before pausing. 124 + #[derive(Clone)] 125 + pub struct CrawlerHandle { 126 + pub(super) state: Arc<AppState>, 127 + /// set once by [`Hydrant::run`]; `None` means run() has not been called yet. 128 + pub(super) shared: Arc<std::sync::OnceLock<CrawlerShared>>, 129 + /// per-source running tasks, keyed by url. 130 + pub(super) tasks: Arc<scc::HashMap<Url, ProducerHandle>>, 131 + /// set of urls persisted in the database (dynamically added sources). 132 + pub(super) persisted: Arc<scc::HashSet<Url>>, 133 + } 134 + 135 + impl CrawlerHandle { 136 + /// enable the crawler (enables all configured producers). no-op if already enabled. 137 + pub fn enable(&self) { 138 + self.state.crawler_enabled.send_replace(true); 139 + } 140 + /// disable the crawler (disables all configured producers). 141 + /// in-progress repo checks finish before the crawler pauses. 142 + pub fn disable(&self) { 143 + self.state.crawler_enabled.send_replace(false); 144 + } 145 + /// returns the current enabled state of the crawler. 146 + pub fn is_enabled(&self) -> bool { 147 + *self.state.crawler_enabled.borrow() 148 + } 149 + 150 + /// delete all cursor entries associated with the given URL. 
151 + pub async fn reset_cursor(&self, url: &str) -> Result<()> { 152 + let db = self.state.db.clone(); 153 + let point_keys = [keys::crawler_cursor_key(url)]; 154 + let by_collection_prefix = keys::by_collection_cursor_prefix(url); 155 + tokio::task::spawn_blocking(move || { 156 + let mut batch = db.inner.batch(); 157 + for k in point_keys { 158 + batch.remove(&db.cursors, k); 159 + } 160 + for entry in db.cursors.prefix(&by_collection_prefix) { 161 + let k = entry.key().into_diagnostic()?; 162 + batch.remove(&db.cursors, k); 163 + } 164 + batch.commit().into_diagnostic() 165 + }) 166 + .await 167 + .into_diagnostic()??; 168 + Ok(()) 169 + } 170 + 171 + /// return info on all currently active crawler sources. 172 + /// 173 + /// returns an empty list if called before [`Hydrant::run`]. 174 + pub async fn list_sources(&self) -> Vec<CrawlerSourceInfo> { 175 + let mut sources = Vec::new(); 176 + self.tasks 177 + .iter_async(|url, h| { 178 + sources.push(CrawlerSourceInfo { 179 + url: url.clone(), 180 + mode: h.mode, 181 + persisted: self.persisted.contains_sync(url), 182 + }); 183 + true 184 + }) 185 + .await; 186 + sources 187 + } 188 + 189 + /// add a new crawler source at runtime. 190 + /// 191 + /// the source is persisted to the database and will be re-spawned on restart. 192 + /// if a source with the same URL already exists, it is replaced (the old task is 193 + /// aborted and a new one is started with the new mode). 194 + /// 195 + /// returns an error if called before [`Hydrant::run`]. 
196 + pub async fn add_source(&self, source: crate::config::CrawlerSource) -> Result<()> { 197 + let Some(shared) = self.shared.get() else { 198 + miette::bail!("crawler not yet started: call Hydrant::run() first"); 199 + }; 200 + 201 + let db = self.state.db.clone(); 202 + let key = keys::crawler_source_key(source.url.as_str()); 203 + let val = rmp_serde::to_vec(&source.mode).into_diagnostic()?; 204 + tokio::task::spawn_blocking(move || db.crawler.insert(key, val).into_diagnostic()) 205 + .await 206 + .into_diagnostic()??; 207 + 208 + let enabled_rx = self.state.crawler_enabled.subscribe(); 209 + let handle = spawn_crawler_producer( 210 + &source, 211 + &shared.http, 212 + &self.state, 213 + &shared.checker, 214 + &shared.in_flight, 215 + &shared.tx, 216 + &shared.stats, 217 + enabled_rx, 218 + ); 219 + 220 + let _ = self.persisted.insert_async(source.url.clone()).await; 221 + match self.tasks.entry_async(source.url).await { 222 + scc::hash_map::Entry::Vacant(e) => { 223 + e.insert_entry(handle); 224 + } 225 + scc::hash_map::Entry::Occupied(mut e) => { 226 + *e.get_mut() = handle; 227 + } 228 + } 229 + Ok(()) 230 + } 231 + 232 + /// remove a crawler source at runtime by URL. 233 + /// 234 + /// aborts the running producer task and removes the source from the database if it 235 + /// was dynamically added. config-sourced entries are aborted but not persisted, so 236 + /// they will reappear on restart. 237 + /// 238 + /// returns `true` if a source with the given URL was found and removed. 239 + /// returns an error if called before [`Hydrant::run`]. 
240 + pub async fn remove_source(&self, url: &Url) -> Result<bool> { 241 + if self.shared.get().is_none() { 242 + miette::bail!("crawler not yet started: call Hydrant::run() first"); 243 + } 244 + 245 + // dropping the ProducerHandle aborts the task via Drop 246 + if self.tasks.remove_async(url).await.is_none() { 247 + return Ok(false); 248 + } 249 + 250 + // remove from DB if it was a persisted source 251 + if self.persisted.remove_async(url).await.is_some() { 252 + let db = self.state.db.clone(); 253 + let key = keys::crawler_source_key(url.as_str()); 254 + tokio::task::spawn_blocking(move || db.crawler.remove(key).into_diagnostic()) 255 + .await 256 + .into_diagnostic()??; 257 + } 258 + 259 + Ok(true) 260 + } 261 + }
+312
src/control/filter.rs
··· 1 + use std::sync::Arc; 2 + 3 + use miette::{IntoDiagnostic, Result}; 4 + 5 + use crate::db::filter as db_filter; 6 + use crate::filter::{FilterMode, SetUpdate}; 7 + use crate::state::AppState; 8 + 9 + /// a point-in-time snapshot of the filter configuration. returned by all [`FilterControl`] methods. 10 + /// 11 + /// because the filter is stored in the database and loaded on demand, this snapshot 12 + /// may be stale if another caller modifies the filter concurrently. for the authoritative 13 + /// live config use [`FilterControl::get`]. 14 + #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] 15 + pub struct FilterSnapshot { 16 + pub mode: FilterMode, 17 + pub signals: Vec<String>, 18 + pub collections: Vec<String>, 19 + pub excludes: Vec<String>, 20 + } 21 + 22 + /// runtime control over the indexing filter. 23 + /// 24 + /// the filter has two orthogonal axes: 25 + /// 26 + /// **mode** controls discovery: 27 + /// - [`FilterMode::Filter`]: only indexes repos whose firehose commits touch a collection 28 + /// matching a configured `signal`. explicit [`ReposControl::track`] always works regardless. 29 + /// - [`FilterMode::Full`]: indexes the entire network. `signals` are ignored for discovery 30 + /// but `collections` and `excludes` still apply. 31 + /// 32 + /// **sets** are each independently configurable: 33 + /// - `signals`: NSID patterns that trigger auto-discovery in `filter` mode (e.g. `app.bsky.feed.post`, `app.bsky.graph.*`) 34 + /// - `collections`: NSID patterns that filter which records are *stored*. empty means store all. 35 + /// - `excludes`: DIDs that are always skipped regardless of mode. 36 + /// 37 + /// NSID patterns support an optional `.*` suffix to match an entire namespace. 38 + /// all mutations are persisted to the database and take effect immediately. 
39 + #[derive(Clone)] 40 + pub struct FilterControl(pub(super) Arc<AppState>); 41 + 42 + impl FilterControl { 43 + /// return the current filter configuration from the database. 44 + pub async fn get(&self) -> Result<FilterSnapshot> { 45 + let filter_ks = self.0.db.filter.clone(); 46 + tokio::task::spawn_blocking(move || { 47 + let hot = db_filter::load(&filter_ks)?; 48 + let excludes = db_filter::read_set(&filter_ks, db_filter::EXCLUDE_PREFIX)?; 49 + Ok(FilterSnapshot { 50 + mode: hot.mode, 51 + signals: hot.signals.iter().map(|s| s.to_string()).collect(), 52 + collections: hot.collections.iter().map(|s| s.to_string()).collect(), 53 + excludes, 54 + }) 55 + }) 56 + .await 57 + .into_diagnostic()? 58 + } 59 + 60 + /// set the indexing mode. see [`FilterControl`] for mode semantics. 61 + pub fn set_mode(&self, mode: FilterMode) -> FilterPatch { 62 + FilterPatch::new(self).set_mode(mode) 63 + } 64 + 65 + /// replace the entire signals set. existing signals are removed. 66 + pub fn set_signals(&self, signals: impl IntoIterator<Item = impl Into<String>>) -> FilterPatch { 67 + FilterPatch::new(self).set_signals(signals) 68 + } 69 + 70 + /// add multiple signals without disturbing existing ones. 71 + pub fn append_signals( 72 + &self, 73 + signals: impl IntoIterator<Item = impl Into<String>>, 74 + ) -> FilterPatch { 75 + FilterPatch::new(self).append_signals(signals) 76 + } 77 + 78 + /// add a single signal. no-op if already present. 79 + pub fn add_signal(&self, signal: impl Into<String>) -> FilterPatch { 80 + FilterPatch::new(self).add_signal(signal) 81 + } 82 + 83 + /// remove a single signal. no-op if not present. 84 + pub fn remove_signal(&self, signal: impl Into<String>) -> FilterPatch { 85 + FilterPatch::new(self).remove_signal(signal) 86 + } 87 + 88 + /// replace the entire collections set. pass an empty iterator to store all collections. 
89 + pub fn set_collections( 90 + &self, 91 + collections: impl IntoIterator<Item = impl Into<String>>, 92 + ) -> FilterPatch { 93 + FilterPatch::new(self).set_collections(collections) 94 + } 95 + 96 + /// add multiple collections without disturbing existing ones. 97 + pub fn append_collections( 98 + &self, 99 + collections: impl IntoIterator<Item = impl Into<String>>, 100 + ) -> FilterPatch { 101 + FilterPatch::new(self).append_collections(collections) 102 + } 103 + 104 + /// add a single collection filter. no-op if already present. 105 + pub fn add_collection(&self, collection: impl Into<String>) -> FilterPatch { 106 + FilterPatch::new(self).add_collection(collection) 107 + } 108 + 109 + /// remove a single collection filter. no-op if not present. 110 + pub fn remove_collection(&self, collection: impl Into<String>) -> FilterPatch { 111 + FilterPatch::new(self).remove_collection(collection) 112 + } 113 + 114 + /// replace the entire excludes set. 115 + pub fn set_excludes( 116 + &self, 117 + excludes: impl IntoIterator<Item = impl Into<String>>, 118 + ) -> FilterPatch { 119 + FilterPatch::new(self).set_excludes(excludes) 120 + } 121 + 122 + /// add multiple DIDs to the excludes set without disturbing existing ones. 123 + pub fn append_excludes( 124 + &self, 125 + excludes: impl IntoIterator<Item = impl Into<String>>, 126 + ) -> FilterPatch { 127 + FilterPatch::new(self).append_excludes(excludes) 128 + } 129 + 130 + /// add a single DID to the excludes set. no-op if already excluded. 131 + pub fn add_exclude(&self, did: impl Into<String>) -> FilterPatch { 132 + FilterPatch::new(self).add_exclude(did) 133 + } 134 + 135 + /// remove a single DID from the excludes set. no-op if not present. 136 + pub fn remove_exclude(&self, did: impl Into<String>) -> FilterPatch { 137 + FilterPatch::new(self).remove_exclude(did) 138 + } 139 + } 140 + 141 + /// a staged set of filter mutations. all methods accumulate changes without touching 142 + /// the database. 
call [`FilterPatch::apply`] to commit the entire patch atomically. 143 + /// 144 + /// obtain an instance by calling any mutation method on [`FilterControl`], or via 145 + /// [`FilterPatch::new`] to start from a blank patch. 146 + pub struct FilterPatch { 147 + state: Arc<AppState>, 148 + /// if set, replaces the current indexing mode. 149 + pub mode: Option<FilterMode>, 150 + /// if set, replaces or patches the signals set. 151 + pub signals: Option<SetUpdate>, 152 + /// if set, replaces or patches the collections set. 153 + pub collections: Option<SetUpdate>, 154 + /// if set, replaces or patches the excludes set. 155 + pub excludes: Option<SetUpdate>, 156 + } 157 + 158 + impl FilterPatch { 159 + /// create a new blank patch associated with the given [`FilterControl`]. 160 + pub fn new(control: &FilterControl) -> Self { 161 + Self { 162 + state: control.0.clone(), 163 + mode: None, 164 + signals: None, 165 + collections: None, 166 + excludes: None, 167 + } 168 + } 169 + 170 + /// set the indexing mode. see [`FilterControl`] for mode semantics. 171 + pub fn set_mode(mut self, mode: FilterMode) -> Self { 172 + self.mode = Some(mode); 173 + self 174 + } 175 + 176 + /// replace the entire signals set. existing signals are removed. 177 + pub fn set_signals(mut self, signals: impl IntoIterator<Item = impl Into<String>>) -> Self { 178 + self.signals = Some(SetUpdate::Set( 179 + signals.into_iter().map(Into::into).collect(), 180 + )); 181 + self 182 + } 183 + 184 + /// add multiple signals without disturbing existing ones. 185 + pub fn append_signals(mut self, signals: impl IntoIterator<Item = impl Into<String>>) -> Self { 186 + self.signals = Some(SetUpdate::Patch( 187 + signals.into_iter().map(|s| (s.into(), true)).collect(), 188 + )); 189 + self 190 + } 191 + 192 + /// add a single signal. no-op if already present. 
193 + pub fn add_signal(mut self, signal: impl Into<String>) -> Self { 194 + self.signals = Some(SetUpdate::Patch([(signal.into(), true)].into())); 195 + self 196 + } 197 + 198 + /// remove a single signal. no-op if not present. 199 + pub fn remove_signal(mut self, signal: impl Into<String>) -> Self { 200 + self.signals = Some(SetUpdate::Patch([(signal.into(), false)].into())); 201 + self 202 + } 203 + 204 + /// replace the entire collections set. pass an empty iterator to store all collections. 205 + pub fn set_collections( 206 + mut self, 207 + collections: impl IntoIterator<Item = impl Into<String>>, 208 + ) -> Self { 209 + self.collections = Some(SetUpdate::Set( 210 + collections.into_iter().map(Into::into).collect(), 211 + )); 212 + self 213 + } 214 + 215 + /// add multiple collections without disturbing existing ones. 216 + pub fn append_collections( 217 + mut self, 218 + collections: impl IntoIterator<Item = impl Into<String>>, 219 + ) -> Self { 220 + self.collections = Some(SetUpdate::Patch( 221 + collections.into_iter().map(|c| (c.into(), true)).collect(), 222 + )); 223 + self 224 + } 225 + 226 + /// add a single collection filter. no-op if already present. 227 + pub fn add_collection(mut self, collection: impl Into<String>) -> Self { 228 + self.collections = Some(SetUpdate::Patch([(collection.into(), true)].into())); 229 + self 230 + } 231 + 232 + /// remove a single collection filter. no-op if not present. 233 + pub fn remove_collection(mut self, collection: impl Into<String>) -> Self { 234 + self.collections = Some(SetUpdate::Patch([(collection.into(), false)].into())); 235 + self 236 + } 237 + 238 + /// replace the entire excludes set. 239 + pub fn set_excludes(mut self, excludes: impl IntoIterator<Item = impl Into<String>>) -> Self { 240 + self.excludes = Some(SetUpdate::Set( 241 + excludes.into_iter().map(Into::into).collect(), 242 + )); 243 + self 244 + } 245 + 246 + /// add multiple DIDs to the excludes set without disturbing existing ones. 
247 + pub fn append_excludes( 248 + mut self, 249 + excludes: impl IntoIterator<Item = impl Into<String>>, 250 + ) -> Self { 251 + self.excludes = Some(SetUpdate::Patch( 252 + excludes.into_iter().map(|d| (d.into(), true)).collect(), 253 + )); 254 + self 255 + } 256 + 257 + /// add a single DID to the excludes set. no-op if already excluded. 258 + pub fn add_exclude(mut self, did: impl Into<String>) -> Self { 259 + self.excludes = Some(SetUpdate::Patch([(did.into(), true)].into())); 260 + self 261 + } 262 + 263 + /// remove a single DID from the excludes set. no-op if not present. 264 + pub fn remove_exclude(mut self, did: impl Into<String>) -> Self { 265 + self.excludes = Some(SetUpdate::Patch([(did.into(), false)].into())); 266 + self 267 + } 268 + 269 + /// commit the patch atomically to the database and update the in-memory filter. 270 + /// returns the updated [`FilterSnapshot`]. 271 + pub async fn apply(self) -> Result<FilterSnapshot> { 272 + let filter_ks = self.state.db.filter.clone(); 273 + let inner = self.state.db.inner.clone(); 274 + let filter_handle = self.state.filter.clone(); 275 + let mode = self.mode; 276 + let signals = self.signals; 277 + let collections = self.collections; 278 + let excludes = self.excludes; 279 + 280 + let new_filter = tokio::task::spawn_blocking(move || { 281 + let mut batch = inner.batch(); 282 + db_filter::apply_patch(&mut batch, &filter_ks, mode, signals, collections, excludes)?; 283 + batch.commit().into_diagnostic()?; 284 + db_filter::load(&filter_ks) 285 + }) 286 + .await 287 + .into_diagnostic()??; 288 + 289 + let exclude_list = { 290 + let filter_ks = self.state.db.filter.clone(); 291 + tokio::task::spawn_blocking(move || { 292 + db_filter::read_set(&filter_ks, db_filter::EXCLUDE_PREFIX) 293 + }) 294 + .await 295 + .into_diagnostic()?? 
296 + }; 297 + 298 + let snapshot = FilterSnapshot { 299 + mode: new_filter.mode, 300 + signals: new_filter.signals.iter().map(|s| s.to_string()).collect(), 301 + collections: new_filter 302 + .collections 303 + .iter() 304 + .map(|s| s.to_string()) 305 + .collect(), 306 + excludes: exclude_list, 307 + }; 308 + 309 + filter_handle.store(Arc::new(new_filter)); 310 + Ok(snapshot) 311 + } 312 + }
+196
src/control/firehose.rs
··· 1 + use std::sync::Arc; 2 + use std::sync::atomic::Ordering; 3 + 4 + use miette::{IntoDiagnostic, Result}; 5 + use tokio::sync::watch; 6 + use tracing::{error, info}; 7 + use url::Url; 8 + 9 + use crate::db::{self, keys}; 10 + use crate::ingest::{BufferTx, firehose::FirehoseIngestor}; 11 + use crate::state::AppState; 12 + 13 + pub(super) struct FirehoseIngestorHandle { 14 + abort: tokio::task::AbortHandle, 15 + } 16 + 17 + impl Drop for FirehoseIngestorHandle { 18 + fn drop(&mut self) { 19 + self.abort.abort(); 20 + } 21 + } 22 + 23 + pub(super) struct FirehoseShared { 24 + pub(super) buffer_tx: BufferTx, 25 + pub(super) verify_signatures: bool, 26 + } 27 + 28 + /// a snapshot of a single firehose relay's runtime state. 29 + #[derive(Debug, Clone, serde::Serialize)] 30 + pub struct FirehoseSourceInfo { 31 + pub url: Url, 32 + /// true if added via the API and persisted to the database; false for `RELAY_HOSTS` sources. 33 + pub persisted: bool, 34 + } 35 + 36 + pub(super) async fn spawn_firehose_ingestor( 37 + relay_url: &Url, 38 + state: &Arc<AppState>, 39 + shared: &FirehoseShared, 40 + enabled: watch::Receiver<bool>, 41 + ) -> Result<FirehoseIngestorHandle> { 42 + use std::sync::atomic::AtomicI64; 43 + 44 + let start = db::get_firehose_cursor(&state.db, relay_url).await?; 45 + // insert into relay_cursors if not already present; existing in-memory cursor takes precedence 46 + let _ = state 47 + .relay_cursors 48 + .insert_async(relay_url.clone(), AtomicI64::new(start.unwrap_or(0))) 49 + .await; 50 + 51 + info!(relay = %relay_url, cursor = ?start, "starting firehose ingestor"); 52 + 53 + let ingestor = FirehoseIngestor::new( 54 + state.clone(), 55 + shared.buffer_tx.clone(), 56 + relay_url.clone(), 57 + state.filter.clone(), 58 + enabled, 59 + shared.verify_signatures, 60 + ); 61 + 62 + let relay_for_log = relay_url.clone(); 63 + let abort = tokio::spawn(async move { 64 + if let Err(e) = ingestor.run().await { 65 + error!(relay = %relay_for_log, err = %e, 
"firehose ingestor exited with error"); 66 + } 67 + }) 68 + .abort_handle(); 69 + 70 + Ok(FirehoseIngestorHandle { abort }) 71 + } 72 + 73 + /// runtime control over the firehose ingestor component. 74 + #[derive(Clone)] 75 + pub struct FirehoseHandle { 76 + pub(super) state: Arc<AppState>, 77 + /// set once by [`Hydrant::run`]; `None` means run() has not been called yet. 78 + pub(super) shared: Arc<std::sync::OnceLock<FirehoseShared>>, 79 + /// per-relay running tasks, keyed by url. 80 + pub(super) tasks: Arc<scc::HashMap<Url, FirehoseIngestorHandle>>, 81 + /// set of urls persisted in the database (dynamically added sources). 82 + pub(super) persisted: Arc<scc::HashSet<Url>>, 83 + } 84 + 85 + impl FirehoseHandle { 86 + /// enable the firehose. no-op if already enabled. 87 + pub fn enable(&self) { 88 + self.state.firehose_enabled.send_replace(true); 89 + } 90 + /// disable the firehose. the current message finishes processing before the connection closes. 91 + pub fn disable(&self) { 92 + self.state.firehose_enabled.send_replace(false); 93 + } 94 + /// returns the current enabled state of the firehose. 95 + pub fn is_enabled(&self) -> bool { 96 + *self.state.firehose_enabled.borrow() 97 + } 98 + 99 + /// reset the stored cursor for the given relay URL. 100 + /// 101 + /// clears the `firehose_cursor|{url}` entry from the cursors keyspace and zeroes the 102 + /// in-memory cursor. the next connection will tail live events from the current head. 
103 + pub async fn reset_cursor(&self, url: &str) -> Result<()> { 104 + let db = self.state.db.clone(); 105 + let key = keys::firehose_cursor_key(url); 106 + tokio::task::spawn_blocking(move || db.cursors.remove(key).into_diagnostic()) 107 + .await 108 + .into_diagnostic()??; 109 + 110 + if let Ok(relay_url) = Url::parse(url) { 111 + self.state.relay_cursors.peek_with(&relay_url, |_, c| { 112 + c.store(0, Ordering::SeqCst); 113 + }); 114 + } 115 + Ok(()) 116 + } 117 + 118 + /// return info on all currently active firehose sources. 119 + pub async fn list_sources(&self) -> Vec<FirehoseSourceInfo> { 120 + let mut sources = Vec::new(); 121 + self.tasks 122 + .iter_async(|url, _| { 123 + sources.push(FirehoseSourceInfo { 124 + url: url.clone(), 125 + persisted: self.persisted.contains_sync(url), 126 + }); 127 + true 128 + }) 129 + .await; 130 + sources 131 + } 132 + 133 + /// add a new firehose relay at runtime. 134 + /// 135 + /// the URL is persisted to the database and will be re-spawned on restart. if a relay with 136 + /// the same URL already exists it is replaced: the running task is stopped and a new one 137 + /// is started. any cursor state for that URL is preserved. 138 + /// 139 + /// returns an error if called before [`Hydrant::run`]. 
140 + pub async fn add_source(&self, url: Url) -> Result<()> { 141 + let Some(shared) = self.shared.get() else { 142 + miette::bail!("firehose not yet started: call Hydrant::run() first"); 143 + }; 144 + 145 + let db = self.state.db.clone(); 146 + let key = keys::firehose_source_key(url.as_str()); 147 + tokio::task::spawn_blocking(move || db.crawler.insert(key, b"").into_diagnostic()) 148 + .await 149 + .into_diagnostic()??; 150 + 151 + let enabled_rx = self.state.firehose_enabled.subscribe(); 152 + let handle = spawn_firehose_ingestor(&url, &self.state, shared, enabled_rx).await?; 153 + 154 + let _ = self.persisted.insert_async(url.clone()).await; 155 + match self.tasks.entry_async(url).await { 156 + scc::hash_map::Entry::Vacant(e) => { 157 + e.insert_entry(handle); 158 + } 159 + scc::hash_map::Entry::Occupied(mut e) => { 160 + *e.get_mut() = handle; 161 + } 162 + } 163 + Ok(()) 164 + } 165 + 166 + /// remove a firehose relay at runtime by URL. 167 + /// 168 + /// aborts the running ingestor task. if the source was added via the API it is removed from 169 + /// the database and will not reappear on restart. `RELAY_HOSTS` sources are only stopped for 170 + /// the current session; they reappear on the next restart. 171 + /// 172 + /// returns `true` if the relay was found and removed, `false` if it was not running. 173 + /// returns an error if called before [`Hydrant::run`]. 
174 + pub async fn remove_source(&self, url: &Url) -> Result<bool> { 175 + if self.shared.get().is_none() { 176 + miette::bail!("firehose not yet started: call Hydrant::run() first"); 177 + } 178 + 179 + if self.tasks.remove_async(url).await.is_none() { 180 + return Ok(false); 181 + } 182 + 183 + // remove from relay_cursors (persist thread will stop tracking it) 184 + self.state.relay_cursors.remove_async(url).await; 185 + 186 + if self.persisted.remove_async(url).await.is_some() { 187 + let db = self.state.db.clone(); 188 + let key = keys::firehose_source_key(url.as_str()); 189 + tokio::task::spawn_blocking(move || db.crawler.remove(key).into_diagnostic()) 190 + .await 191 + .into_diagnostic()??; 192 + } 193 + 194 + Ok(true) 195 + } 196 + }
+753
src/control/mod.rs
··· 1 + mod crawler; 2 + mod filter; 3 + mod firehose; 4 + mod repos; 5 + mod stream; 6 + 7 + pub use crawler::{CrawlerHandle, CrawlerSourceInfo}; 8 + pub use filter::{FilterControl, FilterPatch, FilterSnapshot}; 9 + pub use firehose::{FirehoseHandle, FirehoseSourceInfo}; 10 + pub(crate) use repos::repo_state_to_info; 11 + pub use repos::{ListedRecord, Record, RecordList, RepoHandle, RepoInfo, ReposControl}; 12 + 13 + use std::collections::BTreeMap; 14 + use std::future::Future; 15 + use std::pin::Pin; 16 + use std::sync::Arc; 17 + use std::sync::atomic::{AtomicBool, Ordering}; 18 + use std::task::{Context, Poll}; 19 + 20 + use futures::{FutureExt, Stream}; 21 + use miette::{IntoDiagnostic, Result}; 22 + use tokio::sync::{mpsc, watch}; 23 + use tracing::{debug, error, info}; 24 + 25 + use crate::backfill::BackfillWorker; 26 + use crate::config::{Config, SignatureVerification}; 27 + use crate::db::{ 28 + self, filter as db_filter, load_persisted_crawler_sources, load_persisted_firehose_sources, 29 + }; 30 + use crate::filter::FilterMode; 31 + use crate::ingest::worker::FirehoseWorker; 32 + use crate::state::AppState; 33 + use crate::types::MarshallableEvt; 34 + 35 + use crawler::{CrawlerShared, spawn_crawler_producer}; 36 + use firehose::{FirehoseShared, spawn_firehose_ingestor}; 37 + use stream::event_stream_thread; 38 + 39 + /// an event emitted by the hydrant event stream. 40 + /// 41 + /// three variants are possible depending on the `type` field: 42 + /// - `"record"`: a repo record was created, updated, or deleted. carries a [`RecordEvt`]. 43 + /// - `"identity"`: a DID's handle or PDS changed. carries an [`IdentityEvt`]. ephemeral, not replayable. 44 + /// - `"account"`: a repo's active/inactive status changed. carries an [`AccountEvt`]. ephemeral, not replayable. 45 + /// 46 + /// the `id` field is a monotonically increasing sequence number usable as a cursor for [`Hydrant::subscribe`]. 
47 + pub type Event = MarshallableEvt<'static>; 48 + 49 + /// the top-level handle to a hydrant instance. 50 + /// 51 + /// `Hydrant` is cheaply cloneable. all sub-handles share the same underlying state. 52 + /// construct it via [`Hydrant::new`] or [`Hydrant::from_env`], configure the filter 53 + /// and repos as needed, then call [`Hydrant::run`] to start all background components. 54 + /// 55 + /// # example 56 + /// 57 + /// ```rust,no_run 58 + /// use hydrant::control::Hydrant; 59 + /// 60 + /// #[tokio::main] 61 + /// async fn main() -> miette::Result<()> { 62 + /// let hydrant = Hydrant::from_env().await?; 63 + /// 64 + /// tokio::select! { 65 + /// r = hydrant.run()? => r, 66 + /// r = hydrant.serve(3000) => r, 67 + /// } 68 + /// } 69 + /// ``` 70 + #[derive(Clone)] 71 + pub struct Hydrant { 72 + pub crawler: CrawlerHandle, 73 + pub firehose: FirehoseHandle, 74 + pub backfill: BackfillHandle, 75 + pub filter: FilterControl, 76 + pub repos: ReposControl, 77 + pub db: DbControl, 78 + #[cfg(feature = "backlinks")] 79 + pub backlinks: crate::backlinks::BacklinksControl, 80 + pub(crate) state: Arc<AppState>, 81 + config: Arc<Config>, 82 + started: Arc<AtomicBool>, 83 + _priv: (), 84 + } 85 + 86 + impl Hydrant { 87 + /// open the database and configure hydrant from `config`. 88 + /// 89 + /// this sets up the database, applies any filter configuration from `config`, and 90 + /// initializes all sub-handles. no background tasks are started yet: call 91 + /// [`run`](Self::run) to start all components and drive the instance. 92 + pub async fn new(config: Config) -> Result<Self> { 93 + info!("{config}"); 94 + 95 + // 1. open database and construct AppState 96 + let state = AppState::new(&config)?; 97 + 98 + // 2. 
apply any filter config from env variables 99 + if config.full_network 100 + || config.filter_signals.is_some() 101 + || config.filter_collections.is_some() 102 + || config.filter_excludes.is_some() 103 + { 104 + let filter_ks = state.db.filter.clone(); 105 + let inner = state.db.inner.clone(); 106 + let mode = config.full_network.then_some(FilterMode::Full); 107 + let signals = config 108 + .filter_signals 109 + .clone() 110 + .map(crate::filter::SetUpdate::Set); 111 + let collections = config 112 + .filter_collections 113 + .clone() 114 + .map(crate::filter::SetUpdate::Set); 115 + let excludes = config 116 + .filter_excludes 117 + .clone() 118 + .map(crate::filter::SetUpdate::Set); 119 + 120 + tokio::task::spawn_blocking(move || { 121 + let mut batch = inner.batch(); 122 + db_filter::apply_patch( 123 + &mut batch, 124 + &filter_ks, 125 + mode, 126 + signals, 127 + collections, 128 + excludes, 129 + )?; 130 + batch.commit().into_diagnostic() 131 + }) 132 + .await 133 + .into_diagnostic()??; 134 + 135 + // 3. reload the live filter into the hot-path arc-swap 136 + let new_filter = tokio::task::spawn_blocking({ 137 + let filter_ks = state.db.filter.clone(); 138 + move || db_filter::load(&filter_ks) 139 + }) 140 + .await 141 + .into_diagnostic()??; 142 + state.filter.store(Arc::new(new_filter)); 143 + } 144 + 145 + // 4. 
set crawler enabled state from config, evaluated against the post-patch filter 146 + let post_patch_crawler = match config.enable_crawler { 147 + Some(b) => b, 148 + None => { 149 + state.filter.load().mode == FilterMode::Full || !config.crawler_sources.is_empty() 150 + } 151 + }; 152 + state.crawler_enabled.send_replace(post_patch_crawler); 153 + 154 + let state = Arc::new(state); 155 + 156 + Ok(Self { 157 + crawler: CrawlerHandle { 158 + state: state.clone(), 159 + shared: Arc::new(std::sync::OnceLock::new()), 160 + tasks: Arc::new(scc::HashMap::new()), 161 + persisted: Arc::new(scc::HashSet::new()), 162 + }, 163 + firehose: FirehoseHandle { 164 + state: state.clone(), 165 + shared: Arc::new(std::sync::OnceLock::new()), 166 + tasks: Arc::new(scc::HashMap::new()), 167 + persisted: Arc::new(scc::HashSet::new()), 168 + }, 169 + backfill: BackfillHandle(state.clone()), 170 + filter: FilterControl(state.clone()), 171 + repos: ReposControl(state.clone()), 172 + db: DbControl(state.clone()), 173 + #[cfg(feature = "backlinks")] 174 + backlinks: crate::backlinks::BacklinksControl(state.clone()), 175 + state, 176 + config: Arc::new(config), 177 + started: Arc::new(AtomicBool::new(false)), 178 + _priv: (), 179 + }) 180 + } 181 + 182 + /// reads config from environment variables and calls [`Hydrant::new`]. 183 + pub async fn from_env() -> Result<Self> { 184 + Self::new(Config::from_env()?).await 185 + } 186 + 187 + /// start all background components and return a future that resolves when any 188 + /// fatal component exits. 189 + /// 190 + /// starts the backfill worker, firehose ingestors, crawler, and worker thread. 191 + /// resolves with `Ok(())` if a fatal component exits cleanly, or `Err(e)` if it 192 + /// fails. intended for use in `tokio::select!` alongside [`serve`](Self::serve). 193 + /// 194 + /// returns an error if called more than once on the same `Hydrant` instance. 
195 + pub fn run(&self) -> Result<impl Future<Output = Result<()>>> { 196 + let state = self.state.clone(); 197 + let config = self.config.clone(); 198 + let crawler = self.crawler.clone(); 199 + let firehose = self.firehose.clone(); 200 + 201 + if self.started.swap(true, Ordering::SeqCst) { 202 + miette::bail!("Hydrant::run() called more than once"); 203 + } 204 + 205 + let fut = async move { 206 + // internal buffered channel between ingestors / backfill and the firehose worker 207 + let (buffer_tx, buffer_rx) = mpsc::unbounded_channel(); 208 + 209 + // 5. spawn the backfill worker 210 + tokio::spawn({ 211 + let state = state.clone(); 212 + BackfillWorker::new( 213 + state.clone(), 214 + buffer_tx.clone(), 215 + config.repo_fetch_timeout, 216 + config.backfill_concurrency_limit, 217 + matches!( 218 + config.verify_signatures, 219 + SignatureVerification::Full | SignatureVerification::BackfillOnly 220 + ), 221 + config.ephemeral, 222 + state.backfill_enabled.subscribe(), 223 + ) 224 + .run() 225 + }); 226 + 227 + // 6. re-queue any repos that lost their backfill state, then start the retry worker 228 + if let Err(e) = tokio::task::spawn_blocking({ 229 + let state = state.clone(); 230 + move || crate::backfill::manager::queue_gone_backfills(&state) 231 + }) 232 + .await 233 + .into_diagnostic()? 234 + { 235 + error!(err = %e, "failed to queue gone backfills"); 236 + db::check_poisoned_report(&e); 237 + } 238 + 239 + std::thread::spawn({ 240 + let state = state.clone(); 241 + move || crate::backfill::manager::retry_worker(state) 242 + }); 243 + 244 + // 7. ephemeral GC thread 245 + if config.ephemeral { 246 + let state = state.clone(); 247 + std::thread::Builder::new() 248 + .name("ephemeral-gc".into()) 249 + .spawn(move || crate::db::ephemeral::ephemeral_ttl_worker(state)) 250 + .into_diagnostic()?; 251 + } 252 + 253 + // 8. 
cursor / counts persist thread 254 + std::thread::spawn({ 255 + let state = state.clone(); 256 + let persist_interval = config.cursor_save_interval; 257 + move || loop { 258 + std::thread::sleep(persist_interval); 259 + 260 + state.relay_cursors.iter_sync(|relay, cursor| { 261 + let seq = cursor.load(Ordering::SeqCst); 262 + if seq > 0 { 263 + if let Err(e) = db::set_firehose_cursor(&state.db, relay, seq) { 264 + error!(relay = %relay, err = %e, "failed to save cursor"); 265 + db::check_poisoned_report(&e); 266 + } 267 + } 268 + true 269 + }); 270 + 271 + if let Err(e) = db::persist_counts(&state.db) { 272 + error!(err = %e, "failed to persist counts"); 273 + db::check_poisoned_report(&e); 274 + } 275 + 276 + if let Err(e) = state.db.persist() { 277 + error!(err = %e, "db persist failed"); 278 + db::check_poisoned_report(&e); 279 + } 280 + } 281 + }); 282 + 283 + // 9. events/sec stats ticker 284 + tokio::spawn({ 285 + let state = state.clone(); 286 + let mut last_id = state.db.next_event_id.load(Ordering::Relaxed); 287 + let mut last_time = std::time::Instant::now(); 288 + let mut interval = tokio::time::interval(std::time::Duration::from_secs(60)); 289 + async move { 290 + loop { 291 + interval.tick().await; 292 + 293 + let current_id = state.db.next_event_id.load(Ordering::Relaxed); 294 + let current_time = std::time::Instant::now(); 295 + let delta = current_id.saturating_sub(last_id); 296 + 297 + if delta == 0 { 298 + debug!("no new events in 60s"); 299 + continue; 300 + } 301 + 302 + let elapsed = current_time.duration_since(last_time).as_secs_f64(); 303 + let rate = if elapsed > 0.0 { 304 + delta as f64 / elapsed 305 + } else { 306 + 0.0 307 + }; 308 + info!("{rate:.2} events/s ({delta} events in {elapsed:.1}s)"); 309 + 310 + last_id = current_id; 311 + last_time = current_time; 312 + } 313 + } 314 + }); 315 + 316 + let (fatal_tx_inner, mut fatal_rx) = watch::channel(None); 317 + let fatal_tx = Arc::new(fatal_tx_inner); 318 + 319 + info!( 320 + 
crawler_enabled = *state.crawler_enabled.borrow(), 321 + firehose_enabled = *state.firehose_enabled.borrow(), 322 + filter_mode = ?state.filter.load().mode, 323 + "starting ingestion" 324 + ); 325 + 326 + // 10. set shared and spawn firehose ingestors 327 + firehose 328 + .shared 329 + .set(FirehoseShared { 330 + buffer_tx: buffer_tx.clone(), 331 + verify_signatures: matches!( 332 + config.verify_signatures, 333 + SignatureVerification::Full 334 + ), 335 + }) 336 + .ok() 337 + .expect("firehose shared already set"); 338 + let fire_shared = firehose.shared.get().unwrap(); 339 + 340 + let relay_hosts = config.relays.clone(); 341 + if !relay_hosts.is_empty() { 342 + info!( 343 + relay_count = relay_hosts.len(), 344 + hosts = relay_hosts 345 + .iter() 346 + .map(|h| h.as_str()) 347 + .collect::<Vec<_>>() 348 + .join(", "), 349 + "starting firehose ingestor(s)" 350 + ); 351 + for relay_url in &relay_hosts { 352 + let enabled_rx = state.firehose_enabled.subscribe(); 353 + let handle = 354 + spawn_firehose_ingestor(relay_url, &state, fire_shared, enabled_rx).await?; 355 + let _ = firehose.tasks.insert_async(relay_url.clone(), handle).await; 356 + } 357 + } 358 + 359 + let persisted_relay_urls = tokio::task::spawn_blocking({ 360 + let state = state.clone(); 361 + move || load_persisted_firehose_sources(&state.db) 362 + }) 363 + .await 364 + .into_diagnostic()??; 365 + 366 + for relay_url in &persisted_relay_urls { 367 + let _ = firehose.persisted.insert_async(relay_url.clone()).await; 368 + if firehose.tasks.contains_async(relay_url).await { 369 + continue; 370 + } 371 + let enabled_rx = state.firehose_enabled.subscribe(); 372 + let handle = 373 + spawn_firehose_ingestor(relay_url, &state, fire_shared, enabled_rx).await?; 374 + let _ = firehose.tasks.insert_async(relay_url.clone(), handle).await; 375 + } 376 + 377 + // 11. 
spawn crawler infrastructure (always, to support dynamic source management) 378 + { 379 + use crate::crawler::throttle::Throttler; 380 + use crate::crawler::{ 381 + CrawlerStats, CrawlerWorker, InFlight, RetryProducer, SignalChecker, 382 + }; 383 + 384 + let http = reqwest::Client::builder() 385 + .user_agent(concat!( 386 + env!("CARGO_PKG_NAME"), 387 + "/", 388 + env!("CARGO_PKG_VERSION") 389 + )) 390 + .gzip(true) 391 + .build() 392 + .expect("that reqwest will build"); 393 + let pds_throttler = Throttler::new(); 394 + let in_flight = InFlight::new(); 395 + let stats = CrawlerStats::new( 396 + state.clone(), 397 + config 398 + .crawler_sources 399 + .iter() 400 + .map(|s| s.url.clone()) 401 + .collect(), 402 + pds_throttler.clone(), 403 + ); 404 + let checker = SignalChecker { 405 + http: http.clone(), 406 + state: state.clone(), 407 + throttler: pds_throttler, 408 + }; 409 + 410 + info!( 411 + max_pending = config.crawler_max_pending_repos, 412 + resume_pending = config.crawler_resume_pending_repos, 413 + enabled = *state.crawler_enabled.borrow(), 414 + "starting crawler worker" 415 + ); 416 + let (worker, tx) = CrawlerWorker::new( 417 + state.clone(), 418 + config.crawler_max_pending_repos, 419 + config.crawler_resume_pending_repos, 420 + stats.clone(), 421 + ); 422 + tokio::spawn(async move { 423 + worker.run().await; 424 + error!("crawler worker exited unexpectedly, aborting"); 425 + std::process::abort(); 426 + }); 427 + 428 + let ticker = tokio::spawn(stats.clone().task()); 429 + tokio::spawn(async move { 430 + match ticker.await { 431 + Err(e) => error!(err = ?e, "stats ticker panicked, aborting"), 432 + Ok(()) => error!("stats ticker exited unexpectedly, aborting"), 433 + } 434 + std::process::abort(); 435 + }); 436 + 437 + tokio::spawn( 438 + RetryProducer { 439 + checker: checker.clone(), 440 + in_flight: in_flight.clone(), 441 + tx: tx.clone(), 442 + } 443 + .run(), 444 + ); 445 + 446 + // set shared objects so CrawlerHandle methods can use them 447 + 
crawler 448 + .shared 449 + .set(CrawlerShared { 450 + http, 451 + checker, 452 + in_flight, 453 + tx, 454 + stats, 455 + }) 456 + .ok() 457 + .expect("crawler shared already set"); 458 + let shared = crawler.shared.get().unwrap(); 459 + 460 + // spawn initial sources from config 461 + for source in config.crawler_sources.iter() { 462 + let enabled_rx = state.crawler_enabled.subscribe(); 463 + let handle = spawn_crawler_producer( 464 + source, 465 + &shared.http, 466 + &state, 467 + &shared.checker, 468 + &shared.in_flight, 469 + &shared.tx, 470 + &shared.stats, 471 + enabled_rx, 472 + ); 473 + let _ = crawler.tasks.insert_async(source.url.clone(), handle).await; 474 + } 475 + 476 + let persisted_sources = tokio::task::spawn_blocking({ 477 + let state = state.clone(); 478 + move || load_persisted_crawler_sources(&state.db) 479 + }) 480 + .await 481 + .into_diagnostic()??; 482 + 483 + for source in &persisted_sources { 484 + let _ = crawler.persisted.insert_async(source.url.clone()).await; 485 + if crawler.tasks.contains_async(&source.url).await { 486 + continue; 487 + } 488 + let enabled_rx = state.crawler_enabled.subscribe(); 489 + let handle = spawn_crawler_producer( 490 + source, 491 + &shared.http, 492 + &state, 493 + &shared.checker, 494 + &shared.in_flight, 495 + &shared.tx, 496 + &shared.stats, 497 + enabled_rx, 498 + ); 499 + let _ = crawler.tasks.insert_async(source.url.clone(), handle).await; 500 + } 501 + } 502 + 503 + // 12. 
spawn the firehose worker on a blocking thread (fatal task) 504 + let handle = tokio::runtime::Handle::current(); 505 + let firehose_worker = std::thread::spawn({ 506 + let state = state.clone(); 507 + move || { 508 + FirehoseWorker::new( 509 + state, 510 + buffer_rx, 511 + matches!(config.verify_signatures, SignatureVerification::Full), 512 + config.ephemeral, 513 + config.firehose_workers, 514 + ) 515 + .run(handle) 516 + } 517 + }); 518 + 519 + { 520 + let tx = Arc::clone(&fatal_tx); 521 + tokio::spawn( 522 + tokio::task::spawn_blocking(move || { 523 + firehose_worker 524 + .join() 525 + .map_err(|e| miette::miette!("buffer processor died: {e:?}")) 526 + }) 527 + .map(move |r| { 528 + let result = r.into_diagnostic().flatten().flatten(); 529 + let _ = tx.send(Some(result.map_err(|e| e.to_string()))); 530 + }), 531 + ); 532 + } 533 + 534 + // drop the local fatal_tx so the watch channel is only kept alive by the 535 + // spawned tasks. when all fatal tasks exit (and drop their tx clones), 536 + // fatal_rx.changed() returns Err and we return Ok(()). 537 + drop(fatal_tx); 538 + 539 + loop { 540 + match fatal_rx.changed().await { 541 + Ok(()) => { 542 + if let Some(result) = fatal_rx.borrow().clone() { 543 + return result.map_err(|s| miette::miette!("{s}")); 544 + } 545 + } 546 + // all fatal_tx clones dropped: all tasks finished cleanly 547 + Err(_) => return Ok(()), 548 + } 549 + } 550 + }; 551 + Ok(fut) 552 + } 553 + 554 + /// subscribe to the ordered event stream. 555 + /// 556 + /// returns an [`EventStream`] that implements [`futures::Stream`]. 557 + /// 558 + /// - if `cursor` is `None`, streaming starts from the current head (live tail only). 559 + /// - if `cursor` is `Some(id)`, all persisted `record` events from that ID onward are 560 + /// replayed first, then live events follow seamlessly. 561 + /// 562 + /// `identity` and `account` events are ephemeral and are never replayed from a cursor - 563 + /// only live occurrences are delivered. 
use [`ReposControl::get`] to fetch current 564 + /// identity/account state for a specific DID. 565 + /// 566 + /// multiple concurrent subscribers each receive a full independent copy of the stream. 567 + /// the stream ends when the `EventStream` is dropped. 568 + pub fn subscribe(&self, cursor: Option<u64>) -> EventStream { 569 + let (tx, rx) = mpsc::channel(500); 570 + let state = self.state.clone(); 571 + let runtime = tokio::runtime::Handle::current(); 572 + 573 + std::thread::Builder::new() 574 + .name("hydrant-stream".into()) 575 + .spawn(move || { 576 + let _g = runtime.enter(); 577 + event_stream_thread(state, tx, cursor); 578 + }) 579 + .expect("failed to spawn stream thread"); 580 + 581 + EventStream(rx) 582 + } 583 + 584 + /// return database counts and on-disk sizes for all keyspaces. 585 + /// 586 + /// counts include: `repos`, `pending`, `resync`, `records`, `blocks`, `events`, 587 + /// `error_ratelimited`, `error_transport`, `error_generic`. 588 + /// 589 + /// sizes are in bytes, reported per keyspace. 
590 + pub async fn stats(&self) -> Result<StatsResponse> { 591 + let db = self.state.db.clone(); 592 + 593 + let mut counts: BTreeMap<&'static str, u64> = futures::future::join_all( 594 + [ 595 + "repos", 596 + "pending", 597 + "resync", 598 + "records", 599 + "blocks", 600 + "error_ratelimited", 601 + "error_transport", 602 + "error_generic", 603 + ] 604 + .into_iter() 605 + .map(|name| { 606 + let db = db.clone(); 607 + async move { (name, db.get_count(name).await) } 608 + }), 609 + ) 610 + .await 611 + .into_iter() 612 + .collect(); 613 + 614 + counts.insert("events", db.events.approximate_len() as u64); 615 + 616 + let sizes = tokio::task::spawn_blocking(move || { 617 + let mut s = BTreeMap::new(); 618 + s.insert("repos", db.repos.disk_space()); 619 + s.insert("records", db.records.disk_space()); 620 + s.insert("blocks", db.blocks.disk_space()); 621 + s.insert("cursors", db.cursors.disk_space()); 622 + s.insert("pending", db.pending.disk_space()); 623 + s.insert("resync", db.resync.disk_space()); 624 + s.insert("resync_buffer", db.resync_buffer.disk_space()); 625 + s.insert("events", db.events.disk_space()); 626 + s.insert("counts", db.counts.disk_space()); 627 + s.insert("filter", db.filter.disk_space()); 628 + s.insert("crawler", db.crawler.disk_space()); 629 + s 630 + }) 631 + .await 632 + .into_diagnostic()?; 633 + 634 + Ok(StatsResponse { counts, sizes }) 635 + } 636 + 637 + /// returns a future that runs the HTTP management API server on `0.0.0.0:{port}`. 638 + /// 639 + /// the server exposes all management endpoints (`/filter`, `/repos`, `/ingestion`, 640 + /// `/stream`, `/stats`, `/db/*`, `/xrpc/*`). it runs indefinitely and resolves 641 + /// only on error. 642 + /// 643 + /// intended for `tokio::spawn` or inclusion in a `select!` / task list. the clone 644 + /// of `self` is deferred until the future is first polled. 645 + /// 646 + /// to disable the HTTP API entirely, simply don't call this method. 
647 + pub fn serve(&self, port: u16) -> impl Future<Output = Result<()>> { 648 + let hydrant = self.clone(); 649 + async move { crate::api::serve(hydrant, port).await } 650 + } 651 + 652 + /// returns a future that runs the debug HTTP API server on `127.0.0.1:{port}`. 653 + /// 654 + /// exposes internal inspection endpoints (`/debug/get`, `/debug/iter`, etc.) 655 + /// that are not safe to expose publicly. binds only to loopback. 656 + pub fn serve_debug(&self, port: u16) -> impl Future<Output = Result<()>> { 657 + let state = self.state.clone(); 658 + async move { crate::api::serve_debug(state, port).await } 659 + } 660 + } 661 + 662 + impl axum::extract::FromRef<Hydrant> for Arc<AppState> { 663 + fn from_ref(h: &Hydrant) -> Self { 664 + h.state.clone() 665 + } 666 + } 667 + 668 + /// a stream of [`Event`]s. returned by [`Hydrant::subscribe`]. 669 + /// 670 + /// implements [`futures::Stream`] and can be used with `StreamExt::next`, 671 + /// `while let Some(evt) = stream.next().await`, `forward`, etc. 672 + /// the stream terminates when the underlying channel closes (i.e. hydrant shuts down). 673 + pub struct EventStream(mpsc::Receiver<Event>); 674 + 675 + impl Stream for EventStream { 676 + type Item = Event; 677 + 678 + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> { 679 + self.0.poll_recv(cx) 680 + } 681 + } 682 + 683 + /// database statistics returned by [`Hydrant::stats`]. 684 + #[derive(serde::Serialize)] 685 + pub struct StatsResponse { 686 + /// record counts per logical category (repos, records, events, error kinds, etc.) 687 + pub counts: BTreeMap<&'static str, u64>, 688 + /// on-disk size in bytes per keyspace 689 + pub sizes: BTreeMap<&'static str, u64>, 690 + } 691 + 692 + /// runtime control over the backfill worker component. 
693 + /// 694 + /// the backfill worker fetches full repo CAR files from each repo's PDS for any 695 + /// repository in the pending queue, parses the MST, and inserts all matching records 696 + /// into the database. concurrency is bounded by `HYDRANT_BACKFILL_CONCURRENCY_LIMIT`. 697 + #[derive(Clone)] 698 + pub struct BackfillHandle(Arc<AppState>); 699 + 700 + impl BackfillHandle { 701 + /// enable the backfill worker, no-op if already enabled. 702 + pub fn enable(&self) { 703 + self.0.backfill_enabled.send_replace(true); 704 + } 705 + /// disable the backfill worker, in-flight repos complete before pausing. 706 + pub fn disable(&self) { 707 + self.0.backfill_enabled.send_replace(false); 708 + } 709 + /// returns the current enabled state of the backfill worker. 710 + pub fn is_enabled(&self) -> bool { 711 + *self.0.backfill_enabled.borrow() 712 + } 713 + } 714 + 715 + /// control over database maintenance operations. 716 + /// 717 + /// all methods pause the crawler, firehose, and backfill worker for the duration 718 + /// of the operation and restore their prior state on completion, whether or not 719 + /// the operation succeeds. 720 + #[derive(Clone)] 721 + pub struct DbControl(Arc<AppState>); 722 + 723 + impl DbControl { 724 + /// trigger a major compaction of all keyspaces in parallel. 725 + /// 726 + /// compaction reclaims disk space from deleted/updated keys and improves 727 + /// read performance. can take several minutes on large datasets. 728 + pub async fn compact(&self) -> Result<()> { 729 + let state = self.0.clone(); 730 + state 731 + .with_ingestion_paused(async || state.db.compact().await) 732 + .await 733 + } 734 + 735 + /// train zstd compression dictionaries for the `repos`, `blocks`, and `events` keyspaces. 736 + /// 737 + /// dictionaries are written to `dict_{name}.bin` files next to the database. 738 + /// a restart is required to apply them. 
training samples data blocks from the 739 + /// existing database, so the database must have a reasonable amount of data first. 740 + pub async fn train_dicts(&self) -> Result<()> { 741 + let state = self.0.clone(); 742 + state 743 + .with_ingestion_paused(async || { 744 + let train = |name: &'static str| { 745 + let db = state.db.clone(); 746 + tokio::task::spawn_blocking(move || db.train_dict(name)) 747 + .map(|res| res.into_diagnostic().flatten()) 748 + }; 749 + tokio::try_join!(train("repos"), train("blocks"), train("events")).map(|_| ()) 750 + }) 751 + .await 752 + } 753 + }
+404
src/control/repos.rs
··· 1 + use std::sync::Arc; 2 + 3 + use chrono::{DateTime, Utc}; 4 + use jacquard_common::cowstr::ToCowStr; 5 + use jacquard_common::types::cid::{Cid, IpldCid}; 6 + use jacquard_common::types::ident::AtIdentifier; 7 + use jacquard_common::types::string::{Did, Handle, Rkey}; 8 + use jacquard_common::types::tid::Tid; 9 + use jacquard_common::{CowStr, Data, IntoStatic}; 10 + use miette::{IntoDiagnostic, Result}; 11 + use rand::Rng; 12 + use smol_str::ToSmolStr; 13 + use url::Url; 14 + 15 + use crate::db::types::DbRkey; 16 + use crate::db::{self, keys, ser_repo_state}; 17 + use crate::state::AppState; 18 + use crate::types::{GaugeState, RepoState, RepoStatus}; 19 + 20 + /// information about a tracked or known repository. returned by [`ReposControl`] methods. 21 + #[derive(Debug, Clone, serde::Serialize)] 22 + pub struct RepoInfo { 23 + /// the DID of the repository. 24 + pub did: Did<'static>, 25 + /// the status of the repository. 26 + #[serde(serialize_with = "crate::util::repo_status_serialize_str")] 27 + pub status: RepoStatus, 28 + /// whether this repository is tracked or not. 29 + /// untracked repositories are not updated and they stay frozen. 30 + pub tracked: bool, 31 + /// the revision of the root commit of this repository. 32 + #[serde(skip_serializing_if = "Option::is_none")] 33 + pub rev: Option<Tid>, 34 + /// the CID of the root commit of this repository. 35 + #[serde(serialize_with = "crate::util::opt_cid_serialize_str")] 36 + #[serde(skip_serializing_if = "Option::is_none")] 37 + pub data: Option<IpldCid>, 38 + /// the handle for the DID of this repository. 39 + #[serde(skip_serializing_if = "Option::is_none")] 40 + pub handle: Option<Handle<'static>>, 41 + /// the URL for the PDS in which this repository is hosted on. 42 + #[serde(skip_serializing_if = "Option::is_none")] 43 + pub pds: Option<Url>, 44 + /// ATProto signing key of this repository. 
45 + #[serde(skip_serializing_if = "Option::is_none")] 46 + pub signing_key: Option<String>, 47 + /// when this repository was last touched (status update, commit ingested, etc.). 48 + #[serde(skip_serializing_if = "Option::is_none")] 49 + pub last_updated_at: Option<DateTime<Utc>>, 50 + /// the time of the last message gotten from the firehose for this repository. 51 + /// this is equal to the `time` field. 52 + #[serde(skip_serializing_if = "Option::is_none")] 53 + pub last_message_at: Option<DateTime<Utc>>, 54 + } 55 + 56 + /// control over which repositories are tracked and access to their state. 57 + /// 58 + /// in `filter` mode, a repo is only indexed if it either matches a signal or is 59 + /// explicitly tracked via [`ReposControl::track`]. in `full` mode all repos are indexed 60 + /// and tracking is implicit. 61 + /// 62 + /// tracking a DID that hydrant has never seen enqueues an immediate backfill. 63 + /// tracking a DID that hydrant already knows about (but has marked untracked) 64 + /// re-enqueues it for backfill. 65 + #[derive(Clone)] 66 + pub struct ReposControl(pub(super) Arc<AppState>); 67 + 68 + impl ReposControl { 69 + /// gets a handle for a repository to allow acting upon it. 70 + pub fn get<'i>(&self, did: &Did<'i>) -> Result<RepoHandle<'i>> { 71 + Ok(RepoHandle { 72 + state: self.0.clone(), 73 + did: did.clone(), 74 + }) 75 + } 76 + 77 + /// same as [`ReposControl::get`] but allows you to pass in an identifier that can be 78 + /// either a handle or a DID. 79 + pub async fn resolve(&self, repo: &AtIdentifier<'_>) -> Result<RepoHandle<'static>> { 80 + let did = self.0.resolver.resolve_did(repo).await?; 81 + Ok(RepoHandle { 82 + state: self.0.clone(), 83 + did, 84 + }) 85 + } 86 + 87 + /// fetch the current state of a single repository. returns `None` if hydrant 88 + /// has never seen this DID. 
pub async fn info(&self, did: &Did<'_>) -> Result<Option<RepoInfo>> {
    let handle = self.get(did)?;
    handle.info().await
}

/// explicitly track one or more repositories, enqueuing them for backfill if needed.
///
/// - if a DID is new, a fresh [`RepoState`] is created and backfill is queued.
/// - if a DID is already known but untracked, it is marked tracked and re-enqueued.
/// - if a DID is already tracked, this is a no-op.
///
/// all database writes go through a single batch. counters and gauges are updated
/// after the batch commits, and the backfill worker is notified at the end.
pub async fn track(&self, dids: impl IntoIterator<Item = Did<'_>>) -> Result<()> {
    let dids: Vec<Did<'static>> = dids.into_iter().map(|d| d.into_static()).collect();
    let state = self.0.clone();

    let (new_count, transitions) = tokio::task::spawn_blocking(move || {
        let db = &state.db;
        let mut batch = db.inner.batch();
        let mut added = 0i64;
        let mut transitions: Vec<(GaugeState, GaugeState)> = Vec::new();
        let mut rng = rand::rng();

        for did in &dids {
            let did_key = keys::repo_key(did);
            let stored = db.repos.get(&did_key).into_diagnostic()?;
            let known = stored.as_deref().map(db::deser_repo_state).transpose()?;

            match known {
                // known but untracked: flip the flag and move it into the pending queue
                Some(mut repo_state) if !repo_state.tracked => {
                    let resync = db.resync.get(&did_key).into_diagnostic()?;
                    let old = db::Db::repo_gauge_state(&repo_state, resync.as_deref());
                    repo_state.tracked = true;
                    batch.insert(&db.repos, &did_key, ser_repo_state(&repo_state)?);
                    batch.insert(
                        &db.pending,
                        keys::pending_key(repo_state.index_id),
                        &did_key,
                    );
                    batch.remove(&db.resync, &did_key);
                    transitions.push((old, GaugeState::Pending));
                }
                // already tracked: nothing to do
                Some(_) => {}
                // never seen: create a backfilling state under a random index id
                None => {
                    let repo_state = RepoState::backfilling(rng.next_u64());
                    batch.insert(&db.repos, &did_key, ser_repo_state(&repo_state)?);
                    batch.insert(
                        &db.pending,
                        keys::pending_key(repo_state.index_id),
                        &did_key,
                    );
                    added += 1;
                    transitions.push((GaugeState::Synced, GaugeState::Pending));
                }
            }
        }

        batch.commit().into_diagnostic()?;
        Ok::<_, miette::Report>((added, transitions))
    })
    .await
    .into_diagnostic()??;

    if new_count > 0 {
        self.0.db.update_count_async("repos", new_count).await;
    }
    for (old, new) in transitions {
        self.0.db.update_gauge_diff_async(&old, &new).await;
    }
    self.0.notify_backfill();
    Ok(())
}

/// stop tracking one or more repositories. hydrant will stop processing new events
/// for them and remove them from the pending/resync queues, but existing indexed
/// records are **not** deleted.
///
/// DIDs that are unknown or already untracked are skipped.
pub async fn untrack(&self, dids: impl IntoIterator<Item = Did<'_>>) -> Result<()> {
    let dids: Vec<Did<'static>> = dids.into_iter().map(|d| d.into_static()).collect();
    let state = self.0.clone();

    let gauge_decrements = tokio::task::spawn_blocking(move || {
        let db = &state.db;
        let mut batch = db.inner.batch();
        let mut gauge_decrements = Vec::new();

        for did in &dids {
            let did_key = keys::repo_key(did);
            let stored = db.repos.get(&did_key).into_diagnostic()?;
            let known = stored.as_deref().map(db::deser_repo_state).transpose()?;

            let Some(repo_state) = known else {
                continue;
            };
            if !repo_state.tracked {
                continue;
            }

            // the previous gauge state is taken before the repo is modified
            let resync = db.resync.get(&did_key).into_diagnostic()?;
            let old = db::Db::repo_gauge_state(&repo_state, resync.as_deref());
            let mut repo_state = repo_state.into_static();
            repo_state.tracked = false;
            batch.insert(&db.repos, &did_key, ser_repo_state(&repo_state)?);
            batch.remove(&db.pending, keys::pending_key(repo_state.index_id));
            batch.remove(&db.resync, &did_key);
            // `Synced` is also the target state below, so it needs no adjustment
            if old != GaugeState::Synced {
                gauge_decrements.push(old);
            }
        }

        batch.commit().into_diagnostic()?;
        Ok::<_, miette::Report>(gauge_decrements)
    })
    .await
    .into_diagnostic()??;

    for gauge in gauge_decrements {
        self.0
            .db
            .update_gauge_diff_async(&gauge, &GaugeState::Synced)
            .await;
    }
    Ok(())
}
}

/// converts a stored [`RepoState`] into the public [`RepoInfo`] for `did`.
///
/// a PDS string that does not parse as a URL is dropped (`pds` becomes `None`).
pub(crate) fn repo_state_to_info(did: Did<'static>, s: RepoState<'_>) -> RepoInfo {
    let rev = s.rev.map(|r| r.to_tid());
    let handle = s.handle.map(|h| h.into_static());
    let pds = s.pds.and_then(|p| p.parse().ok());
    let signing_key = s.signing_key.map(|k| k.encode());
    let last_updated_at = DateTime::from_timestamp_secs(s.last_updated_at);
    let last_message_at = s.last_message_time.and_then(DateTime::from_timestamp_secs);

    RepoInfo {
        did,
        status: s.status,
        tracked: s.tracked,
        rev,
        data: s.data,
        handle,
        pds,
        signing_key,
        last_updated_at,
        last_message_at,
    }
}

/// a single record, as returned by [`RepoHandle::get_record`].
pub struct Record {
    /// the DID of the repository the record was read from.
    pub did: Did<'static>,
    /// the CID of the record block, in string form.
    pub cid: Cid<'static>,
    /// the decoded record value.
    pub value: Data<'static>,
}

/// one entry of a [`RecordList`].
pub struct ListedRecord {
    pub rkey: Rkey<'static>,
    pub cid: Cid<'static>,
    pub value: Data<'static>,
}

/// a page of records together with an optional cursor for the next page.
pub struct RecordList {
    pub records: Vec<ListedRecord>,
    pub cursor: Option<Rkey<'static>>,
}

/// handle to access data related to this repository.
#[derive(Clone)]
pub struct RepoHandle<'i> {
    state: Arc<AppState>,
    pub did: Did<'i>,
}

impl<'i> RepoHandle<'i> {
    /// fetch the stored state of this repository, or `None` if there is no entry for
    /// its DID. the lookup runs on the blocking pool.
    pub async fn info(&self) -> Result<Option<RepoInfo>> {
        let state = self.state.clone();
        let did = self.did.clone().into_static();
        let did_key = keys::repo_key(&self.did);

        tokio::task::spawn_blocking(move || {
            let stored = state.db.repos.get(&did_key).into_diagnostic()?;
            let repo_state = stored.as_deref().map(db::deser_repo_state).transpose()?;
            Ok(repo_state.map(|s| repo_state_to_info(did, s)))
        })
        .await
        .into_diagnostic()?
    }

    /// fetch one record by collection and record key.
    ///
    /// returns `Ok(None)` when no record is stored under that key. a record whose
    /// block is missing is reported as an error.
    pub async fn get_record(&self, collection: &str, rkey: &str) -> Result<Option<Record>> {
        let did = self.did.clone().into_static();
        let db_key = keys::record_key(&did, collection, &DbRkey::new(rkey));

        let state = self.state.clone();
        let collection = collection.to_smolstr();
        tokio::task::spawn_blocking(move || {
            use miette::WrapErr;

            // the records keyspace maps the record key to the raw cid bytes
            let Some(cid_bytes) = state.db.records.get(db_key).into_diagnostic()? else {
                return Ok(None);
            };

            // lookup block using col|cid key
            let block_key = keys::block_key(&collection, &cid_bytes);
            let block_bytes = match state.db.blocks.get(block_key).into_diagnostic()? {
                Some(bytes) => bytes,
                None => miette::bail!("block {cid_bytes:?} not found, this is a bug!!"),
            };

            let value = serde_ipld_dagcbor::from_slice::<Data>(&block_bytes)
                .into_diagnostic()
                .wrap_err("cant parse block")?
                .into_static();

            // parse the binary cid, then re-wrap it in owned string form
            let parsed = Cid::new(&cid_bytes)
                .into_diagnostic()
                .wrap_err("cant parse block cid")?;
            let cid = Cid::Str(parsed.to_cowstr().into_static());

            Ok(Some(Record { did, cid, value }))
        })
        .await
        .into_diagnostic()?
299 + } 300 + 301 + pub async fn list_records( 302 + &self, 303 + collection: &str, 304 + limit: usize, 305 + reverse: bool, 306 + cursor: Option<&str>, 307 + ) -> Result<RecordList> { 308 + let did = self.did.clone().into_static(); 309 + 310 + let state = self.state.clone(); 311 + let prefix = keys::record_prefix_collection(&did, collection); 312 + let collection = collection.to_smolstr(); 313 + let cursor = cursor.map(|c| c.to_smolstr()); 314 + 315 + tokio::task::spawn_blocking(move || { 316 + let mut results = Vec::new(); 317 + let mut next_cursor = None; 318 + 319 + let iter: Box<dyn Iterator<Item = _>> = if !reverse { 320 + let mut end_prefix = prefix.clone(); 321 + if let Some(last) = end_prefix.last_mut() { 322 + *last += 1; 323 + } 324 + 325 + let end_key = if let Some(cursor) = &cursor { 326 + let mut k = prefix.clone(); 327 + k.extend_from_slice(cursor.as_bytes()); 328 + k 329 + } else { 330 + end_prefix 331 + }; 332 + 333 + Box::new( 334 + state 335 + .db 336 + .records 337 + .range(prefix.as_slice()..end_key.as_slice()) 338 + .rev(), 339 + ) 340 + } else { 341 + let start_key = if let Some(cursor) = &cursor { 342 + let mut k = prefix.clone(); 343 + k.extend_from_slice(cursor.as_bytes()); 344 + k.push(0); 345 + k 346 + } else { 347 + prefix.clone() 348 + }; 349 + 350 + Box::new(state.db.records.range(start_key.as_slice()..)) 351 + }; 352 + 353 + for item in iter { 354 + let (key, cid_bytes) = item.into_inner().into_diagnostic()?; 355 + 356 + if !key.starts_with(prefix.as_slice()) { 357 + break; 358 + } 359 + 360 + let rkey = keys::parse_rkey(&key[prefix.len()..])?; 361 + if results.len() >= limit { 362 + next_cursor = Some(rkey); 363 + break; 364 + } 365 + 366 + // look up using col|cid key built from collection and binary cid bytes 367 + if let Ok(Some(block_bytes)) = state 368 + .db 369 + .blocks 370 + .get(&keys::block_key(collection.as_str(), &cid_bytes)) 371 + { 372 + let value: Data = 373 + 
serde_ipld_dagcbor::from_slice(&block_bytes).unwrap_or(Data::Null); 374 + let cid = Cid::new(&cid_bytes).into_diagnostic()?; 375 + let cid = Cid::Str(cid.to_cowstr().into_static()); 376 + results.push(ListedRecord { 377 + rkey: Rkey::new_cow(CowStr::Owned(rkey.to_smolstr())) 378 + .expect("that rkey is validated"), 379 + cid, 380 + value: value.into_static(), 381 + }); 382 + } 383 + } 384 + Result::<_, miette::Report>::Ok((results, next_cursor)) 385 + }) 386 + .await 387 + .into_diagnostic()? 388 + .map(|(records, next_cursor)| RecordList { 389 + records, 390 + cursor: next_cursor.map(|rkey| { 391 + Rkey::new_cow(CowStr::Owned(rkey.to_smolstr())).expect("that rkey is validated") 392 + }), 393 + }) 394 + } 395 + 396 + pub async fn count_records(&self, collection: &str) -> Result<u64> { 397 + let did = self.did.clone().into_static(); 398 + let state = self.state.clone(); 399 + let collection = collection.to_string(); 400 + tokio::task::spawn_blocking(move || db::get_record_count(&state.db, &did, &collection)) 401 + .await 402 + .into_diagnostic()? 403 + } 404 + }
+164
src/control/stream.rs
··· 1 + use std::sync::Arc; 2 + use std::sync::atomic::Ordering; 3 + 4 + use jacquard_common::types::cid::{ATP_CID_HASH, IpldCid}; 5 + use jacquard_common::types::nsid::Nsid; 6 + use jacquard_common::types::string::Rkey; 7 + use jacquard_common::{CowStr, IntoStatic, RawData}; 8 + use jacquard_repo::DAG_CBOR_CID_CODEC; 9 + use sha2::{Digest, Sha256}; 10 + use tokio::sync::mpsc; 11 + use tracing::error; 12 + 13 + use crate::db::{self, keys}; 14 + use crate::state::AppState; 15 + use crate::types::{BroadcastEvent, MarshallableEvt, RecordEvt, StoredData, StoredEvent}; 16 + 17 + use super::Event; 18 + 19 + pub(super) fn event_stream_thread( 20 + state: Arc<AppState>, 21 + tx: mpsc::Sender<Event>, 22 + cursor: Option<u64>, 23 + ) { 24 + let db = &state.db; 25 + let mut event_rx = db.event_tx.subscribe(); 26 + let ks = db.events.clone(); 27 + let mut current_id = match cursor { 28 + Some(c) => c.saturating_sub(1), 29 + None => db.next_event_id.load(Ordering::SeqCst).saturating_sub(1), 30 + }; 31 + 32 + loop { 33 + // catch up from db 34 + loop { 35 + let mut found = false; 36 + for item in ks.range(keys::event_key(current_id + 1)..) 
{ 37 + let (k, v) = match item.into_inner() { 38 + Ok(kv) => kv, 39 + Err(e) => { 40 + error!(err = %e, "failed to read event from db"); 41 + break; 42 + } 43 + }; 44 + 45 + let id = match k.as_ref().try_into().map(u64::from_be_bytes) { 46 + Ok(id) => id, 47 + Err(_) => { 48 + error!("failed to parse event id"); 49 + continue; 50 + } 51 + }; 52 + current_id = id; 53 + 54 + let stored: StoredEvent = match rmp_serde::from_slice(&v) { 55 + Ok(e) => e, 56 + Err(e) => { 57 + error!(err = %e, "failed to deserialize stored event"); 58 + continue; 59 + } 60 + }; 61 + 62 + let Some(evt) = stored_to_event(&state, id, stored) else { 63 + continue; 64 + }; 65 + 66 + if tx.blocking_send(evt).is_err() { 67 + return; // receiver dropped 68 + } 69 + found = true; 70 + } 71 + if !found { 72 + break; 73 + } 74 + } 75 + 76 + // wait for live events 77 + match event_rx.blocking_recv() { 78 + Ok(BroadcastEvent::Persisted(_)) => {} // re-run catch-up 79 + Ok(BroadcastEvent::Ephemeral(evt)) => { 80 + if tx.blocking_send(*evt).is_err() { 81 + return; 82 + } 83 + } 84 + Err(tokio::sync::broadcast::error::RecvError::Lagged(_)) => {} 85 + Err(tokio::sync::broadcast::error::RecvError::Closed) => break, 86 + } 87 + } 88 + } 89 + 90 + fn stored_to_event(state: &AppState, id: u64, stored: StoredEvent<'_>) -> Option<Event> { 91 + let StoredEvent { 92 + live, 93 + did, 94 + rev, 95 + collection, 96 + rkey, 97 + action, 98 + data, 99 + } = stored; 100 + 101 + let record = match data { 102 + StoredData::Ptr(cid) => { 103 + let block = state 104 + .db 105 + .blocks 106 + .get(&keys::block_key(collection.as_str(), &cid.to_bytes())); 107 + match block { 108 + Ok(Some(bytes)) => match serde_ipld_dagcbor::from_slice::<RawData>(&bytes) { 109 + Ok(val) => Some((cid, serde_json::to_value(val).ok()?)), 110 + Err(e) => { 111 + error!(err = %e, "cant parse block"); 112 + return None; 113 + } 114 + }, 115 + Ok(None) => { 116 + error!("block not found, this is a bug"); 117 + return None; 118 + } 119 + Err(e) => 
{ 120 + error!(err = %e, "cant get block"); 121 + db::check_poisoned(&e); 122 + return None; 123 + } 124 + } 125 + } 126 + StoredData::Block(block) => { 127 + let digest = Sha256::digest(&block); 128 + let hash = 129 + cid::multihash::Multihash::wrap(ATP_CID_HASH, &digest).expect("valid sha256 hash"); 130 + let cid = IpldCid::new_v1(DAG_CBOR_CID_CODEC, hash); 131 + match serde_ipld_dagcbor::from_slice::<RawData>(&block) { 132 + Ok(val) => Some((cid, serde_json::to_value(val).ok()?)), 133 + Err(e) => { 134 + error!(err = %e, "cant parse block"); 135 + return None; 136 + } 137 + } 138 + } 139 + StoredData::Nothing => None, 140 + }; 141 + 142 + let (cid, record) = record 143 + .map(|(c, r)| (Some(c), Some(r))) 144 + .unwrap_or((None, None)); 145 + 146 + Some(MarshallableEvt { 147 + id, 148 + kind: crate::types::EventType::Record, 149 + record: Some(RecordEvt { 150 + live, 151 + did: did.to_did(), 152 + rev: rev.to_tid(), 153 + collection: Nsid::new_cow(collection.clone().into_static()) 154 + .expect("that collection is already validated"), 155 + rkey: Rkey::new_cow(CowStr::Owned(rkey.to_smolstr())) 156 + .expect("that rkey is already validated"), 157 + action: CowStr::Borrowed(action.as_str()), 158 + record, 159 + cid, 160 + }), 161 + identity: None, 162 + account: None, 163 + }) 164 + }