···6464events are de-duplicated using the `time` field. todo: decide what to do on
6565relay-side account takedowns or if relays set the `time` field.
66666767+#### direct PDS connections
6868+6969+a firehose source can also be a direct connection to a PDS rather than a relay.
7070+prefix the URL with `pds::` to mark it as such:
7171+7272+```
7373+HYDRANT_RELAY_HOSTS=wss://bsky.network,pds::wss://pds.example.com
7474+```
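7575+7676+hydrant enforces host authority only when a source is marked as a direct PDS
7777+(`is_pds: true`): a commit or sync is rejected unless the source's host matches the PDS
7878+host recorded for that repo (re-resolved from its DID document on a mismatch). relays (`is_pds: false`, the default) are exempt from this check,
7979+since they forward commits from many PDSes by design. this means that for relay sources you are trusting the relay on this.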
8080+6781### crawler sources
68826983<small>[<- back to toc](#table-of-contents)</small>
···104118| `DATABASE_PATH` | `./hydrant.db` | path to the database folder. |
105119| `RUST_LOG` | `info` | log filter directives (e.g., `debug`, `hydrant=trace`). [`tracing` env-filter syntax](https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.EnvFilter.html). |
106120| `RELAY_HOST` | `wss://relay.fire.hose.cam/` | URL of the relay (firehose only). |
107107-| `RELAY_HOSTS` | | comma-separated list of relay URLs (firehose only). if unset, falls back to `RELAY_HOST`. |
121121+| `RELAY_HOSTS` | | comma-separated list of firehose sources (firehose only). if unset, falls back to `RELAY_HOST`. prefix a URL with `pds::` to mark it as a direct PDS connection (e.g. `pds::wss://pds.example.com`). bare URLs are treated as relays. |
108122| `CRAWLER_URLS` | relay hosts in full-network mode, `https://lightrail.microcosm.blue` in filter mode | comma-separated list of `[mode::]url` crawler sources. mode is `relay` or `by_collection`; bare URLs use the default mode. set to empty string to disable crawling. |
109123| `PLC_URL` | `https://plc.wtf`, `https://plc.directory` if full network | base URL(s) of the PLC directory (comma-separated for multiple). |
110124| `EPHEMERAL` | `false` | if enabled, no records are stored. events are deleted after a certain duration (`EPHEMERAL_TTL`). |
···233247234248<small>[<- back to toc](#table-of-contents)</small>
235249236236-- `GET /firehose/sources`: list all currently active firehose relay sources.
237237- - returns a JSON array of `{ "url": string, "persisted": bool }`.
250250+- `GET /firehose/sources`: list all currently active firehose sources.
251251+ - returns a JSON array of `{ "url": string, "persisted": bool, "is_pds": bool }`.
238252 - `persisted: true` means the source was added via the API and is stored in the
239253 database, so it will survive a restart. `persisted: false` means the source
240254 came from `RELAY_HOSTS` and is not written to the database.
241241-- `POST /firehose/sources`: add a firehose relay at runtime.
242242- - body: `{ "url": string }`.
255255+ - `is_pds: true` means the source is a direct PDS connection with host authority enforcement enabled.
256256+- `POST /firehose/sources`: add a firehose source at runtime.
257257+ - body: `{ "url": string, "is_pds": bool }`. `is_pds` defaults to `false`.
243258 - the source is persisted to the database before the ingestor task is started.
244244- - if a relay with the same URL already exists, it is replaced: the running
259259+ - if a source with the same URL already exists, it is replaced: the running
245260 task is stopped and a new one is started. any existing cursor state for that
246261 URL is preserved.
247262 - returns `201 Created` on success.
···1414pub enum IngestMessage {
1515 Firehose {
1616 relay: Url,
1717+ /// true when `relay` is a direct PDS connection (not an aggregating relay).
1818+ /// enables host authority enforcement in the worker.
1919+ is_pds: bool,
1720 msg: SubscribeReposMessage<'static>,
1821 },
1922 BackfillFinished(Did<'static>),
+104-8
src/ingest/worker.rs
···5252 }
5353}
54545555+enum HostAuthorityOutcome {
5656+ /// stored pds matched the source host immediately.
5757+ Authorized,
5858+ /// pds migrated: doc now points to this host, but our stored state was stale. trigger backfill.
5959+ Migration,
6060+ /// host did not match even after doc resolution. reject the message.
6161+ WrongHost,
6262+}
6363+5564// gate returned by check_repo_state, tells the shard loop what to do with the message
5665enum ProcessGate<'s, 'c> {
5766 // did not exist in db, newly queued for backfill, drop
···249258 }
250259 }
251260 }
252252- IngestMessage::Firehose { relay, msg } => {
261261+ IngestMessage::Firehose { relay, is_pds, msg } => {
253262 let _span = tracing::info_span!("firehose", relay = %relay).entered();
263263+ // only enforce host authority when the source is a direct PDS connection
264264+ let source_host = is_pds.then(|| relay.host_str()).flatten();
254265 let (did, seq) = match &msg {
255266 SubscribeReposMessage::Commit(c) => (&c.repo, c.seq),
256267 SubscribeReposMessage::Identity(i) => (&i.did, i.seq),
···330341 }
331342 }
332343333333- match Self::process_message(&mut ctx, &msg, did, repo_state, pre_status)
334334- {
344344+ match Self::process_message(
345345+ &mut ctx,
346346+ &msg,
347347+ did,
348348+ repo_state,
349349+ pre_status,
350350+ source_host,
351351+ ) {
335352 Ok(RepoProcessResult::Ok(_)) => {}
336353 Ok(RepoProcessResult::Deleted) => {
337354 state.db.update_count("repos", -1);
···411428 did: &Did,
412429 repo_state: RepoState<'s>,
413430 pre_status: RepoStatus,
431431+ source_host: Option<&str>,
414432 ) -> Result<RepoProcessResult<'s, 'c>, IngestError> {
415433 match msg {
416434 SubscribeReposMessage::Commit(commit) => {
417435 trace!(did = %did, "processing commit");
418418- Self::handle_commit(ctx, did, repo_state, commit)
436436+ Self::handle_commit(ctx, did, repo_state, commit, source_host)
419437 }
420438 SubscribeReposMessage::Sync(sync) => {
421439 debug!(did = %did, "processing sync");
422422- Self::handle_sync(ctx, did, repo_state, sync)
440440+ Self::handle_sync(ctx, did, repo_state, sync, source_host)
423441 }
424442 SubscribeReposMessage::Identity(identity) => {
425443 debug!(did = %did, "processing identity");
···441459 did: &Did,
442460 mut repo_state: RepoState<'s>,
443461 commit: &'c Commit<'c>,
462462+ source_host: Option<&str>,
444463 ) -> Result<RepoProcessResult<'s, 'c>, IngestError> {
445464 repo_state.advance_message_time(commit.time.0.timestamp_millis());
446465447447- // TODO phase 2: host authority check (source_host not available in indexer mode)
466466+ if let Some(host) = source_host {
467467+ match Self::check_host_authority(ctx, did, &mut repo_state, host)? {
468468+ HostAuthorityOutcome::Authorized => {}
469469+ HostAuthorityOutcome::Migration => {
470470+ // pds migrated: our data may be stale, backfill from the new host
471471+ warn!(did = %did, source_host = host, "pds migration detected, triggering backfill");
472472+ let mut batch = ctx.state.db.inner.batch();
473473+ let _repo_state = ops::update_repo_status(
474474+ &mut batch,
475475+ &ctx.state.db,
476476+ did,
477477+ repo_state,
478478+ RepoStatus::Backfilling,
479479+ )?;
480480+ batch.commit().into_diagnostic()?;
481481+ ctx.state
482482+ .db
483483+ .update_gauge_diff(&GaugeState::Synced, &GaugeState::Pending);
484484+ ctx.state.notify_backfill();
485485+ return Ok(RepoProcessResult::NeedsBackfill(Some(commit)));
486486+ }
487487+ // todo: ideally ban pds
488488+ HostAuthorityOutcome::WrongHost => {
489489+ warn!(did = %did, source_host = host, pds = ?repo_state.pds, "commit rejected: wrong host");
490490+ return Ok(RepoProcessResult::Ok(repo_state));
491491+ }
492492+ }
493493+ }
448494449495 // validate the commit: stale rev, size limits, future rev, CAR parse, field
450496 // consistency, signature, and chain-break detection
···536582 did: &Did,
537583 mut repo_state: RepoState<'s>,
538584 sync: &'c Sync<'c>,
585585+ source_host: Option<&str>,
539586 ) -> Result<RepoProcessResult<'s, 'c>, IngestError> {
540587 repo_state.advance_message_time(sync.time.0.timestamp_millis());
541588542542- // TODO phase 2: host authority check
589589+ if let Some(host) = source_host {
590590+ match Self::check_host_authority(ctx, did, &mut repo_state, host)? {
591591+ HostAuthorityOutcome::Authorized | HostAuthorityOutcome::Migration => {
592592+ // migration is fine here — sync already triggers a backfill below
593593+ }
594594+ // todo: ideally ban pds
595595+ HostAuthorityOutcome::WrongHost => {
596596+ warn!(did = %did, source_host = host, pds = ?repo_state.pds, "sync rejected: wrong host");
597597+ return Ok(RepoProcessResult::Ok(repo_state));
598598+ }
599599+ }
600600+ }
543601544602 // validate: size limit, CAR parse, field consistency, signature
545603 let signing_key = Self::fetch_key(ctx, did)?;
···864922 let (key, value) = guard.into_inner().into_diagnostic()?;
865923 let commit: Commit = rmp_serde::from_slice(&value).into_diagnostic()?;
866924867867- let res = Self::handle_commit(ctx, did, repo_state, &commit);
925925+ // buffered commits have already been source-checked on arrival; skip host check
926926+ let res = Self::handle_commit(ctx, did, repo_state, &commit, None);
868927 let res = match res {
869928 Ok(r) => r,
870929 Err(e) => {
···891950 }
892951893952 Ok(RepoProcessResult::Ok(repo_state))
953953+ }
954954+955955+ /// check that `source_host` is the authoritative PDS for `did`.
956956+ ///
957957+ /// - `Authorized`: stored pds matched immediately (fast path).
958958+ /// - `Migration`: stored pds was wrong but doc resolved to this host; caller should backfill.
959959+ /// - `WrongHost`: host did not match even after doc resolution; caller should reject.
960960+ fn check_host_authority(
961961+ ctx: &mut WorkerContext,
962962+ did: &Did,
963963+ repo_state: &mut RepoState,
964964+ source_host: &str,
965965+ ) -> Result<HostAuthorityOutcome, IngestError> {
966966+ let pds_host = repo_state
967967+ .pds
968968+ .as_deref()
969969+ .and_then(|pds| url::Url::parse(pds).ok())
970970+ .and_then(|u| u.host_str().map(str::to_owned));
971971+972972+ if pds_host.as_deref() == Some(source_host) {
973973+ return Ok(HostAuthorityOutcome::Authorized);
974974+ }
975975+976976+ // unknown pds or host mismatch — resolve doc to verify or detect a migration
977977+ Self::refresh_doc(ctx, repo_state, did)?;
978978+979979+ let updated_host = repo_state
980980+ .pds
981981+ .as_deref()
982982+ .and_then(|pds| url::Url::parse(pds).ok())
983983+ .and_then(|u| u.host_str().map(str::to_owned));
984984+985985+ if updated_host.as_deref() == Some(source_host) {
986986+ Ok(HostAuthorityOutcome::Migration)
987987+ } else {
988988+ Ok(HostAuthorityOutcome::WrongHost)
989989+ }
894990 }
895991896992 // refreshes the handle, pds url and signing key of a did
+1-1
tests/common.nu
···5656}
57575858export def resolve-pds [did: string] {
5959- let doc = (http get $"https://plc.wtf/($did)" | from json)
5959+ let doc = (http get $"https://plc.gaze.systems/($did)" | from json)
6060 ($doc.service | where type == "AtprotoPersonalDataServer" | first).serviceEndpoint
6161}
6262