···1717 url::Host,
1818 {IntoStatic, xrpc::XrpcExt},
1919};
2020+use reqwest::Url;
2021use tokio::time::Duration;
2122use tokio_util::sync::CancellationToken;
2222-use tracing::{info, trace, warn};
2323+use tracing::{error, info, trace, warn};
23242425use crate::{
2526 error::Result,
···3940const RETRY_DELAY_SECS: u64 = 10;
4041/// Maximum consecutive transient failures before giving up on this host.
4142const MAX_PAGE_FAILURES: u32 = 3;
4343+/// Typical number of requests required to complete one repo resync; used to scale the per-host stagger interval.
4444+const REQUESTS_PER_RESYNC: u64 = 2;
42454346/// Walk the full `listRepos` feed for `host`, enqueuing newly discovered repos.
4447///
···8184 );
82858386 let mut total_queued: u64 = 0;
8484- // Per-host staggering: track last-scheduled timestamp (in seconds, with
8585- // fractional precision via a separate sub-second counter) so that items
8686- // for the same host are spread across time at 1/crawl_qps intervals.
8787+ // Per-host staggering: track last-scheduled timestamp so that items for the
8888+ // same host are spread across time at 1/crawl_qps intervals.
8789 // This prevents the timestamp-ordered queue from bunching all items for
8890 // a popular host together.
8989- let host_interval_ms: u64 = 1000 / crawl_qps.get() as u64;
9090- let mut host_schedule: HashMap<String, u64> = HashMap::new(); // host → last scheduled ts (seconds)
9191+ let host_interval_ms = 1000 / crawl_qps.get() as u64;
9292+ let mut host_schedule: HashMap<Arc<Url>, u64> = HashMap::new(); // PDS URL → last scheduled ts (milliseconds)
91939294 loop {
9395 if token.is_cancelled() {
···102104103105 let page_len = dids.len();
104106 let now = unix_now_ms();
105105- let host_str = host.to_string();
106107107108 // For untrusted hosts (deep crawl), filter DIDs to those whose
108109 // resolved PDS actually matches this host.
···114115115116 // Resolve each DID's actual PDS host for per-host stagger.
116117 // Cache hits are free; DIDs that fail to resolve are logged and skipped (not enqueued).
117117- let dids_with_hosts: Vec<(Did<'static>, String)> = {
118118+ //
119119+ // TODO: ...this is basically redundant with validate_dids now?
120120+ let dids_with_hosts: Vec<(Did<'static>, Arc<Url>)> = {
118121 let mut out = Vec::with_capacity(dids.len());
119122 for did in dids {
120123 let pds_host = match resolver.resolve(&did).await {
121121- Ok(resolved) => resolved.pds.host_str().unwrap_or(&host_str).to_string(),
122122- Err(_) => host_str.clone(),
124124+ Ok(resolved) => resolved.pds.clone(),
125125+ Err(e) => {
126126+ error!(did = %did, error = %e, "failed to resolve host for validated did; not enqueuing resync");
127127+ continue;
128128+ }
123129 };
124130 out.push((did, pds_host));
125131 }
···315321fn store_page(
316322 db: &DbRef,
317323 host: &Host,
318318- items: Vec<(Did<'static>, String)>,
324324+ items: Vec<(Did<'static>, Arc<Url>)>,
319325 progress_cursor: String,
320326 now: u64,
321327 interval_ms: u64,
322322- mut host_schedule: HashMap<String, u64>,
323323-) -> Result<(u64, HashMap<String, u64>)> {
328328+ mut host_schedule: HashMap<Arc<Url>, u64>,
329329+) -> Result<(u64, HashMap<Arc<Url>, u64>)> {
324330 let mut count: u64 = 0;
331331+ let meta_interval = interval_ms * REQUESTS_PER_RESYNC;
325332 for (did, pds) in items {
326333 let newly_inserted = storage::repo::ensure_repo(db, &did)?;
327334 if newly_inserted {
328335 let last = host_schedule.get(&pds).copied().unwrap_or(now);
329329- let ts = if last >= now { last + interval_ms } else { now };
336336+ let ts = if last >= now {
337337+ last + meta_interval
338338+ } else {
339339+ now
340340+ };
330341 host_schedule.insert(pds, ts);
331342 let item = ResyncItem {
332343 did,