···106106107107108108very much still todo but i'm getting tired
109109-- [x] add a `--heavy` mode that always uses `getRepo` and never `describeRepo`
109109+- [x] config: add a `--heavy` mode that always uses `getRepo` and never `describeRepo`
110110+- [x] config: db mem limit `--fjall-cache-mb`
111111+- [x] config: per-host request rate self-throttling `--crawl-qps` (name from collectiondir)
110112- [ ] resync: estimate CAR size from `getRecord` mst height; `getRepo` if it's likely very small
111113- [ ] multi-relay subscriber
112114- [ ] special did:web behaviour to keep reusing a stale resolution on failure
···115117- [ ] if the upstream is a PDS (check with describeServer?) then only accept events for DIDs that have it as their PDS
116118- [ ] use `since` on getRepo for resync to get a smaller partial export in many cases (and then more-carefully do the actual resync)
117119- [ ] combine the throttled http client instance, the db, and the admin info into an appstate fineeeee
120120+- [ ] bad word filtering? (collectiondir has it)
121121+- [ ] check response headers and adjust self-throttling rate limits per-host if present
118122119123120124### special-casing
+1-1
src/examples/enqueue_resync.rs
···27272828fn main() -> Result<(), Box<dyn std::error::Error>> {
2929 let args = Args::parse();
3030- let db = storage::open(&args.db_path)?;
3030+ let db = storage::open(&args.db_path, 64)?;
3131 let now = SystemTime::now()
3232 .duration_since(UNIX_EPOCH)
3333 .unwrap()
+1-1
src/examples/list_repo_collections.rs
···2929#[tokio::main]
3030async fn main() -> Result<(), Box<dyn std::error::Error>> {
3131 let args = Args::parse();
3232- let client = lightrail::http::build_client();
3232+ let client = lightrail::http::build_client(std::num::NonZeroU32::new(10).unwrap());
3333 let base: jacquard_common::url::Url = args.base.parse()?;
3434 let did = Did::new_owned(args.did)?;
3535
+2-5
src/http.rs
···1818use jacquard_common::http_client::{HttpClient, HttpClientExt};
1919use jacquard_common::stream::{ByteStream, StreamError};
20202121-/// Default per-host request rate: 10 req/s.
2222-const DEFAULT_RATE_PER_SEC: u32 = 10;
2323-2421/// State shared across all clones of a [`ThrottledClient`].
2522struct Shared {
2623 /// Duration of one token at the configured rate (= 1s / rate).
···7875}
79768077/// Build the shared HTTP client used for all outbound ATProto requests.
8181-pub fn build_client() -> ThrottledClient {
8282- ThrottledClient::new(NonZeroU32::new(DEFAULT_RATE_PER_SEC).unwrap())
7878+pub fn build_client(rate_per_sec: NonZeroU32) -> ThrottledClient {
7979+ ThrottledClient::new(rate_per_sec)
8380}
84818582impl HttpClient for ThrottledClient {
+10-2
src/main.rs
···9090 #[arg(long, action, env = "LIGHTRAIL_HEAVY")]
9191 heavy: bool,
92929393+ /// Per-PDS HTTP rate limit for crawl/resync requests, in requests per second.
9494+ #[arg(long, env = "LIGHTRAIL_CRAWL_QPS", default_value_t = std::num::NonZeroU32::new(10).unwrap())]
9595+ crawl_qps: std::num::NonZeroU32,
9696+9797+ /// fjall block cache size in MiB.
9898+ #[arg(long, env = "LIGHTRAIL_FJALL_CACHE_MB", default_value_t = 256)]
9999+ fjall_cache_mb: u64,
100100+93101 /// Max concurrent per-PDS listRepos workers during deep crawl.
94102 #[arg(
95103 long,
···133141 install_metrics(addr)?;
134142 }
135143136136- let db = storage::open(&args.db_path)?;
137137- let client = lightrail::http::build_client();
144144+ let db = storage::open(&args.db_path, args.fjall_cache_mb)?;
145145+ let client = lightrail::http::build_client(args.crawl_qps);
138146 let token = CancellationToken::new();
139147140148 let mut tasks: JoinSet<Result<()>> = JoinSet::new();
+16-13
src/storage/mod.rs
···6565pub type DbRef = Arc<Db>;
66666767/// Open (or create) the fjall database at `path` and return a shared handle.
6868-pub fn open(path: &Path) -> StorageResult<DbRef> {
6969- open_inner(path, false)
6868+pub fn open(path: &Path, cache_mb: u64) -> StorageResult<DbRef> {
6969+ open_inner(path, DbConfig::ForReal { cache_mb })
7070+}
7171+7272+enum DbConfig {
7373+ /// temporary db for tests
7474+ #[allow(dead_code)]
7575+ Testing,
7676+ /// bumpable cache for prod
7777+ ForReal { cache_mb: u64 },
7078}
71797280/// Open a temporary database that deletes itself on drop. For tests only.
···7684 static COUNTER: AtomicU64 = AtomicU64::new(0);
7785 let n = COUNTER.fetch_add(1, Ordering::Relaxed);
7886 let path = std::env::temp_dir().join(format!("lightrail-test-{}-{}", std::process::id(), n));
7979- open_inner(&path, true)
8787+ open_inner(&path, DbConfig::Testing)
8088}
81898282-fn open_inner(path: &Path, testing: bool) -> StorageResult<DbRef> {
8383- let mut builder = fjall::Database::builder(path);
8484-8585- builder = if testing {
8686- // for testing, we leave the small default cache and open as temporary
8787- builder.temporary(true)
8888- } else {
8989- // otherwise (prod) we want some more cache moneyyeyyyey
9090- builder.cache_size(256 * 1_024 * 1_024)
9090+fn open_inner(path: &Path, config: DbConfig) -> StorageResult<DbRef> {
9191+ let builder = fjall::Database::builder(path);
9292+ let builder = match config {
9393+ DbConfig::Testing => builder.temporary(true),
9494+ DbConfig::ForReal { cache_mb } => builder.cache_size(cache_mb * 2_u64.pow(20)),
9195 };
9292-9396 let database = builder.open()?;
9497 let ks = database.keyspace("default", fjall::KeyspaceCreateOptions::default)?;
9598 let index_ks = database.keyspace("index", || {
+24-4
src/sync/firehose/event_dispatcher.rs
···453453 async fn commits_for_same_did_are_sequential() {
454454 let db = crate::storage::open_temporary().unwrap();
455455 let resolver = make_resolver();
456456- let mut d = CommitDispatcher::new(resolver, db, 4, crate::http::build_client());
456456+ let mut d = CommitDispatcher::new(
457457+ resolver,
458458+ db,
459459+ 4,
460460+ crate::http::build_client(std::num::NonZeroU32::new(10).unwrap()),
461461+ );
457462458463 let did: Did<'static> = Did::new_owned("did:plc:testsequential").unwrap();
459464 let c1 = {
···493498 async fn commits_for_different_dids_run_in_parallel() {
494499 let db = crate::storage::open_temporary().unwrap();
495500 let resolver = make_resolver();
496496- let mut d = CommitDispatcher::new(resolver, db, 4, crate::http::build_client());
501501+ let mut d = CommitDispatcher::new(
502502+ resolver,
503503+ db,
504504+ 4,
505505+ crate::http::build_client(std::num::NonZeroU32::new(10).unwrap()),
506506+ );
497507498508 let did_a: Did<'static> = Did::new_owned("did:plc:testa").unwrap();
499509 let did_b: Did<'static> = Did::new_owned("did:plc:testb").unwrap();
···510520 async fn watermark_advances_after_completion() {
511521 let db = crate::storage::open_temporary().unwrap();
512522 let resolver = make_resolver();
513513- let mut d = CommitDispatcher::new(resolver, db, 4, crate::http::build_client());
523523+ let mut d = CommitDispatcher::new(
524524+ resolver,
525525+ db,
526526+ 4,
527527+ crate::http::build_client(std::num::NonZeroU32::new(10).unwrap()),
528528+ );
514529515530 let did_a: Did<'static> = Did::new_owned("did:plc:testwma").unwrap();
516531 let did_b: Did<'static> = Did::new_owned("did:plc:testwmb").unwrap();
···531546 async fn stalled_seq_evicted_from_watermark() {
532547 let db = crate::storage::open_temporary().unwrap();
533548 let resolver = make_resolver();
534534- let mut d = CommitDispatcher::new(resolver, db, 4, crate::http::build_client());
549549+ let mut d = CommitDispatcher::new(
550550+ resolver,
551551+ db,
552552+ 4,
553553+ crate::http::build_client(std::num::NonZeroU32::new(10).unwrap()),
554554+ );
535555536556 // Manually inject an old entry into outstanding without spawning a worker.
537557 let stale_instant = Instant::now() - std::time::Duration::from_secs(STALL_EVICT_SECS + 1);