upgrade to cpx41, harden backfill and dashboard

+1 -1

deploy/collectiondir-values.yaml

··· 40 40 memory: 128Mi 41 41 cpu: 50m 42 42 limits: 43 - memory: 1Gi 43 + memory: 2Gi 44 44 45 45 defaultPodOptions: 46 46 imagePullSecrets:

+31 -1

deploy/relay-dashboard.json

··· 223 223 }, 224 224 "targets": [ 225 225 { 226 - "expr": "jetstream_subscribers_connected", 226 + "expr": "max(jetstream_subscribers_connected)", 227 227 "legendFormat": "connected", 228 228 "refId": "A" 229 229 } ··· 397 397 { 398 398 "expr": "rate(container_fs_reads_bytes_total{namespace=\"relay\",pod=~\"collectiondir.*\",container=\"main\"}[5m])", 399 399 "legendFormat": "reads", 400 + "refId": "B" 401 + } 402 + ] 403 + }, 404 + { 405 + "title": "collectiondir memory", 406 + "type": "timeseries", 407 + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 42 }, 408 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 409 + "fieldConfig": { 410 + "defaults": { 411 + "unit": "bytes", 412 + "color": { "mode": "palette-classic" }, 413 + "custom": { 414 + "fillOpacity": 15, 415 + "lineWidth": 2, 416 + "spanNulls": false 417 + } 418 + }, 419 + "overrides": [] 420 + }, 421 + "targets": [ 422 + { 423 + "expr": "container_memory_working_set_bytes{namespace=\"relay\",pod=~\"collectiondir.*\",container=\"main\"}", 424 + "legendFormat": "working set", 425 + "refId": "A" 426 + }, 427 + { 428 + "expr": "kube_pod_container_resource_limits{namespace=\"relay\",pod=~\"collectiondir.*\",container=\"main\",resource=\"memory\"}", 429 + "legendFormat": "limit", 400 430 "refId": "B" 401 431 } 402 432 ]

+2 -1

deploy/relay-values.yaml

··· 13 13 RELAY_PERSIST_DIR: /data 14 14 RELAY_REPLAY_WINDOW: "2h" 15 15 LOG_LEVEL: "info" 16 + GOMEMLIMIT: "6GiB" 16 17 envFrom: 17 18 - secretRef: 18 19 name: relay-secret ··· 34 35 memory: 1Gi 35 36 cpu: 500m 36 37 limits: 37 - memory: 14Gi 38 + memory: 8Gi 38 39 39 40 defaultPodOptions: 40 41 # hostNetwork recommended for full-network relays (high packet volume).

+2 -2

docs/architecture.md

··· 2 2 3 3 ## infrastructure 4 4 5 - - **Hetzner Cloud CPX31** — 8 vCPU (AMD), 16 GB RAM, 160 GB NVMe, 20 TB bandwidth @ ~$15/mo 5 + - **Hetzner Cloud CPX41** — 16 vCPU (AMD), 32 GB RAM, 240 GB NVMe, 20 TB bandwidth @ ~$30/mo 6 6 - **k3s** — single-node kubernetes, installed via cloud-init 7 7 - **traefik** — ingress controller (ships with k3s) 8 8 - **cert-manager** — automatic TLS via Let's Encrypt ··· 13 13 14 14 the core service. [`ghcr.io/bluesky-social/indigo`](https://github.com/bluesky-social/indigo/pkgs/container/indigo), deployed via [bjw-s/app-template](https://github.com/bjw-s-labs/helm-charts) with `hostNetwork: true` for lower-overhead networking. connects to every PDS on the network and aggregates their writes into a single firehose stream (`com.atproto.sync.subscribeRepos`). backed by postgresql for state. 15 15 16 - the relay maintains an in-process identity cache (hashicorp LRU, 5M entries, 24h TTL) — every event requires a DID document lookup, and this cache keeps the relay from hammering PLC. memory usage climbs over the first day as the cache fills, then plateaus once eviction matches insertion. 16 + the relay maintains an in-process identity cache (hashicorp LRU, 5M entries, 24h TTL) — every event requires a DID document lookup, and this cache keeps the relay from hammering PLC. memory usage climbs over the first day as the cache fills, then plateaus once eviction matches insertion. `GOMEMLIMIT=6GiB` is set as a soft memory limit: as usage approaches it, the Go runtime garbage-collects more aggressively and returns freed memory to the OS, rather than growing until the 8Gi container limit OOM-kills the pod. 17 17 18 18 ### collectiondir 19 19

+3 -1

docs/backfill.md

··· 41 41 42 42 ## batch sizing 43 43 44 - each host in a batch crawls concurrently — they hit different PDS servers, so increasing batch size adds parallelism without increasing load on any individual server. the bottleneck is per-host sequential `describeRepo` calls, not the batch size. batch-size 10 is a reasonable default. 44 + **indie PDS hosts:** each host in a batch crawls concurrently and they're all independent servers, so batch-size 10 is fine. 45 + 46 + **bsky shards:** all the mushroom-named hosts share an IP-based rate limit. more than ~2 concurrent crawls from our IP trigger HTTP 429, and the crawl code has no retry logic — a single 429 response kills the entire crawl for that host. use `--batch-size 1` for bsky shards. this means crawling all 87 shards takes days, not hours. 45 47 46 48 ## monitoring 47 49

+1 -1

docs/deploying.md

··· 26 26 source .env 27 27 28 28 just init # terraform init 29 - just infra # creates a CPX31 in Ashburn (~$15/mo) with k3s via cloud-init 29 + just infra # creates a CPX41 in Ashburn (~$30/mo) with k3s via cloud-init 30 30 just kubeconfig # waits for k3s, pulls kubeconfig (~2 min) 31 31 just deploy # installs cert-manager, postgresql, relay, jetstream, monitoring 32 32 ```

+2 -2

infra/variables.tf

··· 11 11 } 12 12 13 13 variable "server_type" { 14 - description = "Hetzner server type (cpx31 = 8 vCPU, 16 GB RAM, 160 GB disk)" 14 + description = "Hetzner server type (cpx41 = 16 vCPU, 32 GB RAM, 240 GB disk)" 15 15 type = string 16 - default = "cpx31" 16 + default = "cpx41" 17 17 } 18 18 19 19 variable "location" {

+2 -2

justfile

··· 197 197 198 198 # seed the relay with hosts from the network (includes restart so slurper picks them up) 199 199 bootstrap: 200 - kubectl exec -n relay deploy/relay -- relay pull-hosts --relay-host https://relay1.us-west.bsky.network 200 + kubectl exec -n relay deploy/relay -- /relay pull-hosts --relay-host https://relay1.us-west.bsky.network 201 201 kubectl rollout restart deploy/relay -n relay 202 202 kubectl rollout status deploy/relay -n relay --timeout=2m 203 203 204 204 # sync PDS host list from upstream (run periodically to discover new hosts) 205 205 sync-hosts: 206 - kubectl exec -n relay deploy/relay -- relay pull-hosts --relay-host https://relay1.us-west.bsky.network 206 + kubectl exec -n relay deploy/relay -- /relay pull-hosts --relay-host https://relay1.us-west.bsky.network 207 207 208 208 # --- status --- 209 209

+27 -9

scripts/backfill

··· 50 50 return json.loads(resp.read()) 51 51 52 52 53 - def active_crawl_count(url: str, token: str) -> tuple[int, int]: 54 - """returns (active_count, total_repos_seen).""" 55 - status = crawl_status(url, token) 56 - active = status.get("host_starts", {}) 57 - seen = sum(h.get("seen", 0) for h in active.values()) 58 - return len(active), seen 53 + def active_crawl_count(url: str, token: str, retries: int = 3) -> tuple[int, int]: 54 + """returns (active_count, total_repos_seen). retries on transient errors.""" 55 + for attempt in range(retries): 56 + try: 57 + status = crawl_status(url, token) 58 + active = status.get("host_starts", {}) 59 + seen = sum(h.get("seen", 0) for h in active.values()) 60 + return len(active), seen 61 + except (ConnectionError, OSError, urllib.error.URLError) as e: 62 + if attempt < retries - 1: 63 + sys.stdout.write(f"\r connection error ({e}), retrying in 5s...") 64 + sys.stdout.flush() 65 + time.sleep(5) 66 + else: 67 + raise 59 68 60 69 61 - def wait_for_batch(url: str, token: str, baseline: int, poll_interval: float = 3.0) -> int: 62 - """poll until active crawl count returns to baseline. returns repos seen this batch.""" 70 + def wait_for_batch(url: str, token: str, baseline: int, poll_interval: float = 3.0, timeout: float = 0) -> int: 71 + """poll until active crawl count returns to baseline or timeout. 
returns repos seen this batch.""" 63 72 peak_seen = 0 73 + batch_start = time.time() 64 74 while True: 65 75 count, seen = active_crawl_count(url, token) 66 76 peak_seen = max(peak_seen, seen) 67 77 if count <= baseline: 68 78 sys.stdout.write(f"\r done — {peak_seen} repos described" + " " * 20 + "\n") 69 79 return peak_seen 80 + if timeout > 0 and (time.time() - batch_start) > timeout: 81 + sys.stdout.write(f"\r timeout — {count - baseline} crawls still active, {peak_seen} repos described (skipping)" + " " * 20 + "\n") 82 + return peak_seen 70 83 sys.stdout.write(f"\r {count - baseline} crawls active, {seen} repos described") 71 84 sys.stdout.flush() 72 85 time.sleep(poll_interval) ··· 79 92 parser.add_argument("--url", default="http://localhost:2510", help="collectiondir base url (default: http://localhost:2510)") 80 93 parser.add_argument("--batch-size", type=int, default=10, help="hosts per batch (default: 10)") 81 94 parser.add_argument("--pause", type=int, default=5, help="seconds to pause between batches (default: 5)") 95 + parser.add_argument("--batch-timeout", type=int, default=300, help="max seconds to wait per batch before skipping (default: 300, 0=no timeout)") 96 + parser.add_argument("--skip", type=int, default=0, help="skip the first N batches (for resuming)") 82 97 args = parser.parse_args() 83 98 84 99 with open(args.hosts) as f: ··· 116 131 if stopping: 117 132 break 118 133 134 + if i < args.skip: 135 + continue 136 + 119 137 elapsed = time.time() - start 120 138 rate = hosts_crawled / elapsed if elapsed > 0 else 0 121 139 eta = (len(all_hosts) - hosts_crawled) / rate if rate > 0 else 0 ··· 136 154 print(f" error sending batch: {e}") 137 155 continue 138 156 139 - repos = wait_for_batch(args.url, args.token, baseline) 157 + repos = wait_for_batch(args.url, args.token, baseline, timeout=args.batch_timeout) 140 158 total_repos += repos 141 159 hosts_crawled += len(batch) 142 160

Configure Feed

Configure Feed