tighten relay config, add collectiondir disk i/o dashboard

+2 -2

README.md

··· 8 8 9 9 **jetstream endpoint:** `wss://jetstream.waow.tech/subscribe` — same data, JSON over websockets 10 10 11 - **collection directory:** `https://relay.waow.tech/xrpc/com.atproto.sync.listReposByCollection` — paginated DID lists per collection (live events only — historical backfill is pending) 11 + **collection directory:** `https://relay.waow.tech/xrpc/com.atproto.sync.listReposByCollection` — paginated DID lists per collection (indie PDS hosts backfilled, bsky shard backfill pending) 12 12 13 13 **health check:** [`https://relay.waow.tech/xrpc/_health`](https://relay.waow.tech/xrpc/_health) 14 14 ··· 155 155 | storage (postgres) | ~2.4 GB | 156 156 | CPU usage | 5–15% | 157 157 | network throughput | ~600 events/sec typical, 2000 peak | 158 - | connected PDS hosts | ~1400 | 158 + | connected PDS hosts | ~2200 | 159 159 160 160 </details> 161 161

+2 -2

deploy/monitoring-values.yaml

··· 23 23 coreDns: 24 24 enabled: false 25 25 kubelet: 26 - enabled: false 26 + enabled: true 27 27 28 28 # --- prometheus --- 29 29 prometheus: ··· 35 35 memory: 256Mi 36 36 cpu: 100m 37 37 limits: 38 - memory: 512Mi 38 + memory: 1Gi 39 39 serviceMonitorSelectorNilUsesHelmValues: false 40 40 storageSpec: 41 41 volumeClaimTemplate:

+31 -1

deploy/relay-dashboard.json

··· 57 57 }, 58 58 "targets": [ 59 59 { 60 - "expr": "relay_connected_inbound", 60 + "expr": "sum(relay_connected_inbound)", 61 61 "legendFormat": "hosts", 62 62 "refId": "A" 63 63 } ··· 368 368 "expr": "sum(rate(collectiondir_pebble_new_total[5m]))", 369 369 "legendFormat": "new pairs", 370 370 "refId": "A" 371 + } 372 + ] 373 + }, 374 + { 375 + "title": "collectiondir disk i/o", 376 + "type": "timeseries", 377 + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 42 }, 378 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 379 + "fieldConfig": { 380 + "defaults": { 381 + "unit": "Bps", 382 + "color": { "mode": "palette-classic" }, 383 + "custom": { 384 + "fillOpacity": 15, 385 + "lineWidth": 2, 386 + "spanNulls": false 387 + } 388 + }, 389 + "overrides": [] 390 + }, 391 + "targets": [ 392 + { 393 + "expr": "rate(container_fs_writes_bytes_total{namespace=\"relay\",pod=~\"collectiondir.*\",container=\"main\"}[5m])", 394 + "legendFormat": "writes", 395 + "refId": "A" 396 + }, 397 + { 398 + "expr": "rate(container_fs_reads_bytes_total{namespace=\"relay\",pod=~\"collectiondir.*\",container=\"main\"}[5m])", 399 + "legendFormat": "reads", 400 + "refId": "B" 371 401 } 372 402 ] 373 403 }

+1 -2

deploy/relay-values.yaml

··· 11 11 env: 12 12 # DATABASE_URL injected from secret via envFrom 13 13 RELAY_PERSIST_DIR: /data 14 - RELAY_REPLAY_WINDOW: "24h" 15 - RELAY_LENIENT_SYNC_VALIDATION: "true" 14 + RELAY_REPLAY_WINDOW: "2h" 16 15 LOG_LEVEL: "info" 17 16 envFrom: 18 17 - secretRef:

+54 -16

justfile

··· 169 169 echo " curl https://$GRAFANA_DOMAIN" 170 170 echo " curl https://$JETSTREAM_DOMAIN" 171 171 172 + # deploy only the monitoring stack (grafana + prometheus) 173 + deploy-monitoring: helm-repos 174 + #!/usr/bin/env bash 175 + set -euo pipefail 176 + 177 + GRAFANA_DOMAIN="${GRAFANA_DOMAIN:-relay-metrics.waow.tech}" 178 + 179 + echo "==> installing monitoring stack" 180 + kubectl create namespace monitoring --dry-run=client -o yaml | kubectl apply -f - 181 + kubectl create configmap relay-dashboard \ 182 + --namespace monitoring \ 183 + --from-file=relay-dashboard.json=deploy/relay-dashboard.json \ 184 + --dry-run=client -o yaml | kubectl apply -f - 185 + helm upgrade --install kube-prometheus-stack prometheus-community/kube-prometheus-stack \ 186 + --namespace monitoring \ 187 + --values deploy/monitoring-values.yaml \ 188 + --set grafana.adminPassword="${GRAFANA_ADMIN_PASSWORD:-prom-operator}" \ 189 + --wait --timeout 5m 190 + kubectl apply -f deploy/relay-servicemonitor.yaml 191 + 192 + echo "==> applying grafana ingress" 193 + sed "s|GRAFANA_DOMAIN_PLACEHOLDER|$GRAFANA_DOMAIN|g" deploy/grafana-ingress.yaml \ 194 + | kubectl apply -f - 195 + 196 + echo "done." 197 + 172 198 # seed the relay with hosts from the network (includes restart so slurper picks them up) 173 199 bootstrap: 174 200 kubectl exec -n relay deploy/relay -- relay pull-hosts --relay-host https://relay1.us-west.bsky.network ··· 231 257 ./scripts/jetstream {{ args }} 232 258 233 259 # backfill collectiondir with full network PDS hosts 260 + # pass --hosts <file> to use a specific host list, otherwise extracts from relay 234 261 backfill *args: 235 262 #!/usr/bin/env bash 236 263 set -euo pipefail 237 264 : "${COLLECTIONDIR_ADMIN_TOKEN:?set COLLECTIONDIR_ADMIN_TOKEN}" 238 - : "${RELAY_ADMIN_PASSWORD:?set RELAY_ADMIN_PASSWORD}" 239 265 240 - echo "==> port-forwarding to relay and collectiondir" 241 - kubectl port-forward -n relay svc/relay 12470:2470 & 242 - RELAY_PF=$! 243 - kubectl port-forward -n relay svc/collectiondir 2510:2510 & 244 - CDIR_PF=$! 245 - trap "kill $RELAY_PF $CDIR_PF 2>/dev/null" EXIT 246 - sleep 3 266 + PIDS=() 267 + cleanup() { kill "${PIDS[@]}" 2>/dev/null; } 268 + trap cleanup EXIT 247 269 248 - echo "==> extracting connected PDS host list from relay" 249 - curl -sf -u "admin:$RELAY_ADMIN_PASSWORD" http://localhost:12470/admin/pds/list \ 250 - | jq -r '.[] | select(.HasActiveConnection) | .Host' > /tmp/relay-hosts.txt 251 - TOTAL=$(curl -sf -u "admin:$RELAY_ADMIN_PASSWORD" http://localhost:12470/admin/pds/list | jq 'length') 252 - echo " $(wc -l < /tmp/relay-hosts.txt | tr -d ' ') connected hosts (of $TOTAL total)" 270 + # port-forward to collectiondir 271 + kubectl port-forward -n relay svc/collectiondir 2510:2510 >/dev/null 2>&1 & 272 + PIDS+=($!) 273 + 274 + EXTRA_ARGS=({{ args }}) 275 + 276 + # if --hosts not provided, extract from relay 277 + if ! printf '%s\n' "${EXTRA_ARGS[@]}" | grep -q '^--hosts$'; then 278 + : "${RELAY_ADMIN_PASSWORD:?set RELAY_ADMIN_PASSWORD}" 279 + kubectl port-forward -n relay svc/relay 12470:2470 >/dev/null 2>&1 & 280 + PIDS+=($!) 281 + sleep 3 282 + 283 + echo "==> extracting connected PDS host list from relay" 284 + curl -sf -u "admin:$RELAY_ADMIN_PASSWORD" http://localhost:12470/admin/pds/list \ 285 + | jq -r '.[] | select(.HasActiveConnection) | .Host' > /tmp/relay-hosts.txt 286 + TOTAL=$(curl -sf -u "admin:$RELAY_ADMIN_PASSWORD" http://localhost:12470/admin/pds/list | jq 'length') 287 + echo " $(wc -l < /tmp/relay-hosts.txt | tr -d ' ') connected hosts (of $TOTAL total)" 288 + echo 289 + EXTRA_ARGS+=(--hosts /tmp/relay-hosts.txt) 290 + else 291 + sleep 3 292 + fi 253 293 254 - echo "==> starting backfill" 255 294 ./scripts/backfill \ 256 295 --token "$COLLECTIONDIR_ADMIN_TOKEN" \ 257 - --hosts /tmp/relay-hosts.txt \ 258 - {{ args }} 296 + "${EXTRA_ARGS[@]}"

+32 -16

scripts/backfill

··· 1 - #!/usr/bin/env -S uv run --script --quiet 1 + #!/usr/bin/env -S PYTHONUNBUFFERED=1 uv run --script --quiet 2 2 # /// script 3 3 # requires-python = ">=3.12" 4 4 # dependencies = [] ··· 18 18 import argparse 19 19 import json 20 20 import signal 21 + import sys 21 22 import time 22 23 import urllib.request 23 24 import urllib.error ··· 49 50 return json.loads(resp.read()) 50 51 51 52 52 - def wait_for_drain(url: str, token: str, poll_interval: float = 3.0) -> int: 53 - """poll crawlStatus until no active crawls remain. returns total repos seen.""" 54 - total_seen = 0 53 + def active_crawl_count(url: str, token: str) -> tuple[int, int]: 54 + """returns (active_count, total_repos_seen).""" 55 + status = crawl_status(url, token) 56 + active = status.get("host_starts", {}) 57 + seen = sum(h.get("seen", 0) for h in active.values()) 58 + return len(active), seen 59 + 60 + 61 + def wait_for_batch(url: str, token: str, baseline: int, poll_interval: float = 3.0) -> int: 62 + """poll until active crawl count returns to baseline. returns repos seen this batch.""" 63 + peak_seen = 0 55 64 while True: 56 - status = crawl_status(url, token) 57 - active = status.get("host_starts", {}) 58 - if not active: 59 - return total_seen 60 - seen = sum(h.get("seen", 0) for h in active.values()) 61 - total_seen = max(total_seen, seen) 62 - print(f" waiting... {len(active)} active crawls, {seen} repos seen", end="\r") 65 + count, seen = active_crawl_count(url, token) 66 + peak_seen = max(peak_seen, seen) 67 + if count <= baseline: 68 + sys.stdout.write(f"\r done — {peak_seen} repos described" + " " * 20 + "\n") 69 + return peak_seen 70 + sys.stdout.write(f"\r {count - baseline} crawls active, {seen} repos described") 71 + sys.stdout.flush() 63 72 time.sleep(poll_interval) 64 73 65 74 ··· 79 88 print("no hosts found in file") 80 89 return 81 90 82 - # batches 83 91 batches = [all_hosts[i : i + args.batch_size] for i in range(0, len(all_hosts), args.batch_size)] 84 92 85 93 stopping = False ··· 93 101 94 102 print(f"backfill: {len(all_hosts)} hosts in {len(batches)} batches of {args.batch_size}") 95 103 print(f"target: {args.url}") 104 + 105 + # snapshot baseline before we start — accounts for leftover crawls 106 + baseline, baseline_repos = active_crawl_count(args.url, args.token) 107 + if baseline > 0: 108 + print(f"note: {baseline} crawls already active ({baseline_repos} repos), waiting for those too") 96 109 print() 97 110 98 111 start = time.time() ··· 104 117 break 105 118 106 119 elapsed = time.time() - start 107 - print(f"batch {i + 1}/{len(batches)} ({len(batch)} hosts) [{elapsed:.0f}s elapsed, {hosts_crawled} hosts done, {total_repos} repos]") 120 + rate = hosts_crawled / elapsed if elapsed > 0 else 0 121 + eta = (len(all_hosts) - hosts_crawled) / rate if rate > 0 else 0 122 + eta_str = f", ~{eta:.0f}s remaining" if hosts_crawled > 0 else "" 123 + print(f"batch {i + 1}/{len(batches)} — {', '.join(batch[:3])}{'...' if len(batch) > 3 else ''}") 124 + print(f" [{hosts_crawled}/{len(all_hosts)} hosts, {total_repos} repos, {elapsed:.0f}s elapsed{eta_str}]") 108 125 109 126 try: 110 127 request_crawl(args.url, args.token, batch) ··· 119 136 print(f" error sending batch: {e}") 120 137 continue 121 138 122 - repos = wait_for_drain(args.url, args.token) 139 + repos = wait_for_batch(args.url, args.token, baseline) 123 140 total_repos += repos 124 141 hosts_crawled += len(batch) 125 - print() 126 142 127 143 if i < len(batches) - 1 and not stopping: 128 144 time.sleep(args.pause) 129 145 130 146 elapsed = time.time() - start 131 - print(f"done: {hosts_crawled}/{len(all_hosts)} hosts, {total_repos} repos, {elapsed:.0f}s") 147 + print(f"\nbackfill complete: {hosts_crawled}/{len(all_hosts)} hosts, {total_repos} repos, {elapsed:.0f}s") 132 148 133 149 134 150 if __name__ == "__main__":

Configure Feed

Configure Feed