feat: add zlay deployment configs, infra, and reconnect tooling

+6 -5

.gitignore

··· 1 1 # terraform 2 - infra/.terraform/ 3 - infra/.terraform.lock.hcl 4 - infra/terraform.tfstate 5 - infra/terraform.tfstate.backup 6 - infra/*.tfvars 2 + infra/**/.terraform/ 3 + infra/**/.terraform.lock.hcl 4 + infra/**/terraform.tfstate 5 + infra/**/terraform.tfstate.backup 6 + infra/**/*.tfvars 7 7 !infra/terraform.tfvars.example 8 8 9 9 # kubeconfig (fetched from server) 10 10 kubeconfig.yaml 11 + zlay-kubeconfig.yaml 11 12 12 13 # secrets 13 14 *.secret

+2

deploy/collectiondir-values.yaml

··· 19 19 - /data/dau 20 20 - --upstream 21 21 - ws://relay:2470 22 + - --crawl-qps 23 + - "8" 22 24 probes: 23 25 liveness: &probes 24 26 enabled: true

+9

deploy/monitoring-values.yaml

··· 72 72 editable: true 73 73 options: 74 74 path: /var/lib/grafana/dashboards/relay 75 + - name: zlay 76 + orgId: 1 77 + folder: "" 78 + type: file 79 + disableDeletion: false 80 + editable: true 81 + options: 82 + path: /var/lib/grafana/dashboards/zlay 75 83 dashboardsConfigMaps: 76 84 relay: relay-dashboard 85 + zlay: zlay-dashboard 77 86 defaultDashboardsEnabled: false 78 87 79 88 # --- operator ---

+70

deploy/reconnect-cronjob.yaml

# CronJob that re-announces every known PDS host to the relay.
#
# Every 4 hours it downloads the community PDS list and POSTs each https host
# to the relay's admin `requestCrawl` endpoint. The run is marked failed (and
# therefore shows up in failedJobsHistory) when the list is empty or when not
# a single request succeeded; individual host failures are tolerated.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: relay-reconnect
  namespace: relay
spec:
  schedule: "0 */4 * * *" # every 4 hours
  concurrencyPolicy: Forbid # never overlap with a run that is still going
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 1
      activeDeadlineSeconds: 1800 # 30 min max
      template:
        spec:
          restartPolicy: Never
          containers:
            - name: reconnect
              image: python:3.12-alpine
              env:
                - name: PYTHONUNBUFFERED
                  value: "1"
                - name: RELAY_ADMIN_PASSWORD
                  valueFrom:
                    secretKeyRef:
                      name: relay-secret
                      key: RELAY_ADMIN_PASSWORD
              command:
                - python3
                - -c
                - |
                  import base64
                  import json
                  import os
                  import sys
                  import time
                  import urllib.error
                  import urllib.request

                  PDS_LIST_URL = "https://raw.githubusercontent.com/mary-ext/atproto-scraping/refs/heads/trunk/state.json"
                  RELAY_URL = "http://relay.relay.svc.cluster.local:2470"
                  PASSWORD = os.environ["RELAY_ADMIN_PASSWORD"]
                  AUTH = base64.b64encode(f"admin:{PASSWORD}".encode()).decode()
                  HEADERS = {"Content-Type": "application/json", "Authorization": f"Basic {AUTH}"}
                  MAX_LOGGED_FAILURES = 5  # enough to diagnose, not enough to flood the log

                  print(f"fetching PDS list from {PDS_LIST_URL}...")
                  with urllib.request.urlopen(PDS_LIST_URL, timeout=30) as resp:
                      data = json.loads(resp.read())
                  hosts = [url.rstrip("/") for url in data.get("pdses", {}) if url.startswith("https://")]
                  print(f"found {len(hosts)} PDS hosts")
                  if not hosts:
                      # an empty list means the upstream schema changed or the file is
                      # broken; fail loudly instead of reporting a successful no-op
                      sys.exit("no PDS hosts found in list; failing the job")

                  ok = errors = 0
                  start = time.time()

                  def note_failure(host, reason):
                      # only the first few failures are printed; the totals cover the rest
                      if errors <= MAX_LOGGED_FAILURES:
                          print(f"  failed: {host}: {reason}")

                  for i, host in enumerate(hosts, start=1):
                      req = urllib.request.Request(
                          f"{RELAY_URL}/admin/pds/requestCrawl",
                          data=json.dumps({"hostname": host}).encode(),
                          headers=HEADERS,
                          method="POST",
                      )
                      try:
                          with urllib.request.urlopen(req, timeout=10):
                              ok += 1
                      except urllib.error.HTTPError as e:
                          errors += 1
                          note_failure(host, f"HTTP {e.code}")
                          e.close()  # HTTPError wraps an open response; release the socket
                      except OSError as e:
                          # URLError, ConnectionError and socket timeouts are all OSError
                          errors += 1
                          note_failure(host, e)

                      if i % 500 == 0:
                          print(f" {i}/{len(hosts)} ({ok} ok, {errors} errors, {time.time() - start:.0f}s)")

                      time.sleep(0.05)

                  print(f"done: {ok} ok, {errors} errors, {time.time() - start:.0f}s")
                  if ok == 0:
                      # nothing got through: relay unreachable or credentials rejected
                      sys.exit("every requestCrawl failed; failing the job")

+258

deploy/zlay-dashboard.json

··· 1 + { 2 + "annotations": { 3 + "list": [] 4 + }, 5 + "editable": true, 6 + "fiscalYearStartMonth": 0, 7 + "graphTooltip": 1, 8 + "links": [], 9 + "panels": [ 10 + { 11 + "title": "events/sec", 12 + "type": "timeseries", 13 + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 0 }, 14 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 15 + "fieldConfig": { 16 + "defaults": { 17 + "unit": "ops", 18 + "color": { "mode": "palette-classic" }, 19 + "custom": { 20 + "fillOpacity": 15, 21 + "lineWidth": 2, 22 + "spanNulls": false 23 + } 24 + }, 25 + "overrides": [] 26 + }, 27 + "targets": [ 28 + { 29 + "expr": "sum(rate(relay_frames_received_total{job=\"zlay\"}[5m]))", 30 + "legendFormat": "events received", 31 + "refId": "A" 32 + } 33 + ] 34 + }, 35 + { 36 + "title": "connected PDS hosts", 37 + "type": "stat", 38 + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 0 }, 39 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 40 + "fieldConfig": { 41 + "defaults": { 42 + "color": { "mode": "thresholds" }, 43 + "thresholds": { 44 + "steps": [ 45 + { "color": "red", "value": null }, 46 + { "color": "yellow", "value": 500 }, 47 + { "color": "green", "value": 1000 } 48 + ] 49 + } 50 + }, 51 + "overrides": [] 52 + }, 53 + "options": { 54 + "colorMode": "value", 55 + "graphMode": "area", 56 + "reduceOptions": { "calcs": ["lastNotNull"] } 57 + }, 58 + "targets": [ 59 + { 60 + "expr": "sum(relay_connected_inbound{job=\"zlay\"})", 61 + "legendFormat": "hosts", 62 + "refId": "A" 63 + } 64 + ] 65 + }, 66 + { 67 + "title": "downstream consumers", 68 + "type": "timeseries", 69 + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 0 }, 70 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 71 + "fieldConfig": { 72 + "defaults": { 73 + "unit": "ops", 74 + "color": { "mode": "palette-classic" }, 75 + "custom": { 76 + "fillOpacity": 15, 77 + "lineWidth": 2, 78 + "spanNulls": false 79 + } 80 + }, 81 + "overrides": [] 82 + }, 83 + "targets": [ 84 + { 85 + "expr": 
"sum(rate(relay_frames_broadcast_total{job=\"zlay\"}[5m]))", 86 + "legendFormat": "events sent", 87 + "refId": "A" 88 + } 89 + ] 90 + }, 91 + { 92 + "title": "validation/sec", 93 + "type": "timeseries", 94 + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 8 }, 95 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 96 + "fieldConfig": { 97 + "defaults": { 98 + "unit": "ops", 99 + "color": { "mode": "palette-classic" }, 100 + "custom": { 101 + "fillOpacity": 15, 102 + "lineWidth": 2, 103 + "spanNulls": false 104 + } 105 + }, 106 + "overrides": [] 107 + }, 108 + "targets": [ 109 + { 110 + "expr": "sum(rate(relay_validation_total{job=\"zlay\",result=\"validated\"}[5m]))", 111 + "legendFormat": "validated", 112 + "refId": "A" 113 + }, 114 + { 115 + "expr": "sum(rate(relay_validation_total{job=\"zlay\",result=\"failed\"}[5m]))", 116 + "legendFormat": "failed", 117 + "refId": "B" 118 + }, 119 + { 120 + "expr": "sum(rate(relay_validation_total{job=\"zlay\",result=\"skipped\"}[5m]))", 121 + "legendFormat": "skipped", 122 + "refId": "C" 123 + } 124 + ] 125 + }, 126 + { 127 + "title": "memory", 128 + "type": "timeseries", 129 + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 8 }, 130 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 131 + "fieldConfig": { 132 + "defaults": { 133 + "unit": "bytes", 134 + "color": { "mode": "palette-classic" }, 135 + "custom": { 136 + "fillOpacity": 15, 137 + "lineWidth": 2, 138 + "spanNulls": false 139 + } 140 + }, 141 + "overrides": [] 142 + }, 143 + "targets": [ 144 + { 145 + "expr": "container_memory_working_set_bytes{namespace=\"zlay\",pod=~\"zlay.*\",container=\"main\"}", 146 + "legendFormat": "working set", 147 + "refId": "A" 148 + }, 149 + { 150 + "expr": "kube_pod_container_resource_limits{namespace=\"zlay\",pod=~\"zlay.*\",container=\"main\",resource=\"memory\"}", 151 + "legendFormat": "limit", 152 + "refId": "B" 153 + } 154 + ] 155 + }, 156 + { 157 + "title": "uptime", 158 + "type": "stat", 159 + "gridPos": { "h": 8, "w": 
8, "x": 16, "y": 8 }, 160 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 161 + "fieldConfig": { 162 + "defaults": { 163 + "unit": "s", 164 + "color": { "mode": "thresholds" }, 165 + "thresholds": { 166 + "steps": [ 167 + { "color": "red", "value": null }, 168 + { "color": "yellow", "value": 3600 }, 169 + { "color": "green", "value": 86400 } 170 + ] 171 + } 172 + }, 173 + "overrides": [] 174 + }, 175 + "options": { 176 + "colorMode": "value", 177 + "graphMode": "none", 178 + "reduceOptions": { "calcs": ["lastNotNull"] } 179 + }, 180 + "targets": [ 181 + { 182 + "expr": "sum(relay_uptime_seconds{job=\"zlay\"})", 183 + "legendFormat": "uptime", 184 + "refId": "A" 185 + } 186 + ] 187 + }, 188 + { 189 + "title": "key cache/sec", 190 + "type": "timeseries", 191 + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, 192 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 193 + "fieldConfig": { 194 + "defaults": { 195 + "unit": "ops", 196 + "color": { "mode": "palette-classic" }, 197 + "custom": { 198 + "fillOpacity": 15, 199 + "lineWidth": 2, 200 + "spanNulls": false 201 + } 202 + }, 203 + "overrides": [] 204 + }, 205 + "targets": [ 206 + { 207 + "expr": "sum(rate(relay_cache_total{job=\"zlay\",result=\"hit\"}[5m]))", 208 + "legendFormat": "hits", 209 + "refId": "A" 210 + }, 211 + { 212 + "expr": "sum(rate(relay_cache_total{job=\"zlay\",result=\"miss\"}[5m]))", 213 + "legendFormat": "misses", 214 + "refId": "B" 215 + } 216 + ] 217 + }, 218 + { 219 + "title": "errors/sec", 220 + "type": "timeseries", 221 + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, 222 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 223 + "fieldConfig": { 224 + "defaults": { 225 + "unit": "ops", 226 + "color": { "mode": "palette-classic" }, 227 + "custom": { 228 + "fillOpacity": 15, 229 + "lineWidth": 2, 230 + "spanNulls": false 231 + } 232 + }, 233 + "overrides": [] 234 + }, 235 + "targets": [ 236 + { 237 + "expr": 
"sum(rate(relay_decode_errors_total{job=\"zlay\"}[5m]))", 238 + "legendFormat": "decode errors", 239 + "refId": "A" 240 + }, 241 + { 242 + "expr": "sum(rate(relay_slow_consumers_total{job=\"zlay\"}[5m]))", 243 + "legendFormat": "slow consumer drops", 244 + "refId": "B" 245 + } 246 + ] 247 + } 248 + ], 249 + "schemaVersion": 39, 250 + "tags": ["zlay", "atproto"], 251 + "templating": { "list": [] }, 252 + "time": { "from": "now-1h", "to": "now" }, 253 + "timepicker": {}, 254 + "timezone": "browser", 255 + "title": "zlay.waow.tech", 256 + "uid": "zlay-waow", 257 + "version": 1 258 + }

+68

deploy/zlay-ingress.yaml

# Public ingress for zlay.
#
# ZLAY_DOMAIN_PLACEHOLDER is substituted with $ZLAY_DOMAIN by `just zlay-deploy`
# before the manifest is applied. The firehose websocket endpoint goes to the
# service's port 3000; every other path goes to the HTTP port 3001.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: zlay
  namespace: zlay
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    # No websocket annotation is needed: Traefik proxies Upgrade requests on
    # its own. The former `custom-response-headers` annotation was a Traefik 1.x
    # key that is not a recognised Ingress annotation in Traefik 2+, so it was dead config.
spec:
  ingressClassName: traefik
  tls:
    - hosts:
        - ZLAY_DOMAIN_PLACEHOLDER
      secretName: zlay-tls
  rules:
    - host: ZLAY_DOMAIN_PLACEHOLDER
      http:
        paths:
          # firehose (websocket) port
          - path: /xrpc/com.atproto.sync.subscribeRepos
            pathType: Exact
            backend:
              service:
                name: zlay
                port:
                  number: 3000
          # catch-all for the HTTP port
          - path: /
            pathType: Prefix
            backend:
              service:
                name: zlay
                port:
                  number: 3001
          # NOTE(review): the rules below target the same backend as the `/`
          # Prefix rule above, so they only matter if `/` is later narrowed.
          # They also mean /metrics and /admin are reachable from the internet.
          - path: /_health
            pathType: Exact
            backend:
              service:
                name: zlay
                port:
                  number: 3001
          - path: /_stats
            pathType: Exact
            backend:
              service:
                name: zlay
                port:
                  number: 3001
          - path: /metrics
            pathType: Exact
            backend:
              service:
                name: zlay
                port:
                  number: 3001
          - path: /admin
            pathType: Prefix
            backend:
              service:
                name: zlay
                port:
                  number: 3001
          - path: /xrpc/com.atproto.sync.requestCrawl
            pathType: Exact
            backend:
              service:
                name: zlay
                port:
                  number: 3001

+16

deploy/zlay-servicemonitor.yaml

# Prometheus scrape config for zlay.
#
# Lives in the `monitoring` namespace and reaches across into the `zlay`
# namespace via namespaceSelector. Applied as a standalone manifest by
# `just zlay-deploy` after kube-prometheus-stack is installed.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: zlay
  namespace: monitoring
spec:
  selector:
    matchLabels:
      # presumably the label the app-template chart puts on the zlay Service — verify
      app.kubernetes.io/name: zlay
  namespaceSelector:
    matchNames:
      - zlay
  endpoints:
    # `http` is the Service port name declared in deploy/zlay-values.yaml (3001)
    - port: http
      path: /metrics
      interval: 30s

+58

deploy/zlay-values.yaml

# bjw-s/app-template helm values for the zat relay (zlay)
# schema: https://github.com/bjw-s-labs/helm-charts/tree/main/charts/other/app-template
#
# installed by `just zlay-deploy` as release `zlay` in namespace `zlay`.

controllers:
  zlay:
    containers:
      main:
        image:
          repository: atcr.io/zzstoatzz.io/zlay
          # `just zlay-publish` pushes this same mutable tag
          tag: latest
        env:
          # websocket (firehose) listener; the ingress sends subscribeRepos here
          RELAY_PORT: "3000"
          # HTTP listener: health, stats, metrics, admin
          RELAY_HTTP_PORT: "3001"
          RELAY_UPSTREAM: "bsky.network"
          # presumably under the `data` PVC declared below — verify the chart's default mount path
          RELAY_DATA_DIR: /data/events
          RELAY_RETENTION_HOURS: "72"
        envFrom:
          # created by `just zlay-deploy`; carries DATABASE_URL and RELAY_ADMIN_PASSWORD
          - secretRef:
              name: zlay-secret
        probes:
          # liveness and readiness share one definition through the &probes anchor
          liveness: &probes
            enabled: true
            custom: true
            spec:
              httpGet:
                path: /_health
                # anchored so the Service port below cannot drift from the probe port
                port: &http-port 3001
              initialDelaySeconds: 5
              periodSeconds: 10
              timeoutSeconds: 3
              failureThreshold: 5
          readiness: *probes
        resources:
          requests:
            memory: 128Mi
            cpu: 250m
          limits:
            memory: 2Gi

service:
  zlay:
    controller: zlay
    ports:
      ws:
        port: 3000
      # scraped by deploy/zlay-servicemonitor.yaml by this port name
      http:
        port: *http-port

defaultPodOptions:
  imagePullSecrets:
    # NOTE(review): not created by the zlay-deploy recipe in this change —
    # presumably provisioned by hand; confirm it exists in the zlay namespace
    - name: atcr-creds

persistence:
  data:
    enabled: true
    type: persistentVolumeClaim
    accessMode: ReadWriteOnce
    size: 20Gi

+8 -2

docs/architecture.md

··· 41 41 42 42 the relay and collectiondir ServiceMonitors are standalone manifests (`kubectl apply -f`) rather than inline in the helm values — the `additionalServiceMonitors` field in kube-prometheus-stack silently fails when targeting services in a different namespace. 43 43 44 + ## PDS connection maintenance 45 + 46 + relays try to reconnect to PDS hosts when connections drop, but eventually give up after repeated failures (exponential backoff). PDS hosts re-announce themselves to bluesky's relay when they come back online, but not to third-party relays like ours. this causes a natural decay in connected host count over time. 47 + 48 + fix: a k8s CronJob (`deploy/reconnect-cronjob.yaml`) runs every 4 hours, fetching the [community PDS list](https://github.com/mary-ext/atproto-scraping) and sending `requestCrawl` for each host. this can also be run manually via `just reconnect`. 49 + 44 50 ## steady-state specs 45 51 46 52 | metric | value | 47 53 |--------|-------| 48 54 | storage (relay data) | ~21 GB | 49 55 | storage (postgres) | ~2.4 GB | 50 - | storage (collectiondir pebble) | ~300 MB (pre-bsky-backfill) | 56 + | storage (collectiondir pebble) | ~5 GB (post-backfill) | 51 57 | CPU usage | 5-15% | 52 58 | network throughput | ~600 events/sec typical, 2000 peak | 53 - | connected PDS hosts | ~2200 | 59 + | connected PDS hosts | ~2800 |

+17 -3

docs/backfill.md

··· 16 16 17 17 **indie PDS hosts** (~2200): independently-run servers, mostly small (1-100 accounts each). backfilling all of them takes minutes. 18 18 19 - **bluesky shards** (~87): the mushroom-named hosts (`amanita.us-east.host.bsky.network`, `chanterelle.us-west.host.bsky.network`, etc.) that host the vast majority of accounts. ~14K repos per shard on average, up to ~40K for the largest. these take hours to crawl. 19 + **bluesky shards** (~88): the mushroom-named hosts (`amanita.us-east.host.bsky.network`, `chanterelle.us-west.host.bsky.network`, etc.) that host the vast majority of accounts. ~30K-50K repos per shard on average, up to ~500K for the largest. these take days to crawl (see batch sizing below). 20 20 21 21 ## running the backfill 22 22 ··· 35 35 ```bash 36 36 ./scripts/backfill --token "$COLLECTIONDIR_ADMIN_TOKEN" --hosts hosts.txt 37 37 ./scripts/backfill --token "$TOKEN" --hosts hosts.txt --batch-size 20 --pause 30 38 + 39 + # resume a run that died partway through (skip first 35 batches) 40 + ./scripts/backfill --token "$TOKEN" --hosts hosts.txt --batch-size 1 --skip 35 41 + 42 + # set a timeout per batch (useful for long-running shard crawls) 43 + ./scripts/backfill --token "$TOKEN" --hosts hosts.txt --batch-size 1 --batch-timeout 600 38 44 ``` 39 45 40 - the script sends batches of N hosts (default: 10) to `POST /admin/pds/requestCrawl`, then polls `GET /admin/crawlStatus` until active crawls drain before sending the next batch. ctrl-c stops after the current batch finishes. 46 + the script sends batches of N hosts (default: 10) to `POST /admin/pds/requestCrawl`, then polls `GET /admin/crawlStatus` until active crawls drain before sending the next batch. if `--batch-timeout` is set, it moves on after that many seconds even if crawls are still active. ctrl-c stops after the current batch finishes. 47 + 48 + the script retries on transient connection errors (e.g. 
port-forward drops) with exponential backoff — up to 12 retries for status polling, 6 for crawl requests. 41 49 42 50 ## batch sizing 43 51 ··· 53 61 - `kubectl exec -n relay deploy/collectiondir -- df -h /data` for pebble disk usage 54 62 - crawl status API: `curl -H "Authorization: Bearer $TOKEN" localhost:2510/admin/crawlStatus` 55 63 64 + ## gotchas 65 + 66 + - **port-forwards die** after ~80 minutes. server-side crawls survive the disconnect, so progress isn't lost — but the script can't poll status or submit new batches until the port-forward is re-established. the retry logic handles brief drops; for longer outages, re-run with `--skip`. 67 + - **crawl state is in-memory.** a collectiondir pod restart loses all in-progress crawl goroutines. completed pairs are already in pebble and safe. 68 + - **no 429 retry in crawl code.** the collectiondir's crawl thread doesn't retry on HTTP 429. a single rate-limit response kills the entire crawl for that host. this is why bsky shards must be submitted one at a time. 69 + 56 70 ## storage impact 57 71 58 - pebble stores one key per `(collection, DID)` pair. the indie host backfill brought the DB to ~300 MB. the full bsky shard backfill (millions of accounts, each with multiple collections) will likely grow it to a few GB. the collectiondir has a 10Gi PVC, so there's plenty of headroom. 72 + pebble stores one key per `(collection, DID)` pair. post-backfill (indie + all bsky shards, ~2.96M repos), the DB is ~5 GB. the collectiondir has a 10Gi PVC.

+6

docs/deploying.md

··· 45 45 just status # nodes, pods, health check 46 46 just logs # tail relay logs 47 47 just health # curl the public health endpoint 48 + just reconnect # re-announce all known PDS hosts to the relay 49 + just backfill # backfill collectiondir with full network data 48 50 just firehose # consume the firehose (passes args through) 49 51 just jetstream # consume the jetstream (passes args through) 50 52 just ssh # ssh into the server 51 53 just destroy # tear down everything 52 54 ``` 55 + 56 + ## maintenance 57 + 58 + a k8s CronJob (`deploy/reconnect-cronjob.yaml`) runs every 4 hours to re-announce PDS hosts to the relay — see [architecture](architecture.md#pds-connection-maintenance) for why this is needed. `just reconnect` runs the same logic manually. 53 59 54 60 ## targeted deployments 55 61

+216

docs/hacks.md

··· 1 + # the micro-PDS trick 2 + 3 + a technique for injecting specific (DID, collection) pairs into the collectiondir without crawling entire PDS hosts. 4 + 5 + ## the problem 6 + 7 + the collectiondir indexes (DID, collection) pairs via two paths: 8 + 9 + 1. **firehose** — sees new `create`/`update` commits in real time. only indexes the specific collection in each commit. 10 + 2. **requestCrawl** — crawls a PDS by paginating `com.atproto.sync.listRepos` then calling `com.atproto.repo.describeRepo` for each DID. indexes all collections for every repo on that host. 11 + 12 + the backfill uses path 2. but bsky PDS shards have up to ~500K repos each and enforce a shared IP-based rate limit of 3,000 requests per 300 seconds (~10 QPS). the crawl code has no retry on 429 — a single rate limit response kills the entire crawl. at the default 100 QPS, the crawl dies after processing ~2-3K repos (about 0.5% of a 500K-repo shard). even at 8 QPS, running multiple shard crawls in parallel exhausts the shared budget. 13 + 14 + this leaves a gap: repos the relay knows about (firehose is current, revs match) but the collectiondir never indexed because the crawl died before reaching them. 15 + 16 + ## the trick 17 + 18 + the collectiondir's `requestCrawl` doesn't know or care what it's crawling. it talks XRPC to whatever hostname you give it. so: stand up a tiny HTTP server that implements just the two required endpoints — `listRepos` and `describeRepo` — with only the specific DIDs you need, and point `requestCrawl` at it. 19 + 20 + 22 DIDs with 784 collection pairs, indexed in 3 seconds. no rate limits, because it's our server talking to our server. 21 + 22 + ## how to do it 23 + 24 + ### 1.
gather the data 25 + 26 + for each missing DID, resolve its PDS via `plc.directory` and call `describeRepo` to get its collections: 27 + 28 + ```python 29 + import json, urllib.request 30 + 31 + did = "did:plc:example" 32 + doc = json.loads(urllib.request.urlopen(f"https://plc.directory/{did}").read()) 33 + pds = [s["serviceEndpoint"] for s in doc["service"] if s["id"] == "#atproto_pds"][0] 34 + desc = json.loads(urllib.request.urlopen(f"{pds}/xrpc/com.atproto.repo.describeRepo?repo={did}").read()) 35 + # desc["collections"] is what we need 36 + ``` 37 + 38 + save all results to a JSON file keyed by DID: 39 + 40 + ```json 41 + { 42 + "did:plc:example": { 43 + "handle": "user.bsky.social", 44 + "did": "did:plc:example", 45 + "collections": ["app.bsky.feed.post", "io.atcr.sailor.profile", "..."] 46 + } 47 + } 48 + ``` 49 + 50 + ### 2. deploy as a k8s Job 51 + 52 + create a ConfigMap with the data and a Job that: 53 + - starts an HTTP server implementing `listRepos` and `describeRepo` 54 + - calls `requestCrawl` on the collectiondir pointing at itself (`http://POD_IP:8080`) 55 + - polls `crawlStatus` until the crawl drains 56 + - exits 57 + 58 + ```bash 59 + # create the configmap 60 + kubectl create configmap micro-pds-data \ 61 + --namespace relay \ 62 + --from-file=dids.json=missing-dids.json 63 + 64 + # create the job (see below for manifest) 65 + kubectl apply -f micro-pds-job.yaml 66 + 67 + # watch it run 68 + kubectl logs -n relay job/micro-pds-crawl -f 69 + 70 + # clean up 71 + kubectl delete job micro-pds-crawl configmap micro-pds-data -n relay 72 + ``` 73 + 74 + ### 3. 
the Job manifest 75 + 76 + ```yaml 77 + apiVersion: batch/v1 78 + kind: Job 79 + metadata: 80 + name: micro-pds-crawl 81 + namespace: relay 82 + spec: 83 + backoffLimit: 0 84 + activeDeadlineSeconds: 120 85 + template: 86 + spec: 87 + restartPolicy: Never 88 + containers: 89 + - name: micro-pds 90 + image: python:3.12-alpine 91 + env: 92 + - name: PYTHONUNBUFFERED 93 + value: "1" 94 + - name: POD_IP 95 + valueFrom: 96 + fieldRef: 97 + fieldPath: status.podIP 98 + - name: ADMIN_TOKEN 99 + valueFrom: 100 + secretKeyRef: 101 + name: collectiondir-secret 102 + key: COLLECTIONS_ADMIN_TOKEN 103 + volumeMounts: 104 + - name: data 105 + mountPath: /data 106 + command: 107 + - python3 108 + - -c 109 + - | 110 + import json, os, sys, time, urllib.request 111 + from http.server import HTTPServer, BaseHTTPRequestHandler 112 + from urllib.parse import urlparse, parse_qs 113 + from threading import Thread 114 + 115 + PORT = 8080 116 + POD_IP = os.environ["POD_IP"] 117 + ADMIN_TOKEN = os.environ["ADMIN_TOKEN"] 118 + COLLECTIONDIR = "http://collectiondir.relay.svc.cluster.local:2510" 119 + 120 + with open("/data/dids.json") as f: 121 + DID_DATA = json.load(f) 122 + 123 + DIDS = list(DID_DATA.keys()) 124 + print(f"micro-pds: serving {len(DIDS)} DIDs") 125 + 126 + class Handler(BaseHTTPRequestHandler): 127 + def log_message(self, *args): 128 + pass 129 + 130 + def do_GET(self): 131 + parsed = urlparse(self.path) 132 + params = parse_qs(parsed.query) 133 + 134 + if parsed.path == "/xrpc/com.atproto.sync.listRepos": 135 + cursor = params.get("cursor", [""])[0] 136 + limit = int(params.get("limit", ["1000"])[0]) 137 + start = int(cursor) if cursor else 0 138 + batch = DIDS[start:start + limit] 139 + repos = [{"did": d, "head": "baf", "rev": "0", "active": True} for d in batch] 140 + resp = {"repos": repos} 141 + if start + limit < len(DIDS): 142 + resp["cursor"] = str(start + limit) 143 + self.send_response(200) 144 + self.send_header("Content-Type", "application/json") 145 + 
self.end_headers() 146 + self.wfile.write(json.dumps(resp).encode()) 147 + 148 + elif parsed.path == "/xrpc/com.atproto.repo.describeRepo": 149 + repo = params.get("repo", [""])[0] 150 + if repo in DID_DATA: 151 + info = DID_DATA[repo] 152 + resp = { 153 + "handle": info.get("handle", "unknown"), 154 + "did": repo, 155 + "didDoc": {}, 156 + "collections": info["collections"], 157 + "handleIsCorrect": True, 158 + } 159 + self.send_response(200) 160 + self.send_header("Content-Type", "application/json") 161 + self.end_headers() 162 + self.wfile.write(json.dumps(resp).encode()) 163 + else: 164 + self.send_response(404) 165 + self.end_headers() 166 + else: 167 + self.send_response(404) 168 + self.end_headers() 169 + 170 + server = HTTPServer(("0.0.0.0", PORT), Handler) 171 + Thread(target=server.serve_forever, daemon=True).start() 172 + print(f"micro-pds: listening on {POD_IP}:{PORT}") 173 + time.sleep(1) 174 + 175 + # trigger crawl 176 + payload = json.dumps({"hostname": f"http://{POD_IP}:{PORT}"}).encode() 177 + req = urllib.request.Request( 178 + f"{COLLECTIONDIR}/admin/pds/requestCrawl", 179 + data=payload, 180 + headers={"Content-Type": "application/json", "Authorization": f"Bearer {ADMIN_TOKEN}"}, 181 + method="POST", 182 + ) 183 + with urllib.request.urlopen(req, timeout=10) as resp: 184 + print(f"micro-pds: requestCrawl -> {resp.status}") 185 + 186 + # wait for drain 187 + while True: 188 + time.sleep(2) 189 + req = urllib.request.Request( 190 + f"{COLLECTIONDIR}/admin/crawlStatus", 191 + headers={"Authorization": f"Bearer {ADMIN_TOKEN}"}, 192 + ) 193 + with urllib.request.urlopen(req, timeout=10) as resp: 194 + status = json.loads(resp.read()) 195 + if not status.get("host_starts"): 196 + break 197 + 198 + print("micro-pds: done") 199 + server.shutdown() 200 + volumes: 201 + - name: data 202 + configMap: 203 + name: micro-pds-data 204 + ``` 205 + 206 + ## when to use this 207 + 208 + - you have a specific set of DIDs that need indexing and can't wait for a 
full shard crawl 209 + - the full crawl is blocked by rate limits 210 + - you've verified the DIDs exist on their PDS (call `describeRepo` yourself first) 211 + 212 + ## limitations 213 + 214 + - only indexes what you give it. if you miss a DID, it won't be indexed. 215 + - the collectiondir treats this like any other crawl — it writes to the main pebble DB via the normal `ingestCrawl` pathway. no special code paths, no risk to existing data. 216 + - the data you gather is a point-in-time snapshot. if a user adds a new collection after you gathered the data, the micro-PDS won't know about it (but the firehose will catch it going forward).

+64

infra/zlay/main.tf

# Hetzner infrastructure for zlay: one firewall and one server that installs
# k3s on first boot through cloud-init.

# Looks up an SSH key that must already exist in the Hetzner project.
data "hcloud_ssh_key" "main" {
  name = "relay-key"
}

# CIDRs allowed to reach the administrative ports (ssh and the k3s API).
# Declared next to the firewall so this file is self-contained. The default
# keeps both ports open to the world; override it in a *.tfvars file to lock
# them down to your own address.
variable "admin_source_ips" {
  description = "CIDR blocks allowed to reach ssh (22) and the k3s API (6443)"
  type        = list(string)
  default     = ["0.0.0.0/0", "::/0"]
}

resource "hcloud_firewall" "zlay" {
  name = "${var.server_name}-fw"

  # ssh
  rule {
    direction  = "in"
    protocol   = "tcp"
    port       = "22"
    source_ips = var.admin_source_ips
  }

  # http
  rule {
    direction  = "in"
    protocol   = "tcp"
    port       = "80"
    source_ips = ["0.0.0.0/0", "::/0"]
  }

  # https
  rule {
    direction  = "in"
    protocol   = "tcp"
    port       = "443"
    source_ips = ["0.0.0.0/0", "::/0"]
  }

  # k3s api (restrict via admin_source_ips in production)
  rule {
    direction  = "in"
    protocol   = "tcp"
    port       = "6443"
    source_ips = var.admin_source_ips
  }
}

resource "hcloud_server" "zlay" {
  name        = var.server_name
  server_type = var.server_type
  location    = var.location
  image       = "ubuntu-24.04"

  ssh_keys     = [data.hcloud_ssh_key.main.id]
  firewall_ids = [hcloud_firewall.zlay.id]

  # First-boot script. It installs k3s with the server's public IP as an extra
  # TLS SAN so a kubeconfig pointing at that IP validates, then touches
  # /run/k3s-ready, which `just zlay-kubeconfig` polls for over ssh.
  # NOTE(review): editing user_data is expected to make the provider replace
  # the server — confirm against the hcloud_server docs before changing it.
  user_data = <<-CLOUDINIT
    #cloud-config
    package_update: true
    packages:
      - curl
      - jq

    runcmd:
      - |
        PUBLIC_IP=$(curl -s http://169.254.169.254/hetzner/v1/metadata/public-ipv4)
        curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="server --tls-san $PUBLIC_IP" sh -
      - while ! kubectl get nodes >/dev/null 2>&1; do sleep 2; done
      - touch /run/k3s-ready
  CLOUDINIT
}

+3

infra/zlay/outputs.tf

# Public IPv4 address of the zlay server. Read by `just zlay-server-ip`
# (`terraform output -raw server_ip`), which the ssh, kubeconfig and deploy
# recipes call.
output "server_ip" {
  value = hcloud_server.zlay.ipv4_address
}

+24

infra/zlay/variables.tf

# Input variables for the zlay server.

# No default: must be supplied on every run. The justfile passes it as
# -var="hcloud_token=$HCLOUD_TOKEN".
variable "hcloud_token" {
  description = "Hetzner Cloud API token"
  type        = string
  sensitive   = true
}


variable "server_type" {
  description = "Hetzner server type (cpx41 = 8 vCPU, 16 GB RAM, 160 GB disk)"
  type        = string
  default     = "cpx41"
}

variable "location" {
  description = "Hetzner datacenter location (hil = Hillsboro OR)"
  type        = string
  default     = "hil"
}

# Also used as the prefix of the firewall name ("<server_name>-fw").
variable "server_name" {
  description = "Name for the server"
  type        = string
  default     = "zlay"
}

+12

infra/zlay/versions.tf

# Provider requirements and configuration for the zlay stack.
terraform {
  required_providers {
    hcloud = {
      source = "hetznercloud/hcloud"
      # pessimistic constraint: any 1.x release at or above 1.45
      version = "~> 1.45"
    }
  }
}

# The token comes from the sensitive `hcloud_token` variable.
provider "hcloud" {
  token = var.hcloud_token
}

+160

justfile

··· 1 1 # ATProto relay deployment 2 2 # required env vars: HCLOUD_TOKEN, RELAY_DOMAIN, RELAY_ADMIN_PASSWORD, POSTGRES_PASSWORD, LETSENCRYPT_EMAIL 3 3 # optional env vars: GRAFANA_DOMAIN (default: relay-metrics.waow.tech), GRAFANA_ADMIN_PASSWORD, JETSTREAM_DOMAIN (default: jetstream.waow.tech) 4 + # zlay env vars: ZLAY_DOMAIN, ZLAY_ADMIN_PASSWORD, ZLAY_POSTGRES_PASSWORD, LETSENCRYPT_EMAIL 4 5 5 6 set dotenv-load 6 7 ··· 147 148 --wait --timeout 5m 148 149 kubectl apply -f deploy/collectiondir-servicemonitor.yaml 149 150 151 + echo "==> installing reconnect cronjob" 152 + kubectl apply -f deploy/reconnect-cronjob.yaml 153 + 150 154 echo "==> installing jetstream" 151 155 JETSTREAM_DOMAIN="${JETSTREAM_DOMAIN:-jetstream.waow.tech}" 152 156 helm upgrade --install jetstream bjw-s/app-template \ ··· 248 252 249 253 # --- scripts --- 250 254 255 + # reconnect relay to all known PDS hosts (run periodically, e.g. every 4 hours) 256 + reconnect *args: 257 + #!/usr/bin/env bash 258 + set -euo pipefail 259 + : "${RELAY_ADMIN_PASSWORD:?set RELAY_ADMIN_PASSWORD}" 260 + ./scripts/reconnect --password "$RELAY_ADMIN_PASSWORD" {{ args }} 261 + 251 262 # consume the firehose (default: 10s of bsky posts) 252 263 firehose *args: 253 264 ./scripts/firehose {{ args }} ··· 294 305 ./scripts/backfill \ 295 306 --token "$COLLECTIONDIR_ADMIN_TOKEN" \ 296 307 "${EXTRA_ARGS[@]}" 308 + 309 + # === zlay (zig relay) === 310 + 311 + export ZLAY_KUBECONFIG := justfile_directory() / "zlay-kubeconfig.yaml" 312 + 313 + # initialize zlay terraform 314 + zlay-init: 315 + terraform -chdir=infra/zlay init 316 + 317 + # create the zlay hetzner server with k3s 318 + zlay-infra: 319 + terraform -chdir=infra/zlay apply -var="hcloud_token=$HCLOUD_TOKEN" 320 + 321 + # destroy zlay infrastructure 322 + zlay-destroy: 323 + terraform -chdir=infra/zlay destroy -var="hcloud_token=$HCLOUD_TOKEN" 324 + 325 + # get the zlay server IP 326 + zlay-server-ip: 327 + @terraform -chdir=infra/zlay output -raw server_ip 328 + 
# ssh into the zlay server
zlay-ssh:
    ssh root@$(just zlay-server-ip)

# fetch zlay kubeconfig (waits for k3s to report ready, then rewrites the API address)
zlay-kubeconfig:
    #!/usr/bin/env bash
    set -euo pipefail
    # the kubeconfig carries cluster credentials: make sure it is never
    # created with group/world-readable permissions, even briefly
    umask 077
    IP=$(just zlay-server-ip)
    echo "fetching kubeconfig from $IP..."
    until ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new "root@$IP" test -f /run/k3s-ready 2>/dev/null; do
        echo " waiting for k3s..."
        sleep 5
    done
    scp "root@$IP:/etc/rancher/k3s/k3s.yaml" zlay-kubeconfig.yaml
    # k3s writes the loopback address; point it at the server's public IP.
    # BSD sed (macOS) requires an explicit (empty) backup suffix after -i.
    if [[ "$(uname)" == "Darwin" ]]; then
        sed -i '' "s|127.0.0.1|$IP|g" zlay-kubeconfig.yaml
    else
        sed -i "s|127.0.0.1|$IP|g" zlay-kubeconfig.yaml
    fi
    chmod 600 zlay-kubeconfig.yaml
    echo "kubeconfig written to zlay-kubeconfig.yaml"
    KUBECONFIG=zlay-kubeconfig.yaml kubectl get nodes

# build and push zlay image (cross-compile on host, tiny docker image)
zlay-publish:
    #!/usr/bin/env bash
    set -euo pipefail
    # do not name this TMPDIR: that variable is read by mktemp, zig and docker
    # for their own scratch files, which would then land inside the build context
    workdir=$(mktemp -d)
    # single quotes: expand (and quote) the path when the trap fires
    trap 'rm -rf "$workdir"' EXIT
    git clone --depth 1 https://tangled.org/zzstoatzz.io/zlay "$workdir"
    cd "$workdir"
    zig build -Dtarget=x86_64-linux -Doptimize=ReleaseSafe
    docker build --platform linux/amd64 -t atcr.io/zzstoatzz.io/zlay:latest .
    ATCR_AUTO_AUTH=1 docker push atcr.io/zzstoatzz.io/zlay:latest

# deploy zlay to its k3s cluster
# required env vars: ZLAY_DOMAIN, ZLAY_POSTGRES_PASSWORD, LETSENCRYPT_EMAIL
# optional env vars: ZLAY_ADMIN_PASSWORD, ZLAY_METRICS_DOMAIN, GRAFANA_ADMIN_PASSWORD
zlay-deploy: helm-repos
    #!/usr/bin/env bash
    set -euo pipefail
    export KUBECONFIG="$ZLAY_KUBECONFIG"

    : "${ZLAY_DOMAIN:?set ZLAY_DOMAIN}"
    : "${ZLAY_POSTGRES_PASSWORD:?set ZLAY_POSTGRES_PASSWORD}"
    : "${LETSENCRYPT_EMAIL:?set LETSENCRYPT_EMAIL}"
    # NOTE(review): an unset ZLAY_ADMIN_PASSWORD is stored in the secret as an
    # empty string; confirm zlay treats that as "admin disabled", not "no auth"
    ZLAY_ADMIN_PASSWORD="${ZLAY_ADMIN_PASSWORD:-}"

    echo "==> creating namespace"
    kubectl create namespace zlay --dry-run=client -o yaml | kubectl apply -f -

    echo "==> installing cert-manager"
    helm upgrade --install cert-manager jetstack/cert-manager \
        --namespace cert-manager --create-namespace \
        --set crds.enabled=true \
        --wait

    echo "==> applying cluster issuer"
    sed "s|you@example.com|$LETSENCRYPT_EMAIL|g" deploy/cluster-issuer.yaml \
        | kubectl apply -f -

    echo "==> installing postgresql"
    helm upgrade --install zlay-db bitnami/postgresql \
        --namespace zlay \
        --values deploy/postgres-values.yaml \
        --set auth.password="$ZLAY_POSTGRES_PASSWORD" \
        --wait

    echo "==> creating zlay secret"
    kubectl create secret generic zlay-secret \
        --namespace zlay \
        --from-literal=DATABASE_URL="postgres://relay:${ZLAY_POSTGRES_PASSWORD}@zlay-db-postgresql.zlay.svc.cluster.local:5432/relay" \
        --from-literal=RELAY_ADMIN_PASSWORD="$ZLAY_ADMIN_PASSWORD" \
        --dry-run=client -o yaml | kubectl apply -f -

    echo "==> installing zlay"
    helm upgrade --install zlay bjw-s/app-template \
        --namespace zlay \
        --values deploy/zlay-values.yaml \
        --wait --timeout 5m

    echo "==> applying ingress"
    sed "s|ZLAY_DOMAIN_PLACEHOLDER|$ZLAY_DOMAIN|g" deploy/zlay-ingress.yaml \
        | kubectl apply -f -

    echo "==> installing monitoring"
    ZLAY_METRICS_DOMAIN="${ZLAY_METRICS_DOMAIN:-zlay-metrics.waow.tech}"
    kubectl create namespace monitoring --dry-run=client -o yaml | kubectl apply -f -
    helm upgrade --install kube-prometheus-stack prometheus-community/kube-prometheus-stack \
        --namespace monitoring \
        --values deploy/monitoring-values.yaml \
        --set grafana.adminPassword="${GRAFANA_ADMIN_PASSWORD:-prom-operator}" \
        --set "grafana.grafana\\.ini.server.root_url=https://$ZLAY_METRICS_DOMAIN" \
        --wait --timeout 5m
    kubectl apply -f deploy/zlay-servicemonitor.yaml

    echo "==> applying grafana ingress"
    sed "s|GRAFANA_DOMAIN_PLACEHOLDER|$ZLAY_METRICS_DOMAIN|g" deploy/grafana-ingress.yaml \
        | kubectl apply -f -

    echo ""
    echo "done. point DNS:"
    echo " $ZLAY_DOMAIN -> $(just zlay-server-ip)"
    echo " $ZLAY_METRICS_DOMAIN -> $(just zlay-server-ip)"
    echo "then check:"
    echo " curl https://$ZLAY_DOMAIN/_health"

# check zlay status
zlay-status:
    #!/usr/bin/env bash
    # deliberately no `set -e`: each section should print even if an earlier one fails
    export KUBECONFIG="$ZLAY_KUBECONFIG"
    echo "==> nodes"
    kubectl get nodes
    echo ""
    echo "==> pods"
    kubectl get pods -n zlay
    echo ""
    echo "==> health"
    kubectl exec -n zlay deploy/zlay -- wget -qO- http://localhost:3001/_health 2>/dev/null || echo "(zlay not ready)"

# tail zlay logs
zlay-logs:
    KUBECONFIG="$ZLAY_KUBECONFIG" kubectl logs -n zlay deploy/zlay -f

# check zlay health via public endpoint (exits non-zero when the endpoint is down)
zlay-health:
    #!/usr/bin/env bash
    # pipefail: without it a failed curl is masked by jq succeeding on empty input
    set -euo pipefail
    : "${ZLAY_DOMAIN:?set ZLAY_DOMAIN}"
    curl -sf "https://$ZLAY_DOMAIN/_health" | jq .

+25 -16

scripts/backfill

··· 50 50 return json.loads(resp.read()) 51 51 52 52 53 - def active_crawl_count(url: str, token: str, retries: int = 3) -> tuple[int, int]: 54 - """returns (active_count, total_repos_seen). retries on transient errors.""" 53 + def active_crawl_count(url: str, token: str, retries: int = 12) -> tuple[int, int]: 54 + """returns (active_count, total_repos_seen). retries on transient errors with backoff.""" 55 55 for attempt in range(retries): 56 56 try: 57 57 status = crawl_status(url, token) ··· 60 60 return len(active), seen 61 61 except (ConnectionError, OSError, urllib.error.URLError) as e: 62 62 if attempt < retries - 1: 63 - sys.stdout.write(f"\r connection error ({e}), retrying in 5s...") 63 + delay = min(5 * (attempt + 1), 30) 64 + sys.stdout.write(f"\r connection error (attempt {attempt + 1}/{retries}), retrying in {delay}s...") 64 65 sys.stdout.flush() 65 - time.sleep(5) 66 + time.sleep(delay) 66 67 else: 67 68 raise 68 69 ··· 141 142 print(f"batch {i + 1}/{len(batches)} — {', '.join(batch[:3])}{'...' 
if len(batch) > 3 else ''}") 142 143 print(f" [{hosts_crawled}/{len(all_hosts)} hosts, {total_repos} repos, {elapsed:.0f}s elapsed{eta_str}]") 143 144 144 - try: 145 - request_crawl(args.url, args.token, batch) 146 - except urllib.error.HTTPError as e: 147 - body = e.read().decode(errors="replace") 148 - print(f" error: {e.code} {e.reason} — {body}") 149 - if e.code == 403: 150 - print(" check that COLLECTIONS_ADMIN_TOKEN is set on the collectiondir pod") 151 - return 152 - continue 153 - except Exception as e: 154 - print(f" error sending batch: {e}") 155 - continue 145 + for attempt in range(6): 146 + try: 147 + request_crawl(args.url, args.token, batch) 148 + break 149 + except urllib.error.HTTPError as e: 150 + body = e.read().decode(errors="replace") 151 + print(f" error: {e.code} {e.reason} — {body}") 152 + if e.code == 403: 153 + print(" check that COLLECTIONS_ADMIN_TOKEN is set on the collectiondir pod") 154 + return 155 + break 156 + except (ConnectionError, OSError, urllib.error.URLError) as e: 157 + if attempt < 5: 158 + delay = min(5 * (attempt + 1), 30) 159 + sys.stdout.write(f"\r request_crawl connection error (attempt {attempt + 1}/6), retrying in {delay}s...") 160 + sys.stdout.flush() 161 + time.sleep(delay) 162 + else: 163 + print(f" error sending batch after 6 attempts: {e}") 164 + break 156 165 157 166 repos = wait_for_batch(args.url, args.token, baseline, timeout=args.batch_timeout) 158 167 total_repos += repos

+106

scripts/reconnect

··· 1 + #!/usr/bin/env -S PYTHONUNBUFFERED=1 uv run --script --quiet 2 + # /// script 3 + # requires-python = ">=3.12" 4 + # dependencies = [] 5 + # /// 6 + """ 7 + reconnect relay to all known PDS hosts. 8 + 9 + fetches the community PDS list and sends requestCrawl for each host, 10 + preventing the natural decay of connections as indie PDS hosts go quiet. 11 + 12 + intended to run on a cron (every ~4 hours). 13 + 14 + usage: 15 + ./scripts/reconnect 16 + ./scripts/reconnect --url https://relay.waow.tech --password "$RELAY_ADMIN_PASSWORD" 17 + ./scripts/reconnect --dry-run 18 + """ 19 + 20 + import argparse 21 + import json 22 + import sys 23 + import time 24 + import urllib.request 25 + 26 + 27 + PDS_LIST_URL = "https://raw.githubusercontent.com/mary-ext/atproto-scraping/refs/heads/trunk/state.json" 28 + 29 + 30 + def fetch_pds_list() -> list[str]: 31 + """fetch community-maintained PDS host list.""" 32 + req = urllib.request.Request(PDS_LIST_URL) 33 + with urllib.request.urlopen(req, timeout=30) as resp: 34 + data = json.loads(resp.read()) 35 + return [url.rstrip("/") for url in data.get("pdses", {}).keys() if url.startswith("https://")] 36 + 37 + 38 + def request_crawl(relay_url: str, password: str, hostname: str) -> int: 39 + """send requestCrawl to relay admin endpoint. 
returns http status.""" 40 + import base64 41 + payload = json.dumps({"hostname": hostname}).encode() 42 + auth = base64.b64encode(f"admin:{password}".encode()).decode() 43 + req = urllib.request.Request( 44 + f"{relay_url}/admin/pds/requestCrawl", 45 + data=payload, 46 + headers={"Content-Type": "application/json", "Authorization": f"Basic {auth}"}, 47 + method="POST", 48 + ) 49 + try: 50 + with urllib.request.urlopen(req, timeout=10) as resp: 51 + return resp.status 52 + except urllib.error.HTTPError as e: 53 + return e.code 54 + except (ConnectionError, OSError, urllib.error.URLError): 55 + return 0 56 + 57 + 58 + def main(): 59 + parser = argparse.ArgumentParser(description="reconnect relay to all known PDS hosts") 60 + parser.add_argument("--url", default="https://relay.waow.tech", help="relay URL") 61 + parser.add_argument("--password", default=None, help="relay admin password (or RELAY_ADMIN_PASSWORD env)") 62 + parser.add_argument("--delay", type=float, default=0.1, help="delay between requests in seconds") 63 + parser.add_argument("--dry-run", action="store_true", help="just fetch and count, don't send requests") 64 + args = parser.parse_args() 65 + 66 + import os 67 + password = args.password or os.environ.get("RELAY_ADMIN_PASSWORD", "") 68 + if not password and not args.dry_run: 69 + print("error: --password or RELAY_ADMIN_PASSWORD required", file=sys.stderr) 70 + return 71 + 72 + print(f"fetching PDS list from {PDS_LIST_URL}...") 73 + hosts = fetch_pds_list() 74 + print(f"found {len(hosts)} PDS hosts") 75 + 76 + if args.dry_run: 77 + print("dry run, exiting") 78 + return 79 + 80 + ok = 0 81 + errors = 0 82 + start = time.time() 83 + 84 + for i, host in enumerate(hosts): 85 + status = request_crawl(args.url, password, host) 86 + if status == 200: 87 + ok += 1 88 + else: 89 + errors += 1 90 + if errors <= 10: 91 + print(f" error: {host} -> {status}") 92 + elif errors == 11: 93 + print(f" (suppressing further errors)") 94 + 95 + if (i + 1) % 500 == 0: 96 + 
elapsed = time.time() - start 97 + print(f" {i + 1}/{len(hosts)} ({ok} ok, {errors} errors, {elapsed:.0f}s)") 98 + 99 + time.sleep(args.delay) 100 + 101 + elapsed = time.time() - start 102 + print(f"done: {ok} ok, {errors} errors, {elapsed:.0f}s") 103 + 104 + 105 + if __name__ == "__main__": 106 + main()

Configure Feed

Configure Feed