declarative relay deployment on hetzner relay-eval.waow.tech
atproto relay
14
fork

Configure Feed

Select the types of activity you want to include in your feed.

scripts: add zlay-admin CLI

argparse-style wrapper for zlay's /admin/* HTTP API. fetches the
bearer token from the k8s secret, manages a port-forward for the
call, exposes admin endpoints as subcommands:

list-hosts [--status STATUS] [--json]
block-host HOSTNAME
unblock-host HOSTNAME
change-limits HOSTNAME [--account-limit N]
ban-repo DID
resync DID HOSTNAME
resync-status
backfill-status
audit [--json] # status + worker/delivering summary

the token never touches argv or the shell. required when operating
zlay during incidents — used during 2026-04-17 attack recovery to
audit the host roster, identify exhausted legitimate PDSes, and
confirm worker-vs-active deltas.

Co-Authored-By: Claude Opus 4 (1M context) <noreply@anthropic.com>

+296
+296
scripts/zlay-admin
··· 1 + #!/usr/bin/env python3 2 + """ 3 + zlay-admin — wrapper CLI for zlay's /admin/* HTTP endpoints. 4 + 5 + fetches the bearer token from the k8s secret, manages a port-forward for 6 + the duration of the call, and exposes the admin API as subcommands. the 7 + token is never printed, never passed as an argv, and never written to 8 + shell history. 9 + 10 + usage: 11 + zlay-admin list-hosts [--status STATUS] [--json] 12 + zlay-admin block-host HOSTNAME 13 + zlay-admin unblock-host HOSTNAME 14 + zlay-admin change-limits HOSTNAME [--account-limit N] 15 + zlay-admin ban-repo DID 16 + zlay-admin resync DID HOSTNAME 17 + zlay-admin resync-status 18 + zlay-admin backfill-status 19 + zlay-admin audit [--json] # compact host-status summary 20 + 21 + env: 22 + KUBECONFIG defaults to zlay/kubeconfig.yaml relative to this script 23 + ZLAY_ADMIN_PORT local port-forward target (default 13000) 24 + """ 25 + 26 + import argparse 27 + import base64 28 + import json 29 + import os 30 + import signal 31 + import subprocess 32 + import sys 33 + import time 34 + import urllib.error 35 + import urllib.request 36 + from contextlib import contextmanager 37 + 38 + SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 39 + DEFAULT_KUBECONFIG = os.path.join(SCRIPT_DIR, "..", "zlay", "kubeconfig.yaml") 40 + DEFAULT_PORT = int(os.environ.get("ZLAY_ADMIN_PORT", "13000")) 41 + NAMESPACE = "zlay" 42 + SVC = "svc/zlay" 43 + SVC_PORT = 3000 44 + 45 + 46 + def kubeconfig_path() -> str: 47 + return os.environ.get("KUBECONFIG") or DEFAULT_KUBECONFIG 48 + 49 + 50 + def fetch_token() -> str: 51 + """Read RELAY_ADMIN_PASSWORD from the k8s secret. Never printed.""" 52 + out = subprocess.check_output( 53 + [ 54 + "kubectl", 55 + "--kubeconfig", 56 + kubeconfig_path(), 57 + "-n", 58 + NAMESPACE, 59 + "get", 60 + "secret", 61 + "zlay-secret", 62 + "-o", 63 + "jsonpath={.data.RELAY_ADMIN_PASSWORD}", 64 + ], 65 + stderr=subprocess.DEVNULL, 66 + ) 67 + if not out: 68 + sys.exit("error: RELAY_ADMIN_PASSWORD not found in zlay-secret") 69 + return base64.b64decode(out).decode() 70 + 71 + 72 + @contextmanager 73 + def port_forward(local_port: int = DEFAULT_PORT): 74 + """yield a base URL while a kubectl port-forward is running.""" 75 + proc = subprocess.Popen( 76 + [ 77 + "kubectl", 78 + "--kubeconfig", 79 + kubeconfig_path(), 80 + "-n", 81 + NAMESPACE, 82 + "port-forward", 83 + SVC, 84 + f"{local_port}:{SVC_PORT}", 85 + ], 86 + stdout=subprocess.DEVNULL, 87 + stderr=subprocess.DEVNULL, 88 + preexec_fn=os.setsid, 89 + ) 90 + try: 91 + # poll until the port accepts connections or we give up 92 + base = f"http://127.0.0.1:{local_port}" 93 + for _ in range(30): 94 + time.sleep(0.2) 95 + try: 96 + urllib.request.urlopen(f"{base}/_health", timeout=1).read() 97 + break 98 + except Exception: 99 + if proc.poll() is not None: 100 + sys.exit("error: port-forward exited before becoming ready") 101 + else: 102 + sys.exit("error: port-forward did not become ready in 6s") 103 + yield base 104 + finally: 105 + try: 106 + os.killpg(os.getpgid(proc.pid), signal.SIGTERM) 107 + except ProcessLookupError: 108 + pass 109 + try: 110 + proc.wait(timeout=3) 111 + except subprocess.TimeoutExpired: 112 + try: 113 + os.killpg(os.getpgid(proc.pid), signal.SIGKILL) 114 + except ProcessLookupError: 115 + pass 116 + 117 + 118 + def request(method: str, base: str, path: str, token: str, body: dict | None = None) -> tuple[int, str]: 119 + data = json.dumps(body).encode() if body is not None else None 120 + req = urllib.request.Request(f"{base}{path}", data=data, method=method) 121 + req.add_header("Authorization", f"Bearer {token}") 122 + if data is not None: 123 + req.add_header("Content-Type", "application/json") 124 + try: 125 + with urllib.request.urlopen(req, timeout=10) as r: 126 + return r.status, r.read().decode() 127 + except urllib.error.HTTPError as e: 128 + return e.code, e.read().decode() if e.fp else "" 129 + 130 + 131 + def cmd_list_hosts(args) -> int: 132 + token = fetch_token() 133 + with port_forward() as base: 134 + code, body = request("GET", base, "/admin/hosts", token) 135 + if code != 200: 136 + print(f"HTTP {code}: {body}", file=sys.stderr) 137 + return 1 138 + d = json.loads(body) 139 + hosts = d.get("hosts", []) 140 + if args.status: 141 + hosts = [h for h in hosts if h.get("status") == args.status] 142 + if args.json: 143 + print(json.dumps({"hosts": hosts, "active_workers": d.get("active_workers")}, indent=2)) 144 + return 0 145 + print(f"total: {len(hosts)} active_workers: {d.get('active_workers')}") 146 + for h in hosts[: args.limit]: 147 + print(f" id={h['id']:>6} {h['status']:>9} fails={h['failed_attempts']:>3} last_seq={h['last_seq']:>15} {h['hostname']}") 148 + if len(hosts) > args.limit: 149 + print(f" ... {len(hosts) - args.limit} more (use --limit or --json)") 150 + return 0 151 + 152 + 153 + def cmd_block_host(args) -> int: 154 + token = fetch_token() 155 + with port_forward() as base: 156 + code, body = request("POST", base, "/admin/hosts/block", token, {"hostname": args.hostname}) 157 + print(body) 158 + return 0 if code == 200 else 1 159 + 160 + 161 + def cmd_unblock_host(args) -> int: 162 + token = fetch_token() 163 + with port_forward() as base: 164 + code, body = request("POST", base, "/admin/hosts/unblock", token, {"hostname": args.hostname}) 165 + print(body) 166 + return 0 if code == 200 else 1 167 + 168 + 169 + def cmd_change_limits(args) -> int: 170 + token = fetch_token() 171 + body = {"hostname": args.hostname} 172 + if args.account_limit is not None: 173 + body["account_limit"] = args.account_limit 174 + with port_forward() as base: 175 + code, resp = request("POST", base, "/admin/hosts/changeLimits", token, body) 176 + print(resp) 177 + return 0 if code == 200 else 1 178 + 179 + 180 + def cmd_ban_repo(args) -> int: 181 + token = fetch_token() 182 + with port_forward() as base: 183 + code, body = request("POST", base, "/admin/repo/ban", token, {"did": args.did}) 184 + print(body) 185 + return 0 if code == 200 else 1 186 + 187 + 188 + def cmd_resync(args) -> int: 189 + token = fetch_token() 190 + with port_forward() as base: 191 + code, body = request("POST", base, "/admin/resync", token, {"did": args.did, "hostname": args.hostname}) 192 + print(body) 193 + return 0 if code == 200 else 1 194 + 195 + 196 + def cmd_resync_status(args) -> int: 197 + token = fetch_token() 198 + with port_forward() as base: 199 + code, body = request("GET", base, "/admin/resync", token) 200 + print(body) 201 + return 0 if code == 200 else 1 202 + 203 + 204 + def cmd_backfill_status(args) -> int: 205 + token = fetch_token() 206 + with port_forward() as base: 207 + code, body = request("GET", base, "/admin/backfill-collections", token) 208 + print(body) 209 + return 0 if code == 200 else 1 210 + 211 + 212 + def cmd_audit(args) -> int: 213 + """summary: status breakdown + fraction of active hosts actually delivering.""" 214 + token = fetch_token() 215 + with port_forward() as base: 216 + code, body = request("GET", base, "/admin/hosts", token) 217 + if code != 200: 218 + print(f"HTTP {code}: {body}", file=sys.stderr) 219 + return 1 220 + d = json.loads(body) 221 + hosts = d["hosts"] 222 + by_status: dict[str, int] = {} 223 + for h in hosts: 224 + by_status[h["status"]] = by_status.get(h["status"], 0) + 1 225 + active = [h for h in hosts if h["status"] == "active"] 226 + delivering = [h for h in active if h["last_seq"] > 0] 227 + cold = [h for h in active if h["last_seq"] == 0] 228 + summary = { 229 + "total": len(hosts), 230 + "active_workers": d.get("active_workers"), 231 + "by_status": by_status, 232 + "active_delivering": len(delivering), 233 + "active_cold": len(cold), 234 + "blocked_hostnames": [h["hostname"] for h in hosts if h["status"] == "blocked"], 235 + } 236 + if args.json: 237 + print(json.dumps(summary, indent=2)) 238 + else: 239 + print(f"total hosts: {summary['total']}") 240 + print(f"active workers: {summary['active_workers']}") 241 + print(f"status breakdown: {summary['by_status']}") 242 + print(f"active + delivering (last_seq>0): {summary['active_delivering']}") 243 + print(f"active + cold (last_seq=0): {summary['active_cold']}") 244 + if summary["blocked_hostnames"]: 245 + print(f"blocked: {summary['blocked_hostnames']}") 246 + return 0 247 + 248 + 249 + def main() -> int: 250 + p = argparse.ArgumentParser(prog="zlay-admin", description=__doc__.strip().splitlines()[0]) 251 + sub = p.add_subparsers(dest="cmd", required=True) 252 + 253 + s = sub.add_parser("list-hosts", help="list hosts (GET /admin/hosts)") 254 + s.add_argument("--status", choices=["active", "blocked", "exhausted", "dormant", "idle"]) 255 + s.add_argument("--limit", type=int, default=50) 256 + s.add_argument("--json", action="store_true") 257 + s.set_defaults(func=cmd_list_hosts) 258 + 259 + s = sub.add_parser("block-host", help="POST /admin/hosts/block") 260 + s.add_argument("hostname") 261 + s.set_defaults(func=cmd_block_host) 262 + 263 + s = sub.add_parser("unblock-host", help="POST /admin/hosts/unblock") 264 + s.add_argument("hostname") 265 + s.set_defaults(func=cmd_unblock_host) 266 + 267 + s = sub.add_parser("change-limits", help="POST /admin/hosts/changeLimits") 268 + s.add_argument("hostname") 269 + s.add_argument("--account-limit", type=int) 270 + s.set_defaults(func=cmd_change_limits) 271 + 272 + s = sub.add_parser("ban-repo", help="POST /admin/repo/ban") 273 + s.add_argument("did") 274 + s.set_defaults(func=cmd_ban_repo) 275 + 276 + s = sub.add_parser("resync", help="POST /admin/resync (collection-index resync)") 277 + s.add_argument("did") 278 + s.add_argument("hostname") 279 + s.set_defaults(func=cmd_resync) 280 + 281 + s = sub.add_parser("resync-status", help="GET /admin/resync") 282 + s.set_defaults(func=cmd_resync_status) 283 + 284 + s = sub.add_parser("backfill-status", help="GET /admin/backfill-collections") 285 + s.set_defaults(func=cmd_backfill_status) 286 + 287 + s = sub.add_parser("audit", help="compact host-status audit summary") 288 + s.add_argument("--json", action="store_true") 289 + s.set_defaults(func=cmd_audit) 290 + 291 + args = p.parse_args() 292 + return args.func(args) 293 + 294 + 295 + if __name__ == "__main__": 296 + sys.exit(main())