add prometheus + grafana monitoring with public dashboard

+7

README.md

··· 6 6 7 7 **health check:** [`https://relay.waow.tech/xrpc/_health`](https://relay.waow.tech/xrpc/_health) 8 8 9 + **metrics dashboard:** [`https://relay-metrics.waow.tech`](https://relay-metrics.waow.tech) (public, anonymous read-only) 10 + 9 11 ## try it 10 12 11 13 the `firehose` script consumes events from the relay using the [atproto](https://github.com/MarshalX/atproto) python SDK. it's a self-contained [uv script](https://docs.astral.sh/uv/guides/scripts/) — no virtualenv or install needed. ··· 37 39 └── deploy/ # helm values + k8s manifests 38 40 ├── relay-values.yaml 39 41 ├── postgres-values.yaml 42 + ├── monitoring-values.yaml 43 + ├── relay-dashboard.json 44 + ├── relay-servicemonitor.yaml 40 45 ├── ingress.yaml 46 + ├── grafana-ingress.yaml 41 47 └── cluster-issuer.yaml 42 48 ``` 43 49 ··· 124 130 125 131 - **relay** — [`ghcr.io/bluesky-social/indigo`](https://github.com/bluesky-social/indigo/pkgs/container/indigo) (tagged per-commit, e.g. `relay-bf41e2ee...`), deployed via [bjw-s/app-template](https://github.com/bjw-s-labs/helm-charts) helm chart with `hostNetwork: true` for lower-overhead networking 126 132 - **postgresql** — relay's backing database, deployed via [bitnami/postgresql](https://github.com/bitnami/charts/tree/main/bitnami/postgresql) helm chart 133 + - **prometheus + grafana** — metrics collection and dashboards via [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack), public read-only access at [`relay-metrics.waow.tech`](https://relay-metrics.waow.tech) 127 134 128 135 ### relay specs at steady state 129 136

+24

deploy/grafana-ingress.yaml

# Ingress that exposes the Grafana service of the kube-prometheus-stack release.
# GRAFANA_DOMAIN_PLACEHOLDER is replaced with the real hostname (via sed in the
# justfile) before this manifest is piped to `kubectl apply`.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  namespace: monitoring
  name: grafana
  annotations:
    # ask cert-manager to obtain the TLS certificate from this ClusterIssuer
    cert-manager.io/cluster-issuer: letsencrypt-prod
spec:
  ingressClassName: traefik
  rules:
    - host: GRAFANA_DOMAIN_PLACEHOLDER
      http:
        paths:
          # route every path on the host to Grafana
          - pathType: Prefix
            path: /
            backend:
              service:
                name: kube-prometheus-stack-grafana
                port:
                  number: 80
  tls:
    # certificate for the host above is stored in this secret
    - secretName: grafana-tls
      hosts:
        - GRAFANA_DOMAIN_PLACEHOLDER

+86

deploy/monitoring-values.yaml

# kube-prometheus-stack helm values — trimmed for relay monitoring
# docs: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack

# --- disable everything we don't need ---
# only prometheus, grafana and the operator stay enabled; the relay itself is
# scraped through its own ServiceMonitor (deploy/relay-servicemonitor.yaml)
alertmanager:
  enabled: false
nodeExporter:
  enabled: false
kubeStateMetrics:
  enabled: false
kubeApiServer:
  enabled: false
kubeControllerManager:
  enabled: false
kubeScheduler:
  enabled: false
kubeProxy:
  enabled: false
kubeEtcd:
  enabled: false
kubeDns:
  enabled: false
coreDns:
  enabled: false
kubelet:
  enabled: false

# --- prometheus ---
prometheus:
  prometheusSpec:
    scrapeInterval: 30s
    retention: 14d
    resources:
      requests:
        memory: 256Mi
        cpu: 100m
      limits:
        memory: 512Mi
    # false => pick up ServiceMonitors that do not carry this helm release's
    # labels (the relay ServiceMonitor is applied separately with kubectl)
    serviceMonitorSelectorNilUsesHelmValues: false
    storageSpec:
      volumeClaimTemplate:
        spec:
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 10Gi

# --- grafana ---
grafana:
  resources:
    requests:
      memory: 128Mi
      cpu: 50m
    limits:
      memory: 256Mi
  grafana.ini:
    # public dashboard: unauthenticated visitors get the read-only Viewer role
    auth.anonymous:
      enabled: true
      org_role: Viewer
    server:
      # NOTE: hard-coded; keep in sync with GRAFANA_DOMAIN if the justfile
      # default (relay-metrics.waow.tech) is overridden
      root_url: https://relay-metrics.waow.tech
  # adminPassword is intentionally NOT set here. Helm does not expand shell
  # syntax such as ${VAR:-default} inside values files — the text would be used
  # verbatim as the password. The justfile injects the real value with
  # `--set grafana.adminPassword=...` at install time.
  dashboardProviders:
    dashboardproviders.yaml:
      apiVersion: 1
      providers:
        - name: relay
          orgId: 1
          folder: ""
          type: file
          disableDeletion: false
          editable: true
          options:
            path: /var/lib/grafana/dashboards/relay
  # provider name -> ConfigMap holding the dashboard JSON
  # (the justfile creates `relay-dashboard` from deploy/relay-dashboard.json)
  dashboardsConfigMaps:
    relay: relay-dashboard
  defaultDashboardsEnabled: false

# --- operator ---
prometheusOperator:
  resources:
    requests:
      memory: 64Mi
      cpu: 10m
    limits:
      memory: 128Mi

+205

deploy/relay-dashboard.json

··· 1 + { 2 + "annotations": { 3 + "list": [] 4 + }, 5 + "editable": true, 6 + "fiscalYearStartMonth": 0, 7 + "graphTooltip": 1, 8 + "links": [], 9 + "panels": [ 10 + { 11 + "title": "events/sec", 12 + "type": "timeseries", 13 + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 0 }, 14 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 15 + "fieldConfig": { 16 + "defaults": { 17 + "unit": "ops", 18 + "color": { "mode": "palette-classic" }, 19 + "custom": { 20 + "fillOpacity": 15, 21 + "lineWidth": 2, 22 + "spanNulls": false 23 + } 24 + }, 25 + "overrides": [] 26 + }, 27 + "targets": [ 28 + { 29 + "expr": "sum(rate(events_received_counter[5m]))", 30 + "legendFormat": "events received", 31 + "refId": "A" 32 + } 33 + ] 34 + }, 35 + { 36 + "title": "connected PDS hosts", 37 + "type": "stat", 38 + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 0 }, 39 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 40 + "fieldConfig": { 41 + "defaults": { 42 + "color": { "mode": "thresholds" }, 43 + "thresholds": { 44 + "steps": [ 45 + { "color": "red", "value": null }, 46 + { "color": "yellow", "value": 500 }, 47 + { "color": "green", "value": 1000 } 48 + ] 49 + } 50 + }, 51 + "overrides": [] 52 + }, 53 + "options": { 54 + "colorMode": "value", 55 + "graphMode": "area", 56 + "reduceOptions": { "calcs": ["lastNotNull"] } 57 + }, 58 + "targets": [ 59 + { 60 + "expr": "relay_connected_inbound", 61 + "legendFormat": "hosts", 62 + "refId": "A" 63 + } 64 + ] 65 + }, 66 + { 67 + "title": "downstream consumers", 68 + "type": "timeseries", 69 + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 0 }, 70 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 71 + "fieldConfig": { 72 + "defaults": { 73 + "unit": "ops", 74 + "color": { "mode": "palette-classic" }, 75 + "custom": { 76 + "fillOpacity": 15, 77 + "lineWidth": 2, 78 + "spanNulls": false 79 + } 80 + }, 81 + "overrides": [] 82 + }, 83 + "targets": [ 84 + { 85 + "expr": "sum(rate(events_sent_counter[5m]))", 86 + 
"legendFormat": "events sent", 87 + "refId": "A" 88 + } 89 + ] 90 + }, 91 + { 92 + "title": "event handle latency p99", 93 + "type": "timeseries", 94 + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 8 }, 95 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 96 + "fieldConfig": { 97 + "defaults": { 98 + "unit": "s", 99 + "color": { "mode": "palette-classic" }, 100 + "custom": { 101 + "fillOpacity": 10, 102 + "lineWidth": 2, 103 + "spanNulls": false 104 + } 105 + }, 106 + "overrides": [] 107 + }, 108 + "targets": [ 109 + { 110 + "expr": "histogram_quantile(0.99, sum(rate(events_handle_duration_bucket[5m])) by (le))", 111 + "legendFormat": "p99", 112 + "refId": "A" 113 + } 114 + ] 115 + }, 116 + { 117 + "title": "go memory", 118 + "type": "timeseries", 119 + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 8 }, 120 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 121 + "fieldConfig": { 122 + "defaults": { 123 + "unit": "bytes", 124 + "color": { "mode": "palette-classic" }, 125 + "custom": { 126 + "fillOpacity": 15, 127 + "lineWidth": 2, 128 + "spanNulls": false 129 + } 130 + }, 131 + "overrides": [] 132 + }, 133 + "targets": [ 134 + { 135 + "expr": "go_memstats_alloc_bytes{job=\"relay\"}", 136 + "legendFormat": "alloc", 137 + "refId": "A" 138 + }, 139 + { 140 + "expr": "go_memstats_sys_bytes{job=\"relay\"}", 141 + "legendFormat": "sys", 142 + "refId": "B" 143 + } 144 + ] 145 + }, 146 + { 147 + "title": "goroutines", 148 + "type": "timeseries", 149 + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 8 }, 150 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 151 + "fieldConfig": { 152 + "defaults": { 153 + "color": { "mode": "palette-classic" }, 154 + "custom": { 155 + "fillOpacity": 15, 156 + "lineWidth": 2, 157 + "spanNulls": false 158 + } 159 + }, 160 + "overrides": [] 161 + }, 162 + "targets": [ 163 + { 164 + "expr": "go_goroutines{job=\"relay\"}", 165 + "legendFormat": "goroutines", 166 + "refId": "A" 167 + } 168 + ] 169 + }, 170 + { 171 + 
"title": "http request rate", 172 + "type": "timeseries", 173 + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 }, 174 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 175 + "fieldConfig": { 176 + "defaults": { 177 + "unit": "reqps", 178 + "color": { "mode": "palette-classic" }, 179 + "custom": { 180 + "fillOpacity": 15, 181 + "lineWidth": 2, 182 + "spanNulls": false 183 + } 184 + }, 185 + "overrides": [] 186 + }, 187 + "targets": [ 188 + { 189 + "expr": "sum(rate(http_requests_total{job=\"relay\"}[5m])) by (handler)", 190 + "legendFormat": "{{handler}}", 191 + "refId": "A" 192 + } 193 + ] 194 + } 195 + ], 196 + "schemaVersion": 39, 197 + "tags": ["relay", "atproto"], 198 + "templating": { "list": [] }, 199 + "time": { "from": "now-1h", "to": "now" }, 200 + "timepicker": {}, 201 + "timezone": "browser", 202 + "title": "relay.waow.tech", 203 + "uid": "relay-waow", 204 + "version": 1 205 + }

+15

deploy/relay-servicemonitor.yaml

# ServiceMonitor that points the Prometheus operator at the relay's metrics port.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  namespace: monitoring
  name: relay
spec:
  # only look for matching Services inside the `relay` namespace
  namespaceSelector:
    matchNames:
      - relay
  # NOTE(review): assumes the relay Service carries this label and exposes a
  # port named `metrics` — confirm against deploy/relay-values.yaml
  selector:
    matchLabels:
      app.kubernetes.io/name: relay
  endpoints:
    - interval: 30s
      port: metrics

+31 -2

justfile

··· 1 1 # ATProto relay deployment 2 2 # required env vars: HCLOUD_TOKEN, RELAY_DOMAIN, RELAY_ADMIN_PASSWORD, POSTGRES_PASSWORD, LETSENCRYPT_EMAIL 3 + # optional env vars: GRAFANA_DOMAIN (default: relay-metrics.waow.tech), GRAFANA_ADMIN_PASSWORD 3 4 4 5 export KUBECONFIG := justfile_directory() / "kubeconfig.yaml" 5 6 ··· 62 63 helm repo add bjw-s https://bjw-s-labs.github.io/helm-charts 63 64 helm repo add bitnami https://charts.bitnami.com/bitnami 64 65 helm repo add jetstack https://charts.jetstack.io 66 + helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 65 67 helm repo update 66 68 67 69 # deploy everything to the cluster ··· 111 113 sed "s|RELAY_DOMAIN_PLACEHOLDER|$RELAY_DOMAIN|g" deploy/ingress.yaml \ 112 114 | kubectl apply -f - 113 115 116 + GRAFANA_DOMAIN="${GRAFANA_DOMAIN:-relay-metrics.waow.tech}" 117 + 118 + echo "==> installing monitoring stack" 119 + kubectl create namespace monitoring --dry-run=client -o yaml | kubectl apply -f - 120 + kubectl create configmap relay-dashboard \ 121 + --namespace monitoring \ 122 + --from-file=relay-dashboard.json=deploy/relay-dashboard.json \ 123 + --dry-run=client -o yaml | kubectl apply -f - 124 + helm upgrade --install kube-prometheus-stack prometheus-community/kube-prometheus-stack \ 125 + --namespace monitoring \ 126 + --values deploy/monitoring-values.yaml \ 127 + --set grafana.adminPassword="${GRAFANA_ADMIN_PASSWORD:-prom-operator}" \ 128 + --wait --timeout 5m 129 + kubectl apply -f deploy/relay-servicemonitor.yaml 130 + 131 + echo "==> applying grafana ingress" 132 + sed "s|GRAFANA_DOMAIN_PLACEHOLDER|$GRAFANA_DOMAIN|g" deploy/grafana-ingress.yaml \ 133 + | kubectl apply -f - 134 + 114 135 echo "" 115 - echo "done. point DNS for $RELAY_DOMAIN -> $(just server-ip)" 116 - echo "then check: curl https://$RELAY_DOMAIN/xrpc/_health" 136 + echo "done. 
point DNS:" 137 + echo " $RELAY_DOMAIN -> $(just server-ip)" 138 + echo " $GRAFANA_DOMAIN -> $(just server-ip)" 139 + echo "then check:" 140 + echo " curl https://$RELAY_DOMAIN/xrpc/_health" 141 + echo " curl https://$GRAFANA_DOMAIN" 117 142 118 143 # seed the relay with hosts from the network (includes restart so slurper picks them up) 119 144 bootstrap: ··· 147 172 #!/usr/bin/env bash 148 173 : "${RELAY_DOMAIN:?set RELAY_DOMAIN}" 149 174 curl -sf "https://$RELAY_DOMAIN/xrpc/_health" | jq . 175 + 176 + # get the grafana admin password from the cluster 177 + grafana-password: 178 + @kubectl get secret -n monitoring kube-prometheus-stack-grafana -o jsonpath="{.data.admin-password}" | base64 -d && echo 150 179 151 180 # --- firehose --- 152 181

Configure Feed

Configure Feed