tune memory usage + fix dashboard + add justfile recipes · zzstoatzz.io/my-prefect-server@edde50e

+295

deploy/dashboards/executive-overview.json

··· 1 + { 2 + "annotations": { "list": [] }, 3 + "editable": true, 4 + "fiscalYearStartMonth": 0, 5 + "graphTooltip": 1, 6 + "links": [], 7 + "panels": [ 8 + { 9 + "collapsed": false, 10 + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, 11 + "id": 100, 12 + "title": "prefect", 13 + "type": "row" 14 + }, 15 + { 16 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 17 + "fieldConfig": { 18 + "defaults": { 19 + "color": { "mode": "thresholds" }, 20 + "thresholds": { "steps": [{ "color": "green", "value": null }] } 21 + } 22 + }, 23 + "gridPos": { "h": 3, "w": 4, "x": 0, "y": 1 }, 24 + "id": 1, 25 + "options": { "colorMode": "value", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, 26 + "title": "flows", 27 + "type": "stat", 28 + "targets": [{ "expr": "sum(prefect_flows_total)", "refId": "A" }] 29 + }, 30 + { 31 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 32 + "fieldConfig": { 33 + "defaults": { 34 + "color": { "mode": "thresholds" }, 35 + "thresholds": { "steps": [{ "color": "green", "value": null }] } 36 + } 37 + }, 38 + "gridPos": { "h": 3, "w": 4, "x": 4, "y": 1 }, 39 + "id": 2, 40 + "options": { "colorMode": "value", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, 41 + "title": "deployments", 42 + "type": "stat", 43 + "targets": [{ "expr": "sum(prefect_deployments_total)", "refId": "A" }] 44 + }, 45 + { 46 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 47 + "fieldConfig": { 48 + "defaults": { 49 + "color": { "mode": "thresholds" }, 50 + "thresholds": { "steps": [{ "color": "green", "value": null }] } 51 + } 52 + }, 53 + "gridPos": { "h": 3, "w": 4, "x": 8, "y": 1 }, 54 + "id": 3, 55 + "options": { "colorMode": "value", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, 56 + "title": "work pools", 57 + "type": "stat", 58 + "targets": [{ "expr": "sum(prefect_work_pools_total)", "refId": "A" }] 59 + }, 60 + { 61 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 62 + "fieldConfig": { 63 + "defaults": { 64 + "color": { "mode": "thresholds" }, 65 + "thresholds": { "steps": [{ "color": "green", "value": null }] } 66 + } 67 + }, 68 + "gridPos": { "h": 3, "w": 4, "x": 12, "y": 1 }, 69 + "id": 4, 70 + "options": { "colorMode": "value", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, 71 + "title": "workers", 72 + "type": "stat", 73 + "targets": [{ "expr": "sum(kube_deployment_status_replicas{deployment=~\".*worker\"})", "refId": "A" }] 74 + }, 75 + { 76 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 77 + "fieldConfig": { 78 + "defaults": { 79 + "color": { "mode": "thresholds" }, 80 + "thresholds": { "steps": [ 81 + { "color": "green", "value": null }, 82 + { "color": "yellow", "value": 60 }, 83 + { "color": "red", "value": 300 } 84 + ] }, 85 + "unit": "s" 86 + } 87 + }, 88 + "gridPos": { "h": 3, "w": 4, "x": 16, "y": 1 }, 89 + "id": 5, 90 + "options": { "colorMode": "value", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, 91 + "title": "avg run time", 92 + "type": "stat", 93 + "targets": [{ "expr": "avg(prefect_flow_runs_total_run_time)", "refId": "A" }] 94 + }, 95 + { 96 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 97 + "fieldConfig": { 98 + "defaults": { 99 + "color": { "mode": "thresholds" }, 100 + "thresholds": { "steps": [ 101 + { "color": "green", "value": null }, 102 + { "color": "red", "value": 1 } 103 + ] } 104 + } 105 + }, 106 + "gridPos": { "h": 3, "w": 4, "x": 20, "y": 1 }, 107 + "id": 6, 108 + "options": { "colorMode": "value", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, 109 + "title": "runs (active)", 110 + "type": "stat", 111 + "targets": [{ "expr": "count(prefect_info_flow_runs{state_name=~\"Running|Pending|Scheduled\"}) or vector(0)", "refId": "A" }] 112 + }, 113 + { 114 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 115 + "fieldConfig": { 116 + "defaults": { 117 + "color": { "mode": "palette-classic" }, 118 + "custom": { 119 + "fillOpacity": 80, 120 + "stacking": { "mode": "normal" } 121 + } 122 + }, 123 + "overrides": [ 124 + { "matcher": { "id": "byName", "options": "COMPLETED" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, 125 + { "matcher": { "id": "byName", "options": "FAILED" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, 126 + { "matcher": { "id": "byName", "options": "RUNNING" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }, 127 + { "matcher": { "id": "byName", "options": "CANCELLED" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }, 128 + { "matcher": { "id": "byName", "options": "CRASHED" }, "properties": [{ "id": "color", "value": { "fixedColor": "dark-red", "mode": "fixed" } }] }, 129 + { "matcher": { "id": "byName", "options": "PENDING" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] } 130 + ] 131 + }, 132 + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, 133 + "id": 10, 134 + "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "tooltip": { "mode": "multi" }, "legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }, "pieType": "donut" }, 135 + "title": "flow runs by state", 136 + "type": "piechart", 137 + "targets": [{ "expr": "count by (state_name) (prefect_info_flow_runs)", "legendFormat": "{{ state_name }}", "refId": "A", "instant": true }] 138 + }, 139 + { 140 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 141 + "fieldConfig": { 142 + "defaults": { 143 + "color": { "mode": "palette-classic" }, 144 + "custom": { "align": "auto" }, 145 + "thresholds": { "steps": [{ "color": "green", "value": null }] } 146 + }, 147 + "overrides": [ 148 + { "matcher": { "id": "byName", "options": "status" }, "properties": [{ "id": "custom.width", "value": 80 }] }, 149 + { "matcher": { "id": "byName", "options": "work_pool_name" }, "properties": [{ "id": "custom.width", "value": 120 }] } 150 + ] 151 + }, 152 + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, 153 + "id": 11, 154 + "options": { "showHeader": true, "sortBy": [{ "displayName": "deployment_name", "desc": false }] }, 155 + "title": "deployments", 156 + "type": "table", 157 + "targets": [{ "expr": "prefect_info_deployment", "format": "table", "instant": true, "refId": "A" }], 158 + "transformations": [ 159 + { "id": "filterFieldsByName", "options": { "include": { "names": ["deployment_name", "flow_name", "status", "work_pool_name"] } } } 160 + ] 161 + }, 162 + { 163 + "collapsed": false, 164 + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 12 }, 165 + "id": 200, 166 + "title": "infrastructure", 167 + "type": "row" 168 + }, 169 + { 170 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 171 + "fieldConfig": { 172 + "defaults": { 173 + "color": { "mode": "thresholds" }, 174 + "max": 100, 175 + "min": 0, 176 + "thresholds": { "steps": [ 177 + { "color": "green", "value": null }, 178 + { "color": "yellow", "value": 60 }, 179 + { "color": "red", "value": 85 } 180 + ] }, 181 + "unit": "percent" 182 + } 183 + }, 184 + "gridPos": { "h": 5, "w": 6, "x": 0, "y": 13 }, 185 + "id": 20, 186 + "options": { "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"] }, "showThresholdLabels": false, "showThresholdMarkers": true }, 187 + "title": "node CPU", 188 + "type": "gauge", 189 + "targets": [{ "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", "refId": "A" }] 190 + }, 191 + { 192 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 193 + "fieldConfig": { 194 + "defaults": { 195 + "color": { "mode": "thresholds" }, 196 + "max": 100, 197 + "min": 0, 198 + "thresholds": { "steps": [ 199 + { "color": "green", "value": null }, 200 + { "color": "yellow", "value": 70 }, 201 + { "color": "red", "value": 90 } 202 + ] }, 203 + "unit": "percent" 204 + } 205 + }, 206 + "gridPos": { "h": 5, "w": 6, "x": 6, "y": 13 }, 207 + "id": 21, 208 + "options": { "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"] }, "showThresholdLabels": false, "showThresholdMarkers": true }, 209 + "title": "node memory", 210 + "type": "gauge", 211 + "targets": [{ "expr": "100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)", "refId": "A" }] 212 + }, 213 + { 214 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 215 + "fieldConfig": { 216 + "defaults": { 217 + "color": { "mode": "thresholds" }, 218 + "max": 100, 219 + "min": 0, 220 + "thresholds": { "steps": [ 221 + { "color": "green", "value": null }, 222 + { "color": "yellow", "value": 70 }, 223 + { "color": "red", "value": 90 } 224 + ] }, 225 + "unit": "percent" 226 + } 227 + }, 228 + "gridPos": { "h": 5, "w": 6, "x": 12, "y": 13 }, 229 + "id": 22, 230 + "options": { "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"] }, "showThresholdLabels": false, "showThresholdMarkers": true }, 231 + "title": "node disk", 232 + "type": "gauge", 233 + "targets": [{ "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"} * 100)", "refId": "A" }] 234 + }, 235 + { 236 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 237 + "fieldConfig": { 238 + "defaults": { 239 + "color": { "mode": "thresholds" }, 240 + "thresholds": { "steps": [ 241 + { "color": "green", "value": null }, 242 + { "color": "red", "value": 1 } 243 + ] } 244 + } 245 + }, 246 + "gridPos": { "h": 5, "w": 6, "x": 18, "y": 13 }, 247 + "id": 23, 248 + "options": { "colorMode": "background", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, 249 + "title": "pods not ready", 250 + "type": "stat", 251 + "targets": [{ "expr": "count(kube_pod_status_phase{namespace=~\"prefect|monitoring\", phase=~\"Failed|Unknown\"}) or vector(0)", "refId": "A" }] 252 + }, 253 + { 254 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 255 + "fieldConfig": { 256 + "defaults": { 257 + "color": { "mode": "palette-classic" }, 258 + "custom": { "lineWidth": 1, "fillOpacity": 10, "spanNulls": false }, 259 + "unit": "percentunit" 260 + } 261 + }, 262 + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 18 }, 263 + "id": 30, 264 + "options": { "tooltip": { "mode": "multi" }, "legend": { "displayMode": "list", "placement": "bottom" } }, 265 + "title": "pod CPU usage (prefect)", 266 + "type": "timeseries", 267 + "targets": [{ "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"prefect\", container!=\"\", image!=\"\"}[5m])) by (pod)", "legendFormat": "{{ pod }}", "refId": "A" }] 268 + }, 269 + { 270 + "datasource": { "type": "prometheus", "uid": "prometheus" }, 271 + "fieldConfig": { 272 + "defaults": { 273 + "color": { "mode": "palette-classic" }, 274 + "custom": { "lineWidth": 1, "fillOpacity": 10, "spanNulls": false }, 275 + "unit": "bytes" 276 + } 277 + }, 278 + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 18 }, 279 + "id": 31, 280 + "options": { "tooltip": { "mode": "multi" }, "legend": { "displayMode": "list", "placement": "bottom" } }, 281 + "title": "pod memory usage (prefect)", 282 + "type": "timeseries", 283 + "targets": [{ "expr": "sum(container_memory_working_set_bytes{namespace=\"prefect\", container!=\"\", image!=\"\"}) by (pod)", "legendFormat": "{{ pod }}", "refId": "A" }] 284 + } 285 + ], 286 + "schemaVersion": 39, 287 + "tags": ["prefect", "executive"], 288 + "templating": { "list": [] }, 289 + "time": { "from": "now-6h", "to": "now" }, 290 + "timepicker": {}, 291 + "timezone": "", 292 + "title": "executive overview", 293 + "uid": "executive-overview", 294 + "version": 1 295 + }

+4 -4

deploy/monitoring-values.yaml

··· 36 36 37 37 prometheus: 38 38 prometheusSpec: 39 - scrapeInterval: 30s 40 - retention: 14d 39 + scrapeInterval: 60s 40 + retention: 7d 41 41 serviceMonitorSelectorNilUsesHelmValues: false 42 42 storageSpec: 43 43 volumeClaimTemplate: ··· 48 48 storage: 10Gi 49 49 resources: 50 50 requests: 51 - memory: 256Mi 51 + memory: 200Mi 52 52 limits: 53 - memory: 1Gi 53 + memory: 512Mi 54 54 55 55 prometheusOperator: 56 56 resources:

+1 -1

deploy/prefect-values.yaml

··· 1 1 server: 2 - replicaCount: 2 2 + replicaCount: 1 3 3 4 4 basicAuth: 5 5 enabled: true

+27 -4

justfile

··· 168 168 # check the state of everything 169 169 status: 170 170 @echo "==> nodes" 171 - @kubectl get nodes 171 + @kubectl top nodes 172 + @echo "" 173 + @echo "==> pods (by memory)" 174 + @kubectl top pods --all-namespaces --sort-by=memory 172 175 @echo "" 173 176 @echo "==> pods (prefect)" 174 177 @kubectl get pods -n prefect ··· 176 179 @echo "==> pods (monitoring)" 177 180 @kubectl get pods -n monitoring 178 181 179 - # tail prefect server logs 180 - logs: 181 - kubectl logs -n prefect -l app.kubernetes.io/name=prefect-server -f 182 + # tail logs for a component (server, background-services, worker) 183 + logs component="prefect-server": 184 + kubectl logs -n prefect -l app.kubernetes.io/name={{component}} -f 182 185 183 186 # check prefect health via public endpoint 184 187 health: 185 188 #!/usr/bin/env bash 186 189 : "${DOMAIN:?set DOMAIN}" 187 190 curl -sf "https://$DOMAIN/api/health" | jq . 191 + 192 + # run a prefect CLI command against the remote server 193 + prefect *args: 194 + PREFECT_API_URL="https://$DOMAIN/api" PREFECT_API_AUTH_STRING="$AUTH_STRING" \ 195 + uv run --with prefect prefect {{args}} 196 + 197 + # reload grafana dashboards from deploy/dashboards/ 198 + dashboards: 199 + #!/usr/bin/env bash 200 + set -euo pipefail 201 + for dashboard in deploy/dashboards/*.json; do 202 + name=$(basename "$dashboard" .json | tr '.' '-') 203 + kubectl create configmap "prefect-dashboard-$name" \ 204 + --namespace monitoring \ 205 + --from-file="$dashboard" \ 206 + --dry-run=client -o yaml \ 207 + | kubectl label --local -f - grafana_dashboard=1 -o yaml \ 208 + | kubectl apply -f - 209 + echo " loaded $name" 210 + done

Configure Feed

Configure Feed