redesign executive overview dashboard with honest snapshot metrics

+2

README.md

··· 1 1 self-hosted prefect OSS on a single hetzner VM (k3s), with monitoring. 2 2 3 + [executive dashboard](https://prefect-metrics.waow.tech/d/executive-overview/executive-overview?orgId=1&from=now-6h&to=now&timezone=browser) 4 + 3 5 <details> 4 6 <summary>deployment</summary> 5 7

+38 -86

deploy/dashboards/executive-overview.json

··· 17 17 "fieldConfig": { 18 18 "defaults": { 19 19 "color": { "mode": "thresholds" }, 20 - "thresholds": { "steps": [{ "color": "green", "value": null }] } 20 + "thresholds": { "steps": [ 21 + { "color": "green", "value": null }, 22 + { "color": "red", "value": 1 } 23 + ] } 21 24 } 22 25 }, 23 - "gridPos": { "h": 3, "w": 4, "x": 0, "y": 1 }, 24 - "id": 1, 25 - "options": { "colorMode": "value", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, 26 - "title": "flows", 27 - "type": "stat", 28 - "targets": [{ "expr": "sum(prefect_flows_total)", "refId": "A" }] 29 - }, 30 - { 31 - "datasource": { "type": "prometheus", "uid": "prometheus" }, 32 - "fieldConfig": { 33 - "defaults": { 34 - "color": { "mode": "thresholds" }, 35 - "thresholds": { "steps": [{ "color": "green", "value": null }] } 36 - } 37 - }, 38 - "gridPos": { "h": 3, "w": 4, "x": 4, "y": 1 }, 26 + "gridPos": { "h": 3, "w": 8, "x": 0, "y": 1 }, 39 27 "id": 2, 40 - "options": { "colorMode": "value", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, 41 - "title": "deployments", 28 + "description": "work pools with status != READY", 29 + "options": { "colorMode": "background", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, 30 + "title": "pools not ready", 42 31 "type": "stat", 43 - "targets": [{ "expr": "sum(prefect_deployments_total)", "refId": "A" }] 32 + "targets": [{ "expr": "count(group by (name) (prefect_info_work_pools{status!=\"READY\"})) or vector(0)", "refId": "A" }] 44 33 }, 45 34 { 46 35 "datasource": { "type": "prometheus", "uid": "prometheus" }, ··· 50 39 "thresholds": { "steps": [{ "color": "green", "value": null }] } 51 40 } 52 41 }, 53 - "gridPos": { "h": 3, "w": 4, "x": 8, "y": 1 }, 42 + "gridPos": { "h": 3, "w": 8, "x": 8, "y": 1 }, 54 43 "id": 3, 55 - "options": { "colorMode": "value", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, 56 - "title": "work pools", 57 - "type": "stat", 58 - "targets": [{ "expr": "sum(prefect_work_pools_total)", "refId": "A" }] 59 - }, 60 - { 61 - "datasource": { "type": "prometheus", "uid": "prometheus" }, 62 - "fieldConfig": { 63 - "defaults": { 64 - "color": { "mode": "thresholds" }, 65 - "thresholds": { "steps": [{ "color": "green", "value": null }] } 66 - } 67 - }, 68 - "gridPos": { "h": 3, "w": 4, "x": 12, "y": 1 }, 69 - "id": 4, 70 - "options": { "colorMode": "value", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, 71 - "title": "workers", 72 - "type": "stat", 73 - "targets": [{ "expr": "sum(kube_deployment_status_replicas{deployment=~\".*worker\"})", "refId": "A" }] 74 - }, 75 - { 76 - "datasource": { "type": "prometheus", "uid": "prometheus" }, 77 - "fieldConfig": { 78 - "defaults": { 79 - "color": { "mode": "thresholds" }, 80 - "thresholds": { "steps": [ 81 - { "color": "green", "value": null }, 82 - { "color": "yellow", "value": 60 }, 83 - { "color": "red", "value": 300 } 84 - ] }, 85 - "unit": "s" 86 - } 87 - }, 88 - "gridPos": { "h": 3, "w": 4, "x": 16, "y": 1 }, 89 - "id": 5, 44 + "description": "terminal runs in 24h snapshot window", 90 45 "options": { "colorMode": "value", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, 91 - "title": "avg run time", 46 + "title": "terminal runs (24h)", 92 47 "type": "stat", 93 - "targets": [{ "expr": "avg(prefect_flow_runs_total_run_time)", "refId": "A" }] 48 + "targets": [{ "expr": "sum(prefect_info_flow_runs{state_name=~\"Completed|Failed|Crashed|Cancelled\"}) or vector(0)", "refId": "A" }] 94 49 }, 95 50 { 96 51 "datasource": { "type": "prometheus", "uid": "prometheus" }, ··· 103 58 ] } 104 59 } 105 60 }, 106 - "gridPos": { "h": 3, "w": 4, "x": 20, "y": 1 }, 107 - "id": 6, 108 - "options": { "colorMode": "value", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, 109 - "title": "runs (active)", 61 + "gridPos": { "h": 3, "w": 8, "x": 16, "y": 1 }, 62 + "id": 4, 63 + "description": "late_runs_count from work queues", 64 + "options": { "colorMode": "background", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, 65 + "title": "late runs", 110 66 "type": "stat", 111 - "targets": [{ "expr": "count(prefect_info_flow_runs{state_name=~\"Running|Pending|Scheduled\"}) or vector(0)", "refId": "A" }] 67 + "targets": [{ "expr": "sum(prefect_late_runs) or vector(0)", "refId": "A" }] 112 68 }, 113 69 { 114 70 "datasource": { "type": "prometheus", "uid": "prometheus" }, 115 71 "fieldConfig": { 116 72 "defaults": { 117 - "color": { "mode": "palette-classic" }, 118 - "custom": { 119 - "fillOpacity": 80, 120 - "stacking": { "mode": "normal" } 121 - } 73 + "color": { "mode": "palette-classic" } 122 74 }, 123 75 "overrides": [ 124 - { "matcher": { "id": "byName", "options": "COMPLETED" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, 125 - { "matcher": { "id": "byName", "options": "FAILED" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, 126 - { "matcher": { "id": "byName", "options": "RUNNING" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }, 127 - { "matcher": { "id": "byName", "options": "CANCELLED" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }, 128 - { "matcher": { "id": "byName", "options": "CRASHED" }, "properties": [{ "id": "color", "value": { "fixedColor": "dark-red", "mode": "fixed" } }] }, 129 - { "matcher": { "id": "byName", "options": "PENDING" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] } 76 + { "matcher": { "id": "byName", "options": "Completed" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, 77 + { "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, 78 + { "matcher": { "id": "byName", "options": "Cancelled" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }, 79 + { "matcher": { "id": "byName", "options": "Crashed" }, "properties": [{ "id": "color", "value": { "fixedColor": "dark-red", "mode": "fixed" } }] } 130 80 ] 131 81 }, 132 - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, 82 + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 11 }, 133 83 "id": 10, 84 + "description": "snapshot of terminal runs in 24h window", 134 85 "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "tooltip": { "mode": "multi" }, "legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }, "pieType": "donut" }, 135 - "title": "flow runs by state", 86 + "title": "runs by state (24h)", 136 87 "type": "piechart", 137 - "targets": [{ "expr": "count by (state_name) (prefect_info_flow_runs)", "legendFormat": "{{ state_name }}", "refId": "A", "instant": true }] 88 + "targets": [{ "expr": "sum by (state_name) (prefect_info_flow_runs{state_name=~\"Completed|Failed|Crashed|Cancelled\"})", "legendFormat": "{{ state_name }}", "refId": "A" }] 138 89 }, 139 90 { 140 91 "datasource": { "type": "prometheus", "uid": "prometheus" }, ··· 149 100 { "matcher": { "id": "byName", "options": "work_pool_name" }, "properties": [{ "id": "custom.width", "value": 120 }] } 150 101 ] 151 102 }, 152 - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, 103 + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 11 }, 153 104 "id": 11, 105 + "description": "current deployment registrations", 154 106 "options": { "showHeader": true, "sortBy": [{ "displayName": "deployment_name", "desc": false }] }, 155 107 "title": "deployments", 156 108 "type": "table", 157 - "targets": [{ "expr": "prefect_info_deployment", "format": "table", "instant": true, "refId": "A" }], 109 + "targets": [{ "expr": "max by (deployment_name, flow_name, status, work_pool_name) (prefect_info_deployment)", "format": "table", "instant": true, "refId": "A" }], 158 110 "transformations": [ 159 111 { "id": "filterFieldsByName", "options": { "include": { "names": ["deployment_name", "flow_name", "status", "work_pool_name"] } } } 160 112 ] 161 113 }, 162 114 { 163 115 "collapsed": false, 164 - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 12 }, 116 + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 19 }, 165 117 "id": 200, 166 118 "title": "infrastructure", 167 119 "type": "row" ··· 181 133 "unit": "percent" 182 134 } 183 135 }, 184 - "gridPos": { "h": 5, "w": 6, "x": 0, "y": 13 }, 136 + "gridPos": { "h": 5, "w": 6, "x": 0, "y": 20 }, 185 137 "id": 20, 186 138 "options": { "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"] }, "showThresholdLabels": false, "showThresholdMarkers": true }, 187 139 "title": "node CPU", ··· 203 155 "unit": "percent" 204 156 } 205 157 }, 206 - "gridPos": { "h": 5, "w": 6, "x": 6, "y": 13 }, 158 + "gridPos": { "h": 5, "w": 6, "x": 6, "y": 20 }, 207 159 "id": 21, 208 160 "options": { "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"] }, "showThresholdLabels": false, "showThresholdMarkers": true }, 209 161 "title": "node memory", ··· 225 177 "unit": "percent" 226 178 } 227 179 }, 228 - "gridPos": { "h": 5, "w": 6, "x": 12, "y": 13 }, 180 + "gridPos": { "h": 5, "w": 6, "x": 12, "y": 20 }, 229 181 "id": 22, 230 182 "options": { "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"] }, "showThresholdLabels": false, "showThresholdMarkers": true }, 231 183 "title": "node disk", ··· 243 195 ] } 244 196 } 245 197 }, 246 - "gridPos": { "h": 5, "w": 6, "x": 18, "y": 13 }, 198 + "gridPos": { "h": 5, "w": 6, "x": 18, "y": 20 }, 247 199 "id": 23, 248 200 "options": { "colorMode": "background", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, 249 201 "title": "pods not ready", ··· 259 211 "unit": "percentunit" 260 212 } 261 213 }, 262 - "gridPos": { "h": 7, "w": 12, "x": 0, "y": 18 }, 214 + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 4 }, 263 215 "id": 30, 264 216 "options": { "tooltip": { "mode": "multi" }, "legend": { "displayMode": "list", "placement": "bottom" } }, 265 217 "title": "pod CPU usage (prefect)", ··· 275 227 "unit": "bytes" 276 228 } 277 229 }, 278 - "gridPos": { "h": 7, "w": 12, "x": 12, "y": 18 }, 230 + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 4 }, 279 231 "id": 31, 280 232 "options": { "tooltip": { "mode": "multi" }, "legend": { "displayMode": "list", "placement": "bottom" } }, 281 233 "title": "pod memory usage (prefect)",

+12 -1

deploy/exporter-values.yaml

··· 1 - # prometheus-prefect-exporter values 1 + # prometheus-prefect-exporter values (forked: zzstoatzz/prometheus-prefect-exporter) 2 + image: 3 + repository: atcr.io/zzstoatzz.io/prometheus-prefect-exporter 4 + tag: better-metrics 5 + pullPolicy: Always 6 + 7 + imagePullSecrets: 8 + - name: atcr-creds 9 + 2 10 prefectApiUrl: "http://prefect-server.prefect.svc.cluster.local:4200/api" 3 11 4 12 basicAuth: 5 13 enabled: true 6 14 existingSecret: prefect-auth 7 15 existingSecretKey: auth-string 16 + 17 + env: 18 + OFFSET_MINUTES: "1440" 8 19 9 20 serviceMonitor: 10 21 enabled: true

Configure Feed

Configure Feed