Openstatus www.openstatus.dev
6
fork

Configure Feed

Select the types of activity you want to include in your feed.

fix: bug incident (#1772)

* feat(workflows): add incident cleanup endpoint to auto-resolve stale incidents

Add /incident/cleanup endpoint that finds active monitors with unresolved
incidents and automatically resolves them by setting resolvedAt and
autoResolved=true.

* refactor(checker): extract incident helpers for findOpenIncident and resolveIncident

- Add findOpenIncident() to query open incidents by monitorId
- Add resolveIncident() to handle incident resolution with logging and audit
- Replace duplicated incident query and resolution code with helper calls
- Fix control flow: use 'break' instead of 'return' in the error case so the handler still reaches its final "Ok" response
- Remove redundant monitor status update in error handling

* ci: apply automated fixes

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>

authored by

Thibault Le Ouay
autofix-ci[bot]
and committed by
GitHub
6d27a3d3 5ddd709c

+135 -68
+70 -68
apps/workflows/src/checker/index.ts
··· 29 29 30 30 const logger = getLogger(["workflow"]); 31 31 32 + /** 33 + * Finds an open incident (resolvedAt is null) for the given monitor; acknowledgedAt is not part of the filter, so acknowledged incidents are returned too. 34 + */ 35 + async function findOpenIncident(monitorId: number) { 36 + return db 37 + .select() 38 + .from(incidentTable) 39 + .where( 40 + and( 41 + eq(incidentTable.monitorId, monitorId), 42 + isNull(incidentTable.resolvedAt), 43 + ), 44 + ) 45 + .get(); 46 + } 47 + 48 + /** 49 + * Resolves the monitor's open incident by setting resolvedAt to new Date(cronTimestamp) and autoResolved to true, then calls checkerAudit.publishAuditLog with action "incident.resolved". Returns the incident row as fetched before the update, or null when no open incident is found. 50 + */ 51 + async function resolveIncident(params: { 52 + monitorId: string; 53 + cronTimestamp: number; 54 + }) { 55 + const { monitorId, cronTimestamp } = params; 56 + const incident = await findOpenIncident(Number(monitorId)); 57 + 58 + if (!incident || incident.resolvedAt) { 59 + return null; 60 + } 61 + 62 + logger.info("Recovering incident", { 63 + incident_id: incident.id, 64 + monitor_id: monitorId, 65 + }); 66 + 67 + await db 68 + .update(incidentTable) 69 + .set({ 70 + resolvedAt: new Date(cronTimestamp), 71 + autoResolved: true, 72 + }) 73 + .where(eq(incidentTable.id, incident.id)) 74 + .run(); 75 + 76 + await checkerAudit.publishAuditLog({ 77 + id: `monitor:${monitorId}`, 78 + action: "incident.resolved", 79 + targets: [{ id: monitorId, type: "monitor" }], 80 + metadata: { cronTimestamp, incidentId: incident.id }, 81 + }); 82 + 83 + return incident; 84 + } 85 + 32 86 checkerRoute.post("/updateStatus", async (c) => { 33 87 const auth = c.req.header("Authorization"); 34 88 if (auth !== `Basic ${env().CRON_SECRET}`) { ··· 152 206 if (affectedRegion.count >= numberOfRegions / 2 || numberOfRegions === 1) { 153 207 switch (status) { 154 208 case "active": { 155 - // it's been resolved 156 209 if (monitor.status === "active") { 157 210 break; 158 211 } ··· 166 219 .set({ status: "active" }) 167 220 .where(eq(schema.monitor.id, monitor.id)); 168 221 169 - // we can't have a monitor in error without an incident 170 222 if (monitor.status === "error") { 
171 - const incident = await db 172 - .select() 173 - .from(incidentTable) 174 - .where( 175 - and( 176 - eq(incidentTable.monitorId, Number(monitorId)), 177 - isNull(incidentTable.resolvedAt), 178 - isNull(incidentTable.acknowledgedAt), 179 - ), 180 - ) 181 - .get(); 182 - 183 - if (!incident) { 184 - // it was just a single failure not a proper incident 185 - break; 186 - } 187 - if (incident?.resolvedAt) { 188 - // incident is already resolved 189 - break; 190 - } 191 - logger.info("Recovering incident", { 192 - incident_id: incident.id, 193 - monitor_id: monitorId, 194 - }); 195 - 196 - await db 197 - .update(incidentTable) 198 - .set({ 199 - resolvedAt: new Date(cronTimestamp), 200 - autoResolved: true, 201 - }) 202 - .where(eq(incidentTable.id, incident.id)) 203 - .run(); 204 - 205 - await checkerAudit.publishAuditLog({ 206 - id: `monitor:${monitorId}`, 207 - action: "incident.resolved", 208 - targets: [{ id: monitorId, type: "monitor" }], 209 - metadata: { cronTimestamp, incidentId: incident.id }, 210 - }); 223 + await resolveIncident({ monitorId, cronTimestamp }); 211 224 } 212 225 213 226 await triggerNotifications({ ··· 225 238 } 226 239 case "degraded": 227 240 if (monitor.status === "degraded") { 228 - // already degraded let's return early 229 241 break; 230 242 } 243 + 231 244 logger.info("Monitor status changed to degraded", { 232 245 monitor_id: monitor.id, 233 246 workspace_id: monitor.workspaceId, ··· 237 250 .update(schema.monitor) 238 251 .set({ status: "degraded" }) 239 252 .where(eq(schema.monitor.id, monitor.id)); 240 - // figure how to send the notification once 253 + 241 254 await triggerNotifications({ 242 255 monitorId, 243 256 statusCode, ··· 249 262 incidentId: `${cronTimestamp}`, 250 263 }); 251 264 265 + if (monitor.status === "error") { 266 + await resolveIncident({ monitorId, cronTimestamp }); 267 + } 252 268 break; 253 269 case "error": 254 270 if (monitor.status === "error") { 255 - // already in error let's return early 256 271 
break; 257 272 } 258 273 ··· 267 282 .where(eq(schema.monitor.id, monitor.id)); 268 283 269 284 try { 270 - const incident = await db 271 - .select() 272 - .from(incidentTable) 273 - .where( 274 - and( 275 - eq(incidentTable.monitorId, Number(monitorId)), 276 - isNull(incidentTable.resolvedAt), 277 - isNull(incidentTable.acknowledgedAt), 278 - ), 279 - ) 280 - .get(); 281 - if (incident) { 282 - logger.info("we are already in incident"); 285 + const existingIncident = await findOpenIncident(Number(monitorId)); 286 + if (existingIncident) { 287 + logger.info("Already in incident", { 288 + incident_id: existingIncident.id, 289 + }); 283 290 break; 284 291 } 292 + 285 293 const [newIncident] = await db 286 294 .insert(incidentTable) 287 295 .values({ ··· 291 299 }) 292 300 .returning(); 293 301 294 - if (!newIncident.id) { 295 - return; 302 + if (!newIncident?.id) { 303 + break; 296 304 } 297 305 298 306 await checkerAudit.publishAuditLog({ ··· 312 320 region, 313 321 incidentId: String(newIncident.id), 314 322 }); 315 - 316 - await db 317 - .update(schema.monitor) 318 - .set({ status: "error" }) 319 - .where(eq(schema.monitor.id, monitor.id)); 320 - } catch { 321 - logger.warning("incident was already created"); 323 + } catch (error) { 324 + logger.warning("Failed to create incident", { error }); 322 325 } 323 326 324 327 break; ··· 328 331 } 329 332 } 330 333 331 - // if we are in error 332 334 return c.text("Ok", 200); 333 335 });
+63
apps/workflows/src/incident/index.ts
··· 1 + import { schema } from "@openstatus/db"; 2 + import { and, eq, inArray, isNotNull, isNull, ne } from "drizzle-orm"; 3 + import { Hono } from "hono"; 4 + import { env } from "../env"; 5 + import { db } from "../lib/db"; 6 + 7 + export const incidentRoute = new Hono({ strict: false }); 8 + 9 + incidentRoute.use("*", async (c, next) => { 10 + if (c.req.header("authorization") !== env().CRON_SECRET) { 11 + return c.text("Unauthorized", 401); 12 + } 13 + 14 + return next(); 15 + }); 16 + 17 + incidentRoute.get("/cleanup", async (c) => { 18 + // Find monitors that are active (active = true, status != "error") yet still have incidents that are unresolved (resolvedAt is null) and acknowledged (acknowledgedAt is not null) 19 + const unresolvedIncidentMonitorIds = db 20 + .select({ monitorId: schema.incidentTable.monitorId }) 21 + .from(schema.incidentTable) 22 + .where( 23 + and( 24 + isNull(schema.incidentTable.resolvedAt), 25 + isNotNull(schema.incidentTable.acknowledgedAt), 26 + ), 27 + ); 28 + 29 + const activeMonitorsWithUnresolvedIncidents = await db 30 + .select({ id: schema.monitor.id }) 31 + .from(schema.monitor) 32 + .where( 33 + and( 34 + inArray(schema.monitor.id, unresolvedIncidentMonitorIds), 35 + eq(schema.monitor.active, true), 36 + ne(schema.monitor.status, "error"), 37 + ), 38 + ) 39 + .all(); 40 + 41 + const monitorIds = activeMonitorsWithUnresolvedIncidents.map((m) => m.id); 42 + 43 + if (monitorIds.length === 0) { 44 + return c.json({ status: "ok", updated: 0 }); 45 + } 46 + 47 + // Resolve every still-unresolved incident of these monitors: set resolvedAt to now and autoResolved to true. NOTE(review): the selection above requires acknowledgedAt to be set, but this update does not filter on it — confirm that is intended. 48 + const result = await db 49 + .update(schema.incidentTable) 50 + .set({ 51 + resolvedAt: new Date(), 52 + autoResolved: true, 53 + }) 54 + .where( 55 + and( 56 + inArray(schema.incidentTable.monitorId, monitorIds), 57 + isNull(schema.incidentTable.resolvedAt), 58 + ), 59 + ) 60 + .returning({ id: schema.incidentTable.id }); 61 + 62 + return c.json({ status: "ok", updated: result.length }); 63 + });
+2
apps/workflows/src/index.ts
··· 21 21 22 22 import { resourceFromAttributes } from "@opentelemetry/resources"; 23 23 import { ATTR_DEPLOYMENT_ENVIRONMENT_NAME } from "@opentelemetry/semantic-conventions/incubating"; 24 + import { incidentRoute } from "./incident"; 24 25 25 26 const { NODE_ENV, PORT } = env(); 26 27 ··· 190 191 191 192 app.route("/", checkerRoute); 192 193 194 + app.route("/incident", incidentRoute); 193 195 if (NODE_ENV === "development") { 194 196 showRoutes(app, { verbose: true, colorize: true }); 195 197 }