Implement fail-fast health model for observers · solpbc.org/solstone@397dba7

+1 -1

docs/CALLOSUM.md

··· 57 57 **Events:** `status`, `observing`, `detected`, `described`, `transcribed`, `observed` 58 58 **Fields:** 59 59 - `status`: Periodic state (every 5s while running) 60 - - From `observer.py`: `screencast`, `audio`, `activity` - Live capture state 60 + - From `observer.py`: `screencast`, `audio`, `activity` - Live capture state (for UI/debugging) 61 61 - `screencast.files_growing` - Whether recording files are actively being written 62 62 - From `sense.py`: `describe`, `transcribe` - Processing pipeline state (with `running`/`queued` sub-fields) 63 63 - `observing`: `day`, `segment`, `files` - Recording window boundary crossed with saved files

+5 -3

docs/DOCTOR.md

··· 72 72 73 73 ## Health Signals 74 74 75 - Health is derived from `observe.status` Callosum events (emitted every 5 seconds): 75 + Health uses a **fail-fast model**: observers exit if they detect problems, and supervisor restarts them. Health is simply whether the observer is running and sending status events. 76 76 77 77 | Signal | Healthy when | Stale when | 78 78 |--------|--------------|------------| 79 79 | `hear` | Status received within threshold | No status for 60+ seconds | 80 - | `see` | User idle OR (recording AND files growing) | User active AND (not recording OR files not growing) | 80 + | `see` | Status received within threshold | No status for 60+ seconds | 81 + 82 + Both signals track the same thing: is the observer alive and communicating? If the observer has capture problems (e.g., screencast files not growing), it exits gracefully and supervisor restarts it. 81 83 82 84 Staleness threshold: 60 seconds (configurable via `--threshold`). 83 85 ··· 89 91 - `cortex.status` - Running agents list 90 92 - `supervisor.status` - Service health, stale heartbeats 91 93 92 - The supervisor derives health from `observe.status` events and includes `stale_heartbeats` in its own status. 94 + The supervisor checks for `observe.status` event freshness and includes `stale_heartbeats` in its own status. 93 95 94 96 See [CALLOSUM.md](CALLOSUM.md) Tract Registry for event schemas. 95 97

+19

observe/macos/observer.py

··· 41 41 RMS_THRESHOLD = 0.01 42 42 MIN_HITS_FOR_SAVE = 3 43 43 SAMPLE_RATE = 48000 # Standard audio sample rate 44 + STALL_THRESHOLD_CHUNKS = 3 # Exit after this many chunks with no file growth 44 45 45 46 # Host identification for multi-host scenarios 46 47 _HOST = socket.gethostname() ··· 86 87 87 88 # Health tracking 88 89 self.files_growing = False 90 + self.stalled_chunks = ( 91 + 0 # Consecutive chunks with no file growth while capturing 92 + ) 89 93 90 94 async def setup(self): 91 95 """Initialize ScreenCaptureKit and Callosum connection.""" ··· 266 270 self.current_displays = [] 267 271 self.current_audio = None 268 272 self.last_video_sizes = {} 273 + self.stalled_chunks = 0 269 274 270 275 if finalizations: 271 276 self.pending_finalization = finalizations ··· 323 328 self.current_audio = audio 324 329 self.capture_running = True 325 330 self.last_video_sizes = {d.temp_path: 0 for d in displays} 331 + self.stalled_chunks = 0 326 332 327 333 logger.info(f"Started capture with {len(displays)} display(s)") 328 334 for display in displays: ··· 504 510 any_growing = True 505 511 self.last_video_sizes[display.temp_path] = current_size 506 512 self.files_growing = any_growing 513 + 514 + # Fail-fast: exit if capture stalled (files not growing) 515 + if any_growing: 516 + self.stalled_chunks = 0 517 + else: 518 + self.stalled_chunks += 1 519 + if self.stalled_chunks >= STALL_THRESHOLD_CHUNKS: 520 + logger.error( 521 + f"Capture stalled for {self.stalled_chunks} chunks " 522 + f"({self.stalled_chunks * CHUNK_DURATION}s), exiting" 523 + ) 524 + self.running = False 507 525 else: 508 526 self.files_growing = False 527 + self.stalled_chunks = 0 509 528 510 529 # Emit status event 511 530 self.emit_status()

+15 -39

tests/test_supervisor.py

··· 12 12 13 13 14 14 def test_check_health(): 15 - """Test health checking based on observe.status events.""" 15 + """Test health checking based on observe.status event freshness. 16 + 17 + Health model is simple: if observer is running, it sends status events. 18 + If it has problems, it exits and supervisor restarts it (fail-fast). 19 + """ 16 20 mod = importlib.import_module("think.supervisor") 17 21 18 22 # Reset state for clean test 19 23 mod._observe_status_state["last_ts"] = 0.0 20 24 mod._observe_status_state["ever_received"] = False 21 - mod._observe_status_state["activity_active"] = False 22 - mod._observe_status_state["screencast_recording"] = False 23 - mod._observe_status_state["files_growing"] = False 24 25 25 26 # Startup grace period: no status ever received - returns healthy (no alerts) 26 27 stale = mod.check_health(threshold=60) ··· 32 33 stale = mod.check_health(threshold=60) 33 34 assert sorted(stale) == ["hear", "see"] 34 35 35 - # Simulate receiving a fresh status event (user inactive) 36 + # Fresh status event means healthy (observer is running) 36 37 mod._observe_status_state["last_ts"] = time.time() 37 - mod._observe_status_state["activity_active"] = False 38 38 stale = mod.check_health(threshold=60) 39 - assert stale == [] # Healthy - inactive means OK not to record 40 - 41 - # User active but not recording - see should be stale 42 - mod._observe_status_state["activity_active"] = True 43 - mod._observe_status_state["screencast_recording"] = False 44 - stale = mod.check_health(threshold=60) 45 - assert stale == ["see"] 46 - 47 - # User active, recording, but files not growing - see should be stale 48 - mod._observe_status_state["screencast_recording"] = True 49 - mod._observe_status_state["files_growing"] = False 50 - stale = mod.check_health(threshold=60) 51 - assert stale == ["see"] 52 - 53 - # User active, recording, files growing - all healthy 54 - mod._observe_status_state["files_growing"] = True 55 - stale = mod.check_health(threshold=60) 56 - assert stale == [] 39 + assert stale == [] # Healthy - receiving status events 57 40 58 - # Status became stale (old timestamp) 41 + # Status became stale (old timestamp) - observer stopped sending 59 42 mod._observe_status_state["last_ts"] = time.time() - 100 60 43 stale = mod.check_health(threshold=60) 61 44 assert sorted(stale) == ["hear", "see"] 62 45 63 46 64 47 def test_handle_observe_status(): 65 - """Test that observe.status events update health state.""" 48 + """Test that observe.status events update health state. 49 + 50 + Handler just tracks event freshness - observer is responsible for 51 + exiting if it's unhealthy (fail-fast model). 52 + """ 66 53 mod = importlib.import_module("think.supervisor") 67 54 68 55 # Reset state 69 56 mod._observe_status_state["last_ts"] = 0.0 70 57 mod._observe_status_state["ever_received"] = False 71 - mod._observe_status_state["activity_active"] = False 72 - mod._observe_status_state["screencast_recording"] = False 73 - mod._observe_status_state["files_growing"] = False 74 58 75 - # Simulate observe.status message 76 - message = { 77 - "tract": "observe", 78 - "event": "status", 79 - "activity": {"active": True}, 80 - "screencast": {"recording": True, "files_growing": True}, 81 - } 59 + # Simulate observe.status message (only tract/event are required) 60 + message = {"tract": "observe", "event": "status"} 82 61 mod._handle_observe_status(message) 83 62 84 63 assert mod._observe_status_state["last_ts"] > 0 85 64 assert mod._observe_status_state["ever_received"] is True # Grace period ended 86 - assert mod._observe_status_state["activity_active"] is True 87 - assert mod._observe_status_state["screencast_recording"] is True 88 - assert mod._observe_status_state["files_growing"] is True 89 65 90 66 # Non-observe messages should be ignored 91 67 old_ts = mod._observe_status_state["last_ts"]

+17 -37

think/supervisor.py

··· 104 104 _restart_requests: dict[str, tuple[float, subprocess.Popen]] = {} 105 105 106 106 # Observe status state for health monitoring (updated from observe.status events) 107 + # Health is now simple: if observer is running, it sends status events. 108 + # If it has problems, it exits and gets restarted (fail-fast model). 107 109 _observe_status_state: dict = { 108 110 "last_ts": 0.0, # Timestamp of last observe.status event 109 111 "ever_received": False, # Whether we've received at least one status event 110 - "activity_active": False, # Whether user is active (not idle/locked) 111 - "screencast_recording": False, # Whether screencast is running 112 - "files_growing": False, # Whether screencast files are growing 113 112 } 114 113 115 114 ··· 220 219 def check_health(threshold: int = DEFAULT_THRESHOLD) -> list[str]: 221 220 """Return a list of stale heartbeat names based on observe.status events. 222 221 223 - Health is derived from the last observe.status Callosum event: 224 - - hear: Stale if no status received within threshold 225 - - see: Stale if active AND (not recording OR files not growing) 222 + Health model is simple: if observer is running, it sends status events. 223 + If it has problems, it exits and supervisor restarts it (fail-fast). 226 224 227 - During startup grace period (before first status event received), 228 - returns empty list to avoid false alerts while observer is starting. 225 + Returns ["hear", "see"] if no status received within threshold, 226 + empty list otherwise. During startup grace period (before first 227 + status event received), returns empty list to avoid false alerts. 229 228 """ 230 229 # Grace period: don't alert until we've received at least one status event 231 230 if not _observe_status_state["ever_received"]: 232 231 return [] 233 232 234 233 now = time.time() 235 - stale: list[str] = [] 236 - 237 234 last_ts = _observe_status_state["last_ts"] 238 - status_age = now - last_ts 239 235 240 - # If no recent status, both are stale 241 - if status_age > threshold: 236 + # If no recent status, observer is not running - both stale 237 + if now - last_ts > threshold: 242 238 return ["hear", "see"] 243 239 244 - # hear is healthy if we're receiving status events (audio always runs) 245 - # (already passed the threshold check above) 246 - 247 - # see health depends on activity state 248 - activity_active = _observe_status_state["activity_active"] 249 - screencast_recording = _observe_status_state["screencast_recording"] 250 - files_growing = _observe_status_state["files_growing"] 251 - 252 - if activity_active: 253 - # User is active - screencast should be recording and files growing 254 - if not screencast_recording or not files_growing: 255 - stale.append("see") 256 - # If not active (idle/locked/power-save), it's healthy not to record 257 - 258 - return stale 240 + # Receiving status means observer is healthy 241 + return [] 259 242 260 243 261 244 def _get_notifier() -> DesktopNotifier: ··· 1043 1026 1044 1027 1045 1028 def _handle_observe_status(message: dict) -> None: 1046 - """Handle observe.status events for health monitoring.""" 1029 + """Handle observe.status events for health monitoring. 1030 + 1031 + Just tracks that we received a status event. The observer is responsible 1032 + for exiting if it's unhealthy (fail-fast model), so receiving status 1033 + means it's working. 1034 + """ 1047 1035 if message.get("tract") != "observe" or message.get("event") != "status": 1048 1036 return 1049 1037 1050 - # Update observe status state for health checking 1051 1038 _observe_status_state["last_ts"] = time.time() 1052 1039 _observe_status_state["ever_received"] = True 1053 - 1054 - activity = message.get("activity", {}) 1055 - _observe_status_state["activity_active"] = activity.get("active", False) 1056 - 1057 - screencast = message.get("screencast", {}) 1058 - _observe_status_state["screencast_recording"] = screencast.get("recording", False) 1059 - _observe_status_state["files_growing"] = screencast.get("files_growing", False) 1060 1040 1061 1041 1062 1042 def _handle_callosum_message(message: dict) -> None:

Configure Feed

Configure Feed