rename check_monitors -> check_relays, add history mode via name param

+11 -13

src/bot/agent.py

··· 41 41 42 42 check_services checks nate's infrastructure, not yours. only use during reflection or when explicitly asked about services. 43 43 44 - check_monitors reads relay-eval and returns per-relay status headlines. when reporting from it, use the headline verbatim — don't add theories about cause. 44 + check_relays reads relay-eval. no args = fleet snapshot; name="<host>" = history for that relay. when reporting, use headlines verbatim — don't add theories about cause. 45 45 """.strip() 46 46 47 47 ··· 586 586 logger.info(f"musing finished: {summary[:200]}") 587 587 return summary 588 588 589 - async def process_monitor_check(self, recent_posts: list[str] | None = None) -> str: 590 - """Check infrastructure monitors and post about transitions if notable. 589 + async def process_relay_check(self, recent_posts: list[str] | None = None) -> str: 590 + """Scheduled relay-fleet check. Posts about transitions if notable. 591 591 592 - Uses the check_monitors tool to fetch current state. The tool returns 592 + Uses the check_relays tool to fetch current state. The tool returns 593 593 status-grouped headlines that phi should report verbatim — no theories 594 594 about cause, just observation. Stays silent if nothing's changed or 595 595 the change is already reflected in recent posts. 596 596 """ 597 - logger.info("processing monitor check") 597 + logger.info("processing relay check") 598 598 599 599 recent_activity = "" 600 600 if recent_posts: ··· 607 607 recent_activity=recent_activity, 608 608 ) 609 609 610 - monitor_task = ( 611 - "scheduled relay check. call check_monitors to see current relay " 610 + relay_task = ( 611 + "scheduled relay check. call check_relays to see current relay " 612 612 "status. if a relay has transitioned to critical or degraded " 613 613 "recently, post the headline verbatim. silence is fine if " 614 614 "everything's nominal or you've already posted about the current " ··· 624 624 async with contextlib.AsyncExitStack() as stack: 625 625 for ts in toolsets: 626 626 await stack.enter_async_context(ts) 627 - result = await self.agent.run( 628 - monitor_task, deps=deps, toolsets=toolsets 629 - ) 627 + result = await self.agent.run(relay_task, deps=deps, toolsets=toolsets) 630 628 except Exception as e: 631 629 err_type = type(e).__name__ 632 - logger.exception(f"agent.run failed during monitor check: {err_type}") 633 - return f"monitor check failed: {err_type}: {str(e)[:200]}" 630 + logger.exception(f"agent.run failed during relay check: {err_type}") 631 + return f"relay check failed: {err_type}: {str(e)[:200]}" 634 632 635 633 summary = result.output or "" 636 - logger.info(f"monitor check finished: {summary[:200]}") 634 + logger.info(f"relay check finished: {summary[:200]}") 637 635 return summary 638 636 639 637 async def process_extraction(self) -> int:

+7 -7

src/bot/services/message_handler.py

··· 348 348 except Exception as e: 349 349 logger.warning(f"exploration failed: {e}") 350 350 351 - async def check_infrastructure(self): 352 - """Run a scheduled monitor check and let phi decide whether to post.""" 353 - with logfire.span("monitor check"): 351 + async def check_relays(self): 352 + """Run a scheduled relay-fleet check and let phi decide whether to post.""" 353 + with logfire.span("relay check"): 354 354 recent_posts: list[str] = [] 355 355 try: 356 356 # Pass phi's recent posts so the agent can avoid restating ··· 360 360 if hasattr(item.post.record, "text"): 361 361 recent_posts.append(item.post.record.text) 362 362 except Exception as e: 363 - logger.warning(f"failed to fetch recent posts for monitor check: {e}") 363 + logger.warning(f"failed to fetch recent posts for relay check: {e}") 364 364 365 365 try: 366 - summary = await self.agent.process_monitor_check( 366 + summary = await self.agent.process_relay_check( 367 367 recent_posts=recent_posts or None, 368 368 ) 369 - logger.info(f"monitor check: {summary[:200]}") 369 + logger.info(f"relay check: {summary[:200]}") 370 370 except Exception as e: 371 - logger.exception(f"monitor check failed: {e}") 371 + logger.exception(f"relay check failed: {e}") 372 372 373 373 async def review_memories(self): 374 374 """Run the dream/distill pass — review observations with distance."""

+1 -1

src/bot/services/notification_poller.py

··· 353 353 self._polls_since_last_monitor_check = 0 354 354 logger.info("triggering monitor check") 355 355 try: 356 - await self.handler.check_infrastructure() 356 + await self.handler.check_relays() 357 357 except Exception as e: 358 358 logger.error(f"monitor check error: {e}", exc_info=settings.debug)

+70 -5

src/bot/tools/bluesky.py

··· 4 4 import ipaddress 5 5 import socket 6 6 from datetime import date 7 + from typing import Annotated 7 8 from urllib.parse import urlparse 8 9 9 10 import httpx 11 + from pydantic import Field 10 12 from pydantic_ai import RunContext 11 13 12 14 from bot.config import settings ··· 154 156 return await _check_services_impl() 155 157 156 158 @agent.tool 157 - async def check_monitors(ctx: RunContext[PhiDeps]) -> str: 159 + async def check_relays( 160 + ctx: RunContext[PhiDeps], 161 + name: Annotated[ 162 + str | None, 163 + Field( 164 + description=( 165 + "Specific relay hostname to query history for " 166 + "(e.g. 'zlay.waow.tech'). Omit for a fleet-wide " 167 + "snapshot of current status." 168 + ) 169 + ), 170 + ] = None, 171 + limit: Annotated[ 172 + int | None, 173 + Field( 174 + description=( 175 + "Max history points to return when name is set. " 176 + "Default ~288 = one day at ~5-min cadence." 177 + ) 178 + ), 179 + ] = None, 180 + ) -> str: 158 181 """Check the atproto relay fleet nate evaluates via relay-eval. 159 182 160 - Measures firehose connectivity and event coverage vs each relay's 161 - baseline. Returns headlines grouped by status. Report headlines 162 - verbatim — the service knows its own baselines. 183 + Default (no name): current snapshot of every relay, grouped by 184 + status — answers "how's the fleet right now." Report headlines 185 + verbatim. 186 + 187 + With name: recent coverage history for one relay (summary stats + 188 + recent points) — answers "what was X's coverage yesterday." 163 189 164 190 For app health (plyr, PDS, prefect, etc), use check_services.""" 191 + if name: 192 + history_url = settings.monitors_url.replace("/monitors", "/history") 193 + params: dict[str, str | int] = {"name": name} 194 + if limit: 195 + params["limit"] = limit 196 + try: 197 + async with httpx.AsyncClient(timeout=15) as http: 198 + r = await http.get(history_url, params=params) 199 + r.raise_for_status() 200 + data = r.json() 201 + except Exception as e: 202 + return f"history endpoint unreachable: {e}" 203 + 204 + points = data.get("points", []) 205 + summary = data.get("summary", {}) 206 + if not points: 207 + return f"no history found for {name}" 208 + 209 + mean = summary.get("mean_coverage_pct", 0) 210 + lo = summary.get("min_coverage_pct", 0) 211 + hi = summary.get("max_coverage_pct", 0) 212 + connected = summary.get("connected_runs", 0) 213 + total = summary.get("total_runs", 0) 214 + 215 + lines = [ 216 + f"history for {name} ({total} runs):", 217 + f" mean {mean:.2f}% | min {lo:.2f}% | max {hi:.2f}%", 218 + f" connected {connected}/{total} runs", 219 + "", 220 + "recent:", 221 + ] 222 + for p in points[-5:]: 223 + ts = p.get("ts", "")[:16].replace("T", " ") 224 + pct = p.get("coverage_pct", 0) 225 + conn = "connected" if p.get("connected") else "disconnected" 226 + lines.append(f" {ts} — {pct:.2f}% ({conn})") 227 + return "\n".join(lines) 228 + 229 + # snapshot mode 165 230 try: 166 231 async with httpx.AsyncClient(timeout=15) as http: 167 232 r = await http.get(settings.monitors_url) ··· 183 248 by_status.setdefault(status, []).append(m) 184 249 185 250 today = date.today() 186 - lines: list[str] = [] 251 + lines = [] 187 252 for status in ("critical", "degraded", "nominal"): 188 253 items = by_status.get(status, []) 189 254 if not items:

Configure Feed

Configure Feed