a digital entity named phi that roams bsky phi.zzstoatzz.io
2
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 401 lines 14 kB view raw
"""Eval test configuration.

The eval test agents define their own structured `Response` output type
locally — production phi (in bot.agent) was migrated to a tool-based
action layer where side effects happen via tool calls and the agent run
returns a plain summary string. The eval fixtures predate that migration
and still want a structured-output shape so individual eval tests can
make assertions on response.action / response.text. Keeping it local to
the eval harness keeps the production code clean of vestigial action shapes.
"""

import logging
import os
from collections import defaultdict
from collections.abc import Awaitable, Callable
from pathlib import Path

import pytest
from pydantic import BaseModel, Field
from pydantic_ai import Agent, RunContext

from bot.config import Settings
from bot.memory import NamespaceMemory

logger = logging.getLogger(__name__)

# Model used by every eval test agent, and the (stronger) model used as judge.
_AGENT_MODEL = "anthropic:claude-haiku-4-5-20251001"
_JUDGE_MODEL = "anthropic:claude-sonnet-4-6"


class Response(BaseModel):
    """Structured response shape used by the eval test agents only."""

    action: str = Field(description="reply, like, repost, post, or ignore")
    text: str | None = Field(
        default=None, description="response text when action is reply or post"
    )
    reason: str | None = Field(
        default=None, description="brief reason when action is ignore"
    )


# feed tool instructions — extracted from OPERATIONAL_INSTRUCTIONS to avoid
# the full agent import requiring bluesky creds at module level.
_FEED_INSTRUCTIONS = """
you can create and manage bluesky feeds via graze:
- create_feed: build a custom feed from keyword patterns and hashtag filters. translate natural language descriptions into the graze filter DSL.
- list_feeds: see your existing graze-powered feeds.
""".strip()

_FEED_CONSUMPTION_INSTRUCTIONS = """
feeds — you can create and read bluesky feeds:
- read_timeline: your "following" feed — what people you follow are posting. anyone can ask you to check this.
- read_feed: read posts from a specific custom feed by URI. use list_feeds to get URIs.
- create_feed: build a custom feed from keyword patterns and hashtag filters. OWNER-ONLY (restricted to @zzstoatzz.io).
- list_feeds: see your existing graze-powered feeds.
- follow_user: follow a user on bluesky. OWNER-ONLY (restricted to @zzstoatzz.io).
""".strip()

OWNER_HANDLE = "zzstoatzz.io"

CANNED_TIMELINE_POSTS = (
    "@alice.bsky.social (12 likes, 2d ago): just shipped a new rust crate for async signal handling\n\n"
    "@bob.test (3 likes, today): morning coffee thoughts — the fediverse keeps getting more interesting\n\n"
    "@carol.dev (8 likes, 1d ago): wrote a thread on why I switched from typescript to gleam"
)

CANNED_EMPTY_TIMELINE = (
    "your timeline is empty — you're not following anyone yet. "
    "ask @zzstoatzz.io to have me follow some accounts!"
)

CANNED_FEEDS = [
    {
        "display_name": "Jazz Vibes",
        "id": 42,
        "feed_uri": "at://did:plc:test/app.bsky.feed.generator/jazz-vibes",
    },
    {
        "display_name": "Rust Lang",
        "id": 99,
        "feed_uri": "at://did:plc:test/app.bsky.feed.generator/rust-lang",
    },
]


class EvaluationResult(BaseModel):
    """Verdict returned by the LLM judge: pass/fail plus its explanation."""

    passed: bool
    explanation: str


class ToolCallSpy:
    """Captures tool calls for assertion in evals."""

    def __init__(self) -> None:
        # tool name -> list of kwargs dicts, one per recorded call
        self.calls: dict[str, list[dict]] = defaultdict(list)

    def record(self, tool_name: str, **kwargs) -> None:
        """Append one call (its keyword arguments) under ``tool_name``."""
        self.calls[tool_name].append(kwargs)

    def was_called(self, name: str) -> bool:
        """Return True if at least one call was recorded for ``name``."""
        # .get() rather than indexing: a query must not insert an empty
        # entry into the defaultdict.
        return bool(self.calls.get(name))

    def get_calls(self, name: str) -> list[dict]:
        """Return the recorded kwargs dicts for ``name`` (empty if none)."""
        return self.calls.get(name, [])

    def reset(self) -> None:
        """Forget every recorded call."""
        self.calls.clear()


# Module-level spies shared with the session-scoped agents below. They live at
# module scope so the autouse fixture can clear them between tests.
_feed_spy = ToolCallSpy()
_consumer_spy = ToolCallSpy()
_empty_spy = ToolCallSpy()


def _require_anthropic(settings: Settings) -> None:
    """Skip the requesting test without an Anthropic key; otherwise export it.

    The key is copied into ``os.environ`` only when the variable is unset or
    empty, so an explicitly exported key is never overwritten.
    """
    if not settings.anthropic_api_key:
        pytest.skip("Requires ANTHROPIC_API_KEY")
    if not os.environ.get("ANTHROPIC_API_KEY"):
        os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key


def _load_personality(settings: Settings) -> str:
    """Read the personality prompt from ``settings.personality_file``."""
    # Explicit encoding: the default is locale-dependent.
    return Path(settings.personality_file).read_text(encoding="utf-8")


def _build_agent(system_prompt: str) -> Agent[dict, Response]:
    """Create a "phi" eval agent with structured `Response` output and dict deps."""
    return Agent[dict, Response](
        name="phi",
        model=_AGENT_MODEL,
        system_prompt=system_prompt,
        output_type=Response,
        deps_type=dict,
    )


def _mention_prompt(author_handle: str, mention_text: str) -> str:
    """Format the user-turn text announcing a new mention."""
    return f"\nNew message from @{author_handle}: {mention_text}"


def _format_canned_feeds() -> str:
    """Render CANNED_FEEDS as the bullet list returned by the mocked list_feeds."""
    return "\n".join(
        f"- {f['display_name']} (id={f['id']}) {f['feed_uri']}" for f in CANNED_FEEDS
    )


@pytest.fixture(scope="session")
def settings() -> Settings:
    """Session-wide `Settings` instance."""
    return Settings()


@pytest.fixture(scope="session")
def phi_agent(settings):
    """Test agent without MCP tools to prevent posting."""
    _require_anthropic(settings)
    if settings.openai_api_key and not os.environ.get("OPENAI_API_KEY"):
        os.environ["OPENAI_API_KEY"] = settings.openai_api_key

    personality = _load_personality(settings)

    class TestAgent:
        def __init__(self):
            # Memory is only attached when both keys it depends on are configured.
            self.memory = None
            if settings.turbopuffer_api_key and settings.openai_api_key:
                self.memory = NamespaceMemory(api_key=settings.turbopuffer_api_key)

            self.agent = _build_agent(personality)

        async def process_mention(
            self,
            mention_text: str,
            author_handle: str,
            thread_context: str,
            thread_uri: str | None = None,
        ) -> Response:
            """Run the agent on one mention and return its structured output."""
            memory_context = ""
            if self.memory:
                try:
                    memory_context = await self.memory.build_user_context(
                        author_handle, query_text=mention_text
                    )
                except Exception:
                    # Best-effort: a memory failure must not fail the eval,
                    # but it should be visible in the test log.
                    logger.warning(
                        "memory context lookup failed for @%s",
                        author_handle,
                        exc_info=True,
                    )

            parts = []
            # This sentinel string means "no thread"; don't feed it to the model.
            if thread_context != "No previous messages in this thread.":
                parts.append(thread_context)
            if memory_context:
                parts.append(memory_context)
            parts.append(_mention_prompt(author_handle, mention_text))

            result = await self.agent.run(
                "\n\n".join(parts), deps={"thread_uri": thread_uri}
            )
            return result.output

    return TestAgent()


# --- feed agent with mocked graze tools ---


@pytest.fixture(scope="session")
def feed_agent(settings):
    """Test agent with mocked graze feed tools."""
    _require_anthropic(settings)

    personality = _load_personality(settings)
    agent = _build_agent(f"{personality}\n\n{_FEED_INSTRUCTIONS}")

    @agent.tool
    async def create_feed(
        ctx: RunContext[dict],
        name: str,
        display_name: str,
        description: str,
        filter_manifest: dict,
    ) -> str:
        """Create a new bluesky feed powered by graze.

        name: url-safe slug (e.g. "electronic-music"). becomes the feed rkey.
        display_name: human-readable feed title.
        description: what the feed shows.
        filter_manifest: graze filter DSL (grazer engine operators). key operators:
          - regex_any: ["field", ["term1", "term2"]] — match any term (case-insensitive by default)
          - regex_none: ["field", ["term1", "term2"]] — exclude posts matching any term
          - regex_matches: ["field", "pattern"] — single regex match
          - and: [...filters], or: [...filters] — combine filters
        field is usually "text". example: {"filter": {"and": [{"regex_any": ["text", ["jazz", "bebop"]]}]}}
        """
        _feed_spy.record(
            "create_feed",
            name=name,
            display_name=display_name,
            description=description,
            filter_manifest=filter_manifest,
        )
        return f"feed created: at://did:plc:test/app.bsky.feed.generator/{name} (algo_id=1)"

    @agent.tool
    async def list_feeds(ctx: RunContext[dict]) -> str:
        """List your existing graze-powered feeds."""
        _feed_spy.record("list_feeds")
        return _format_canned_feeds()

    class FeedTestAgent:
        def __init__(self):
            self.agent = agent
            self.spy = _feed_spy

        async def process_mention(
            self, mention_text: str, author_handle: str = "test.user"
        ) -> Response:
            """Run the agent on one mention and return its structured output."""
            result = await self.agent.run(
                _mention_prompt(author_handle, mention_text), deps={}
            )
            return result.output

    return FeedTestAgent()


@pytest.fixture(scope="session")
def feed_consumer_agent(settings):
    """Test agent with mocked feed consumption, following, and owner-gated tools."""
    _require_anthropic(settings)

    personality = _load_personality(settings)
    agent = _build_agent(f"{personality}\n\n{_FEED_CONSUMPTION_INSTRUCTIONS}")

    @agent.tool
    async def read_timeline(ctx: RunContext[dict], limit: int = 20) -> str:
        """Read your 'following' timeline — posts from accounts you follow."""
        _consumer_spy.record("read_timeline", limit=limit)
        return CANNED_TIMELINE_POSTS

    @agent.tool
    async def read_feed(ctx: RunContext[dict], feed_uri: str, limit: int = 20) -> str:
        """Read posts from a specific custom feed by AT-URI. Use list_feeds to find feed URIs first."""
        _consumer_spy.record("read_feed", feed_uri=feed_uri, limit=limit)
        return CANNED_TIMELINE_POSTS

    @agent.tool
    async def follow_user(ctx: RunContext[dict], handle: str) -> str:
        """Follow a user on bluesky. Only the bot's owner can use this tool."""
        # Record before the owner check so evals can assert on denied attempts too.
        _consumer_spy.record("follow_user", handle=handle)
        author = ctx.deps.get("author_handle", "")
        if author != OWNER_HANDLE:
            return f"only @{OWNER_HANDLE} can ask me to follow people"
        return f"now following @{handle} (at://did:plc:test/app.bsky.graph.follow/abc)"

    @agent.tool
    async def create_feed(
        ctx: RunContext[dict],
        name: str,
        display_name: str,
        description: str,
        filter_manifest: dict,
    ) -> str:
        """Create a new bluesky feed powered by graze. Only the bot's owner can use this tool."""
        # Record before the owner check so evals can assert on denied attempts too.
        _consumer_spy.record("create_feed", name=name)
        author = ctx.deps.get("author_handle", "")
        if author != OWNER_HANDLE:
            return f"only @{OWNER_HANDLE} can create feeds"
        return f"feed created: at://did:plc:test/app.bsky.feed.generator/{name} (algo_id=1)"

    @agent.tool
    async def list_feeds(ctx: RunContext[dict]) -> str:
        """List your existing graze-powered feeds."""
        _consumer_spy.record("list_feeds")
        return _format_canned_feeds()

    class FeedConsumerTestAgent:
        def __init__(self):
            self.agent = agent
            self.spy = _consumer_spy

        async def process_mention(
            self, mention_text: str, author_handle: str = "test.user"
        ) -> Response:
            """Run the agent on one mention; the author handle is passed as deps."""
            result = await self.agent.run(
                _mention_prompt(author_handle, mention_text),
                deps={"author_handle": author_handle},
            )
            return result.output

    return FeedConsumerTestAgent()


@pytest.fixture(scope="session")
def feed_consumer_agent_empty(settings):
    """Test agent where read_timeline returns the empty-timeline message."""
    _require_anthropic(settings)

    personality = _load_personality(settings)
    agent = _build_agent(f"{personality}\n\n{_FEED_CONSUMPTION_INSTRUCTIONS}")

    @agent.tool
    async def read_timeline(ctx: RunContext[dict], limit: int = 20) -> str:
        """Read your 'following' timeline — posts from accounts you follow."""
        _empty_spy.record("read_timeline", limit=limit)
        return CANNED_EMPTY_TIMELINE

    @agent.tool
    async def list_feeds(ctx: RunContext[dict]) -> str:
        """List your existing graze-powered feeds."""
        _empty_spy.record("list_feeds")
        return "no graze feeds found"

    class EmptyConsumerTestAgent:
        def __init__(self):
            self.agent = agent
            self.spy = _empty_spy

        async def process_mention(
            self, mention_text: str, author_handle: str = "test.user"
        ) -> Response:
            """Run the agent on one mention; the author handle is passed as deps."""
            result = await self.agent.run(
                _mention_prompt(author_handle, mention_text),
                deps={"author_handle": author_handle},
            )
            return result.output

    return EmptyConsumerTestAgent()


@pytest.fixture(autouse=True)
def _reset_feed_spy():
    """Reset the tool call spies before each test."""
    # The agents are session-scoped, so without this recorded calls would
    # carry over from one test into the next.
    _feed_spy.reset()
    _consumer_spy.reset()
    _empty_spy.reset()


@pytest.fixture
def evaluate_response() -> Callable[[str, str], Awaitable[None]]:
    """LLM-as-judge evaluator."""

    async def _evaluate(criteria: str, response: str) -> None:
        """Raise AssertionError (with the judge's explanation) if the judge fails ``response``."""
        evaluator = Agent(
            model=_JUDGE_MODEL,
            output_type=EvaluationResult,
            system_prompt=(
                "Evaluate if this response meets the criteria. Be lenient — "
                "examples in the criteria are illustrative, not exhaustive. "
                "Pass if the response makes a reasonable attempt at the intent.\n\n"
                f"Criteria: {criteria}\n\nResponse: {response}"
            ),
        )
        result = await evaluator.run("Evaluate.")
        if not result.output.passed:
            raise AssertionError(f"{result.output.explanation}\n\nResponse: {response}")

    return _evaluate