a digital entity named phi that roams bsky
phi.zzstoatzz.io
1"""Eval test configuration.
2
3The eval test agents define their own structured `Response` output type
4locally — production phi (in bot.agent) was migrated to a tool-based
5action layer where side effects happen via tool calls and the agent run
6returns a plain summary string. The eval fixtures predate that migration
7and still want a structured-output shape so individual eval tests can
8make assertions on response.action / response.text. Keeping it local to
9the eval harness keeps the production code clean of vestigial action shapes.
10"""
11
12import os
13from collections import defaultdict
14from collections.abc import Awaitable, Callable
15from pathlib import Path
16
17import pytest
18from pydantic import BaseModel, Field
19from pydantic_ai import Agent, RunContext
20
21from bot.config import Settings
22from bot.memory import NamespaceMemory
23
24
class Response(BaseModel):
    """Structured response shape used by the eval test agents only."""

    # NOTE(review): the docstring above is left untouched on purpose — pydantic
    # is understood to surface class docstrings in the generated schema, which
    # the agents may see as part of the output type; confirm before editing it.

    # The action the agent chose. The description lists the expected values,
    # but the field is a plain `str`, so nothing here rejects other values.
    action: str = Field(description="reply, like, repost, post, or ignore")
    # Optional body text; defaults to None.
    text: str | None = Field(
        default=None, description="response text when action is reply or post"
    )
    # Optional explanation; defaults to None.
    reason: str | None = Field(
        default=None, description="brief reason when action is ignore"
    )
35
36
# feed tool instructions — extracted from OPERATIONAL_INSTRUCTIONS to avoid
# the full agent import requiring bluesky creds at module level.
# Appended to the personality prompt of the `feed_agent` fixture.
_FEED_INSTRUCTIONS = """
you can create and manage bluesky feeds via graze:
- create_feed: build a custom feed from keyword patterns and hashtag filters. translate natural language descriptions into the graze filter DSL.
- list_feeds: see your existing graze-powered feeds.
""".strip()

# Appended to the personality prompt of the `feed_consumer_agent` and
# `feed_consumer_agent_empty` fixtures.
_FEED_CONSUMPTION_INSTRUCTIONS = """
feeds — you can create and read bluesky feeds:
- read_timeline: your "following" feed — what people you follow are posting. anyone can ask you to check this.
- read_feed: read posts from a specific custom feed by URI. use list_feeds to get URIs.
- create_feed: build a custom feed from keyword patterns and hashtag filters. OWNER-ONLY (restricted to @zzstoatzz.io).
- list_feeds: see your existing graze-powered feeds.
- follow_user: follow a user on bluesky. OWNER-ONLY (restricted to @zzstoatzz.io).
""".strip()

# Handle the mocked owner-gated tools (`follow_user`, `create_feed`) compare
# against the `author_handle` value passed in the agent deps.
OWNER_HANDLE = "zzstoatzz.io"

# Canned text returned by the mocked `read_timeline` and `read_feed` tools.
CANNED_TIMELINE_POSTS = (
    "@alice.bsky.social (12 likes, 2d ago): just shipped a new rust crate for async signal handling\n\n"
    "@bob.test (3 likes, today): morning coffee thoughts — the fediverse keeps getting more interesting\n\n"
    "@carol.dev (8 likes, 1d ago): wrote a thread on why I switched from typescript to gleam"
)

# Canned text returned by the mocked `read_timeline` tool of the
# `feed_consumer_agent_empty` fixture.
CANNED_EMPTY_TIMELINE = (
    "your timeline is empty — you're not following anyone yet. "
    "ask @zzstoatzz.io to have me follow some accounts!"
)
66
67
# Structured verdict requested from the judge agent in `evaluate_response`.
# NOTE(review): documented with comments rather than a docstring — pydantic is
# understood to include class docstrings in the generated schema; confirm
# before adding one.
class EvaluationResult(BaseModel):
    # When False, `evaluate_response` raises an AssertionError.
    passed: bool
    # Included in that AssertionError message.
    explanation: str
71
72
class ToolCallSpy:
    """Captures tool calls for assertion in evals.

    Calls are grouped by tool name. Each recorded call is stored as the dict
    of keyword arguments passed to ``record``, in the order recorded.
    """

    def __init__(self) -> None:
        # defaultdict lets `record` append without first creating the list.
        self.calls: dict[str, list[dict]] = defaultdict(list)

    def record(self, tool_name: str, **kwargs) -> None:
        """Append one call of ``tool_name`` together with its keyword arguments."""
        self.calls[tool_name].append(kwargs)

    def was_called(self, name: str) -> bool:
        """Return True if at least one call of ``name`` has been recorded."""
        # Use .get() instead of indexing: indexing a defaultdict inserts the
        # missing key, so a read-only query would leave an empty entry behind
        # and make `calls` look as if the tool had been seen.
        return bool(self.calls.get(name))

    def get_calls(self, name: str) -> list[dict]:
        """Return the recorded kwargs dicts for ``name`` (empty list if none)."""
        # Same reasoning as `was_called`: do not create an entry on lookup.
        return self.calls.get(name, [])

    def reset(self) -> None:
        """Forget every recorded call."""
        self.calls.clear()
90
91
@pytest.fixture(scope="session")
def settings() -> Settings:
    """Construct the bot `Settings` once and share it across the test session."""
    return Settings()
95
96
@pytest.fixture(scope="session")
def phi_agent(settings):
    """Test agent without MCP tools to prevent posting.

    Skips the requesting test when no Anthropic API key is configured.
    Otherwise returns an object exposing ``process_mention``, which builds a
    prompt from the thread context, an optional memory context and the new
    message, runs the agent and returns its structured ``Response``.
    """
    import logging  # local import: only this fixture needs it

    log = logging.getLogger(__name__)

    if not settings.anthropic_api_key:
        pytest.skip("Requires ANTHROPIC_API_KEY")

    # Copy keys from Settings into the environment unless a non-empty value is
    # already there (presumably because the model clients read them from env).
    if settings.anthropic_api_key and not os.environ.get("ANTHROPIC_API_KEY"):
        os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key
    if settings.openai_api_key and not os.environ.get("OPENAI_API_KEY"):
        os.environ["OPENAI_API_KEY"] = settings.openai_api_key

    # Decode explicitly so the prompt does not depend on the locale's default
    # encoding. NOTE(review): assumes the personality file is UTF-8 — confirm.
    personality = Path(settings.personality_file).read_text(encoding="utf-8")

    class TestAgent:
        def __init__(self):
            # Memory is only enabled when both keys are configured.
            self.memory = None
            if settings.turbopuffer_api_key and settings.openai_api_key:
                self.memory = NamespaceMemory(api_key=settings.turbopuffer_api_key)

            self.agent = Agent[dict, Response](
                name="phi",
                model="anthropic:claude-haiku-4-5-20251001",
                system_prompt=personality,
                output_type=Response,
                deps_type=dict,
            )

        async def process_mention(
            self,
            mention_text: str,
            author_handle: str,
            thread_context: str,
            thread_uri: str | None = None,
        ) -> Response:
            """Run the agent on one mention and return its structured output."""
            memory_context = ""
            if self.memory:
                try:
                    memory_context = await self.memory.build_user_context(
                        author_handle, query_text=mention_text
                    )
                except Exception:
                    # Memory stays best-effort (the eval continues without it),
                    # but the failure is logged so a broken memory backend is
                    # not silently mistaken for "no memories".
                    log.warning(
                        "memory lookup failed for @%s; continuing without memory context",
                        author_handle,
                        exc_info=True,
                    )

            parts = []
            # This exact sentence is treated as the "no thread" placeholder and
            # is left out of the prompt.
            if thread_context != "No previous messages in this thread.":
                parts.append(thread_context)
            if memory_context:
                parts.append(memory_context)
            parts.append(f"\nNew message from @{author_handle}: {mention_text}")

            result = await self.agent.run(
                "\n\n".join(parts), deps={"thread_uri": thread_uri}
            )
            return result.output

    return TestAgent()
153
154
# --- feed agent with mocked graze tools ---

# Module-level spy that the mocked tools of the `feed_agent` fixture record
# into; cleared before each test by the autouse `_reset_feed_spy` fixture.
_feed_spy = ToolCallSpy()

# Canned feed records that the mocked `list_feeds` tools render, one line per
# entry, from the `display_name`, `id` and `feed_uri` keys.
CANNED_FEEDS = [
    {
        "display_name": "Jazz Vibes",
        "id": 42,
        "feed_uri": "at://did:plc:test/app.bsky.feed.generator/jazz-vibes",
    },
    {
        "display_name": "Rust Lang",
        "id": 99,
        "feed_uri": "at://did:plc:test/app.bsky.feed.generator/rust-lang",
    },
]
171
172
@pytest.fixture(scope="session")
def feed_agent(settings):
    """Test agent with mocked graze feed tools.

    Skips the requesting test when no Anthropic API key is configured. The
    returned object exposes ``process_mention`` and ``spy``; ``spy`` is the
    shared ``_feed_spy`` that the mocked ``create_feed`` / ``list_feeds``
    tools record into before returning canned strings.
    """
    if not settings.anthropic_api_key:
        pytest.skip("Requires ANTHROPIC_API_KEY")

    # Copy the key into the environment unless a non-empty value is already set.
    if settings.anthropic_api_key and not os.environ.get("ANTHROPIC_API_KEY"):
        os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key

    # Decode explicitly so the prompt does not depend on the locale's default
    # encoding. NOTE(review): assumes the personality file is UTF-8 — confirm.
    personality = Path(settings.personality_file).read_text(encoding="utf-8")

    agent = Agent[dict, Response](
        name="phi",
        model="anthropic:claude-haiku-4-5-20251001",
        system_prompt=f"{personality}\n\n{_FEED_INSTRUCTIONS}",
        output_type=Response,
        deps_type=dict,
    )

    # NOTE(review): the tool docstrings below are kept word-for-word — they are
    # presumably what the model sees as each tool's description; confirm
    # before rewording them.

    @agent.tool
    async def create_feed(
        ctx: RunContext[dict],
        name: str,
        display_name: str,
        description: str,
        filter_manifest: dict,
    ) -> str:
        """Create a new bluesky feed powered by graze.

        name: url-safe slug (e.g. "electronic-music"). becomes the feed rkey.
        display_name: human-readable feed title.
        description: what the feed shows.
        filter_manifest: graze filter DSL (grazer engine operators). key operators:
          - regex_any: ["field", ["term1", "term2"]] — match any term (case-insensitive by default)
          - regex_none: ["field", ["term1", "term2"]] — exclude posts matching any term
          - regex_matches: ["field", "pattern"] — single regex match
          - and: [...filters], or: [...filters] — combine filters
          field is usually "text". example: {"filter": {"and": [{"regex_any": ["text", ["jazz", "bebop"]]}]}}
        """
        # Record the full argument set so evals can assert on the manifest.
        _feed_spy.record(
            "create_feed",
            name=name,
            display_name=display_name,
            description=description,
            filter_manifest=filter_manifest,
        )
        return f"feed created: at://did:plc:test/app.bsky.feed.generator/{name} (algo_id=1)"

    @agent.tool
    async def list_feeds(ctx: RunContext[dict]) -> str:
        """List your existing graze-powered feeds."""
        _feed_spy.record("list_feeds")
        return "\n".join(
            f"- {feed['display_name']} (id={feed['id']}) {feed['feed_uri']}"
            for feed in CANNED_FEEDS
        )

    class FeedTestAgent:
        def __init__(self):
            self.agent = agent
            self.spy = _feed_spy

        async def process_mention(
            self, mention_text: str, author_handle: str = "test.user"
        ) -> Response:
            """Run the agent on one mention and return its structured output."""
            prompt = f"\nNew message from @{author_handle}: {mention_text}"
            result = await self.agent.run(prompt, deps={})
            return result.output

    return FeedTestAgent()
243
244
# Module-level spy that the mocked tools of the `feed_consumer_agent` fixture
# record into; cleared before each test by the autouse `_reset_feed_spy` fixture.
_consumer_spy = ToolCallSpy()
246
247
@pytest.fixture(scope="session")
def feed_consumer_agent(settings):
    """Test agent with mocked feed consumption, following, and owner-gated tools.

    Skips the requesting test when no Anthropic API key is configured. The
    returned object exposes ``process_mention`` and ``spy`` (the shared
    ``_consumer_spy``). ``process_mention`` passes the author's handle to the
    tools through the agent deps; the mocked ``follow_user`` and
    ``create_feed`` tools refuse unless that handle equals ``OWNER_HANDLE``.
    """
    if not settings.anthropic_api_key:
        pytest.skip("Requires ANTHROPIC_API_KEY")

    # Copy the key into the environment unless a non-empty value is already set.
    if settings.anthropic_api_key and not os.environ.get("ANTHROPIC_API_KEY"):
        os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key

    # Decode explicitly so the prompt does not depend on the locale's default
    # encoding. NOTE(review): assumes the personality file is UTF-8 — confirm.
    personality = Path(settings.personality_file).read_text(encoding="utf-8")

    agent = Agent[dict, Response](
        name="phi",
        model="anthropic:claude-haiku-4-5-20251001",
        system_prompt=f"{personality}\n\n{_FEED_CONSUMPTION_INSTRUCTIONS}",
        output_type=Response,
        deps_type=dict,
    )

    # NOTE(review): the tool docstrings below are kept word-for-word — they are
    # presumably what the model sees as each tool's description; confirm
    # before rewording them.

    @agent.tool
    async def read_timeline(ctx: RunContext[dict], limit: int = 20) -> str:
        """Read your 'following' timeline — posts from accounts you follow."""
        _consumer_spy.record("read_timeline", limit=limit)
        return CANNED_TIMELINE_POSTS

    @agent.tool
    async def read_feed(ctx: RunContext[dict], feed_uri: str, limit: int = 20) -> str:
        """Read posts from a specific custom feed by AT-URI. Use list_feeds to find feed URIs first."""
        _consumer_spy.record("read_feed", feed_uri=feed_uri, limit=limit)
        return CANNED_TIMELINE_POSTS

    @agent.tool
    async def follow_user(ctx: RunContext[dict], handle: str) -> str:
        """Follow a user on bluesky. Only the bot's owner can use this tool."""
        # The attempt is recorded before the owner check, so evals can see
        # refused calls as well as permitted ones.
        _consumer_spy.record("follow_user", handle=handle)
        author = ctx.deps.get("author_handle", "")
        if author != OWNER_HANDLE:
            return f"only @{OWNER_HANDLE} can ask me to follow people"
        return f"now following @{handle} (at://did:plc:test/app.bsky.graph.follow/abc)"

    @agent.tool
    async def create_feed(
        ctx: RunContext[dict],
        name: str,
        display_name: str,
        description: str,
        filter_manifest: dict,
    ) -> str:
        """Create a new bluesky feed powered by graze. Only the bot's owner can use this tool."""
        # Recorded before the owner check, same as `follow_user`; only the
        # feed name is captured here.
        _consumer_spy.record("create_feed", name=name)
        author = ctx.deps.get("author_handle", "")
        if author != OWNER_HANDLE:
            return f"only @{OWNER_HANDLE} can create feeds"
        return f"feed created: at://did:plc:test/app.bsky.feed.generator/{name} (algo_id=1)"

    @agent.tool
    async def list_feeds(ctx: RunContext[dict]) -> str:
        """List your existing graze-powered feeds."""
        _consumer_spy.record("list_feeds")
        return "\n".join(
            f"- {feed['display_name']} (id={feed['id']}) {feed['feed_uri']}"
            for feed in CANNED_FEEDS
        )

    class FeedConsumerTestAgent:
        def __init__(self):
            self.agent = agent
            self.spy = _consumer_spy

        async def process_mention(
            self, mention_text: str, author_handle: str = "test.user"
        ) -> Response:
            """Run the agent on one mention and return its structured output."""
            prompt = f"\nNew message from @{author_handle}: {mention_text}"
            result = await self.agent.run(prompt, deps={"author_handle": author_handle})
            return result.output

    return FeedConsumerTestAgent()
325
326
# Module-level (rather than local to the session-scoped fixture) so the
# autouse reset fixture below can clear it between tests; otherwise calls
# recorded in one test would still be visible to the next.
_empty_spy = ToolCallSpy()


@pytest.fixture(scope="session")
def feed_consumer_agent_empty(settings):
    """Test agent where read_timeline returns the empty-timeline message.

    Skips the requesting test when no Anthropic API key is configured. The
    returned object exposes ``process_mention`` and ``spy`` (the shared
    ``_empty_spy``). Only ``read_timeline`` and ``list_feeds`` are registered,
    and both return "nothing here" style canned strings.
    """
    if not settings.anthropic_api_key:
        pytest.skip("Requires ANTHROPIC_API_KEY")

    # Copy the key into the environment unless a non-empty value is already set.
    if settings.anthropic_api_key and not os.environ.get("ANTHROPIC_API_KEY"):
        os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key

    # Decode explicitly so the prompt does not depend on the locale's default
    # encoding. NOTE(review): assumes the personality file is UTF-8 — confirm.
    personality = Path(settings.personality_file).read_text(encoding="utf-8")

    agent = Agent[dict, Response](
        name="phi",
        model="anthropic:claude-haiku-4-5-20251001",
        system_prompt=f"{personality}\n\n{_FEED_CONSUMPTION_INSTRUCTIONS}",
        output_type=Response,
        deps_type=dict,
    )

    # NOTE(review): the tool docstrings below are kept word-for-word — they are
    # presumably what the model sees as each tool's description; confirm
    # before rewording them.

    @agent.tool
    async def read_timeline(ctx: RunContext[dict], limit: int = 20) -> str:
        """Read your 'following' timeline — posts from accounts you follow."""
        _empty_spy.record("read_timeline", limit=limit)
        return CANNED_EMPTY_TIMELINE

    @agent.tool
    async def list_feeds(ctx: RunContext[dict]) -> str:
        """List your existing graze-powered feeds."""
        _empty_spy.record("list_feeds")
        return "no graze feeds found"

    class EmptyConsumerTestAgent:
        def __init__(self):
            self.agent = agent
            self.spy = _empty_spy

        async def process_mention(
            self, mention_text: str, author_handle: str = "test.user"
        ) -> Response:
            """Run the agent on one mention and return its structured output."""
            prompt = f"\nNew message from @{author_handle}: {mention_text}"
            result = await self.agent.run(prompt, deps={"author_handle": author_handle})
            return result.output

    return EmptyConsumerTestAgent()


@pytest.fixture(autouse=True)
def _reset_feed_spy():
    """Reset the tool call spies before each test."""
    # The agent fixtures are session-scoped, so their spies outlive individual
    # tests; every spy has to be cleared here to keep tests independent.
    _feed_spy.reset()
    _consumer_spy.reset()
    _empty_spy.reset()
380
381
@pytest.fixture
def evaluate_response() -> Callable[[str, str], Awaitable[None]]:
    """LLM-as-judge evaluator.

    Yields an async callable ``(criteria, response)`` that asks a judge model
    whether ``response`` satisfies ``criteria`` and raises ``AssertionError``
    carrying the judge's explanation when it does not.
    """

    async def _evaluate(criteria: str, response: str) -> None:
        # Criteria and response are embedded in the system prompt; the user
        # turn is just the fixed trigger "Evaluate.".
        judge_prompt = (
            "Evaluate if this response meets the criteria. Be lenient — "
            "examples in the criteria are illustrative, not exhaustive. "
            "Pass if the response makes a reasonable attempt at the intent.\n\n"
            f"Criteria: {criteria}\n\nResponse: {response}"
        )
        judge = Agent(
            model="anthropic:claude-sonnet-4-6",
            output_type=EvaluationResult,
            system_prompt=judge_prompt,
        )
        run = await judge.run("Evaluate.")
        verdict = run.output
        if verdict.passed:
            return
        raise AssertionError(f"{verdict.explanation}\n\nResponse: {response}")

    return _evaluate
400
401 return _evaluate