evals/conftest.py at main · zzstoatzz.io/bot

zzstoatzz.io / bot
fork
a digital entity named phi that roams bsky phi.zzstoatzz.io
fork
bot / evals / conftest.py
at main 401 lines 14 kB view raw
wrap content
zzstoatzz fix evals: broken import, stale justfile targets, flaky judge 2w ago
aac53260
  1"""Eval test configuration.
  2
  3The eval test agents define their own structured `Response` output type
  4locally — production phi (in bot.agent) was migrated to a tool-based
  5action layer where side effects happen via tool calls and the agent run
  6returns a plain summary string. The eval fixtures predate that migration
  7and still want a structured-output shape so individual eval tests can
  8make assertions on response.action / response.text. Keeping it local to
  9the eval harness keeps the production code clean of vestigial action shapes.
 10"""
 11
 12import os
 13from collections import defaultdict
 14from collections.abc import Awaitable, Callable
 15from pathlib import Path
 16
 17import pytest
 18from pydantic import BaseModel, Field
 19from pydantic_ai import Agent, RunContext
 20
 21from bot.config import Settings
 22from bot.memory import NamespaceMemory
 23
 24
# NOTE(review): this model is passed as `output_type` to the eval agents in
# this file. pydantic normally surfaces the class docstring and the Field
# descriptions in the generated JSON schema, so they are presumably part of
# what the model is shown — treat them as prompt text, not just documentation.
class Response(BaseModel):
    """Structured response shape used by the eval test agents only."""

    # Required. Plain `str`: the values named in the description are not
    # enforced by validation.
    action: str = Field(description="reply, like, repost, post, or ignore")
    # Optional; defaults to None.
    text: str | None = Field(
        default=None, description="response text when action is reply or post"
    )
    # Optional; defaults to None.
    reason: str | None = Field(
        default=None, description="brief reason when action is ignore"
    )
 35
 36
 37# feed tool instructions — extracted from OPERATIONAL_INSTRUCTIONS to avoid
 38# the full agent import requiring bluesky creds at module level.
 39_FEED_INSTRUCTIONS = """
 40you can create and manage bluesky feeds via graze:
 41- create_feed: build a custom feed from keyword patterns and hashtag filters. translate natural language descriptions into the graze filter DSL.
 42- list_feeds: see your existing graze-powered feeds.
 43""".strip()
 44
 45_FEED_CONSUMPTION_INSTRUCTIONS = """
 46feeds — you can create and read bluesky feeds:
 47- read_timeline: your "following" feed — what people you follow are posting. anyone can ask you to check this.
 48- read_feed: read posts from a specific custom feed by URI. use list_feeds to get URIs.
 49- create_feed: build a custom feed from keyword patterns and hashtag filters. OWNER-ONLY (restricted to @zzstoatzz.io).
 50- list_feeds: see your existing graze-powered feeds.
 51- follow_user: follow a user on bluesky. OWNER-ONLY (restricted to @zzstoatzz.io).
 52""".strip()
 53
 54OWNER_HANDLE = "zzstoatzz.io"
 55
 56CANNED_TIMELINE_POSTS = (
 57    "@alice.bsky.social (12 likes, 2d ago): just shipped a new rust crate for async signal handling\n\n"
 58    "@bob.test (3 likes, today): morning coffee thoughts — the fediverse keeps getting more interesting\n\n"
 59    "@carol.dev (8 likes, 1d ago): wrote a thread on why I switched from typescript to gleam"
 60)
 61
 62CANNED_EMPTY_TIMELINE = (
 63    "your timeline is empty — you're not following anyone yet. "
 64    "ask @zzstoatzz.io to have me follow some accounts!"
 65)
 66
 67
# Structured verdict requested from the judge agent in `evaluate_response`.
# Documented with comments rather than a docstring on purpose.
# NOTE(review): pydantic normally copies a class docstring into the JSON
# schema, which would presumably change what the judge model is shown.
class EvaluationResult(BaseModel):
    passed: bool  # when False, `evaluate_response` raises AssertionError
    explanation: str  # becomes the first part of that AssertionError message
 71
 72
 73class ToolCallSpy:
 74    """Captures tool calls for assertion in evals."""
 75
 76    def __init__(self):
 77        self.calls: dict[str, list[dict]] = defaultdict(list)
 78
 79    def record(self, tool_name: str, **kwargs):
 80        self.calls[tool_name].append(kwargs)
 81
 82    def was_called(self, name: str) -> bool:
 83        return len(self.calls[name]) > 0
 84
 85    def get_calls(self, name: str) -> list[dict]:
 86        return self.calls[name]
 87
 88    def reset(self):
 89        self.calls.clear()
 90
 91
 92@pytest.fixture(scope="session")
 93def settings():
 94    return Settings()
 95
 96
 97@pytest.fixture(scope="session")
 98def phi_agent(settings):
 99    """Test agent without MCP tools to prevent posting."""
100    if not settings.anthropic_api_key:
101        pytest.skip("Requires ANTHROPIC_API_KEY")
102
103    if settings.anthropic_api_key and not os.environ.get("ANTHROPIC_API_KEY"):
104        os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key
105    if settings.openai_api_key and not os.environ.get("OPENAI_API_KEY"):
106        os.environ["OPENAI_API_KEY"] = settings.openai_api_key
107
108    personality = Path(settings.personality_file).read_text()
109
110    class TestAgent:
111        def __init__(self):
112            self.memory = None
113            if settings.turbopuffer_api_key and settings.openai_api_key:
114                self.memory = NamespaceMemory(api_key=settings.turbopuffer_api_key)
115
116            self.agent = Agent[dict, Response](
117                name="phi",
118                model="anthropic:claude-haiku-4-5-20251001",
119                system_prompt=personality,
120                output_type=Response,
121                deps_type=dict,
122            )
123
124        async def process_mention(
125            self,
126            mention_text: str,
127            author_handle: str,
128            thread_context: str,
129            thread_uri: str | None = None,
130        ) -> Response:
131            memory_context = ""
132            if self.memory:
133                try:
134                    memory_context = await self.memory.build_user_context(
135                        author_handle, query_text=mention_text
136                    )
137                except Exception:
138                    pass
139
140            parts = []
141            if thread_context != "No previous messages in this thread.":
142                parts.append(thread_context)
143            if memory_context:
144                parts.append(memory_context)
145            parts.append(f"\nNew message from @{author_handle}: {mention_text}")
146
147            result = await self.agent.run(
148                "\n\n".join(parts), deps={"thread_uri": thread_uri}
149            )
150            return result.output
151
152    return TestAgent()
153
154
# --- feed agent with mocked graze tools ---

# Module-level spy that the `feed_agent` fixture's mocked tools record into;
# cleared before every test by the autouse `_reset_feed_spy` fixture.
_feed_spy = ToolCallSpy()
158
# Canned feed records that the mocked `list_feeds` tools render as text.
CANNED_FEEDS = [
    {
        "display_name": title,
        "id": feed_id,
        "feed_uri": f"at://did:plc:test/app.bsky.feed.generator/{rkey}",
    }
    for title, feed_id, rkey in (
        ("Jazz Vibes", 42, "jazz-vibes"),
        ("Rust Lang", 99, "rust-lang"),
    )
]
171
172
@pytest.fixture(scope="session")
def feed_agent(settings):
    """Eval agent whose graze feed tools are mocks that record into `_feed_spy`.

    Skips the requesting test when no Anthropic API key is configured.
    """
    if not settings.anthropic_api_key:
        pytest.skip("Requires ANTHROPIC_API_KEY")

    # Export the key only when the environment variable is unset or empty
    # (the key itself is known to be set past the skip guard).
    if not os.environ.get("ANTHROPIC_API_KEY"):
        os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key

    persona_text = Path(settings.personality_file).read_text()
    prompt_sections = (persona_text, _FEED_INSTRUCTIONS)

    mock_agent = Agent[dict, Response](
        name="phi",
        model="anthropic:claude-haiku-4-5-20251001",
        system_prompt="\n\n".join(prompt_sections),
        output_type=Response,
        deps_type=dict,
    )

    # Tool names, parameters and docstrings are kept verbatim: they are
    # presumably what the model is shown as the tool definitions.
    @mock_agent.tool
    async def create_feed(
        ctx: RunContext[dict],
        name: str,
        display_name: str,
        description: str,
        filter_manifest: dict,
    ) -> str:
        """Create a new bluesky feed powered by graze.

        name: url-safe slug (e.g. "electronic-music"). becomes the feed rkey.
        display_name: human-readable feed title.
        description: what the feed shows.
        filter_manifest: graze filter DSL (grazer engine operators). key operators:
          - regex_any: ["field", ["term1", "term2"]] — match any term (case-insensitive by default)
          - regex_none: ["field", ["term1", "term2"]] — exclude posts matching any term
          - regex_matches: ["field", "pattern"] — single regex match
          - and: [...filters], or: [...filters] — combine filters
        field is usually "text". example: {"filter": {"and": [{"regex_any": ["text", ["jazz", "bebop"]]}]}}
        """
        call_args = {
            "name": name,
            "display_name": display_name,
            "description": description,
            "filter_manifest": filter_manifest,
        }
        _feed_spy.record("create_feed", **call_args)
        return f"feed created: at://did:plc:test/app.bsky.feed.generator/{name} (algo_id=1)"

    @mock_agent.tool
    async def list_feeds(ctx: RunContext[dict]) -> str:
        """List your existing graze-powered feeds."""
        _feed_spy.record("list_feeds")
        return "\n".join(
            f"- {feed['display_name']} (id={feed['id']}) {feed['feed_uri']}"
            for feed in CANNED_FEEDS
        )

    class FeedTestAgent:
        def __init__(self):
            self.agent = mock_agent
            self.spy = _feed_spy

        async def process_mention(
            self, mention_text: str, author_handle: str = "test.user"
        ) -> Response:
            """Run the agent on one mention (empty deps) and return its output."""
            run = await self.agent.run(
                f"\nNew message from @{author_handle}: {mention_text}", deps={}
            )
            return run.output

    return FeedTestAgent()
243
244
# Module-level spy that the `feed_consumer_agent` fixture's mocked tools record
# into; cleared before every test by the autouse `_reset_feed_spy` fixture.
_consumer_spy = ToolCallSpy()
246
247
@pytest.fixture(scope="session")
def feed_consumer_agent(settings):
    """Eval agent with mocked timeline/feed reading and owner-gated tools.

    Every mocked tool records into `_consumer_spy`. `follow_user` and
    `create_feed` succeed only when the mention author is `OWNER_HANDLE`.
    Skips the requesting test when no Anthropic API key is configured.
    """
    if not settings.anthropic_api_key:
        pytest.skip("Requires ANTHROPIC_API_KEY")

    # Export the key only when the environment variable is unset or empty
    # (the key itself is known to be set past the skip guard).
    if not os.environ.get("ANTHROPIC_API_KEY"):
        os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key

    persona_text = Path(settings.personality_file).read_text()
    prompt_sections = (persona_text, _FEED_CONSUMPTION_INSTRUCTIONS)

    mock_agent = Agent[dict, Response](
        name="phi",
        model="anthropic:claude-haiku-4-5-20251001",
        system_prompt="\n\n".join(prompt_sections),
        output_type=Response,
        deps_type=dict,
    )

    # Tool names, parameters and docstrings are kept verbatim: they are
    # presumably what the model is shown as the tool definitions.
    @mock_agent.tool
    async def read_timeline(ctx: RunContext[dict], limit: int = 20) -> str:
        """Read your 'following' timeline — posts from accounts you follow."""
        _consumer_spy.record("read_timeline", limit=limit)
        return CANNED_TIMELINE_POSTS

    @mock_agent.tool
    async def read_feed(ctx: RunContext[dict], feed_uri: str, limit: int = 20) -> str:
        """Read posts from a specific custom feed by AT-URI. Use list_feeds to find feed URIs first."""
        _consumer_spy.record("read_feed", feed_uri=feed_uri, limit=limit)
        return CANNED_TIMELINE_POSTS

    @mock_agent.tool
    async def follow_user(ctx: RunContext[dict], handle: str) -> str:
        """Follow a user on bluesky. Only the bot's owner can use this tool."""
        # The attempt is recorded before the owner check, so refused calls
        # are visible on the spy too.
        _consumer_spy.record("follow_user", handle=handle)
        requested_by_owner = ctx.deps.get("author_handle", "") == OWNER_HANDLE
        if requested_by_owner:
            return f"now following @{handle} (at://did:plc:test/app.bsky.graph.follow/abc)"
        return f"only @{OWNER_HANDLE} can ask me to follow people"

    @mock_agent.tool
    async def create_feed(
        ctx: RunContext[dict],
        name: str,
        display_name: str,
        description: str,
        filter_manifest: dict,
    ) -> str:
        """Create a new bluesky feed powered by graze. Only the bot's owner can use this tool."""
        # Only the slug is recorded here; recorded before the owner check.
        _consumer_spy.record("create_feed", name=name)
        requested_by_owner = ctx.deps.get("author_handle", "") == OWNER_HANDLE
        if requested_by_owner:
            return f"feed created: at://did:plc:test/app.bsky.feed.generator/{name} (algo_id=1)"
        return f"only @{OWNER_HANDLE} can create feeds"

    @mock_agent.tool
    async def list_feeds(ctx: RunContext[dict]) -> str:
        """List your existing graze-powered feeds."""
        _consumer_spy.record("list_feeds")
        return "\n".join(
            f"- {feed['display_name']} (id={feed['id']}) {feed['feed_uri']}"
            for feed in CANNED_FEEDS
        )

    class FeedConsumerTestAgent:
        def __init__(self):
            self.agent = mock_agent
            self.spy = _consumer_spy

        async def process_mention(
            self, mention_text: str, author_handle: str = "test.user"
        ) -> Response:
            """Run the agent on one mention, passing the author handle as deps."""
            run = await self.agent.run(
                f"\nNew message from @{author_handle}: {mention_text}",
                deps={"author_handle": author_handle},
            )
            return run.output

    return FeedConsumerTestAgent()
325
326
# Module-level, like the other spies, so the autouse reset fixture below can
# reach it. The agent fixture is session-scoped, so a spy created inside it
# would keep accumulating calls from one test to the next.
_empty_consumer_spy = ToolCallSpy()


@pytest.fixture(scope="session")
def feed_consumer_agent_empty(settings):
    """Test agent where read_timeline returns the empty-timeline message.

    Only `read_timeline` and `list_feeds` are registered, and both record into
    `_empty_consumer_spy`, exposed as ``.spy`` on the returned object. Skips
    the requesting test when no Anthropic API key is configured.
    """
    if not settings.anthropic_api_key:
        pytest.skip("Requires ANTHROPIC_API_KEY")

    # Export the key only when the environment variable is unset or empty
    # (the key itself is known to be set past the skip guard).
    if not os.environ.get("ANTHROPIC_API_KEY"):
        os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key

    personality = Path(settings.personality_file).read_text()

    agent = Agent[dict, Response](
        name="phi",
        model="anthropic:claude-haiku-4-5-20251001",
        system_prompt=f"{personality}\n\n{_FEED_CONSUMPTION_INSTRUCTIONS}",
        output_type=Response,
        deps_type=dict,
    )

    # Tool names, parameters and docstrings are kept verbatim: they are
    # presumably what the model is shown as the tool definitions.
    @agent.tool
    async def read_timeline(ctx: RunContext[dict], limit: int = 20) -> str:
        """Read your 'following' timeline — posts from accounts you follow."""
        _empty_consumer_spy.record("read_timeline", limit=limit)
        return CANNED_EMPTY_TIMELINE

    @agent.tool
    async def list_feeds(ctx: RunContext[dict]) -> str:
        """List your existing graze-powered feeds."""
        _empty_consumer_spy.record("list_feeds")
        return "no graze feeds found"

    class EmptyConsumerTestAgent:
        def __init__(self):
            self.agent = agent
            self.spy = _empty_consumer_spy

        async def process_mention(
            self, mention_text: str, author_handle: str = "test.user"
        ) -> Response:
            """Run the agent on one mention, passing the author handle as deps."""
            prompt = f"\nNew message from @{author_handle}: {mention_text}"
            result = await self.agent.run(prompt, deps={"author_handle": author_handle})
            return result.output

    return EmptyConsumerTestAgent()


@pytest.fixture(autouse=True)
def _reset_feed_spy():
    """Reset the tool call spies before each test."""
    # All three spies outlive a single test (module-level objects used by
    # session-scoped agents), so each must be cleared here.
    _feed_spy.reset()
    _consumer_spy.reset()
    _empty_consumer_spy.reset()
380
381
@pytest.fixture
def evaluate_response() -> Callable[[str, str], Awaitable[None]]:
    """Provide an async LLM-as-judge check.

    The returned coroutine function takes ``(criteria, response)``, asks a
    judge agent for an `EvaluationResult`, and raises ``AssertionError`` with
    the judge's explanation when the verdict is not a pass.
    """

    async def _evaluate(criteria: str, response: str) -> None:
        # Criteria and response are embedded in the system prompt; the user
        # message is just the fixed trigger below.
        judge_prompt = (
            "Evaluate if this response meets the criteria. Be lenient — "
            "examples in the criteria are illustrative, not exhaustive. "
            "Pass if the response makes a reasonable attempt at the intent.\n\n"
            f"Criteria: {criteria}\n\nResponse: {response}"
        )
        judge = Agent(
            model="anthropic:claude-sonnet-4-6",
            output_type=EvaluationResult,
            system_prompt=judge_prompt,
        )
        verdict = (await judge.run("Evaluate.")).output
        if verdict.passed:
            return
        raise AssertionError(f"{verdict.explanation}\n\nResponse: {response}")

    return _evaluate
Configure Feed

Configure Feed