A 5e storytelling engine with an LLM DM

Add prompt caching for ~80% input token savings

Implements Anthropic's prompt caching to reduce API costs. The system
now sets three cache breakpoints (see the sketch after this list):

- Tools (cache_control on last tool definition)
- System prompt split into base prompt + context blocks
- Messages (cache_control on last message each turn)
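For illustration, here is a minimal sketch of that breakpoint layout as a raw API call, assuming the standard `anthropic` SDK; `BASE_PROMPT`, `CONTEXT`, and `TOOLS` are hypothetical stand-ins, not storied's actual values:

```python
import anthropic

BASE_PROMPT = "You are the DM for a 5e campaign..."  # hypothetical; stable across turns
CONTEXT = "## Location: ..."                         # hypothetical session context
TOOLS = [{                                           # hypothetical tool definition
    "name": "roll_dice",
    "description": "Roll dice using standard notation.",
    "input_schema": {
        "type": "object",
        "properties": {"dice": {"type": "string"}},
        "required": ["dice"],
    },
}]

# Breakpoint 1: cache_control on the last tool definition
TOOLS[-1]["cache_control"] = {"type": "ephemeral"}

client = anthropic.Anthropic()
response = client.messages.create(
    model="claude-sonnet-4-5-20250929",
    max_tokens=1024,
    tools=TOOLS,
    # Breakpoint 2: system prompt split into base + context blocks
    system=[
        {"type": "text", "text": BASE_PROMPT, "cache_control": {"type": "ephemeral"}},
        {"type": "text", "text": CONTEXT, "cache_control": {"type": "ephemeral"}},
    ],
    # Breakpoint 3: cache_control on the last message each turn
    messages=[{
        "role": "user",
        "content": [{
            "type": "text",
            "text": "I search the room.",
            "cache_control": {"type": "ephemeral"},
        }],
    }],
)
print(response.usage)  # includes cache_creation_input_tokens / cache_read_input_tokens
```

Everything before the last breakpoint is a stable prefix, so subsequent turns should mostly hit the cache.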

Also switches default model to Sonnet 4.5 since it has a lower cache
minimum (1,024 tokens vs Haiku's 4,096).
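As a rough sanity check on the headline figure, assuming Anthropic's published multipliers (5-minute cache writes bill at ~1.25x the base input rate, cache reads at ~0.1x) and illustrative token counts:

```python
# Illustrative numbers, not measured from storied
prompt_tokens = 10_000  # cached prefix: tools + system prompt + history
new_tokens = 500        # uncached tail of the latest turn

uncached = prompt_tokens + new_tokens      # cost in base-rate token units
cached = prompt_tokens * 0.1 + new_tokens  # steady-state cache hit
print(f"{1 - cached / uncached:.0%}")      # -> 86%
```

The occasional cache write at ~1.25x pulls the average down somewhat, hence "~80%".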

Adds a `--debug` flag to `storied play` that shows token usage after each
response, including cache hits and writes. The `/context` command now also
shows cache stats in its session totals.
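With made-up numbers, a cache-hit turn's debug line (per the format strings in `cli.py` below) looks something like:

```
in: 42 +9,813 cached | out: 512 | total: 31,204 in / 4,876 out
```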

See https://platform.claude.com/docs/en/build-with-claude/prompt-caching

馃 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

+120 -11

src/storied/cli.py +41 -1
```diff
@@ -352,7 +352,19 @@
         if stats["total_input"] > 0:
             console.print()
             console.print("[dim]Session totals:[/dim]")
-            console.print(f"  [dim]Input: {stats['total_input']:,} · Output: {stats['total_output']:,}[/dim]")
+            cache_read = stats.get("total_cache_read", 0)
+            cache_create = stats.get("total_cache_creation", 0)
+            if cache_read > 0 or cache_create > 0:
+                console.print(
+                    f"  [dim]Input: {stats['total_input']:,} "
+                    f"(+{cache_read:,} cached, {cache_create:,} written) · "
+                    f"Output: {stats['total_output']:,}[/dim]"
+                )
+            else:
+                console.print(
+                    f"  [dim]Input: {stats['total_input']:,} · "
+                    f"Output: {stats['total_output']:,}[/dim]"
+                )
 
         console.print()
         continue
@@ -375,6 +387,29 @@
                     live.update(build_display(parts))
 
         console.print()
+
+        # Show debug token info if enabled
+        if args.debug:
+            usage = engine.last_usage
+            uncached = usage.get("input_tokens", 0)
+            cache_write = usage.get("cache_creation_input_tokens", 0)
+            cache_read = usage.get("cache_read_input_tokens", 0)
+            output = usage.get("output_tokens", 0)
+
+            # Format the debug line
+            if cache_write > 0:
+                in_part = f"in: {uncached:,} (writing {cache_write:,} to cache)"
+            elif cache_read > 0:
+                in_part = f"in: {uncached:,} +{cache_read:,} cached"
+            else:
+                in_part = f"in: {uncached:,}"
+
+            total_in = engine.total_input_tokens + engine.total_cache_read_tokens
+            total_out = engine.total_output_tokens
+            console.print(
+                f"[dim]{in_part} | out: {output:,} | "
+                f"total: {total_in:,} in / {total_out:,} out[/dim]"
+            )
 
         # Check if session ended (player quit gracefully)
         if engine.session_ended:
@@ -496,6 +531,11 @@
     play_parser.add_argument(
         "--world", "-w",
         help="World ID to use for world-specific content",
+    )
+    play_parser.add_argument(
+        "--debug", "-d",
+        action="store_true",
+        help="Show token usage after each response",
     )
     play_parser.set_defaults(func=cmd_play)
```
src/storied/engine.py +79 -10
```diff
@@ -1,4 +1,5 @@
 """DM Engine - the agentic loop for running D&D sessions."""
 
+import copy
 from collections.abc import Iterator
 from pathlib import Path
@@ -33,7 +34,7 @@
         world_id: str = "default",
         player_id: str = "default",
         base_path: Path | None = None,
-        model: str = "claude-sonnet-4-20250514",
+        model: str = "claude-sonnet-4-5-20250929",
         prompt_name: str = "dm-system",
     ):
         """Initialize the DM engine.
@@ -52,10 +53,17 @@
         self.base_path = base_path or Path.cwd()
         self.messages: list[dict] = []
 
-        # Token tracking
-        self.last_usage: dict = {"input_tokens": 0, "output_tokens": 0}
+        # Token tracking (includes cache metrics)
+        self.last_usage: dict = {
+            "input_tokens": 0,
+            "output_tokens": 0,
+            "cache_creation_input_tokens": 0,
+            "cache_read_input_tokens": 0,
+        }
         self.total_input_tokens: int = 0
         self.total_output_tokens: int = 0
+        self.total_cache_creation_tokens: int = 0
+        self.total_cache_read_tokens: int = 0
 
         # Session end flag (set when end_session tool is called)
         self.session_ended: bool = False
@@ -67,7 +75,27 @@
         self._prompt_name = prompt_name
         self._base_prompt = load_prompt(prompt_name)
         self._context_parts: dict[str, str] = {}
-        self.system_prompt = self._base_prompt + "\n\n" + self._build_context()
+        context = self._build_context()
+
+        # System prompt as content blocks with cache breakpoints
+        self._system_blocks: list[dict] = [
+            {
+                "type": "text",
+                "text": self._base_prompt,
+                "cache_control": {"type": "ephemeral"},
+            },
+        ]
+        if context:
+            self._system_blocks.append(
+                {
+                    "type": "text",
+                    "text": context,
+                    "cache_control": {"type": "ephemeral"},
+                }
+            )
+
+        # Tools with cache control on last tool
+        self._cached_tools = self._get_tools_with_cache()
 
     def _build_context(self) -> str:
         """Build context string for system prompt.
@@ -155,6 +183,32 @@
         return f"## {entity_type}: {name}\n\n{body}"
 
     @staticmethod
+    def _get_tools_with_cache() -> list[dict]:
+        """Get tool definitions with cache_control on the last tool."""
+        tools = copy.deepcopy(TOOL_DEFINITIONS)
+        if tools:
+            tools[-1]["cache_control"] = {"type": "ephemeral"}
+        return tools
+
+    def _prepare_messages_for_cache(self) -> list[dict]:
+        """Prepare messages with cache_control on the last message."""
+        if not self.messages:
+            return []
+
+        messages = copy.deepcopy(self.messages)
+        last = messages[-1]
+
+        # Ensure content is a list of blocks
+        if isinstance(last["content"], str):
+            last["content"] = [{"type": "text", "text": last["content"]}]
+
+        # Add cache_control to last block
+        if last["content"]:
+            last["content"][-1]["cache_control"] = {"type": "ephemeral"}
+
+        return messages
+
+    @staticmethod
     def _estimate_tokens(text: str) -> int:
         """Rough token estimate (~4 chars per token)."""
         return len(text) // 4
@@ -174,8 +228,9 @@
         """
         # Model context limits (approximate)
         model_limits = {
-            "claude-sonnet-4-20250514": 200_000,
-            "claude-opus-4-20250514": 200_000,
+            "claude-haiku-4-5-20251001": 200_000,
+            "claude-sonnet-4-5-20250929": 200_000,
+            "claude-opus-4-5-20251101": 200_000,
         }
         model_limit = model_limits.get(self.model, 200_000)
 
@@ -205,8 +260,12 @@
             "estimated_remaining": model_limit - estimated_total,
             "last_input": self.last_usage.get("input_tokens", 0),
             "last_output": self.last_usage.get("output_tokens", 0),
+            "last_cache_creation": self.last_usage.get("cache_creation_input_tokens", 0),
+            "last_cache_read": self.last_usage.get("cache_read_input_tokens", 0),
             "total_input": self.total_input_tokens,
             "total_output": self.total_output_tokens,
+            "total_cache_creation": self.total_cache_creation_tokens,
+            "total_cache_read": self.total_cache_read_tokens,
         }
 
     def process_action(self, player_input: str) -> str:
@@ -239,9 +298,9 @@
         with self.client.messages.stream(
             model=self.model,
             max_tokens=4096,
-            system=self.system_prompt,
-            tools=TOOL_DEFINITIONS,
-            messages=self.messages,
+            system=self._system_blocks,
+            tools=self._cached_tools,
+            messages=self._prepare_messages_for_cache(),
         ) as stream:
             for event in stream:
                 if event.type == "content_block_start":
@@ -279,14 +338,24 @@
             # Get the final message for conversation history
             final_message = stream.get_final_message()
 
-            # Track token usage
+            # Track token usage (including cache metrics)
            if final_message.usage:
+                cache_creation = getattr(
+                    final_message.usage, "cache_creation_input_tokens", 0
+                ) or 0
+                cache_read = getattr(
+                    final_message.usage, "cache_read_input_tokens", 0
+                ) or 0
                 self.last_usage = {
                     "input_tokens": final_message.usage.input_tokens,
                     "output_tokens": final_message.usage.output_tokens,
+                    "cache_creation_input_tokens": cache_creation,
+                    "cache_read_input_tokens": cache_read,
                 }
                 self.total_input_tokens += final_message.usage.input_tokens
                 self.total_output_tokens += final_message.usage.output_tokens
+                self.total_cache_creation_tokens += cache_creation
+                self.total_cache_read_tokens += cache_read
 
         # Build assistant content for conversation history
         for block in final_message.content:
```
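A hedged sketch of the turn-over-turn usage pattern this should produce; the engine class name `DMEngine` is an assumption, since the diff doesn't show it:

```python
from storied.engine import DMEngine  # class name assumed for illustration

engine = DMEngine(world_id="default", player_id="default")

engine.process_action("I enter the tavern.")
# First turn: the stable prefix gets written to cache
print(engine.last_usage["cache_creation_input_tokens"])  # large
print(engine.last_usage["cache_read_input_tokens"])      # ~0

engine.process_action("I order an ale.")
# Later turns within the cache TTL: the prefix is read back at ~10% cost
print(engine.last_usage["cache_read_input_tokens"])      # large
print(engine.last_usage["cache_creation_input_tokens"])  # small (new tail)
```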