Remove custom Google content caching in favor of implicit caching

-23

tests/test_batch.py

···
 467  467       assert call_kwargs["client"] is mock_client
 468  468
 469  469
 470      - @pytest.mark.asyncio
 471      - @patch("think.batch.agenerate", new_callable=AsyncMock)
 472      - async def test_batch_cached_content_passthrough(mock_agenerate):
 473      -     """Test that cached_content is passed through to agenerate."""
 474      -     mock_agenerate.return_value = "Response"
 475      -
 476      -     batch = Batch(max_concurrent=5)
 477      -     req = batch.create(
 478      -         contents="Test",
 479      -         context="test.context",
 480      -         cached_content="my-cached-content",
 481      -     )
 482      -     batch.add(req)
 483      -
 484      -     results = []
 485      -     async for completed_req in batch.drain_batch():
 486      -         results.append(completed_req)
 487      -
 488      -     assert len(results) == 1
 489      -
 490      -     # Verify cached_content was passed through
 491      -     call_kwargs = mock_agenerate.call_args[1]
 492      -     assert call_kwargs["cached_content"] == "my-cached-content"

+11 -103

think/agents.py

···
  24   24   from pathlib import Path
  25   25   from typing import Any, Callable, Optional, TypedDict
  26   26
  27      - from google import genai
  28      - from google.genai import types
  29      -
  30   27   from think.cluster import cluster, cluster_period, cluster_span
  31   28   from think.muse import (
  32   29       compose_instructions,
···
 523  520       # Prompts
 524  521       prompt: str  # Final prompt (with template substitution)
 525  522       system_instruction: str
 526      -     system_prompt_name: str  # For cache key construction
      523 +     system_prompt_name: str  # For diagnostic output (dry-run)
 527  524
 528  525       # Output
 529  526       output_path: Optional[Path]
···
1068 1065       return {"processed": sorted(processed), "repairable": sorted(pending)}
1069 1066
1070 1067
1071      - def _get_or_create_cache(
1072      -     client: genai.Client,
1073      -     model: str,
1074      -     display_name: str,
1075      -     transcript: str,
1076      -     system_instruction: str,
1077      - ) -> str | None:
1078      -     """Return cache name for ``display_name`` or None if content too small.
1079      -
1080      -     Creates cache with ``transcript`` and provided system instruction if needed.
1081      -     Returns None if content is below estimated 2048 token minimum (~10k chars).
1082      -
1083      -     The cache contains the system instruction + transcript which are identical
1084      -     for all topics on the same day with the same system prompt, so display_name
1085      -     should include both day and system prompt name.
1086      -     """
1087      -     MIN_CACHE_CHARS = 10000  # Heuristic: ~4 chars/token → 2048 tokens ≈ 8k-10k chars
1088      -
1089      -     # Check existing caches first
1090      -     for c in client.caches.list():
1091      -         if c.model == model and c.display_name == display_name:
1092      -             return c.name
1093      -
1094      -     # Skip cache creation for small content
1095      -     if len(transcript) < MIN_CACHE_CHARS:
1096      -         return None
1097      -
1098      -     cache = client.caches.create(
1099      -         model=model,
1100      -         config=types.CreateCachedContentConfig(
1101      -             display_name=display_name,
1102      -             system_instruction=system_instruction,
1103      -             contents=[transcript],
1104      -             ttl="1800s",  # 30 minutes to accommodate multiple topic analyses
1105      -         ),
1106      -     )
1107      -     return cache.name
1108      -
1109      -
1110 1068   def generate_agent_output(
1111 1069       transcript: str,
1112 1070       prompt: str,
1113      -     api_key: str,
1114      -     cache_display_name: str | None = None,
1115 1071       name: str | None = None,
1116 1072       json_output: bool = False,
1117 1073       system_instruction: str | None = None,
···
1124 1080       Args:
1125 1081           transcript: Clustered transcript content (markdown format).
1126 1082           prompt: Agent prompt text.
1127      -         api_key: Google API key for caching.
1128      -         cache_display_name: Optional cache key for Google content caching.
1129      -             Should include system prompt name for proper cache isolation.
1130 1083           name: Agent name for token logging context.
1131 1084           json_output: If True, request JSON response format.
1132 1085           system_instruction: System instruction text. If None, loads default
···
1139 1092           Generated agent output content (markdown or JSON string), or
1140 1093           GenerateResult dict if return_result=True.
1141 1094       """
1142      -     from think.models import generate_with_result, resolve_provider
     1095 +     from think.models import generate_with_result
1143 1096
1144 1097       # Use provided system_instruction or fall back to default
1145 1098       if system_instruction is None:
···
1157 1110
1158 1111       context = key_to_context(name) if name else "muse.system.unknown"
1159 1112
1160      -     # Try to use cache if display name provided
1161      -     # Note: caching is Google-specific, so we check provider first
1162      -     provider, model = resolve_provider(context)
1163      -
1164      -     client = None
1165      -     cache_name = None
1166      -     if cache_display_name and provider == "google":
1167      -         client = genai.Client(
1168      -             api_key=api_key,
1169      -             http_options=types.HttpOptions(retry_options=types.HttpRetryOptions()),
1170      -         )
1171      -         cache_name = _get_or_create_cache(
1172      -             client, model, cache_display_name, transcript, system_instruction
1173      -         )
1174      -
1175      -     if cache_name:
1176      -         # Cache hit: content already in cache, just send prompt.
1177      -         # Google-specific params (cached_content, client) are passed via kwargs.
1178      -         result = generate_with_result(
1179      -             contents=[prompt],
1180      -             context=context,
1181      -             temperature=0.3,
1182      -             max_output_tokens=max_output_tokens,
1183      -             thinking_budget=thinking_budget,
1184      -             model=model,
1185      -             cached_content=cache_name,
1186      -             client=client,
1187      -             json_output=json_output,
1188      -         )
1189      -     else:
1190      -         # No cache: use unified generate()
1191      -         result = generate_with_result(
1192      -             contents=[transcript, prompt],
1193      -             context=context,
1194      -             temperature=0.3,
1195      -             max_output_tokens=max_output_tokens,
1196      -             thinking_budget=thinking_budget,
1197      -             system_instruction=system_instruction,
1198      -             json_output=json_output,
1199      -         )
     1113 +     result = generate_with_result(
     1114 +         contents=[transcript, prompt],
     1115 +         context=context,
     1116 +         temperature=0.3,
     1117 +         max_output_tokens=max_output_tokens,
     1118 +         thinking_budget=thinking_budget,
     1119 +         system_instruction=system_instruction,
     1120 +         json_output=json_output,
     1121 +     )
1200 1122
1201 1123       if return_result:
1202 1124           return result
···
1245 1167       transcript = inputs.transcript
1246 1168       prompt = inputs.prompt
1247 1169       system_instruction = inputs.system_instruction
1248      -     system_prompt_name = inputs.system_prompt_name
1249 1170       output_path = inputs.output_path
1250 1171       output_format = inputs.output_format
1251 1172       meta = inputs.meta
···
1256 1177       if output_path:
1257 1178           output_exists = output_path.exists() and output_path.stat().st_size > 0
1258 1179
1259      -     # Determine cache settings (only for day-based, non-span requests)
1260      -     cache_display_name = None
1261      -     if day and not span_mode:
1262      -         if segment:
1263      -             cache_display_name = f"{system_prompt_name}_{day}_{segment}"
1264      -         else:
1265      -             cache_display_name = f"{system_prompt_name}_{day}"
1266      -
1267 1180       # Extract generation parameters from metadata
1268 1181       meta_thinking_budget = meta.get("thinking_budget")
1269 1182       meta_max_output_tokens = meta.get("max_output_tokens")
1270      -
1271      -     # Get API key
1272      -     api_key = os.getenv("GOOGLE_API_KEY", "")
1273 1183
1274 1184       usage_data = None
1275 1185
···
1336 1246           gen_result = generate_agent_output(
1337 1247               transcript,
1338 1248               prompt,
1339      -             api_key,
1340      -             cache_display_name=cache_display_name,
1341 1249               name=name,
1342 1250               json_output=is_json_output,
1343 1251               system_instruction=system_instruction,

-7

think/batch.py

···
  20   20
  21   21   Provider-specific features:
  22   22   - client: Optional client for connection reuse (Google only, others use singletons)
  23      - - cached_content: Content caching (Google only)
  24   23   """
  25   24
  26   25   import asyncio
···
  54   53           system_instruction: Optional[str] = None,
  55   54           json_output: bool = False,
  56   55           thinking_budget: Optional[int] = None,
  57      -         cached_content: Optional[str] = None,
  58   56           timeout_s: Optional[float] = None,
  59   57       ):
  60   58           self.contents = contents
···
  65   63           self.system_instruction = system_instruction
  66   64           self.json_output = json_output
  67   65           self.thinking_budget = thinking_budget
  68      -         self.cached_content = cached_content
  69   66           self.timeout_s = timeout_s
  70   67
  71   68           # Populated after execution
···
 128  125           system_instruction: Optional[str] = None,
 129  126           json_output: bool = False,
 130  127           thinking_budget: Optional[int] = None,
 131      -         cached_content: Optional[str] = None,
 132  128           timeout_s: Optional[float] = None,
 133  129       ) -> BatchRequest:
 134  130           """
···
 160  156               system_instruction=system_instruction,
 161  157               json_output=json_output,
 162  158               thinking_budget=thinking_budget,
 163      -             cached_content=cached_content,
 164  159               timeout_s=timeout_s,
 165  160           )
 166  161
···
 256  251           kwargs: dict = {}
 257  252           if self.client is not None:
 258  253               kwargs["client"] = self.client
 259      -         if request.cached_content is not None:
 260      -             kwargs["cached_content"] = request.cached_content
 261  254           if request.model is not None:
 262  255               kwargs["model"] = request.model
 263  256

+1 -9

think/providers/google.py

···
  26   26   timeout_s : float, optional
  27   27       Request timeout in seconds.
  28   28   **kwargs
  29      -     Provider-specific options (cached_content, client).
       29 +     Provider-specific options (client).
  30   30   """
  31   31
  32   32   from __future__ import annotations
···
 112  112       system_instruction: str | None,
 113  113       json_output: bool,
 114  114       thinking_budget: int | None,
 115      -     cached_content: str | None,
 116  115       timeout_s: float | None = None,
 117  116   ) -> types.GenerateContentConfig:
 118  117       """Build the GenerateContentConfig.
···
 139  138           config_args["thinking_config"] = types.ThinkingConfig(
 140  139               thinking_budget=thinking_budget
 141  140           )
 142      -
 143      -     if cached_content:
 144      -         config_args["cached_content"] = cached_content
 145  141
 146  142       if timeout_s:
 147  143           # Convert seconds to milliseconds for the SDK
···
 366  362       Returns GenerateResult with text, usage, finish_reason, and thinking.
 367  363       See module docstring for parameter details.
 368  364       """
 369      -     cached_content = kwargs.get("cached_content")
 370  365       client = kwargs.get("client")
 371  366
 372  367       client = get_or_create_client(client)
···
 378  373           system_instruction=system_instruction,
 379  374           json_output=json_output,
 380  375           thinking_budget=thinking_budget,
 381      -         cached_content=cached_content,
 382  376           timeout_s=timeout_s,
 383  377       )
 384  378
···
 412  406       Returns GenerateResult with text, usage, finish_reason, and thinking.
 413  407       See module docstring for parameter details.
 414  408       """
 415      -     cached_content = kwargs.get("cached_content")
 416  409       client = kwargs.get("client")
 417  410
 418  411       client = get_or_create_client(client)
···
 424  417           system_instruction=system_instruction,
 425  418           json_output=json_output,
 426  419           thinking_budget=thinking_budget,
 427      -         cached_content=cached_content,
 428  420           timeout_s=timeout_s,
 429  421       )
 430  422

Configure Feed

Configure Feed