personal memory agent
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Add provider fallback for agent system

When an agent's configured AI provider is down (per health/agents.json),
the system now transparently retries with the backup provider:

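- Pre-flight swap in prepare_config() using cached health data (only when every cached result for the primary provider failed and the backup provider's API key is set)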
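- On-failure retry in _execute_with_tools() and _execute_generate() for retryable (non-local) errors only; skipped if a pre-flight swap already happened, and if the backup also fails the original error is re-raised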
- FallbackEvent emitted to JSONL stream when swaps happen (reason is "preflight" or "on_failure")
- Background health re-check requested when data is stale (>1h)
- provider= override added to generate_with_result() for retry path

+796 -10
+55
tests/fixtures/journal/health/agents.json
··· 1 + { 2 + "results": [ 3 + { 4 + "provider": "google", 5 + "tier": "standard", 6 + "model": "gemini-2.5-flash", 7 + "interface": "generate", 8 + "ok": false, 9 + "message": "Connection error", 10 + "elapsed_s": 5.2 11 + }, 12 + { 13 + "provider": "google", 14 + "tier": "standard", 15 + "model": "gemini-2.5-flash", 16 + "interface": "cogitate", 17 + "ok": false, 18 + "message": "Connection error", 19 + "elapsed_s": 5.1 20 + }, 21 + { 22 + "provider": "google", 23 + "tier": "premium", 24 + "model": "gemini-2.5-pro", 25 + "interface": "generate", 26 + "ok": false, 27 + "message": "Connection error", 28 + "elapsed_s": 5.3 29 + }, 30 + { 31 + "provider": "anthropic", 32 + "tier": "standard", 33 + "model": "claude-sonnet-4-5-20250929", 34 + "interface": "generate", 35 + "ok": true, 36 + "message": "ok", 37 + "elapsed_s": 1.2 38 + }, 39 + { 40 + "provider": "anthropic", 41 + "tier": "standard", 42 + "model": "claude-sonnet-4-5-20250929", 43 + "interface": "cogitate", 44 + "ok": true, 45 + "message": "ok", 46 + "elapsed_s": 2.1 47 + } 48 + ], 49 + "summary": { 50 + "total": 5, 51 + "passed": 2, 52 + "failed": 3 53 + }, 54 + "checked_at": "2026-02-12T12:00:00+00:00" 55 + }
+460
tests/test_agent_fallback.py
··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + import asyncio 5 + import json 6 + from datetime import datetime, timedelta, timezone 7 + from io import StringIO 8 + from types import SimpleNamespace 9 + from unittest.mock import MagicMock 10 + 11 + import pytest 12 + 13 + from think.agents import _is_retryable_error 14 + from think.models import ( 15 + BACKUP_PROVIDER, 16 + get_backup_provider, 17 + is_provider_healthy, 18 + should_recheck_health, 19 + ) 20 + 21 + 22 + def test_is_provider_healthy_all_failed(): 23 + health_data = { 24 + "results": [ 25 + {"provider": "google", "ok": False}, 26 + {"provider": "google", "ok": False}, 27 + ] 28 + } 29 + assert is_provider_healthy("google", health_data) is False 30 + 31 + 32 + def test_is_provider_healthy_some_passed(): 33 + health_data = { 34 + "results": [ 35 + {"provider": "google", "ok": False}, 36 + {"provider": "google", "ok": True}, 37 + ] 38 + } 39 + assert is_provider_healthy("google", health_data) is True 40 + 41 + 42 + def test_is_provider_healthy_no_data(): 43 + assert is_provider_healthy("google", None) is True 44 + 45 + 46 + def test_is_provider_healthy_no_results_for_provider(): 47 + health_data = {"results": [{"provider": "anthropic", "ok": False}]} 48 + assert is_provider_healthy("google", health_data) is True 49 + 50 + 51 + def test_should_recheck_health_stale(): 52 + checked_at = (datetime.now(timezone.utc) - timedelta(hours=2)).isoformat() 53 + health_data = {"checked_at": checked_at} 54 + assert should_recheck_health(health_data) is True 55 + 56 + 57 + def test_should_recheck_health_fresh(): 58 + checked_at = (datetime.now(timezone.utc) - timedelta(minutes=10)).isoformat() 59 + health_data = {"checked_at": checked_at} 60 + assert should_recheck_health(health_data) is False 61 + 62 + 63 + def test_get_backup_provider_from_config(monkeypatch): 64 + monkeypatch.setattr( 65 + "think.models.get_config", 66 + lambda: {"providers": {"backup": {"provider": 
"openai"}}}, 67 + ) 68 + assert get_backup_provider() == "openai" 69 + 70 + 71 + def test_get_backup_provider_fallback_constant(monkeypatch): 72 + monkeypatch.setattr("think.models.get_config", lambda: {}) 73 + assert get_backup_provider() == BACKUP_PROVIDER 74 + 75 + 76 + def test_get_backup_provider_none_when_same_as_config_default(monkeypatch): 77 + monkeypatch.setattr( 78 + "think.models.get_config", 79 + lambda: { 80 + "providers": { 81 + "default": {"provider": "openai"}, 82 + "backup": {"provider": "openai"}, 83 + } 84 + }, 85 + ) 86 + assert get_backup_provider() is None 87 + 88 + 89 + def _mock_base_agent_config() -> dict: 90 + return { 91 + "type": "cogitate", 92 + "path": None, 93 + "sources": {}, 94 + "system_instruction": "", 95 + "user_instruction": "", 96 + "prompt": "", 97 + "disabled": False, 98 + } 99 + 100 + 101 + def _patch_prepare_config_dependencies(monkeypatch): 102 + monkeypatch.setattr( 103 + "think.muse.get_agent", lambda *args, **kwargs: _mock_base_agent_config() 104 + ) 105 + monkeypatch.setattr( 106 + "think.muse.key_to_context", lambda _name: "muse.system.default" 107 + ) 108 + monkeypatch.setattr( 109 + "think.models.resolve_provider", 110 + lambda _context: ("google", "gemini-3-flash-preview"), 111 + ) 112 + 113 + 114 + def test_preflight_swap_unhealthy_primary(monkeypatch): 115 + from think.agents import prepare_config 116 + 117 + _patch_prepare_config_dependencies(monkeypatch) 118 + monkeypatch.setattr( 119 + "think.models.load_health_status", 120 + lambda: {"results": [{"provider": "google", "ok": False}]}, 121 + ) 122 + monkeypatch.setattr("think.models.should_recheck_health", lambda _h: False) 123 + monkeypatch.setattr("think.models.get_backup_provider", lambda: "anthropic") 124 + monkeypatch.setattr( 125 + "think.models.resolve_model_for_provider", 126 + lambda _context, _provider: "claude-sonnet-4-5", 127 + ) 128 + monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key") 129 + 130 + config = prepare_config({"name": "default", 
"prompt": "hello"}) 131 + 132 + assert config["provider"] == "anthropic" 133 + assert config["model"] == "claude-sonnet-4-5" 134 + assert config["fallback_from"] == "google" 135 + 136 + 137 + def test_preflight_no_swap_healthy_primary(monkeypatch): 138 + from think.agents import prepare_config 139 + 140 + _patch_prepare_config_dependencies(monkeypatch) 141 + monkeypatch.setattr( 142 + "think.models.load_health_status", 143 + lambda: {"results": [{"provider": "google", "ok": True}]}, 144 + ) 145 + monkeypatch.setattr("think.models.should_recheck_health", lambda _h: False) 146 + 147 + config = prepare_config({"name": "default", "prompt": "hello"}) 148 + 149 + assert config["provider"] == "google" 150 + assert "fallback_from" not in config 151 + 152 + 153 + def test_preflight_no_swap_no_backup_key(monkeypatch): 154 + from think.agents import prepare_config 155 + 156 + _patch_prepare_config_dependencies(monkeypatch) 157 + monkeypatch.setattr( 158 + "think.models.load_health_status", 159 + lambda: {"results": [{"provider": "google", "ok": False}]}, 160 + ) 161 + monkeypatch.setattr("think.models.should_recheck_health", lambda _h: False) 162 + monkeypatch.setattr("think.models.get_backup_provider", lambda: "anthropic") 163 + monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False) 164 + 165 + config = prepare_config({"name": "default", "prompt": "hello"}) 166 + 167 + assert config["provider"] == "google" 168 + assert "fallback_from" not in config 169 + 170 + 171 + def test_on_failure_retry_cogitate(monkeypatch): 172 + from think.agents import _execute_with_tools 173 + 174 + events = [] 175 + attempts = {"primary": 0, "backup": 0} 176 + 177 + async def fail_cogitate(*_args, **_kwargs): 178 + attempts["primary"] += 1 179 + raise RuntimeError("primary down") 180 + 181 + async def pass_cogitate(*_args, **kwargs): 182 + attempts["backup"] += 1 183 + on_event = kwargs.get("on_event") 184 + if on_event: 185 + on_event({"event": "finish", "result": "backup result"}) 186 + return 
"backup result" 187 + 188 + monkeypatch.setattr( 189 + "think.providers.PROVIDER_REGISTRY", {"google": "x", "anthropic": "y"} 190 + ) 191 + monkeypatch.setattr( 192 + "think.providers.get_provider_module", 193 + lambda provider: SimpleNamespace( 194 + run_cogitate=fail_cogitate if provider == "google" else pass_cogitate 195 + ), 196 + ) 197 + monkeypatch.setattr("think.models.get_backup_provider", lambda: "anthropic") 198 + monkeypatch.setattr( 199 + "think.models.resolve_model_for_provider", 200 + lambda _context, _provider: "claude-sonnet-4-5", 201 + ) 202 + monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key") 203 + 204 + config = { 205 + "provider": "google", 206 + "model": "gemini-3-flash-preview", 207 + "health_stale": False, 208 + "context": "muse.system.default", 209 + } 210 + 211 + asyncio.run(_execute_with_tools(config, events.append)) 212 + 213 + assert attempts["primary"] == 1 214 + assert attempts["backup"] == 1 215 + assert config["provider"] == "anthropic" 216 + assert config["model"] == "claude-sonnet-4-5" 217 + assert config["fallback_from"] == "google" 218 + assert any(e.get("event") == "fallback" for e in events) 219 + 220 + 221 + def test_on_failure_retry_cogitate_uses_context_from_name(monkeypatch): 222 + from think.agents import _execute_with_tools 223 + 224 + events = [] 225 + seen = {} 226 + 227 + async def fail_cogitate(*_args, **_kwargs): 228 + raise RuntimeError("primary down") 229 + 230 + async def pass_cogitate(*_args, **kwargs): 231 + on_event = kwargs.get("on_event") 232 + if on_event: 233 + on_event({"event": "finish", "result": "backup result"}) 234 + return "backup result" 235 + 236 + def resolve_model(context, _provider): 237 + seen["context"] = context 238 + return "claude-sonnet-4-5" 239 + 240 + monkeypatch.setattr( 241 + "think.providers.PROVIDER_REGISTRY", {"google": "x", "anthropic": "y"} 242 + ) 243 + monkeypatch.setattr( 244 + "think.providers.get_provider_module", 245 + lambda provider: SimpleNamespace( 246 + 
run_cogitate=fail_cogitate if provider == "google" else pass_cogitate 247 + ), 248 + ) 249 + monkeypatch.setattr( 250 + "think.muse.key_to_context", 251 + lambda _name: "muse.system.default", 252 + ) 253 + monkeypatch.setattr("think.models.get_backup_provider", lambda: "anthropic") 254 + monkeypatch.setattr("think.models.resolve_model_for_provider", resolve_model) 255 + monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key") 256 + 257 + config = { 258 + "name": "default", 259 + "provider": "google", 260 + "model": "gemini-3-flash-preview", 261 + "health_stale": False, 262 + } 263 + 264 + asyncio.run(_execute_with_tools(config, events.append)) 265 + 266 + assert seen["context"] == "muse.system.default" 267 + 268 + 269 + def test_on_failure_retry_generate(monkeypatch): 270 + from think.agents import _execute_generate 271 + 272 + events = [] 273 + calls = {"count": 0} 274 + 275 + def mock_generate_with_result(**kwargs): 276 + calls["count"] += 1 277 + if calls["count"] == 1: 278 + raise RuntimeError("primary generate failed") 279 + assert kwargs.get("provider") == "anthropic" 280 + assert kwargs.get("model") == "claude-sonnet-4-5" 281 + return {"text": "backup text", "usage": {"input_tokens": 1, "output_tokens": 1}} 282 + 283 + monkeypatch.setattr( 284 + "think.muse.key_to_context", lambda _name: "muse.system.default" 285 + ) 286 + monkeypatch.setattr("think.models.generate_with_result", mock_generate_with_result) 287 + monkeypatch.setattr("think.models.get_backup_provider", lambda: "anthropic") 288 + monkeypatch.setattr( 289 + "think.models.resolve_model_for_provider", 290 + lambda _context, _provider: "claude-sonnet-4-5", 291 + ) 292 + monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key") 293 + 294 + config = { 295 + "name": "default", 296 + "provider": "google", 297 + "model": "gemini-3-flash-preview", 298 + "prompt": "hello", 299 + "health_stale": False, 300 + } 301 + 302 + asyncio.run(_execute_generate(config, events.append)) 303 + 304 + assert calls["count"] == 2 305 
+ assert config["provider"] == "anthropic" 306 + assert config["fallback_from"] == "google" 307 + assert any(e.get("event") == "fallback" for e in events) 308 + assert events[-1]["event"] == "finish" 309 + assert events[-1]["result"] == "backup text" 310 + 311 + 312 + def test_on_failure_no_retry_value_error(monkeypatch): 313 + from think.agents import _execute_generate 314 + 315 + events = [] 316 + assert _is_retryable_error(ValueError("bad input")) is False 317 + 318 + def bad_generate(**_kwargs): 319 + raise ValueError("bad input") 320 + 321 + monkeypatch.setattr( 322 + "think.muse.key_to_context", lambda _name: "muse.system.default" 323 + ) 324 + monkeypatch.setattr("think.models.generate_with_result", bad_generate) 325 + 326 + config = { 327 + "name": "default", 328 + "provider": "google", 329 + "model": "gemini-3-flash-preview", 330 + "prompt": "hello", 331 + "health_stale": False, 332 + } 333 + 334 + with pytest.raises(ValueError, match="bad input"): 335 + asyncio.run(_execute_generate(config, events.append)) 336 + 337 + assert not any(e.get("event") == "fallback" for e in events) 338 + 339 + 340 + def test_on_failure_both_fail_raises_original(monkeypatch): 341 + from think.agents import _execute_generate 342 + 343 + events = [] 344 + calls = {"count": 0} 345 + 346 + def always_fail(**kwargs): 347 + calls["count"] += 1 348 + if kwargs.get("provider") == "anthropic": 349 + raise RuntimeError("backup failed") 350 + raise RuntimeError("primary failed") 351 + 352 + monkeypatch.setattr( 353 + "think.muse.key_to_context", lambda _name: "muse.system.default" 354 + ) 355 + monkeypatch.setattr("think.models.generate_with_result", always_fail) 356 + monkeypatch.setattr("think.models.get_backup_provider", lambda: "anthropic") 357 + monkeypatch.setattr( 358 + "think.models.resolve_model_for_provider", 359 + lambda _context, _provider: "claude-sonnet-4-5", 360 + ) 361 + monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key") 362 + 363 + config = { 364 + "name": "default", 
365 + "provider": "google", 366 + "model": "gemini-3-flash-preview", 367 + "prompt": "hello", 368 + "health_stale": False, 369 + } 370 + 371 + with pytest.raises(RuntimeError, match="primary failed"): 372 + asyncio.run(_execute_generate(config, events.append)) 373 + 374 + assert calls["count"] == 2 375 + 376 + 377 + def test_fallback_event_emitted(): 378 + from think.agents import _run_agent 379 + 380 + events = [] 381 + config = { 382 + "type": "cogitate", 383 + "name": "default", 384 + "provider": "anthropic", 385 + "model": "claude-sonnet-4-5", 386 + "prompt": "hello", 387 + "fallback_from": "google", 388 + } 389 + 390 + asyncio.run(_run_agent(config, events.append, dry_run=True)) 391 + 392 + fallback_events = [e for e in events if e.get("event") == "fallback"] 393 + assert len(fallback_events) == 1 394 + assert fallback_events[0]["reason"] == "preflight" 395 + 396 + 397 + def test_recheck_requested_on_stale(monkeypatch): 398 + from think.agents import _execute_with_tools 399 + 400 + async def pass_cogitate(*_args, **kwargs): 401 + on_event = kwargs.get("on_event") 402 + if on_event: 403 + on_event({"event": "finish", "result": "ok"}) 404 + return "ok" 405 + 406 + recheck_mock = MagicMock() 407 + 408 + monkeypatch.setattr("think.providers.PROVIDER_REGISTRY", {"google": "x"}) 409 + monkeypatch.setattr( 410 + "think.providers.get_provider_module", 411 + lambda _provider: SimpleNamespace(run_cogitate=pass_cogitate), 412 + ) 413 + monkeypatch.setattr("think.models.request_health_recheck", recheck_mock) 414 + 415 + config = { 416 + "provider": "google", 417 + "model": "gemini-3-flash-preview", 418 + "health_stale": True, 419 + } 420 + 421 + asyncio.run(_execute_with_tools(config, lambda _e: None)) 422 + 423 + recheck_mock.assert_called_once() 424 + assert config["health_stale"] is False 425 + 426 + 427 + def test_main_async_no_duplicate_error_when_evented(monkeypatch, capsys): 428 + from think.agents import main_async 429 + 430 + ndjson_input = json.dumps({"name": 
"default", "prompt": "hello"}) 431 + monkeypatch.setattr("sys.stdin", StringIO(ndjson_input)) 432 + 433 + async def fake_run_agent(_config, emit_event, dry_run=False): 434 + emit_event({"event": "error", "error": "provider failed"}) 435 + exc = RuntimeError("provider failed") 436 + setattr(exc, "_evented", True) 437 + raise exc 438 + 439 + mock_args = MagicMock() 440 + mock_args.verbose = False 441 + mock_args.dry_run = False 442 + mock_args.subcommand = None 443 + 444 + monkeypatch.setattr("think.agents.setup_cli", lambda _parser: mock_args) 445 + monkeypatch.setattr( 446 + "think.agents.setup_logging", 447 + lambda _verbose=False: MagicMock(), 448 + ) 449 + monkeypatch.setattr( 450 + "think.agents.prepare_config", lambda _request: {"type": "cogitate"} 451 + ) 452 + monkeypatch.setattr("think.agents.validate_config", lambda _config: None) 453 + monkeypatch.setattr("think.agents._run_agent", fake_run_agent) 454 + 455 + asyncio.run(main_async()) 456 + 457 + lines = [line for line in capsys.readouterr().out.splitlines() if line.strip()] 458 + events = [json.loads(line) for line in lines] 459 + error_events = [event for event in events if event.get("event") == "error"] 460 + assert len(error_events) == 1
+172 -10
think/agents.py
··· 294 294 295 295 config["provider"] = provider 296 296 config["model"] = model 297 + config["context"] = context 298 + 299 + # --- Provider fallback: preflight swap if primary is unhealthy --- 300 + from think.models import ( 301 + get_backup_provider, 302 + is_provider_healthy, 303 + load_health_status, 304 + should_recheck_health, 305 + ) 306 + from think.providers import PROVIDER_METADATA 307 + 308 + health_data = load_health_status() 309 + config["health_stale"] = should_recheck_health(health_data) 310 + 311 + if not is_provider_healthy(provider, health_data): 312 + backup = get_backup_provider() 313 + if backup and backup != provider: 314 + env_key = PROVIDER_METADATA.get(backup, {}).get("env_key") 315 + if env_key and os.getenv(env_key): 316 + config["fallback_from"] = provider 317 + config["provider"] = backup 318 + config["model"] = resolve_model_for_provider(context, backup) 297 319 298 320 # Check if disabled 299 321 if config.get("disabled"): ··· 502 524 return event 503 525 504 526 527 + _NON_RETRYABLE_ERRORS = ( 528 + ValueError, 529 + json.JSONDecodeError, 530 + KeyError, 531 + TypeError, 532 + AttributeError, 533 + FileNotFoundError, 534 + PermissionError, 535 + NotImplementedError, 536 + ) 537 + 538 + 539 + def _is_retryable_error(exc: Exception) -> bool: 540 + """Check if an exception is likely a provider error worth retrying. 541 + 542 + Returns False for local/code errors (ValueError, KeyError, etc.). 543 + Returns True for everything else (SDK connection, timeout, server errors). 
544 + """ 545 + return not isinstance(exc, _NON_RETRYABLE_ERRORS) 546 + 547 + 505 548 async def _execute_with_tools( 506 549 config: dict, 507 550 emit_event: Callable[[dict], None], ··· 541 584 542 585 emit_event(data) 543 586 544 - await provider_mod.run_cogitate(config=config, on_event=agent_emit_event) 587 + try: 588 + await provider_mod.run_cogitate(config=config, on_event=agent_emit_event) 589 + except Exception as exc: 590 + if not _is_retryable_error(exc) or config.get("fallback_from"): 591 + raise 592 + from think.models import ( 593 + get_backup_provider, 594 + resolve_model_for_provider, 595 + ) 596 + from think.providers import PROVIDER_METADATA 597 + 598 + backup = get_backup_provider() 599 + if not backup or backup == provider: 600 + raise 601 + env_key = PROVIDER_METADATA.get(backup, {}).get("env_key") 602 + if not env_key or not os.getenv(env_key): 603 + raise 604 + 605 + context = config.get("context") 606 + if not context: 607 + from think.muse import key_to_context 608 + 609 + context = key_to_context(config.get("name", "default")) 610 + backup_model = resolve_model_for_provider(context, backup) 611 + 612 + emit_event( 613 + { 614 + "event": "fallback", 615 + "ts": now_ms(), 616 + "original_provider": provider, 617 + "backup_provider": backup, 618 + "reason": "on_failure", 619 + "error": str(exc), 620 + } 621 + ) 622 + 623 + config["fallback_from"] = provider 624 + config["provider"] = backup 625 + config["model"] = backup_model 626 + 627 + backup_mod = get_provider_module(backup) 628 + try: 629 + await backup_mod.run_cogitate(config=config, on_event=agent_emit_event) 630 + except Exception: 631 + raise exc 632 + finally: 633 + if config.get("health_stale"): 634 + from think.models import request_health_recheck 635 + 636 + request_health_recheck() 637 + config["health_stale"] = False 545 638 546 639 547 640 async def _execute_generate( ··· 589 682 contents = ["No input provided."] 590 683 591 684 context = key_to_context(name) 592 - gen_result = 
generate_with_result( 593 - contents=contents, 594 - context=context, 595 - temperature=0.3, 596 - max_output_tokens=max_output_tokens, 597 - thinking_budget=thinking_budget, 598 - system_instruction=system_instruction, 599 - json_output=is_json_output, 600 - ) 685 + try: 686 + gen_result = generate_with_result( 687 + contents=contents, 688 + context=context, 689 + temperature=0.3, 690 + max_output_tokens=max_output_tokens, 691 + thinking_budget=thinking_budget, 692 + system_instruction=system_instruction, 693 + json_output=is_json_output, 694 + ) 695 + except Exception as exc: 696 + if not _is_retryable_error(exc) or config.get("fallback_from"): 697 + raise 698 + from think.models import ( 699 + get_backup_provider, 700 + resolve_model_for_provider, 701 + ) 702 + from think.providers import PROVIDER_METADATA 703 + 704 + provider = config.get("provider", "google") 705 + backup = get_backup_provider() 706 + if not backup or backup == provider: 707 + raise 708 + env_key = PROVIDER_METADATA.get(backup, {}).get("env_key") 709 + if not env_key or not os.getenv(env_key): 710 + raise 711 + 712 + backup_model = resolve_model_for_provider(context, backup) 713 + 714 + emit_event( 715 + { 716 + "event": "fallback", 717 + "ts": now_ms(), 718 + "original_provider": provider, 719 + "backup_provider": backup, 720 + "reason": "on_failure", 721 + "error": str(exc), 722 + } 723 + ) 724 + 725 + config["fallback_from"] = provider 726 + config["provider"] = backup 727 + config["model"] = backup_model 728 + 729 + try: 730 + gen_result = generate_with_result( 731 + contents=contents, 732 + context=context, 733 + temperature=0.3, 734 + max_output_tokens=max_output_tokens, 735 + thinking_budget=thinking_budget, 736 + system_instruction=system_instruction, 737 + json_output=is_json_output, 738 + provider=backup, 739 + model=backup_model, 740 + ) 741 + except Exception: 742 + raise exc 743 + finally: 744 + if config.get("health_stale"): 745 + from think.models import request_health_recheck 
746 + 747 + request_health_recheck() 748 + config["health_stale"] = False 601 749 602 750 result = gen_result["text"] 603 751 usage_data = gen_result.get("usage") ··· 662 810 if config.get("chat_id"): 663 811 start_event["chat_id"] = config["chat_id"] 664 812 emit_event(start_event) 813 + 814 + # Emit preflight fallback event if provider was swapped 815 + if config.get("fallback_from"): 816 + emit_event( 817 + { 818 + "event": "fallback", 819 + "ts": now_ms(), 820 + "original_provider": config["fallback_from"], 821 + "backup_provider": config["provider"], 822 + "reason": "preflight", 823 + } 824 + ) 665 825 666 826 # Handle skip conditions 667 827 skip_reason = config.get("skip_reason") ··· 1035 1195 } 1036 1196 ) 1037 1197 except Exception as e: 1198 + if getattr(e, "_evented", False): 1199 + continue 1038 1200 emit_event( 1039 1201 { 1040 1202 "event": "error",
+97
think/models.py
··· 6 6 import json 7 7 import logging 8 8 import os 9 + import subprocess 9 10 import time 11 + from datetime import datetime, timezone 10 12 from pathlib import Path 11 13 from typing import Any, Dict, List, Optional, Union 12 14 ··· 995 997 return result["text"] 996 998 997 999 1000 + # --------------------------------------------------------------------------- 1001 + # Provider Health & Fallback Helpers 1002 + # --------------------------------------------------------------------------- 1003 + 1004 + 1005 + def get_backup_provider() -> Optional[str]: 1006 + """Get the backup provider from journal config, falling back to constant. 1007 + 1008 + Returns None if backup would be the same as the default provider. 1009 + """ 1010 + config = get_config() 1011 + providers_config = config.get("providers", {}) 1012 + default_section = providers_config.get("default", {}) 1013 + primary_provider = default_section.get("provider", DEFAULT_PROVIDER) 1014 + backup_section = providers_config.get("backup", {}) 1015 + backup = backup_section.get("provider", BACKUP_PROVIDER) 1016 + if backup == primary_provider: 1017 + return None 1018 + return backup 1019 + 1020 + 1021 + def load_health_status() -> Optional[dict]: 1022 + """Load health status from $JOURNAL_PATH/health/agents.json. 1023 + 1024 + Returns parsed dict or None if file is missing/unreadable. 1025 + """ 1026 + try: 1027 + health_path = Path(get_journal()) / "health" / "agents.json" 1028 + with open(health_path) as f: 1029 + return json.load(f) 1030 + except (FileNotFoundError, json.JSONDecodeError, OSError): 1031 + return None 1032 + 1033 + 1034 + def is_provider_healthy(provider: str, health_data: Optional[dict]) -> bool: 1035 + """Check if a provider is healthy based on health data. 
1036 + 1037 + Returns True (assume healthy) when: 1038 + - health_data is None (no data available) 1039 + - No results exist for the provider 1040 + - Any result for the provider has ok=True 1041 + 1042 + Returns False only when all results for the provider have ok=False. 1043 + """ 1044 + if health_data is None: 1045 + return True 1046 + results = health_data.get("results", []) 1047 + provider_results = [r for r in results if r.get("provider") == provider] 1048 + if not provider_results: 1049 + return True 1050 + return any(r.get("ok") for r in provider_results) 1051 + 1052 + 1053 + def should_recheck_health(health_data: Optional[dict]) -> bool: 1054 + """Check if health data is stale (>1 hour old). 1055 + 1056 + Returns False when health_data is None or on parse errors. 1057 + """ 1058 + if health_data is None: 1059 + return False 1060 + checked_at = health_data.get("checked_at") 1061 + if not checked_at: 1062 + return False 1063 + try: 1064 + checked_time = datetime.fromisoformat(checked_at) 1065 + if checked_time.tzinfo is None: 1066 + checked_time = checked_time.replace(tzinfo=timezone.utc) 1067 + age = datetime.now(timezone.utc) - checked_time 1068 + return age.total_seconds() > 3600 1069 + except (ValueError, TypeError): 1070 + return False 1071 + 1072 + 1073 + def request_health_recheck() -> None: 1074 + """Request a health re-check by spawning a background process. 1075 + 1076 + Fire-and-forget; errors are logged but never propagated. 
1077 + """ 1078 + try: 1079 + subprocess.Popen( 1080 + ["sol", "agents", "check"], 1081 + stdout=subprocess.DEVNULL, 1082 + stderr=subprocess.DEVNULL, 1083 + ) 1084 + except Exception: 1085 + logging.getLogger(__name__).debug( 1086 + "Failed to request health recheck", exc_info=True 1087 + ) 1088 + 1089 + 998 1090 def generate_with_result( 999 1091 contents: Union[str, List[Any]], 1000 1092 context: str, ··· 1020 1112 from think.providers import get_provider_module 1021 1113 1022 1114 model_override = kwargs.pop("model", None) 1115 + provider_override = kwargs.pop("provider", None) 1023 1116 1024 1117 provider, model = resolve_provider(context) 1118 + if provider_override: 1119 + provider = provider_override 1120 + if not model_override: 1121 + model = resolve_model_for_provider(context, provider) 1025 1122 if model_override: 1026 1123 model = model_override 1027 1124
+12
think/providers/shared.py
··· 109 109 raw: Optional[list[dict[str, Any]]] # Original provider JSON event(s) 110 110 111 111 112 + class FallbackEvent(TypedDict, total=False): 113 + """Event emitted when provider fallback occurs.""" 114 + 115 + event: Required[Literal["fallback"]] 116 + ts: Required[int] 117 + original_provider: Required[str] 118 + backup_provider: Required[str] 119 + reason: Required[str] # "preflight" or "on_failure" 120 + error: Optional[str] # Error message for on_failure case 121 + 122 + 112 123 Event = Union[ 113 124 ToolStartEvent, 114 125 ToolEndEvent, ··· 117 128 ErrorEvent, 118 129 ThinkingEvent, 119 130 AgentUpdatedEvent, 131 + FallbackEvent, 120 132 ] 121 133 122 134