personal memory agent
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

entity_observer: zero-thinking experiment + fix SDK field name

Run the tier2-flash-minimal-jsonl-nothink experiment, which tests the
absolute-floor config (minimal context, JSONL output, terse system prompt,
thinking_budget=0).
Fix the config field that carries ThinkingConfig: the SDK expects thinking_config, not thinking or thinkingConfig.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

+585
+95
scratch/entity-observer-prototype/results/tier2-flash-minimal-jsonl-nothink.json
··· 1 + { 2 + "label": "tier2-flash-minimal-jsonl-nothink", 3 + "config": { 4 + "facet": "solstone", 5 + "day": "20260414", 6 + "strategy": "minimal", 7 + "context_style": "compact", 8 + "system_key": "observer_v2_terse", 9 + "format_key": "jsonl", 10 + "model": "gemini-3-flash-preview", 11 + "thinking_budget": 0 12 + }, 13 + "context_stats": { 14 + "total_attached": 129, 15 + "active_count": 39, 16 + "estimated_tokens": 12909, 17 + "system_prompt_chars": 337, 18 + "user_prompt_chars": 46689 19 + }, 20 + "generation": { 21 + "elapsed_seconds": 5.49, 22 + "usage": { 23 + "prompt_tokens": 11439, 24 + "output_tokens": 647, 25 + "total_tokens": 12086, 26 + "thinking_tokens": null 27 + }, 28 + "output_chars": 2923, 29 + "finish_reason": "FinishReason.STOP" 30 + }, 31 + "parsing": { 32 + "parse_success": true, 33 + "observation_count": 8, 34 + "entity_count": 7, 35 + "errors": [] 36 + }, 37 + "evaluation": { 38 + "total_observations": 8, 39 + "entities_with_observations": 7, 40 + "duplicates": 0, 41 + "quality_flags": [], 42 + "quality_score": 1.0 43 + }, 44 + "raw_output": "{\"entity_id\": \"jeremie_miller\", \"entity_name\": \"Jeremie Miller\", \"content\": \"Advocates for a 'tenure-weighted customer standing' model in corporate governance, where long-term customers gain formal standing in covenant enforcement decisions.\", \"reasoning\": \"Jeremie explicitly proposed this concept for Sol PBC to align customer incentives with mission preservation.\"}\n{\"entity_id\": \"sol_pbc\", \"entity_name\": \"Sol PBC\", \"content\": \"Adopts a 'cessation-over-violation' principle, where the entity is structured to shut down rather than violate its core privacy and mission covenants.\", \"reasoning\": \"This governance failure mode was identified as a key architectural constraint for the PBC's legal framework.\"}\n{\"entity_id\": \"solstone\", \"entity_name\": \"Solstone\", \"content\": \"Utilizes a 'status-based routing' architecture for user onboarding, directing users to 
specific post-login states such as 'applied', 'approved', or 'revoked' based on their Scout portal status.\", \"reasoning\": \"This was documented as the core logic for the Scouts application and user entry flow.\"}\n{\"entity_id\": \"solstone\", \"entity_name\": \"Solstone\", \"content\": \"Architecture for data merging includes a 'staged review' workflow where high-confidence matches are auto-merged while ambiguous data is held in a 'staged_entities' directory for human-in-the-loop audit.\", \"reasoning\": \"Jeremie designed this 'sol transfer merge' architecture to handle the complexity of merging distributed journals.\"}\n{\"entity_id\": \"glenn_otis_brown\", \"entity_name\": \"Glenn Otis Brown\", \"content\": \"Acts as a strategic bridge between Solstone and the Electronic Frontier Foundation (EFF) community, specifically facilitating introductions to leadership like Cindy Cohn.\", \"reasoning\": \"Glenn's role shifted from general advisor to a specific relationship bridge for EFF/legal outreach.\"}\n{\"entity_id\": \"extro\", \"entity_name\": \"Extro\", \"content\": \"Employs a 'draft-approve-send' separation protocol for external communications to ensure an audit trail and maintain a consistent organizational voice.\", \"reasoning\": \"This protocol was codified in the CEO weekly reflection as a necessary governance fix for the organization.\"}\n{\"entity_id\": \"soulstone_observer\", \"entity_name\": \"Soulstone Observer\", \"content\": \"Requires local GUI trust authorization and full Xcode installation on macOS to facilitate self-signed code signing for local data capture services.\", \"reasoning\": \"Technical discovery during onboarding confirmed that headless or custom-keychain workarounds are insufficient for macOS security requirements.\"}\n{\"entity_id\": \"plaud\", \"entity_name\": \"Plaud\", \"content\": \"Lacks a public API for direct integration, requiring manual extraction of Cloud API tokens via browser developer tools to sync recordings into 
external systems like Solstone.\", \"reasoning\": \"Jeremie documented this specific workaround during a live onboarding and troubleshooting session.\"}", 45 + "observations": [ 46 + { 47 + "entity_id": "jeremie_miller", 48 + "entity_name": "Jeremie Miller", 49 + "content": "Advocates for a 'tenure-weighted customer standing' model in corporate governance, where long-term customers gain formal standing in covenant enforcement decisions.", 50 + "reasoning": "Jeremie explicitly proposed this concept for Sol PBC to align customer incentives with mission preservation." 51 + }, 52 + { 53 + "entity_id": "sol_pbc", 54 + "entity_name": "Sol PBC", 55 + "content": "Adopts a 'cessation-over-violation' principle, where the entity is structured to shut down rather than violate its core privacy and mission covenants.", 56 + "reasoning": "This governance failure mode was identified as a key architectural constraint for the PBC's legal framework." 57 + }, 58 + { 59 + "entity_id": "solstone", 60 + "entity_name": "Solstone", 61 + "content": "Utilizes a 'status-based routing' architecture for user onboarding, directing users to specific post-login states such as 'applied', 'approved', or 'revoked' based on their Scout portal status.", 62 + "reasoning": "This was documented as the core logic for the Scouts application and user entry flow." 63 + }, 64 + { 65 + "entity_id": "solstone", 66 + "entity_name": "Solstone", 67 + "content": "Architecture for data merging includes a 'staged review' workflow where high-confidence matches are auto-merged while ambiguous data is held in a 'staged_entities' directory for human-in-the-loop audit.", 68 + "reasoning": "Jeremie designed this 'sol transfer merge' architecture to handle the complexity of merging distributed journals." 
69 + }, 70 + { 71 + "entity_id": "glenn_otis_brown", 72 + "entity_name": "Glenn Otis Brown", 73 + "content": "Acts as a strategic bridge between Solstone and the Electronic Frontier Foundation (EFF) community, specifically facilitating introductions to leadership like Cindy Cohn.", 74 + "reasoning": "Glenn's role shifted from general advisor to a specific relationship bridge for EFF/legal outreach." 75 + }, 76 + { 77 + "entity_id": "extro", 78 + "entity_name": "Extro", 79 + "content": "Employs a 'draft-approve-send' separation protocol for external communications to ensure an audit trail and maintain a consistent organizational voice.", 80 + "reasoning": "This protocol was codified in the CEO weekly reflection as a necessary governance fix for the organization." 81 + }, 82 + { 83 + "entity_id": "soulstone_observer", 84 + "entity_name": "Soulstone Observer", 85 + "content": "Requires local GUI trust authorization and full Xcode installation on macOS to facilitate self-signed code signing for local data capture services.", 86 + "reasoning": "Technical discovery during onboarding confirmed that headless or custom-keychain workarounds are insufficient for macOS security requirements." 87 + }, 88 + { 89 + "entity_id": "plaud", 90 + "entity_name": "Plaud", 91 + "content": "Lacks a public API for direct integration, requiring manual extraction of Cloud API tokens via browser developer tools to sync recordings into external systems like Solstone.", 92 + "reasoning": "Jeremie documented this specific workaround during a live onboarding and troubleshooting session." 93 + } 94 + ] 95 + }
+490
scratch/entity-observer-prototype/run_experiment.py
#!/usr/bin/env python3
r"""Run entity_observer generate prototype experiments.

Tests different prompt strategies, output formats, and model tiers against
real pre-computed context from the journal. READ-ONLY on journal data;
writes results to scratch/entity-observer-prototype/results/.

Usage:
    cd /home/jer/projects/solstone
    python3 scratch/entity-observer-prototype/run_experiment.py \
        --facet solstone --day 20260414 \
        --system observer_v1 --format json_array \
        --model gemini-2.5-flash-lite \
        --label lite-structured-json
"""

import argparse
import json
import os
import re
import sys
import time
from pathlib import Path

# Three levels up from this file (parents[2]) is put on sys.path and used as the
# default journal location. Both happen before the project import below.
# NOTE(review): presumably assemble_context (or something it imports) reads
# SOL_JOURNAL at import time -- confirm before reordering.
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
os.environ.setdefault("SOL_JOURNAL", str(Path(__file__).resolve().parents[2] / "journal"))

from assemble_context import assemble_full_context, format_prompt_context  # noqa: E402

RESULTS_DIR = Path(__file__).parent / "results"

# --- Prompt templates ---

SYSTEM_PROMPTS = {
    "observer_v1": """You are an entity observation agent for a personal knowledge system called Solstone.
Your task: extract durable factoids about entities from today's journal content.

An observation is a lasting fact about WHO or WHAT an entity IS — not what happened today.

Good observations:
- "Advocates for Socratic questioning in mentorship"
- "Based in Seattle, previously worked at Google"
- "Prefers async communication over meetings"

NOT observations (these are activity logs):
- "Discussed migration today"
- "Sent contract for review"
- "Uses v2.1.50" (expires)

Rules:
1. One fact per observation — no compound sentences
2. Must pass BOTH litmus tests:
   a) "Would this be true and useful 6 months from now?"
   b) "Would this help someone who's never met this entity?"
3. Check for semantic duplicates against existing observations
4. If existing observations are already rich, restraint is correct — zero new observations is valid
5. Skip entities where today's content reveals nothing durable""",

    "observer_v2_terse": """Entity observation agent. Extract durable factoids from today's journal content.

Observation = lasting fact about WHO/WHAT an entity IS. NOT activity logs, NOT ephemeral state.

Litmus: (1) true in 6 months? (2) useful to a stranger? Both must be yes.
One fact per observation. No duplicates of existing observations. Zero new is valid.""",
}

OUTPUT_FORMAT_INSTRUCTIONS = {
    "json_array": """Output format: JSON array of observation objects.
```json
[
  {
    "entity_id": "entity_slug",
    "entity_name": "Full Name",
    "content": "The durable observation text",
    "reasoning": "Why this qualifies as a durable observation (1 sentence)"
  }
]
```
Output ONLY the JSON array. No markdown, no commentary.""",

    "jsonl": """Output format: one JSON object per line (JSONL), no wrapping array.
{"entity_id": "entity_slug", "entity_name": "Full Name", "content": "The observation", "reasoning": "Why"}
{"entity_id": "other_entity", "entity_name": "Other Name", "content": "Another observation", "reasoning": "Why"}

Output ONLY the JSONL lines. No markdown, no commentary, no blank lines between entries.""",

    "markdown_structured": """Output format: Markdown with one section per entity that has new observations.

## Entity Name (entity_id)
- **Observation:** The durable factoid
- **Reasoning:** Why this is durable (1 sentence)

## Another Entity (another_id)
- **Observation:** Another factoid
- **Reasoning:** Why

Skip entities with no new observations entirely. End with a summary line:
"Observed X entities, Y new observations total."
""",

    "json_grouped": """Output format: JSON object grouped by entity_id.
```json
{
  "observations": {
    "entity_slug": [
      {"content": "The observation", "reasoning": "Why"}
    ],
    "other_entity": [
      {"content": "Another observation", "reasoning": "Why"}
    ]
  },
  "skipped": ["entity_ids_with_no_new_observations"],
  "summary": "Observed X entities, Y new observations total."
}
```
Output ONLY the JSON. No markdown wrapping.""",
}

# --- Parsing / evaluation helpers ---

# "## Entity Name (entity_id)": group 1 is the name, group 2 the optional id.
_MD_HEADER_RE = re.compile(r"^## (.+?)(?:\s*\(([^)\n]*)\)?|$)", re.MULTILINE)
_MD_OBSERVATION_RE = re.compile(r"\*\*Observation:\*\*\s*(.+)")
# The lookbehind keeps "10 new observations" from reading as "0 new observations".
_MD_ZERO_RE = re.compile(r"(?<!\d)0 new observations")

_TEMPORAL_MARKERS = ("today", "currently", "as of", "this week", "yesterday", "right now")
_ACTIVITY_MARKERS = ("discussed", "sent", "reviewed", "filed", "submitted", "scheduled")
# Whole-word patterns: plain substring tests flag "has often" (contains "as of")
# and prefix tests flag "Sentinel ..." (starts with "sent").
_TEMPORAL_PATTERNS = [(m, re.compile(rf"\b{re.escape(m)}\b")) for m in _TEMPORAL_MARKERS]
_ACTIVITY_PATTERNS = [(m, re.compile(rf"{re.escape(m)}\b")) for m in _ACTIVITY_MARKERS]


def build_prompt(
    context: dict,
    *,
    system_key: str = "observer_v1",
    format_key: str = "json_array",
    context_style: str = "structured",
) -> tuple[str, str]:
    """Build system + user prompt for an experiment.

    Args:
        context: Pre-computed context, passed through to format_prompt_context.
        system_key: Key into SYSTEM_PROMPTS.
        format_key: Key into OUTPUT_FORMAT_INSTRUCTIONS.
        context_style: Rendering style forwarded to format_prompt_context.

    Returns:
        (system_prompt, user_prompt). The user prompt is the format
        instructions, a "---" separator, then the rendered context.

    Raises:
        KeyError: If system_key or format_key is not a known template.
    """
    system = SYSTEM_PROMPTS[system_key]
    format_inst = OUTPUT_FORMAT_INSTRUCTIONS[format_key]

    # Build user prompt with pre-computed context
    context_text = format_prompt_context(context, style=context_style)

    user_prompt = f"""{format_inst}

---

{context_text}"""

    return system, user_prompt


def call_gemini(
    system_prompt: str,
    user_prompt: str,
    model: str,
    *,
    max_output_tokens: int = 8192,
    temperature: float = 0.3,
    thinking_budget: int | None = None,
) -> dict:
    """Call Gemini API and return result dict.

    The API key comes from GOOGLE_API_KEY; if that is unset, a vault JSON file
    next to the repository is tried and, when it yields a key, the key is also
    exported into os.environ.

    Args:
        system_prompt: Passed as the request's system_instruction.
        user_prompt: Passed as the request contents.
        model: Model name.
        max_output_tokens: Output token cap.
        temperature: Sampling temperature.
        thinking_budget: When not None (0 included), sent as
            ThinkingConfig(thinking_budget=...); None omits thinking_config.

    Returns:
        Dict with "text", "usage", "elapsed_seconds", "model", "finish_reason".
        "usage" is {} when the response carries no usage metadata.

    Raises:
        RuntimeError: If no API key can be found.
    """
    from google import genai
    from google.genai import types

    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        # Try loading from vault
        vault_path = (
            Path(__file__).resolve().parents[2].parent
            / "extro" / "cso" / "vault" / "api-keys" / "google-ai-studio.json"
        )
        if vault_path.exists():
            vault_data = json.loads(vault_path.read_text())
            api_key = vault_data.get("api_key", "")
            if api_key:
                os.environ["GOOGLE_API_KEY"] = api_key

    if not api_key:
        raise RuntimeError("No GOOGLE_API_KEY found")

    client = genai.Client(api_key=api_key)

    config_kwargs = {
        "max_output_tokens": max_output_tokens,
        "temperature": temperature,
    }
    # "is not None" so an explicit budget of 0 still reaches the API.
    if thinking_budget is not None:
        config_kwargs["thinking_config"] = types.ThinkingConfig(
            thinking_budget=thinking_budget
        )

    config = types.GenerateContentConfig(
        system_instruction=system_prompt,
        **config_kwargs,
    )

    # perf_counter is monotonic, so a wall-clock adjustment cannot skew the timing.
    start = time.perf_counter()
    response = client.models.generate_content(
        model=model,
        contents=user_prompt,
        config=config,
    )
    elapsed = time.perf_counter() - start

    # Extract usage. A counter may be present but None (the saved results show
    # "thinking_tokens": null); "or 0" keeps every counter an integer.
    usage = {}
    um = response.usage_metadata
    if um:
        usage = {
            "prompt_tokens": getattr(um, "prompt_token_count", None) or 0,
            "output_tokens": getattr(um, "candidates_token_count", None) or 0,
            "total_tokens": getattr(um, "total_token_count", None) or 0,
            "thinking_tokens": getattr(um, "thoughts_token_count", None) or 0,
        }

    candidates = response.candidates
    finish_reason = str(getattr(candidates[0], "finish_reason", "")) if candidates else ""

    return {
        "text": response.text or "",
        "usage": usage,
        "elapsed_seconds": round(elapsed, 2),
        "model": model,
        "finish_reason": finish_reason,
    }


def _strip_code_fence(text: str) -> str:
    """Return text without a leading ``` fence line and a trailing ``` line."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        lines = cleaned.split("\n")[1:]
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]
        cleaned = "\n".join(lines).strip()
    return cleaned


def _parse_markdown(cleaned: str) -> tuple[list[dict], int]:
    """Return (observations, header_count) for the markdown_structured format.

    Each observation is attributed to the nearest "## ..." header above it.
    """
    headers = [
        (m.start(), m.group(1), (m.group(2) or "").strip())
        for m in _MD_HEADER_RE.finditer(cleaned)
    ]
    observations = []
    for match in _MD_OBSERVATION_RE.finditer(cleaned):
        name, entity_id = "?", ""
        for start, header_name, header_id in headers:
            if start > match.start():
                break
            name, entity_id = header_name, header_id
        record = {"content": match.group(1).strip(), "entity_name": name}
        if entity_id:
            # Carrying the id lets evaluate_observations run its duplicate check.
            record["entity_id"] = entity_id
        observations.append(record)
    return observations, len(headers)


def parse_output(text: str, format_key: str) -> dict:
    """Attempt to parse the model output and validate structure.

    Args:
        text: Raw model output; one surrounding markdown code fence is removed.
        format_key: One of the OUTPUT_FORMAT_INSTRUCTIONS keys.

    Returns:
        Dict with "raw_text", "parse_success", "observation_count",
        "entity_count", "observations" (a list of dicts) and "errors" (a list
        of strings). "skipped_count" is added for the json_grouped format.
        Malformed output is reported through "errors" rather than raised.
    """
    result = {
        "raw_text": text,
        "parse_success": False,
        "observation_count": 0,
        "entity_count": 0,
        "observations": [],
        "errors": [],
    }
    errors = result["errors"]

    # Strip markdown code fences if present
    cleaned = _strip_code_fence(text)

    try:
        if format_key == "json_array":
            data = json.loads(cleaned)
            if isinstance(data, list):
                # Only objects are observations; a stray string or number is
                # reported instead of crashing on .get() below.
                observations = [o for o in data if isinstance(o, dict)]
                dropped = len(data) - len(observations)
                if dropped:
                    errors.append(f"{dropped} array items were not JSON objects")
                result["parse_success"] = True
                result["observations"] = observations
                result["observation_count"] = len(observations)
                result["entity_count"] = len({o.get("entity_id", "") for o in observations})
            else:
                errors.append(f"expected a JSON array, got {type(data).__name__}")

        elif format_key == "jsonl":
            observations = []
            parse_errors = 0
            non_objects = 0
            # split("\n"), not splitlines(): the latter also breaks on separators
            # such as U+2028 that may appear unescaped inside a JSON string.
            for line in cleaned.split("\n"):
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    parse_errors += 1
                    continue
                if isinstance(record, dict):
                    observations.append(record)
                else:
                    non_objects += 1
            result["parse_success"] = len(observations) > 0
            result["observations"] = observations
            result["observation_count"] = len(observations)
            result["entity_count"] = len({o.get("entity_id", "") for o in observations})
            if parse_errors:
                errors.append(f"{parse_errors} lines failed to parse (likely truncation)")
            if non_objects:
                errors.append(f"{non_objects} lines were not JSON objects")

        elif format_key == "json_grouped":
            data = json.loads(cleaned)
            grouped = data.get("observations") if isinstance(data, dict) else None
            if isinstance(grouped, dict):
                all_obs = []
                malformed = 0
                for entity_id, obs_list in grouped.items():
                    if not isinstance(obs_list, list):
                        malformed += 1
                        continue
                    for obs in obs_list:
                        if isinstance(obs, dict):
                            all_obs.append({"entity_id": entity_id, **obs})
                        else:
                            malformed += 1
                if malformed:
                    errors.append(f"{malformed} grouped entries were not in the expected shape")
                skipped = data.get("skipped", [])
                result["parse_success"] = True
                result["observations"] = all_obs
                result["observation_count"] = len(all_obs)
                result["entity_count"] = len(grouped)
                result["skipped_count"] = len(skipped) if isinstance(skipped, list) else 0
            else:
                errors.append("expected a JSON object with an 'observations' mapping")

        elif format_key == "markdown_structured":
            # Count ## headers and **Observation:** lines
            observations, header_count = _parse_markdown(cleaned)
            result["parse_success"] = (
                len(observations) > 0 or _MD_ZERO_RE.search(cleaned.lower()) is not None
            )
            result["observation_count"] = len(observations)
            result["entity_count"] = header_count
            result["observations"] = observations

        else:
            errors.append(f"unknown format_key: {format_key!r}")

    except (json.JSONDecodeError, KeyError, TypeError) as e:
        errors.append(str(e))

    return result


def evaluate_observations(parsed: dict, context: dict) -> dict:
    """Evaluate observation quality against the pre-computed context.

    Args:
        parsed: Result of parse_output.
        context: Must hold "entity_contexts", a list of dicts each with an "id"
            and optionally "observations" or "recent_observations".
            NOTE(review): those entries are assumed to be strings -- confirm
            against assemble_context.

    Returns:
        Dict with "total_observations", "entities_with_observations",
        "duplicates", "quality_flags" and "quality_score". The score is
        1.0 - 0.15 per duplicate - 0.05 per flag, floored at 0.0. A duplicate
        also adds a flag, so each one costs 0.20 in total.
    """
    eval_result = {
        "total_observations": parsed["observation_count"],
        "entities_with_observations": parsed["entity_count"],
        "duplicates": 0,
        "quality_flags": [],
    }
    flags = eval_result["quality_flags"]

    # Build existing observation index. Blank entries are dropped because
    # "" is a substring of every string and would mark everything a duplicate.
    existing_obs = {}
    for ec in context["entity_contexts"]:
        known = ec.get("observations", ec.get("recent_observations", [])) or []
        normalized = {o.lower().strip() for o in known}
        normalized.discard("")
        existing_obs[ec["id"]] = normalized

    # Check each observation
    for obs in parsed.get("observations", []):
        if not isinstance(obs, dict):
            continue
        entity_id = obs.get("entity_id", "")
        content = obs.get("content") or ""
        if not isinstance(content, str):
            content = str(content)
        content_lower = content.lower().strip()

        # Duplicate: equal to, containing, or contained in an existing observation.
        if content_lower and any(
            content_lower in existing or existing in content_lower
            for existing in existing_obs.get(entity_id, ())
        ):
            eval_result["duplicates"] += 1
            flags.append(f"DUPLICATE: {entity_id}: '{content[:60]}...'")

        # Check for temporal language (not durable)
        for marker, pattern in _TEMPORAL_PATTERNS:
            if pattern.search(content_lower):
                flags.append(
                    f"TEMPORAL: {entity_id}: '{content[:60]}...' (contains '{marker}')"
                )
                break

        # Check for activity-log patterns
        for marker, pattern in _ACTIVITY_PATTERNS:
            if pattern.match(content_lower):
                flags.append(
                    f"ACTIVITY_LOG: {entity_id}: '{content[:60]}...' (starts with '{marker}')"
                )
                break

    # 0.0 rather than 0 so the score is always a float in the saved JSON.
    eval_result["quality_score"] = max(0.0, 1.0 - (
        eval_result["duplicates"] * 0.15 +
        len(flags) * 0.05
    ))

    return eval_result


def run_experiment(
    facet: str,
    day: str,
    *,
    strategy: str = "focused",
    context_style: str = "structured",
    system_key: str = "observer_v1",
    format_key: str = "json_array",
    model: str = "gemini-2.5-flash-lite",
    thinking_budget: int | None = None,
    label: str = "",
) -> dict:
    """Run a single experiment and return full results.

    Assembles context, builds the prompt, calls the model, parses and scores
    the output, prints progress, and writes the result as JSON to
    RESULTS_DIR/<label>.json ("unnamed.json" when label is empty).

    Args:
        facet: Forwarded to assemble_full_context.
        day: Forwarded to assemble_full_context.
        strategy: Context assembly strategy, forwarded to assemble_full_context.
        context_style: Context rendering style for build_prompt.
        system_key: Key into SYSTEM_PROMPTS.
        format_key: Key into OUTPUT_FORMAT_INSTRUCTIONS.
        model: Model name.
        thinking_budget: Thinking budget; None leaves the model default.
        label: Experiment name, also used as the result file stem.

    Returns:
        The result dict that was written to disk.
    """
    print(f"\n{'='*60}")
    print(f"Experiment: {label or 'unnamed'}")
    print(f"  Model: {model}")
    print(f"  Strategy: {strategy}, Style: {context_style}")
    print(f"  System: {system_key}, Format: {format_key}")
    # A budget of 0 (thinking disabled) is a real setting and must be shown.
    if thinking_budget is not None:
        print(f"  Thinking budget: {thinking_budget}")
    print(f"{'='*60}")

    # Assemble context
    print("Assembling context...")
    context = assemble_full_context(facet, day, strategy=strategy)
    print(f"  Active entities: {context['active_count']}, Est tokens: {context['estimated_tokens']:,}")

    # Build prompt
    system_prompt, user_prompt = build_prompt(
        context,
        system_key=system_key,
        format_key=format_key,
        context_style=context_style,
    )
    print(f"  System prompt: {len(system_prompt):,} chars")
    print(f"  User prompt: {len(user_prompt):,} chars")

    # Call model
    print(f"Calling {model}...")
    gen_result = call_gemini(
        system_prompt,
        user_prompt,
        model,
        thinking_budget=thinking_budget,
    )
    print(f"  Elapsed: {gen_result['elapsed_seconds']}s")
    print(f"  Usage: {gen_result['usage']}")
    print(f"  Output length: {len(gen_result['text']):,} chars")

    # Parse output
    print("Parsing output...")
    parsed = parse_output(gen_result["text"], format_key)
    print(f"  Parse success: {parsed['parse_success']}")
    print(f"  Observations: {parsed['observation_count']}")
    print(f"  Entities: {parsed['entity_count']}")
    if parsed["errors"]:
        print(f"  Errors: {parsed['errors']}")

    # Evaluate quality
    print("Evaluating quality...")
    evaluation = evaluate_observations(parsed, context)
    print(f"  Duplicates: {evaluation['duplicates']}")
    print(f"  Quality flags: {len(evaluation['quality_flags'])}")
    print(f"  Quality score: {evaluation['quality_score']:.2f}")
    for flag in evaluation["quality_flags"][:5]:
        print(f"    - {flag}")

    # Compile result
    result = {
        "label": label,
        "config": {
            "facet": facet,
            "day": day,
            "strategy": strategy,
            "context_style": context_style,
            "system_key": system_key,
            "format_key": format_key,
            "model": model,
            "thinking_budget": thinking_budget,
        },
        "context_stats": {
            "total_attached": context["total_attached"],
            "active_count": context["active_count"],
            "estimated_tokens": context["estimated_tokens"],
            "system_prompt_chars": len(system_prompt),
            "user_prompt_chars": len(user_prompt),
        },
        "generation": {
            "elapsed_seconds": gen_result["elapsed_seconds"],
            "usage": gen_result["usage"],
            "output_chars": len(gen_result["text"]),
            "finish_reason": gen_result["finish_reason"],
        },
        "parsing": {
            "parse_success": parsed["parse_success"],
            "observation_count": parsed["observation_count"],
            "entity_count": parsed["entity_count"],
            "errors": parsed["errors"],
        },
        "evaluation": evaluation,
        "raw_output": gen_result["text"],
        "observations": parsed["observations"],
    }

    # Save result. ensure_ascii=False emits raw non-ASCII characters, so the
    # encoding is pinned instead of depending on the locale default.
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    result_file = RESULTS_DIR / f"{label or 'unnamed'}.json"
    result_file.write_text(json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"\nResult saved to {result_file}")

    return result


def main(argv: list[str] | None = None) -> None:
    """Parse command-line arguments and run one experiment."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--facet", default="solstone")
    parser.add_argument("--day", default="20260414")
    parser.add_argument("--strategy", default="focused")
    parser.add_argument("--context-style", default="structured")
    # choices turns a mistyped key into a usage error instead of a KeyError
    # raised later inside build_prompt.
    parser.add_argument("--system", default="observer_v1", choices=sorted(SYSTEM_PROMPTS))
    parser.add_argument("--format", default="json_array", choices=sorted(OUTPUT_FORMAT_INSTRUCTIONS))
    parser.add_argument("--model", default="gemini-2.5-flash-lite")
    parser.add_argument("--thinking-budget", type=int, default=None)
    parser.add_argument("--label", default="")
    args = parser.parse_args(argv)

    run_experiment(
        args.facet,
        args.day,
        strategy=args.strategy,
        context_style=args.context_style,
        system_key=args.system,
        format_key=args.format,
        model=args.model,
        thinking_budget=args.thinking_budget,
        label=args.label,
    )


if __name__ == "__main__":
    main()