Merge branch 'hopper-qpsfxoer-stats-schema' · solpbc.org/solstone@c4c9a64

+1

apps/stats/routes.py

··· 37 37 try: 38 38 with open(stats_path, "r", encoding="utf-8") as f: 39 39 response["stats"] = json.load(f) 40 + response["file_mtime"] = os.path.getmtime(stats_path) 40 41 except Exception: 41 42 logger.exception("Failed to read stats data") 42 43 response["error"] = "Failed to read stats data"

+8 -5

tests/test_journal_stats.py

··· 169 169 170 170 # Test JSON output includes token usage 171 171 data = js.to_dict() 172 - assert "token_usage_by_day" in data 173 - assert "token_totals_by_model" in data 174 - assert "total_transcript_duration" in data 175 - assert "total_percept_duration" in data 172 + assert data["schema_version"] == 2 173 + assert "generated_at" in data 174 + assert data["day_count"] == 2 175 + assert "tokens" in data 176 + assert "by_day" in data["tokens"] 177 + assert "total_transcript_duration" in data["totals"] 178 + assert "total_percept_duration" in data["totals"] 176 179 assert ( 177 - data["token_usage_by_day"]["20240101"]["gemini-2.5-flash"]["total_tokens"] 180 + data["tokens"]["by_day"]["20240101"]["gemini-2.5-flash"]["total_tokens"] 178 181 == 495 179 182 ) 180 183

+104

tests/test_stats_schema.py

# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Tests for the stats schema module and its use by ``JournalStats``."""

import importlib

import pytest


def _load(module_name):
    """Import a project module lazily so import errors surface per-test."""
    return importlib.import_module(module_name)


def _make_segment_dir(journal):
    """Create the ``20240101`` day with one transcript segment.

    Returns ``(day_dir, segment_dir)``; ``audio.jsonl`` is already written
    into the segment directory.
    """
    day_dir = journal / "20240101"
    day_dir.mkdir()

    segment_dir = day_dir / "default" / "123456_300"
    segment_dir.mkdir(parents=True)
    audio_lines = (
        '{"raw": "raw.flac"}\n'
        '{"start": "10:00:00", "text": "hello"}\n'
    )
    (segment_dir / "audio.jsonl").write_text(audio_lines)
    return day_dir, segment_dir


def test_validate_passes_on_valid_output(tmp_path, monkeypatch):
    """A scanned journal serialised with to_dict() validates cleanly."""
    stats_mod = _load("think.journal_stats")
    schema_mod = _load("think.stats_schema")
    journal = tmp_path

    # Minimal transcript fixture
    _make_segment_dir(journal)

    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(journal))
    js = stats_mod.JournalStats()
    js.scan(str(journal))

    errors = schema_mod.validate(js.to_dict())
    assert errors == [], f"Validation errors: {errors}"


def test_validate_rejects_missing_fields():
    """Dicts that are incomplete or mis-versioned yield validation errors."""
    schema_mod = _load("think.stats_schema")

    # Empty dict
    empty_errors = schema_mod.validate({})
    assert len(empty_errors) > 0
    assert any("schema_version" in e for e in empty_errors)

    # Missing days
    partial = {"schema_version": 2, "generated_at": "2026-04-10T00:00:00+00:00"}
    partial_errors = schema_mod.validate(partial)
    assert any("days" in e for e in partial_errors)

    # Wrong schema version
    wrong_version = {
        "schema_version": 99,
        "generated_at": "x",
        "day_count": 0,
        "days": {},
        "totals": {},
        "heatmap": [],
        "tokens": {},
        "agents": {},
        "facets": {},
    }
    version_errors = schema_mod.validate(wrong_version)
    assert any("schema_version" in e for e in version_errors)


def test_save_json_raises_on_invalid(tmp_path, monkeypatch):
    """save_json() raises ValueError when the payload fails validation."""
    stats_mod = _load("think.journal_stats")
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    js = stats_mod.JournalStats()

    # Corrupt the schema version so validation fails
    real_to_dict = js.to_dict

    def corrupted_to_dict():
        return dict(real_to_dict(), schema_version=99)

    js.to_dict = corrupted_to_dict
    with pytest.raises(ValueError, match="Stats validation failed"):
        js.save_json(str(tmp_path))


def test_day_fields_present_in_scan_day(tmp_path, monkeypatch):
    """Every key listed in DAY_FIELDS shows up in scan_day()'s stats."""
    stats_mod = _load("think.journal_stats")
    schema_mod = _load("think.stats_schema")
    journal = tmp_path

    # Transcript fixture plus a percept (screen) fixture in the same segment
    day_dir, segment_dir = _make_segment_dir(journal)
    screen_lines = (
        '{"header": true}\n'
        '{"frame_id": 1, "timestamp": "10:00:00"}\n'
    )
    (segment_dir / "screen.jsonl").write_text(screen_lines)

    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(journal))
    js = stats_mod.JournalStats()
    stats = js.scan_day("20240101", str(day_dir))["stats"]

    for field in schema_mod.DAY_FIELDS:
        assert field in stats, f"DAY_FIELDS field '{field}' missing from scan_day output"

+33 -13

think/journal_stats.py

··· 13 13 from observe.sense import scan_day as sense_scan_day 14 14 from observe.utils import VIDEO_EXTENSIONS, load_analysis_frames 15 15 from think.agents import scan_day as generate_scan_day 16 + from think.stats_schema import DAY_FIELDS, SCHEMA_VERSION, validate as validate_stats 16 17 from think.utils import day_dirs, get_journal, setup_cli 17 18 18 19 logger = logging.getLogger(__name__) ··· 503 504 504 505 def to_dict(self) -> dict: 505 506 """Return a dictionary with all collected statistics.""" 507 + days = { 508 + day: {field: stats.get(field, 0) for field in DAY_FIELDS} 509 + for day, stats in self.days.items() 510 + } 506 511 return { 507 - "days": self.days, 508 - "totals": dict(self.totals), 509 - "total_transcript_duration": self.total_transcript_duration, 510 - "total_percept_duration": self.total_percept_duration, 511 - "agent_counts": dict(self.agent_counts), 512 - "agent_minutes": {k: round(v, 2) for k, v in self.agent_minutes.items()}, 513 - "agent_counts_by_day": self.agent_counts_by_day, 514 - "facet_counts": dict(self.facet_counts), 515 - "facet_minutes": {k: round(v, 2) for k, v in self.facet_minutes.items()}, 516 - "facet_counts_by_day": self.facet_counts_by_day, 512 + "schema_version": SCHEMA_VERSION, 513 + "generated_at": datetime.now(timezone.utc).isoformat(), 514 + "day_count": len(self.days), 515 + "days": days, 516 + "totals": { 517 + **dict(self.totals), 518 + "total_transcript_duration": self.total_transcript_duration, 519 + "total_percept_duration": self.total_percept_duration, 520 + }, 517 521 "heatmap": self.heatmap, 518 - "token_usage_by_day": self.token_usage, 519 - "token_totals_by_model": self.token_totals, 522 + "tokens": { 523 + "by_day": self.token_usage, 524 + "by_model": self.token_totals, 525 + }, 526 + "agents": { 527 + "counts": dict(self.agent_counts), 528 + "minutes": {k: round(v, 2) for k, v in self.agent_minutes.items()}, 529 + "counts_by_day": self.agent_counts_by_day, 530 + }, 531 + "facets": { 532 + "counts": 
dict(self.facet_counts), 533 + "minutes": {k: round(v, 2) for k, v in self.facet_minutes.items()}, 534 + "counts_by_day": self.facet_counts_by_day, 535 + }, 520 536 } 521 537 522 538 def save_json(self, journal: str) -> None: 523 539 """Write full statistics to ``stats.json`` in ``journal``.""" 540 + data = self.to_dict() 541 + errors = validate_stats(data) 542 + if errors: 543 + raise ValueError(f"Stats validation failed: {'; '.join(errors)}") 524 544 path = os.path.join(journal, "stats.json") 525 545 with open(path, "w", encoding="utf-8") as f: 526 - json.dump(self.to_dict(), f, indent=2) 546 + json.dump(data, f, indent=2) 527 547 528 548 529 549 def main() -> None:

+76

think/stats_schema.py

# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Schema constants and a lightweight validator for stats output (schema v2)."""

SCHEMA_VERSION = 2

# Per-day counters expected in every entry of ``data["days"]``.
DAY_FIELDS = (
    "transcript_sessions",
    "transcript_segments",
    "transcript_duration",
    "percept_sessions",
    "percept_frames",
    "percept_duration",
    "pending_segments",
    "outputs_processed",
    "outputs_pending",
    "day_bytes",
)

# Totals carry the same counters as a day plus the two overall durations.
TOTAL_FIELDS = DAY_FIELDS + (
    "total_transcript_duration",
    "total_percept_duration",
)

REQUIRED_TOP_LEVEL = (
    "schema_version",
    "generated_at",
    "day_count",
    "days",
    "totals",
    "heatmap",
    "tokens",
    "agents",
    "facets",
)

# Keys with a dedicated check in validate(); they are skipped in the generic
# required-key pass so a missing key is reported exactly once.
_INDIVIDUALLY_CHECKED = frozenset({"schema_version", "generated_at"})


def validate(data: dict) -> list[str]:
    """Validate stats output against schema v2.

    Checks performed:

    * ``schema_version`` is present and equals ``SCHEMA_VERSION``.
    * ``generated_at`` is present and is a string.
    * Every other key in ``REQUIRED_TOP_LEVEL`` is present.
    * ``days``, when present, is a dict; if it is non-empty, its first entry
      is spot-checked to be a dict containing every ``DAY_FIELDS`` key.

    Args:
        data: The stats payload to check.

    Returns:
        A list of human-readable error strings; an empty list means valid.
        The function never raises on malformed input.
    """
    if not isinstance(data, dict):
        return [f"stats data must be a dict, got {type(data).__name__}"]

    errors: list[str] = []

    # Check schema_version
    if "schema_version" not in data:
        errors.append("missing 'schema_version'")
    elif data["schema_version"] != SCHEMA_VERSION:
        errors.append(
            f"schema_version is {data['schema_version']}, expected {SCHEMA_VERSION}"
        )

    # Check generated_at
    if "generated_at" not in data:
        errors.append("missing 'generated_at'")
    elif not isinstance(data["generated_at"], str):
        errors.append("'generated_at' must be a string")

    # Check the remaining required top-level keys
    errors.extend(
        f"missing required key '{key}'"
        for key in REQUIRED_TOP_LEVEL
        if key not in _INDIVIDUALLY_CHECKED and key not in data
    )

    # Spot-check one day entry if days is non-empty
    if "days" in data:
        days = data["days"]
        if not isinstance(days, dict):
            errors.append("'days' must be a dict")
        elif days:
            first_day = next(iter(days.values()))
            if not isinstance(first_day, dict):
                # Guard: `field not in <int>` would raise and `field not in
                # <str>` would do substring matching.
                errors.append("day entry must be a dict")
            else:
                errors.extend(
                    f"day entry missing field '{field}'"
                    for field in DAY_FIELDS
                    if field not in first_day
                )

    return errors

Configure Feed

Configure Feed