personal memory agent
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'hopper-de3hnu42-speakers-status-gates-json'

# Conflicts:
# apps/speakers/call.py
# apps/speakers/owner.py
# apps/speakers/routes.py
# apps/speakers/status.py
# apps/speakers/tests/test_owner.py
# apps/speakers/tests/test_status.py

+251 -771
+29 -96
apps/speakers/call.py
··· 4 4 """CLI interface for speaker voiceprint management. 5 5 6 6 Provides: 7 - sol call speakers status [--section SECTION] 7 + sol call speakers status [section] 8 8 sol call speakers bootstrap [--dry-run] [--json] 9 9 sol call speakers resolve-names [--dry-run] [--json] 10 - sol call speakers attribute-segment <day> <stream> <segment> 10 + sol call speakers attribute-segment <day> <stream> <segment> [--json] 11 11 sol call speakers backfill [--dry-run] [--json] 12 12 sol call speakers discover [--json] 13 - sol call speakers identify <cluster_id> <name> [--entity-id ID] 14 - sol call speakers merge-names <alias> <canonical> 15 13 """ 16 14 17 15 from __future__ import annotations ··· 27 25 28 26 @app.command("status") 29 27 def status( 30 - section: str = typer.Option( 28 + section: str | None = typer.Argument( 31 29 None, 32 - "--section", 33 - help="Return only one section: embeddings, owner, speakers, clusters, imports, attribution.", 30 + help=( 31 + "Section to show (embeddings, owner, speakers, clusters, imports, " 32 + "attribution). Omit for all." 33 + ), 34 34 ), 35 35 ) -> None: 36 - """Return the full speaker ID state model as JSON. 37 - 38 - Aggregates embedding coverage, owner centroid status, known speakers, 39 - candidate clusters, import signals, and attribution coverage into a 40 - single dashboard view. All data is read from disk — no new computations. 41 - """ 42 - import json 36 + """Show speaker subsystem status as JSON.""" 37 + import json as json_mod 43 38 44 - from apps.speakers.status import get_status 39 + from apps.speakers.status import get_speakers_status 45 40 46 - result = get_status(section=section) 47 - 48 - if "error" in result: 49 - typer.echo(json.dumps(result, indent=2), err=True) 50 - raise typer.Exit(1) 51 - 52 - typer.echo(json.dumps(result, indent=2)) 41 + result = get_speakers_status(section=section) 42 + typer.echo(json_mod.dumps(result, indent=2, default=str)) 53 43 54 44 55 45 @app.command("bootstrap") ··· 58 48 False, "--dry-run", help="Show what would be saved without saving." 59 49 ), 60 50 json_output: bool = typer.Option( 61 - False, "--json", help="Output results as JSON." 51 + False, "--json", help="Output full result as JSON." 62 52 ), 63 53 ) -> None: 64 54 """Bootstrap voiceprints from single-speaker segments. ··· 68 58 speaker. Saves them as voiceprints using the owner centroid for 69 59 owner subtraction. 70 60 """ 71 - import json as json_mod 72 - 73 61 from apps.speakers.bootstrap import bootstrap_voiceprints 74 62 75 63 if dry_run and not json_output: ··· 77 65 78 66 if not json_output: 79 67 typer.echo("Bootstrapping voiceprints from single-speaker segments...") 80 - 81 68 stats = bootstrap_voiceprints(dry_run=dry_run) 82 69 83 - if json_output: 84 - typer.echo(json_mod.dumps(stats, indent=2)) 85 - if "error" in stats: 86 - raise typer.Exit(1) 87 - return 88 - 89 70 if "error" in stats: 90 71 typer.echo(f"Error: {stats['error']}", err=True) 91 72 raise typer.Exit(1) 73 + if json_output: 74 + import json as json_mod 75 + 76 + typer.echo(json_mod.dumps(stats, indent=2, default=str)) 77 + return 92 78 93 79 typer.echo(f"\nSegments scanned: {stats['segments_scanned']}") 94 80 typer.echo(f"Single-speaker segments: {stats['single_speaker_segments']}") ··· 122 108 False, "--dry-run", help="Show merges without applying them." 123 109 ), 124 110 json_output: bool = typer.Option( 125 - False, "--json", help="Output results as JSON." 111 + False, "--json", help="Output full result as JSON." 126 112 ), 127 113 ) -> None: 128 114 """Resolve speaker name variants using voiceprint similarity. ··· 132 118 (short name is first word of full name) are auto-merged by adding the 133 119 short name as an aka on the canonical entity. 134 120 """ 135 - import json as json_mod 136 - 137 121 from apps.speakers.bootstrap import resolve_name_variants 138 122 139 123 if dry_run and not json_output: ··· 141 125 142 126 if not json_output: 143 127 typer.echo("Resolving speaker name variants...") 144 - 145 128 stats = resolve_name_variants(dry_run=dry_run) 146 129 147 130 if json_output: 148 - typer.echo(json_mod.dumps(stats, indent=2)) 131 + import json as json_mod 132 + 133 + typer.echo(json_mod.dumps(stats, indent=2, default=str)) 149 134 return 150 135 151 136 typer.echo(f"\nEntities with voiceprints: {stats['entities_with_voiceprints']}") ··· 252 237 False, "--dry-run", help="Enumerate segments without processing." 253 238 ), 254 239 json_output: bool = typer.Option( 255 - False, "--json", help="Output results as JSON." 240 + False, "--json", help="Output full result as JSON." 256 241 ), 257 242 ) -> None: 258 243 """Run speaker attribution across all segments with embeddings. ··· 260 245 Processes segments oldest-first for progressive voiceprint building. 261 246 Skips segments that already have speaker_labels.json (safe to re-run). 262 247 """ 263 - import json as json_mod 264 248 import time 265 249 266 250 from apps.speakers.attribution import backfill_segments ··· 287 271 288 272 stats = backfill_segments( 289 273 dry_run=dry_run, 290 - progress_callback=None if (dry_run or json_output) else on_progress, 274 + progress_callback=None if dry_run or json_output else on_progress, 291 275 ) 292 276 293 277 elapsed = time.monotonic() - start 294 278 295 279 if json_output: 296 - stats["elapsed_seconds"] = round(elapsed, 1) 297 - typer.echo(json_mod.dumps(stats, indent=2)) 280 + import json as json_mod 281 + 282 + typer.echo(json_mod.dumps(stats, indent=2, default=str)) 298 283 return 299 284 300 285 typer.echo("\n") ··· 325 310 @app.command() 326 311 def discover( 327 312 json_output: bool = typer.Option( 328 - False, "--json", help="Output results as JSON." 313 + False, "--json", help="Output full result as JSON." 329 314 ), 330 315 ) -> None: 331 316 """Discover recurring unknown speakers across segments.""" ··· 334 319 from apps.speakers.discovery import discover_unknown_speakers 335 320 336 321 result = discover_unknown_speakers() 337 - 338 322 if json_output: 339 - typer.echo(json_mod.dumps(result, indent=2)) 323 + typer.echo(json_mod.dumps(result, indent=2, default=str)) 340 324 return 341 - 342 325 clusters = result.get("clusters", []) 343 326 344 327 if not clusters: ··· 358 341 f"sid={sample['sentence_id']}: {text_preview}" 359 342 ) 360 343 typer.echo() 361 - 362 - 363 - @app.command() 364 - def identify( 365 - cluster_id: int = typer.Argument(..., help="Cluster ID from discovery output."), 366 - name: str = typer.Argument(..., help="Speaker name to assign."), 367 - entity_id: str | None = typer.Option( 368 - None, "--entity-id", help="Link to existing entity instead of name matching." 369 - ), 370 - ) -> None: 371 - """Name an unknown speaker cluster from discovery. 372 - 373 - Creates or matches a speaker entity and saves the cluster's embeddings 374 - as voiceprints. Updates speaker labels in all affected segments. 375 - Returns JSON. 376 - """ 377 - import json as json_mod 378 - 379 - from apps.speakers.discovery import identify_cluster 380 - 381 - result = identify_cluster(cluster_id, name, entity_id=entity_id) 382 - 383 - if "error" in result: 384 - typer.echo(json_mod.dumps({"error": result["error"]}, indent=2), err=True) 385 - raise typer.Exit(1) 386 - 387 - typer.echo(json_mod.dumps(result, indent=2)) 388 - 389 - 390 - @app.command("merge-names") 391 - def merge_names_cmd( 392 - alias: str = typer.Argument(..., help="Alias/variant name to merge from."), 393 - canonical: str = typer.Argument(..., help="Canonical/full name to merge into."), 394 - ) -> None: 395 - """Merge a speaker name variant into a canonical entity. 396 - 397 - Adds the alias as an aka on the canonical entity and merges voiceprint 398 - embeddings with deduplication. Returns JSON. 399 - """ 400 - import json as json_mod 401 - 402 - from apps.speakers.bootstrap import merge_names 403 - 404 - result = merge_names(alias, canonical) 405 - 406 - if "error" in result: 407 - typer.echo(json_mod.dumps({"error": result["error"]}, indent=2), err=True) 408 - raise typer.Exit(1) 409 - 410 - typer.echo(json_mod.dumps(result, indent=2))
+24 -67
apps/speakers/owner.py
··· 19 19 20 20 logger = logging.getLogger(__name__) 21 21 22 - MIN_SEGMENTS = 50 23 22 MAX_EMBEDDINGS = 30000 24 23 OWNER_THRESHOLD = 0.82 25 24 ··· 127 126 128 127 129 128 def detect_owner_candidate() -> dict[str, Any]: 130 - """Detect a likely owner voice centroid from journal embeddings. 131 - 132 - Always attempts detection regardless of data volume. Returns quality 133 - metrics so the calling agent can decide whether the data is sufficient. 134 - """ 129 + """Detect a likely owner voice centroid from journal embeddings.""" 135 130 load_embeddings_file, normalize_embedding, scan_segment_embeddings = ( 136 131 _routes_helpers() 137 132 ) ··· 140 135 141 136 embedding_chunks: list[np.ndarray] = [] 142 137 provenance: list[dict[str, Any]] = [] 143 - streams_seen: set[str] = set() 144 138 145 139 for day in day_dirs().keys(): 146 140 for segment in scan_segment_embeddings(day): 147 141 stream = segment["stream"] 148 142 segment_key = segment["key"] 149 143 segment_dir = segment_path(day, segment_key, stream) 150 - streams_seen.add(stream) 151 144 152 145 for source in segment["sources"]: 153 146 emb_data = load_embeddings_file(segment_dir / f"{source}.npz") ··· 170 163 for sid in statement_ids 171 164 ) 172 165 173 - total_embeddings = sum(len(c) for c in embedding_chunks) if embedding_chunks else 0 174 - stream_diversity = len(streams_seen) 175 - 176 166 if not embedding_chunks: 177 167 _mark_no_cluster(segment_count) 178 168 return { 179 - "status": "insufficient_data", 180 - "reason": f"{segment_count} segments with embeddings, 0 embeddings found", 169 + "status": "no_embeddings", 181 170 "segments_available": segment_count, 182 171 "embeddings_available": 0, 183 - "stream_diversity": stream_diversity, 184 - "recommendation": "need_more_data", 172 + "recommendation": "no_embeddings", 185 173 } 186 174 187 175 embeddings_matrix = np.vstack(embedding_chunks) ··· 190 178 if len(embeddings_matrix) < 50: 191 179 _mark_no_cluster(segment_count) 192 180 return { 193 - "status": "insufficient_data", 194 - "reason": ( 195 - f"{len(embeddings_matrix)} embeddings available, " 196 - f"recommend {MIN_SEGMENTS}+ segments for reliable detection" 197 - ), 181 + "status": "low_data", 198 182 "segments_available": segment_count, 199 - "embeddings_available": total_embeddings, 200 - "stream_diversity": stream_diversity, 201 - "recommendation": ( 202 - "ready" if segment_count >= MIN_SEGMENTS and stream_diversity >= 3 203 - else "need_more_data" 204 - ), 183 + "embeddings_available": int(len(embeddings_matrix)), 184 + "recommendation": "low_data", 205 185 } 206 186 207 187 clusterer = HDBSCAN( ··· 216 196 if len(valid_labels) == 0: 217 197 _mark_no_cluster(segment_count) 218 198 return { 219 - "status": "no_cluster", 220 - "reason": "HDBSCAN found no clusters in the embedding space", 199 + "status": "no_clusters", 221 200 "segments_available": segment_count, 222 - "embeddings_available": total_embeddings, 223 - "stream_diversity": stream_diversity, 224 - "recommendation": "need_more_data", 201 + "embeddings_available": int(len(embeddings_matrix)), 202 + "recommendation": "no_clusters", 225 203 } 226 204 227 205 largest_label = int(np.bincount(valid_labels).argmax()) ··· 229 207 if len(cluster_indices) == 0: 230 208 _mark_no_cluster(segment_count) 231 209 return { 232 - "status": "no_cluster", 233 - "reason": "Largest cluster was empty after filtering", 210 + "status": "no_clusters", 234 211 "segments_available": segment_count, 235 - "embeddings_available": total_embeddings, 236 - "stream_diversity": stream_diversity, 237 - "recommendation": "need_more_data", 212 + "embeddings_available": int(len(embeddings_matrix)), 213 + "recommendation": "no_clusters", 238 214 } 239 215 240 216 cluster_embeddings = embeddings_matrix[cluster_indices] ··· 242 218 if centroid is None: 243 219 _mark_no_cluster(segment_count) 244 220 return { 245 - "status": "no_cluster", 246 - "reason": "Could not compute centroid for largest cluster", 221 + "status": "no_clusters", 247 222 "segments_available": segment_count, 248 - "embeddings_available": total_embeddings, 249 - "stream_diversity": stream_diversity, 250 - "recommendation": "need_more_data", 223 + "embeddings_available": int(len(embeddings_matrix)), 224 + "recommendation": "no_clusters", 251 225 } 252 226 253 227 cluster_size = int(len(cluster_indices)) 228 + cluster_streams = {provenance[int(i)]["stream"] for i in cluster_indices} 229 + streams_represented = len(cluster_streams) 230 + recommendation = "ready" if streams_represented > 1 else "single_stream" 254 231 similarities = np.dot(cluster_embeddings, centroid) 255 - mean_similarity = float(np.mean(similarities)) 256 232 sorted_cluster_positions = np.argsort(similarities)[::-1] 257 233 258 - # Compute stream diversity within the cluster 259 - cluster_streams = { 260 - provenance[int(cluster_indices[i])]["stream"] for i in range(len(cluster_indices)) 261 - } 262 - 263 234 samples: list[dict[str, Any]] = [] 264 235 seen_segments: set[tuple[str, str, str]] = set() 265 236 ··· 272 243 samples.append( 273 244 { 274 245 **record, 275 - "similarity": round(float(similarities[position]), 4), 276 246 "audio_url": _audio_url( 277 247 record["day"], 278 248 record["stream"], ··· 289 259 record = provenance[int(cluster_indices[position])] 290 260 sample = { 291 261 **record, 292 - "similarity": round(float(similarities[position]), 4), 293 262 "audio_url": _audio_url( 294 263 record["day"], 295 264 record["stream"], ··· 317 286 { 318 287 "status": "candidate", 319 288 "cluster_size": cluster_size, 289 + "streams_represented": streams_represented, 290 + "recommendation": recommendation, 320 291 "samples": samples, 321 292 "detected_at": version, 322 293 }, 323 294 ) 324 295 325 - # Determine recommendation based on quality metrics 326 - if cluster_size >= 100 and len(cluster_streams) >= 3: 327 - recommendation = "strong_candidate" 328 - elif cluster_size >= 50 and len(cluster_streams) >= 2: 329 - recommendation = "good_candidate" 330 - else: 331 - recommendation = "weak_candidate" 332 - 333 296 return { 334 - "status": "candidate_found", 335 - "candidate": { 336 - "cluster_size": cluster_size, 337 - "mean_similarity": round(mean_similarity, 4), 338 - "threshold": OWNER_THRESHOLD, 339 - "streams": sorted(cluster_streams), 340 - "stream_diversity": len(cluster_streams), 341 - "sample_count": segment_count, 342 - "samples": samples, 343 - }, 297 + "status": "candidate", 298 + "cluster_size": cluster_size, 299 + "streams_represented": streams_represented, 344 300 "recommendation": recommendation, 301 + "samples": samples, 345 302 } 346 303 347 304
+1 -11
apps/speakers/routes.py
··· 1123 1123 1124 1124 if status in {"none", "rejected"}: 1125 1125 seg_count = count_segments_with_embeddings() 1126 - if seg_count >= 50: 1126 + if seg_count > 0: 1127 1127 return jsonify( 1128 1128 { 1129 1129 "status": "needs_detection", ··· 1139 1139 def api_owner_detect() -> Any: 1140 1140 """Run owner voice candidate detection.""" 1141 1141 result = detect_owner_candidate() 1142 - # Map new structured statuses back to the web UI expectations 1143 - status = result.get("status") 1144 - if status == "candidate_found": 1145 - # Flatten for web UI compatibility 1146 - candidate = result.get("candidate", {}) 1147 - return jsonify({ 1148 - "status": "candidate", 1149 - "cluster_size": candidate.get("cluster_size"), 1150 - "samples": candidate.get("samples", []), 1151 - }) 1152 1142 return jsonify(result) 1153 1143 1154 1144
+142 -364
apps/speakers/status.py
··· 1 1 # SPDX-License-Identifier: AGPL-3.0-only 2 2 # Copyright (c) 2026 sol pbc 3 3 4 - """Speaker ID status aggregation — read-only state inspection. 5 - 6 - Aggregates speaker identification state from disk into a structured 7 - JSON dashboard. No new computations — just reads existing files. 8 - """ 4 + """Speaker subsystem status aggregation.""" 9 5 10 6 from __future__ import annotations 11 7 ··· 14 10 from pathlib import Path 15 11 from typing import Any 16 12 17 - import numpy as np 18 - 19 - from think.utils import day_dirs, day_path, get_journal, iter_segments, segment_path 13 + from think.awareness import get_current 14 + from think.utils import day_dirs, get_journal 20 15 21 16 logger = logging.getLogger(__name__) 22 17 23 - VALID_SECTIONS = {"embeddings", "owner", "speakers", "clusters", "imports", "attribution"} 18 + SECTIONS = ("embeddings", "owner", "speakers", "clusters", "imports", "attribution") 24 19 25 20 26 - def _routes_helpers(): 27 - """Load speakers route helpers lazily to avoid import cycles.""" 28 - from apps.speakers.routes import ( 29 - _load_embeddings_file, 30 - _load_entity_voiceprints_file, 31 - _load_speaker_labels, 32 - _normalize_embedding, 33 - _scan_segment_embeddings, 34 - ) 21 + def get_speakers_status(section: str | None = None) -> Any: 22 + """Aggregate speaker subsystem status. 35 23 36 - return ( 37 - _load_embeddings_file, 38 - _load_entity_voiceprints_file, 39 - _load_speaker_labels, 40 - _normalize_embedding, 41 - _scan_segment_embeddings, 42 - ) 24 + Args: 25 + section: Optional section name to return. If None, returns all sections. 43 26 27 + Returns: 28 + Dict with all sections, or a single section's value if section is specified. 29 + """ 30 + builders = { 31 + "embeddings": _embeddings_section, 32 + "owner": _owner_section, 33 + "speakers": _speakers_section, 34 + "clusters": _clusters_section, 35 + "imports": _imports_section, 36 + "attribution": _attribution_section, 37 + } 44 38 45 - def _has_audio_embeddings(seg_path: Path) -> bool: 46 - """Return True if the segment has audio embedding NPZ files.""" 47 - for p in seg_path.glob("*.npz"): 48 - if p.stem.endswith("_audio") or p.stem == "audio": 49 - return True 50 - return False 39 + if section: 40 + builder = builders.get(section) 41 + if builder is None: 42 + return { 43 + "error": f"Unknown section '{section}'. Valid: {', '.join(SECTIONS)}" 44 + } 45 + return builder() 51 46 47 + return {name: builder() for name, builder in builders.items()} 52 48 53 - def get_embeddings_status() -> dict[str, Any]: 54 - """Aggregate embedding coverage statistics.""" 55 - ( 56 - load_embeddings_file, 57 - _, 58 - _, 59 - _, 60 - scan_segment_embeddings, 61 - ) = _routes_helpers() 62 49 63 - total_segments = 0 64 - segments_with_embeddings = 0 65 - total_embeddings = 0 66 - streams: dict[str, dict[str, int]] = {} 67 - days_with: set[str] = set() 50 + def _embeddings_section() -> dict[str, Any]: 51 + from apps.speakers.routes import _scan_segment_embeddings 68 52 69 - for day_name in sorted(day_dirs().keys()): 70 - segments = list(iter_segments(day_name)) 71 - total_segments += len(segments) 53 + segments = 0 54 + streams: dict[str, int] = {} 55 + days_seen: set[str] = set() 72 56 73 - for segment in scan_segment_embeddings(day_name): 74 - segments_with_embeddings += 1 75 - stream = segment["stream"] 76 - seg_key = segment["key"] 77 - seg_dir = segment_path(day_name, seg_key, stream) 57 + for day in day_dirs().keys(): 58 + day_segments = _scan_segment_embeddings(day) 59 + if day_segments: 60 + days_seen.add(day) 61 + for seg in day_segments: 62 + segments += 1 63 + stream = seg["stream"] 64 + streams[stream] = streams.get(stream, 0) + 1 78 65 79 - if stream not in streams: 80 - streams[stream] = {"segments": 0, "embeddings": 0} 81 - streams[stream]["segments"] += 1 82 - days_with.add(day_name) 83 - 84 - for source in segment["sources"]: 85 - emb_data = load_embeddings_file(seg_dir / f"{source}.npz") 86 - if emb_data is not None: 87 - count = len(emb_data[0]) 88 - total_embeddings += count 89 - streams[stream]["embeddings"] += count 90 - 91 - sorted_days = sorted(days_with) if days_with else [] 92 - 66 + sorted_days = sorted(days_seen) if days_seen else [] 93 67 return { 94 - "total_segments": total_segments, 95 - "segments_with_embeddings": segments_with_embeddings, 96 - "total_embeddings": total_embeddings, 97 - "coverage_pct": round( 98 - 100.0 * segments_with_embeddings / total_segments, 1 99 - ) 100 - if total_segments > 0 101 - else 0.0, 102 - "date_range": [sorted_days[0], sorted_days[-1]] if sorted_days else [], 103 - "days_with_embeddings": len(days_with), 104 - "streams": dict(sorted(streams.items(), key=lambda x: x[1]["embeddings"], reverse=True)), 68 + "segments": segments, 69 + "streams": streams, 70 + "days": len(sorted_days), 71 + "date_range": [sorted_days[0], sorted_days[-1]] if sorted_days else None, 105 72 } 106 73 107 74 108 - def get_owner_status() -> dict[str, Any]: 109 - """Aggregate owner centroid status.""" 75 + def _owner_section() -> dict[str, Any]: 110 76 from apps.speakers.owner import load_owner_centroid 111 - from think.entities.journal import get_journal_principal, journal_entity_memory_path 112 77 113 - principal = get_journal_principal() 114 - if not principal: 115 - return {"exists": False} 116 - 117 - centroid_data = load_owner_centroid() 118 - if centroid_data is None: 119 - return {"exists": False} 120 - 121 - # Load full centroid metadata 122 - centroid_path = journal_entity_memory_path(principal["id"]) / "owner_centroid.npz" 123 - result: dict[str, Any] = {"exists": True} 124 - 125 - try: 126 - data = np.load(centroid_path, allow_pickle=False) 127 - cluster_size = data.get("cluster_size") 128 - threshold = data.get("threshold") 129 - version = data.get("version") 130 - 131 - if cluster_size is not None: 132 - result["cluster_size"] = int(np.asarray(cluster_size).item()) 133 - if threshold is not None: 134 - result["threshold"] = round(float(np.asarray(threshold).item()), 2) 135 - if version is not None: 136 - result["version"] = str(np.asarray(version).item()) 137 - except Exception: 138 - pass 139 - 140 - # Estimate coverage: count how many embeddings match the owner centroid 141 - _, centroid_threshold = centroid_data 142 - owner_centroid = centroid_data[0] 143 - 144 - ( 145 - load_embeddings_file, 146 - _, 147 - _, 148 - normalize_embedding, 149 - scan_segment_embeddings, 150 - ) = _routes_helpers() 151 - 152 - total_embeddings = 0 153 - owner_matches = 0 154 - streams_represented: set[str] = set() 155 - 156 - for day_name in day_dirs().keys(): 157 - for segment in scan_segment_embeddings(day_name): 158 - stream = segment["stream"] 159 - seg_dir = segment_path(day_name, segment["key"], stream) 160 - for source in segment["sources"]: 161 - emb_data = load_embeddings_file(seg_dir / f"{source}.npz") 162 - if emb_data is None: 163 - continue 164 - embeddings, _ = emb_data 165 - total_embeddings += len(embeddings) 166 - for emb in embeddings: 167 - normalized = normalize_embedding(emb) 168 - if normalized is not None: 169 - score = float(np.dot(normalized, owner_centroid)) 170 - if score >= centroid_threshold: 171 - owner_matches += 1 172 - streams_represented.add(stream) 78 + voiceprint = get_current().get("voiceprint", {}) 79 + status = voiceprint.get("status", "none") 80 + result: dict[str, Any] = {"status": status} 173 81 174 - result["streams_represented"] = sorted(streams_represented) 175 - result["coverage_estimate_pct"] = ( 176 - round(100.0 * owner_matches / total_embeddings, 1) if total_embeddings > 0 else 0.0 177 - ) 82 + if status == "candidate": 83 + result["cluster_size"] = voiceprint.get("cluster_size") 84 + result["detected_at"] = voiceprint.get("detected_at") 85 + result["streams_represented"] = voiceprint.get("streams_represented") 86 + result["recommendation"] = voiceprint.get("recommendation") 87 + elif status == "no_cluster": 88 + result["segments_checked"] = voiceprint.get("segments_checked") 89 + result["attempted_at"] = voiceprint.get("attempted_at") 178 90 91 + result["centroid_saved"] = load_owner_centroid() is not None 179 92 return result 180 93 181 94 182 - def get_speakers_status() -> dict[str, Any]: 183 - """Aggregate known speaker statistics.""" 184 - from think.entities.journal import load_all_journal_entities 185 - 186 - _, load_entity_voiceprints_file, _, _, _ = _routes_helpers() 187 - 188 - journal_entities = load_all_journal_entities() 189 - speakers: list[dict[str, Any]] = [] 190 - total_voiceprint_embeddings = 0 95 + def _speakers_section() -> list[dict[str, Any]]: 96 + from apps.speakers.routes import _load_entity_voiceprints_file 97 + from think.entities.journal import scan_journal_entities 191 98 192 - for entity_id, entity in journal_entities.items(): 193 - if entity.get("blocked") or entity.get("is_principal"): 194 - continue 195 - 196 - result = load_entity_voiceprints_file(entity_id) 99 + speakers = [] 100 + for entity in scan_journal_entities(): 101 + entity_id = entity["id"] 102 + result = _load_entity_voiceprints_file(entity_id) 197 103 if result is None: 198 104 continue 199 105 200 106 embeddings, metadata_list = result 201 - embedding_count = len(embeddings) 202 - if embedding_count == 0: 203 - continue 204 - 205 - total_voiceprint_embeddings += embedding_count 206 - 207 - # Count unique segments and streams 208 - segments: set[tuple[str, str]] = set() 209 107 streams: set[str] = set() 210 - for m in metadata_list: 211 - day = m.get("day", "") 212 - seg_key = m.get("segment_key", "") 213 - stream = m.get("stream", "") 214 - if day and seg_key: 215 - segments.add((day, seg_key)) 216 - if stream: 217 - streams.add(stream) 218 - 219 - # Derive confidence rating 220 - stream_count = len(streams) 221 - if embedding_count >= 100 and stream_count >= 3: 222 - confidence = "strong" 223 - elif embedding_count >= 20 or stream_count >= 2: 224 - confidence = "moderate" 225 - else: 226 - confidence = "developing" 227 - 228 - speakers.append({ 229 - "entity_id": entity_id, 230 - "name": entity.get("name", entity_id), 231 - "embeddings": embedding_count, 232 - "segments": len(segments), 233 - "streams": stream_count, 234 - "confidence": confidence, 235 - }) 108 + segments: set[tuple[str, str]] = set() 109 + for metadata in metadata_list: 110 + if "stream" in metadata: 111 + streams.add(metadata["stream"]) 112 + segments.add((metadata.get("day", ""), metadata.get("segment_key", ""))) 236 113 237 - # Sort by embedding count descending 238 - speakers.sort(key=lambda s: s["embeddings"], reverse=True) 114 + speakers.append( 115 + { 116 + "entity_id": entity_id, 117 + "name": entity.get("name", entity_id), 118 + "embedding_count": len(embeddings), 119 + "segment_count": len(segments), 120 + "streams": sorted(streams), 121 + } 122 + ) 239 123 240 - return { 241 - "total": len(speakers), 242 - "total_voiceprint_embeddings": total_voiceprint_embeddings, 243 - "top": speakers[:10], 244 - } 124 + return speakers 245 125 246 126 247 - def get_clusters_status() -> dict[str, Any]: 248 - """Aggregate discovery cluster statistics from cache.""" 127 + def _clusters_section() -> dict[str, Any] | None: 249 128 cache_path = Path(get_journal()) / "awareness" / "discovery_clusters.json" 250 - 251 129 if not cache_path.exists(): 252 - return { 253 - "total_unmatched": 0, 254 - "candidate_count": 0, 255 - "candidates": [], 256 - } 257 - 130 + return None 258 131 try: 259 - with open(cache_path, encoding="utf-8") as f: 260 - cache_data = json.load(f) 261 - except (json.JSONDecodeError, OSError): 132 + data = json.loads(cache_path.read_text()) 133 + clusters = data.get("clusters", []) 262 134 return { 263 - "total_unmatched": 0, 264 - "candidate_count": 0, 265 - "candidates": [], 135 + "cached_at": data.get("version"), 136 + "count": len(clusters), 137 + "clusters": clusters, 266 138 } 139 + except Exception: 140 + logger.warning("Failed to read discovery cache", exc_info=True) 141 + return None 267 142 268 - clusters = cache_data.get("clusters", {}) 269 - total_unmatched = sum(len(members) for members in clusters.values()) 270 143 271 - candidates: list[dict[str, Any]] = [] 272 - for cluster_id, members in clusters.items(): 273 - segment_set = { 274 - (m["day"], m["stream"], m["segment_key"]) for m in members 275 - } 276 - # Get a preview from the first member's text if possible 277 - preview = "" 278 - if members: 279 - first = members[0] 280 - seg_dir = segment_path(first["day"], first["segment_key"], first["stream"]) 281 - jsonl_path = seg_dir / f"{first['source']}.jsonl" 282 - if jsonl_path.exists(): 283 - try: 284 - lines = jsonl_path.read_text(encoding="utf-8").splitlines() 285 - sid = int(first.get("sentence_id", 0)) 286 - if 0 < sid < len(lines): 287 - entry = json.loads(lines[sid]) 288 - preview = (entry.get("text") or "")[:80] 289 - except Exception: 290 - pass 144 + def _imports_section() -> dict[str, Any]: 145 + journal = Path(get_journal()) 146 + meetings = 0 147 + screens = 0 291 148 292 - candidates.append({ 293 - "cluster_id": int(cluster_id), 294 - "size": len(members), 295 - "segment_count": len(segment_set), 296 - "preview": preview, 297 - }) 149 + for day in day_dirs().keys(): 150 + day_dir = journal / day 151 + if not day_dir.is_dir(): 152 + continue 153 + for stream_dir in sorted(day_dir.iterdir()): 154 + if not stream_dir.is_dir(): 155 + continue 156 + for seg_dir in sorted(stream_dir.iterdir()): 157 + if not seg_dir.is_dir(): 158 + continue 159 + if (seg_dir / "meetings.md").exists(): 160 + meetings += 1 161 + if (seg_dir / "screen.md").exists(): 162 + screens += 1 298 163 299 - candidates.sort(key=lambda c: c["size"], reverse=True) 164 + return {"meetings_files": meetings, "screen_files": screens} 300 165 301 - return { 302 - "total_unmatched": total_unmatched, 303 - "candidate_count": len(candidates), 304 - "candidates": candidates, 305 - } 306 166 167 + def _attribution_section() -> dict[str, Any]: 168 + journal = Path(get_journal()) 169 + total_files = 0 170 + total_labels = 0 171 + by_confidence: dict[str, int] = {} 172 + by_method: dict[str, int] = {} 307 173 308 - def get_imports_status() -> dict[str, Any]: 309 - """Aggregate import signal statistics.""" 310 - from apps.speakers.attribution import ( 311 - _extract_meeting_participants, 312 - _extract_screen_participants, 313 - _load_setting_field, 314 - _parse_setting_names, 315 - ) 316 - 317 - settings_with_participants = 0 318 - meetings_with_attendees = 0 319 - screen_with_participants = 0 320 - 321 - seen_meeting_days: set[str] = set() 322 - 323 - for day_name in day_dirs().keys(): 324 - for stream, seg_key, seg_path in iter_segments(day_name): 325 - # Check setting field 326 - setting = _load_setting_field(seg_path) 327 - if setting: 328 - names = _parse_setting_names(setting) 329 - if names: 330 - settings_with_participants += 1 331 - 332 - # Check screen.md 333 - screen_names = _extract_screen_participants(seg_path) 334 - if screen_names: 335 - screen_with_participants += 1 336 - 337 - # Check meetings.md (once per day) 338 - if day_name not in seen_meeting_days: 339 - meeting_names = _extract_meeting_participants(day_name, seg_key) 340 - if meeting_names: 341 - meetings_with_attendees += 1 342 - seen_meeting_days.add(day_name) 343 - 344 - return { 345 - "settings_with_participants": settings_with_participants, 346 - "meetings_with_attendees": meetings_with_attendees, 347 - "screen_with_participants": screen_with_participants, 348 - } 349 - 350 - 351 - def get_attribution_status() -> dict[str, Any]: 352 - """Aggregate attribution coverage statistics.""" 353 - _, _, load_speaker_labels, _, scan_segment_embeddings = _routes_helpers() 354 - 355 - segments_with_embeddings = 0 356 - segments_with_labels = 0 357 - total_sentences = 0 358 - high_count = 0 359 - medium_count = 0 360 - null_count = 0 361 - method_breakdown: dict[str, int] = {} 362 - 363 - for day_name in day_dirs().keys(): 364 - for segment in scan_segment_embeddings(day_name): 365 - segments_with_embeddings += 1 366 - seg_dir = segment_path(day_name, segment["key"], segment["stream"]) 367 - labels_data = load_speaker_labels(seg_dir) 368 - if labels_data is None: 174 + for day in day_dirs().keys(): 175 + day_dir = journal / day 176 + if not day_dir.is_dir(): 177 + continue 178 + for stream_dir in sorted(day_dir.iterdir()): 179 + if not stream_dir.is_dir(): 369 180 continue 370 - 371 - segments_with_labels += 1 372 - for label in labels_data.get("labels", []): 373 - total_sentences += 1 374 - confidence = label.get("confidence") 375 - method = label.get("method") or "unmatched" 376 - 377 - if confidence == "high": 378 - high_count += 1 379 - elif confidence == "medium": 380 - medium_count += 1 381 - else: 382 - null_count += 1 383 - 384 - method_breakdown[method] = method_breakdown.get(method, 0) + 1 181 + for seg_dir in sorted(stream_dir.iterdir()): 182 + if not seg_dir.is_dir(): 183 + continue 184 + labels_file = seg_dir / "agents" / "speaker_labels.json" 185 + if not labels_file.exists(): 186 + continue 187 + try: 188 + data = json.loads(labels_file.read_text()) 189 + except Exception: 190 + continue 191 + total_files += 1 192 + for label in data.get("labels", []): 193 + total_labels += 1 194 + confidence = label.get("confidence", "unknown") 195 + method = label.get("method", "unknown") 196 + by_confidence[confidence] = by_confidence.get(confidence, 0) + 1 197 + by_method[method] = by_method.get(method, 0) + 1 385 198 386 199 return { 387 - "segments_with_labels": segments_with_labels, 388 - "coverage_pct": round( 389 - 100.0 * segments_with_labels / segments_with_embeddings, 1 390 - ) 391 - if segments_with_embeddings > 0 392 - else 0.0, 393 - "total_sentences": total_sentences, 394 - "high": high_count, 395 - "medium": medium_count, 396 - "null": null_count, 397 - "needs_review": medium_count + null_count, 398 - "method_breakdown": dict(sorted(method_breakdown.items(), key=lambda x: x[1], reverse=True)), 200 + "files": total_files, 201 + "labels": total_labels, 202 + "by_confidence": by_confidence, 203 + "by_method": by_method, 399 204 } 400 - 401 - 402 - def get_status(section: str | None = None) -> dict[str, Any]: 403 - """Return full speaker ID status or a single section. 404 - 405 - Args: 406 - section: Optional section name to return. If None, returns all sections. 407 - 408 - Returns: 409 - Dict with all six sections, or just the requested section. 410 - """ 411 - if section and section not in VALID_SECTIONS: 412 - return {"error": f"Unknown section: {section}. Valid: {', '.join(sorted(VALID_SECTIONS))}"} 413 - 414 - builders = { 415 - "embeddings": get_embeddings_status, 416 - "owner": get_owner_status, 417 - "speakers": get_speakers_status, 418 - "clusters": get_clusters_status, 419 - "imports": get_imports_status, 420 - "attribution": get_attribution_status, 421 - } 422 - 423 - if section: 424 - return {section: builders[section]()} 425 - 426 - return {name: builder() for name, builder in builders.items()}
+13 -9
apps/speakers/tests/test_owner.py
··· 110 110 ) 111 111 112 112 result = detect_owner_candidate() 113 - assert result["status"] == "insufficient_data" 113 + assert result["status"] == "low_data" 114 114 assert result["segments_available"] == 10 115 - assert "recommendation" in result 115 + assert result["embeddings_available"] == 10 116 + assert result["recommendation"] == "low_data" 116 117 117 118 118 119 def test_detect_owner_no_cluster(speakers_env): ··· 132 133 ) 133 134 134 135 result = detect_owner_candidate() 135 - assert result["status"] == "no_cluster" 136 - assert "recommendation" in result 136 + assert result["status"] == "no_clusters" 137 + assert result["segments_available"] == 50 138 + assert result["recommendation"] == "no_clusters" 137 139 assert get_current()["voiceprint"]["status"] == "no_cluster" 138 140 139 141 ··· 172 174 result = detect_owner_candidate() 173 175 174 176 assert result is not None 175 - assert result["status"] == "candidate_found" 176 - assert result["candidate"]["cluster_size"] >= 50 177 - assert len(result["candidate"]["samples"]) == 3 178 - assert "recommendation" in result 179 - assert result["recommendation"] in ("strong_candidate", "good_candidate", "weak_candidate") 177 + assert result["status"] == "candidate" 178 + assert result["cluster_size"] >= 50 179 + assert result["streams_represented"] == 2 180 + assert result["recommendation"] == "ready" 181 + assert len(result["samples"]) == 3 180 182 assert _candidate_path(env.journal).exists() 181 183 assert get_current()["voiceprint"]["status"] == "candidate" 182 184 ··· 461 463 assert response.status_code == 200 462 464 assert data["status"] == "candidate" 463 465 assert data["cluster_size"] >= 50 466 + assert "streams_represented" in data 467 + assert "recommendation" in data
+42 -224
apps/speakers/tests/test_status.py
··· 1 1 # SPDX-License-Identifier: AGPL-3.0-only 2 2 # Copyright (c) 2026 sol pbc 3 3 4 - """Tests for the speakers status command.""" 4 + """Tests for speaker subsystem status.""" 5 5 6 6 from __future__ import annotations 7 7 8 - import json 9 - from pathlib import Path 10 8 11 - import numpy as np 12 - 13 - from think.entities import entity_slug 14 - 15 - 16 - def _normalized(vector: np.ndarray) -> np.ndarray: 17 - return vector / np.linalg.norm(vector) 18 - 19 - 20 - def test_status_empty_journal(speakers_env): 21 - from apps.speakers.status import get_status 9 + def test_status_all_sections(speakers_env): 10 + from apps.speakers.status import get_speakers_status 22 11 23 12 speakers_env() 24 - result = get_status() 25 - 13 + result = get_speakers_status() 26 14 assert "embeddings" in result 27 15 assert "owner" in result 28 16 assert "speakers" in result ··· 30 18 assert "imports" in result 31 19 assert "attribution" in result 32 20 33 - assert result["embeddings"]["total_segments"] == 0 34 - assert result["embeddings"]["segments_with_embeddings"] == 0 35 - assert result["owner"]["exists"] is False 36 - assert result["speakers"]["total"] == 0 37 - assert result["clusters"]["candidate_count"] == 0 38 - assert result["attribution"]["segments_with_labels"] == 0 39 21 40 - 41 - def test_status_section_filter(speakers_env): 42 - from apps.speakers.status import get_status 22 + def test_status_single_section(speakers_env): 23 + from apps.speakers.status import get_speakers_status 43 24 44 25 speakers_env() 45 - result = get_status(section="embeddings") 26 + result = get_speakers_status(section="owner") 27 + assert "status" in result 28 + assert "centroid_saved" in result 46 29 47 - assert "embeddings" in result 48 - assert "owner" not in result 49 30 50 - 51 - def test_status_invalid_section(speakers_env): 52 - from apps.speakers.status import get_status 31 + def test_status_unknown_section(speakers_env): 32 + from apps.speakers.status import get_speakers_status 53 33 54 34 speakers_env() 55 - result = get_status(section="nonexistent") 56 - 35 + result = get_speakers_status(section="nonexistent") 57 36 assert "error" in result 58 37 59 38 60 - def test_status_embeddings(speakers_env): 61 - from apps.speakers.status import get_status 62 - 63 - env = speakers_env() 64 - env.create_segment("20240101", "090000_300", ["mic_audio"], num_sentences=10) 65 - env.create_segment("20240101", "100000_300", ["mic_audio"], num_sentences=5) 66 - env.create_segment("20240102", "090000_300", ["mic_audio"], num_sentences=8) 67 - 68 - result = get_status(section="embeddings") 69 - emb = result["embeddings"] 70 - 71 - assert emb["segments_with_embeddings"] == 3 72 - assert emb["total_embeddings"] == 23 73 - assert emb["days_with_embeddings"] == 2 74 - assert len(emb["date_range"]) == 2 75 - assert emb["date_range"][0] == "20240101" 76 - assert emb["date_range"][1] == "20240102" 77 - assert "test" in emb["streams"] 78 - assert emb["streams"]["test"]["segments"] == 3 79 - assert emb["streams"]["test"]["embeddings"] == 23 80 - 81 - 82 - def test_status_owner_no_centroid(speakers_env): 83 - from apps.speakers.status import get_status 84 - 85 - speakers_env() 86 - result = get_status(section="owner") 87 - assert result["owner"]["exists"] is False 88 - 89 - 90 - def test_status_owner_with_centroid(speakers_env): 91 - from apps.speakers.owner import OWNER_THRESHOLD 92 - from apps.speakers.status import get_status 93 - 94 - env = speakers_env() 95 - # Create principal entity with owner centroid 96 - principal_dir = env.create_entity("Self Person", is_principal=True) 97 - centroid = _normalized(np.array([1.0] + [0.0] * 255, dtype=np.float32)) 98 - np.savez_compressed( 99 - principal_dir / "owner_centroid.npz", 100 - centroid=centroid, 101 - cluster_size=np.array(100, dtype=np.int32), 102 - threshold=np.array(OWNER_THRESHOLD, dtype=np.float32), 103 - version=np.array("2026-03-15T10:30:00Z"), 104 - ) 105 - 106 - # Create a segment with embeddings similar to owner 107 - close_embs = np.tile( 108 - _normalized(np.array([0.95, 0.05] + [0.0] * 254, dtype=np.float32)), 109 - (5, 1), 110 - ) 111 - env.create_segment("20240101", "090000_300", ["mic_audio"], embeddings=close_embs) 112 - 113 - result = get_status(section="owner") 114 - owner = result["owner"] 115 - 116 - assert owner["exists"] is True 117 - assert owner["cluster_size"] == 100 118 - assert owner["threshold"] == OWNER_THRESHOLD 119 - assert owner["version"] == "2026-03-15T10:30:00Z" 120 - 121 - 122 - def test_status_speakers(speakers_env): 123 - from apps.speakers.status import get_status 124 - 125 - env = speakers_env() 126 - 127 - # Create entities with voiceprints 128 - env.create_entity( 129 - "Alice Test", 130 - voiceprints=[ 131 - ("20240101", "090000_300", "mic_audio", 1), 132 - ("20240101", "090000_300", "mic_audio", 2), 133 - ("20240101", "090000_300", "mic_audio", 3), 134 - ], 135 - ) 136 - env.create_entity( 137 - "Bob Test", 138 - voiceprints=[ 139 - ("20240101", "100000_300", "mic_audio", 1), 140 - ], 141 - ) 142 - 143 - result = get_status(section="speakers") 144 - spk = result["speakers"] 145 - 146 - assert spk["total"] == 2 147 - assert spk["total_voiceprint_embeddings"] == 4 148 - assert len(spk["top"]) == 2 149 - assert spk["top"][0]["name"] == "Alice Test" 150 - assert spk["top"][0]["embeddings"] == 3 151 - assert spk["top"][0]["confidence"] == "developing" 152 - 153 - 154 - def test_status_clusters_empty(speakers_env): 155 - from apps.speakers.status import get_status 156 - 157 - speakers_env() 158 - result = get_status(section="clusters") 159 - 160 - assert result["clusters"]["total_unmatched"] == 0 161 - assert result["clusters"]["candidate_count"] == 0 162 - 163 - 164 - def test_status_clusters_with_cache(speakers_env): 165 - from apps.speakers.status import get_status 39 + def test_status_embeddings_with_data(speakers_env): 40 + from apps.speakers.status import get_speakers_status 166 41 167 42 env = speakers_env() 168 - # Create a segment so transcript text lookup doesn't crash 169 - env.create_segment("20240101", "090000_300", ["mic_audio"], num_sentences=5) 43 + env.create_segment("20240101", "090000_300", ["mic_audio"]) 44 + env.create_segment("20240101", "091000_300", ["sys_audio"]) 45 + env.create_segment("20240102", "090000_300", ["audio"]) 170 46 171 - # Write a discovery cache file 172 - cache_dir = env.journal / "awareness" 173 - cache_dir.mkdir(parents=True, exist_ok=True) 174 - cache_data = { 175 - "version": "2026-03-15T10:00:00", 176 - "clusters": { 177 - "0": [ 178 - { 179 - "day": "20240101", 180 - "stream": "test", 181 - "segment_key": "090000_300", 182 - "source": "mic_audio", 183 - "sentence_id": 1, 184 - }, 185 - { 186 - "day": "20240101", 187 - "stream": "test", 188 - "segment_key": "090000_300", 189 - "source": "mic_audio", 190 - "sentence_id": 2, 191 - }, 192 - ], 193 - }, 194 - } 195 - with open(cache_dir / "discovery_clusters.json", "w") as f: 196 - json.dump(cache_data, f) 47 + result = get_speakers_status(section="embeddings") 48 + assert result["segments"] == 3 49 + assert result["days"] == 2 50 + assert result["date_range"] == ["20240101", "20240102"] 197 51 198 - result = get_status(section="clusters") 199 - clusters = result["clusters"] 200 52 201 - assert clusters["total_unmatched"] == 2 202 - assert clusters["candidate_count"] == 1 203 - assert clusters["candidates"][0]["cluster_id"] == 0 204 - assert clusters["candidates"][0]["size"] == 2 205 - 206 - 207 - def test_status_attribution(speakers_env): 208 - from apps.speakers.status import get_status 53 + def test_status_attribution_with_labels(speakers_env): 54 + from apps.speakers.status import get_speakers_status 209 55 210 56 env = speakers_env() 211 - env.create_segment("20240101", "090000_300", ["mic_audio"], num_sentences=5) 212 - env.create_segment("20240101", "100000_300", ["mic_audio"], num_sentences=3) 213 - 214 - # Create labels for one segment 215 57 env.create_speaker_labels( 216 58 "20240101", 217 59 "090000_300", 218 60 [ 219 - {"sentence_id": 1, "speaker": "alice", "confidence": "high", "method": "owner_centroid"}, 220 - {"sentence_id": 2, "speaker": "alice", "confidence": "high", "method": "owner_centroid"}, 221 - {"sentence_id": 3, "speaker": "bob", "confidence": "medium", "method": "acoustic"}, 222 - {"sentence_id": 4, "speaker": None, "confidence": None, "method": None}, 223 - {"sentence_id": 5, "speaker": "alice", "confidence": "high", "method": "structural_single_speaker"}, 61 + { 62 + "sentence_id": 1, 63 + "speaker": "alice", 64 + "confidence": "high", 65 + "method": "voiceprint", 66 + }, 67 + { 68 + "sentence_id": 2, 69 + "speaker": None, 70 + "confidence": "low", 71 + "method": "unmatched", 72 + }, 224 73 ], 225 74 ) 226 75 227 - result = get_status(section="attribution") 228 - attr = result["attribution"] 229 - 230 - assert attr["segments_with_labels"] == 1 231 - assert attr["total_sentences"] == 5 232 - assert attr["high"] == 3 233 - assert attr["medium"] == 1 234 - assert attr["null"] == 1 235 - assert attr["needs_review"] == 2 236 - assert "owner_centroid" in attr["method_breakdown"] 237 - 238 - 239 - def test_status_full_returns_all_sections(speakers_env): 240 - from apps.speakers.status import get_status 241 - 242 - env = speakers_env() 243 - env.create_segment("20240101", "090000_300", ["mic_audio"]) 244 - 245 - result = get_status() 246 - 247 - assert len(result) == 6 248 - for section in ("embeddings", "owner", "speakers", "clusters", "imports", "attribution"): 249 - assert section in result 250 - 251 - 252 - def test_status_json_serializable(speakers_env): 253 - """Ensure the full status output can be serialized to JSON.""" 254 - from apps.speakers.status import get_status 255 - 256 - env = speakers_env() 257 - env.create_segment("20240101", "090000_300", ["mic_audio"]) 258 - 259 - result = get_status() 260 - # This will raise if any value is not JSON-serializable 261 - serialized = json.dumps(result) 262 - assert isinstance(serialized, str) 263 - parsed = json.loads(serialized) 264 - assert "embeddings" in parsed 76 + result = get_speakers_status(section="attribution") 77 + assert result["files"] == 1 78 + assert result["labels"] == 2 79 + assert result["by_confidence"]["high"] == 1 80 + assert result["by_confidence"]["low"] == 1 81 + assert result["by_method"]["voiceprint"] == 1 82 + assert result["by_method"]["unmatched"] == 1