apps/speakers: wespeaker thresholds + owner-bootstrap quality gates

solpbc.org / solstone

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

personal memory agent

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

apps/speakers: wespeaker thresholds + owner-bootstrap quality gates

Ship spec §1–§3 from cpo/specs/in-flight/speaker-attribution-wespeaker.md.
§4 (noisy-segment overlap guard) is deferred pending CPO spec revision.

- New apps/speakers/encoder_config.py with seven LOCKED constants
(ENCODER_ID="wespeaker-resnet34-256", OWNER_THRESHOLD=0.43,
ACOUSTIC_HIGH=0.36, ACOUSTIC_MEDIUM=0.22, OWNER_BOOTSTRAP_MIN_STMTS=30,
OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S=1.5,
OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25=0.30).
- owner.py and attribution.py import constants from encoder_config.py;
resemblyzer-era literals removed.
- _embed_statements persists durations_s in lockstep with embeddings into
the per-segment .npz; _load_embeddings_file returns a 3-tuple and
every caller is updated (no compat shim).
- detect_owner_candidate adds three post-cluster quality gates
(too_few_stmts, median_duration_too_short, cluster_too_diffuse) that
defer candidate lock-in, returning recommendation="low_quality" with the
LOCKED diagnostic shape (low_quality_reason, observed_value,
threshold_value); the awareness state surfaces low_quality
through api_owner_status, _owner_section, and the workspace UI
empty-state copy.
- Tests cover the encoder_config import contract, all three gate
paths, the 3-tuple loader (with and without durations_s), and the
low_quality api_owner_status branch.

Pre-existing flaky-fixture and voiceprints.npz metadata-serialization
bugs surfaced by the speaker test gate are fixed minimally to keep
make test-app APP=speakers green.

Co-Authored-By: OpenAI Codex <codex@openai.com>

Jer Miller 1 month ago 2e41d943 035a0173

+493 -77

17 changed files

expand all collapse all

apps

speakers

attribution.py

bootstrap.py

discovery.py

encoder_config.py

owner.py

routes.py

status.py

tests

conftest.py

test_attribution.py

test_discovery.py

test_encoder_config.py

test_owner.py

test_routes.py

test_suggest.py

workspace.html

observe

transcribe

main.py

think

entities

voiceprints.py

+6 -9

apps/speakers/attribution.py

reviewed

··· 6 6 Runs per-segment after transcription and embedding. Operates in layers 7 7 from cheapest to most expensive: 8 8 9 9 - Layer 1: Owner separation (cosine similarity to owner centroid >= 0.82) 9 9 + Layer 1: Owner separation (cosine similarity to owner centroid passes Layer 1) 10 10 Layer 2: Structural heuristics (speaker count, setting field, screen.md, 11 11 meetings.md) — no LLM 12 12 Layer 3: Acoustic matching (voiceprint cosine similarity, same-stream ··· 28 28 29 29 import numpy as np 30 30 31 31 + from apps.speakers.encoder_config import ACOUSTIC_HIGH, ACOUSTIC_MEDIUM 31 32 from apps.speakers.owner import load_owner_centroid 32 33 from think.entities import find_matching_entity 33 34 from think.entities.journal import ( ··· 38 39 from think.utils import day_path, now_ms, segment_path 39 40 40 41 logger = logging.getLogger(__name__) 41 41 - 42 42 - # Acoustic matching thresholds (from spec) 43 43 - ACOUSTIC_HIGH = 0.70 44 44 - ACOUSTIC_MEDIUM = 0.50 45 42 46 43 47 44 def _routes_helpers(): ··· 235 232 if emb_data is None: 236 233 return {"labels": [], "unmatched": [], "source": source, "metadata": {}} 237 234 238 238 - embeddings, statement_ids = emb_data 235 235 + embeddings, statement_ids, _ = emb_data 239 236 if len(embeddings) == 0: 240 237 return {"labels": [], "unmatched": [], "source": source, "metadata": {}} 241 238 ··· 538 535 539 536 Eligibility: 540 537 - Layer 2 structural attributions (high confidence) 541 541 - - Layer 3 acoustic attributions with confidence "high" (>= 0.7) 538 538 + - Layer 3 acoustic attributions with confidence "high" 542 539 543 540 Guards: 544 541 - Owner contamination: never save embeddings with owner similarity 545 545 - >= OWNER_THRESHOLD to non-owner voiceprints 542 542 + above the owner threshold to non-owner voiceprints 546 543 - Idempotent: checks existing voiceprint keys before saving 547 544 548 545 Returns dict mapping entity_id -> number of new embeddings saved. 
··· 569 566 if emb_data is None: 570 567 return {} 571 568 572 572 - embeddings, statement_ids = emb_data 569 569 + embeddings, statement_ids, _ = emb_data 573 570 sid_to_idx = {int(s): i for i, s in enumerate(statement_ids)} 574 571 575 572 # Eligible methods for accumulation

+2 -2

apps/speakers/bootstrap.py

reviewed

··· 155 155 if emb_data is None: 156 156 continue 157 157 158 158 - embeddings, statement_ids = emb_data 158 158 + embeddings, statement_ids, _ = emb_data 159 159 160 160 for embedding, sid in zip(embeddings, statement_ids): 161 161 sentence_id = int(sid) ··· 646 646 if emb_data is None: 647 647 continue 648 648 649 649 - embeddings, statement_ids = emb_data 649 649 + embeddings, statement_ids, _ = emb_data 650 650 651 651 for embedding, sid in zip(embeddings, statement_ids): 652 652 sentence_id = int(sid)

+2 -2

apps/speakers/discovery.py

reviewed

··· 128 128 if emb_data is None: 129 129 continue 130 130 131 131 - embeddings, statement_ids = emb_data 131 131 + embeddings, statement_ids, _ = emb_data 132 132 if len(embeddings) == 0: 133 133 continue 134 134 ··· 390 390 if emb_data is None: 391 391 continue 392 392 393 393 - embeddings, statement_ids = emb_data 393 393 + embeddings, statement_ids, _ = emb_data 394 394 emb_vec = None 395 395 for emb, sid in zip(embeddings, statement_ids): 396 396 if int(sid) == sentence_id:

+14

apps/speakers/encoder_config.py

reviewed

··· 1 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 2 + # Copyright (c) 2026 sol pbc 3 3 + 4 4 + """Encoder-specific calibration constants. Source of truth for owner-separation, acoustic-match, and owner-bootstrap quality thresholds. Changes require CPO spec revision; see cpo/specs/in-flight/speaker-attribution-wespeaker.md.""" 5 5 + 6 6 + ENCODER_ID: str = "wespeaker-resnet34-256" 7 7 + 8 8 + OWNER_THRESHOLD: float = 0.43 9 9 + ACOUSTIC_HIGH: float = 0.36 10 10 + ACOUSTIC_MEDIUM: float = 0.22 11 11 + 12 12 + OWNER_BOOTSTRAP_MIN_STMTS: int = 30 13 13 + OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S: float = 1.5 14 14 + OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25: float = 0.30

+162 -6

apps/speakers/owner.py

reviewed

··· 5 5 6 6 from __future__ import annotations 7 7 8 8 + import json 8 9 import logging 9 10 from datetime import datetime 10 11 from pathlib import Path ··· 13 14 import numpy as np 14 15 from sklearn.cluster import HDBSCAN 15 16 17 17 + from apps.speakers.encoder_config import ( 18 18 + OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25, 19 19 + OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S, 20 20 + OWNER_BOOTSTRAP_MIN_STMTS, 21 21 + OWNER_THRESHOLD, 22 22 + ) 16 23 from think.awareness import update_state 17 24 from think.entities.journal import get_journal_principal, journal_entity_memory_path 18 25 from think.utils import day_dirs, get_journal, segment_path ··· 20 27 logger = logging.getLogger(__name__) 21 28 22 29 MAX_EMBEDDINGS = 30000 23 23 - OWNER_THRESHOLD = 0.82 24 30 25 31 26 32 def _mark_no_cluster(segment_count: int) -> None: ··· 35 41 ) 36 42 37 43 44 44 + def _mark_low_quality( 45 45 + reason: str, observed: float, threshold: float, segment_count: int 46 46 + ) -> None: 47 47 + """Record that detection found a cluster, but it failed quality gates.""" 48 48 + update_state( 49 49 + "voiceprint", 50 50 + { 51 51 + "status": "low_quality", 52 52 + "low_quality_reason": reason, 53 53 + "observed_value": float(observed), 54 54 + "threshold_value": float(threshold), 55 55 + "segments_checked": int(segment_count), 56 56 + "attempted_at": _iso_now(), 57 57 + }, 58 58 + ) 59 59 + 60 60 + 61 61 + def _bail_low_quality( 62 62 + reason: str, 63 63 + observed: float, 64 64 + threshold: float, 65 65 + segment_count: int, 66 66 + embeddings_count: int, 67 67 + ) -> dict[str, Any]: 68 68 + """Record and return a locked low-quality owner detection result.""" 69 69 + _mark_low_quality(reason, observed, threshold, segment_count) 70 70 + return { 71 71 + "status": "low_quality", 72 72 + "recommendation": "low_quality", 73 73 + "segments_available": int(segment_count), 74 74 + "embeddings_available": int(embeddings_count), 75 75 + "low_quality_reason": reason, 76 76 + "observed_value": 
float(observed), 77 77 + "threshold_value": float(threshold), 78 78 + } 79 79 + 80 80 + 81 81 + def _pairwise_cosines(embeddings: np.ndarray) -> np.ndarray: 82 82 + """Return pairwise cosine similarities for a cluster of embeddings.""" 83 83 + n = embeddings.shape[0] 84 84 + if n < 2: 85 85 + return np.empty(0, dtype=np.float32) 86 86 + norms = np.linalg.norm(embeddings, axis=1, keepdims=True) 87 87 + norms = np.where(norms < 1e-9, 1.0, norms) 88 88 + e_norm = embeddings / norms 89 89 + if n > 5000: 90 90 + rng = np.random.default_rng(seed=0) 91 91 + i = rng.integers(0, n, size=1000) 92 92 + j = rng.integers(0, n, size=1000) 93 93 + mask = i != j 94 94 + i = i[mask] 95 95 + j = j[mask] 96 96 + return np.einsum("ij,ij->i", e_norm[i], e_norm[j]).astype( 97 97 + np.float32, copy=False 98 98 + ) 99 99 + sim = e_norm @ e_norm.T 100 100 + iu = np.triu_indices(n, k=1) 101 101 + return sim[iu].astype(np.float32, copy=False) 102 102 + 103 103 + 38 104 def _routes_helpers(): 39 105 """Load speakers route helpers lazily to avoid import cycles.""" 40 106 from apps.speakers.routes import ( ··· 63 129 return f"/app/speakers/api/serve_audio/{day}/{stream}/{segment_key}/{source}.flac" 64 130 65 131 132 132 + def _fallback_statement_durations(jsonl_path: Path) -> dict[int, float | None]: 133 133 + """Estimate statement durations from adjacent transcript start times.""" 134 134 + if not jsonl_path.exists(): 135 135 + return {} 136 136 + 137 137 + starts: list[tuple[int, int]] = [] 138 138 + try: 139 139 + with open(jsonl_path, encoding="utf-8") as f: 140 140 + lines = f.readlines() 141 141 + except OSError: 142 142 + return {} 143 143 + 144 144 + for sentence_id, line in enumerate(lines[1:], start=1): 145 145 + try: 146 146 + entry = json.loads(line) 147 147 + except json.JSONDecodeError: 148 148 + continue 149 149 + start = entry.get("start") 150 150 + if not isinstance(start, str): 151 151 + continue 152 152 + try: 153 153 + hours, minutes, seconds = (int(part) for part in 
start.split(":", 2)) 154 154 + except ValueError: 155 155 + continue 156 156 + starts.append((sentence_id, hours * 3600 + minutes * 60 + seconds)) 157 157 + 158 158 + durations: dict[int, float | None] = {} 159 159 + for idx, (sentence_id, start_seconds) in enumerate(starts): 160 160 + next_start = starts[idx + 1][1] if idx + 1 < len(starts) else None 161 161 + # Why: older transcript JSONL files only persist statement starts, so 162 162 + # we estimate legacy durations from adjacent sentence boundaries. 163 163 + durations[sentence_id] = ( 164 164 + None if next_start is None else float(next_start - start_seconds) 165 165 + ) 166 166 + return durations 167 167 + 168 168 + 66 169 def count_segments_with_embeddings() -> int: 67 170 """Count all journal segments that contain audio embedding files.""" 68 171 _, _, scan_segment_embeddings = _routes_helpers() ··· 147 250 if emb_data is None: 148 251 continue 149 252 150 150 - embeddings, statement_ids = emb_data 253 253 + embeddings, statement_ids, durations_data = emb_data 151 254 if len(embeddings) == 0: 152 255 continue 153 256 257 257 + fallback_durations = ( 258 258 + {} 259 259 + if durations_data is not None 260 260 + else _fallback_statement_durations(segment_dir / f"{source}.jsonl") 261 261 + ) 154 262 embedding_chunks.append(embeddings.astype(np.float32)) 155 263 provenance.extend( 156 264 { ··· 159 267 "segment_key": segment_key, 160 268 "source": source, 161 269 "sentence_id": int(sid), 270 270 + "duration_s": ( 271 271 + float(durations_data[idx]) 272 272 + if durations_data is not None 273 273 + else fallback_durations.get(int(sid)) 274 274 + ), 162 275 } 163 163 - for sid in statement_ids 276 276 + for idx, sid in enumerate(statement_ids) 164 277 ) 165 278 166 279 if not embedding_chunks: ··· 214 327 } 215 328 216 329 cluster_embeddings = embeddings_matrix[cluster_indices] 330 330 + cluster_size = int(len(cluster_indices)) 331 331 + embeddings_count = int(embeddings_matrix.shape[0]) 332 332 + 333 333 + if 
cluster_size < OWNER_BOOTSTRAP_MIN_STMTS: 334 334 + return _bail_low_quality( 335 335 + "too_few_stmts", 336 336 + observed=cluster_size, 337 337 + threshold=OWNER_BOOTSTRAP_MIN_STMTS, 338 338 + segment_count=segment_count, 339 339 + embeddings_count=embeddings_count, 340 340 + ) 341 341 + 342 342 + cluster_durations = [ 343 343 + provenance[int(i)]["duration_s"] 344 344 + for i in cluster_indices 345 345 + if provenance[int(i)].get("duration_s") is not None 346 346 + ] 347 347 + if not cluster_durations: 348 348 + median_duration = 0.0 349 349 + else: 350 350 + median_duration = float(np.median(cluster_durations)) 351 351 + if median_duration < OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S: 352 352 + return _bail_low_quality( 353 353 + "median_duration_too_short", 354 354 + observed=median_duration, 355 355 + threshold=OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S, 356 356 + segment_count=segment_count, 357 357 + embeddings_count=embeddings_count, 358 358 + ) 359 359 + 360 360 + intra_cosines = _pairwise_cosines(cluster_embeddings) 361 361 + if intra_cosines.size == 0: 362 362 + intra_p25 = 0.0 363 363 + else: 364 364 + intra_p25 = float(np.percentile(intra_cosines, 25)) 365 365 + if intra_p25 < OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25: 366 366 + return _bail_low_quality( 367 367 + "cluster_too_diffuse", 368 368 + observed=intra_p25, 369 369 + threshold=OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25, 370 370 + segment_count=segment_count, 371 371 + embeddings_count=embeddings_count, 372 372 + ) 373 373 + 217 374 centroid = normalize_embedding(np.mean(cluster_embeddings, axis=0)) 218 375 if centroid is None: 219 376 _mark_no_cluster(segment_count) 220 377 return { 221 378 "status": "no_clusters", 222 379 "segments_available": segment_count, 223 223 - "embeddings_available": int(len(embeddings_matrix)), 380 380 + "embeddings_available": embeddings_count, 224 381 "recommendation": "no_clusters", 225 382 } 226 383 227 227 - cluster_size = int(len(cluster_indices)) 228 384 cluster_streams = 
{provenance[int(i)]["stream"] for i in cluster_indices} 229 385 streams_represented = len(cluster_streams) 230 386 recommendation = "ready" if streams_represented > 1 else "single_stream" ··· 347 503 if emb_data is None: 348 504 return [] 349 505 350 350 - embeddings, statement_ids = emb_data 506 506 + embeddings, statement_ids, _ = emb_data 351 507 results = [] 352 508 for embedding, statement_id in zip(embeddings, statement_ids): 353 509 normalized = normalize_embedding(embedding)

+21 -8

apps/speakers/routes.py

reviewed

··· 85 85 return t.hour * 3600 + t.minute * 60 + t.second 86 86 87 87 88 88 - def _load_embeddings_file(npz_path: Path) -> tuple[np.ndarray, np.ndarray] | None: 89 89 - """Load embeddings and statement_ids from NPZ file. 88 88 + def _load_embeddings_file( 89 89 + npz_path: Path, 90 90 + ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None] | None: 91 91 + """Load embeddings, statement_ids, and optional durations from NPZ file. 90 92 91 91 - Returns tuple of (embeddings, statement_ids) or None if file is invalid. 93 93 + Returns tuple of (embeddings, statement_ids, durations_s) or None if file is invalid. 92 94 """ 93 95 if not npz_path.exists(): 94 96 return None ··· 97 99 data = np.load(npz_path) 98 100 embeddings = data.get("embeddings") 99 101 statement_ids = data.get("statement_ids") 102 102 + durations_s = data.get("durations_s") 100 103 101 104 if embeddings is None or statement_ids is None: 102 105 return None 103 106 104 104 - return embeddings, statement_ids 107 107 + return embeddings, statement_ids, durations_s 105 108 except Exception as e: 106 109 logger.warning("Failed to load embeddings %s: %s", npz_path, e) 107 110 return None ··· 418 421 419 422 def _load_sentences( 420 423 day: str, segment_key: str, source: str, stream: str | None = None 421 421 - ) -> tuple[list[dict], tuple[np.ndarray, np.ndarray] | None]: 424 424 + ) -> tuple[list[dict], tuple[np.ndarray, np.ndarray, np.ndarray | None] | None]: 422 425 """Load transcript sentences and their embeddings for an audio source. 
423 426 424 427 Args: ··· 430 433 Returns: 431 434 Tuple of (sentences, emb_data): 432 435 - sentences: List of dicts with id, offset, text, has_embedding 433 433 - - emb_data: Tuple of (embeddings, statement_ids) or None if no embeddings 436 436 + - emb_data: Tuple of (embeddings, statement_ids, durations_s) or None if no embeddings 434 437 """ 435 438 if stream: 436 439 segment_dir = get_segment_path(day, segment_key, stream, create=False) ··· 478 481 emb_data = _load_embeddings_file(npz_path) 479 482 480 483 if emb_data is not None: 481 481 - embeddings, statement_ids = emb_data 484 484 + embeddings, statement_ids, _ = emb_data 482 485 emb_map = {int(sid): True for sid in statement_ids} 483 486 484 487 # Mark which sentences have embeddings ··· 502 505 if emb_data is None: 503 506 return None 504 507 505 505 - embeddings, statement_ids = emb_data 508 508 + embeddings, statement_ids, _ = emb_data 506 509 507 510 # Find the embedding for this sentence 508 511 for i, sid in enumerate(statement_ids): ··· 1111 1114 "status": "candidate", 1112 1115 "cluster_size": voiceprint.get("cluster_size"), 1113 1116 "samples": voiceprint.get("samples", []), 1117 1117 + } 1118 1118 + ) 1119 1119 + 1120 1120 + if status == "low_quality": 1121 1121 + return jsonify( 1122 1122 + { 1123 1123 + "status": "low_quality", 1124 1124 + "low_quality_reason": voiceprint.get("low_quality_reason", ""), 1125 1125 + "observed_value": voiceprint.get("observed_value", 0.0), 1126 1126 + "threshold_value": voiceprint.get("threshold_value", 0.0), 1114 1127 } 1115 1128 ) 1116 1129

apps/speakers/status.py

reviewed

··· 84 84 result["detected_at"] = voiceprint.get("detected_at") 85 85 result["streams_represented"] = voiceprint.get("streams_represented") 86 86 result["recommendation"] = voiceprint.get("recommendation") 87 87 + elif status == "low_quality": 88 88 + result["low_quality_reason"] = voiceprint.get("low_quality_reason", "") 89 89 + result["observed_value"] = voiceprint.get("observed_value", 0.0) 90 90 + result["threshold_value"] = voiceprint.get("threshold_value", 0.0) 91 91 + result["segments_checked"] = voiceprint.get("segments_checked", 0) 92 92 + result["attempted_at"] = voiceprint.get("attempted_at", "") 87 93 elif status == "no_cluster": 88 94 result["segments_checked"] = voiceprint.get("segments_checked") 89 95 result["attempted_at"] = voiceprint.get("attempted_at")

apps/speakers/tests/conftest.py

reviewed

··· 76 76 flat_day.symlink_to(chronicle_day, target_is_directory=True) 77 77 flat_dir = flat_day / stream_name / segment_key 78 78 chronicle_dir = chronicle_day / stream_name / segment_key 79 79 + flat_dir.mkdir(parents=True, exist_ok=True) 79 80 chronicle_dir.mkdir(parents=True, exist_ok=True) 80 81 return flat_dir, chronicle_dir 81 82

+6 -35

apps/speakers/tests/test_attribution.py

reviewed

··· 43 43 source: str = "mic_audio", 44 44 ) -> Path: 45 45 """Write a segment with specific embeddings.""" 46 46 - seg_dir = env.journal / day / STREAM / segment_key 47 47 - seg_dir.mkdir(parents=True, exist_ok=True) 48 48 - 49 49 - # NPZ 50 50 - statement_ids = np.arange(1, len(embeddings) + 1, dtype=np.int32) 51 51 - np.savez_compressed( 52 52 - seg_dir / f"{source}.npz", 53 53 - embeddings=embeddings.astype(np.float32), 54 54 - statement_ids=statement_ids, 46 46 + return env.create_segment( 47 47 + day, 48 48 + segment_key, 49 49 + [source], 50 50 + stream=STREAM, 51 51 + embeddings=embeddings, 55 52 ) 56 56 - 57 57 - # JSONL transcript 58 58 - time_part = segment_key.split("_")[0] 59 59 - base_h, base_m, base_s = ( 60 60 - int(time_part[:2]), 61 61 - int(time_part[2:4]), 62 62 - int(time_part[4:6]), 63 63 - ) 64 64 - base_seconds = base_h * 3600 + base_m * 60 + base_s 65 65 - 66 66 - lines = [json.dumps({"raw": f"{source}.flac", "model": "medium.en"})] 67 67 - for idx in range(len(embeddings)): 68 68 - t = base_seconds + idx * 5 69 69 - h, m, s = t // 3600, (t % 3600) // 60, t % 60 70 70 - lines.append( 71 71 - json.dumps( 72 72 - { 73 73 - "start": f"{h:02d}:{m:02d}:{s:02d}", 74 74 - "text": f"Sentence {idx + 1} text.", 75 75 - } 76 76 - ) 77 77 - ) 78 78 - (seg_dir / f"{source}.jsonl").write_text("\n".join(lines) + "\n") 79 79 - (seg_dir / f"{source}.flac").write_bytes(b"") 80 80 - 81 81 - return seg_dir 82 53 83 54 84 55 # ---------------------------------------------------------------------------

+2 -1

apps/speakers/tests/test_discovery.py

reviewed

··· 17 17 discover_unknown_speakers, 18 18 identify_cluster, 19 19 ) 20 20 + from apps.speakers.owner import OWNER_THRESHOLD 20 21 21 22 _runner = CliRunner() 22 23 ··· 61 62 entity_dir / "owner_centroid.npz", 62 63 centroid=centroid, 63 64 cluster_size=np.array(100, dtype=np.int32), 64 64 - threshold=np.array(0.82, dtype=np.float32), 65 65 + threshold=np.array(OWNER_THRESHOLD, dtype=np.float32), 65 66 version=np.array("2026-01-01T00:00:00"), 66 67 ) 67 68 return centroid

+33

apps/speakers/tests/test_encoder_config.py

reviewed

··· 1 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 2 + # Copyright (c) 2026 sol pbc 3 3 + """Test the constant-import contract for encoder_config.""" 4 4 + 5 5 + from apps.speakers import attribution, encoder_config, owner 6 6 + 7 7 + 8 8 + def test_locked_constants(): 9 9 + assert encoder_config.ENCODER_ID == "wespeaker-resnet34-256" 10 10 + assert encoder_config.OWNER_THRESHOLD == 0.43 11 11 + assert encoder_config.ACOUSTIC_HIGH == 0.36 12 12 + assert encoder_config.ACOUSTIC_MEDIUM == 0.22 13 13 + assert encoder_config.OWNER_BOOTSTRAP_MIN_STMTS == 30 14 14 + assert encoder_config.OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S == 1.5 15 15 + assert encoder_config.OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25 == 0.30 16 16 + 17 17 + 18 18 + def test_attribution_imports_acoustic_constants(): 19 19 + assert attribution.ACOUSTIC_HIGH is encoder_config.ACOUSTIC_HIGH 20 20 + assert attribution.ACOUSTIC_MEDIUM is encoder_config.ACOUSTIC_MEDIUM 21 21 + 22 22 + 23 23 + def test_owner_imports_constants(): 24 24 + assert owner.OWNER_THRESHOLD is encoder_config.OWNER_THRESHOLD 25 25 + assert owner.OWNER_BOOTSTRAP_MIN_STMTS is encoder_config.OWNER_BOOTSTRAP_MIN_STMTS 26 26 + assert ( 27 27 + owner.OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S 28 28 + is encoder_config.OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S 29 29 + ) 30 30 + assert ( 31 31 + owner.OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25 32 32 + is encoder_config.OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25 33 33 + )

+179 -9

apps/speakers/tests/test_owner.py

reviewed

··· 25 25 segment_key: str, 26 26 source: str, 27 27 embeddings: np.ndarray, 28 28 + *, 29 29 + durations_s: np.ndarray | None = None, 28 30 ) -> Path: 29 29 - segment_dir = journal / day / stream / segment_key 31 31 + chronicle_day = journal / "chronicle" / day 32 32 + chronicle_day.mkdir(parents=True, exist_ok=True) 33 33 + flat_day = journal / day 34 34 + if not flat_day.exists(): 35 35 + flat_day.symlink_to(chronicle_day, target_is_directory=True) 36 36 + segment_dir = chronicle_day / stream / segment_key 30 37 segment_dir.mkdir(parents=True, exist_ok=True) 31 38 32 39 statement_ids = np.arange(1, len(embeddings) + 1, dtype=np.int32) 33 33 - np.savez_compressed( 34 34 - segment_dir / f"{source}.npz", 35 35 - embeddings=np.asarray(embeddings, dtype=np.float32), 36 36 - statement_ids=statement_ids, 37 37 - ) 40 40 + npz_kwargs = { 41 41 + "embeddings": np.asarray(embeddings, dtype=np.float32), 42 42 + "statement_ids": statement_ids, 43 43 + } 44 44 + if durations_s is not None: 45 45 + npz_kwargs["durations_s"] = np.asarray(durations_s, dtype=np.float32) 46 46 + np.savez_compressed(segment_dir / f"{source}.npz", **npz_kwargs) 38 47 39 48 time_part = segment_key.split("_")[0] 40 49 base_h = int(time_part[0:2]) ··· 183 192 assert get_current()["voiceprint"]["status"] == "candidate" 184 193 185 194 195 195 + def test_low_quality_too_few_stmts(speakers_env, monkeypatch): 196 196 + import apps.speakers.owner as owner_module 197 197 + from apps.speakers.owner import detect_owner_candidate 198 198 + 199 199 + def stub_hdbscan(labels: np.ndarray): 200 200 + class StubHDBSCAN: 201 201 + def __init__(self, **kwargs): 202 202 + self.labels_ = np.asarray(labels, dtype=np.int32) 203 203 + 204 204 + def fit(self, embeddings: np.ndarray): 205 205 + assert embeddings.shape[0] == len(self.labels_) 206 206 + return self 207 207 + 208 208 + return StubHDBSCAN 209 209 + 210 210 + env = speakers_env() 211 211 + rng = np.random.default_rng(0) 212 212 + embeddings = 
_owner_embeddings(60, rng) 213 213 + _write_segment( 214 214 + env.journal, 215 215 + "20240101", 216 216 + "mic", 217 217 + "090000_300", 218 218 + "audio", 219 219 + embeddings, 220 220 + durations_s=np.full(60, 2.0, dtype=np.float32), 221 221 + ) 222 222 + 223 223 + labels = np.concatenate( 224 224 + [ 225 225 + np.zeros(29, dtype=np.int32), 226 226 + np.full(31, -1, dtype=np.int32), 227 227 + ] 228 228 + ) 229 229 + monkeypatch.setattr(owner_module, "HDBSCAN", stub_hdbscan(labels)) 230 230 + 231 231 + result = detect_owner_candidate() 232 232 + 233 233 + assert result["status"] == "low_quality" 234 234 + assert result["recommendation"] == "low_quality" 235 235 + assert result["low_quality_reason"] == "too_few_stmts" 236 236 + assert get_current()["voiceprint"]["status"] == "low_quality" 237 237 + assert not _candidate_path(env.journal).exists() 238 238 + 239 239 + 240 240 + def test_low_quality_median_duration_too_short(speakers_env, monkeypatch): 241 241 + import apps.speakers.owner as owner_module 242 242 + from apps.speakers.owner import detect_owner_candidate 243 243 + 244 244 + def stub_hdbscan(labels: np.ndarray): 245 245 + class StubHDBSCAN: 246 246 + def __init__(self, **kwargs): 247 247 + self.labels_ = np.asarray(labels, dtype=np.int32) 248 248 + 249 249 + def fit(self, embeddings: np.ndarray): 250 250 + assert embeddings.shape[0] == len(self.labels_) 251 251 + return self 252 252 + 253 253 + return StubHDBSCAN 254 254 + 255 255 + env = speakers_env() 256 256 + rng = np.random.default_rng(1) 257 257 + embeddings = _owner_embeddings(60, rng) 258 258 + _write_segment( 259 259 + env.journal, 260 260 + "20240101", 261 261 + "mic", 262 262 + "090000_300", 263 263 + "audio", 264 264 + embeddings, 265 265 + durations_s=np.full(60, 1.0, dtype=np.float32), 266 266 + ) 267 267 + 268 268 + monkeypatch.setattr( 269 269 + owner_module, "HDBSCAN", stub_hdbscan(np.zeros(60, dtype=np.int32)) 270 270 + ) 271 271 + 272 272 + result = detect_owner_candidate() 273 273 + 
274 274 + assert result["status"] == "low_quality" 275 275 + assert result["recommendation"] == "low_quality" 276 276 + assert result["low_quality_reason"] == "median_duration_too_short" 277 277 + assert result["observed_value"] < 1.5 278 278 + assert get_current()["voiceprint"]["status"] == "low_quality" 279 279 + assert not _candidate_path(env.journal).exists() 280 280 + 281 281 + 282 282 + def test_low_quality_cluster_too_diffuse(speakers_env, monkeypatch): 283 283 + import apps.speakers.owner as owner_module 284 284 + from apps.speakers.owner import detect_owner_candidate 285 285 + 286 286 + def stub_hdbscan(labels: np.ndarray): 287 287 + class StubHDBSCAN: 288 288 + def __init__(self, **kwargs): 289 289 + self.labels_ = np.asarray(labels, dtype=np.int32) 290 290 + 291 291 + def fit(self, embeddings: np.ndarray): 292 292 + assert embeddings.shape[0] == len(self.labels_) 293 293 + return self 294 294 + 295 295 + return StubHDBSCAN 296 296 + 297 297 + env = speakers_env() 298 298 + rng = np.random.default_rng(0) 299 299 + template = np.ones((60, 256), dtype=np.float32) 300 300 + embeddings = template + rng.normal(scale=2.0, size=(60, 256)).astype(np.float32) 301 301 + _write_segment( 302 302 + env.journal, 303 303 + "20240101", 304 304 + "mic", 305 305 + "090000_300", 306 306 + "audio", 307 307 + embeddings, 308 308 + durations_s=np.full(60, 1.6, dtype=np.float32), 309 309 + ) 310 310 + 311 311 + monkeypatch.setattr( 312 312 + owner_module, "HDBSCAN", stub_hdbscan(np.zeros(60, dtype=np.int32)) 313 313 + ) 314 314 + 315 315 + result = detect_owner_candidate() 316 316 + 317 317 + assert result["status"] == "low_quality" 318 318 + assert result["recommendation"] == "low_quality" 319 319 + assert result["low_quality_reason"] == "cluster_too_diffuse" 320 320 + assert result["observed_value"] < 0.30 321 321 + assert get_current()["voiceprint"]["status"] == "low_quality" 322 322 + assert not _candidate_path(env.journal).exists() 323 323 + 324 324 + 186 325 def 
test_load_owner_centroid_no_principal(speakers_env): 187 326 from apps.speakers.owner import load_owner_centroid 188 327 ··· 321 460 assert response.get_json()["status"] == "candidate" 322 461 323 462 463 463 + def test_api_owner_status_low_quality(speakers_env): 464 464 + from apps.speakers.routes import speakers_bp 465 465 + 466 466 + speakers_env() 467 467 + update_state( 468 468 + "voiceprint", 469 469 + { 470 470 + "status": "low_quality", 471 471 + "low_quality_reason": "too_few_stmts", 472 472 + "observed_value": 5, 473 473 + "threshold_value": 30, 474 474 + "segments_checked": 1, 475 475 + "attempted_at": "2026-03-15T12:00:00", 476 476 + }, 477 477 + ) 478 478 + app = Flask(__name__) 479 479 + app.register_blueprint(speakers_bp) 480 480 + 481 481 + with app.test_client() as client: 482 482 + response = client.get("/app/speakers/api/owner/status") 483 483 + 484 484 + assert response.status_code == 200 485 485 + assert response.get_json() == { 486 486 + "status": "low_quality", 487 487 + "low_quality_reason": "too_few_stmts", 488 488 + "observed_value": 5, 489 489 + "threshold_value": 30, 490 490 + } 491 491 + 492 492 + 324 493 def test_api_owner_status_no_cluster(speakers_env): 325 494 from apps.speakers.routes import speakers_bp 326 495 ··· 375 544 376 545 377 546 def test_api_owner_confirm(speakers_env): 547 547 + from apps.speakers.owner import OWNER_THRESHOLD 378 548 from apps.speakers.routes import speakers_bp 379 549 380 550 env = speakers_env() ··· 386 556 candidate_path, 387 557 centroid=centroid, 388 558 cluster_size=np.array(88, dtype=np.int32), 389 389 - threshold=np.array(0.82, dtype=np.float32), 559 559 + threshold=np.array(OWNER_THRESHOLD, dtype=np.float32), 390 560 version=np.array("2026-03-15T12:00:00"), 391 561 ) 392 562 ··· 477 647 478 648 479 649 def test_confirm_owner_candidate_success(speakers_env): 480 480 - from apps.speakers.owner import confirm_owner_candidate 650 650 + from apps.speakers.owner import OWNER_THRESHOLD, 
confirm_owner_candidate 481 651 482 652 env = speakers_env() 483 653 principal_dir = env.create_entity("Self Person", is_principal=True) ··· 488 658 candidate_path, 489 659 centroid=centroid, 490 660 cluster_size=np.array(88, dtype=np.int32), 491 491 - threshold=np.array(0.82, dtype=np.float32), 661 661 + threshold=np.array(OWNER_THRESHOLD, dtype=np.float32), 492 662 version=np.array("2026-03-19T12:00:00"), 493 663 ) 494 664

+31 -2

apps/speakers/tests/test_routes.py

reviewed

··· 121 121 assert sentences[0]["has_embedding"] is True 122 122 123 123 assert emb_data is not None 124 124 - embeddings, statement_ids = emb_data 124 124 + embeddings, statement_ids, durations_s = emb_data 125 125 assert embeddings.shape == (3, 256) 126 126 assert len(statement_ids) == 3 127 127 + assert durations_s is None 127 128 128 129 129 130 def test_load_sentences_no_transcript(speakers_env): ··· 293 294 result = _load_embeddings_file(npz_path) 294 295 295 296 assert result is not None 296 296 - embeddings, statement_ids = result 297 297 + embeddings, statement_ids, durations_s = result 297 298 assert embeddings.shape == (3, 256) 298 299 assert len(statement_ids) == 3 300 300 + assert durations_s is None 301 301 + 302 302 + 303 303 + def test_load_embeddings_file_with_durations(speakers_env): 304 304 + """Test loading embeddings from NPZ file with durations.""" 305 305 + from apps.speakers.routes import _load_embeddings_file 306 306 + 307 307 + env = speakers_env() 308 308 + embeddings = np.eye(3, 256, dtype=np.float32) 309 309 + statement_ids = np.arange(1, 4, dtype=np.int32) 310 310 + durations_s = np.array([1.6, 2.1, 2.8], dtype=np.float32) 311 311 + npz_path = env.journal / "20240101" / "test" / "143022_300" / "mic_audio.npz" 312 312 + npz_path.parent.mkdir(parents=True, exist_ok=True) 313 313 + np.savez_compressed( 314 314 + npz_path, 315 315 + embeddings=embeddings, 316 316 + statement_ids=statement_ids, 317 317 + durations_s=durations_s, 318 318 + ) 319 319 + 320 320 + result = _load_embeddings_file(npz_path) 321 321 + 322 322 + assert result is not None 323 323 + loaded_embeddings, loaded_ids, loaded_durations = result 324 324 + assert loaded_embeddings.shape == (3, 256) 325 325 + assert np.array_equal(loaded_ids, statement_ids) 326 326 + assert loaded_durations is not None 327 327 + assert np.allclose(loaded_durations, durations_s) 299 328 300 329 301 330 def test_load_embeddings_file_not_found():

+6 -1

apps/speakers/tests/test_suggest.py

reviewed

··· 18 18 19 19 20 20 def create_meetings_md(env, day: str, content: str) -> Path: 21 21 - meetings_path = env.journal / day / "talents" / "meetings.md" 21 21 + chronicle_day = env.journal / "chronicle" / day 22 22 + chronicle_day.mkdir(parents=True, exist_ok=True) 23 23 + flat_day = env.journal / day 24 24 + if not flat_day.exists(): 25 25 + flat_day.symlink_to(chronicle_day, target_is_directory=True) 26 26 + meetings_path = chronicle_day / "talents" / "meetings.md" 22 27 meetings_path.parent.mkdir(parents=True, exist_ok=True) 23 28 meetings_path.write_text(content, encoding="utf-8") 24 29 return meetings_path

+16

apps/speakers/workspace.html

reviewed

··· 1312 1312 hideOwnerBanner(); 1313 1313 return; 1314 1314 } 1315 1315 + if (data.status === 'low_quality') { 1316 1316 + hideDiscovery(); 1317 1317 + renderOwnerLowQuality(data); 1318 1318 + return; 1319 1319 + } 1315 1320 if (data.status === 'candidate') { 1316 1321 hideDiscovery(); 1317 1322 renderOwnerCandidate(data); ··· 1379 1384 }); 1380 1385 1381 1386 attachAudioErrorListeners(ownerBanner); 1387 1387 + } 1388 1388 + 1389 1389 + function renderOwnerLowQuality(data) { 1390 1390 + ownerBanner.style.display = 'block'; 1391 1391 + ownerBanner.innerHTML = ` 1392 1392 + <div class="spk-owner-panel"> 1393 1393 + <p class="spk-owner-low-quality-message"> 1394 1394 + Solstone is still learning your voice. Once it has at least 30 longer recordings of you speaking, it will offer a confirmation prompt. 1395 1395 + </p> 1396 1396 + </div> 1397 1397 + `; 1382 1398 } 1383 1399 1384 1400 function submitOwnerChoice(path) {

observe/transcribe/main.py

reviewed

··· 313 313 314 314 embeddings = [] 315 315 statement_ids = [] 316 316 + durations = [] 316 317 skipped = 0 317 318 318 319 for stmt in valid_statements: ··· 333 334 emb = session.run([output_name], {input_name: feats[None, :, :]})[0] 334 335 embeddings.append(emb[0].astype(np.float32)) 335 336 statement_ids.append(stmt["id"]) 337 337 + durations.append((end_sample - start_sample) / SAMPLE_RATE) 336 338 except Exception: 337 339 logging.exception( 338 340 "wespeaker embedding failed for statement %s", stmt["id"] ··· 354 356 return { 355 357 "embeddings": np.stack(embeddings, axis=0).astype(np.float32), 356 358 "statement_ids": np.asarray(statement_ids, dtype=np.int32), 359 359 + "durations_s": np.asarray(durations, dtype=np.float32), 357 360 "encoder": np.array(EMBEDDER_NAME), 358 361 } 359 362

+3 -2

think/entities/voiceprints.py

reviewed

··· 85 85 existing_emb = data["embeddings"] 86 86 existing_meta_strings = data["metadata"] 87 87 existing_meta_dicts = [json.loads(m) for m in existing_meta_strings] 88 88 - except (FileNotFoundError, ValueError, np.lib.npyio.NpzFile) as exc: 88 88 + except (FileNotFoundError, OSError, ValueError) as exc: 89 89 logger.warning( 90 90 "Failed to load existing voiceprints for %s from %s: %s. Starting fresh.", 91 91 entity_id, ··· 141 141 """Safely save a voiceprint NPZ with file locking and integrity check.""" 142 142 lock_path = npz_path.with_suffix(".lock") 143 143 tmp_path = npz_path.with_name(npz_path.stem + ".tmp.npz") 144 144 + metadata_json = np.asarray([json.dumps(item) for item in metadata], dtype=str) 144 145 145 146 npz_path.parent.mkdir(parents=True, exist_ok=True) 146 147 ··· 151 152 np.savez_compressed( 152 153 tmp_path, 153 154 embeddings=embeddings, 154 154 - metadata=metadata, 155 155 + metadata=metadata_json, 155 156 ) 156 157 if not tmp_path.exists(): 157 158 raise FileNotFoundError(