personal memory agent
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

apps/speakers: wespeaker thresholds + owner-bootstrap quality gates

Ship spec §1–§3 from cpo/specs/in-flight/speaker-attribution-wespeaker.md.
§4 (noisy-segment overlap guard) is deferred pending CPO spec revision.

- New apps/speakers/encoder_config.py with seven LOCKED constants
(ENCODER_ID + OWNER_THRESHOLD=0.43, ACOUSTIC_HIGH=0.36,
ACOUSTIC_MEDIUM=0.22, OWNER_BOOTSTRAP_MIN_STMTS=30,
OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S=1.5,
OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25=0.30).
- owner.py and attribution.py import constants from encoder_config.py;
resemblyzer-era literals removed.
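- _embed_statements persists durations_s in lockstep with embeddings
into the per-segment .npz; _load_embeddings_file returns a 3-tuple
(embeddings, statement_ids, durations_s), where durations_s is None
when the .npz has no durations_s array, and every caller is updated
(no compat shim).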
- detect_owner_candidate adds three post-cluster quality gates
(too_few_stmts, median_duration_too_short, cluster_too_diffuse) that
defer candidate lock-in by returning recommendation="low_quality" with
the LOCKED diagnostic shape (low_quality_reason, observed_value,
threshold_value); awareness state surfaces low_quality through
api_owner_status, _owner_section, and the workspace UI empty-state
copy.
- Tests cover the encoder_config import contract, all three gate
paths, the 3-tuple loader (with and without durations_s), and the
low_quality api_owner_status branch.

Pre-existing flaky-fixture and voiceprints.npz metadata-serialization
bugs surfaced by the speaker test gate are fixed minimally to keep
`make test-app APP=speakers` green.

Co-Authored-By: OpenAI Codex <codex@openai.com>

+493 -77
+6 -9
apps/speakers/attribution.py
··· 6 6 Runs per-segment after transcription and embedding. Operates in layers 7 7 from cheapest to most expensive: 8 8 9 - Layer 1: Owner separation (cosine similarity to owner centroid >= 0.82) 9 + Layer 1: Owner separation (cosine similarity to owner centroid passes Layer 1) 10 10 Layer 2: Structural heuristics (speaker count, setting field, screen.md, 11 11 meetings.md) — no LLM 12 12 Layer 3: Acoustic matching (voiceprint cosine similarity, same-stream ··· 28 28 29 29 import numpy as np 30 30 31 + from apps.speakers.encoder_config import ACOUSTIC_HIGH, ACOUSTIC_MEDIUM 31 32 from apps.speakers.owner import load_owner_centroid 32 33 from think.entities import find_matching_entity 33 34 from think.entities.journal import ( ··· 38 39 from think.utils import day_path, now_ms, segment_path 39 40 40 41 logger = logging.getLogger(__name__) 41 - 42 - # Acoustic matching thresholds (from spec) 43 - ACOUSTIC_HIGH = 0.70 44 - ACOUSTIC_MEDIUM = 0.50 45 42 46 43 47 44 def _routes_helpers(): ··· 235 232 if emb_data is None: 236 233 return {"labels": [], "unmatched": [], "source": source, "metadata": {}} 237 234 238 - embeddings, statement_ids = emb_data 235 + embeddings, statement_ids, _ = emb_data 239 236 if len(embeddings) == 0: 240 237 return {"labels": [], "unmatched": [], "source": source, "metadata": {}} 241 238 ··· 538 535 539 536 Eligibility: 540 537 - Layer 2 structural attributions (high confidence) 541 - - Layer 3 acoustic attributions with confidence "high" (>= 0.7) 538 + - Layer 3 acoustic attributions with confidence "high" 542 539 543 540 Guards: 544 541 - Owner contamination: never save embeddings with owner similarity 545 - >= OWNER_THRESHOLD to non-owner voiceprints 542 + above the owner threshold to non-owner voiceprints 546 543 - Idempotent: checks existing voiceprint keys before saving 547 544 548 545 Returns dict mapping entity_id -> number of new embeddings saved. 
··· 569 566 if emb_data is None: 570 567 return {} 571 568 572 - embeddings, statement_ids = emb_data 569 + embeddings, statement_ids, _ = emb_data 573 570 sid_to_idx = {int(s): i for i, s in enumerate(statement_ids)} 574 571 575 572 # Eligible methods for accumulation
+2 -2
apps/speakers/bootstrap.py
··· 155 155 if emb_data is None: 156 156 continue 157 157 158 - embeddings, statement_ids = emb_data 158 + embeddings, statement_ids, _ = emb_data 159 159 160 160 for embedding, sid in zip(embeddings, statement_ids): 161 161 sentence_id = int(sid) ··· 646 646 if emb_data is None: 647 647 continue 648 648 649 - embeddings, statement_ids = emb_data 649 + embeddings, statement_ids, _ = emb_data 650 650 651 651 for embedding, sid in zip(embeddings, statement_ids): 652 652 sentence_id = int(sid)
+2 -2
apps/speakers/discovery.py
··· 128 128 if emb_data is None: 129 129 continue 130 130 131 - embeddings, statement_ids = emb_data 131 + embeddings, statement_ids, _ = emb_data 132 132 if len(embeddings) == 0: 133 133 continue 134 134 ··· 390 390 if emb_data is None: 391 391 continue 392 392 393 - embeddings, statement_ids = emb_data 393 + embeddings, statement_ids, _ = emb_data 394 394 emb_vec = None 395 395 for emb, sid in zip(embeddings, statement_ids): 396 396 if int(sid) == sentence_id:
+14
apps/speakers/encoder_config.py
··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + """Encoder-specific calibration constants. Source of truth for owner-separation, acoustic-match, and owner-bootstrap quality thresholds. Changes require CPO spec revision; see cpo/specs/in-flight/speaker-attribution-wespeaker.md.""" 5 + 6 + ENCODER_ID: str = "wespeaker-resnet34-256" 7 + 8 + OWNER_THRESHOLD: float = 0.43 9 + ACOUSTIC_HIGH: float = 0.36 10 + ACOUSTIC_MEDIUM: float = 0.22 11 + 12 + OWNER_BOOTSTRAP_MIN_STMTS: int = 30 13 + OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S: float = 1.5 14 + OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25: float = 0.30
+162 -6
apps/speakers/owner.py
··· 5 5 6 6 from __future__ import annotations 7 7 8 + import json 8 9 import logging 9 10 from datetime import datetime 10 11 from pathlib import Path ··· 13 14 import numpy as np 14 15 from sklearn.cluster import HDBSCAN 15 16 17 + from apps.speakers.encoder_config import ( 18 + OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25, 19 + OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S, 20 + OWNER_BOOTSTRAP_MIN_STMTS, 21 + OWNER_THRESHOLD, 22 + ) 16 23 from think.awareness import update_state 17 24 from think.entities.journal import get_journal_principal, journal_entity_memory_path 18 25 from think.utils import day_dirs, get_journal, segment_path ··· 20 27 logger = logging.getLogger(__name__) 21 28 22 29 MAX_EMBEDDINGS = 30000 23 - OWNER_THRESHOLD = 0.82 24 30 25 31 26 32 def _mark_no_cluster(segment_count: int) -> None: ··· 35 41 ) 36 42 37 43 44 + def _mark_low_quality( 45 + reason: str, observed: float, threshold: float, segment_count: int 46 + ) -> None: 47 + """Record that detection found a cluster, but it failed quality gates.""" 48 + update_state( 49 + "voiceprint", 50 + { 51 + "status": "low_quality", 52 + "low_quality_reason": reason, 53 + "observed_value": float(observed), 54 + "threshold_value": float(threshold), 55 + "segments_checked": int(segment_count), 56 + "attempted_at": _iso_now(), 57 + }, 58 + ) 59 + 60 + 61 + def _bail_low_quality( 62 + reason: str, 63 + observed: float, 64 + threshold: float, 65 + segment_count: int, 66 + embeddings_count: int, 67 + ) -> dict[str, Any]: 68 + """Record and return a locked low-quality owner detection result.""" 69 + _mark_low_quality(reason, observed, threshold, segment_count) 70 + return { 71 + "status": "low_quality", 72 + "recommendation": "low_quality", 73 + "segments_available": int(segment_count), 74 + "embeddings_available": int(embeddings_count), 75 + "low_quality_reason": reason, 76 + "observed_value": float(observed), 77 + "threshold_value": float(threshold), 78 + } 79 + 80 + 81 + def _pairwise_cosines(embeddings: np.ndarray) 
-> np.ndarray: 82 + """Return pairwise cosine similarities for a cluster of embeddings.""" 83 + n = embeddings.shape[0] 84 + if n < 2: 85 + return np.empty(0, dtype=np.float32) 86 + norms = np.linalg.norm(embeddings, axis=1, keepdims=True) 87 + norms = np.where(norms < 1e-9, 1.0, norms) 88 + e_norm = embeddings / norms 89 + if n > 5000: 90 + rng = np.random.default_rng(seed=0) 91 + i = rng.integers(0, n, size=1000) 92 + j = rng.integers(0, n, size=1000) 93 + mask = i != j 94 + i = i[mask] 95 + j = j[mask] 96 + return np.einsum("ij,ij->i", e_norm[i], e_norm[j]).astype( 97 + np.float32, copy=False 98 + ) 99 + sim = e_norm @ e_norm.T 100 + iu = np.triu_indices(n, k=1) 101 + return sim[iu].astype(np.float32, copy=False) 102 + 103 + 38 104 def _routes_helpers(): 39 105 """Load speakers route helpers lazily to avoid import cycles.""" 40 106 from apps.speakers.routes import ( ··· 63 129 return f"/app/speakers/api/serve_audio/{day}/{stream}/{segment_key}/{source}.flac" 64 130 65 131 132 + def _fallback_statement_durations(jsonl_path: Path) -> dict[int, float | None]: 133 + """Estimate statement durations from adjacent transcript start times.""" 134 + if not jsonl_path.exists(): 135 + return {} 136 + 137 + starts: list[tuple[int, int]] = [] 138 + try: 139 + with open(jsonl_path, encoding="utf-8") as f: 140 + lines = f.readlines() 141 + except OSError: 142 + return {} 143 + 144 + for sentence_id, line in enumerate(lines[1:], start=1): 145 + try: 146 + entry = json.loads(line) 147 + except json.JSONDecodeError: 148 + continue 149 + start = entry.get("start") 150 + if not isinstance(start, str): 151 + continue 152 + try: 153 + hours, minutes, seconds = (int(part) for part in start.split(":", 2)) 154 + except ValueError: 155 + continue 156 + starts.append((sentence_id, hours * 3600 + minutes * 60 + seconds)) 157 + 158 + durations: dict[int, float | None] = {} 159 + for idx, (sentence_id, start_seconds) in enumerate(starts): 160 + next_start = starts[idx + 1][1] if idx + 1 < 
len(starts) else None 161 + # Why: older transcript JSONL files only persist statement starts, so 162 + # we estimate legacy durations from adjacent sentence boundaries. 163 + durations[sentence_id] = ( 164 + None if next_start is None else float(next_start - start_seconds) 165 + ) 166 + return durations 167 + 168 + 66 169 def count_segments_with_embeddings() -> int: 67 170 """Count all journal segments that contain audio embedding files.""" 68 171 _, _, scan_segment_embeddings = _routes_helpers() ··· 147 250 if emb_data is None: 148 251 continue 149 252 150 - embeddings, statement_ids = emb_data 253 + embeddings, statement_ids, durations_data = emb_data 151 254 if len(embeddings) == 0: 152 255 continue 153 256 257 + fallback_durations = ( 258 + {} 259 + if durations_data is not None 260 + else _fallback_statement_durations(segment_dir / f"{source}.jsonl") 261 + ) 154 262 embedding_chunks.append(embeddings.astype(np.float32)) 155 263 provenance.extend( 156 264 { ··· 159 267 "segment_key": segment_key, 160 268 "source": source, 161 269 "sentence_id": int(sid), 270 + "duration_s": ( 271 + float(durations_data[idx]) 272 + if durations_data is not None 273 + else fallback_durations.get(int(sid)) 274 + ), 162 275 } 163 - for sid in statement_ids 276 + for idx, sid in enumerate(statement_ids) 164 277 ) 165 278 166 279 if not embedding_chunks: ··· 214 327 } 215 328 216 329 cluster_embeddings = embeddings_matrix[cluster_indices] 330 + cluster_size = int(len(cluster_indices)) 331 + embeddings_count = int(embeddings_matrix.shape[0]) 332 + 333 + if cluster_size < OWNER_BOOTSTRAP_MIN_STMTS: 334 + return _bail_low_quality( 335 + "too_few_stmts", 336 + observed=cluster_size, 337 + threshold=OWNER_BOOTSTRAP_MIN_STMTS, 338 + segment_count=segment_count, 339 + embeddings_count=embeddings_count, 340 + ) 341 + 342 + cluster_durations = [ 343 + provenance[int(i)]["duration_s"] 344 + for i in cluster_indices 345 + if provenance[int(i)].get("duration_s") is not None 346 + ] 347 + if not 
cluster_durations: 348 + median_duration = 0.0 349 + else: 350 + median_duration = float(np.median(cluster_durations)) 351 + if median_duration < OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S: 352 + return _bail_low_quality( 353 + "median_duration_too_short", 354 + observed=median_duration, 355 + threshold=OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S, 356 + segment_count=segment_count, 357 + embeddings_count=embeddings_count, 358 + ) 359 + 360 + intra_cosines = _pairwise_cosines(cluster_embeddings) 361 + if intra_cosines.size == 0: 362 + intra_p25 = 0.0 363 + else: 364 + intra_p25 = float(np.percentile(intra_cosines, 25)) 365 + if intra_p25 < OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25: 366 + return _bail_low_quality( 367 + "cluster_too_diffuse", 368 + observed=intra_p25, 369 + threshold=OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25, 370 + segment_count=segment_count, 371 + embeddings_count=embeddings_count, 372 + ) 373 + 217 374 centroid = normalize_embedding(np.mean(cluster_embeddings, axis=0)) 218 375 if centroid is None: 219 376 _mark_no_cluster(segment_count) 220 377 return { 221 378 "status": "no_clusters", 222 379 "segments_available": segment_count, 223 - "embeddings_available": int(len(embeddings_matrix)), 380 + "embeddings_available": embeddings_count, 224 381 "recommendation": "no_clusters", 225 382 } 226 383 227 - cluster_size = int(len(cluster_indices)) 228 384 cluster_streams = {provenance[int(i)]["stream"] for i in cluster_indices} 229 385 streams_represented = len(cluster_streams) 230 386 recommendation = "ready" if streams_represented > 1 else "single_stream" ··· 347 503 if emb_data is None: 348 504 return [] 349 505 350 - embeddings, statement_ids = emb_data 506 + embeddings, statement_ids, _ = emb_data 351 507 results = [] 352 508 for embedding, statement_id in zip(embeddings, statement_ids): 353 509 normalized = normalize_embedding(embedding)
+21 -8
apps/speakers/routes.py
··· 85 85 return t.hour * 3600 + t.minute * 60 + t.second 86 86 87 87 88 - def _load_embeddings_file(npz_path: Path) -> tuple[np.ndarray, np.ndarray] | None: 89 - """Load embeddings and statement_ids from NPZ file. 88 + def _load_embeddings_file( 89 + npz_path: Path, 90 + ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None] | None: 91 + """Load embeddings, statement_ids, and optional durations from NPZ file. 90 92 91 - Returns tuple of (embeddings, statement_ids) or None if file is invalid. 93 + Returns tuple of (embeddings, statement_ids, durations_s) or None if file is invalid. 92 94 """ 93 95 if not npz_path.exists(): 94 96 return None ··· 97 99 data = np.load(npz_path) 98 100 embeddings = data.get("embeddings") 99 101 statement_ids = data.get("statement_ids") 102 + durations_s = data.get("durations_s") 100 103 101 104 if embeddings is None or statement_ids is None: 102 105 return None 103 106 104 - return embeddings, statement_ids 107 + return embeddings, statement_ids, durations_s 105 108 except Exception as e: 106 109 logger.warning("Failed to load embeddings %s: %s", npz_path, e) 107 110 return None ··· 418 421 419 422 def _load_sentences( 420 423 day: str, segment_key: str, source: str, stream: str | None = None 421 - ) -> tuple[list[dict], tuple[np.ndarray, np.ndarray] | None]: 424 + ) -> tuple[list[dict], tuple[np.ndarray, np.ndarray, np.ndarray | None] | None]: 422 425 """Load transcript sentences and their embeddings for an audio source. 
423 426 424 427 Args: ··· 430 433 Returns: 431 434 Tuple of (sentences, emb_data): 432 435 - sentences: List of dicts with id, offset, text, has_embedding 433 - - emb_data: Tuple of (embeddings, statement_ids) or None if no embeddings 436 + - emb_data: Tuple of (embeddings, statement_ids, durations_s) or None if no embeddings 434 437 """ 435 438 if stream: 436 439 segment_dir = get_segment_path(day, segment_key, stream, create=False) ··· 478 481 emb_data = _load_embeddings_file(npz_path) 479 482 480 483 if emb_data is not None: 481 - embeddings, statement_ids = emb_data 484 + embeddings, statement_ids, _ = emb_data 482 485 emb_map = {int(sid): True for sid in statement_ids} 483 486 484 487 # Mark which sentences have embeddings ··· 502 505 if emb_data is None: 503 506 return None 504 507 505 - embeddings, statement_ids = emb_data 508 + embeddings, statement_ids, _ = emb_data 506 509 507 510 # Find the embedding for this sentence 508 511 for i, sid in enumerate(statement_ids): ··· 1111 1114 "status": "candidate", 1112 1115 "cluster_size": voiceprint.get("cluster_size"), 1113 1116 "samples": voiceprint.get("samples", []), 1117 + } 1118 + ) 1119 + 1120 + if status == "low_quality": 1121 + return jsonify( 1122 + { 1123 + "status": "low_quality", 1124 + "low_quality_reason": voiceprint.get("low_quality_reason", ""), 1125 + "observed_value": voiceprint.get("observed_value", 0.0), 1126 + "threshold_value": voiceprint.get("threshold_value", 0.0), 1114 1127 } 1115 1128 ) 1116 1129
+6
apps/speakers/status.py
··· 84 84 result["detected_at"] = voiceprint.get("detected_at") 85 85 result["streams_represented"] = voiceprint.get("streams_represented") 86 86 result["recommendation"] = voiceprint.get("recommendation") 87 + elif status == "low_quality": 88 + result["low_quality_reason"] = voiceprint.get("low_quality_reason", "") 89 + result["observed_value"] = voiceprint.get("observed_value", 0.0) 90 + result["threshold_value"] = voiceprint.get("threshold_value", 0.0) 91 + result["segments_checked"] = voiceprint.get("segments_checked", 0) 92 + result["attempted_at"] = voiceprint.get("attempted_at", "") 87 93 elif status == "no_cluster": 88 94 result["segments_checked"] = voiceprint.get("segments_checked") 89 95 result["attempted_at"] = voiceprint.get("attempted_at")
+1
apps/speakers/tests/conftest.py
··· 76 76 flat_day.symlink_to(chronicle_day, target_is_directory=True) 77 77 flat_dir = flat_day / stream_name / segment_key 78 78 chronicle_dir = chronicle_day / stream_name / segment_key 79 + flat_dir.mkdir(parents=True, exist_ok=True) 79 80 chronicle_dir.mkdir(parents=True, exist_ok=True) 80 81 return flat_dir, chronicle_dir 81 82
+6 -35
apps/speakers/tests/test_attribution.py
··· 43 43 source: str = "mic_audio", 44 44 ) -> Path: 45 45 """Write a segment with specific embeddings.""" 46 - seg_dir = env.journal / day / STREAM / segment_key 47 - seg_dir.mkdir(parents=True, exist_ok=True) 48 - 49 - # NPZ 50 - statement_ids = np.arange(1, len(embeddings) + 1, dtype=np.int32) 51 - np.savez_compressed( 52 - seg_dir / f"{source}.npz", 53 - embeddings=embeddings.astype(np.float32), 54 - statement_ids=statement_ids, 46 + return env.create_segment( 47 + day, 48 + segment_key, 49 + [source], 50 + stream=STREAM, 51 + embeddings=embeddings, 55 52 ) 56 - 57 - # JSONL transcript 58 - time_part = segment_key.split("_")[0] 59 - base_h, base_m, base_s = ( 60 - int(time_part[:2]), 61 - int(time_part[2:4]), 62 - int(time_part[4:6]), 63 - ) 64 - base_seconds = base_h * 3600 + base_m * 60 + base_s 65 - 66 - lines = [json.dumps({"raw": f"{source}.flac", "model": "medium.en"})] 67 - for idx in range(len(embeddings)): 68 - t = base_seconds + idx * 5 69 - h, m, s = t // 3600, (t % 3600) // 60, t % 60 70 - lines.append( 71 - json.dumps( 72 - { 73 - "start": f"{h:02d}:{m:02d}:{s:02d}", 74 - "text": f"Sentence {idx + 1} text.", 75 - } 76 - ) 77 - ) 78 - (seg_dir / f"{source}.jsonl").write_text("\n".join(lines) + "\n") 79 - (seg_dir / f"{source}.flac").write_bytes(b"") 80 - 81 - return seg_dir 82 53 83 54 84 55 # ---------------------------------------------------------------------------
+2 -1
apps/speakers/tests/test_discovery.py
··· 17 17 discover_unknown_speakers, 18 18 identify_cluster, 19 19 ) 20 + from apps.speakers.owner import OWNER_THRESHOLD 20 21 21 22 _runner = CliRunner() 22 23 ··· 61 62 entity_dir / "owner_centroid.npz", 62 63 centroid=centroid, 63 64 cluster_size=np.array(100, dtype=np.int32), 64 - threshold=np.array(0.82, dtype=np.float32), 65 + threshold=np.array(OWNER_THRESHOLD, dtype=np.float32), 65 66 version=np.array("2026-01-01T00:00:00"), 66 67 ) 67 68 return centroid
+33
apps/speakers/tests/test_encoder_config.py
··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + """Test the constant-import contract for encoder_config.""" 4 + 5 + from apps.speakers import attribution, encoder_config, owner 6 + 7 + 8 + def test_locked_constants(): 9 + assert encoder_config.ENCODER_ID == "wespeaker-resnet34-256" 10 + assert encoder_config.OWNER_THRESHOLD == 0.43 11 + assert encoder_config.ACOUSTIC_HIGH == 0.36 12 + assert encoder_config.ACOUSTIC_MEDIUM == 0.22 13 + assert encoder_config.OWNER_BOOTSTRAP_MIN_STMTS == 30 14 + assert encoder_config.OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S == 1.5 15 + assert encoder_config.OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25 == 0.30 16 + 17 + 18 + def test_attribution_imports_acoustic_constants(): 19 + assert attribution.ACOUSTIC_HIGH is encoder_config.ACOUSTIC_HIGH 20 + assert attribution.ACOUSTIC_MEDIUM is encoder_config.ACOUSTIC_MEDIUM 21 + 22 + 23 + def test_owner_imports_constants(): 24 + assert owner.OWNER_THRESHOLD is encoder_config.OWNER_THRESHOLD 25 + assert owner.OWNER_BOOTSTRAP_MIN_STMTS is encoder_config.OWNER_BOOTSTRAP_MIN_STMTS 26 + assert ( 27 + owner.OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S 28 + is encoder_config.OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S 29 + ) 30 + assert ( 31 + owner.OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25 32 + is encoder_config.OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25 33 + )
+179 -9
apps/speakers/tests/test_owner.py
··· 25 25 segment_key: str, 26 26 source: str, 27 27 embeddings: np.ndarray, 28 + *, 29 + durations_s: np.ndarray | None = None, 28 30 ) -> Path: 29 - segment_dir = journal / day / stream / segment_key 31 + chronicle_day = journal / "chronicle" / day 32 + chronicle_day.mkdir(parents=True, exist_ok=True) 33 + flat_day = journal / day 34 + if not flat_day.exists(): 35 + flat_day.symlink_to(chronicle_day, target_is_directory=True) 36 + segment_dir = chronicle_day / stream / segment_key 30 37 segment_dir.mkdir(parents=True, exist_ok=True) 31 38 32 39 statement_ids = np.arange(1, len(embeddings) + 1, dtype=np.int32) 33 - np.savez_compressed( 34 - segment_dir / f"{source}.npz", 35 - embeddings=np.asarray(embeddings, dtype=np.float32), 36 - statement_ids=statement_ids, 37 - ) 40 + npz_kwargs = { 41 + "embeddings": np.asarray(embeddings, dtype=np.float32), 42 + "statement_ids": statement_ids, 43 + } 44 + if durations_s is not None: 45 + npz_kwargs["durations_s"] = np.asarray(durations_s, dtype=np.float32) 46 + np.savez_compressed(segment_dir / f"{source}.npz", **npz_kwargs) 38 47 39 48 time_part = segment_key.split("_")[0] 40 49 base_h = int(time_part[0:2]) ··· 183 192 assert get_current()["voiceprint"]["status"] == "candidate" 184 193 185 194 195 + def test_low_quality_too_few_stmts(speakers_env, monkeypatch): 196 + import apps.speakers.owner as owner_module 197 + from apps.speakers.owner import detect_owner_candidate 198 + 199 + def stub_hdbscan(labels: np.ndarray): 200 + class StubHDBSCAN: 201 + def __init__(self, **kwargs): 202 + self.labels_ = np.asarray(labels, dtype=np.int32) 203 + 204 + def fit(self, embeddings: np.ndarray): 205 + assert embeddings.shape[0] == len(self.labels_) 206 + return self 207 + 208 + return StubHDBSCAN 209 + 210 + env = speakers_env() 211 + rng = np.random.default_rng(0) 212 + embeddings = _owner_embeddings(60, rng) 213 + _write_segment( 214 + env.journal, 215 + "20240101", 216 + "mic", 217 + "090000_300", 218 + "audio", 219 + embeddings, 
220 + durations_s=np.full(60, 2.0, dtype=np.float32), 221 + ) 222 + 223 + labels = np.concatenate( 224 + [ 225 + np.zeros(29, dtype=np.int32), 226 + np.full(31, -1, dtype=np.int32), 227 + ] 228 + ) 229 + monkeypatch.setattr(owner_module, "HDBSCAN", stub_hdbscan(labels)) 230 + 231 + result = detect_owner_candidate() 232 + 233 + assert result["status"] == "low_quality" 234 + assert result["recommendation"] == "low_quality" 235 + assert result["low_quality_reason"] == "too_few_stmts" 236 + assert get_current()["voiceprint"]["status"] == "low_quality" 237 + assert not _candidate_path(env.journal).exists() 238 + 239 + 240 + def test_low_quality_median_duration_too_short(speakers_env, monkeypatch): 241 + import apps.speakers.owner as owner_module 242 + from apps.speakers.owner import detect_owner_candidate 243 + 244 + def stub_hdbscan(labels: np.ndarray): 245 + class StubHDBSCAN: 246 + def __init__(self, **kwargs): 247 + self.labels_ = np.asarray(labels, dtype=np.int32) 248 + 249 + def fit(self, embeddings: np.ndarray): 250 + assert embeddings.shape[0] == len(self.labels_) 251 + return self 252 + 253 + return StubHDBSCAN 254 + 255 + env = speakers_env() 256 + rng = np.random.default_rng(1) 257 + embeddings = _owner_embeddings(60, rng) 258 + _write_segment( 259 + env.journal, 260 + "20240101", 261 + "mic", 262 + "090000_300", 263 + "audio", 264 + embeddings, 265 + durations_s=np.full(60, 1.0, dtype=np.float32), 266 + ) 267 + 268 + monkeypatch.setattr( 269 + owner_module, "HDBSCAN", stub_hdbscan(np.zeros(60, dtype=np.int32)) 270 + ) 271 + 272 + result = detect_owner_candidate() 273 + 274 + assert result["status"] == "low_quality" 275 + assert result["recommendation"] == "low_quality" 276 + assert result["low_quality_reason"] == "median_duration_too_short" 277 + assert result["observed_value"] < 1.5 278 + assert get_current()["voiceprint"]["status"] == "low_quality" 279 + assert not _candidate_path(env.journal).exists() 280 + 281 + 282 + def 
test_low_quality_cluster_too_diffuse(speakers_env, monkeypatch): 283 + import apps.speakers.owner as owner_module 284 + from apps.speakers.owner import detect_owner_candidate 285 + 286 + def stub_hdbscan(labels: np.ndarray): 287 + class StubHDBSCAN: 288 + def __init__(self, **kwargs): 289 + self.labels_ = np.asarray(labels, dtype=np.int32) 290 + 291 + def fit(self, embeddings: np.ndarray): 292 + assert embeddings.shape[0] == len(self.labels_) 293 + return self 294 + 295 + return StubHDBSCAN 296 + 297 + env = speakers_env() 298 + rng = np.random.default_rng(0) 299 + template = np.ones((60, 256), dtype=np.float32) 300 + embeddings = template + rng.normal(scale=2.0, size=(60, 256)).astype(np.float32) 301 + _write_segment( 302 + env.journal, 303 + "20240101", 304 + "mic", 305 + "090000_300", 306 + "audio", 307 + embeddings, 308 + durations_s=np.full(60, 1.6, dtype=np.float32), 309 + ) 310 + 311 + monkeypatch.setattr( 312 + owner_module, "HDBSCAN", stub_hdbscan(np.zeros(60, dtype=np.int32)) 313 + ) 314 + 315 + result = detect_owner_candidate() 316 + 317 + assert result["status"] == "low_quality" 318 + assert result["recommendation"] == "low_quality" 319 + assert result["low_quality_reason"] == "cluster_too_diffuse" 320 + assert result["observed_value"] < 0.30 321 + assert get_current()["voiceprint"]["status"] == "low_quality" 322 + assert not _candidate_path(env.journal).exists() 323 + 324 + 186 325 def test_load_owner_centroid_no_principal(speakers_env): 187 326 from apps.speakers.owner import load_owner_centroid 188 327 ··· 321 460 assert response.get_json()["status"] == "candidate" 322 461 323 462 463 + def test_api_owner_status_low_quality(speakers_env): 464 + from apps.speakers.routes import speakers_bp 465 + 466 + speakers_env() 467 + update_state( 468 + "voiceprint", 469 + { 470 + "status": "low_quality", 471 + "low_quality_reason": "too_few_stmts", 472 + "observed_value": 5, 473 + "threshold_value": 30, 474 + "segments_checked": 1, 475 + "attempted_at": 
"2026-03-15T12:00:00", 476 + }, 477 + ) 478 + app = Flask(__name__) 479 + app.register_blueprint(speakers_bp) 480 + 481 + with app.test_client() as client: 482 + response = client.get("/app/speakers/api/owner/status") 483 + 484 + assert response.status_code == 200 485 + assert response.get_json() == { 486 + "status": "low_quality", 487 + "low_quality_reason": "too_few_stmts", 488 + "observed_value": 5, 489 + "threshold_value": 30, 490 + } 491 + 492 + 324 493 def test_api_owner_status_no_cluster(speakers_env): 325 494 from apps.speakers.routes import speakers_bp 326 495 ··· 375 544 376 545 377 546 def test_api_owner_confirm(speakers_env): 547 + from apps.speakers.owner import OWNER_THRESHOLD 378 548 from apps.speakers.routes import speakers_bp 379 549 380 550 env = speakers_env() ··· 386 556 candidate_path, 387 557 centroid=centroid, 388 558 cluster_size=np.array(88, dtype=np.int32), 389 - threshold=np.array(0.82, dtype=np.float32), 559 + threshold=np.array(OWNER_THRESHOLD, dtype=np.float32), 390 560 version=np.array("2026-03-15T12:00:00"), 391 561 ) 392 562 ··· 477 647 478 648 479 649 def test_confirm_owner_candidate_success(speakers_env): 480 - from apps.speakers.owner import confirm_owner_candidate 650 + from apps.speakers.owner import OWNER_THRESHOLD, confirm_owner_candidate 481 651 482 652 env = speakers_env() 483 653 principal_dir = env.create_entity("Self Person", is_principal=True) ··· 488 658 candidate_path, 489 659 centroid=centroid, 490 660 cluster_size=np.array(88, dtype=np.int32), 491 - threshold=np.array(0.82, dtype=np.float32), 661 + threshold=np.array(OWNER_THRESHOLD, dtype=np.float32), 492 662 version=np.array("2026-03-19T12:00:00"), 493 663 ) 494 664
+31 -2
apps/speakers/tests/test_routes.py
··· 121 121 assert sentences[0]["has_embedding"] is True 122 122 123 123 assert emb_data is not None 124 - embeddings, statement_ids = emb_data 124 + embeddings, statement_ids, durations_s = emb_data 125 125 assert embeddings.shape == (3, 256) 126 126 assert len(statement_ids) == 3 127 + assert durations_s is None 127 128 128 129 129 130 def test_load_sentences_no_transcript(speakers_env): ··· 293 294 result = _load_embeddings_file(npz_path) 294 295 295 296 assert result is not None 296 - embeddings, statement_ids = result 297 + embeddings, statement_ids, durations_s = result 297 298 assert embeddings.shape == (3, 256) 298 299 assert len(statement_ids) == 3 300 + assert durations_s is None 301 + 302 + 303 + def test_load_embeddings_file_with_durations(speakers_env): 304 + """Test loading embeddings from NPZ file with durations.""" 305 + from apps.speakers.routes import _load_embeddings_file 306 + 307 + env = speakers_env() 308 + embeddings = np.eye(3, 256, dtype=np.float32) 309 + statement_ids = np.arange(1, 4, dtype=np.int32) 310 + durations_s = np.array([1.6, 2.1, 2.8], dtype=np.float32) 311 + npz_path = env.journal / "20240101" / "test" / "143022_300" / "mic_audio.npz" 312 + npz_path.parent.mkdir(parents=True, exist_ok=True) 313 + np.savez_compressed( 314 + npz_path, 315 + embeddings=embeddings, 316 + statement_ids=statement_ids, 317 + durations_s=durations_s, 318 + ) 319 + 320 + result = _load_embeddings_file(npz_path) 321 + 322 + assert result is not None 323 + loaded_embeddings, loaded_ids, loaded_durations = result 324 + assert loaded_embeddings.shape == (3, 256) 325 + assert np.array_equal(loaded_ids, statement_ids) 326 + assert loaded_durations is not None 327 + assert np.allclose(loaded_durations, durations_s) 299 328 300 329 301 330 def test_load_embeddings_file_not_found():
+6 -1
apps/speakers/tests/test_suggest.py
··· 18 18 19 19 20 20 def create_meetings_md(env, day: str, content: str) -> Path: 21 - meetings_path = env.journal / day / "talents" / "meetings.md" 21 + chronicle_day = env.journal / "chronicle" / day 22 + chronicle_day.mkdir(parents=True, exist_ok=True) 23 + flat_day = env.journal / day 24 + if not flat_day.exists(): 25 + flat_day.symlink_to(chronicle_day, target_is_directory=True) 26 + meetings_path = chronicle_day / "talents" / "meetings.md" 22 27 meetings_path.parent.mkdir(parents=True, exist_ok=True) 23 28 meetings_path.write_text(content, encoding="utf-8") 24 29 return meetings_path
+16
apps/speakers/workspace.html
··· 1312 1312 hideOwnerBanner(); 1313 1313 return; 1314 1314 } 1315 + if (data.status === 'low_quality') { 1316 + hideDiscovery(); 1317 + renderOwnerLowQuality(data); 1318 + return; 1319 + } 1315 1320 if (data.status === 'candidate') { 1316 1321 hideDiscovery(); 1317 1322 renderOwnerCandidate(data); ··· 1379 1384 }); 1380 1385 1381 1386 attachAudioErrorListeners(ownerBanner); 1387 + } 1388 + 1389 + function renderOwnerLowQuality(data) { 1390 + ownerBanner.style.display = 'block'; 1391 + ownerBanner.innerHTML = ` 1392 + <div class="spk-owner-panel"> 1393 + <p class="spk-owner-low-quality-message"> 1394 + Solstone is still learning your voice. Once it has at least 30 longer recordings of you speaking, it will offer a confirmation prompt. 1395 + </p> 1396 + </div> 1397 + `; 1382 1398 } 1383 1399 1384 1400 function submitOwnerChoice(path) {
+3
observe/transcribe/main.py
··· 313 313 314 314 embeddings = [] 315 315 statement_ids = [] 316 + durations = [] 316 317 skipped = 0 317 318 318 319 for stmt in valid_statements: ··· 333 334 emb = session.run([output_name], {input_name: feats[None, :, :]})[0] 334 335 embeddings.append(emb[0].astype(np.float32)) 335 336 statement_ids.append(stmt["id"]) 337 + durations.append((end_sample - start_sample) / SAMPLE_RATE) 336 338 except Exception: 337 339 logging.exception( 338 340 "wespeaker embedding failed for statement %s", stmt["id"] ··· 354 356 return { 355 357 "embeddings": np.stack(embeddings, axis=0).astype(np.float32), 356 358 "statement_ids": np.asarray(statement_ids, dtype=np.int32), 359 + "durations_s": np.asarray(durations, dtype=np.float32), 357 360 "encoder": np.array(EMBEDDER_NAME), 358 361 } 359 362
+3 -2
think/entities/voiceprints.py
··· 85 85 existing_emb = data["embeddings"] 86 86 existing_meta_strings = data["metadata"] 87 87 existing_meta_dicts = [json.loads(m) for m in existing_meta_strings] 88 - except (FileNotFoundError, ValueError, np.lib.npyio.NpzFile) as exc: 88 + except (FileNotFoundError, OSError, ValueError) as exc: 89 89 logger.warning( 90 90 "Failed to load existing voiceprints for %s from %s: %s. Starting fresh.", 91 91 entity_id, ··· 141 141 """Safely save a voiceprint NPZ with file locking and integrity check.""" 142 142 lock_path = npz_path.with_suffix(".lock") 143 143 tmp_path = npz_path.with_name(npz_path.stem + ".tmp.npz") 144 + metadata_json = np.asarray([json.dumps(item) for item in metadata], dtype=str) 144 145 145 146 npz_path.parent.mkdir(parents=True, exist_ok=True) 146 147 ··· 151 152 np.savez_compressed( 152 153 tmp_path, 153 154 embeddings=embeddings, 154 - metadata=metadata, 155 + metadata=metadata_json, 155 156 ) 156 157 if not tmp_path.exists(): 157 158 raise FileNotFoundError(