personal memory agent
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

speaker-attribution: gate flywheel on per-segment acoustic overlap

Adds a pyannote-based speech-conditioned overlap fraction to the transcribe
pipeline and uses it to guard the voiceprint flywheel. Per-segment attribution
is unchanged; only long-term identity accumulation (accumulate_voiceprints
and the owner HDBSCAN bootstrap) skips segments with overlap > 0.10.

- Vendor pyannote-segmentation-3.0 ONNX (5.7 MB) under
observe/transcribe/assets/; Makefile .installed verifies its SHA-256.
- observe/transcribe/overlap.py runs sliding-window inference (10 s window,
5 s stride) and computes count(argmax in {4,5,6}) / count(argmax >= 1)
with explicit 0.0 for silent segments.
- process_audio writes overlap_fraction (4dp) and overlap_detector into the
chronicle JSONL header alongside the existing noisy_* fields.
- apps/speakers/_overlap.py reads the header (missing/invalid → 0.0 for
pre-overlap back-compat).
- accumulate_voiceprints short-circuits when overlap > 0.10; owner bootstrap
excludes those sources before subsampling and HDBSCAN.
- encoder_config.py is the canonical home for NOISY_FLYWHEEL_OVERLAP_MAX,
OVERLAP_DETECTOR_ID, and OVERLAP_DETECTOR_SHA256; transcribe and tests
import rather than duplicate the literals.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

+520 -3
+1 -1
Makefile
··· 54 54 echo "Python 3.14+ detected - installing onnxruntime from nightly feed..."; \ 55 55 $(UV) pip install --pre --no-deps --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ onnxruntime; \ 56 56 fi 57 - @$(VENV_BIN)/python -c "from observe.transcribe.main import WESPEAKER_MODEL_PATH, WESPEAKER_MODEL_SHA256; from observe.utils import compute_file_sha256; actual = compute_file_sha256(WESPEAKER_MODEL_PATH); assert actual == WESPEAKER_MODEL_SHA256, f'WeSpeaker asset hash mismatch: got {actual}, expected {WESPEAKER_MODEL_SHA256}'; print(f'wespeaker asset ok ({actual[:12]}...)')" 57 + @$(VENV_BIN)/python -c "from observe.transcribe.main import PYANNOTE_OVERLAP_MODEL_PATH, PYANNOTE_OVERLAP_MODEL_SHA256, WESPEAKER_MODEL_PATH, WESPEAKER_MODEL_SHA256; from observe.utils import compute_file_sha256; actual = compute_file_sha256(WESPEAKER_MODEL_PATH); assert actual == WESPEAKER_MODEL_SHA256, f'WeSpeaker asset hash mismatch: got {actual}, expected {WESPEAKER_MODEL_SHA256}'; print(f'wespeaker asset ok ({actual[:12]}...)'); actual = compute_file_sha256(PYANNOTE_OVERLAP_MODEL_PATH); assert actual == PYANNOTE_OVERLAP_MODEL_SHA256, f'pyannote asset hash mismatch: got {actual}, expected {PYANNOTE_OVERLAP_MODEL_SHA256}'; print(f'pyannote asset ok ({actual[:12]}...)')" 58 58 @echo "Installing Playwright browser for sol screenshot..." 59 59 $(VENV_BIN)/playwright install chromium 60 60 @$(MAKE) --no-print-directory skills
+33
apps/speakers/_overlap.py
# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Shared overlap-header readers for speaker attribution."""

from __future__ import annotations

import json
import logging
from pathlib import Path

logger = logging.getLogger(__name__)


def _read_segment_overlap_fraction(jsonl_path: Path) -> float:
    """Return ``overlap_fraction`` from a chronicle JSONL header.

    Only the first line of the file is read and parsed as JSON. The reader
    never raises for a missing, unreadable or malformed header: every such
    case yields ``0.0``.

    Args:
        jsonl_path: Path to the segment's ``<source>.jsonl`` file.

    Returns:
        The header's ``overlap_fraction`` converted to ``float``, or ``0.0``
        when the file is absent or empty, cannot be read or decoded, the first
        line is not a JSON object, the field is missing, or its value cannot
        be converted to a float.
    """
    try:
        with jsonl_path.open(encoding="utf-8") as f:
            line = f.readline()
        if not line:
            return 0.0
        header = json.loads(line)
    except FileNotFoundError:
        # A missing file is an expected case, so it is not logged.
        return 0.0
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and also UnicodeDecodeError,
        # which readline() raises when the file is not valid UTF-8.
        logger.info("overlap header read failed at %s: %s", jsonl_path, exc)
        return 0.0

    # A first line such as `[]`, `null` or `"text"` parses fine but has no
    # `.get`; treat it like any other invalid header.
    if not isinstance(header, dict):
        logger.info("overlap header is not a JSON object at %s", jsonl_path)
        return 0.0

    value = header.get("overlap_fraction", 0.0)
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: JSON integers too large to represent as a float.
        return 0.0
+19 -1
apps/speakers/attribution.py
··· 28 28 29 29 import numpy as np 30 30 31 - from apps.speakers.encoder_config import ACOUSTIC_HIGH, ACOUSTIC_MEDIUM 31 + from apps.speakers._overlap import _read_segment_overlap_fraction 32 + from apps.speakers.encoder_config import ( 33 + ACOUSTIC_HIGH, 34 + ACOUSTIC_MEDIUM, 35 + NOISY_FLYWHEEL_OVERLAP_MAX, 36 + ) 32 37 from apps.speakers.owner import load_owner_centroid 33 38 from think.entities import find_matching_entity 34 39 from think.entities.journal import ( ··· 562 567 owner_centroid, owner_threshold = centroid_data 563 568 564 569 seg_dir = segment_path(day, segment_key, stream, create=False) 570 + jsonl_path = seg_dir / f"{source}.jsonl" 571 + overlap_fraction = _read_segment_overlap_fraction(jsonl_path) 572 + if overlap_fraction > NOISY_FLYWHEEL_OVERLAP_MAX: 573 + logger.info( 574 + "flywheel skip: overlap=%.3f exceeds %.2f at %s/%s/%s", 575 + overlap_fraction, 576 + NOISY_FLYWHEEL_OVERLAP_MAX, 577 + day, 578 + segment_key, 579 + source, 580 + ) 581 + return {} 582 + 565 583 emb_data = load_embeddings_file(seg_dir / f"{source}.npz") 566 584 if emb_data is None: 567 585 return {}
+6
apps/speakers/encoder_config.py
··· 12 12 OWNER_BOOTSTRAP_MIN_STMTS: int = 30 13 13 OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S: float = 1.5 14 14 OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25: float = 0.30 15 + 16 + NOISY_FLYWHEEL_OVERLAP_MAX: float = 0.10 17 + OVERLAP_DETECTOR_ID: str = "pyannote-segmentation-3.0-onnx" 18 + OVERLAP_DETECTOR_SHA256: str = ( 19 + "057ee564753071c0b09b5b611648b50ac188d50846bff5f01e9f7bbf1591ea25" 20 + )
+18
apps/speakers/owner.py
··· 14 14 import numpy as np 15 15 from sklearn.cluster import HDBSCAN 16 16 17 + from apps.speakers._overlap import _read_segment_overlap_fraction 17 18 from apps.speakers.encoder_config import ( 19 + NOISY_FLYWHEEL_OVERLAP_MAX, 18 20 OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25, 19 21 OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S, 20 22 OWNER_BOOTSTRAP_MIN_STMTS, ··· 237 239 segment_count = count_segments_with_embeddings() 238 240 239 241 embedding_chunks: list[np.ndarray] = [] 242 + overlap_cache: dict[Path, float] = {} 240 243 provenance: list[dict[str, Any]] = [] 241 244 242 245 for day in day_dirs().keys(): ··· 246 249 segment_dir = segment_path(day, segment_key, stream) 247 250 248 251 for source in segment["sources"]: 252 + jsonl_path = segment_dir / f"{source}.jsonl" 253 + overlap = overlap_cache.setdefault( 254 + jsonl_path, _read_segment_overlap_fraction(jsonl_path) 255 + ) 256 + if overlap > NOISY_FLYWHEEL_OVERLAP_MAX: 257 + logger.info( 258 + "owner bootstrap skip: overlap=%.3f exceeds %.2f at %s/%s/%s", 259 + overlap, 260 + NOISY_FLYWHEEL_OVERLAP_MAX, 261 + day, 262 + segment_key, 263 + source, 264 + ) 265 + continue 266 + 249 267 emb_data = load_embeddings_file(segment_dir / f"{source}.npz") 250 268 if emb_data is None: 251 269 continue
+109
apps/speakers/tests/test_attribution.py
··· 10 10 11 11 import numpy as np 12 12 13 + from apps.speakers.encoder_config import OVERLAP_DETECTOR_ID 13 14 from apps.speakers.owner import OWNER_THRESHOLD 14 15 15 16 # Test stream name (matches conftest.STREAM) ··· 50 51 stream=STREAM, 51 52 embeddings=embeddings, 52 53 ) 54 + 55 + 56 + def _rewrite_segment_header(seg_dir: Path, source: str, **updates: object) -> None: 57 + jsonl_path = seg_dir / f"{source}.jsonl" 58 + lines = jsonl_path.read_text(encoding="utf-8").splitlines() 59 + header = json.loads(lines[0]) if lines else {} 60 + header.update(updates) 61 + lines[0] = json.dumps(header) 62 + jsonl_path.write_text("\n".join(lines) + "\n", encoding="utf-8") 53 63 54 64 55 65 # --------------------------------------------------------------------------- ··· 461 471 ) 462 472 463 473 assert saved == {} 474 + 475 + 476 + def test_accumulate_voiceprints_skips_chaotic_segment(speakers_env): 477 + from apps.speakers.attribution import accumulate_voiceprints 478 + 479 + env = speakers_env() 480 + _setup_owner(env) 481 + env.create_entity("Bob Smith") 482 + 483 + other_emb = _normalized([0.1, 0.99]) 484 + seg_dir = _write_controlled_segment( 485 + env, "20240101", "090000_300", np.vstack([other_emb]) 486 + ) 487 + _rewrite_segment_header( 488 + seg_dir, 489 + "mic_audio", 490 + overlap_fraction=0.20, 491 + overlap_detector=OVERLAP_DETECTOR_ID, 492 + ) 493 + 494 + labels = [ 495 + { 496 + "sentence_id": 1, 497 + "speaker": "bob_smith", 498 + "confidence": "high", 499 + "method": "structural_single_speaker", 500 + } 501 + ] 502 + 503 + saved = accumulate_voiceprints( 504 + "20240101", STREAM, "090000_300", labels, "mic_audio" 505 + ) 506 + 507 + assert saved == {} 508 + vp_path = env.journal / "entities" / "bob_smith" / "voiceprints.npz" 509 + assert not vp_path.exists() 510 + 511 + 512 + def test_accumulate_voiceprints_admits_clean_segment(speakers_env): 513 + from apps.speakers.attribution import accumulate_voiceprints 514 + 515 + env = speakers_env() 516 + 
_setup_owner(env) 517 + env.create_entity("Bob Smith") 518 + 519 + other_emb = _normalized([0.1, 0.99]) 520 + seg_dir = _write_controlled_segment( 521 + env, "20240101", "090000_300", np.vstack([other_emb]) 522 + ) 523 + _rewrite_segment_header( 524 + seg_dir, 525 + "mic_audio", 526 + overlap_fraction=0.05, 527 + overlap_detector=OVERLAP_DETECTOR_ID, 528 + ) 529 + 530 + labels = [ 531 + { 532 + "sentence_id": 1, 533 + "speaker": "bob_smith", 534 + "confidence": "high", 535 + "method": "structural_single_speaker", 536 + } 537 + ] 538 + 539 + saved = accumulate_voiceprints( 540 + "20240101", STREAM, "090000_300", labels, "mic_audio" 541 + ) 542 + 543 + assert saved == {"bob_smith": 1} 544 + 545 + 546 + def test_accumulate_voiceprints_missing_overlap_field_admits(speakers_env): 547 + from apps.speakers.attribution import accumulate_voiceprints 548 + 549 + env = speakers_env() 550 + _setup_owner(env) 551 + env.create_entity("Bob Smith") 552 + 553 + other_emb = _normalized([0.1, 0.99]) 554 + seg_dir = _write_controlled_segment( 555 + env, "20240101", "090000_300", np.vstack([other_emb]) 556 + ) 557 + _rewrite_segment_header(seg_dir, "mic_audio") 558 + 559 + labels = [ 560 + { 561 + "sentence_id": 1, 562 + "speaker": "bob_smith", 563 + "confidence": "high", 564 + "method": "structural_single_speaker", 565 + } 566 + ] 567 + 568 + saved = accumulate_voiceprints( 569 + "20240101", STREAM, "090000_300", labels, "mic_audio" 570 + ) 571 + 572 + assert saved == {"bob_smith": 1} 464 573 465 574 466 575 # ---------------------------------------------------------------------------
+5
apps/speakers/tests/test_encoder_config.py
··· 3 3 """Test the constant-import contract for encoder_config.""" 4 4 5 5 from apps.speakers import attribution, encoder_config, owner 6 + from observe.transcribe.main import OVERLAP_DETECTOR_ID as MAIN_OVERLAP_DETECTOR_ID 7 + from observe.transcribe.main import PYANNOTE_OVERLAP_MODEL_SHA256 6 8 7 9 8 10 def test_locked_constants(): ··· 13 15 assert encoder_config.OWNER_BOOTSTRAP_MIN_STMTS == 30 14 16 assert encoder_config.OWNER_BOOTSTRAP_MIN_MEDIAN_DURATION_S == 1.5 15 17 assert encoder_config.OWNER_BOOTSTRAP_MIN_INTRA_COSINE_P25 == 0.30 18 + assert encoder_config.NOISY_FLYWHEEL_OVERLAP_MAX == 0.10 19 + assert encoder_config.OVERLAP_DETECTOR_ID == MAIN_OVERLAP_DETECTOR_ID 20 + assert encoder_config.OVERLAP_DETECTOR_SHA256 == PYANNOTE_OVERLAP_MODEL_SHA256 16 21 17 22 18 23 def test_attribution_imports_acoustic_constants():
+64
apps/speakers/tests/test_owner.py
··· 11 11 import numpy as np 12 12 from flask import Flask 13 13 14 + from apps.speakers.encoder_config import OVERLAP_DETECTOR_ID 14 15 from think.awareness import get_current, update_state 15 16 16 17 ··· 69 70 (segment_dir / f"{source}.jsonl").write_text("\n".join(lines) + "\n") 70 71 (segment_dir / f"{source}.flac").write_bytes(b"") 71 72 return segment_dir 73 + 74 + 75 + def _rewrite_segment_header(segment_dir: Path, source: str, **updates: object) -> None: 76 + jsonl_path = segment_dir / f"{source}.jsonl" 77 + lines = jsonl_path.read_text(encoding="utf-8").splitlines() 78 + header = json.loads(lines[0]) if lines else {} 79 + header.update(updates) 80 + lines[0] = json.dumps(header) 81 + jsonl_path.write_text("\n".join(lines) + "\n", encoding="utf-8") 72 82 73 83 74 84 def _owner_embeddings(count: int, rng: np.random.Generator) -> np.ndarray: ··· 320 330 assert result["observed_value"] < 0.30 321 331 assert get_current()["voiceprint"]["status"] == "low_quality" 322 332 assert not _candidate_path(env.journal).exists() 333 + 334 + 335 + def test_detect_owner_candidate_excludes_chaotic_segments(speakers_env, monkeypatch): 336 + import apps.speakers.owner as owner_module 337 + from apps.speakers.owner import detect_owner_candidate 338 + 339 + class StubHDBSCAN: 340 + def __init__(self, **kwargs): 341 + self.labels_ = np.zeros(60, dtype=np.int32) 342 + 343 + def fit(self, embeddings: np.ndarray): 344 + assert embeddings.shape[0] == 60 345 + return self 346 + 347 + env = speakers_env() 348 + rng = np.random.default_rng(2) 349 + clean_dir = _write_segment( 350 + env.journal, 351 + "20240101", 352 + "mic", 353 + "090000_300", 354 + "audio", 355 + _owner_embeddings(60, rng), 356 + durations_s=np.full(60, 2.0, dtype=np.float32), 357 + ) 358 + _rewrite_segment_header( 359 + clean_dir, 360 + "audio", 361 + overlap_fraction=0.05, 362 + overlap_detector=OVERLAP_DETECTOR_ID, 363 + ) 364 + 365 + chaotic_dir = _write_segment( 366 + env.journal, 367 + "20240102", 368 + "mic", 
369 + "090000_300", 370 + "audio", 371 + _owner_embeddings(60, rng), 372 + durations_s=np.full(60, 2.0, dtype=np.float32), 373 + ) 374 + _rewrite_segment_header( 375 + chaotic_dir, 376 + "audio", 377 + overlap_fraction=0.20, 378 + overlap_detector=OVERLAP_DETECTOR_ID, 379 + ) 380 + 381 + monkeypatch.setattr(owner_module, "HDBSCAN", StubHDBSCAN) 382 + 383 + result = detect_owner_candidate() 384 + 385 + assert result["status"] == "candidate" 386 + assert result["cluster_size"] == 60 323 387 324 388 325 389 def test_load_owner_centroid_no_principal(speakers_env):
observe/transcribe/assets/pyannote-segmentation-3.0.onnx

This is a binary file and will not be displayed.

+18 -1
observe/transcribe/main.py
··· 63 63 64 64 import numpy as np 65 65 66 + from apps.speakers.encoder_config import ( 67 + OVERLAP_DETECTOR_ID, 68 + OVERLAP_DETECTOR_SHA256, 69 + ) 66 70 from observe.transcribe import ( 67 71 BACKEND_REGISTRY, 68 72 get_backend, 69 73 ) 70 74 from observe.transcribe import transcribe as stt_transcribe 75 + from observe.transcribe.overlap import compute_overlap_fraction 71 76 from observe.transcribe.whisper import DEFAULT_COMPUTE, DEFAULT_DEVICE, DEFAULT_MODEL 72 77 from observe.utils import SAMPLE_RATE, get_segment_key, load_audio 73 78 from observe.vad import ( ··· 112 117 MIN_STATEMENT_DURATION = 0.3 113 118 114 119 # WeSpeaker embedder asset 120 + ASSETS_DIR = Path(__file__).parent / "assets" 115 121 EMBEDDER_NAME = "wespeaker-resnet34-256" 116 122 WESPEAKER_MODEL_SHA256 = ( 117 123 "5ef208a9da1453335308a6b6f4e6dfbd7e183a38b604de0a57664f45d257fe94" 118 124 ) 119 - WESPEAKER_MODEL_PATH = Path(__file__).parent / "assets" / "wespeaker-resnet34-256.onnx" 125 + WESPEAKER_MODEL_PATH = ASSETS_DIR / "wespeaker-resnet34-256.onnx" 126 + PYANNOTE_OVERLAP_MODEL_SHA256 = OVERLAP_DETECTOR_SHA256 127 + PYANNOTE_OVERLAP_MODEL_PATH = ASSETS_DIR / "pyannote-segmentation-3.0.onnx" 120 128 121 129 # Number of recent entity names to load for transcription context 122 130 ENTITY_NAMES_LIMIT = 40 ··· 376 384 vad_result: VadResult | None = None, 377 385 segment_meta: dict | None = None, 378 386 backend: str | None = None, 387 + *, 388 + overlap_fraction: float | None = None, 389 + overlap_detector: str | None = None, 379 390 ) -> list[str]: 380 391 """Convert statements to JSONL lines. 
381 392 ··· 419 430 ratio = vad_result.loud_speech_ratio 420 431 if ratio is not None: 421 432 metadata["loud_speech_ratio"] = round(ratio, 2) 433 + if overlap_fraction is not None and overlap_detector is not None: 434 + metadata["overlap_fraction"] = round(float(overlap_fraction), 4) 435 + metadata["overlap_detector"] = overlap_detector 422 436 423 437 # Add enrichment metadata if available 424 438 if enrichment: ··· 626 640 # Generate embeddings before timestamp restoration 627 641 # Use reduced audio buffer if available for consistent timestamps 628 642 embeddings_data = _embed_statements(stt_buffer, statements, SAMPLE_RATE) 643 + overlap_fraction_value = compute_overlap_fraction(audio_buffer) 629 644 630 645 # Restore original timestamps if audio was reduced (non-Gemini backends only) 631 646 # Gemini with chunks already has timestamps in original audio time ··· 649 664 vad_result, 650 665 segment_meta, 651 666 resolved_backend, 667 + overlap_fraction=overlap_fraction_value, 668 + overlap_detector=OVERLAP_DETECTOR_ID, 652 669 ) 653 670 654 671 # Write JSONL
+127
observe/transcribe/overlap.py
# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Overlap-fraction inference for speaker-attribution flywheel gating."""

from __future__ import annotations

import logging
import time
from pathlib import Path

import numpy as np
import onnxruntime as ort
import soundfile as sf
from scipy.signal import resample_poly

from observe.utils import SAMPLE_RATE

logger = logging.getLogger(__name__)

# Sliding-window geometry, in seconds.
WINDOW_S = 10
STRIDE_S = 5
# Frame count used to map sample offsets onto the frame axis.
# NOTE(review): presumably the number of frames the model emits per window — confirm.
FRAMES_PER_WINDOW = 589
# Argmax labels counted as overlapped speech; label 0 is treated as non-speech.
OVERLAP_CLASSES = (4, 5, 6)

# Module-level session, created lazily on first use and then reused.
_overlap_session: ort.InferenceSession | None = None


def _get_overlap_session() -> ort.InferenceSession:
    """Return the cached ONNX session for the overlap model, creating it once.

    Raises:
        FileNotFoundError: If the bundled model file is not present.
    """
    global _overlap_session

    if _overlap_session is not None:
        return _overlap_session

    # Imported here rather than at module top: main imports this module.
    from observe.transcribe.main import (
        PYANNOTE_OVERLAP_MODEL_PATH,
        _select_onnx_providers,
    )

    if not PYANNOTE_OVERLAP_MODEL_PATH.is_file():
        raise FileNotFoundError(
            f"pyannote model not found at {PYANNOTE_OVERLAP_MODEL_PATH}. "
            "Run `make install` to verify the bundled asset."
        )

    onnx_providers = _select_onnx_providers()
    load_started = time.monotonic()
    _overlap_session = ort.InferenceSession(
        str(PYANNOTE_OVERLAP_MODEL_PATH),
        providers=onnx_providers,
    )
    load_ms = (time.monotonic() - load_started) * 1000.0
    logger.info(
        "pyannote overlap session loaded (providers=%s, elapsed=%.1fms)",
        _overlap_session.get_providers(),
        load_ms,
    )
    return _overlap_session


def _window_starts(
    total_samples: int, window_samples: int, stride_samples: int
) -> list[int]:
    """Return window start offsets, ending with a window aligned to the tail."""
    tail_start = max(0, total_samples - window_samples)
    offsets = list(range(0, total_samples - window_samples + 1, stride_samples))
    if not offsets:
        return [tail_start]
    if offsets[-1] != tail_start:
        offsets.append(tail_start)
    return offsets


def compute_overlap_fraction(
    audio: np.ndarray, sample_rate: int = SAMPLE_RATE
) -> float:
    """Return the share of speech frames that are labelled as overlapped.

    The audio is zero-padded up to one window if shorter, scored in sliding
    windows, and the per-frame outputs of overlapping windows are averaged.
    Each frame is then labelled by argmax. The result is
    ``count(label in OVERLAP_CLASSES) / count(label >= 1)``, or ``0.0`` when
    no frame has a label >= 1.

    Args:
        audio: Mono samples; converted to float32 before inference.
        sample_rate: Must equal ``SAMPLE_RATE``.

    Raises:
        ValueError: If ``sample_rate`` differs from ``SAMPLE_RATE``.
    """
    if sample_rate != SAMPLE_RATE:
        raise ValueError(
            f"pyannote overlap detector requires {SAMPLE_RATE} Hz audio, got {sample_rate}"
        )

    session = _get_overlap_session()
    feed_name = session.get_inputs()[0].name

    win = WINDOW_S * sample_rate
    hop = STRIDE_S * sample_rate

    signal = np.asarray(audio, dtype=np.float32)
    shortfall = win - len(signal)
    if shortfall > 0:
        # Pad short clips with silence so at least one full window exists.
        signal = np.concatenate([signal, np.zeros(shortfall, dtype=np.float32)])
    total = len(signal)

    samples_per_frame = win / FRAMES_PER_WINDOW
    num_frames = int(np.ceil(total / samples_per_frame))
    score_sum = np.zeros((num_frames, 7), dtype=np.float64)
    hits = np.zeros((num_frames,), dtype=np.int32)

    for offset in _window_starts(total, win, hop):
        batch = signal[offset : offset + win][np.newaxis, np.newaxis, :]
        outputs = session.run(None, {feed_name: batch})
        frame_scores = outputs[0][0]

        first = int(round(offset / samples_per_frame))
        # Clip the window's frames to the end of the global frame axis.
        last = min(first + frame_scores.shape[0], num_frames)
        frame_scores = frame_scores[: last - first]

        score_sum[first:last] += frame_scores.astype(np.float64)
        hits[first:last] += 1

    # Frames no window touched keep an all-zero row; avoid dividing by zero.
    hits = np.maximum(hits, 1)
    mean_scores = (score_sum / hits[:, None]).astype(np.float32)
    labels = mean_scores.argmax(axis=-1)

    speech_frames = int(np.count_nonzero(labels >= 1))
    if speech_frames == 0:
        return 0.0

    overlap_frames = int(np.count_nonzero(np.isin(labels, OVERLAP_CLASSES)))
    return float(overlap_frames / speech_frames)


def compute_overlap_fraction_for_wav(path: Path) -> float:
    """Read an audio file, downmix and resample as needed, and score it."""
    samples, file_rate = sf.read(path, dtype="float32", always_2d=False)

    # Multi-channel input is averaged down to a single channel.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    if file_rate != SAMPLE_RATE:
        samples = resample_poly(samples, SAMPLE_RATE, file_rate).astype(np.float32)
        file_rate = SAMPLE_RATE

    mono = np.asarray(samples, dtype=np.float32)
    return compute_overlap_fraction(mono, file_rate)
+120
tests/test_transcribe_overlap.py
# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Tests for pyannote overlap-fraction inference."""

from __future__ import annotations

import numpy as np
import pytest

from observe.utils import SAMPLE_RATE


class _Input:
    """Minimal stand-in for an ONNX input descriptor; only ``name`` is read."""

    def __init__(self, name: str):
        self.name = name


class _StubSession:
    """Fake inference session that returns the same frame scores on every run."""

    def __init__(self, log_probs: np.ndarray):
        self._log_probs = log_probs.astype(np.float32)

    def get_inputs(self):
        return [_Input("input_values")]

    def run(self, _outputs, _inputs):
        # Add the leading batch axis the caller indexes away.
        batched = self._log_probs[None, :, :]
        return [batched]


def _dominant_log_probs(classes: np.ndarray) -> np.ndarray:
    """Build (frames, 7) scores where each frame's given class clearly wins."""
    frame_count = classes.shape[0]
    scores = np.full((frame_count, 7), -10.0, dtype=np.float32)
    scores[np.arange(frame_count), classes] = 0.0
    return scores


def _silent_classes() -> np.ndarray:
    """Return one window's worth of frames all labelled class 0."""
    return np.zeros(589, dtype=np.int64)


def _use_stub_session(monkeypatch, overlap_module, classes: np.ndarray) -> None:
    """Replace the module's session getter with a stub scoring ``classes``."""
    monkeypatch.setattr(
        overlap_module,
        "_get_overlap_session",
        lambda: _StubSession(_dominant_log_probs(classes)),
    )


def test_compute_overlap_fraction_silent_audio_returns_zero(monkeypatch):
    from observe.transcribe import overlap

    _use_stub_session(monkeypatch, overlap, _silent_classes())
    audio = np.zeros(12 * SAMPLE_RATE, dtype=np.float32)

    assert overlap.compute_overlap_fraction(audio) == 0.0


def test_compute_overlap_fraction_short_audio_padded(monkeypatch):
    from observe.transcribe import overlap

    _use_stub_session(monkeypatch, overlap, _silent_classes())
    audio = np.zeros(3 * SAMPLE_RATE, dtype=np.float32)

    result = overlap.compute_overlap_fraction(audio)

    assert isinstance(result, float)
    assert result == 0.0


def test_compute_overlap_fraction_non_aligned_length(monkeypatch):
    from observe.transcribe import overlap

    _use_stub_session(monkeypatch, overlap, _silent_classes())
    audio = np.zeros(int(13.7 * SAMPLE_RATE), dtype=np.float32)

    assert overlap.compute_overlap_fraction(audio) == 0.0


def test_compute_overlap_fraction_rejects_wrong_sample_rate():
    from observe.transcribe.overlap import compute_overlap_fraction

    one_second = np.zeros(16000, dtype=np.float32)
    with pytest.raises(ValueError, match="requires 16000 Hz audio"):
        compute_overlap_fraction(one_second, sample_rate=8000)


def test_get_overlap_session_loads_and_caches():
    from observe.transcribe.overlap import _get_overlap_session

    # Uses the real getter: the second call must return the same object.
    first = _get_overlap_session()
    second = _get_overlap_session()

    assert first is second


def test_compute_overlap_fraction_uses_conditioned_formula(monkeypatch):
    from observe.transcribe import overlap

    # 300 frames of class 1, 100 of class 4, 189 of class 0:
    # 100 overlapped out of 400 speech frames.
    classes = np.concatenate(
        [
            np.full(300, 1, dtype=np.int64),
            np.full(100, 4, dtype=np.int64),
            np.zeros(189, dtype=np.int64),
        ]
    )
    _use_stub_session(monkeypatch, overlap, classes)
    audio = np.zeros(10 * SAMPLE_RATE, dtype=np.float32)

    result = overlap.compute_overlap_fraction(audio)

    assert result == pytest.approx(100 / 400)