feat(transcribe): tolerate music without spurious errors or wasted Rev.ai calls

+2

docs/JOURNAL.md

··· 240 240 "backend": "whisper", 241 241 "enrich": true, 242 242 "preserve_all": false, 243 + "noise_upgrade_min_speech_ratio": 0.3, 243 244 "whisper": { 244 245 "device": "auto", 245 246 "model": "medium.en", ··· 256 257 - `backend` (string) – STT backend to use: `"whisper"` (local processing) or `"revai"` (cloud with speaker diarization). Default: `"whisper"`. 257 258 - `enrich` (boolean) – Enable LLM enrichment for topic extraction and transcript correction. Default: `true`. 258 259 - `preserve_all` (boolean) – Keep audio files even when no speech is detected. When `false`, silent recordings are deleted to save disk space. Default: `false`. 260 + - `noise_upgrade_min_speech_ratio` (number) – Minimum fraction of loud audio windows that must overlap detected speech before a noisy recording is upgraded to Rev.ai. Recordings below this ratio (e.g. music or other non-speech noise) stay on the configured backend. Default: `0.3`. 259 261 260 262 **Whisper backend settings** (`transcribe.whisper`): 261 263 - `device` (string) – Device for inference: `"auto"` (detect GPU, fall back to CPU), `"cpu"`, or `"cuda"`. Default: `"auto"`.

+33 -20

observe/transcribe/main.py

··· 21 21 - transcribe.preserve_all: Keep audio files even when no speech detected (default: false) 22 22 - transcribe.min_speech_seconds: Minimum speech duration to proceed. Default: 1.0 23 23 - transcribe.noise_upgrade: Auto-switch to Rev.ai for noisy recordings (default: true) 24 + - transcribe.noise_upgrade_min_speech_ratio: Min speech/loud ratio required for noisy upgrade (default: 0.3). Filters out music and other non-speech noise. 24 25 25 26 Whisper backend settings (transcribe.whisper): 26 27 - device: Device for inference ("auto", "cpu", "cuda"). Default: "auto" ··· 205 206 if vad_result.noisy_rms is not None: 206 207 event["noisy_rms"] = round(vad_result.noisy_rms, 4) 207 208 event["noisy_s"] = round(vad_result.noisy_s, 1) 209 + if vad_result.loud_windows > 0: 210 + event["loud_windows"] = vad_result.loud_windows 211 + event["speech_loud_windows"] = vad_result.speech_loud_windows 212 + ratio = vad_result.loud_speech_ratio 213 + if ratio is not None: 214 + event["loud_speech_ratio"] = round(ratio, 2) 208 215 209 216 if day: 210 217 event["day"] = day ··· 359 366 if vad_result.noisy_rms is not None: 360 367 metadata["noisy_rms"] = round(vad_result.noisy_rms, 4) 361 368 metadata["noisy_s"] = round(vad_result.noisy_s, 1) 369 + if vad_result.loud_windows > 0: 370 + metadata["loud_windows"] = vad_result.loud_windows 371 + metadata["speech_loud_windows"] = vad_result.speech_loud_windows 372 + ratio = vad_result.loud_speech_ratio 373 + if ratio is not None: 374 + metadata["loud_speech_ratio"] = round(ratio, 2) 362 375 363 376 # Add enrichment metadata if available 364 377 if enrichment: ··· 505 518 backend_module = get_backend(backend) 506 519 model_info = backend_module.get_model_info(backend_config) 507 520 508 - # Sanity check: if VAD detected speech but we got no statements, something is wrong 509 - if vad_result.has_speech and not statements: 510 - if vad_result.speech_duration < 5.0: 511 - # Marginal speech detection — treat as silence rather than failure. 
512 - # VAD occasionally flags brief noise as speech; if the STT backend 513 - # can't produce anything from it, that's expected, not an error. 514 - logging.info( 515 - f"VAD detected {vad_result.speech_duration:.1f}s of marginal speech " 516 - f"but transcription produced 0 statements — treating as silence" 517 - ) 518 - else: 519 - raise RuntimeError( 520 - f"VAD detected {vad_result.speech_duration:.1f}s of speech " 521 - f"(from {vad_result.duration:.1f}s total) but transcription produced " 522 - f"0 statements. This indicates a transcription failure, not silence." 523 - ) 524 - 525 521 # Load config for preserve_all setting 526 522 config = get_config() 527 523 preserve_all = config.get("transcribe", {}).get("preserve_all", False) ··· 531 527 532 528 # Handle no speech detected 533 529 if not statements: 530 + logging.info( 531 + "STT backend returned 0 statements, treating as silence " 532 + "(VAD: %.1fs speech of %.1fs)", 533 + vad_result.speech_duration, 534 + vad_result.duration, 535 + ) 534 536 if preserve_all: 535 537 event["outcome"] = "preserved" 536 538 logging.info( ··· 706 708 # - Audio is noisy 707 709 # - Rev.ai token is available 708 710 noise_upgrade = transcribe_config.get("noise_upgrade", True) 711 + min_ratio = transcribe_config.get("noise_upgrade_min_speech_ratio", 0.3) 709 712 if ( 710 713 not args.backend 711 714 and noise_upgrade ··· 714 717 ): 715 718 from observe.transcribe.revai import has_token 716 719 717 - if has_token(): 720 + ratio = vad_result.loud_speech_ratio 721 + if ratio is not None and ratio < min_ratio: 722 + logging.info( 723 + "Noisy audio (RMS=%.4f) looks like non-speech (loud_speech_ratio=%.2f < %.2f), " 724 + "skipping Rev.ai upgrade", 725 + vad_result.noisy_rms, 726 + ratio, 727 + min_ratio, 728 + ) 729 + elif has_token(): 718 730 logging.info( 719 - f"Noisy audio detected (RMS={vad_result.noisy_rms:.4f}), " 720 - f"upgrading to Rev.ai backend" 731 + "Noisy audio detected (RMS=%.4f, loud_speech_ratio=%s), upgrading 
to Rev.ai backend", 732 + vad_result.noisy_rms, 733 + f"{ratio:.2f}" if ratio is not None else "n/a", 721 734 ) 722 735 backend = "revai" 723 736

+58 -1

observe/vad.py

··· 113 113 return float(np.mean(rms_values)), total_duration 114 114 115 115 116 + def compute_loud_speech_windows( 117 + audio: np.ndarray, 118 + speech_segments: list[tuple[float, float]], 119 + sample_rate: int, 120 + window_s: float = 1.0, 121 + rms_threshold: float = 0.01, 122 + ) -> tuple[int, int]: 123 + """Count loud fixed-duration windows and those that overlap speech.""" 124 + window_samples = int(window_s * sample_rate) 125 + if len(audio) < window_samples: 126 + return 0, 0 127 + 128 + loud_windows = 0 129 + speech_loud_windows = 0 130 + 131 + for i in range(len(audio) // window_samples): 132 + start_sample = i * window_samples 133 + end_sample = start_sample + window_samples 134 + window_audio = audio[start_sample:end_sample] 135 + rms = np.sqrt(np.mean(window_audio**2)) 136 + if rms <= rms_threshold: 137 + continue 138 + 139 + loud_windows += 1 140 + window_start = i * window_s 141 + window_end = window_start + window_s 142 + if any( 143 + window_start < end and window_end > start for start, end in speech_segments 144 + ): 145 + speech_loud_windows += 1 146 + 147 + return loud_windows, speech_loud_windows 148 + 149 + 116 150 @dataclass 117 151 class SpeechSegment: 118 152 """A segment of speech with original and reduced timestamps. 
··· 203 237 speech_segments: List of (start, end) tuples for each speech segment 204 238 noisy_rms: RMS level of non-speech regions (None if not computable) 205 239 noisy_s: Duration of non-speech audio used for RMS calculation 240 + loud_windows: Number of 1s windows whose RMS exceeds the loud threshold 241 + speech_loud_windows: Number of loud windows that overlap speech segments 242 + loud_speech_ratio: Ratio of speech_loud_windows to loud_windows, or None 206 243 """ 207 244 208 245 duration: float ··· 211 248 speech_segments: list[tuple[float, float]] = field(default_factory=list) 212 249 noisy_rms: float | None = None 213 250 noisy_s: float = 0.0 251 + loud_windows: int = 0 252 + speech_loud_windows: int = 0 214 253 215 254 def is_noisy(self, threshold: float = 0.01) -> bool: 216 255 """Check if background noise level exceeds threshold. ··· 230 269 return 0.0 231 270 return self.speech_duration / self.duration 232 271 272 + @property 273 + def loud_speech_ratio(self) -> float | None: 274 + """Ratio of speech-overlapping loud windows to all loud windows.""" 275 + if self.loud_windows <= 0: 276 + return None 277 + return self.speech_loud_windows / self.loud_windows 278 + 233 279 234 280 def run_vad( 235 281 audio: np.ndarray, ··· 274 320 275 321 # Compute RMS of non-speech regions (for noise detection) 276 322 noisy_rms, noisy_s = compute_nonspeech_rms(audio, speech_segments, SAMPLE_RATE) 323 + loud_windows, speech_loud_windows = compute_loud_speech_windows( 324 + audio, speech_segments, SAMPLE_RATE 325 + ) 277 326 278 327 vad_time = time.perf_counter() - t0 279 328 rms_str = f", rms={noisy_rms:.4f}" if noisy_rms is not None else "" 329 + ratio = speech_loud_windows / loud_windows if loud_windows > 0 else None 330 + loud_str = ( 331 + f", loud_windows={loud_windows}, speech_loud={speech_loud_windows}, ratio={ratio:.2f}" 332 + if ratio is not None 333 + else "" 334 + ) 280 335 logging.info( 281 336 f" VAD complete in {vad_time:.2f}s: " 282 337 f"{duration:.1f}s 
total, {speech_duration:.1f}s speech, " 283 - f"{len(speech_chunks)} chunks, has_speech={has_speech}{rms_str}" 338 + f"{len(speech_chunks)} chunks, has_speech={has_speech}{rms_str}{loud_str}" 284 339 ) 285 340 286 341 return VadResult( ··· 290 345 speech_segments=speech_segments, 291 346 noisy_rms=noisy_rms, 292 347 noisy_s=noisy_s, 348 + loud_windows=loud_windows, 349 + speech_loud_windows=speech_loud_windows, 293 350 ) 294 351 295 352

+106

tests/test_transcribe_empty_result.py

# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Tests for empty-result handling in process_audio."""

from unittest.mock import MagicMock, patch

import numpy as np
import pytest

from observe.utils import SAMPLE_RATE
from observe.vad import VadResult


@pytest.fixture
def raw_path(tmp_path):
    """Create an empty audio file nested four directories below ``tmp_path``."""
    segment_dir = tmp_path / "chronicle" / "20260416" / "default" / "120000_300"
    segment_dir.mkdir(parents=True)
    audio_file = segment_dir / "audio.m4a"
    audio_file.touch()
    return audio_file


@pytest.fixture
def audio_buffer():
    """Return ten seconds' worth of all-zero float32 samples."""
    sample_count = 10 * SAMPLE_RATE
    return np.zeros(sample_count, dtype=np.float32)


@pytest.fixture
def vad_result():
    """Return a VadResult reporting 5.0s of speech within a 10.0s recording."""
    return VadResult(
        duration=10.0,
        speech_duration=5.0,
        has_speech=True,
        speech_segments=[(1.0, 6.0)],
    )


def _run_with_empty_transcript(raw_path, audio_buffer, vad_result, *, preserve_all):
    """Call process_audio while the STT layer is patched to return no statements.

    Config, journal location, backend lookup and event sending are all patched
    on ``observe.transcribe.main``. The journal is pointed at the fifth parent
    of ``raw_path`` (the ``tmp_path`` root created by the ``raw_path`` fixture).

    Returns:
        The mock that replaced ``callosum_send`` during the call.
    """
    from observe.transcribe.main import process_audio

    fake_backend = MagicMock()
    fake_backend.get_model_info.return_value = {
        "model": "medium.en",
        "device": "cpu",
        "compute_type": "int8",
    }
    config = {"transcribe": {"preserve_all": preserve_all}}
    journal_root = str(raw_path.parents[4])

    with (
        patch("observe.transcribe.main.get_config", return_value=config),
        patch("observe.transcribe.main.get_journal", return_value=journal_root),
        patch("observe.transcribe.main.stt_transcribe", return_value=[]),
        patch("observe.transcribe.main.get_backend", return_value=fake_backend),
        patch("observe.transcribe.main.callosum_send") as send_mock,
    ):
        process_audio(raw_path, audio_buffer, vad_result, {}, backend="whisper")

    return send_mock


def test_empty_statements_filter_path(raw_path, audio_buffer, vad_result):
    """preserve_all=False: the raw file is removed and the outcome is "filtered"."""
    send_mock = _run_with_empty_transcript(
        raw_path, audio_buffer, vad_result, preserve_all=False
    )

    assert not raw_path.exists()
    last_call = send_mock.call_args
    assert last_call.args[:2] == ("observe", "transcribed")
    assert last_call.kwargs["outcome"] == "filtered"


def test_empty_statements_preserve_path(raw_path, audio_buffer, vad_result):
    """preserve_all=True: the raw file is kept and the outcome is "preserved"."""
    send_mock = _run_with_empty_transcript(
        raw_path, audio_buffer, vad_result, preserve_all=True
    )

    assert raw_path.exists()
    last_call = send_mock.call_args
    assert last_call.args[:2] == ("observe", "transcribed")
    assert last_call.kwargs["outcome"] == "preserved"


def test_backend_raise_propagates(raw_path, audio_buffer, vad_result):
    """A RuntimeError from the STT call ends in SystemExit(1) with the file intact."""
    from observe.transcribe.main import process_audio

    stt_failure = RuntimeError("rev.ai 502")
    with patch(
        "observe.transcribe.main.stt_transcribe", side_effect=stt_failure
    ), pytest.raises(SystemExit) as exc_info:
        process_audio(raw_path, audio_buffer, vad_result, {}, backend="whisper")

    assert exc_info.value.code == 1
    assert raw_path.exists()

+149

tests/test_transcribe_noise_upgrade.py

··· 3 3 4 4 """Tests for noise upgrade feature in transcription.""" 5 5 6 + import argparse 6 7 from unittest.mock import patch 7 8 9 + import numpy as np 10 + import pytest 11 + 12 + from observe.utils import SAMPLE_RATE 8 13 from observe.vad import VadResult 14 + 15 + 16 + @pytest.fixture 17 + def audio_path(tmp_path): 18 + path = tmp_path / "chronicle" / "20260416" / "default" / "120000_300" / "audio.m4a" 19 + path.parent.mkdir(parents=True) 20 + path.touch() 21 + return path 22 + 23 + 24 + @pytest.fixture 25 + def args(): 26 + return argparse.Namespace(backend=None, cpu=False, model=None, redo=False) 27 + 28 + 29 + @pytest.fixture 30 + def audio_buffer(): 31 + return np.zeros(10 * SAMPLE_RATE, dtype=np.float32) 9 32 10 33 11 34 class TestHasToken: ··· 171 194 lines = _statements_to_jsonl(statements, "audio.flac", base_dt, model_info) 172 195 metadata = json.loads(lines[0]) 173 196 assert metadata["backend"] == "unknown" 197 + 198 + 199 + class TestNoiseUpgradeGate: 200 + def test_gate_blocks_on_low_ratio(self, audio_path, args, audio_buffer): 201 + from observe.transcribe.main import _process_one 202 + 203 + vad = VadResult( 204 + duration=10.0, 205 + speech_duration=5.0, 206 + has_speech=True, 207 + speech_segments=[(1.0, 6.0)], 208 + noisy_rms=0.02, 209 + noisy_s=3.0, 210 + loud_windows=200, 211 + speech_loud_windows=10, 212 + ) 213 + transcribe_config = { 214 + "backend": "whisper", 215 + "noise_upgrade": True, 216 + "noise_upgrade_min_speech_ratio": 0.3, 217 + "whisper": {}, 218 + } 219 + 220 + with ( 221 + patch("observe.transcribe.main.load_audio", return_value=audio_buffer), 222 + patch("observe.transcribe.main.run_vad", return_value=vad), 223 + patch("observe.transcribe.main.reduce_audio", return_value=(None, None)), 224 + patch("observe.transcribe.main.process_audio") as mock_process_audio, 225 + patch("observe.transcribe.revai.has_token", return_value=True), 226 + ): 227 + _process_one(audio_path, args, transcribe_config, []) 228 + 229 + assert 
mock_process_audio.call_args.kwargs["backend"] == "whisper" 230 + 231 + def test_gate_admits_on_high_ratio(self, audio_path, args, audio_buffer): 232 + from observe.transcribe.main import _process_one 233 + 234 + vad = VadResult( 235 + duration=10.0, 236 + speech_duration=5.0, 237 + has_speech=True, 238 + speech_segments=[(1.0, 6.0)], 239 + noisy_rms=0.02, 240 + noisy_s=3.0, 241 + loud_windows=100, 242 + speech_loud_windows=90, 243 + ) 244 + transcribe_config = { 245 + "backend": "whisper", 246 + "noise_upgrade": True, 247 + "noise_upgrade_min_speech_ratio": 0.3, 248 + "whisper": {}, 249 + } 250 + 251 + with ( 252 + patch("observe.transcribe.main.load_audio", return_value=audio_buffer), 253 + patch("observe.transcribe.main.run_vad", return_value=vad), 254 + patch("observe.transcribe.main.reduce_audio", return_value=(None, None)), 255 + patch("observe.transcribe.main.process_audio") as mock_process_audio, 256 + patch("observe.transcribe.revai.has_token", return_value=True), 257 + ): 258 + _process_one(audio_path, args, transcribe_config, []) 259 + 260 + assert mock_process_audio.call_args.kwargs["backend"] == "revai" 261 + 262 + def test_gate_fallback_when_ratio_none(self, audio_path, args, audio_buffer): 263 + from observe.transcribe.main import _process_one 264 + 265 + vad = VadResult( 266 + duration=10.0, 267 + speech_duration=5.0, 268 + has_speech=True, 269 + speech_segments=[(1.0, 6.0)], 270 + noisy_rms=0.02, 271 + noisy_s=3.0, 272 + loud_windows=0, 273 + speech_loud_windows=0, 274 + ) 275 + transcribe_config = { 276 + "backend": "whisper", 277 + "noise_upgrade": True, 278 + "noise_upgrade_min_speech_ratio": 0.3, 279 + "whisper": {}, 280 + } 281 + 282 + with ( 283 + patch("observe.transcribe.main.load_audio", return_value=audio_buffer), 284 + patch("observe.transcribe.main.run_vad", return_value=vad), 285 + patch("observe.transcribe.main.reduce_audio", return_value=(None, None)), 286 + patch("observe.transcribe.main.process_audio") as mock_process_audio, 287 + 
patch("observe.transcribe.revai.has_token", return_value=True), 288 + ): 289 + _process_one(audio_path, args, transcribe_config, []) 290 + 291 + assert mock_process_audio.call_args.kwargs["backend"] == "revai" 292 + 293 + def test_gate_blocks_when_not_noisy(self, audio_path, args, audio_buffer): 294 + from observe.transcribe.main import _process_one 295 + 296 + vad = VadResult( 297 + duration=10.0, 298 + speech_duration=5.0, 299 + has_speech=True, 300 + speech_segments=[(1.0, 6.0)], 301 + noisy_rms=0.005, 302 + noisy_s=3.0, 303 + loud_windows=100, 304 + speech_loud_windows=90, 305 + ) 306 + transcribe_config = { 307 + "backend": "whisper", 308 + "noise_upgrade": True, 309 + "noise_upgrade_min_speech_ratio": 0.3, 310 + "whisper": {}, 311 + } 312 + 313 + with ( 314 + patch("observe.transcribe.main.load_audio", return_value=audio_buffer), 315 + patch("observe.transcribe.main.run_vad", return_value=vad), 316 + patch("observe.transcribe.main.reduce_audio", return_value=(None, None)), 317 + patch("observe.transcribe.main.process_audio") as mock_process_audio, 318 + patch("observe.transcribe.revai.has_token", return_value=True), 319 + ): 320 + _process_one(audio_path, args, transcribe_config, []) 321 + 322 + assert mock_process_audio.call_args.kwargs["backend"] == "whisper"

+61

tests/test_vad.py

··· 6 6 from unittest.mock import patch 7 7 8 8 import numpy as np 9 + import pytest 9 10 10 11 from observe.utils import SAMPLE_RATE 11 12 from observe.vad import ( ··· 13 14 AudioReduction, 14 15 SpeechSegment, 15 16 VadResult, 17 + compute_loud_speech_windows, 16 18 compute_nonspeech_rms, 17 19 get_nonspeech_segments, 18 20 reduce_audio, ··· 21 23 ) 22 24 23 25 26 + @pytest.fixture 27 + def rng(): 28 + return np.random.default_rng(42) 29 + 30 + 24 31 class TestVadResult: 25 32 """Test VadResult dataclass.""" 26 33 ··· 37 44 assert result.speech_duration == 5.0 38 45 assert result.has_speech is True 39 46 assert result.speech_segments == [(1.0, 3.0), (5.0, 8.0)] 47 + assert result.loud_windows == 0 48 + assert result.speech_loud_windows == 0 49 + assert result.loud_speech_ratio is None 40 50 41 51 def test_vad_result_no_speech(self): 42 52 """VadResult with no speech should have has_speech=False.""" ··· 401 411 402 412 assert result.noisy_rms is None 403 413 assert result.noisy_s == 0.0 414 + 415 + def test_loud_speech_ratio_music_like_fixture(self, rng): 416 + audio = rng.normal(0.0, 0.1, 30 * SAMPLE_RATE).astype(np.float32) 417 + speech_segments = [(5.0, 6.0), (20.0, 21.0)] 418 + 419 + loud_windows, speech_loud_windows = compute_loud_speech_windows( 420 + audio, speech_segments, SAMPLE_RATE 421 + ) 422 + 423 + assert loud_windows >= 25 424 + assert speech_loud_windows <= 3 425 + assert speech_loud_windows / loud_windows < 0.15 426 + 427 + def test_loud_speech_ratio_meeting_like_fixture(self): 428 + audio = np.zeros(30 * SAMPLE_RATE, dtype=np.float32) 429 + audio[0 : 10 * SAMPLE_RATE] = 0.1 430 + audio[15 * SAMPLE_RATE : 25 * SAMPLE_RATE] = 0.1 431 + speech_segments = [(0.0, 10.0), (15.0, 25.0)] 432 + 433 + loud_windows, speech_loud_windows = compute_loud_speech_windows( 434 + audio, speech_segments, SAMPLE_RATE 435 + ) 436 + 437 + assert loud_windows == 20 438 + assert speech_loud_windows == 20 439 + assert speech_loud_windows / loud_windows == 
pytest.approx(1.0, rel=0.01) 440 + 441 + def test_loud_speech_ratio_all_silent(self): 442 + audio = np.zeros(10 * SAMPLE_RATE, dtype=np.float32) 443 + 444 + loud_windows, speech_loud_windows = compute_loud_speech_windows( 445 + audio, [], SAMPLE_RATE 446 + ) 447 + 448 + assert loud_windows == 0 449 + assert speech_loud_windows == 0 450 + assert ( 451 + VadResult( 452 + duration=10.0, 453 + speech_duration=0.0, 454 + has_speech=False, 455 + loud_windows=0, 456 + speech_loud_windows=0, 457 + ).loud_speech_ratio 458 + is None 459 + ) 460 + 461 + def test_loud_speech_ratio_short_audio(self): 462 + audio = np.zeros(SAMPLE_RATE // 2, dtype=np.float32) 463 + 464 + assert compute_loud_speech_windows(audio, [], SAMPLE_RATE) == (0, 0) 404 465 405 466 406 467 class TestSpeechSegment:

Configure Feed

Configure Feed