Fix M4A multi-track audio processing for sck-cli files

+199 -24

2 changed files

expand all

observe

transcribe.py

tests

test_transcribe.py

+49 -23

observe/transcribe.py

···
22 22
23 23 from observe.diarize import DiarizationError, diarize, save_speaker_embeddings
24 24 from observe.hear import SAMPLE_RATE
25 - from observe.utils import (
26 -     extract_descriptive_suffix,
27 -     get_segment_key,
28 -     segment_and_suffix,
29 - )
25 + from observe.utils import get_segment_key, segment_and_suffix
30 26 from think.callosum import callosum_send
31 27 from think.entities import load_entity_names
32 28 from think.models import GEMINI_FLASH
···
187 183         """Prepare audio file for diarization, converting if needed.
188 184
189 185         Returns path to a file suitable for diarization (mono or stereo FLAC/WAV).
190 -         For m4a files, converts to temporary FLAC.
186 +         For m4a files, converts to temporary FLAC, mixing all audio streams.
187 +
188 +         M4A files from sck-cli contain two mono streams: track 0 = system audio,
189 +         track 1 = microphone. Both are decoded and mixed together.
191 190         """
192 191         import av
193 192
···
196 195
197 196         logging.info(f"Converting m4a to FLAC for diarization: {raw_path}")
198 197
198 +         # First pass: count streams
199 199         container = av.open(str(raw_path))
200 -         audio_streams = list(container.streams.audio)
200 +         num_streams = len(list(container.streams.audio))
201 +         container.close()
201 202
202 -         if len(audio_streams) == 0:
203 -             container.close()
203 +         if num_streams == 0:
204 204             raise ValueError(f"No audio streams found in {raw_path}")
205 205
206 -         # Decode to mono for diarization
207 -         stream = audio_streams[0]
208 -         resampler = av.audio.resampler.AudioResampler(
209 -             format="flt", layout="mono", rate=SAMPLE_RATE
210 -         )
211 -         chunks = []
212 -         for frame in container.decode(stream):
213 -             for out_frame in resampler.resample(frame):
214 -                 arr = out_frame.to_ndarray()
215 -                 chunks.append(arr)
206 +         # Decode each stream separately (PyAV requires fresh container per stream)
207 +         # sck-cli produces: track 0 = system audio, track 1 = microphone
208 +         stream_data = []
209 +         for stream_idx in range(num_streams):
210 +             container = av.open(str(raw_path))
211 +             stream = list(container.streams.audio)[stream_idx]
216 212
217 -         container.close()
213 +             resampler = av.audio.resampler.AudioResampler(
214 +                 format="flt", layout="mono", rate=SAMPLE_RATE
215 +             )
216 +             chunks = []
217 +             for frame in container.decode(stream):
218 +                 for out_frame in resampler.resample(frame):
219 +                     arr = out_frame.to_ndarray()
220 +                     chunks.append(arr)
218 221
219 -         if not chunks:
222 +             container.close()
223 +
224 +             if chunks:
225 +                 combined = np.concatenate(chunks, axis=1).flatten()
226 +                 stream_data.append(combined)
227 +                 logging.info(
228 +                     f" Stream {stream_idx}: {len(combined)} samples "
229 +                     f"({len(combined) / SAMPLE_RATE:.1f}s)"
230 +                 )
231 +
232 +         if not stream_data:
220 233             raise ValueError(f"No audio data decoded from {raw_path}")
221 234
222 -         combined = np.concatenate(chunks, axis=1).flatten()
235 +         # Mix all streams together
236 +         if len(stream_data) == 1:
237 +             mixed = stream_data[0]
238 +         else:
239 +             # Pad shorter streams to match longest
240 +             max_len = max(len(s) for s in stream_data)
241 +             padded = []
242 +             for s in stream_data:
243 +                 if len(s) < max_len:
244 +                     s = np.pad(s, (0, max_len - len(s)), mode="constant")
245 +                 padded.append(s)
246 +             # Average all streams
247 +             mixed = np.mean(padded, axis=0)
248 +             logging.info(f" Mixed {len(stream_data)} streams -> {len(mixed)} samples")
223 249
224 250         # Write to temporary FLAC in same directory
225 251         temp_path = raw_path.with_suffix(".tmp.flac")
226 -         audio_int16 = (np.clip(combined, -1.0, 1.0) * 32767).astype(np.int16)
252 +         audio_int16 = (np.clip(mixed, -1.0, 1.0) * 32767).astype(np.int16)
227 253         sf.write(temp_path, audio_int16, SAMPLE_RATE, format="FLAC")
228 254
229 255         return temp_path

+150 -1

tests/test_transcribe.py

···
4 4 """Tests for observe.transcribe validation logic."""
5 5
6 6 import json
7 + import shutil
8 + import tempfile
9 + from pathlib import Path
7 10
8 - from observe.transcribe import validate_transcription
11 + import numpy as np
12 + import pytest
13 + import soundfile as sf
14 +
15 + from observe.transcribe import Transcriber, validate_transcription
16 +
17 +
18 + class _MockTranscriber:
19 +     """Mock transcriber for testing _prepare_audio without API key."""
20 +
21 +     def _prepare_audio(self, raw_path):
22 +         return Transcriber._prepare_audio(self, raw_path)
9 23
10 24
11 25 def _extract_metadata_and_items(result: list) -> tuple[dict, list]:
···
263 277     # Only line should be empty metadata
264 278     first = json.loads(lines[0])
265 279     assert first == {}
280 +
281 +
282 + @pytest.mark.skipif(not shutil.which("ffmpeg"), reason="ffmpeg not installed")
283 + def test_prepare_audio_multi_track_m4a():
284 +     """Test that _prepare_audio mixes multiple M4A audio streams together."""
285 +     import subprocess
286 +
287 +     with tempfile.TemporaryDirectory() as tmpdir:
288 +         # Create two mono FLAC files to combine into multi-track M4A
289 +         track0_path = Path(tmpdir) / "track0.flac"
290 +         track1_path = Path(tmpdir) / "track1.flac"
291 +         m4a_path = Path(tmpdir) / "test.m4a"
292 +
293 +         # Track 0: silence (system audio - no content)
294 +         # Track 1: 440Hz sine wave (microphone - has voice)
295 +         sample_rate = 16000
296 +         duration = 1.0 # 1 second
297 +         t = np.linspace(0, duration, int(sample_rate * duration), dtype=np.float32)
298 +
299 +         track0_data = np.zeros_like(t) # Silence
300 +         track1_data = 0.5 * np.sin(2 * np.pi * 440 * t) # 440Hz tone
301 +
302 +         sf.write(track0_path, track0_data, sample_rate, format="FLAC")
303 +         sf.write(track1_path, track1_data, sample_rate, format="FLAC")
304 +
305 +         # Use ffmpeg to create multi-track M4A (same structure as sck-cli output)
306 +         # This creates an M4A with 2 separate mono audio streams
307 +         result = subprocess.run(
308 +             [
309 +                 "ffmpeg",
310 +                 "-y",
311 +                 "-i",
312 +                 str(track0_path),
313 +                 "-i",
314 +                 str(track1_path),
315 +                 "-map",
316 +                 "0:a",
317 +                 "-map",
318 +                 "1:a",
319 +                 "-c:a",
320 +                 "aac",
321 +                 "-b:a",
322 +                 "64k",
323 +                 str(m4a_path),
324 +             ],
325 +             capture_output=True,
326 +             text=True,
327 +         )
328 +         assert result.returncode == 0, f"ffmpeg failed: {result.stderr}"
329 +
330 +         transcriber = _MockTranscriber()
331 +         temp_flac = transcriber._prepare_audio(m4a_path)
332 +
333 +         try:
334 +             assert temp_flac.exists()
335 +             assert temp_flac.suffix == ".flac"
336 +
337 +             # Read the output and verify both streams were mixed
338 +             mixed_data, sr = sf.read(temp_flac, dtype="float32")
339 +
340 +             # The mixed audio should have content from track 1 (the sine wave)
341 +             # AAC compression affects amplitude, so use loose threshold
342 +             rms = np.sqrt(np.mean(mixed_data**2))
343 +             assert rms > 0.1, f"Mixed audio should contain signal, got RMS={rms}"
344 +
345 +             # Verify sample rate matches expected
346 +             assert sr == 16000
347 +         finally:
348 +             if temp_flac.exists():
349 +                 temp_flac.unlink()
350 +
351 +
352 + @pytest.mark.skipif(not shutil.which("ffmpeg"), reason="ffmpeg not installed")
353 + def test_prepare_audio_single_stream_m4a():
354 +     """Test that _prepare_audio handles single-stream M4A correctly."""
355 +     import subprocess
356 +
357 +     with tempfile.TemporaryDirectory() as tmpdir:
358 +         track_path = Path(tmpdir) / "track.flac"
359 +         m4a_path = Path(tmpdir) / "single.m4a"
360 +
361 +         # Single track with 440Hz sine wave
362 +         sample_rate = 16000
363 +         duration = 0.5
364 +         t = np.linspace(0, duration, int(sample_rate * duration), dtype=np.float32)
365 +         track_data = 0.5 * np.sin(2 * np.pi * 440 * t)
366 +
367 +         sf.write(track_path, track_data, sample_rate, format="FLAC")
368 +
369 +         result = subprocess.run(
370 +             [
371 +                 "ffmpeg",
372 +                 "-y",
373 +                 "-i",
374 +                 str(track_path),
375 +                 "-c:a",
376 +                 "aac",
377 +                 "-b:a",
378 +                 "64k",
379 +                 str(m4a_path),
380 +             ],
381 +             capture_output=True,
382 +             text=True,
383 +         )
384 +         assert result.returncode == 0, f"ffmpeg failed: {result.stderr}"
385 +
386 +         transcriber = _MockTranscriber()
387 +         temp_flac = transcriber._prepare_audio(m4a_path)
388 +
389 +         try:
390 +             assert temp_flac.exists()
391 +
392 +             mixed_data, sr = sf.read(temp_flac, dtype="float32")
393 +             rms = np.sqrt(np.mean(mixed_data**2))
394 +             # Single stream should preserve the signal
395 +             assert rms > 0.3, f"Single stream should have strong signal, got RMS={rms}"
396 +         finally:
397 +             if temp_flac.exists():
398 +                 temp_flac.unlink()
399 +
400 +
401 + def test_prepare_audio_flac_passthrough():
402 +     """Test that _prepare_audio returns FLAC files unchanged."""
403 +     with tempfile.TemporaryDirectory() as tmpdir:
404 +         flac_path = Path(tmpdir) / "test.flac"
405 +
406 +         sample_rate = 16000
407 +         data = np.zeros(sample_rate, dtype=np.float32)
408 +         sf.write(flac_path, data, sample_rate, format="FLAC")
409 +
410 +         transcriber = _MockTranscriber()
411 +         result = transcriber._prepare_audio(flac_path)
412 +
413 +         # FLAC should be returned as-is (not converted)
414 +         assert result == flac_path

Configure Feed

Configure Feed