Merge branch 'main' of github.com:solpbc/solstone

+12 -2

docs/JOURNAL.md

··· 181 181 "transcribe": { 182 182 "device": "auto", 183 183 "model": "medium.en", 184 - "compute_type": "default" 184 + "compute_type": "default", 185 + "enrich": true, 186 + "preserve_all": false 185 187 } 186 188 } 187 189 ``` ··· 189 191 Fields: 190 192 - `device` (string) – Device for inference: `"auto"` (detect GPU, fall back to CPU), `"cpu"`, or `"cuda"`. Default: `"auto"`. 191 193 - `model` (string) – Whisper model to use (e.g., `"tiny.en"`, `"base.en"`, `"small.en"`, `"medium.en"`, `"large-v3-turbo"`, `"distil-large-v3"`). Default: `"medium.en"`. 192 - - `compute_type` (string) – Compute precision: `"default"` (auto-select: float16 on GPU, float32 on CPU), `"float32"` (most compatible), `"float16"` (faster on most GPUs), `"int8"` (fastest on CPU). Default: `"default"`. 194 + - `compute_type` (string) – Compute precision: `"default"` (auto-select optimal for platform), `"float32"` (most compatible), `"float16"` (faster on CUDA GPUs), `"int8"` (fastest on CPU). Default: `"default"`. 195 + - `enrich` (boolean) – Enable LLM enrichment for topic extraction and transcript correction. Default: `true`. 196 + - `preserve_all` (boolean) – Keep audio files even when no speech is detected. When `false`, silent recordings are deleted to save disk space. Default: `false`. 197 + 198 + **Platform auto-detection**: When `compute_type` is `"default"`, optimal settings are automatically selected: 199 + - **CUDA GPU**: Uses `float16` for GPU-optimized inference 200 + - **CPU (including Apple Silicon)**: Uses `int8` for ~2x faster inference and significantly faster model loading 201 + 202 + Voice embeddings (resemblyzer) also auto-detect the best device: MPS on Apple Silicon (~16x faster), CUDA when available, or CPU fallback. 193 203 194 204 These settings can be overridden via CLI flags: `--cpu` forces CPU mode with int8, `--model MODEL` overrides the model. 195 205

+39 -30

observe/macos/observer.py

··· 163 163 logger.warning(f"No audio streams in {audio_path}") 164 164 return False 165 165 166 - stream = audio_streams[0] 167 - sample_rate = stream.rate or SAMPLE_RATE 166 + # Check ALL audio streams - pass if ANY has enough voice activity 167 + # Stream 0 is typically system audio, stream 1 is microphone 168 + for stream_idx, stream in enumerate(audio_streams): 169 + sample_rate = stream.rate or SAMPLE_RATE 170 + 171 + # Decode audio and collect samples for this stream 172 + samples = [] 173 + container.seek(0) # Reset to start for each stream 174 + for packet in container.demux(stream): 175 + for frame in packet.decode(): 176 + arr = frame.to_ndarray() 177 + # Convert to mono if stereo (average channels) 178 + if arr.ndim > 1: 179 + arr = arr.mean(axis=0) 180 + samples.append(arr.flatten()) 181 + 182 + if not samples: 183 + continue 168 184 169 - # Decode audio and collect samples 170 - samples = [] 171 - for frame in container.decode(stream): 172 - arr = frame.to_ndarray() 173 - # Convert to mono if stereo (average channels) 174 - if arr.ndim > 1: 175 - arr = arr.mean(axis=0) 176 - samples.append(arr.flatten()) 185 + # Concatenate all samples 186 + all_samples = np.concatenate(samples) 177 187 178 - if not samples: 179 - logger.warning(f"No audio samples decoded from {audio_path}") 180 - return False 188 + # Split into CHUNK_DURATION (5 second) chunks and count threshold hits 189 + chunk_samples = int(sample_rate * CHUNK_DURATION) 190 + threshold_hits = 0 181 191 182 - # Concatenate all samples 183 - all_samples = np.concatenate(samples) 192 + for i in range(0, len(all_samples), chunk_samples): 193 + chunk = all_samples[i : i + chunk_samples] 194 + if len(chunk) == 0: 195 + continue 184 196 185 - # Split into CHUNK_DURATION (5 second) chunks and count threshold hits 186 - chunk_samples = int(sample_rate * CHUNK_DURATION) 187 - threshold_hits = 0 197 + # Compute RMS for this chunk 198 + rms = float(np.sqrt(np.mean(chunk**2))) 199 + if rms > RMS_THRESHOLD: 200 + threshold_hits += 1 188 201 189 - for i in range(0, len(all_samples), chunk_samples): 190 - chunk = all_samples[i : i + chunk_samples] 191 - if len(chunk) == 0: 192 - continue 202 + logger.debug( 203 + f"Audio threshold check stream {stream_idx}: " 204 + f"{threshold_hits}/{MIN_HITS_FOR_SAVE} hits" 205 + ) 193 206 194 - # Compute RMS for this chunk 195 - rms = float(np.sqrt(np.mean(chunk**2))) 196 - if rms > RMS_THRESHOLD: 197 - threshold_hits += 1 207 + if threshold_hits >= MIN_HITS_FOR_SAVE: 208 + return True 198 209 199 - logger.debug( 200 - f"Audio threshold check: {threshold_hits}/{MIN_HITS_FOR_SAVE} hits" 201 - ) 202 - return threshold_hits >= MIN_HITS_FOR_SAVE 210 + # No stream passed threshold 211 + return False 203 212 204 213 except Exception as e: 205 214 logger.warning(f"Error checking audio threshold for {audio_path}: {e}")

+143 -36

observe/transcribe.py

··· 15 15 - <stem>.npz: Sentence-level voice embeddings indexed by segment id 16 16 17 17 Configuration (journal config): 18 + - transcribe.device: Device for inference ("auto", "cpu", "cuda"). Default: "auto" 19 + - transcribe.model: Whisper model size (e.g., "medium.en"). Default: "medium.en" 20 + - transcribe.compute_type: Precision ("default", "float32", "float16", "int8"). 21 + When "default", auto-selects: float16 for CUDA, int8 for CPU (including Apple Silicon). 18 22 - transcribe.enrich: Enable/disable Gemini Lite enrichment (default: true) 23 + - transcribe.preserve_all: Keep audio files even when no speech detected (default: false) 24 + 25 + Platform optimizations: 26 + - CUDA GPU: Uses float16 for GPU-optimized inference 27 + - Apple Silicon: Uses int8 for Whisper (~2x faster), MPS for embeddings (~16x faster) 28 + - Other CPU: Uses int8 for best performance 19 29 """ 20 30 21 31 from __future__ import annotations ··· 25 35 import json 26 36 import logging 27 37 import os 38 + import platform 28 39 import time 29 40 from pathlib import Path 30 41 ··· 55 66 SENTENCE_ENDINGS = frozenset(".?!") 56 67 57 68 69 + def _is_apple_silicon() -> bool: 70 + """Detect if running on Apple Silicon.""" 71 + return platform.system() == "Darwin" and platform.machine() == "arm64" 72 + 73 + 74 + def _has_cuda() -> bool: 75 + """Check if CUDA is available via CTranslate2.""" 76 + try: 77 + import ctranslate2 78 + 79 + return ctranslate2.get_cuda_device_count() > 0 80 + except Exception: 81 + return False 82 + 83 + 84 + def _get_optimal_compute_type(device: str) -> str: 85 + """Get optimal compute type for the current platform. 86 + 87 + When compute_type is "default", CTranslate2 auto-selects but makes suboptimal 88 + choices on some platforms. This function provides better defaults: 89 + 90 + - CUDA GPU: float16 for GPU-optimized inference 91 + - CPU (including Apple Silicon): int8 for ~2x faster inference and faster model load 92 + 93 + Args: 94 + device: The device being used ("cpu", "cuda", "auto") 95 + 96 + Returns: 97 + Optimal compute type string 98 + """ 99 + # If CUDA is explicitly requested or auto-detected, float16 is optimal 100 + if device == "cuda" or (device == "auto" and _has_cuda()): 101 + return "float16" 102 + 103 + # For CPU (including Apple Silicon), int8 is fastest 104 + # This provides ~2x speedup and 76x faster model loading 105 + return "int8" 106 + 107 + 108 + def _get_optimal_encoder_device() -> str: 109 + """Get optimal device for VoiceEncoder (resemblyzer). 110 + 111 + On Apple Silicon, MPS provides ~16x speedup over CPU for embeddings. 112 + 113 + Returns: 114 + Device string: "mps" on Apple Silicon with MPS available, "cpu" otherwise 115 + """ 116 + if _is_apple_silicon(): 117 + try: 118 + import torch 119 + 120 + if torch.backends.mps.is_available(): 121 + return "mps" 122 + except ImportError: 123 + pass 124 + return "cpu" 125 + 126 + 58 127 def resegment_by_sentences(segments: list[dict]) -> list[dict]: 59 128 """Re-segment transcript by sentence boundaries instead of acoustic pauses. 60 129 ··· 138 207 from faster_whisper import WhisperModel 139 208 from resemblyzer import VoiceEncoder 140 209 141 - # VoiceEncoder follows whisper device: None auto-detects CUDA, "cpu" forces CPU 142 - encoder_device = None if device in ("auto", "cuda") else "cpu" 210 + # Resolve "default" compute_type to platform-optimal setting 211 + # CTranslate2's auto-selection falls back to float32 on CPU, but int8 is faster 212 + if compute_type == "default": 213 + compute_type = _get_optimal_compute_type(device) 214 + logging.info( 215 + f"Auto-selected compute_type={compute_type} for device={device}" 216 + ) 143 217 144 218 logging.info(f"Loading faster-whisper model ({model_size})...") 145 219 t0 = time.perf_counter() ··· 153 227 f"(device={whisper_actual_device}, compute={whisper_actual_compute})" 154 228 ) 155 229 230 + # VoiceEncoder: use MPS on Apple Silicon for ~16x speedup, otherwise CPU 231 + # (CUDA auto-detection handled by resemblyzer when device=None) 232 + if whisper_actual_device == "cuda": 233 + encoder_device = None # Let resemblyzer auto-detect CUDA 234 + else: 235 + encoder_device = _get_optimal_encoder_device() 236 + 156 237 logging.info("Loading resemblyzer VoiceEncoder...") 157 238 t0 = time.perf_counter() 158 239 self.voice_encoder = VoiceEncoder(device=encoder_device) ··· 180 261 """ 181 262 return audio_path.with_suffix(".npz") 182 263 183 - def _transcribe(self, audio_path: Path, initial_prompt: str | None) -> list[dict]: 264 + def _transcribe( 265 + self, audio_path: Path, initial_prompt: str | None 266 + ) -> tuple[list[dict], dict]: 184 267 """Transcribe audio using faster-whisper. 185 268 186 269 Args: ··· 188 271 initial_prompt: Optional prompt with entity names for context 189 272 190 273 Returns: 191 - List of sentence-aligned segments with word-level data 274 + Tuple of (segments, vad_info) where: 275 + - segments: List of sentence-aligned segments with word-level data 276 + - vad_info: Dict with 'duration' (total) and 'duration_after_vad' (speech) 192 277 193 278 Raises: 194 279 RuntimeError: If VAD detected speech but transcription produced no segments ··· 270 355 f"to {len(sentence_segments)} sentences" 271 356 ) 272 357 273 - return sentence_segments 358 + # Return segments with VAD info for callosum events 359 + vad_info = { 360 + "duration": info.duration, 361 + "duration_after_vad": info.duration_after_vad, 362 + } 363 + return sentence_segments, vad_info 274 364 275 365 def _embed_segments( 276 366 self, audio_path: Path, segments: list[dict] ··· 451 541 452 542 try: 453 543 # Transcribe with faster-whisper 454 - segments = self._transcribe(audio_path, STYLE_PROMPT) 544 + segments, vad_info = self._transcribe(audio_path, STYLE_PROMPT) 545 + 546 + # Load config for preserve_all setting 547 + config = get_config() 548 + preserve_all = config.get("transcribe", {}).get("preserve_all", False) 549 + 550 + # Build base event fields (always emitted as observe.transcribed) 551 + journal_path = Path(get_journal()) 552 + day = raw_path.parent.parent.name 553 + try: 554 + rel_input = raw_path.relative_to(journal_path) 555 + except ValueError: 556 + rel_input = raw_path 557 + 558 + event = { 559 + "input": str(rel_input), 560 + "vad_duration": round(vad_info["duration"], 1), 561 + "vad_speech": round(vad_info["duration_after_vad"], 1), 562 + } 563 + if day: 564 + event["day"] = day 565 + if segment: 566 + event["segment"] = segment 567 + if remote: 568 + event["remote"] = remote 455 569 456 - # Skip if no speech detected - safe to delete since _transcribe() already 570 + # Handle no speech detected - safe to skip/delete since _transcribe() already 457 571 # validated that VAD also detected minimal speech (raises RuntimeError otherwise) 458 572 if not segments: 459 - logging.info(f"No speech detected in {raw_path}, removing file") 460 - raw_path.unlink() 573 + # Determine outcome based on preserve_all config 574 + if preserve_all: 575 + event["outcome"] = "preserved" 576 + logging.info( 577 + f"No speech detected in {raw_path}, preserving file " 578 + f"(preserve_all=true, VAD: {vad_info['duration_after_vad']:.1f}s " 579 + f"of {vad_info['duration']:.1f}s)" 580 + ) 581 + else: 582 + event["outcome"] = "deleted" 583 + logging.info(f"No speech detected in {raw_path}, removing file") 584 + raw_path.unlink() 585 + 586 + callosum_send("observe", "transcribed", **event) 461 587 return 462 588 463 - # Extract date and time from path structure 589 + # Extract date and time from path structure (day already set above) 464 590 # Files are always in segment directories: YYYYMMDD/HHMMSS_LEN/audio.flac 465 591 time_part = segment.split("_")[0] if segment else "000000" 466 - day_str = raw_path.parent.parent.name 467 - 468 592 base_dt = datetime.datetime.strptime( 469 - f"{day_str}_{time_part}", "%Y%m%d_%H%M%S" 593 + f"{day}_{time_part}", "%Y%m%d_%H%M%S" 470 594 ) 471 595 472 596 # Extract source from <source>_audio pattern ··· 476 600 if suffix.endswith("_audio") and suffix != "audio": 477 601 source = suffix[:-6] # Remove "_audio" suffix 478 602 479 - # Run enrichment if enabled in config 603 + # Run enrichment if enabled in config (config already loaded above) 480 604 enrichment = None 481 - config = get_config() 482 605 enrich_enabled = config.get("transcribe", {}).get("enrich", True) 483 606 if enrich_enabled: 484 607 from observe.enrich import enrich_transcript ··· 504 627 else: 505 628 logging.warning(f"No embeddings generated for {raw_path}") 506 629 507 - # Emit completion event 508 - journal_path = Path(get_journal()) 509 - duration_ms = int((time.time() - start_time) * 1000) 510 - 630 + # Add completion fields and emit event 631 + event["outcome"] = "transcribed" 632 + event["duration_ms"] = int((time.time() - start_time) * 1000) 511 633 try: 512 - rel_input = raw_path.relative_to(journal_path) 513 634 rel_output = jsonl_path.relative_to(journal_path) 514 635 except ValueError: 515 - rel_input = raw_path 516 636 rel_output = jsonl_path 637 + event["output"] = str(rel_output) 517 638 518 - # Extract day from audio path (grandparent is day dir) 519 - day = raw_path.parent.parent.name 520 - 521 - event_fields = { 522 - "input": str(rel_input), 523 - "output": str(rel_output), 524 - "duration_ms": duration_ms, 525 - } 526 - if day: 527 - event_fields["day"] = day 528 - if segment: 529 - event_fields["segment"] = segment 530 - if remote: 531 - event_fields["remote"] = remote 532 - callosum_send("observe", "transcribed", **event_fields) 639 + callosum_send("observe", "transcribed", **event) 533 640 534 641 except Exception as e: 535 642 logging.error(f"Failed to transcribe {raw_path}: {e}", exc_info=True)

Configure Feed

Configure Feed