···181181 "transcribe": {
182182 "device": "auto",
183183 "model": "medium.en",
184184- "compute_type": "default"
184184+ "compute_type": "default",
185185+ "enrich": true,
186186+ "preserve_all": false
185187 }
186188}
187189```
···189191Fields:
190192- `device` (string) – Device for inference: `"auto"` (detect GPU, fall back to CPU), `"cpu"`, or `"cuda"`. Default: `"auto"`.
191193- `model` (string) – Whisper model to use (e.g., `"tiny.en"`, `"base.en"`, `"small.en"`, `"medium.en"`, `"large-v3-turbo"`, `"distil-large-v3"`). Default: `"medium.en"`.
192192-- `compute_type` (string) – Compute precision: `"default"` (auto-select: float16 on GPU, float32 on CPU), `"float32"` (most compatible), `"float16"` (faster on most GPUs), `"int8"` (fastest on CPU). Default: `"default"`.
194194+- `compute_type` (string) – Compute precision: `"default"` (auto-select optimal for platform), `"float32"` (most compatible), `"float16"` (faster on CUDA GPUs), `"int8"` (fastest on CPU). Default: `"default"`.
195195+- `enrich` (boolean) – Enable LLM enrichment for topic extraction and transcript correction. Default: `true`.
196196+- `preserve_all` (boolean) – Keep audio files even when no speech is detected. When `false`, silent recordings are deleted to save disk space. Default: `false`.
197197+198198+**Platform auto-detection**: When `compute_type` is `"default"`, optimal settings are automatically selected:
199199+- **CUDA GPU**: Uses `float16` for GPU-optimized inference
200200+- **CPU (including Apple Silicon)**: Uses `int8` for ~2x faster inference and significantly faster model loading
201201+202202+The voice-embedding encoder (resemblyzer) also auto-selects its device: CUDA when Whisper is running on CUDA, MPS on Apple Silicon (~16x faster than CPU), and CPU otherwise.
193203194204The device, compute type, and model can be overridden via CLI flags: `--cpu` forces CPU mode with `int8`, and `--model MODEL` overrides the model.
195205
+39-30
observe/macos/observer.py
···163163 logger.warning(f"No audio streams in {audio_path}")
164164 return False
165165166166- stream = audio_streams[0]
167167- sample_rate = stream.rate or SAMPLE_RATE
166166+ # Check ALL audio streams - pass if ANY has enough voice activity
167167+ # Stream 0 is typically system audio, stream 1 is microphone
168168+ for stream_idx, stream in enumerate(audio_streams):
169169+ sample_rate = stream.rate or SAMPLE_RATE
170170+171171+ # Decode audio and collect samples for this stream
172172+ samples = []
173173+ container.seek(0) # Reset to start for each stream
174174+ for packet in container.demux(stream):
175175+ for frame in packet.decode():
176176+ arr = frame.to_ndarray()
177177+ # Convert to mono if stereo (average channels)
178178+ if arr.ndim > 1:
179179+ arr = arr.mean(axis=0)
180180+ samples.append(arr.flatten())
181181+182182+ if not samples:
183183+ continue
168184169169- # Decode audio and collect samples
170170- samples = []
171171- for frame in container.decode(stream):
172172- arr = frame.to_ndarray()
173173- # Convert to mono if stereo (average channels)
174174- if arr.ndim > 1:
175175- arr = arr.mean(axis=0)
176176- samples.append(arr.flatten())
185185+ # Concatenate all samples
186186+ all_samples = np.concatenate(samples)
177187178178- if not samples:
179179- logger.warning(f"No audio samples decoded from {audio_path}")
180180- return False
188188+ # Split into CHUNK_DURATION (5 second) chunks and count threshold hits
189189+ chunk_samples = int(sample_rate * CHUNK_DURATION)
190190+ threshold_hits = 0
181191182182- # Concatenate all samples
183183- all_samples = np.concatenate(samples)
192192+ for i in range(0, len(all_samples), chunk_samples):
193193+ chunk = all_samples[i : i + chunk_samples]
194194+ if len(chunk) == 0:
195195+ continue
184196185185- # Split into CHUNK_DURATION (5 second) chunks and count threshold hits
186186- chunk_samples = int(sample_rate * CHUNK_DURATION)
187187- threshold_hits = 0
197197+ # Compute RMS for this chunk
198198+ rms = float(np.sqrt(np.mean(chunk**2)))
199199+ if rms > RMS_THRESHOLD:
200200+ threshold_hits += 1
188201189189- for i in range(0, len(all_samples), chunk_samples):
190190- chunk = all_samples[i : i + chunk_samples]
191191- if len(chunk) == 0:
192192- continue
202202+ logger.debug(
203203+ f"Audio threshold check stream {stream_idx}: "
204204+ f"{threshold_hits}/{MIN_HITS_FOR_SAVE} hits"
205205+ )
193206194194- # Compute RMS for this chunk
195195- rms = float(np.sqrt(np.mean(chunk**2)))
196196- if rms > RMS_THRESHOLD:
197197- threshold_hits += 1
207207+ if threshold_hits >= MIN_HITS_FOR_SAVE:
208208+ return True
198209199199- logger.debug(
200200- f"Audio threshold check: {threshold_hits}/{MIN_HITS_FOR_SAVE} hits"
201201- )
202202- return threshold_hits >= MIN_HITS_FOR_SAVE
210210+ # No stream passed threshold
211211+ return False
203212204213 except Exception as e:
205214 logger.warning(f"Error checking audio threshold for {audio_path}: {e}")
+143-36
observe/transcribe.py
···1515- <stem>.npz: Sentence-level voice embeddings indexed by segment id
16161717Configuration (journal config):
1818+- transcribe.device: Device for inference ("auto", "cpu", "cuda"). Default: "auto"
1919+- transcribe.model: Whisper model size (e.g., "medium.en"). Default: "medium.en"
2020+- transcribe.compute_type: Precision ("default", "float32", "float16", "int8").
2121+ When "default", auto-selects: float16 for CUDA, int8 for CPU (including Apple Silicon).
1822- transcribe.enrich: Enable/disable Gemini Lite enrichment (default: true)
2323+- transcribe.preserve_all: Keep audio files even when no speech detected (default: false)
2424+2525+Platform optimizations:
2626+- CUDA GPU: Uses float16 for GPU-optimized inference
2727+- Apple Silicon: Uses int8 for Whisper (~2x faster), MPS for embeddings (~16x faster)
2828+- Other CPU: Uses int8 for best performance
1929"""
20302131from __future__ import annotations
···2535import json
2636import logging
2737import os
3838+import platform
2839import time
2940from pathlib import Path
3041···5566SENTENCE_ENDINGS = frozenset(".?!")
def _is_apple_silicon() -> bool:
    """Detect whether the process is running on Apple Silicon.

    Returns:
        True when ``platform.system()`` reports ``"Darwin"`` and
        ``platform.machine()`` reports ``"arm64"``; False otherwise.
    """
    on_macos = platform.system() == "Darwin"
    return on_macos and platform.machine() == "arm64"
def _has_cuda() -> bool:
    """Check whether CUDA is available via CTranslate2.

    ``ctranslate2`` is imported lazily inside the function, so a missing or
    broken installation is reported as "no CUDA" instead of failing at
    module import time.

    Returns:
        True if CTranslate2 reports at least one CUDA device; False if it
        reports none or if the import or the device query raises.
    """
    try:
        import ctranslate2

        return ctranslate2.get_cuda_device_count() > 0
    except Exception:
        # Deliberately broad: this is a best-effort probe, and any failure
        # (missing package, driver/runtime error) simply means "no CUDA".
        return False
def _get_optimal_compute_type(device: str) -> str:
    """Get the compute type to use when the configured value is "default".

    When compute_type is "default", CTranslate2 auto-selects but makes
    suboptimal choices on some platforms. This function provides better
    defaults:

    - CUDA GPU: float16 for GPU-optimized inference
    - CPU (including Apple Silicon): int8 for ~2x faster inference and
      faster model load

    Args:
        device: The device being used ("cpu", "cuda", "auto"). Any value
            other than "cuda" or "auto" is treated like "cpu".

    Returns:
        "float16" when CUDA is explicitly requested, or when device is
        "auto" and a CUDA device is detected; "int8" otherwise.
    """
    # The explicit "cuda" comparison comes first so the CUDA probe only runs
    # when the device is "auto".
    # NOTE(review): float16 is returned for any CUDA device without checking
    # that the GPU supports it -- confirm behavior on older GPUs.
    use_cuda = device == "cuda" or (device == "auto" and _has_cuda())

    # For CPU (including Apple Silicon), int8 is fastest: the original note
    # cites ~2x speedup and 76x faster model loading.
    return "float16" if use_cuda else "int8"
def _get_optimal_encoder_device() -> str:
    """Get the optimal device for VoiceEncoder (resemblyzer).

    On Apple Silicon, MPS provides ~16x speedup over CPU for embeddings.
    CUDA is not considered here; this function only chooses between "mps"
    and "cpu".

    Returns:
        Device string: "mps" on Apple Silicon when torch reports the MPS
        backend as available, "cpu" otherwise (including when torch cannot
        be imported or has no MPS backend).
    """
    if not _is_apple_silicon():
        return "cpu"

    try:
        import torch
    except ImportError:
        return "cpu"

    # torch builds that predate the MPS backend have no `torch.backends.mps`
    # attribute; fall back to CPU instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    return "cpu"
125125+126126+58127def resegment_by_sentences(segments: list[dict]) -> list[dict]:
59128 """Re-segment transcript by sentence boundaries instead of acoustic pauses.
60129···138207 from faster_whisper import WhisperModel
139208 from resemblyzer import VoiceEncoder
140209141141- # VoiceEncoder follows whisper device: None auto-detects CUDA, "cpu" forces CPU
142142- encoder_device = None if device in ("auto", "cuda") else "cpu"
210210+ # Resolve "default" compute_type to platform-optimal setting
211211+ # CTranslate2's auto-selection falls back to float32 on CPU, but int8 is faster
212212+ if compute_type == "default":
213213+ compute_type = _get_optimal_compute_type(device)
214214+ logging.info(
215215+ f"Auto-selected compute_type={compute_type} for device={device}"
216216+ )
143217144218 logging.info(f"Loading faster-whisper model ({model_size})...")
145219 t0 = time.perf_counter()
···153227 f"(device={whisper_actual_device}, compute={whisper_actual_compute})"
154228 )
155229230230+ # VoiceEncoder: use MPS on Apple Silicon for ~16x speedup, otherwise CPU
231231+ # (CUDA auto-detection handled by resemblyzer when device=None)
232232+ if whisper_actual_device == "cuda":
233233+ encoder_device = None # Let resemblyzer auto-detect CUDA
234234+ else:
235235+ encoder_device = _get_optimal_encoder_device()
236236+156237 logging.info("Loading resemblyzer VoiceEncoder...")
157238 t0 = time.perf_counter()
158239 self.voice_encoder = VoiceEncoder(device=encoder_device)
···180261 """
181262 return audio_path.with_suffix(".npz")
182263183183- def _transcribe(self, audio_path: Path, initial_prompt: str | None) -> list[dict]:
264264+ def _transcribe(
265265+ self, audio_path: Path, initial_prompt: str | None
266266+ ) -> tuple[list[dict], dict]:
184267 """Transcribe audio using faster-whisper.
185268186269 Args:
···188271 initial_prompt: Optional prompt with entity names for context
189272190273 Returns:
191191- List of sentence-aligned segments with word-level data
274274+ Tuple of (segments, vad_info) where:
275275+ - segments: List of sentence-aligned segments with word-level data
276276+ - vad_info: Dict with 'duration' (total) and 'duration_after_vad' (speech)
192277193278 Raises:
194279 RuntimeError: If VAD detected speech but transcription produced no segments
···270355 f"to {len(sentence_segments)} sentences"
271356 )
272357273273- return sentence_segments
358358+ # Return segments with VAD info for callosum events
359359+ vad_info = {
360360+ "duration": info.duration,
361361+ "duration_after_vad": info.duration_after_vad,
362362+ }
363363+ return sentence_segments, vad_info
274364275365 def _embed_segments(
276366 self, audio_path: Path, segments: list[dict]
···451541452542 try:
453543 # Transcribe with faster-whisper
454454- segments = self._transcribe(audio_path, STYLE_PROMPT)
544544+ segments, vad_info = self._transcribe(audio_path, STYLE_PROMPT)
545545+546546+ # Load config for preserve_all setting
547547+ config = get_config()
548548+ preserve_all = config.get("transcribe", {}).get("preserve_all", False)
549549+550550+ # Build base event fields (always emitted as observe.transcribed)
551551+ journal_path = Path(get_journal())
552552+ day = raw_path.parent.parent.name
553553+ try:
554554+ rel_input = raw_path.relative_to(journal_path)
555555+ except ValueError:
556556+ rel_input = raw_path
557557+558558+ event = {
559559+ "input": str(rel_input),
560560+ "vad_duration": round(vad_info["duration"], 1),
561561+ "vad_speech": round(vad_info["duration_after_vad"], 1),
562562+ }
563563+ if day:
564564+ event["day"] = day
565565+ if segment:
566566+ event["segment"] = segment
567567+ if remote:
568568+ event["remote"] = remote
455569456456- # Skip if no speech detected - safe to delete since _transcribe() already
570570+ # Handle no speech detected - safe to skip/delete since _transcribe() already
457571 # validated that VAD also detected minimal speech (raises RuntimeError otherwise)
458572 if not segments:
459459- logging.info(f"No speech detected in {raw_path}, removing file")
460460- raw_path.unlink()
573573+ # Determine outcome based on preserve_all config
574574+ if preserve_all:
575575+ event["outcome"] = "preserved"
576576+ logging.info(
577577+ f"No speech detected in {raw_path}, preserving file "
578578+ f"(preserve_all=true, VAD: {vad_info['duration_after_vad']:.1f}s "
579579+ f"of {vad_info['duration']:.1f}s)"
580580+ )
581581+ else:
582582+ event["outcome"] = "deleted"
583583+ logging.info(f"No speech detected in {raw_path}, removing file")
584584+ raw_path.unlink()
585585+586586+ callosum_send("observe", "transcribed", **event)
461587 return
462588463463- # Extract date and time from path structure
589589+ # Extract date and time from path structure (day already set above)
464590 # Files are always in segment directories: YYYYMMDD/HHMMSS_LEN/audio.flac
465591 time_part = segment.split("_")[0] if segment else "000000"
466466- day_str = raw_path.parent.parent.name
467467-468592 base_dt = datetime.datetime.strptime(
469469- f"{day_str}_{time_part}", "%Y%m%d_%H%M%S"
593593+ f"{day}_{time_part}", "%Y%m%d_%H%M%S"
470594 )
471595472596 # Extract source from <source>_audio pattern
···476600 if suffix.endswith("_audio") and suffix != "audio":
477601 source = suffix[:-6] # Remove "_audio" suffix
478602479479- # Run enrichment if enabled in config
603603+ # Run enrichment if enabled in config (config already loaded above)
480604 enrichment = None
481481- config = get_config()
482605 enrich_enabled = config.get("transcribe", {}).get("enrich", True)
483606 if enrich_enabled:
484607 from observe.enrich import enrich_transcript
···504627 else:
505628 logging.warning(f"No embeddings generated for {raw_path}")
506629507507- # Emit completion event
508508- journal_path = Path(get_journal())
509509- duration_ms = int((time.time() - start_time) * 1000)
510510-630630+ # Add completion fields and emit event
631631+ event["outcome"] = "transcribed"
632632+ event["duration_ms"] = int((time.time() - start_time) * 1000)
511633 try:
512512- rel_input = raw_path.relative_to(journal_path)
513634 rel_output = jsonl_path.relative_to(journal_path)
514635 except ValueError:
515515- rel_input = raw_path
516636 rel_output = jsonl_path
637637+ event["output"] = str(rel_output)
517638518518- # Extract day from audio path (grandparent is day dir)
519519- day = raw_path.parent.parent.name
520520-521521- event_fields = {
522522- "input": str(rel_input),
523523- "output": str(rel_output),
524524- "duration_ms": duration_ms,
525525- }
526526- if day:
527527- event_fields["day"] = day
528528- if segment:
529529- event_fields["segment"] = segment
530530- if remote:
531531- event_fields["remote"] = remote
532532- callosum_send("observe", "transcribed", **event_fields)
639639+ callosum_send("observe", "transcribed", **event)
533640534641 except Exception as e:
535642 logging.error(f"Failed to transcribe {raw_path}: {e}", exc_info=True)