personal memory agent
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Enhance live transcription with optional Whisper support and adjust parameters for active speaker identification

Jer Miller eb1bd803 a2d2617e

+20 -14
+20 -14
hear/live.py
··· 17 17 18 18 from hear.audio_utils import SAMPLE_RATE, detect_speech 19 19 from see.screen_dbus import take_screenshot 20 - from think.models import GEMINI_FLASH 20 + from think.models import GEMINI_FLASH, GEMINI_LITE 21 21 22 22 MODEL = GEMINI_FLASH # -lite-preview-06-17 23 23 ··· 77 77 model=MODEL, 78 78 contents=contents, 79 79 config=types.GenerateContentConfig( 80 - temperature=0.0, 81 - max_output_tokens=16, 80 + temperature=0.1, 81 + max_output_tokens=256+1024, 82 + thinking_config=types.ThinkingConfig(thinking_budget=1024), 82 83 response_mime_type="text/plain", 83 84 ), 84 85 ) 85 - result = response.text.strip() 86 - logging.info("Meeting result: %s", result) 87 - return result 86 + logging.info("Meeting Speaker: %s", response.text) 87 + return response.text.strip() 88 88 except Exception as e: 89 89 logging.error("Gemini meeting request failed: %s", e) 90 90 return "" ··· 109 109 stash: np.ndarray, 110 110 client, 111 111 speaker_state: dict, 112 + use_whisper: bool = False, 112 113 ) -> np.ndarray: 113 114 """Handle a single audio message from WebSocket.""" 114 115 try: ··· 126 127 speaker_state["name"] = "" 127 128 speaker_state["task"] = None 128 129 129 - # Trigger screenshot capture if stash grows beyond 5 seconds 130 - if len(stash) / SAMPLE_RATE > 5 and speaker_state.get("task") is None: 130 + # Trigger screenshot capture if stash grows beyond 3 seconds 131 + if len(stash) / SAMPLE_RATE > 3 and speaker_state.get("task") is None: 131 132 speaker_state["task"] = asyncio.create_task(identify_active_speaker(client)) 132 133 133 134 segments, stash = detect_speech(vad, "live", stash) ··· 141 142 MODEL, 142 143 buf.getvalue(), 143 144 ) 144 - w_text = transcribe_whisper(buf.getvalue()) 145 - prefix = f"{speaker_state.get('name', '')}: " if speaker_state.get("name") else "" 146 - print(f"G: {prefix}{g_text}\nW: {w_text}") 145 + if use_whisper: 146 + w_text = transcribe_whisper(buf.getvalue()) 147 + prefix = f"{speaker_state.get('name', '')}: " if speaker_state.get("name") else "" 148 + print(f"G: {prefix}{g_text}\nW: {w_text}") 149 + else: 150 + prefix = f"{speaker_state.get('name', '')}: " if speaker_state.get("name") else "" 151 + print(f"G: {prefix}{g_text}") 147 152 speaker_state["name"] = "" 148 153 speaker_state["task"] = None 149 154 except Exception as e: ··· 154 159 return stash 155 160 156 161 157 - async def live_loop(ws_url: str, client) -> None: 162 + async def live_loop(ws_url: str, client, use_whisper: bool = False) -> None: 158 163 vad = load_silero_vad() 159 164 stash = np.array([], dtype=np.float32) 160 165 speaker_state = {"task": None, "name": ""} ··· 168 173 async with websockets.connect(ws_url) as ws: 169 174 logging.info("WebSocket connected successfully") 170 175 async for msg in ws: 171 - stash = await handle_audio_message(msg, vad, stash, client, speaker_state) 176 + stash = await handle_audio_message(msg, vad, stash, client, speaker_state, use_whisper) 172 177 processed_seconds += (len(msg) // 8) / SAMPLE_RATE # Approximate calculation 173 178 174 179 # If we reach here, connection closed normally ··· 206 211 parser = argparse.ArgumentParser(description="Live transcription from WebSocket") 207 212 parser.add_argument("--ws-url", required=True, help="WebSocket URL from gemini-mic") 208 213 parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose output") 214 + parser.add_argument("--whisper", action="store_true", help="Enable Whisper transcription (off by default)") 209 215 args = parser.parse_args() 210 216 211 217 api_key = os.getenv("GOOGLE_API_KEY") ··· 216 222 217 223 client = genai.Client(api_key=api_key) 218 224 219 - asyncio.run(live_loop(args.ws_url, client)) 225 + asyncio.run(live_loop(args.ws_url, client, args.whisper)) 220 226 221 227 222 228 if __name__ == "__main__":