personal memory agent
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Cluster layer semantic rename: audio → transcripts

Renames the `audio` source concept to `transcripts` across the entire
cluster layer, muse configs, importers, CLI, and formatters. The cluster
layer now has three cognitive source types: transcripts (primary narrative),
screen (visual context), and agents (prior analytical outputs).

Key changes:
- cluster.py: sources param/return key `audio` → `transcripts`, expanded
file loading to include *_transcript.jsonl, *_transcript.md, imported.md
- Contextual cluster headers via _transcript_header() — stream name informs
the header (e.g. "ChatGPT Conversation" for import.chatgpt)
- All 23 muse agent configs: sources.audio → sources.transcripts
- muse.py defaults: audio → transcripts
- write_segment(): writes conversation_transcript.jsonl
- write_markdown_segments(): accepts filename param — ICS writes
event_transcript.md, Obsidian writes note_transcript.md, Kindle writes
highlights_transcript.md
- formatters.py: new patterns for *_transcript.* files (legacy patterns kept)
- journal_stats.py: audio_sessions → transcript_sessions, etc.
- CLI: --transcripts flag (--audio kept as hidden alias)
- Full backward compat for legacy on-disk files (audio.jsonl,
imported_audio.jsonl, imported.md)

Fixes import visibility bug: markdown imports (ICS, Obsidian, Kindle) are
now loadable by dream agents via the *_transcript.md glob.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

+323 -228
+20 -16
apps/transcripts/call.py
··· 37 37 ) -> None: 38 38 """List transcript coverage ranges for a day.""" 39 39 day = resolve_sol_day(day) 40 - audio_ranges, screen_ranges = cluster_scan(day) 40 + transcript_ranges, screen_ranges = cluster_scan(day) 41 41 42 - typer.echo("Audio:") 43 - if audio_ranges: 44 - for start, end in audio_ranges: 42 + typer.echo("Transcripts:") 43 + if transcript_ranges: 44 + for start, end in transcript_ranges: 45 45 typer.echo(f" {start} - {end}") 46 46 else: 47 47 typer.echo(" (none)") ··· 89 89 None, "--stream", help="Stream name (default: SOL_STREAM env)." 90 90 ), 91 91 full: bool = typer.Option( 92 - False, "--full", help="Include audio, screen, and agents." 92 + False, "--full", help="Include transcripts, screen, and agents." 93 93 ), 94 - raw: bool = typer.Option(False, "--raw", help="Include audio and screen only."), 95 - audio: bool = typer.Option(False, "--audio", help="Include audio transcripts."), 94 + raw: bool = typer.Option(False, "--raw", help="Include transcripts and screen only."), 95 + transcripts: bool = typer.Option(False, "--transcripts", help="Include transcript content."), 96 + audio: bool = typer.Option(False, "--audio", help="Alias for --transcripts.", hidden=True), 96 97 screen: bool = typer.Option(False, "--screen", help="Include screen transcripts."), 97 98 agents: bool = typer.Option(False, "--agents", help="Include agent outputs."), 98 99 max_bytes: int = typer.Option( ··· 103 104 day = resolve_sol_day(day) 104 105 segment = resolve_sol_segment(segment) 105 106 stream = stream or get_sol_stream() 107 + # --audio is an alias for --transcripts 108 + transcripts = transcripts or audio 109 + 106 110 if full and raw: 107 111 typer.echo("Error: Cannot use --full and --raw together.", err=True) 108 112 raise typer.Exit(1) 109 113 110 - if (full or raw) and (audio or screen or agents): 114 + if (full or raw) and (transcripts or screen or agents): 111 115 typer.echo( 112 116 "Error: Cannot mix --full/--raw with individual source flags.", 
err=True 113 117 ) 114 118 raise typer.Exit(1) 115 119 116 120 if full: 117 - sources: dict[str, bool] = {"audio": True, "screen": True, "agents": True} 121 + sources: dict[str, bool] = {"transcripts": True, "screen": True, "agents": True} 118 122 elif raw: 119 - sources = {"audio": True, "screen": True, "agents": False} 120 - elif audio or screen or agents: 121 - sources = {"audio": audio, "screen": screen, "agents": agents} 123 + sources = {"transcripts": True, "screen": True, "agents": False} 124 + elif transcripts or screen or agents: 125 + sources = {"transcripts": transcripts, "screen": screen, "agents": agents} 122 126 else: 123 - sources = {"audio": True, "screen": False, "agents": True} 127 + sources = {"transcripts": True, "screen": False, "agents": True} 124 128 125 129 if segment and (start or length is not None): 126 130 typer.echo("Error: Cannot mix --segment with --start/--length.", err=True) ··· 151 155 152 156 days_with_data = 0 153 157 for day in days: 154 - audio_ranges, screen_ranges = cluster_scan(day) 155 - if audio_ranges or screen_ranges: 158 + transcript_ranges, screen_ranges = cluster_scan(day) 159 + if transcript_ranges or screen_ranges: 156 160 days_with_data += 1 157 - typer.echo(f"{day} audio:{len(audio_ranges)} screen:{len(screen_ranges)}") 161 + typer.echo(f"{day} transcripts:{len(transcript_ranges)} screen:{len(screen_ranges)}") 158 162 159 163 if not days_with_data: 160 164 typer.echo(f"No data for {month}.")
+2 -2
apps/transcripts/tests/test_call.py
··· 14 14 def test_scan_day(self): 15 15 result = runner.invoke(call_app, ["transcripts", "scan", "20240101"]) 16 16 assert result.exit_code == 0 17 - assert "Audio:" in result.output 17 + assert "Transcripts:" in result.output 18 18 assert "Screen:" in result.output 19 19 20 20 def test_scan_empty_day(self): ··· 113 113 monkeypatch.setenv("SOL_DAY", "20240101") 114 114 result = runner.invoke(call_app, ["transcripts", "scan"]) 115 115 assert result.exit_code == 0 116 - assert "Audio:" in result.output 116 + assert "Transcripts:" in result.output 117 117 118 118 def test_read_from_sol_day(self, monkeypatch): 119 119 """read with SOL_DAY env and no arg works."""
+1 -1
muse/activities.md
··· 12 12 "thinking_budget": 4096, 13 13 "max_output_tokens": 2048, 14 14 "instructions": { 15 - "sources": {"audio": false, "screen": false, "agents": false}, 15 + "sources": {"transcripts": false, "screen": false, "agents": false}, 16 16 "facets": false 17 17 } 18 18
+1 -1
muse/activity.md
··· 8 8 "priority": 10, 9 9 "output": "md", 10 10 "instructions": { 11 - "sources": {"audio": true, "screen": true, "agents": false}, 11 + "sources": {"transcripts": true, "screen": true, "agents": false}, 12 12 "facets": true 13 13 } 14 14
+1 -1
muse/activity_state.md
··· 13 13 "thinking_budget": 4096, 14 14 "max_output_tokens": 3072, 15 15 "instructions": { 16 - "sources": {"audio": true, "screen": true, "agents": false}, 16 + "sources": {"transcripts": true, "screen": true, "agents": false}, 17 17 "facets": true 18 18 } 19 19
+1 -1
muse/daily_schedule.md
··· 11 11 "thinking_budget": 4096, 12 12 "max_output_tokens": 512, 13 13 "instructions": { 14 - "sources": {"audio": true, "screen": false, "agents": {"screen": true}}, 14 + "sources": {"transcripts": true, "screen": false, "agents": {"screen": true}}, 15 15 "facets": true 16 16 } 17 17
+1 -1
muse/decisions.md
··· 11 11 "priority": 10, 12 12 "output": "md", 13 13 "instructions": { 14 - "sources": {"audio": true, "screen": false, "agents": {"screen": true}}, 14 + "sources": {"transcripts": true, "screen": false, "agents": {"screen": true}}, 15 15 "facets": true, 16 16 "activity": true 17 17 }
+1 -1
muse/documentation.md
··· 11 11 "disabled": true, 12 12 "output": "md", 13 13 "instructions": { 14 - "sources": {"audio": true, "screen": false, "agents": {"screen": true}}, 14 + "sources": {"transcripts": true, "screen": false, "agents": {"screen": true}}, 15 15 "facets": true 16 16 } 17 17
+1 -1
muse/entities.md
··· 11 11 "max_output_tokens": 1024, 12 12 "output": "md", 13 13 "instructions": { 14 - "sources": {"audio": true, "screen": true, "agents": false}, 14 + "sources": {"transcripts": true, "screen": true, "agents": false}, 15 15 "facets": false 16 16 } 17 17
+1 -1
muse/facets.md
··· 11 11 "max_output_tokens": 512, 12 12 "output": "json", 13 13 "instructions": { 14 - "sources": {"audio": false, "screen": false, "agents": true}, 14 + "sources": {"transcripts": false, "screen": false, "agents": true}, 15 15 "facets": true 16 16 } 17 17
+1 -1
muse/files.md
··· 11 11 "disabled": true, 12 12 "output": "md", 13 13 "instructions": { 14 - "sources": {"audio": true, "screen": false, "agents": {"screen": true}}, 14 + "sources": {"transcripts": true, "screen": false, "agents": {"screen": true}}, 15 15 "facets": true 16 16 } 17 17
+1 -1
muse/flow.md
··· 10 10 "priority": 10, 11 11 "output": "md", 12 12 "instructions": { 13 - "sources": {"audio": true, "screen": false, "agents": {"screen": true}}, 13 + "sources": {"transcripts": true, "screen": false, "agents": {"screen": true}}, 14 14 "facets": true 15 15 } 16 16
+1 -1
muse/followups.md
··· 11 11 "priority": 10, 12 12 "output": "md", 13 13 "instructions": { 14 - "sources": {"audio": true, "screen": false, "agents": {"screen": true}}, 14 + "sources": {"transcripts": true, "screen": false, "agents": {"screen": true}}, 15 15 "facets": true, 16 16 "activity": true 17 17 }
+1 -1
muse/knowledge_graph.md
··· 10 10 "priority": 10, 11 11 "output": "md", 12 12 "instructions": { 13 - "sources": {"audio": true, "screen": false, "agents": {"screen": true}}, 13 + "sources": {"transcripts": true, "screen": false, "agents": {"screen": true}}, 14 14 "facets": true 15 15 } 16 16
+1 -1
muse/media.md
··· 11 11 "disabled": true, 12 12 "output": "md", 13 13 "instructions": { 14 - "sources": {"audio": true, "screen": false, "agents": {"screen": true}}, 14 + "sources": {"transcripts": true, "screen": false, "agents": {"screen": true}}, 15 15 "facets": true 16 16 } 17 17
+1 -1
muse/meetings.md
··· 11 11 "priority": 10, 12 12 "output": "md", 13 13 "instructions": { 14 - "sources": {"audio": true, "screen": false, "agents": {"screen": true}}, 14 + "sources": {"transcripts": true, "screen": false, "agents": {"screen": true}}, 15 15 "facets": true, 16 16 "activity": true 17 17 }
+1 -1
muse/messaging.md
··· 11 11 "priority": 10, 12 12 "output": "md", 13 13 "instructions": { 14 - "sources": {"audio": true, "screen": false, "agents": {"screen": true}}, 14 + "sources": {"transcripts": true, "screen": false, "agents": {"screen": true}}, 15 15 "facets": true, 16 16 "activity": true 17 17 }
+1 -1
muse/observation.md
··· 10 10 "thinking_budget": 2048, 11 11 "max_output_tokens": 2048, 12 12 "instructions": { 13 - "sources": {"audio": true, "screen": true, "agents": false} 13 + "sources": {"transcripts": true, "screen": true, "agents": false} 14 14 } 15 15 } 16 16
+1 -1
muse/opportunities.md
··· 11 11 "disabled": true, 12 12 "output": "md", 13 13 "instructions": { 14 - "sources": {"audio": true, "screen": false, "agents": {"screen": true}}, 14 + "sources": {"transcripts": true, "screen": false, "agents": {"screen": true}}, 15 15 "facets": true 16 16 } 17 17
+1 -1
muse/research.md
··· 11 11 "disabled": true, 12 12 "output": "md", 13 13 "instructions": { 14 - "sources": {"audio": true, "screen": false, "agents": {"screen": true}}, 14 + "sources": {"transcripts": true, "screen": false, "agents": {"screen": true}}, 15 15 "facets": true 16 16 } 17 17
+1 -1
muse/schedule.md
··· 9 9 "priority": 10, 10 10 "output": "md", 11 11 "instructions": { 12 - "sources": {"audio": true, "screen": false, "agents": {"screen": true}} 12 + "sources": {"transcripts": true, "screen": false, "agents": {"screen": true}} 13 13 } 14 14 15 15 }
+1 -1
muse/screen.md
··· 8 8 "priority": 10, 9 9 "output": "md", 10 10 "instructions": { 11 - "sources": {"audio": true, "screen": "required", "agents": false} 11 + "sources": {"transcripts": true, "screen": "required", "agents": false} 12 12 } 13 13 14 14 }
+1 -1
muse/speakers.md
··· 8 8 "output": "json", 9 9 "color": "#e64a19", 10 10 "instructions": { 11 - "sources": {"audio": "required", "screen": true, "agents": false} 11 + "sources": {"transcripts": "required", "screen": true, "agents": false} 12 12 } 13 13 14 14 }
+1 -1
muse/timeline.md
··· 10 10 "priority": 10, 11 11 "output": "md", 12 12 "instructions": { 13 - "sources": {"audio": true, "screen": false, "agents": {"screen": true}} 13 + "sources": {"transcripts": true, "screen": false, "agents": {"screen": true}} 14 14 } 15 15 16 16 }
+1 -1
muse/tools.md
··· 11 11 "disabled": true, 12 12 "output": "md", 13 13 "instructions": { 14 - "sources": {"audio": true, "screen": false, "agents": {"screen": true}}, 14 + "sources": {"transcripts": true, "screen": false, "agents": {"screen": true}}, 15 15 "facets": true 16 16 } 17 17
+34 -34
tests/baselines/api/stats/stats.json
··· 12 12 "facets": false, 13 13 "sources": { 14 14 "agents": false, 15 - "audio": false, 16 - "screen": false 15 + "screen": false, 16 + "transcripts": false 17 17 } 18 18 }, 19 19 "max_output_tokens": 2048, ··· 35 35 "facets": true, 36 36 "sources": { 37 37 "agents": false, 38 - "audio": true, 39 - "screen": true 38 + "screen": true, 39 + "transcripts": true 40 40 } 41 41 }, 42 42 "mtime": 0, ··· 59 59 "facets": true, 60 60 "sources": { 61 61 "agents": false, 62 - "audio": true, 63 - "screen": true 62 + "screen": true, 63 + "transcripts": true 64 64 } 65 65 }, 66 66 "max_output_tokens": 3072, ··· 89 89 "agents": { 90 90 "screen": true 91 91 }, 92 - "audio": true, 93 - "screen": false 92 + "screen": false, 93 + "transcripts": true 94 94 } 95 95 }, 96 96 "max_output_tokens": 512, ··· 120 120 "agents": { 121 121 "screen": true 122 122 }, 123 - "audio": true, 124 - "screen": false 123 + "screen": false, 124 + "transcripts": true 125 125 } 126 126 }, 127 127 "mtime": 0, ··· 144 144 "facets": false, 145 145 "sources": { 146 146 "agents": false, 147 - "audio": true, 148 - "screen": true 147 + "screen": true, 148 + "transcripts": true 149 149 } 150 150 }, 151 151 "max_output_tokens": 1024, ··· 166 166 "facets": true, 167 167 "sources": { 168 168 "agents": true, 169 - "audio": false, 170 - "screen": false 169 + "screen": false, 170 + "transcripts": false 171 171 } 172 172 }, 173 173 "max_output_tokens": 512, ··· 194 194 "agents": { 195 195 "screen": true 196 196 }, 197 - "audio": true, 198 - "screen": false 197 + "screen": false, 198 + "transcripts": true 199 199 } 200 200 }, 201 201 "mtime": 0, ··· 224 224 "agents": { 225 225 "screen": true 226 226 }, 227 - "audio": true, 228 - "screen": false 227 + "screen": false, 228 + "transcripts": true 229 229 } 230 230 }, 231 231 "mtime": 0, ··· 250 250 "agents": { 251 251 "screen": true 252 252 }, 253 - "audio": true, 254 - "screen": false 253 + "screen": false, 254 + "transcripts": true 255 255 } 256 256 }, 257 257 "mtime": 0, 
··· 280 280 "agents": { 281 281 "screen": true 282 282 }, 283 - "audio": true, 284 - "screen": false 283 + "screen": false, 284 + "transcripts": true 285 285 } 286 286 }, 287 287 "mtime": 0, ··· 311 311 "agents": { 312 312 "screen": true 313 313 }, 314 - "audio": true, 315 - "screen": false 314 + "screen": false, 315 + "transcripts": true 316 316 } 317 317 }, 318 318 "mtime": 0, ··· 335 335 "instructions": { 336 336 "sources": { 337 337 "agents": false, 338 - "audio": true, 339 - "screen": true 338 + "screen": true, 339 + "transcripts": true 340 340 } 341 341 }, 342 342 "max_output_tokens": 2048, ··· 362 362 "agents": { 363 363 "screen": true 364 364 }, 365 - "audio": true, 366 - "screen": false 365 + "screen": false, 366 + "transcripts": true 367 367 } 368 368 }, 369 369 "mtime": 0, ··· 381 381 "instructions": { 382 382 "sources": { 383 383 "agents": false, 384 - "audio": true, 385 - "screen": "required" 384 + "screen": "required", 385 + "transcripts": true 386 386 } 387 387 }, 388 388 "mtime": 0, ··· 400 400 "instructions": { 401 401 "sources": { 402 402 "agents": false, 403 - "audio": "required", 404 - "screen": true 403 + "screen": true, 404 + "transcripts": "required" 405 405 } 406 406 }, 407 407 "mtime": 0, ··· 424 424 "agents": { 425 425 "screen": true 426 426 }, 427 - "audio": true, 428 - "screen": false 427 + "screen": false, 428 + "transcripts": true 429 429 } 430 430 }, 431 431 "mtime": 0,
+3 -3
tests/baselines/api/transcripts/segments.json
··· 6 6 "start": "09:00", 7 7 "stream": "default", 8 8 "types": [ 9 - "audio", 9 + "transcripts", 10 10 "screen" 11 11 ] 12 12 }, ··· 16 16 "start": "14:00", 17 17 "stream": "default", 18 18 "types": [ 19 - "audio", 19 + "transcripts", 20 20 "screen" 21 21 ] 22 22 }, ··· 26 26 "start": "18:00", 27 27 "stream": "default", 28 28 "types": [ 29 - "audio", 29 + "transcripts", 30 30 "screen" 31 31 ] 32 32 }
+30 -30
tests/test_cluster.py
··· 9 9 10 10 11 11 def test_cluster(tmp_path, monkeypatch): 12 - """Test cluster() uses audio and agent output summaries (*.md files).""" 12 + """Test cluster() uses transcripts and agent output summaries (*.md files).""" 13 13 monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 14 14 day_dir = day_path("20240101") 15 15 ··· 25 25 "screen summary" 26 26 ) 27 27 result, counts = mod.cluster( 28 - "20240101", sources={"audio": True, "screen": False, "agents": True} 28 + "20240101", sources={"transcripts": True, "screen": False, "agents": True} 29 29 ) 30 - assert counts["audio"] == 1 30 + assert counts["transcripts"] == 1 31 31 assert counts["agents"] == 1 32 - assert "Audio Transcript" in result 32 + assert "### Transcript" in result 33 33 # Now uses insight rendering: "### {stem} summary" 34 34 assert "screen summary" in result 35 35 36 36 37 37 def test_cluster_range(tmp_path, monkeypatch): 38 - """Test cluster_range with audio and agents sources.""" 38 + """Test cluster_range with transcripts and agents sources.""" 39 39 monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 40 40 day_dir = day_path("20240101") 41 41 ··· 55 55 "20240101", 56 56 "120000", 57 57 "120100", 58 - sources={"audio": True, "screen": False, "agents": True}, 58 + sources={"transcripts": True, "screen": False, "agents": True}, 59 59 ) 60 60 # Check that the function works and includes expected sections 61 - assert "Audio Transcript" in md 61 + assert "### Transcript" in md 62 62 # Now uses insight rendering: "### {stem} summary" 63 63 assert "screen summary" in md 64 64 assert "screen summary content" in md ··· 134 134 assert segments[0]["key"] == "090000_300" 135 135 assert segments[0]["start"] == "09:00" 136 136 assert segments[0]["end"] == "09:05" 137 - assert segments[0]["types"] == ["audio"] 137 + assert segments[0]["types"] == ["transcripts"] 138 138 139 - # Check second segment (both audio and screen) 139 + # Check second segment (both transcripts and screen) 140 140 assert 
segments[1]["key"] == "100000_600" 141 141 assert segments[1]["start"] == "10:00" 142 142 assert segments[1]["end"] == "10:10" 143 - assert "audio" in segments[1]["types"] 143 + assert "transcripts" in segments[1]["types"] 144 144 assert "screen" in segments[1]["types"] 145 145 146 146 # Check third segment (screen only) ··· 176 176 result, counts = mod.cluster_period( 177 177 "20240101", 178 178 "100000_300", 179 - sources={"audio": True, "screen": True, "agents": False}, 179 + sources={"transcripts": True, "screen": True, "agents": False}, 180 180 ) 181 181 182 - # Should have both audio and screen entries 183 - assert counts["audio"] == 1 182 + # Should have both transcript and screen entries 183 + assert counts["transcripts"] == 1 184 184 assert counts["screen"] == 1 185 - assert "Audio Transcript" in result 185 + assert "### Transcript" in result 186 186 # Should use raw screen format header 187 187 assert "Screen Activity" in result 188 188 # Raw screen content should be present ··· 218 218 "20240101", 219 219 "100000", 220 220 "100500", 221 - sources={"audio": True, "screen": False, "agents": True}, 221 + sources={"transcripts": True, "screen": False, "agents": True}, 222 222 ) 223 223 224 - assert "Audio Transcript" in result 224 + assert "### Transcript" in result 225 225 # Should include both .md files as agent outputs 226 226 assert "### screen summary" in result 227 227 assert "Screen activity summary" in result ··· 253 253 "20240101", 254 254 "100000", 255 255 "100500", 256 - sources={"audio": False, "screen": True, "agents": False}, 256 + sources={"transcripts": False, "screen": True, "agents": False}, 257 257 ) 258 258 259 259 assert "Screen Activity" in result ··· 289 289 "20240101", 290 290 "100000", 291 291 "100500", 292 - sources={"audio": False, "screen": True, "agents": False}, 292 + sources={"transcripts": False, "screen": True, "agents": False}, 293 293 ) 294 294 295 295 # Should include content from both screen files ··· 367 367 result, 
counts = mod.cluster_span( 368 368 "20240101", 369 369 ["090000_300", "110000_300"], 370 - sources={"audio": True, "screen": False, "agents": False}, 370 + sources={"transcripts": True, "screen": False, "agents": False}, 371 371 ) 372 372 373 - # Should have 2 audio entries (one per segment) 374 - assert counts["audio"] == 2 373 + # Should have 2 transcript entries (one per segment) 374 + assert counts["transcripts"] == 2 375 375 assert counts["screen"] == 0 376 376 assert "morning segment" in result 377 377 assert "late morning segment" in result ··· 398 398 mod.cluster_span( 399 399 "20240101", 400 400 ["090000_300", "100000_300"], 401 - sources={"audio": True, "screen": False, "agents": False}, 401 + sources={"transcripts": True, "screen": False, "agents": False}, 402 402 ) 403 403 404 404 assert "100000_300" in str(exc_info.value) ··· 424 424 # Test filtering to only include entities 425 425 result, counts = mod.cluster( 426 426 "20240101", 427 - sources={"audio": True, "screen": False, "agents": {"entities": True}}, 427 + sources={"transcripts": True, "screen": False, "agents": {"entities": True}}, 428 428 ) 429 429 430 - assert counts["audio"] == 1 430 + assert counts["transcripts"] == 1 431 431 assert counts["agents"] == 1 # Only entities should be counted 432 432 assert "Entity extraction results" in result 433 433 assert "Meeting summary results" not in result ··· 454 454 result, counts = mod.cluster( 455 455 "20240101", 456 456 sources={ 457 - "audio": True, 457 + "transcripts": True, 458 458 "screen": False, 459 459 "agents": {"entities": True, "meetings": "required", "flow": False}, 460 460 }, 461 461 ) 462 462 463 - assert counts["audio"] == 1 463 + assert counts["transcripts"] == 1 464 464 assert counts["agents"] == 2 # entities + meetings 465 465 assert "Entity extraction results" in result 466 466 assert "Meeting summary results" in result ··· 487 487 result, counts = mod.cluster( 488 488 "20240101", 489 489 sources={ 490 - "audio": True, 490 + 
"transcripts": True, 491 491 "screen": False, 492 492 "agents": {"entities": False, "todos:review": True}, 493 493 }, 494 494 ) 495 495 496 - assert counts["audio"] == 1 496 + assert counts["transcripts"] == 1 497 497 assert counts["agents"] == 1 # Only todos:review 498 498 assert "System entity results" not in result 499 499 assert "Todos review results" in result ··· 515 515 # Empty dict should mean no agents 516 516 result, counts = mod.cluster( 517 517 "20240101", 518 - sources={"audio": True, "screen": False, "agents": {}}, 518 + sources={"transcripts": True, "screen": False, "agents": {}}, 519 519 ) 520 520 521 - assert counts["audio"] == 1 521 + assert counts["transcripts"] == 1 522 522 assert counts["agents"] == 0 523 523 assert "Entity extraction results" not in result 524 524
+6 -6
tests/test_cluster_full.py
··· 29 29 copy_day(tmp_path) 30 30 monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 31 31 md, counts = mod.cluster( 32 - "20240101", sources={"audio": True, "screen": False, "agents": True} 32 + "20240101", sources={"transcripts": True, "screen": False, "agents": True} 33 33 ) 34 - # Audio entries come from 2 segments on 20240101 (default + import.apple) 35 - assert counts["audio"] == 2 34 + # Transcript entries come from 2 segments on 20240101 (default + import.apple) 35 + assert counts["transcripts"] == 2 36 36 assert counts["agents"] == 2 # audio.md + screen.md 37 - assert "Audio Transcript" in md 37 + assert "### Transcript" in md 38 38 # Now uses insight format: "### {stem} summary" 39 39 assert "### screen summary" in md 40 40 assert "### audio summary" in md ··· 45 45 copy_day(tmp_path) 46 46 monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 47 47 out, _counts = mod.cluster( 48 - "20240101", sources={"audio": True, "screen": False, "agents": True} 48 + "20240101", sources={"transcripts": True, "screen": False, "agents": True} 49 49 ) 50 50 # Now uses insight format: "### {stem} summary" 51 51 assert "### screen summary" in out ··· 59 59 "20240101", 60 60 "123456", 61 61 "123556", 62 - sources={"audio": True, "screen": True, "agents": False}, 62 + sources={"transcripts": True, "screen": True, "agents": False}, 63 63 ) 64 64 # Range mode with screen=True uses raw screen data. 65 65 assert "### Screen Activity" in out
+1 -1
tests/test_gemini_importer.py
··· 182 182 assert result.segments is not None 183 183 assert len(result.segments) >= 1 184 184 assert any( 185 - Path(p).name == "imported_audio.jsonl" for p in result.files_created 185 + Path(p).name == "conversation_transcript.jsonl" for p in result.files_created 186 186 ) 187 187 188 188 first_path = Path(result.files_created[0])
+1 -1
tests/test_generators.py
··· 134 134 instructions = speakers.get("instructions", {}) 135 135 sources = instructions.get("sources", {}) 136 136 137 - assert sources.get("audio") == "required", "speakers should require audio" 137 + assert sources.get("transcripts") == "required", "speakers should require transcripts" 138 138 assert sources.get("screen") is True, "speakers should include screen" 139 139 140 140
+16 -16
tests/test_importer.py
··· 124 124 # Duration: seg1 starts at 12:00:00, seg2 at 12:05:00 = 300s duration 125 125 # Last segment (seg2) defaults to 5s since no audio duration 126 126 # Segments are under stream directory (import.text for .txt files) 127 - f1 = day_dir / "import.text" / "120000_300" / "imported_audio.jsonl" 128 - f2 = day_dir / "import.text" / "120500_5" / "imported_audio.jsonl" 127 + f1 = day_dir / "import.text" / "120000_300" / "conversation_transcript.jsonl" 128 + f2 = day_dir / "import.text" / "120500_5" / "conversation_transcript.jsonl" 129 129 130 130 # Read JSONL format: first line is metadata, subsequent lines are entries 131 131 lines1 = f1.read_text().strip().split("\n") ··· 222 222 223 223 day_dir = day_path("20251205") 224 224 # Single segment, last segment defaults to 5s 225 - f1 = day_dir / "import.text" / "163000_5" / "imported_audio.jsonl" 225 + f1 = day_dir / "import.text" / "163000_5" / "conversation_transcript.jsonl" 226 226 assert f1.exists() 227 227 228 228 lines = f1.read_text().strip().split("\n") ··· 283 283 written = Path(json_path) 284 284 assert ( 285 285 written 286 - == tmp_path / "20240101" / "import.text" / "120000_300" / "imported_audio.jsonl" 286 + == tmp_path / "20240101" / "import.text" / "120000_300" / "conversation_transcript.jsonl" 287 287 ) 288 288 assert written.exists() 289 289 ··· 420 420 assert len(result.files_created) == 3 421 421 422 422 first_segment = ( 423 - day_path("20260115") / "import.chatgpt" / "120000_300" / "imported_audio.jsonl" 423 + day_path("20260115") / "import.chatgpt" / "120000_300" / "conversation_transcript.jsonl" 424 424 ) 425 425 second_segment = ( 426 - day_path("20260115") / "import.chatgpt" / "120501_300" / "imported_audio.jsonl" 426 + day_path("20260115") / "import.chatgpt" / "120501_300" / "conversation_transcript.jsonl" 427 427 ) 428 428 third_segment = ( 429 - day_path("20260116") / "import.chatgpt" / "000000_300" / "imported_audio.jsonl" 429 + day_path("20260116") / "import.chatgpt" / "000000_300" / 
"conversation_transcript.jsonl" 430 430 ) 431 431 432 432 assert first_segment.exists() ··· 551 551 assert len(result.files_created) == 3 552 552 553 553 first_segment = ( 554 - day_path("20260115") / "import.claude" / "120000_300" / "imported_audio.jsonl" 554 + day_path("20260115") / "import.claude" / "120000_300" / "conversation_transcript.jsonl" 555 555 ) 556 556 second_segment = ( 557 - day_path("20260115") / "import.claude" / "120501_300" / "imported_audio.jsonl" 557 + day_path("20260115") / "import.claude" / "120501_300" / "conversation_transcript.jsonl" 558 558 ) 559 559 third_segment = ( 560 - day_path("20260116") / "import.claude" / "000000_300" / "imported_audio.jsonl" 560 + day_path("20260116") / "import.claude" / "000000_300" / "conversation_transcript.jsonl" 561 561 ) 562 562 563 563 assert first_segment.exists() ··· 621 621 {"start": "12:00:30", "speaker": "Bob", "text": "Hi there"}, 622 622 ] 623 623 624 - # Stream-based path: day/stream/segment/imported_audio.jsonl 624 + # Stream-based path: day/stream/segment/conversation_transcript.jsonl 625 625 context = { 626 626 "file_path": Path( 627 - "/journal/20240101/import.text/120000_300/imported_audio.jsonl" 627 + "/journal/20240101/import.text/120000_300/conversation_transcript.jsonl" 628 628 ) 629 629 } 630 630 chunks, meta = format_audio(entries, context) ··· 1251 1251 1252 1252 result = mod.ICSImporter().process(ics_path, tmp_path, facet="work") 1253 1253 1254 - first_md = day_path("20260301") / "import.ics" / "120000_300" / "imported.md" 1255 - second_md = day_path("20260302") / "import.ics" / "090000_300" / "imported.md" 1254 + first_md = day_path("20260301") / "import.ics" / "120000_300" / "event_transcript.md" 1255 + second_md = day_path("20260302") / "import.ics" / "090000_300" / "event_transcript.md" 1256 1256 1257 1257 assert result.entries_written == 4 1258 1258 assert result.errors == [] ··· 1480 1480 assert first_key == "100000_300" 1481 1481 assert second_key == "101000_300" 1482 1482 
1483 - first_md = day_path("20260315") / "import.obsidian" / "100000_300" / "imported.md" 1484 - second_md = day_path("20260315") / "import.obsidian" / "101000_300" / "imported.md" 1483 + first_md = day_path("20260315") / "import.obsidian" / "100000_300" / "note_transcript.md" 1484 + second_md = day_path("20260315") / "import.obsidian" / "101000_300" / "note_transcript.md" 1485 1485 assert first_md.exists() 1486 1486 assert second_md.exists() 1487 1487
+9 -9
tests/test_journal_stats.py
··· 53 53 js = stats_mod.JournalStats() 54 54 day_data = js.scan_day("20240101", str(day)) 55 55 js._apply_day_stats("20240101", day_data) 56 - assert js.days["20240101"]["audio_sessions"] == 1 57 - assert js.days["20240101"]["audio_segments"] == 2 56 + assert js.days["20240101"]["transcript_sessions"] == 1 57 + assert js.days["20240101"]["transcript_segments"] == 2 58 58 assert ( 59 59 js.days["20240101"]["pending_segments"] == 1 60 60 ) # Both files belong to same segment ··· 171 171 data = js.to_dict() 172 172 assert "token_usage_by_day" in data 173 173 assert "token_totals_by_model" in data 174 - assert "total_audio_duration" in data 174 + assert "total_transcript_duration" in data 175 175 assert "total_screen_duration" in data 176 176 assert ( 177 177 data["token_usage_by_day"]["20240101"]["gemini-2.5-flash"]["total_tokens"] ··· 200 200 # First scan - should create cache 201 201 js1 = stats_mod.JournalStats() 202 202 js1.scan(str(journal), verbose=False, use_cache=True) 203 - assert js1.days["20240101"]["audio_sessions"] == 1 203 + assert js1.days["20240101"]["transcript_sessions"] == 1 204 204 assert (day / "stats.json").exists() 205 205 206 206 # Load cache and verify contents 207 207 with open(day / "stats.json") as f: 208 208 cached = json.load(f) 209 - assert cached["stats"]["audio_sessions"] == 1 210 - assert cached["stats"]["audio_segments"] == 2 209 + assert cached["stats"]["transcript_sessions"] == 1 210 + assert cached["stats"]["transcript_segments"] == 2 211 211 212 212 # Second scan - should use cache 213 213 js2 = stats_mod.JournalStats() 214 214 js2.scan(str(journal), verbose=False, use_cache=True) 215 - assert js2.days["20240101"]["audio_sessions"] == 1 216 - assert js2.days["20240101"]["audio_segments"] == 2 215 + assert js2.days["20240101"]["transcript_sessions"] == 1 216 + assert js2.days["20240101"]["transcript_segments"] == 2 217 217 218 218 # Third scan with --no-cache - should re-scan 219 219 js3 = stats_mod.JournalStats() 220 220 
js3.scan(str(journal), verbose=False, use_cache=False) 221 - assert js3.days["20240101"]["audio_sessions"] == 1 221 + assert js3.days["20240101"]["transcript_sessions"] == 1 222 222 223 223 224 224 def test_token_usage_new_format(tmp_path, monkeypatch):
+1 -1
tests/test_kindle_importer.py
··· 141 141 142 142 md_path = Path(result.files_created[0]) 143 143 assert md_path.exists() 144 - assert md_path.name == "imported.md" 144 + assert md_path.name == "highlights_transcript.md" 145 145 md = md_path.read_text() 146 146 assert "Test Book" in md 147 147 assert "> This is a highlighted passage." in md
+9 -9
tests/test_muse.py
··· 21 21 22 22 def test_merge_instructions_config_empty_overrides(): 23 23 """Test that empty overrides returns defaults copy.""" 24 - defaults = {"system": "journal", "facets": True, "sources": {"audio": False}} 24 + defaults = {"system": "journal", "facets": True, "sources": {"transcripts": False}} 25 25 result = _merge_instructions_config(defaults, None) 26 26 assert result == defaults 27 27 assert result is not defaults # Should be a copy ··· 29 29 30 30 def test_merge_instructions_config_with_overrides(): 31 31 """Test that overrides are merged correctly.""" 32 - defaults = {"system": "journal", "facets": True, "sources": {"audio": False}} 32 + defaults = {"system": "journal", "facets": True, "sources": {"transcripts": False}} 33 33 overrides = {"system": "custom", "facets": False} 34 34 result = _merge_instructions_config(defaults, overrides) 35 35 assert result["system"] == "custom" 36 36 assert result["facets"] is False 37 - assert result["sources"] == {"audio": False} # Preserved 37 + assert result["sources"] == {"transcripts": False} # Preserved 38 38 39 39 40 40 def test_merge_instructions_config_sources_merge(): 41 41 """Test that sources dict is merged, not replaced.""" 42 - defaults = {"system": None, "sources": {"audio": False, "screen": False}} 43 - overrides = {"sources": {"audio": True}} 42 + defaults = {"system": None, "sources": {"transcripts": False, "screen": False}} 43 + overrides = {"sources": {"transcripts": True}} 44 44 result = _merge_instructions_config(defaults, overrides) 45 - assert result["sources"]["audio"] is True # Overridden 45 + assert result["sources"]["transcripts"] is True # Overridden 46 46 assert result["sources"]["screen"] is False # Preserved from defaults 47 47 48 48 ··· 235 235 result = compose_instructions() 236 236 237 237 assert "sources" in result 238 - assert result["sources"]["audio"] is False 238 + assert result["sources"]["transcripts"] is False 239 239 assert result["sources"]["screen"] is False 240 240 assert 
result["sources"]["agents"] is False 241 241 ··· 251 251 252 252 result = compose_instructions( 253 253 config_overrides={ 254 - "sources": {"audio": True, "agents": True}, 254 + "sources": {"transcripts": True, "agents": True}, 255 255 }, 256 256 ) 257 257 258 - assert result["sources"]["audio"] is True # Overridden 258 + assert result["sources"]["transcripts"] is True # Overridden 259 259 assert result["sources"]["screen"] is False # Default preserved 260 260 assert result["sources"]["agents"] is True # Overridden 261 261
+114 -53
think/cluster.py
··· 71 71 return False 72 72 73 73 74 + def _transcript_header(stream: str | None, filename: str) -> str: 75 + """Generate contextual header for a transcript entry. 76 + 77 + Uses the stream name to produce a descriptive header instead of a 78 + hardcoded "Audio Transcript". The filename argument is currently unused. 79 + """ 80 + _IMPORT_LABELS = { 81 + "import.chatgpt": "ChatGPT Conversation", 82 + "import.claude": "Claude Conversation", 83 + "import.gemini": "Gemini Conversation", 84 + "import.ics": "Calendar Event", 85 + "import.obsidian": "Note", 86 + "import.kindle": "Highlights", 87 + } 88 + if stream and stream in _IMPORT_LABELS: 89 + return _IMPORT_LABELS[stream] 90 + return "Transcript" 91 + 92 + 74 93 def _process_segment( 75 94 segment_path: Path, 76 95 date_str: str, 77 - audio: bool, 96 + transcripts: bool, 78 97 screen: bool, 79 98 agents: bool | dict[str, bool | str], 80 99 ) -> list[dict[str, Any]]: ··· 83 102 Args: 84 103 segment_path: Path to segment directory 85 104 date_str: Date in YYYYMMDD format 86 - audio: Whether to load audio transcripts 105 + transcripts: Whether to load transcript content (JSONL and markdown) 87 106 screen: Whether to load raw screen data from *screen.jsonl files 88 107 agents: Whether to load agent output summaries from *.md files.
89 108 Can be bool (all/none) or dict for selective filtering ··· 110 129 segment_start = datetime.combine(day_date, start_time) 111 130 segment_end = datetime.combine(day_date, end_time) 112 131 113 - # Process audio transcripts 114 - if audio: 115 - audio_files = [f for f in segment_path.glob("*audio.jsonl") if f.is_file()] 116 - for audio_file in audio_files: 132 + # Process transcript content (JSONL: legacy *audio.jsonl + new *_transcript.jsonl) 133 + if transcripts: 134 + jsonl_files = set() 135 + for pattern in ("*audio.jsonl", "*_transcript.jsonl"): 136 + jsonl_files.update(f for f in segment_path.glob(pattern) if f.is_file()) 137 + for jsonl_file in sorted(jsonl_files): 117 138 from observe.hear import load_transcript 118 139 119 140 metadata, transcript_entries, formatted_text = load_transcript( 120 - str(audio_file) 141 + str(jsonl_file) 121 142 ) 122 143 if transcript_entries is None: 123 144 print( 124 - f"Warning: Could not load transcript {audio_file.name}: {metadata.get('error')}", 145 + f"Warning: Could not load transcript {jsonl_file.name}: {metadata.get('error')}", 125 146 file=sys.stderr, 126 147 ) 127 148 continue ··· 132 153 "segment_key": segment_key, 133 154 "segment_start": segment_start, 134 155 "segment_end": segment_end, 135 - "prefix": "audio", 156 + "prefix": "transcript", 136 157 "content": formatted_text, 137 - "name": f"{segment_path.name}/{audio_file.name}", 158 + "name": f"{segment_path.name}/{jsonl_file.name}", 138 159 "stream": stream, 139 160 } 140 161 ) 141 162 163 + # Process markdown transcript files (*_transcript.md + legacy imported.md) 164 + md_files = set() 165 + for pattern in ("*_transcript.md", "imported.md"): 166 + md_files.update(f for f in segment_path.glob(pattern) if f.is_file()) 167 + for md_file in sorted(md_files): 168 + try: 169 + content = md_file.read_text() 170 + if content.strip(): 171 + entries.append( 172 + { 173 + "timestamp": segment_start, 174 + "segment_key": segment_key, 175 + "segment_start": 
segment_start, 176 + "segment_end": segment_end, 177 + "prefix": "transcript", 178 + "content": content, 179 + "name": f"{segment_path.name}/{md_file.name}", 180 + "stream": stream, 181 + } 182 + ) 183 + except Exception as e: # pragma: no cover - warning only 184 + print( 185 + f"Warning: Could not read transcript {md_file.name}: {e}", 186 + file=sys.stderr, 187 + ) 188 + 142 189 # Process raw screen data from screen.jsonl and *_screen.jsonl 143 190 if screen: 144 191 screen_files = list(segment_path.glob("*screen.jsonl")) ··· 207 254 208 255 209 256 def _load_entries( 210 - day_dir: str, audio: bool, screen: bool, agents: bool | dict[str, bool | str] 257 + day_dir: str, transcripts: bool, screen: bool, agents: bool | dict[str, bool | str] 211 258 ) -> list[dict[str, Any]]: 212 259 """Load all transcript entries from a day directory.""" 213 260 from think.utils import segment_parse ··· 222 269 start_time, _ = segment_parse(seg_path.name) 223 270 if not start_time: 224 271 continue 225 - entries.extend(_process_segment(seg_path, date_str, audio, screen, agents)) 272 + entries.extend(_process_segment(seg_path, date_str, transcripts, screen, agents)) 226 273 227 274 entries.sort(key=lambda e: e["timestamp"]) 228 275 return entries ··· 245 292 """Count entries by source type (prefix). 
246 293 247 294 Maps the internal prefix names to source config names: 248 - - "audio" -> "audio" 295 + - "transcript" -> "transcripts" 249 296 - "screen" -> "screen" 250 297 - "agent_output" -> "agents" 251 298 252 299 Returns: 253 - Dict with counts for each source type, e.g., {"audio": 2, "screen": 1, "agents": 0} 300 + Dict with counts for each source type, e.g., {"transcripts": 2, "screen": 1, "agents": 0} 254 301 """ 255 302 # Map internal prefix to source config name 256 303 prefix_to_source = { 257 - "audio": "audio", 304 + "transcript": "transcripts", 258 305 "screen": "screen", 259 306 "agent_output": "agents", 260 307 } ··· 263 310 264 311 # Ensure all standard sources are present (even if 0) 265 312 return { 266 - "audio": counts.get("audio", 0), 313 + "transcripts": counts.get("transcripts", 0), 267 314 "screen": counts.get("screen", 0), 268 315 "agents": counts.get("agents", 0), 269 316 } ··· 292 339 lines.append("") 293 340 294 341 for entry in segment_entries: 295 - if entry["prefix"] == "audio": 296 - lines.append("### Audio Transcript") 342 + if entry["prefix"] == "transcript": 343 + header = _transcript_header(entry.get("stream"), entry.get("name", "")) 344 + lines.append(f"### {header}") 297 345 lines.append(entry["content"].strip()) 298 346 lines.append("") 299 347 elif entry["prefix"] == "screen": ··· 342 390 343 391 344 392 def cluster_scan(day: str) -> tuple[list[tuple[str, str]], list[tuple[str, str]]]: 345 - """Return 15-minute ranges with audio and screen transcripts for ``day``. 393 + """Return 15-minute ranges with transcript and screen content for ``day``. 346 394 347 395 Args: 348 396 day: Day folder in ``YYYYMMDD`` format. 349 397 350 398 Returns: 351 - Two lists containing ``(start, end)`` pairs (``HH:MM``) for audio and 352 - screen transcripts respectively. 399 + Two lists containing ``(start, end)`` pairs (``HH:MM``) for transcript and 400 + screen content respectively. 
353 401 """ 354 402 355 403 day_dir = str(day_path(day)) ··· 358 406 return [], [] 359 407 360 408 date_str = _date_str(day_dir) 361 - audio_slots: set[datetime] = set() 409 + transcript_slots: set[datetime] = set() 362 410 screen_slots: set[datetime] = set() 363 411 day_path_obj = Path(day_dir) 364 412 365 - # Check timestamp subdirectories for transcript files 413 + # Check timestamp subdirectories for content files 366 414 from think.utils import iter_segments, segment_parse 367 415 368 416 for _stream, _seg_key, seg_path in iter_segments(day_path_obj): ··· 375 423 minute=dt.minute - (dt.minute % 15), second=0, microsecond=0 376 424 ) 377 425 378 - # Check for audio transcripts 379 - if (seg_path / "audio.jsonl").exists() or any( 380 - seg_path.glob("*_audio.jsonl") 426 + # Check for transcript content (legacy audio + new transcript convention) 427 + if ( 428 + (seg_path / "audio.jsonl").exists() 429 + or any(seg_path.glob("*_audio.jsonl")) 430 + or any(seg_path.glob("*_transcript.jsonl")) 431 + or any(seg_path.glob("*_transcript.md")) 432 + or (seg_path / "imported.md").exists() 381 433 ): 382 - audio_slots.add(slot) 434 + transcript_slots.add(slot) 383 435 384 436 # Check for screen content 385 437 if (seg_path / "screen.jsonl").exists() or any( ··· 387 439 ): 388 440 screen_slots.add(slot) 389 441 390 - audio_ranges = _slots_to_ranges(sorted(audio_slots)) 442 + transcript_ranges = _slots_to_ranges(sorted(transcript_slots)) 391 443 screen_ranges = _slots_to_ranges(sorted(screen_slots)) 392 - return audio_ranges, screen_ranges 444 + return transcript_ranges, screen_ranges 393 445 394 446 395 447 def cluster_segments(day: str) -> list[dict[str, Any]]: ··· 406 458 - key: segment directory name (HHMMSS_LEN format) 407 459 - start: start time as HH:MM 408 460 - end: end time as HH:MM 409 - - types: list of content types present ("audio", "screen", or both) 461 + - types: list of content types present ("transcripts", "screen", or both) 410 462 """ 411 463 from 
think.utils import segment_parse 412 464 ··· 425 477 continue 426 478 427 479 types = [] 428 - # Check for audio transcripts 429 - if (seg_path / "audio.jsonl").exists() or any(seg_path.glob("*_audio.jsonl")): 430 - types.append("audio") 480 + # Check for transcript content (legacy audio + new transcript convention) 481 + if ( 482 + (seg_path / "audio.jsonl").exists() 483 + or any(seg_path.glob("*_audio.jsonl")) 484 + or any(seg_path.glob("*_transcript.jsonl")) 485 + or any(seg_path.glob("*_transcript.md")) 486 + or (seg_path / "imported.md").exists() 487 + ): 488 + types.append("transcripts") 431 489 432 490 # Check for screen content 433 491 if (seg_path / "screen.jsonl").exists() or any(seg_path.glob("*_screen.jsonl")): ··· 488 546 489 547 Args: 490 548 day: Day in YYYYMMDD format 491 - sources: Dict with keys "audio", "screen", "agents". 549 + sources: Dict with keys "transcripts", "screen", "agents". 492 550 Values can be bool, "required" string, or dict (for agents). 493 551 The "agents" source can be a dict for selective filtering, 494 552 e.g., {"entities": True, "meetings": "required"}. 495 553 496 554 Returns: 497 555 Tuple of (markdown, source_counts) where source_counts is a dict 498 - with keys "audio", "screen", "agents" mapping to entry counts. 556 + with keys "transcripts", "screen", "agents" mapping to entry counts. 
499 557 """ 500 - empty_counts = {"audio": 0, "screen": 0, "agents": 0} 558 + empty_counts = {"transcripts": 0, "screen": 0, "agents": 0} 501 559 502 560 day_dir = str(day_path(day)) 503 561 # day_path now ensures dir exists, but check anyway for safety ··· 506 564 507 565 entries = _load_entries( 508 566 day_dir, 509 - audio=sources.get("audio", False), 567 + transcripts=sources.get("transcripts", False), 510 568 screen=sources.get("screen", False), 511 569 agents=sources.get("agents", False), 512 570 ) 513 571 if not entries: 514 572 return ( 515 - f"No audio or screen files found for date {day} in {day_dir}.", 573 + f"No transcript or screen files found for date {day} in {day_dir}.", 516 574 empty_counts, 517 575 ) 518 576 ··· 532 590 Args: 533 591 day: Day in YYYYMMDD format 534 592 segment: Segment key in HHMMSS_LEN format (e.g., "163045_300") 535 - sources: Dict with keys "audio", "screen", "agents". 593 + sources: Dict with keys "transcripts", "screen", "agents". 536 594 Values can be bool, "required" string, or dict (for agents). 537 595 stream: Stream name. If None, searches all streams for the segment. 538 596 539 597 Returns: 540 598 Tuple of (markdown, source_counts) where source_counts is a dict 541 - with keys "audio", "screen", "agents" mapping to entry counts. 599 + with keys "transcripts", "screen", "agents" mapping to entry counts. 
542 600 """ 543 - empty_counts = {"audio": 0, "screen": 0, "agents": 0} 601 + empty_counts = {"transcripts": 0, "screen": 0, "agents": 0} 544 602 545 603 segment_dir = _find_segment_dir(day, segment, stream) 546 604 ··· 549 607 550 608 entries = _load_entries_from_segment( 551 609 str(segment_dir), 552 - audio=sources.get("audio", False), 610 + transcripts=sources.get("transcripts", False), 553 611 screen=sources.get("screen", False), 554 612 agents=sources.get("agents", False), 555 613 ) 556 614 if not entries: 557 - return f"No audio or screen files found for segment {segment}", empty_counts 615 + return f"No transcript or screen files found for segment {segment}", empty_counts 558 616 559 617 groups = _group_entries(entries) 560 618 markdown = _groups_to_markdown(groups) ··· 562 620 563 621 564 622 def _load_entries_from_segment( 565 - segment_dir: str, audio: bool, screen: bool, agents: bool | dict[str, bool | str] 623 + segment_dir: str, 624 + transcripts: bool, 625 + screen: bool, 626 + agents: bool | dict[str, bool | str], 566 627 ) -> list[dict[str, Any]]: 567 628 """Load entries from a single segment directory. 
568 629 569 630 Args: 570 631 segment_dir: Path to segment directory (e.g., /path/to/20251109/163045_300) 571 - audio: Whether to load audio transcripts 632 + transcripts: Whether to load transcript content (JSONL and markdown) 572 633 screen: Whether to load raw screen data from *screen.jsonl files 573 634 agents: Whether to load agent output summaries from *.md files 574 635 ··· 578 639 segment_path_obj = Path(segment_dir) 579 640 # Parent is stream dir; grandparent is day dir 580 641 date_str = _date_str(str(segment_path_obj.parent.parent)) 581 - entries = _process_segment(segment_path_obj, date_str, audio, screen, agents) 642 + entries = _process_segment(segment_path_obj, date_str, transcripts, screen, agents) 582 643 entries.sort(key=lambda e: e["timestamp"]) 583 644 return entries 584 645 ··· 599 660 Args: 600 661 day: Day in YYYYMMDD format 601 662 span: List of segment keys in HHMMSS_LEN format (e.g., ["163045_300", "170000_600"]) 602 - sources: Dict with keys "audio", "screen", "agents". 663 + sources: Dict with keys "transcripts", "screen", "agents". 603 664 Values can be bool, "required" string, or dict (for agents). 604 665 stream: Stream name. If None, searches all streams for each segment. 605 666 606 667 Returns: 607 668 Tuple of (markdown, source_counts) where source_counts is a dict 608 - with keys "audio", "screen", "agents" mapping to entry counts. 669 + with keys "transcripts", "screen", "agents" mapping to entry counts. 
609 670 610 671 Raises: 611 672 ValueError: If any segment directories are missing 612 673 """ 613 - empty_counts = {"audio": 0, "screen": 0, "agents": 0} 674 + empty_counts = {"transcripts": 0, "screen": 0, "agents": 0} 614 675 615 676 # Validate all segments in span exist upfront (fail fast) 616 677 missing = [] ··· 630 691 for seg_dir in found_dirs: 631 692 segment_entries = _load_entries_from_segment( 632 693 str(seg_dir), 633 - audio=sources.get("audio", False), 694 + transcripts=sources.get("transcripts", False), 634 695 screen=sources.get("screen", False), 635 696 agents=sources.get("agents", False), 636 697 ) ··· 638 699 639 700 if not entries: 640 701 return ( 641 - f"No audio or screen files found in span: {', '.join(span)}", 702 + f"No transcript or screen files found in span: {', '.join(span)}", 642 703 empty_counts, 643 704 ) 644 705 ··· 674 735 day: Day in YYYYMMDD format 675 736 start: Start time in HHMMSS format 676 737 end: End time in HHMMSS format 677 - sources: Dict with keys "audio", "screen", "agents". 738 + sources: Dict with keys "transcripts", "screen", "agents". 678 739 Values can be bool, "required" string, or dict (for agents). 679 740 """ 680 741 day_dir = str(day_path(day)) ··· 684 745 685 746 entries = _load_entries( 686 747 day_dir, 687 - audio=sources.get("audio", False), 748 + transcripts=sources.get("transcripts", False), 688 749 screen=sources.get("screen", False), 689 750 agents=sources.get("agents", False), 690 751 )
+20 -1
think/formatters.py
··· 146 146 "format_imported", 147 147 True, 148 148 ), 149 + # Markdown transcript imports (new convention + legacy) 150 + "*/import.*/*/*_transcript.md": ("think.markdown", "format_markdown", True), 149 151 "*/import.*/*/imported.md": ("think.markdown", "format_markdown", True), 150 - # AI chat imports — dedicated formatter (must precede generic *_audio.jsonl) 152 + # AI chat imports — dedicated formatter (new + legacy filenames) 153 + "*/import.chatgpt/*/conversation_transcript.jsonl": ( 154 + "think.importers.formatting", 155 + "format_ai_chat", 156 + True, 157 + ), 158 + "*/import.claude/*/conversation_transcript.jsonl": ( 159 + "think.importers.formatting", 160 + "format_ai_chat", 161 + True, 162 + ), 163 + "*/import.gemini/*/conversation_transcript.jsonl": ( 164 + "think.importers.formatting", 165 + "format_ai_chat", 166 + True, 167 + ), 168 + # Legacy AI chat import filenames (backward compat) 151 169 "*/import.chatgpt/*/imported_audio.jsonl": ( 152 170 "think.importers.formatting", 153 171 "format_ai_chat", ··· 167 185 # Layout: day/stream/segment/audio.jsonl 168 186 "*/*/*/audio.jsonl": ("observe.hear", "format_audio", False), 169 187 "*/*/*/*_audio.jsonl": ("observe.hear", "format_audio", False), 188 + "*/*/*/*_transcript.jsonl": ("observe.hear", "format_audio", False), 170 189 "*/*/*/screen.jsonl": ("observe.screen", "format_screen", False), 171 190 "*/*/*/*_screen.jsonl": ("observe.screen", "format_screen", False), 172 191 # Markdown — specific journal paths (all indexed)
+1
think/importers/ics.py
··· 449 449 "ics", 450 450 windows, 451 451 lambda items: "\n\n".join(_render_event_markdown(e) for e in items), 452 + filename="event_transcript.md", 452 453 ) 453 454 454 455 segment_days = {day for day, _ in segments}
+1
think/importers/kindle.py
··· 298 298 "kindle", 299 299 windows, 300 300 _render_highlight_markdown, 301 + filename="highlights_transcript.md", 301 302 ) 302 303 303 304 segment_days = {day for day, _ in segments}
+1
think/importers/obsidian.py
··· 281 281 "obsidian", 282 282 windows, 283 283 lambda items: "\n\n".join(_render_note_markdown(n) for n in items), 284 + filename="note_transcript.md", 284 285 ) 285 286 286 287 # Seed entities from wikilinks
+8 -3
think/importers/shared.py
··· 104 104 detected_setting: str | None = None, 105 105 model: str | None = None, 106 106 ) -> str: 107 - """Write a single segment's imported_audio.jsonl file.""" 107 + """Write a single segment's conversation_transcript.jsonl file.""" 108 108 ts_dir = os.path.join(day_dir, stream, segment_key) 109 109 os.makedirs(ts_dir, exist_ok=True) 110 - json_path = os.path.join(ts_dir, "imported_audio.jsonl") 110 + json_path = os.path.join(ts_dir, "conversation_transcript.jsonl") 111 111 112 112 _write_import_jsonl( 113 113 json_path, ··· 245 245 source: str, 246 246 windows: list[tuple[str, str, list[dict[str, Any]]]], 247 247 render: Callable[[list[dict[str, Any]]], str], 248 + *, 249 + filename: str = "imported.md", 248 250 ) -> tuple[list[str], list[tuple[str, str]]]: 249 251 """Write markdown segments from windowed items. 250 252 ··· 256 258 Output of ``window_items`` — (day, seg_key, items) tuples. 257 259 render : callable 258 260 Function taking list of items and returning markdown string. 261 + filename : str 262 + Output filename (default: ``imported.md`` for backward compat). 263 + New importers should use ``*_transcript.md`` convention. 259 264 260 265 Returns 261 266 ------- ··· 268 273 for day, seg_key, items in windows: 269 274 segment_dir = day_path(day) / f"import.{source}" / seg_key 270 275 segment_dir.mkdir(parents=True, exist_ok=True) 271 - md_path = segment_dir / "imported.md" 276 + md_path = segment_dir / filename 272 277 md_path.write_text(render(items) + "\n", encoding="utf-8") 273 278 created_files.append(str(md_path)) 274 279 segments.append((day, seg_key))
+20 -17
think/journal_stats.py
··· 22 22 def __init__(self) -> None: 23 23 self.days: Dict[str, Dict[str, float | int]] = {} 24 24 self.totals: Counter[str] = Counter() 25 - self.total_audio_duration = 0.0 25 + self.total_transcript_duration = 0.0 26 26 self.total_screen_duration = 0.0 27 27 self.agent_counts: Counter[str] = Counter() 28 28 self.agent_minutes: Counter[str] = Counter() ··· 43 43 files = [] 44 44 # Check segment subdirectories for processed files (day/stream/segment/) 45 45 files.extend(day_dir.glob("*/*/*audio.jsonl")) 46 + files.extend(day_dir.glob("*/*/*_transcript.jsonl")) 47 + files.extend(day_dir.glob("*/*/*_transcript.md")) 46 48 files.extend(day_dir.glob("*/*/*screen.jsonl")) 47 49 # Check day root for unprocessed media files 48 50 files.extend(day_dir.glob("*.flac")) ··· 130 132 counts_for_totals = { 131 133 k: v 132 134 for k, v in stats.items() 133 - if k not in ("audio_duration", "screen_duration") 135 + if k not in ("transcript_duration", "screen_duration") 134 136 } 135 137 self.totals.update(counts_for_totals) 136 138 137 139 # Accumulate durations 138 - self.total_audio_duration += stats.get("audio_duration", 0.0) 140 + self.total_transcript_duration += stats.get("transcript_duration", 0.0) 139 141 self.total_screen_duration += stats.get("screen_duration", 0.0) 140 142 141 143 # Apply agent data ··· 172 174 def scan_day(self, day: str, path: str) -> dict: 173 175 """Scan a single day and return stats dict for caching.""" 174 176 stats: Counter[str] = Counter() 175 - audio_duration = 0.0 177 + transcript_duration = 0.0 176 178 screen_duration = 0.0 177 179 day_dir = Path(path) 178 180 ··· 181 183 facet_data = {} 182 184 heatmap_hours = {} 183 185 184 - # --- Audio sessions --- 185 - # Check segment subdirectories for audio files (day/stream/segment/) 186 - audio_files = list(day_dir.glob("*/*/audio.jsonl")) 187 - audio_files.extend(day_dir.glob("*/*/*_audio.jsonl")) # Split audio files 188 - for jsonl_file in sorted(audio_files): 189 - stats["audio_sessions"] += 1 
186 + # --- Transcript sessions --- 187 + # Check segment subdirectories for transcript JSONL files (day/stream/segment/) 188 + transcript_files = list(day_dir.glob("*/*/audio.jsonl")) 189 + transcript_files.extend(day_dir.glob("*/*/*_audio.jsonl")) 190 + transcript_files.extend(day_dir.glob("*/*/*_transcript.jsonl")) 191 + for jsonl_file in sorted(set(transcript_files)): 192 + stats["transcript_sessions"] += 1 190 193 191 194 try: 192 195 with open(jsonl_file, encoding="utf-8") as f: 193 196 lines = [line.strip() for line in f if line.strip()] 194 197 195 198 if not lines: 196 - logger.debug(f"Empty audio file: {jsonl_file}") 199 + logger.debug(f"Empty transcript file: {jsonl_file}") 197 200 continue 198 201 199 202 # First line is metadata, rest are segments ··· 205 208 logger.debug(f"Invalid JSON at line {i} in {jsonl_file}: {e}") 206 209 continue 207 210 208 - stats["audio_segments"] += len(segments) 211 + stats["transcript_segments"] += len(segments) 209 212 210 213 # Calculate duration from timestamps 211 214 if segments: 212 215 duration = self._calculate_audio_duration(segments) 213 - audio_duration += duration 216 + transcript_duration += duration 214 217 215 218 except (OSError, IOError) as e: 216 - logger.warning(f"Error reading audio file {jsonl_file}: {e}") 219 + logger.warning(f"Error reading transcript file {jsonl_file}: {e}") 217 220 except Exception as e: 218 221 logger.warning(f"Unexpected error processing {jsonl_file}: {e}") 219 222 ··· 324 327 ) 325 328 326 329 # --- Build return dict --- 327 - stats["audio_duration"] = audio_duration 330 + stats["transcript_duration"] = transcript_duration 328 331 stats["screen_duration"] = screen_duration 329 332 330 333 return { ··· 456 459 ) 457 460 logger.info( 458 461 f"Scanned {len(self.days)} days, " 459 - f"{self.totals.get('audio_sessions', 0)} audio sessions, " 462 + f"{self.totals.get('transcript_sessions', 0)} transcript sessions, " 460 463 f"{self.totals.get('screen_sessions', 0)} screen sessions" 
461 464 f"{cache_status}" 462 465 ) ··· 466 469 return { 467 470 "days": self.days, 468 471 "totals": dict(self.totals), 469 - "total_audio_duration": self.total_audio_duration, 472 + "total_transcript_duration": self.total_transcript_duration, 470 473 "total_screen_duration": self.total_screen_duration, 471 474 "agent_counts": dict(self.agent_counts), 472 475 "agent_minutes": {k: round(v, 2) for k, v in self.agent_minutes.items()},
+3 -3
think/muse.py
··· 334 334 "day": False, 335 335 "activity": False, 336 336 "sources": { 337 - "audio": False, 337 + "transcripts": False, 338 338 "screen": False, 339 339 "agents": False, 340 340 }, ··· 431 431 true = include current date/time in extra_context 432 432 - "day": false | true (default: false) 433 433 true = include analysis day context (requires analysis_day parameter) 434 - - "sources": {"audio": bool, "screen": bool, "agents": bool|dict} 434 + - "sources": {"transcripts": bool, "screen": bool, "agents": bool|dict} 435 435 The "agents" source can be: 436 436 - bool: True (all agents), False (no agents) 437 437 - "required": all agents, fail if none found ··· 445 445 - system_prompt_name: str - name of system prompt (for cache keys) 446 446 - user_instruction: str | None - loaded from user_prompt if provided 447 447 - extra_context: str | None - facets + now + day context 448 - - sources: dict - {"audio": bool, "screen": bool, "agents": bool|dict} 448 + - sources: dict - {"transcripts": bool, "screen": bool, "agents": bool|dict} 449 449 """ 450 450 from think.utils import format_day 451 451