personal memory agent
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Granola meeting transcript importer via muesli

SyncableBackend registered as 'granola' in the sync registry. Reads local
markdown files from ~/.local/share/muesli/transcripts/ — no network calls.

- Three-state setup detection (not installed / installed but no transcripts / ready)
- Incremental sync via doc_id + remote_updated_at tracking in granola.json
- YAML frontmatter parsing for metadata (title, participants, duration, summary)
- Speaker-labeled transcript parsing (**Name (HH:MM:SS):** text)
- 5-min windowed segments via shared _window_messages()
- Entity seeding (name, email) from enriched ## Participants section (parsed for name, email, title, company, LinkedIn)
- Source markdown preserved as source.md in first segment
- CLI: sol import --sync granola [--save] [--force] [--path <dir>]
- 25 tests covering parsing, sync state, detection, incremental, CLI integration

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

+1107 -10
+589
tests/test_importer_granola.py
··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + """Tests for the Granola meeting transcript importer (via muesli).""" 5 + 6 + import json 7 + from pathlib import Path 8 + from textwrap import dedent 9 + 10 + import pytest 11 + 12 + 13 + # --------------------------------------------------------------------------- 14 + # Helpers 15 + # --------------------------------------------------------------------------- 16 + 17 + SAMPLE_TRANSCRIPT = dedent("""\ 18 + --- 19 + doc_id: doc_test_001 20 + source: granola 21 + created_at: "2025-10-28T15:04:05Z" 22 + remote_updated_at: "2025-10-29T01:23:45Z" 23 + title: Q1 Planning 24 + participants: 25 + - Alice 26 + - Bob 27 + duration_seconds: 3170 28 + labels: 29 + - Planning 30 + creator: "[[Alice Smith]]" 31 + attendees: 32 + - "[[Alice Smith]]" 33 + - "[[Bob Jones]]" 34 + summary_text: "Discussed Q1 priorities and resource allocation." 35 + generator: muesli 1.0 36 + --- 37 + 38 + # Q1 Planning 39 + 40 + _Date: 2025-10-28 · Duration: 52m · Participants: Alice Smith, Bob Jones_ 41 + 42 + ## Participants 43 + 44 + - **Alice Smith**, Engineering Manager, Acme Corp, (alice@acme.com) 45 + - **Bob Jones**, (bob@jones.io) 46 + 47 + --- 48 + 49 + **Alice Smith (15:04:12):** Welcome to the planning session. 50 + **Bob Jones (15:04:19):** Thanks, let's get started. 51 + **Alice Smith (15:04:35):** First, let's review our Q4 results. 52 + **Bob Jones (15:05:01):** Revenue was up 15% quarter over quarter. 53 + **Alice Smith (15:09:22):** Now let's talk about Q1 priorities. 54 + **Bob Jones (15:09:45):** I think we should focus on the mobile launch. 
55 + """) 56 + 57 + 58 + STUB_TRANSCRIPT = dedent("""\ 59 + --- 60 + doc_id: doc_stub_002 61 + source: granola 62 + created_at: "2025-11-01T10:00:00Z" 63 + remote_updated_at: "2025-11-01T10:00:00Z" 64 + title: Quick Sync 65 + duration_seconds: 60 66 + generator: muesli 1.0 67 + --- 68 + 69 + # Quick Sync 70 + 71 + _No transcript available._ 72 + """) 73 + 74 + 75 + NO_DOCID_TRANSCRIPT = dedent("""\ 76 + --- 77 + source: granola 78 + title: Orphan 79 + --- 80 + 81 + # Orphan 82 + 83 + **Alice (10:00:00):** Hello. 84 + """) 85 + 86 + 87 + def _write_transcript(muesli_dir: Path, filename: str, content: str) -> Path: 88 + """Write a transcript file to the muesli directory.""" 89 + muesli_dir.mkdir(parents=True, exist_ok=True) 90 + path = muesli_dir / filename 91 + path.write_text(content, encoding="utf-8") 92 + return path 93 + 94 + 95 + # --------------------------------------------------------------------------- 96 + # Frontmatter parsing 97 + # --------------------------------------------------------------------------- 98 + 99 + 100 + def test_parse_muesli_file(tmp_path): 101 + """Parse frontmatter and body from a muesli markdown file.""" 102 + from think.importers.granola import _parse_muesli_file 103 + 104 + path = _write_transcript(tmp_path, "test.md", SAMPLE_TRANSCRIPT) 105 + fm, body = _parse_muesli_file(path) 106 + 107 + assert fm["doc_id"] == "doc_test_001" 108 + assert fm["title"] == "Q1 Planning" 109 + assert fm["duration_seconds"] == 3170 110 + assert fm["summary_text"] == "Discussed Q1 priorities and resource allocation." 111 + assert "Alice Smith (15:04:12)" in body 112 + 113 + 114 + def test_parse_muesli_file_missing_fields(tmp_path): 115 + """Handles files with minimal frontmatter.""" 116 + from think.importers.granola import _parse_muesli_file 117 + 118 + content = dedent("""\ 119 + --- 120 + doc_id: minimal 121 + --- 122 + 123 + **Speaker (10:00:00):** Hello. 
124 + """) 125 + path = _write_transcript(tmp_path, "minimal.md", content) 126 + fm, body = _parse_muesli_file(path) 127 + 128 + assert fm["doc_id"] == "minimal" 129 + assert "title" not in fm 130 + assert "Speaker (10:00:00)" in body 131 + 132 + 133 + # --------------------------------------------------------------------------- 134 + # Participant parsing 135 + # --------------------------------------------------------------------------- 136 + 137 + 138 + def test_parse_participants(): 139 + """Extract participant info from ## Participants section.""" 140 + from think.importers.granola import _parse_participants 141 + 142 + participants = _parse_participants(SAMPLE_TRANSCRIPT) 143 + assert len(participants) == 2 144 + 145 + alice = participants[0] 146 + assert alice["name"] == "Alice Smith" 147 + assert alice["email"] == "alice@acme.com" 148 + assert alice["title"] == "Engineering Manager" 149 + assert alice["company"] == "Acme Corp" 150 + 151 + bob = participants[1] 152 + assert bob["name"] == "Bob Jones" 153 + assert bob["email"] == "bob@jones.io" 154 + 155 + 156 + def test_parse_participants_with_linkedin(): 157 + """Extract LinkedIn handle from participant line.""" 158 + from think.importers.granola import _parse_participants 159 + 160 + body = dedent("""\ 161 + ## Participants 162 + 163 + - **Jane Doe**, CTO, StartupCo, linkedin.com/in/janedoe, (jane@startup.co) 164 + """) 165 + participants = _parse_participants(body) 166 + assert len(participants) == 1 167 + assert participants[0]["name"] == "Jane Doe" 168 + assert participants[0]["email"] == "jane@startup.co" 169 + assert participants[0]["linkedin"] == "janedoe" 170 + assert participants[0]["title"] == "CTO" 171 + assert participants[0]["company"] == "StartupCo" 172 + 173 + 174 + def test_parse_participants_no_section(): 175 + """Returns empty list when no ## Participants section exists.""" 176 + from think.importers.granola import _parse_participants 177 + 178 + assert _parse_participants("# 
Meeting\n\nJust some text.") == [] 179 + 180 + 181 + def test_parse_participants_wikilinks(): 182 + """Strips [[wikilink]] brackets from participant names.""" 183 + from think.importers.granola import _parse_participants 184 + 185 + body = dedent("""\ 186 + ## Participants 187 + 188 + - **[[Alice Smith]]**, (alice@acme.com) 189 + """) 190 + participants = _parse_participants(body) 191 + assert participants[0]["name"] == "Alice Smith" 192 + 193 + 194 + # --------------------------------------------------------------------------- 195 + # Transcript entry parsing 196 + # --------------------------------------------------------------------------- 197 + 198 + 199 + def test_parse_transcript_entries(): 200 + """Parse speaker-labeled entries into timestamped messages.""" 201 + import datetime as dt 202 + 203 + from think.importers.granola import _parse_transcript_entries 204 + 205 + base_dt = dt.datetime(2025, 10, 28, 0, 0, 0, tzinfo=dt.timezone.utc) 206 + messages = _parse_transcript_entries(SAMPLE_TRANSCRIPT, base_dt, dt.timezone.utc) 207 + 208 + assert len(messages) == 6 209 + assert messages[0]["speaker"] == "Alice Smith" 210 + assert messages[0]["text"] == "Welcome to the planning session." 211 + assert messages[0]["model_slug"] is None 212 + 213 + assert messages[1]["speaker"] == "Bob Jones" 214 + assert messages[1]["text"] == "Thanks, let's get started." 
215 + 216 + # Timestamps should be in order 217 + for i in range(1, len(messages)): 218 + assert messages[i]["create_time"] >= messages[i - 1]["create_time"] 219 + 220 + 221 + def test_parse_transcript_entries_empty_body(): 222 + """Returns empty list when no transcript entries found.""" 223 + import datetime as dt 224 + 225 + from think.importers.granola import _parse_transcript_entries 226 + 227 + base_dt = dt.datetime(2025, 10, 28, 0, 0, 0, tzinfo=dt.timezone.utc) 228 + messages = _parse_transcript_entries("No transcript here.", base_dt, dt.timezone.utc) 229 + assert messages == [] 230 + 231 + 232 + # --------------------------------------------------------------------------- 233 + # Date parsing 234 + # --------------------------------------------------------------------------- 235 + 236 + 237 + def test_date_from_filename(): 238 + """Extract date from muesli filename.""" 239 + from think.importers.granola import _date_from_filename 240 + 241 + import datetime as dt 242 + 243 + assert _date_from_filename("2025-10-28_q1-planning.md") == dt.date(2025, 10, 28) 244 + assert _date_from_filename("random-file.md") is None 245 + assert _date_from_filename("2025-01-01.md") == dt.date(2025, 1, 1) 246 + 247 + 248 + def test_parse_created_at(): 249 + """Parse created_at from frontmatter with timezone.""" 250 + import datetime as dt 251 + 252 + from think.importers.granola import _parse_created_at 253 + 254 + fm = {"created_at": "2025-10-28T15:04:05Z"} 255 + created_dt, tz = _parse_created_at(fm, "test.md") 256 + assert created_dt is not None 257 + assert tz == dt.timezone.utc 258 + 259 + 260 + def test_parse_created_at_fallback_to_filename(): 261 + """Falls back to filename date when created_at is missing.""" 262 + from think.importers.granola import _parse_created_at 263 + 264 + fm = {} 265 + created_dt, tz = _parse_created_at(fm, "2025-10-28_meeting.md") 266 + assert created_dt is not None 267 + assert tz is None # naive/local 268 + 269 + 270 + def 
test_parse_created_at_no_date(): 271 + """Returns None when no date source available.""" 272 + from think.importers.granola import _parse_created_at 273 + 274 + fm = {} 275 + created_dt, tz = _parse_created_at(fm, "random.md") 276 + assert created_dt is None 277 + 278 + 279 + # --------------------------------------------------------------------------- 280 + # GranolaBackend protocol 281 + # --------------------------------------------------------------------------- 282 + 283 + 284 + def test_granola_protocol_conformance(): 285 + """GranolaBackend satisfies SyncableBackend protocol.""" 286 + from think.importers.granola import GranolaBackend 287 + from think.importers.sync import SyncableBackend 288 + 289 + assert isinstance(GranolaBackend(), SyncableBackend) 290 + 291 + 292 + def test_granola_in_registry(): 293 + """Granola is registered in the syncable backend registry.""" 294 + from think.importers.sync import get_syncable_backends 295 + 296 + backends = get_syncable_backends() 297 + names = [b.name for b in backends] 298 + assert "granola" in names 299 + 300 + 301 + # --------------------------------------------------------------------------- 302 + # GranolaBackend.sync() — detection 303 + # --------------------------------------------------------------------------- 304 + 305 + 306 + def test_granola_sync_no_muesli(tmp_path): 307 + """Raises ValueError when muesli is not installed.""" 308 + from think.importers.granola import GranolaBackend 309 + 310 + nonexistent = tmp_path / "nonexistent" / "transcripts" 311 + with pytest.raises(ValueError, match="muesli to extract"): 312 + GranolaBackend().sync(tmp_path, source_path=nonexistent) 313 + 314 + 315 + def test_granola_sync_no_transcripts(tmp_path): 316 + """Raises ValueError when muesli dir exists but no transcripts.""" 317 + from think.importers.granola import GranolaBackend 318 + 319 + muesli_dir = tmp_path / "muesli" / "transcripts" 320 + # Create parent but not transcripts dir 321 + (tmp_path / 
"muesli").mkdir() 322 + with pytest.raises(ValueError, match="no transcripts found"): 323 + GranolaBackend().sync(tmp_path, source_path=muesli_dir) 324 + 325 + 326 + # --------------------------------------------------------------------------- 327 + # GranolaBackend.sync() — catalog mode 328 + # --------------------------------------------------------------------------- 329 + 330 + 331 + def test_granola_sync_dry_run(tmp_path): 332 + """Dry-run catalogs transcripts and saves state.""" 333 + from think.importers.granola import GranolaBackend 334 + from think.importers.sync import load_sync_state 335 + 336 + muesli_dir = tmp_path / "muesli" 337 + _write_transcript(muesli_dir, "2025-10-28_q1.md", SAMPLE_TRANSCRIPT) 338 + _write_transcript(muesli_dir, "stub.md", STUB_TRANSCRIPT) 339 + 340 + result = GranolaBackend().sync(tmp_path, source_path=muesli_dir) 341 + 342 + assert result["total"] == 2 343 + assert result["available"] == 1 # only the one with transcript content 344 + assert result["skipped"] == 1 # stub has no transcript 345 + assert result["imported"] == 0 346 + assert result["downloaded"] == 0 347 + 348 + # State was saved 349 + state = load_sync_state(tmp_path, "granola") 350 + assert state is not None 351 + assert len(state["files"]) == 2 352 + assert state["files"]["doc_test_001"]["status"] == "available" 353 + assert state["files"]["doc_stub_002"]["status"] == "skipped" 354 + assert state["files"]["doc_stub_002"]["skip_reason"] == "no_transcript" 355 + 356 + 357 + def test_granola_sync_skips_no_docid(tmp_path): 358 + """Transcripts without doc_id in frontmatter are skipped.""" 359 + from think.importers.granola import GranolaBackend 360 + 361 + muesli_dir = tmp_path / "muesli" 362 + _write_transcript(muesli_dir, "orphan.md", NO_DOCID_TRANSCRIPT) 363 + 364 + result = GranolaBackend().sync(tmp_path, source_path=muesli_dir) 365 + assert result["total"] == 0 366 + assert result["available"] == 0 367 + 368 + 369 + # 
--------------------------------------------------------------------------- 370 + # GranolaBackend.sync() — incremental 371 + # --------------------------------------------------------------------------- 372 + 373 + 374 + def test_granola_sync_incremental(tmp_path): 375 + """Second sync skips already-imported transcripts.""" 376 + from think.importers.granola import GranolaBackend 377 + from think.importers.sync import load_sync_state, save_sync_state 378 + 379 + muesli_dir = tmp_path / "muesli" 380 + _write_transcript(muesli_dir, "2025-10-28_q1.md", SAMPLE_TRANSCRIPT) 381 + 382 + # Pre-seed state: doc_test_001 already imported 383 + save_sync_state( 384 + tmp_path, 385 + "granola", 386 + { 387 + "backend": "granola", 388 + "files": { 389 + "doc_test_001": { 390 + "filename": "2025-10-28_q1.md", 391 + "remote_updated_at": "2025-10-29T01:23:45Z", 392 + "status": "imported", 393 + "imported_at": "2026-03-01T00:00:00", 394 + }, 395 + }, 396 + }, 397 + ) 398 + 399 + result = GranolaBackend().sync(tmp_path, source_path=muesli_dir) 400 + 401 + assert result["imported"] == 1 402 + assert result["available"] == 0 # nothing new 403 + 404 + 405 + def test_granola_sync_detects_updated(tmp_path): 406 + """Re-imports when remote_updated_at is newer than last sync.""" 407 + from think.importers.granola import GranolaBackend 408 + from think.importers.sync import save_sync_state 409 + 410 + muesli_dir = tmp_path / "muesli" 411 + _write_transcript(muesli_dir, "2025-10-28_q1.md", SAMPLE_TRANSCRIPT) 412 + 413 + # Pre-seed with older remote_updated_at 414 + save_sync_state( 415 + tmp_path, 416 + "granola", 417 + { 418 + "backend": "granola", 419 + "files": { 420 + "doc_test_001": { 421 + "filename": "2025-10-28_q1.md", 422 + "remote_updated_at": "2025-10-28T00:00:00Z", # older 423 + "status": "imported", 424 + }, 425 + }, 426 + }, 427 + ) 428 + 429 + result = GranolaBackend().sync(tmp_path, source_path=muesli_dir) 430 + assert result["available"] == 1 # updated, needs re-import 431 + 
432 + 433 + def test_granola_sync_detects_removed(tmp_path): 434 + """Marks files as removed when they disappear from muesli dir.""" 435 + from think.importers.granola import GranolaBackend 436 + from think.importers.sync import load_sync_state, save_sync_state 437 + 438 + muesli_dir = tmp_path / "muesli" 439 + muesli_dir.mkdir(parents=True) 440 + # No files in dir, but state has one 441 + 442 + save_sync_state( 443 + tmp_path, 444 + "granola", 445 + { 446 + "backend": "granola", 447 + "files": { 448 + "doc_gone": { 449 + "filename": "old.md", 450 + "status": "imported", 451 + }, 452 + }, 453 + }, 454 + ) 455 + 456 + GranolaBackend().sync(tmp_path, source_path=muesli_dir) 457 + state = load_sync_state(tmp_path, "granola") 458 + assert state["files"]["doc_gone"]["status"] == "removed" 459 + 460 + 461 + def test_granola_sync_force(tmp_path): 462 + """Force flag clears state and re-imports everything.""" 463 + from think.importers.granola import GranolaBackend 464 + from think.importers.sync import save_sync_state 465 + 466 + muesli_dir = tmp_path / "muesli" 467 + _write_transcript(muesli_dir, "2025-10-28_q1.md", SAMPLE_TRANSCRIPT) 468 + 469 + # Pre-seed as already imported 470 + save_sync_state( 471 + tmp_path, 472 + "granola", 473 + { 474 + "backend": "granola", 475 + "files": { 476 + "doc_test_001": { 477 + "filename": "2025-10-28_q1.md", 478 + "remote_updated_at": "2025-10-29T01:23:45Z", 479 + "status": "imported", 480 + }, 481 + }, 482 + }, 483 + ) 484 + 485 + result = GranolaBackend().sync(tmp_path, source_path=muesli_dir, force=True) 486 + assert result["available"] == 1 # force causes re-import 487 + 488 + 489 + # --------------------------------------------------------------------------- 490 + # GranolaBackend.sync() — import mode 491 + # --------------------------------------------------------------------------- 492 + 493 + 494 + def test_granola_sync_import(tmp_path, monkeypatch): 495 + """Import mode writes segments and updates state.""" 496 + from 
think.importers.granola import GranolaBackend 497 + from think.importers.sync import load_sync_state 498 + 499 + # Point journal to tmp_path 500 + monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 501 + 502 + muesli_dir = tmp_path / "muesli" 503 + _write_transcript(muesli_dir, "2025-10-28_q1.md", SAMPLE_TRANSCRIPT) 504 + 505 + result = GranolaBackend().sync( 506 + tmp_path, source_path=muesli_dir, dry_run=False 507 + ) 508 + 509 + assert result["downloaded"] == 1 510 + assert result["imported"] == 1 511 + assert result["available"] == 0 512 + assert result["errors"] == [] 513 + 514 + # State updated 515 + state = load_sync_state(tmp_path, "granola") 516 + doc = state["files"]["doc_test_001"] 517 + assert doc["status"] == "imported" 518 + assert "imported_at" in doc 519 + assert doc["segments"] > 0 520 + 521 + # Verify segments were written 522 + # The transcript spans 15:04:12 to 15:09:45 (local time from UTC), 523 + # so we check for import.granola directories 524 + import glob 525 + 526 + segments = glob.glob(str(tmp_path / "*/import.granola/*/conversation_transcript.jsonl")) 527 + assert len(segments) >= 1 528 + 529 + # Check JSONL content of first segment 530 + with open(segments[0], "r") as f: 531 + lines = f.readlines() 532 + 533 + # First line is metadata 534 + meta = json.loads(lines[0]) 535 + assert "imported" in meta 536 + assert meta["imported"]["id"] == "doc_test_001" 537 + assert meta["topics"] == "Q1 Planning" 538 + 539 + # Subsequent lines are entries 540 + entry = json.loads(lines[1]) 541 + assert "speaker" in entry 542 + assert "text" in entry 543 + assert "start" in entry 544 + assert entry["source"] == "import" 545 + 546 + # Check source.md was copied 547 + source_files = glob.glob(str(tmp_path / "*/import.granola/*/source.md")) 548 + assert len(source_files) == 1 # only in first segment 549 + 550 + 551 + # --------------------------------------------------------------------------- 552 + # CLI integration 553 + # 
--------------------------------------------------------------------------- 554 + 555 + 556 + def test_granola_backends_cli_flag(capsys, monkeypatch): 557 + """sol import --backends lists granola.""" 558 + import sys 559 + 560 + from think.importers.cli import main 561 + 562 + monkeypatch.setattr(sys, "argv", ["sol import", "--backends"]) 563 + monkeypatch.setenv("JOURNAL_PATH", "/tmp/test-journal") 564 + main() 565 + captured = capsys.readouterr() 566 + assert "granola" in captured.out 567 + 568 + 569 + def test_granola_sync_cli(capsys, monkeypatch, tmp_path): 570 + """sol import --sync granola --path <dir> runs catalog.""" 571 + import sys 572 + 573 + from think.importers.cli import main 574 + 575 + muesli_dir = tmp_path / "muesli" 576 + _write_transcript(muesli_dir, "2025-10-28_q1.md", SAMPLE_TRANSCRIPT) 577 + 578 + monkeypatch.setattr( 579 + sys, 580 + "argv", 581 + ["sol import", "--sync", "granola", "--path", str(muesli_dir)], 582 + ) 583 + monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 584 + 585 + main() 586 + captured = capsys.readouterr() 587 + assert "Total:" in captured.out 588 + assert "Available to import:" in captured.out 589 + assert "Q1 Planning" in captured.out # title shown in available list
+1 -1
tests/test_importer_sync.py
··· 426 426 main() 427 427 428 428 captured = capsys.readouterr() 429 - assert "Total recordings:" in captured.out 429 + assert "Total:" in captured.out 430 430 assert "Available to import:" in captured.out
+35 -9
think/importers/cli.py
··· 107 107 return timestamp 108 108 109 109 110 - def _run_sync(backend_name: str, *, dry_run: bool = True) -> None: 110 + def _run_sync(backend_name: str, *, dry_run: bool = True, **extra: Any) -> None: 111 111 """Run sync for a named backend and print results.""" 112 + import inspect 113 + 112 114 from think.importers.plaud import format_size 113 115 from think.importers.sync import get_syncable_backends, load_sync_state 114 116 ··· 131 133 mode = "save" if not dry_run else "catalog" 132 134 print(f"Syncing {backend_name} ({mode} mode)...") 133 135 print() 136 + 137 + # Pass extra kwargs only if the backend accepts them 138 + sync_kwargs: dict[str, Any] = {"dry_run": dry_run} 139 + sig = inspect.signature(backend.sync) 140 + for key, value in extra.items(): 141 + if key in sig.parameters and value is not None: 142 + sync_kwargs[key] = value 134 143 135 144 try: 136 - result = backend.sync(journal_root, dry_run=dry_run) 145 + result = backend.sync(journal_root, **sync_kwargs) 137 146 except ValueError as e: 138 147 raise SystemExit(str(e)) 139 148 except RuntimeError as e: ··· 148 157 149 158 # Print summary 150 159 print() 151 - print(f" Total recordings: {total}") 160 + print(f" Total: {total}") 152 161 print(f" Already imported: {imported}") 153 162 print(f" Available to import: {available}") 154 163 if skipped: 155 - print(f" Skipped: {skipped} (trashed/short)") 164 + print(f" Skipped: {skipped}") 156 165 157 166 if downloaded > 0: 158 - print(f" Downloaded + imported: {downloaded}") 167 + print(f" Imported this run: {downloaded}") 159 168 if errors: 160 169 print(f" Errors: {len(errors)}") 161 170 for err in errors: ··· 173 182 ] 174 183 if avail_files: 175 184 print() 176 - print("Available recordings:") 185 + print("Available:") 177 186 for _fid, info in avail_files: 178 187 name = info.get("filename", "unnamed") 188 + title = info.get("title", "") 179 189 size = info.get("filesize", 0) 180 - print(f" - {name} ({format_size(size)})") 190 + if title: 191 + 
print(f" - {title} ({name})") 192 + elif size: 193 + print(f" - {name} ({format_size(size)})") 194 + else: 195 + print(f" - {name}") 181 196 print() 182 - print("Run with --save to download and import these files:") 197 + print(f"Run with --save to import:") 183 198 print(f" sol import --sync {backend_name} --save") 184 199 185 200 if not dry_run and available == 0 and downloaded == 0: ··· 248 263 help="With --sync: download and import new files (default is dry-run)", 249 264 ) 250 265 parser.add_argument( 266 + "--path", 267 + type=str, 268 + default=None, 269 + help="With --sync: override the default source directory path", 270 + ) 271 + parser.add_argument( 251 272 "--list-importers", 252 273 action="store_true", 253 274 help="List available file importers", ··· 302 323 return 303 324 304 325 if args.sync: 305 - _run_sync(args.sync, dry_run=not args.save) 326 + extra: dict[str, Any] = {} 327 + if args.path: 328 + extra["source_path"] = Path(os.path.expanduser(args.path)) 329 + if args.force: 330 + extra["force"] = True 331 + _run_sync(args.sync, dry_run=not args.save, **extra) 306 332 return 307 333 308 334 if not args.media:
+481
think/importers/granola.py
··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + """Granola meeting transcript importer via muesli. 5 + 6 + Reads local markdown files produced by muesli (harper reed's Granola extractor) 7 + from ~/.local/share/muesli/transcripts/ and imports them into the journal as 8 + speaker-labeled transcript segments. 9 + 10 + Architecture: SyncableBackend registered as "granola" in the sync registry. 11 + No network calls — reads local files only. 12 + """ 13 + 14 + import datetime as dt 15 + import logging 16 + import re 17 + import shutil 18 + from pathlib import Path 19 + from typing import Any 20 + 21 + import frontmatter 22 + 23 + from think.importers.shared import ( 24 + _window_messages, 25 + seed_entities, 26 + write_segment, 27 + ) 28 + from think.importers.sync import load_sync_state, save_sync_state 29 + from think.utils import day_path 30 + 31 + logger = logging.getLogger(__name__) 32 + 33 + DEFAULT_MUESLI_PATH = Path.home() / ".local" / "share" / "muesli" / "transcripts" 34 + 35 + # **Name (HH:MM:SS):** text 36 + TRANSCRIPT_RE = re.compile( 37 + r"^\*\*(.+?)\s*\((\d{1,2}:\d{2}:\d{2})\):\*\*\s*(.+)", 38 + re.MULTILINE, 39 + ) 40 + 41 + # - **Name**, Title, Company, (email) 42 + PARTICIPANT_RE = re.compile( 43 + r"^\s*-\s*\*\*(.+?)\*\*(?:,\s*(.+))?$", 44 + re.MULTILINE, 45 + ) 46 + 47 + EMAIL_RE = re.compile(r"\(([^)]+@[^)]+)\)") 48 + LINKEDIN_RE = re.compile(r"linkedin\.com/in/([^\s,)]+)", re.IGNORECASE) 49 + 50 + # YYYY-MM-DD at the start of a filename 51 + FILENAME_DATE_RE = re.compile(r"^(\d{4})-(\d{2})-(\d{2})") 52 + 53 + 54 + def _parse_muesli_file(path: Path) -> tuple[dict[str, Any], str]: 55 + """Parse a muesli markdown file into (frontmatter_dict, body_text).""" 56 + post = frontmatter.load(str(path)) 57 + return dict(post.metadata), post.content 58 + 59 + 60 + def _parse_participants(body: str) -> list[dict[str, Any]]: 61 + """Extract enriched participant info from the ## Participants section. 
62 + 63 + Returns list of dicts with keys: name, email, title, company, linkedin. 64 + """ 65 + section_match = re.search( 66 + r"^##\s*Participants\s*\n(.*?)(?=^##|\Z)", 67 + body, 68 + re.MULTILINE | re.DOTALL, 69 + ) 70 + if not section_match: 71 + return [] 72 + 73 + section = section_match.group(1) 74 + participants: list[dict[str, Any]] = [] 75 + 76 + for match in PARTICIPANT_RE.finditer(section): 77 + name = match.group(1).strip() 78 + # Strip [[wikilink]] brackets 79 + name = name.replace("[[", "").replace("]]", "") 80 + 81 + rest = match.group(2) or "" 82 + 83 + # Extract email: (user@domain.com) 84 + email = "" 85 + email_match = EMAIL_RE.search(rest) 86 + if email_match: 87 + email = email_match.group(1).strip() 88 + rest = rest[: email_match.start()] + rest[email_match.end() :] 89 + 90 + # Extract LinkedIn handle 91 + linkedin = "" 92 + linkedin_match = LINKEDIN_RE.search(rest) 93 + if linkedin_match: 94 + linkedin = linkedin_match.group(1).strip() 95 + rest = rest[: linkedin_match.start()] + rest[linkedin_match.end() :] 96 + 97 + # Remaining comma-separated parts: title, company 98 + parts = [p.strip() for p in rest.split(",") if p.strip()] 99 + title = parts[0] if len(parts) >= 1 else "" 100 + company = parts[1] if len(parts) >= 2 else "" 101 + 102 + participants.append( 103 + { 104 + "name": name, 105 + "email": email, 106 + "title": title, 107 + "company": company, 108 + "linkedin": linkedin, 109 + } 110 + ) 111 + 112 + return participants 113 + 114 + 115 + def _date_from_filename(filename: str) -> dt.date | None: 116 + """Extract date from muesli filename like 2025-10-28_q1-planning.md.""" 117 + match = FILENAME_DATE_RE.match(filename) 118 + if match: 119 + return dt.date(int(match.group(1)), int(match.group(2)), int(match.group(3))) 120 + return None 121 + 122 + 123 + def _parse_created_at( 124 + fm: dict[str, Any], filename: str 125 + ) -> tuple[dt.datetime | None, dt.timezone | None]: 126 + """Parse created_at from frontmatter, falling back to 
filename date. 127 + 128 + Returns (datetime, timezone) where timezone is UTC if created_at had Z/+00:00, 129 + or None for naive/local time. 130 + """ 131 + created_at_str = fm.get("created_at", "") 132 + if created_at_str: 133 + try: 134 + created_dt = dt.datetime.fromisoformat( 135 + str(created_at_str).replace("Z", "+00:00") 136 + ) 137 + tz = created_dt.tzinfo 138 + return created_dt, tz 139 + except (ValueError, TypeError): 140 + pass 141 + 142 + # Fall back to filename date 143 + base_date = _date_from_filename(filename) 144 + if base_date: 145 + return dt.datetime(base_date.year, base_date.month, base_date.day), None 146 + return None, None 147 + 148 + 149 + def _parse_transcript_entries( 150 + body: str, 151 + base_dt: dt.datetime, 152 + tz: dt.timezone | None, 153 + ) -> list[dict[str, Any]]: 154 + """Parse speaker-labeled transcript entries into message dicts. 155 + 156 + Each message has: create_time (float epoch), speaker, text, model_slug. 157 + """ 158 + base_date = base_dt.date() if tz is None else base_dt.astimezone(tz).date() 159 + messages: list[dict[str, Any]] = [] 160 + 161 + for match in TRANSCRIPT_RE.finditer(body): 162 + speaker = match.group(1).strip() 163 + time_str = match.group(2) 164 + text = match.group(3).strip() 165 + 166 + h, m, s = (int(p) for p in time_str.split(":")) 167 + 168 + if tz is not None: 169 + msg_dt = dt.datetime( 170 + base_date.year, base_date.month, base_date.day, h, m, s, tzinfo=tz 171 + ) 172 + else: 173 + msg_dt = dt.datetime( 174 + base_date.year, base_date.month, base_date.day, h, m, s 175 + ) 176 + 177 + messages.append( 178 + { 179 + "create_time": msg_dt.timestamp(), 180 + "speaker": speaker, 181 + "text": text, 182 + "model_slug": None, 183 + } 184 + ) 185 + 186 + return messages 187 + 188 + 189 + def _import_transcript( 190 + md_file: Path, 191 + fm: dict[str, Any], 192 + body: str, 193 + journal_root: Path, 194 + ) -> tuple[list[str], list[tuple[str, str]], int]: 195 + """Import a single muesli transcript 
into the journal. 196 + 197 + Returns (created_files, segments, entities_seeded). 198 + """ 199 + doc_id = fm.get("doc_id", md_file.stem) 200 + title = fm.get("title", "") 201 + duration_seconds = fm.get("duration_seconds") 202 + summary_text = fm.get("summary_text", "") 203 + 204 + created_dt, tz = _parse_created_at(fm, md_file.name) 205 + if created_dt is None: 206 + logger.warning("No date for %s -- skipping", md_file.name) 207 + return [], [], 0 208 + 209 + # Parse transcript entries 210 + messages = _parse_transcript_entries(body, created_dt, tz) 211 + if not messages: 212 + return [], [], 0 213 + 214 + messages.sort(key=lambda m: m["create_time"]) 215 + 216 + # Estimate duration if missing 217 + if not duration_seconds and len(messages) >= 2: 218 + duration_seconds = int(messages[-1]["create_time"] - messages[0]["create_time"]) 219 + if not duration_seconds: 220 + duration_seconds = 300 221 + 222 + # Window into 5-min segments 223 + windows = _window_messages(messages) 224 + 225 + created_files: list[str] = [] 226 + segments: list[tuple[str, str]] = [] 227 + source_copied = False 228 + 229 + for day, seg_key, _model, entries in windows: 230 + day_dir = str(day_path(day)) 231 + 232 + # Build setting string with summary if available 233 + setting = "meeting" 234 + if summary_text: 235 + setting = f"meeting -- {summary_text}" 236 + 237 + json_path = write_segment( 238 + day_dir, 239 + "import.granola", 240 + seg_key, 241 + entries, 242 + import_id=doc_id, 243 + topics=title, 244 + detected_setting=setting, 245 + ) 246 + created_files.append(json_path) 247 + segments.append((day, seg_key)) 248 + 249 + # Copy source markdown into first segment for ground-truth preservation 250 + if not source_copied: 251 + import os 252 + 253 + seg_dir = os.path.dirname(json_path) 254 + source_dest = os.path.join(seg_dir, "source.md") 255 + shutil.copy2(str(md_file), source_dest) 256 + created_files.append(source_dest) 257 + source_copied = True 258 + 259 + # Seed entities from 
participants 260 + entities_seeded = 0 261 + participants = _parse_participants(body) 262 + if participants and segments: 263 + first_day = segments[0][0] 264 + entity_dicts = [ 265 + {"name": p["name"], "type": "Person", "email": p["email"]} 266 + for p in participants 267 + if p["name"] 268 + ] 269 + if entity_dicts: 270 + try: 271 + seeded = seed_entities("import.granola", first_day, entity_dicts) 272 + entities_seeded = len(seeded) 273 + except Exception as exc: 274 + logger.warning( 275 + "Entity seeding failed for %s: %s", md_file.name, exc 276 + ) 277 + 278 + return created_files, segments, entities_seeded 279 + 280 + 281 + class GranolaBackend: 282 + """Syncable backend for Granola meeting transcripts via muesli.""" 283 + 284 + name: str = "granola" 285 + 286 + def sync( 287 + self, 288 + journal_root: Path, 289 + *, 290 + dry_run: bool = True, 291 + source_path: Path | None = None, 292 + force: bool = False, 293 + ) -> dict[str, Any]: 294 + """Sync Granola transcripts from muesli local files. 295 + 296 + Scans the muesli transcripts directory for markdown files, compares 297 + against sync state, and imports new/updated transcripts into the journal. 298 + 299 + Args: 300 + journal_root: Path to the journal root directory. 301 + dry_run: If True, catalog only (no import). If False, import. 302 + source_path: Override muesli transcripts directory path. 303 + force: If True, ignore sync state and re-import everything. 304 + 305 + Returns: 306 + Summary dict with total, imported, available, skipped, downloaded, errors. 307 + """ 308 + muesli_dir = source_path or DEFAULT_MUESLI_PATH 309 + 310 + # Three-state setup detection 311 + if not muesli_dir.exists(): 312 + parent = muesli_dir.parent 313 + if parent.exists(): 314 + raise ValueError( 315 + "Muesli is installed but no transcripts found.\n" 316 + "Run `muesli sync` to extract your Granola transcripts, " 317 + "then try again." 
318 + ) 319 + raise ValueError( 320 + "Granola transcripts require muesli to extract.\n" 321 + "Install: cargo install --git " 322 + "https://github.com/harperreed/muesli.git --all-features\n" 323 + "Then run: muesli sync" 324 + ) 325 + 326 + # Load or initialize sync state 327 + state = load_sync_state(journal_root, "granola") or { 328 + "backend": "granola", 329 + "source_path": str(muesli_dir), 330 + "files": {}, 331 + } 332 + 333 + if force: 334 + state["files"] = {} 335 + 336 + known_files: dict[str, dict[str, Any]] = state.get("files", {}) 337 + 338 + # Scan muesli transcripts directory 339 + md_files = sorted(muesli_dir.glob("*.md")) 340 + 341 + # Parse each file and determine sync status 342 + to_import: list[tuple[Path, dict[str, Any], str]] = [] 343 + current_doc_ids: set[str] = set() 344 + skipped_count = 0 345 + 346 + for md_file in md_files: 347 + try: 348 + fm, body = _parse_muesli_file(md_file) 349 + except Exception as exc: 350 + logger.warning("Failed to parse %s: %s", md_file.name, exc) 351 + continue 352 + 353 + doc_id = fm.get("doc_id") 354 + if not doc_id: 355 + logger.debug("No doc_id in %s -- skipping", md_file.name) 356 + skipped_count += 1 357 + continue 358 + 359 + current_doc_ids.add(doc_id) 360 + remote_updated_at = str(fm.get("remote_updated_at", "")) 361 + title = fm.get("title", md_file.stem) 362 + duration = fm.get("duration_seconds", 0) 363 + 364 + # Check for transcript content 365 + has_transcript = bool(TRANSCRIPT_RE.search(body)) 366 + if not has_transcript: 367 + if doc_id not in known_files: 368 + known_files[doc_id] = { 369 + "filename": md_file.name, 370 + "title": title, 371 + "remote_updated_at": remote_updated_at, 372 + "status": "skipped", 373 + "skip_reason": "no_transcript", 374 + } 375 + skipped_count += 1 376 + continue 377 + 378 + # Check sync state for this doc 379 + if doc_id in known_files and not force: 380 + existing = known_files[doc_id] 381 + if existing.get("status") == "imported": 382 + existing_updated = 
existing.get("remote_updated_at", "") 383 + if remote_updated_at <= existing_updated: 384 + # Unchanged -- skip 385 + continue 386 + # Updated -- will re-import 387 + 388 + # Mark as available for import 389 + known_files.setdefault(doc_id, {}) 390 + known_files[doc_id].update( 391 + { 392 + "filename": md_file.name, 393 + "title": title, 394 + "remote_updated_at": remote_updated_at, 395 + "duration_seconds": duration, 396 + "status": "available", 397 + } 398 + ) 399 + to_import.append((md_file, fm, doc_id)) 400 + 401 + # Detect deleted files (in state but gone from muesli dir) 402 + for doc_id, info in known_files.items(): 403 + if doc_id not in current_doc_ids and info.get("status") not in ( 404 + "removed", 405 + "skipped", 406 + ): 407 + info["status"] = "removed" 408 + 409 + # Compute summary 410 + total = len(known_files) 411 + imported = sum( 412 + 1 for f in known_files.values() if f.get("status") == "imported" 413 + ) 414 + available = len(to_import) 415 + skipped_total = sum( 416 + 1 for f in known_files.values() if f.get("status") == "skipped" 417 + ) 418 + 419 + result: dict[str, Any] = { 420 + "total": total, 421 + "imported": imported, 422 + "available": available, 423 + "skipped": skipped_total, 424 + "downloaded": 0, 425 + "errors": [], 426 + } 427 + 428 + # Import transcripts if not dry-run 429 + if not dry_run and to_import: 430 + downloaded = 0 431 + errors: list[str] = [] 432 + 433 + for md_file, _fm_partial, doc_id in to_import: 434 + try: 435 + fm_full, body = _parse_muesli_file(md_file) 436 + files, segs, entities = _import_transcript( 437 + md_file, fm_full, body, journal_root 438 + ) 439 + 440 + if files: 441 + known_files[doc_id]["status"] = "imported" 442 + known_files[doc_id][ 443 + "imported_at" 444 + ] = dt.datetime.now().isoformat() 445 + known_files[doc_id]["segments"] = len(segs) 446 + known_files[doc_id]["entities_seeded"] = entities 447 + downloaded += 1 448 + logger.info( 449 + "Imported %s: %d segments, %d entities", 450 + 
md_file.name, 451 + len(segs), 452 + entities, 453 + ) 454 + else: 455 + known_files[doc_id]["status"] = "skipped" 456 + known_files[doc_id]["skip_reason"] = "no_content" 457 + except Exception as exc: 458 + msg = f"{md_file.name}: {exc}" 459 + logger.warning("Import failed: %s", msg) 460 + errors.append(msg) 461 + 462 + result["downloaded"] = downloaded 463 + result["errors"] = errors 464 + result["imported"] = sum( 465 + 1 for f in known_files.values() if f.get("status") == "imported" 466 + ) 467 + result["available"] = sum( 468 + 1 for f in known_files.values() if f.get("status") == "available" 469 + ) 470 + 471 + # Save updated state 472 + state["files"] = known_files 473 + state["source_path"] = str(muesli_dir) 474 + state["last_sync"] = dt.datetime.now().isoformat() 475 + save_sync_state(journal_root, "granola", state) 476 + 477 + return result 478 + 479 + 480 + # Module-level backend instance for discovery 481 + backend = GranolaBackend()
+1
think/importers/sync.py
··· 23 23 24 24 SYNCABLE_REGISTRY: dict[str, str] = { 25 25 "plaud": "think.importers.plaud", 26 + "granola": "think.importers.granola", 26 27 } 27 28 28 29