Granola meeting transcript importer via muesli · solpbc.org/solstone@cf2258e

+589

tests/test_importer_granola.py

··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + """Tests for the Granola meeting transcript importer (via muesli).""" 5 + 6 + import json 7 + from pathlib import Path 8 + from textwrap import dedent 9 + 10 + import pytest 11 + 12 + 13 + # --------------------------------------------------------------------------- 14 + # Helpers 15 + # --------------------------------------------------------------------------- 16 + 17 + SAMPLE_TRANSCRIPT = dedent("""\ 18 + --- 19 + doc_id: doc_test_001 20 + source: granola 21 + created_at: "2025-10-28T15:04:05Z" 22 + remote_updated_at: "2025-10-29T01:23:45Z" 23 + title: Q1 Planning 24 + participants: 25 + - Alice 26 + - Bob 27 + duration_seconds: 3170 28 + labels: 29 + - Planning 30 + creator: "[[Alice Smith]]" 31 + attendees: 32 + - "[[Alice Smith]]" 33 + - "[[Bob Jones]]" 34 + summary_text: "Discussed Q1 priorities and resource allocation." 35 + generator: muesli 1.0 36 + --- 37 + 38 + # Q1 Planning 39 + 40 + _Date: 2025-10-28 · Duration: 52m · Participants: Alice Smith, Bob Jones_ 41 + 42 + ## Participants 43 + 44 + - **Alice Smith**, Engineering Manager, Acme Corp, (alice@acme.com) 45 + - **Bob Jones**, (bob@jones.io) 46 + 47 + --- 48 + 49 + **Alice Smith (15:04:12):** Welcome to the planning session. 50 + **Bob Jones (15:04:19):** Thanks, let's get started. 51 + **Alice Smith (15:04:35):** First, let's review our Q4 results. 52 + **Bob Jones (15:05:01):** Revenue was up 15% quarter over quarter. 53 + **Alice Smith (15:09:22):** Now let's talk about Q1 priorities. 54 + **Bob Jones (15:09:45):** I think we should focus on the mobile launch. 
55 + """) 56 + 57 + 58 + STUB_TRANSCRIPT = dedent("""\ 59 + --- 60 + doc_id: doc_stub_002 61 + source: granola 62 + created_at: "2025-11-01T10:00:00Z" 63 + remote_updated_at: "2025-11-01T10:00:00Z" 64 + title: Quick Sync 65 + duration_seconds: 60 66 + generator: muesli 1.0 67 + --- 68 + 69 + # Quick Sync 70 + 71 + _No transcript available._ 72 + """) 73 + 74 + 75 + NO_DOCID_TRANSCRIPT = dedent("""\ 76 + --- 77 + source: granola 78 + title: Orphan 79 + --- 80 + 81 + # Orphan 82 + 83 + **Alice (10:00:00):** Hello. 84 + """) 85 + 86 + 87 + def _write_transcript(muesli_dir: Path, filename: str, content: str) -> Path: 88 + """Write a transcript file to the muesli directory.""" 89 + muesli_dir.mkdir(parents=True, exist_ok=True) 90 + path = muesli_dir / filename 91 + path.write_text(content, encoding="utf-8") 92 + return path 93 + 94 + 95 + # --------------------------------------------------------------------------- 96 + # Frontmatter parsing 97 + # --------------------------------------------------------------------------- 98 + 99 + 100 + def test_parse_muesli_file(tmp_path): 101 + """Parse frontmatter and body from a muesli markdown file.""" 102 + from think.importers.granola import _parse_muesli_file 103 + 104 + path = _write_transcript(tmp_path, "test.md", SAMPLE_TRANSCRIPT) 105 + fm, body = _parse_muesli_file(path) 106 + 107 + assert fm["doc_id"] == "doc_test_001" 108 + assert fm["title"] == "Q1 Planning" 109 + assert fm["duration_seconds"] == 3170 110 + assert fm["summary_text"] == "Discussed Q1 priorities and resource allocation." 111 + assert "Alice Smith (15:04:12)" in body 112 + 113 + 114 + def test_parse_muesli_file_missing_fields(tmp_path): 115 + """Handles files with minimal frontmatter.""" 116 + from think.importers.granola import _parse_muesli_file 117 + 118 + content = dedent("""\ 119 + --- 120 + doc_id: minimal 121 + --- 122 + 123 + **Speaker (10:00:00):** Hello. 
124 + """) 125 + path = _write_transcript(tmp_path, "minimal.md", content) 126 + fm, body = _parse_muesli_file(path) 127 + 128 + assert fm["doc_id"] == "minimal" 129 + assert "title" not in fm 130 + assert "Speaker (10:00:00)" in body 131 + 132 + 133 + # --------------------------------------------------------------------------- 134 + # Participant parsing 135 + # --------------------------------------------------------------------------- 136 + 137 + 138 + def test_parse_participants(): 139 + """Extract participant info from ## Participants section.""" 140 + from think.importers.granola import _parse_participants 141 + 142 + participants = _parse_participants(SAMPLE_TRANSCRIPT) 143 + assert len(participants) == 2 144 + 145 + alice = participants[0] 146 + assert alice["name"] == "Alice Smith" 147 + assert alice["email"] == "alice@acme.com" 148 + assert alice["title"] == "Engineering Manager" 149 + assert alice["company"] == "Acme Corp" 150 + 151 + bob = participants[1] 152 + assert bob["name"] == "Bob Jones" 153 + assert bob["email"] == "bob@jones.io" 154 + 155 + 156 + def test_parse_participants_with_linkedin(): 157 + """Extract LinkedIn handle from participant line.""" 158 + from think.importers.granola import _parse_participants 159 + 160 + body = dedent("""\ 161 + ## Participants 162 + 163 + - **Jane Doe**, CTO, StartupCo, linkedin.com/in/janedoe, (jane@startup.co) 164 + """) 165 + participants = _parse_participants(body) 166 + assert len(participants) == 1 167 + assert participants[0]["name"] == "Jane Doe" 168 + assert participants[0]["email"] == "jane@startup.co" 169 + assert participants[0]["linkedin"] == "janedoe" 170 + assert participants[0]["title"] == "CTO" 171 + assert participants[0]["company"] == "StartupCo" 172 + 173 + 174 + def test_parse_participants_no_section(): 175 + """Returns empty list when no ## Participants section exists.""" 176 + from think.importers.granola import _parse_participants 177 + 178 + assert _parse_participants("# 
Meeting\n\nJust some text.") == [] 179 + 180 + 181 + def test_parse_participants_wikilinks(): 182 + """Strips [[wikilink]] brackets from participant names.""" 183 + from think.importers.granola import _parse_participants 184 + 185 + body = dedent("""\ 186 + ## Participants 187 + 188 + - **[[Alice Smith]]**, (alice@acme.com) 189 + """) 190 + participants = _parse_participants(body) 191 + assert participants[0]["name"] == "Alice Smith" 192 + 193 + 194 + # --------------------------------------------------------------------------- 195 + # Transcript entry parsing 196 + # --------------------------------------------------------------------------- 197 + 198 + 199 + def test_parse_transcript_entries(): 200 + """Parse speaker-labeled entries into timestamped messages.""" 201 + import datetime as dt 202 + 203 + from think.importers.granola import _parse_transcript_entries 204 + 205 + base_dt = dt.datetime(2025, 10, 28, 0, 0, 0, tzinfo=dt.timezone.utc) 206 + messages = _parse_transcript_entries(SAMPLE_TRANSCRIPT, base_dt, dt.timezone.utc) 207 + 208 + assert len(messages) == 6 209 + assert messages[0]["speaker"] == "Alice Smith" 210 + assert messages[0]["text"] == "Welcome to the planning session." 211 + assert messages[0]["model_slug"] is None 212 + 213 + assert messages[1]["speaker"] == "Bob Jones" 214 + assert messages[1]["text"] == "Thanks, let's get started." 
215 + 216 + # Timestamps should be in order 217 + for i in range(1, len(messages)): 218 + assert messages[i]["create_time"] >= messages[i - 1]["create_time"] 219 + 220 + 221 + def test_parse_transcript_entries_empty_body(): 222 + """Returns empty list when no transcript entries found.""" 223 + import datetime as dt 224 + 225 + from think.importers.granola import _parse_transcript_entries 226 + 227 + base_dt = dt.datetime(2025, 10, 28, 0, 0, 0, tzinfo=dt.timezone.utc) 228 + messages = _parse_transcript_entries("No transcript here.", base_dt, dt.timezone.utc) 229 + assert messages == [] 230 + 231 + 232 + # --------------------------------------------------------------------------- 233 + # Date parsing 234 + # --------------------------------------------------------------------------- 235 + 236 + 237 + def test_date_from_filename(): 238 + """Extract date from muesli filename.""" 239 + from think.importers.granola import _date_from_filename 240 + 241 + import datetime as dt 242 + 243 + assert _date_from_filename("2025-10-28_q1-planning.md") == dt.date(2025, 10, 28) 244 + assert _date_from_filename("random-file.md") is None 245 + assert _date_from_filename("2025-01-01.md") == dt.date(2025, 1, 1) 246 + 247 + 248 + def test_parse_created_at(): 249 + """Parse created_at from frontmatter with timezone.""" 250 + import datetime as dt 251 + 252 + from think.importers.granola import _parse_created_at 253 + 254 + fm = {"created_at": "2025-10-28T15:04:05Z"} 255 + created_dt, tz = _parse_created_at(fm, "test.md") 256 + assert created_dt is not None 257 + assert tz == dt.timezone.utc 258 + 259 + 260 + def test_parse_created_at_fallback_to_filename(): 261 + """Falls back to filename date when created_at is missing.""" 262 + from think.importers.granola import _parse_created_at 263 + 264 + fm = {} 265 + created_dt, tz = _parse_created_at(fm, "2025-10-28_meeting.md") 266 + assert created_dt is not None 267 + assert tz is None # naive/local 268 + 269 + 270 + def 
test_parse_created_at_no_date(): 271 + """Returns None when no date source available.""" 272 + from think.importers.granola import _parse_created_at 273 + 274 + fm = {} 275 + created_dt, tz = _parse_created_at(fm, "random.md") 276 + assert created_dt is None 277 + 278 + 279 + # --------------------------------------------------------------------------- 280 + # GranolaBackend protocol 281 + # --------------------------------------------------------------------------- 282 + 283 + 284 + def test_granola_protocol_conformance(): 285 + """GranolaBackend satisfies SyncableBackend protocol.""" 286 + from think.importers.granola import GranolaBackend 287 + from think.importers.sync import SyncableBackend 288 + 289 + assert isinstance(GranolaBackend(), SyncableBackend) 290 + 291 + 292 + def test_granola_in_registry(): 293 + """Granola is registered in the syncable backend registry.""" 294 + from think.importers.sync import get_syncable_backends 295 + 296 + backends = get_syncable_backends() 297 + names = [b.name for b in backends] 298 + assert "granola" in names 299 + 300 + 301 + # --------------------------------------------------------------------------- 302 + # GranolaBackend.sync() — detection 303 + # --------------------------------------------------------------------------- 304 + 305 + 306 + def test_granola_sync_no_muesli(tmp_path): 307 + """Raises ValueError when muesli is not installed.""" 308 + from think.importers.granola import GranolaBackend 309 + 310 + nonexistent = tmp_path / "nonexistent" / "transcripts" 311 + with pytest.raises(ValueError, match="muesli to extract"): 312 + GranolaBackend().sync(tmp_path, source_path=nonexistent) 313 + 314 + 315 + def test_granola_sync_no_transcripts(tmp_path): 316 + """Raises ValueError when muesli dir exists but no transcripts.""" 317 + from think.importers.granola import GranolaBackend 318 + 319 + muesli_dir = tmp_path / "muesli" / "transcripts" 320 + # Create parent but not transcripts dir 321 + (tmp_path / 
"muesli").mkdir() 322 + with pytest.raises(ValueError, match="no transcripts found"): 323 + GranolaBackend().sync(tmp_path, source_path=muesli_dir) 324 + 325 + 326 + # --------------------------------------------------------------------------- 327 + # GranolaBackend.sync() — catalog mode 328 + # --------------------------------------------------------------------------- 329 + 330 + 331 + def test_granola_sync_dry_run(tmp_path): 332 + """Dry-run catalogs transcripts and saves state.""" 333 + from think.importers.granola import GranolaBackend 334 + from think.importers.sync import load_sync_state 335 + 336 + muesli_dir = tmp_path / "muesli" 337 + _write_transcript(muesli_dir, "2025-10-28_q1.md", SAMPLE_TRANSCRIPT) 338 + _write_transcript(muesli_dir, "stub.md", STUB_TRANSCRIPT) 339 + 340 + result = GranolaBackend().sync(tmp_path, source_path=muesli_dir) 341 + 342 + assert result["total"] == 2 343 + assert result["available"] == 1 # only the one with transcript content 344 + assert result["skipped"] == 1 # stub has no transcript 345 + assert result["imported"] == 0 346 + assert result["downloaded"] == 0 347 + 348 + # State was saved 349 + state = load_sync_state(tmp_path, "granola") 350 + assert state is not None 351 + assert len(state["files"]) == 2 352 + assert state["files"]["doc_test_001"]["status"] == "available" 353 + assert state["files"]["doc_stub_002"]["status"] == "skipped" 354 + assert state["files"]["doc_stub_002"]["skip_reason"] == "no_transcript" 355 + 356 + 357 + def test_granola_sync_skips_no_docid(tmp_path): 358 + """Transcripts without doc_id in frontmatter are skipped.""" 359 + from think.importers.granola import GranolaBackend 360 + 361 + muesli_dir = tmp_path / "muesli" 362 + _write_transcript(muesli_dir, "orphan.md", NO_DOCID_TRANSCRIPT) 363 + 364 + result = GranolaBackend().sync(tmp_path, source_path=muesli_dir) 365 + assert result["total"] == 0 366 + assert result["available"] == 0 367 + 368 + 369 + # 
--------------------------------------------------------------------------- 370 + # GranolaBackend.sync() — incremental 371 + # --------------------------------------------------------------------------- 372 + 373 + 374 + def test_granola_sync_incremental(tmp_path): 375 + """Second sync skips already-imported transcripts.""" 376 + from think.importers.granola import GranolaBackend 377 + from think.importers.sync import load_sync_state, save_sync_state 378 + 379 + muesli_dir = tmp_path / "muesli" 380 + _write_transcript(muesli_dir, "2025-10-28_q1.md", SAMPLE_TRANSCRIPT) 381 + 382 + # Pre-seed state: doc_test_001 already imported 383 + save_sync_state( 384 + tmp_path, 385 + "granola", 386 + { 387 + "backend": "granola", 388 + "files": { 389 + "doc_test_001": { 390 + "filename": "2025-10-28_q1.md", 391 + "remote_updated_at": "2025-10-29T01:23:45Z", 392 + "status": "imported", 393 + "imported_at": "2026-03-01T00:00:00", 394 + }, 395 + }, 396 + }, 397 + ) 398 + 399 + result = GranolaBackend().sync(tmp_path, source_path=muesli_dir) 400 + 401 + assert result["imported"] == 1 402 + assert result["available"] == 0 # nothing new 403 + 404 + 405 + def test_granola_sync_detects_updated(tmp_path): 406 + """Re-imports when remote_updated_at is newer than last sync.""" 407 + from think.importers.granola import GranolaBackend 408 + from think.importers.sync import save_sync_state 409 + 410 + muesli_dir = tmp_path / "muesli" 411 + _write_transcript(muesli_dir, "2025-10-28_q1.md", SAMPLE_TRANSCRIPT) 412 + 413 + # Pre-seed with older remote_updated_at 414 + save_sync_state( 415 + tmp_path, 416 + "granola", 417 + { 418 + "backend": "granola", 419 + "files": { 420 + "doc_test_001": { 421 + "filename": "2025-10-28_q1.md", 422 + "remote_updated_at": "2025-10-28T00:00:00Z", # older 423 + "status": "imported", 424 + }, 425 + }, 426 + }, 427 + ) 428 + 429 + result = GranolaBackend().sync(tmp_path, source_path=muesli_dir) 430 + assert result["available"] == 1 # updated, needs re-import 431 + 
432 + 433 + def test_granola_sync_detects_removed(tmp_path): 434 + """Marks files as removed when they disappear from muesli dir.""" 435 + from think.importers.granola import GranolaBackend 436 + from think.importers.sync import load_sync_state, save_sync_state 437 + 438 + muesli_dir = tmp_path / "muesli" 439 + muesli_dir.mkdir(parents=True) 440 + # No files in dir, but state has one 441 + 442 + save_sync_state( 443 + tmp_path, 444 + "granola", 445 + { 446 + "backend": "granola", 447 + "files": { 448 + "doc_gone": { 449 + "filename": "old.md", 450 + "status": "imported", 451 + }, 452 + }, 453 + }, 454 + ) 455 + 456 + GranolaBackend().sync(tmp_path, source_path=muesli_dir) 457 + state = load_sync_state(tmp_path, "granola") 458 + assert state["files"]["doc_gone"]["status"] == "removed" 459 + 460 + 461 + def test_granola_sync_force(tmp_path): 462 + """Force flag clears state and re-imports everything.""" 463 + from think.importers.granola import GranolaBackend 464 + from think.importers.sync import save_sync_state 465 + 466 + muesli_dir = tmp_path / "muesli" 467 + _write_transcript(muesli_dir, "2025-10-28_q1.md", SAMPLE_TRANSCRIPT) 468 + 469 + # Pre-seed as already imported 470 + save_sync_state( 471 + tmp_path, 472 + "granola", 473 + { 474 + "backend": "granola", 475 + "files": { 476 + "doc_test_001": { 477 + "filename": "2025-10-28_q1.md", 478 + "remote_updated_at": "2025-10-29T01:23:45Z", 479 + "status": "imported", 480 + }, 481 + }, 482 + }, 483 + ) 484 + 485 + result = GranolaBackend().sync(tmp_path, source_path=muesli_dir, force=True) 486 + assert result["available"] == 1 # force causes re-import 487 + 488 + 489 + # --------------------------------------------------------------------------- 490 + # GranolaBackend.sync() — import mode 491 + # --------------------------------------------------------------------------- 492 + 493 + 494 + def test_granola_sync_import(tmp_path, monkeypatch): 495 + """Import mode writes segments and updates state.""" 496 + from 
think.importers.granola import GranolaBackend 497 + from think.importers.sync import load_sync_state 498 + 499 + # Point journal to tmp_path 500 + monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 501 + 502 + muesli_dir = tmp_path / "muesli" 503 + _write_transcript(muesli_dir, "2025-10-28_q1.md", SAMPLE_TRANSCRIPT) 504 + 505 + result = GranolaBackend().sync( 506 + tmp_path, source_path=muesli_dir, dry_run=False 507 + ) 508 + 509 + assert result["downloaded"] == 1 510 + assert result["imported"] == 1 511 + assert result["available"] == 0 512 + assert result["errors"] == [] 513 + 514 + # State updated 515 + state = load_sync_state(tmp_path, "granola") 516 + doc = state["files"]["doc_test_001"] 517 + assert doc["status"] == "imported" 518 + assert "imported_at" in doc 519 + assert doc["segments"] > 0 520 + 521 + # Verify segments were written 522 + # The transcript spans 15:04:12 to 15:09:45 (local time from UTC), 523 + # so we check for import.granola directories 524 + import glob 525 + 526 + segments = glob.glob(str(tmp_path / "*/import.granola/*/conversation_transcript.jsonl")) 527 + assert len(segments) >= 1 528 + 529 + # Check JSONL content of first segment 530 + with open(segments[0], "r") as f: 531 + lines = f.readlines() 532 + 533 + # First line is metadata 534 + meta = json.loads(lines[0]) 535 + assert "imported" in meta 536 + assert meta["imported"]["id"] == "doc_test_001" 537 + assert meta["topics"] == "Q1 Planning" 538 + 539 + # Subsequent lines are entries 540 + entry = json.loads(lines[1]) 541 + assert "speaker" in entry 542 + assert "text" in entry 543 + assert "start" in entry 544 + assert entry["source"] == "import" 545 + 546 + # Check source.md was copied 547 + source_files = glob.glob(str(tmp_path / "*/import.granola/*/source.md")) 548 + assert len(source_files) == 1 # only in first segment 549 + 550 + 551 + # --------------------------------------------------------------------------- 552 + # CLI integration 553 + # 
--------------------------------------------------------------------------- 554 + 555 + 556 + def test_granola_backends_cli_flag(capsys, monkeypatch): 557 + """sol import --backends lists granola.""" 558 + import sys 559 + 560 + from think.importers.cli import main 561 + 562 + monkeypatch.setattr(sys, "argv", ["sol import", "--backends"]) 563 + monkeypatch.setenv("JOURNAL_PATH", "/tmp/test-journal") 564 + main() 565 + captured = capsys.readouterr() 566 + assert "granola" in captured.out 567 + 568 + 569 + def test_granola_sync_cli(capsys, monkeypatch, tmp_path): 570 + """sol import --sync granola --path <dir> runs catalog.""" 571 + import sys 572 + 573 + from think.importers.cli import main 574 + 575 + muesli_dir = tmp_path / "muesli" 576 + _write_transcript(muesli_dir, "2025-10-28_q1.md", SAMPLE_TRANSCRIPT) 577 + 578 + monkeypatch.setattr( 579 + sys, 580 + "argv", 581 + ["sol import", "--sync", "granola", "--path", str(muesli_dir)], 582 + ) 583 + monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 584 + 585 + main() 586 + captured = capsys.readouterr() 587 + assert "Total:" in captured.out 588 + assert "Available to import:" in captured.out 589 + assert "Q1 Planning" in captured.out # title shown in available list

+1 -1

tests/test_importer_sync.py

··· 426 426 main() 427 427 428 428 captured = capsys.readouterr() 429 - assert "Total recordings:" in captured.out 429 + assert "Total:" in captured.out 430 430 assert "Available to import:" in captured.out

+35 -9

think/importers/cli.py

··· 107 107 return timestamp 108 108 109 109 110 - def _run_sync(backend_name: str, *, dry_run: bool = True) -> None: 110 + def _run_sync(backend_name: str, *, dry_run: bool = True, **extra: Any) -> None: 111 111 """Run sync for a named backend and print results.""" 112 + import inspect 113 + 112 114 from think.importers.plaud import format_size 113 115 from think.importers.sync import get_syncable_backends, load_sync_state 114 116 ··· 131 133 mode = "save" if not dry_run else "catalog" 132 134 print(f"Syncing {backend_name} ({mode} mode)...") 133 135 print() 136 + 137 + # Pass extra kwargs only if the backend accepts them 138 + sync_kwargs: dict[str, Any] = {"dry_run": dry_run} 139 + sig = inspect.signature(backend.sync) 140 + for key, value in extra.items(): 141 + if key in sig.parameters and value is not None: 142 + sync_kwargs[key] = value 134 143 135 144 try: 136 - result = backend.sync(journal_root, dry_run=dry_run) 145 + result = backend.sync(journal_root, **sync_kwargs) 137 146 except ValueError as e: 138 147 raise SystemExit(str(e)) 139 148 except RuntimeError as e: ··· 148 157 149 158 # Print summary 150 159 print() 151 - print(f" Total recordings: {total}") 160 + print(f" Total: {total}") 152 161 print(f" Already imported: {imported}") 153 162 print(f" Available to import: {available}") 154 163 if skipped: 155 - print(f" Skipped: {skipped} (trashed/short)") 164 + print(f" Skipped: {skipped}") 156 165 157 166 if downloaded > 0: 158 - print(f" Downloaded + imported: {downloaded}") 167 + print(f" Imported this run: {downloaded}") 159 168 if errors: 160 169 print(f" Errors: {len(errors)}") 161 170 for err in errors: ··· 173 182 ] 174 183 if avail_files: 175 184 print() 176 - print("Available recordings:") 185 + print("Available:") 177 186 for _fid, info in avail_files: 178 187 name = info.get("filename", "unnamed") 188 + title = info.get("title", "") 179 189 size = info.get("filesize", 0) 180 - print(f" - {name} ({format_size(size)})") 190 + if title: 191 + 
print(f" - {title} ({name})") 192 + elif size: 193 + print(f" - {name} ({format_size(size)})") 194 + else: 195 + print(f" - {name}") 181 196 print() 182 - print("Run with --save to download and import these files:") 197 + print(f"Run with --save to import:") 183 198 print(f" sol import --sync {backend_name} --save") 184 199 185 200 if not dry_run and available == 0 and downloaded == 0: ··· 248 263 help="With --sync: download and import new files (default is dry-run)", 249 264 ) 250 265 parser.add_argument( 266 + "--path", 267 + type=str, 268 + default=None, 269 + help="With --sync: override the default source directory path", 270 + ) 271 + parser.add_argument( 251 272 "--list-importers", 252 273 action="store_true", 253 274 help="List available file importers", ··· 302 323 return 303 324 304 325 if args.sync: 305 - _run_sync(args.sync, dry_run=not args.save) 326 + extra: dict[str, Any] = {} 327 + if args.path: 328 + extra["source_path"] = Path(os.path.expanduser(args.path)) 329 + if args.force: 330 + extra["force"] = True 331 + _run_sync(args.sync, dry_run=not args.save, **extra) 306 332 return 307 333 308 334 if not args.media:

+481

think/importers/granola.py

··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + """Granola meeting transcript importer via muesli. 5 + 6 + Reads local markdown files produced by muesli (harper reed's Granola extractor) 7 + from ~/.local/share/muesli/transcripts/ and imports them into the journal as 8 + speaker-labeled transcript segments. 9 + 10 + Architecture: SyncableBackend registered as "granola" in the sync registry. 11 + No network calls — reads local files only. 12 + """ 13 + 14 + import datetime as dt 15 + import logging 16 + import re 17 + import shutil 18 + from pathlib import Path 19 + from typing import Any 20 + 21 + import frontmatter 22 + 23 + from think.importers.shared import ( 24 + _window_messages, 25 + seed_entities, 26 + write_segment, 27 + ) 28 + from think.importers.sync import load_sync_state, save_sync_state 29 + from think.utils import day_path 30 + 31 + logger = logging.getLogger(__name__) 32 + 33 + DEFAULT_MUESLI_PATH = Path.home() / ".local" / "share" / "muesli" / "transcripts" 34 + 35 + # **Name (HH:MM:SS):** text 36 + TRANSCRIPT_RE = re.compile( 37 + r"^\*\*(.+?)\s*\((\d{1,2}:\d{2}:\d{2})\):\*\*\s*(.+)", 38 + re.MULTILINE, 39 + ) 40 + 41 + # - **Name**, Title, Company, (email) 42 + PARTICIPANT_RE = re.compile( 43 + r"^\s*-\s*\*\*(.+?)\*\*(?:,\s*(.+))?$", 44 + re.MULTILINE, 45 + ) 46 + 47 + EMAIL_RE = re.compile(r"\(([^)]+@[^)]+)\)") 48 + LINKEDIN_RE = re.compile(r"linkedin\.com/in/([^\s,)]+)", re.IGNORECASE) 49 + 50 + # YYYY-MM-DD at the start of a filename 51 + FILENAME_DATE_RE = re.compile(r"^(\d{4})-(\d{2})-(\d{2})") 52 + 53 + 54 + def _parse_muesli_file(path: Path) -> tuple[dict[str, Any], str]: 55 + """Parse a muesli markdown file into (frontmatter_dict, body_text).""" 56 + post = frontmatter.load(str(path)) 57 + return dict(post.metadata), post.content 58 + 59 + 60 + def _parse_participants(body: str) -> list[dict[str, Any]]: 61 + """Extract enriched participant info from the ## Participants section. 
62 + 63 + Returns list of dicts with keys: name, email, title, company, linkedin. 64 + """ 65 + section_match = re.search( 66 + r"^##\s*Participants\s*\n(.*?)(?=^##|\Z)", 67 + body, 68 + re.MULTILINE | re.DOTALL, 69 + ) 70 + if not section_match: 71 + return [] 72 + 73 + section = section_match.group(1) 74 + participants: list[dict[str, Any]] = [] 75 + 76 + for match in PARTICIPANT_RE.finditer(section): 77 + name = match.group(1).strip() 78 + # Strip [[wikilink]] brackets 79 + name = name.replace("[[", "").replace("]]", "") 80 + 81 + rest = match.group(2) or "" 82 + 83 + # Extract email: (user@domain.com) 84 + email = "" 85 + email_match = EMAIL_RE.search(rest) 86 + if email_match: 87 + email = email_match.group(1).strip() 88 + rest = rest[: email_match.start()] + rest[email_match.end() :] 89 + 90 + # Extract LinkedIn handle 91 + linkedin = "" 92 + linkedin_match = LINKEDIN_RE.search(rest) 93 + if linkedin_match: 94 + linkedin = linkedin_match.group(1).strip() 95 + rest = rest[: linkedin_match.start()] + rest[linkedin_match.end() :] 96 + 97 + # Remaining comma-separated parts: title, company 98 + parts = [p.strip() for p in rest.split(",") if p.strip()] 99 + title = parts[0] if len(parts) >= 1 else "" 100 + company = parts[1] if len(parts) >= 2 else "" 101 + 102 + participants.append( 103 + { 104 + "name": name, 105 + "email": email, 106 + "title": title, 107 + "company": company, 108 + "linkedin": linkedin, 109 + } 110 + ) 111 + 112 + return participants 113 + 114 + 115 + def _date_from_filename(filename: str) -> dt.date | None: 116 + """Extract date from muesli filename like 2025-10-28_q1-planning.md.""" 117 + match = FILENAME_DATE_RE.match(filename) 118 + if match: 119 + return dt.date(int(match.group(1)), int(match.group(2)), int(match.group(3))) 120 + return None 121 + 122 + 123 + def _parse_created_at( 124 + fm: dict[str, Any], filename: str 125 + ) -> tuple[dt.datetime | None, dt.timezone | None]: 126 + """Parse created_at from frontmatter, falling back to 
filename date. 127 + 128 + Returns (datetime, timezone) where timezone is UTC if created_at had Z/+00:00, 129 + or None for naive/local time. 130 + """ 131 + created_at_str = fm.get("created_at", "") 132 + if created_at_str: 133 + try: 134 + created_dt = dt.datetime.fromisoformat( 135 + str(created_at_str).replace("Z", "+00:00") 136 + ) 137 + tz = created_dt.tzinfo 138 + return created_dt, tz 139 + except (ValueError, TypeError): 140 + pass 141 + 142 + # Fall back to filename date 143 + base_date = _date_from_filename(filename) 144 + if base_date: 145 + return dt.datetime(base_date.year, base_date.month, base_date.day), None 146 + return None, None 147 + 148 + 149 + def _parse_transcript_entries( 150 + body: str, 151 + base_dt: dt.datetime, 152 + tz: dt.timezone | None, 153 + ) -> list[dict[str, Any]]: 154 + """Parse speaker-labeled transcript entries into message dicts. 155 + 156 + Each message has: create_time (float epoch), speaker, text, model_slug. 157 + """ 158 + base_date = base_dt.date() if tz is None else base_dt.astimezone(tz).date() 159 + messages: list[dict[str, Any]] = [] 160 + 161 + for match in TRANSCRIPT_RE.finditer(body): 162 + speaker = match.group(1).strip() 163 + time_str = match.group(2) 164 + text = match.group(3).strip() 165 + 166 + h, m, s = (int(p) for p in time_str.split(":")) 167 + 168 + if tz is not None: 169 + msg_dt = dt.datetime( 170 + base_date.year, base_date.month, base_date.day, h, m, s, tzinfo=tz 171 + ) 172 + else: 173 + msg_dt = dt.datetime( 174 + base_date.year, base_date.month, base_date.day, h, m, s 175 + ) 176 + 177 + messages.append( 178 + { 179 + "create_time": msg_dt.timestamp(), 180 + "speaker": speaker, 181 + "text": text, 182 + "model_slug": None, 183 + } 184 + ) 185 + 186 + return messages 187 + 188 + 189 + def _import_transcript( 190 + md_file: Path, 191 + fm: dict[str, Any], 192 + body: str, 193 + journal_root: Path, 194 + ) -> tuple[list[str], list[tuple[str, str]], int]: 195 + """Import a single muesli transcript 
into the journal. 196 + 197 + Returns (created_files, segments, entities_seeded). 198 + """ 199 + doc_id = fm.get("doc_id", md_file.stem) 200 + title = fm.get("title", "") 201 + duration_seconds = fm.get("duration_seconds") 202 + summary_text = fm.get("summary_text", "") 203 + 204 + created_dt, tz = _parse_created_at(fm, md_file.name) 205 + if created_dt is None: 206 + logger.warning("No date for %s -- skipping", md_file.name) 207 + return [], [], 0 208 + 209 + # Parse transcript entries 210 + messages = _parse_transcript_entries(body, created_dt, tz) 211 + if not messages: 212 + return [], [], 0 213 + 214 + messages.sort(key=lambda m: m["create_time"]) 215 + 216 + # Estimate duration if missing 217 + if not duration_seconds and len(messages) >= 2: 218 + duration_seconds = int(messages[-1]["create_time"] - messages[0]["create_time"]) 219 + if not duration_seconds: 220 + duration_seconds = 300 221 + 222 + # Window into 5-min segments 223 + windows = _window_messages(messages) 224 + 225 + created_files: list[str] = [] 226 + segments: list[tuple[str, str]] = [] 227 + source_copied = False 228 + 229 + for day, seg_key, _model, entries in windows: 230 + day_dir = str(day_path(day)) 231 + 232 + # Build setting string with summary if available 233 + setting = "meeting" 234 + if summary_text: 235 + setting = f"meeting -- {summary_text}" 236 + 237 + json_path = write_segment( 238 + day_dir, 239 + "import.granola", 240 + seg_key, 241 + entries, 242 + import_id=doc_id, 243 + topics=title, 244 + detected_setting=setting, 245 + ) 246 + created_files.append(json_path) 247 + segments.append((day, seg_key)) 248 + 249 + # Copy source markdown into first segment for ground-truth preservation 250 + if not source_copied: 251 + import os 252 + 253 + seg_dir = os.path.dirname(json_path) 254 + source_dest = os.path.join(seg_dir, "source.md") 255 + shutil.copy2(str(md_file), source_dest) 256 + created_files.append(source_dest) 257 + source_copied = True 258 + 259 + # Seed entities from 
participants 260 + entities_seeded = 0 261 + participants = _parse_participants(body) 262 + if participants and segments: 263 + first_day = segments[0][0] 264 + entity_dicts = [ 265 + {"name": p["name"], "type": "Person", "email": p["email"]} 266 + for p in participants 267 + if p["name"] 268 + ] 269 + if entity_dicts: 270 + try: 271 + seeded = seed_entities("import.granola", first_day, entity_dicts) 272 + entities_seeded = len(seeded) 273 + except Exception as exc: 274 + logger.warning( 275 + "Entity seeding failed for %s: %s", md_file.name, exc 276 + ) 277 + 278 + return created_files, segments, entities_seeded 279 + 280 + 281 + class GranolaBackend: 282 + """Syncable backend for Granola meeting transcripts via muesli.""" 283 + 284 + name: str = "granola" 285 + 286 + def sync( 287 + self, 288 + journal_root: Path, 289 + *, 290 + dry_run: bool = True, 291 + source_path: Path | None = None, 292 + force: bool = False, 293 + ) -> dict[str, Any]: 294 + """Sync Granola transcripts from muesli local files. 295 + 296 + Scans the muesli transcripts directory for markdown files, compares 297 + against sync state, and imports new/updated transcripts into the journal. 298 + 299 + Args: 300 + journal_root: Path to the journal root directory. 301 + dry_run: If True, catalog only (no import). If False, import. 302 + source_path: Override muesli transcripts directory path. 303 + force: If True, ignore sync state and re-import everything. 304 + 305 + Returns: 306 + Summary dict with total, imported, available, skipped, downloaded, errors. 307 + """ 308 + muesli_dir = source_path or DEFAULT_MUESLI_PATH 309 + 310 + # Three-state setup detection 311 + if not muesli_dir.exists(): 312 + parent = muesli_dir.parent 313 + if parent.exists(): 314 + raise ValueError( 315 + "Muesli is installed but no transcripts found.\n" 316 + "Run `muesli sync` to extract your Granola transcripts, " 317 + "then try again." 
318 + ) 319 + raise ValueError( 320 + "Granola transcripts require muesli to extract.\n" 321 + "Install: cargo install --git " 322 + "https://github.com/harperreed/muesli.git --all-features\n" 323 + "Then run: muesli sync" 324 + ) 325 + 326 + # Load or initialize sync state 327 + state = load_sync_state(journal_root, "granola") or { 328 + "backend": "granola", 329 + "source_path": str(muesli_dir), 330 + "files": {}, 331 + } 332 + 333 + if force: 334 + state["files"] = {} 335 + 336 + known_files: dict[str, dict[str, Any]] = state.get("files", {}) 337 + 338 + # Scan muesli transcripts directory 339 + md_files = sorted(muesli_dir.glob("*.md")) 340 + 341 + # Parse each file and determine sync status 342 + to_import: list[tuple[Path, dict[str, Any], str]] = [] 343 + current_doc_ids: set[str] = set() 344 + skipped_count = 0 345 + 346 + for md_file in md_files: 347 + try: 348 + fm, body = _parse_muesli_file(md_file) 349 + except Exception as exc: 350 + logger.warning("Failed to parse %s: %s", md_file.name, exc) 351 + continue 352 + 353 + doc_id = fm.get("doc_id") 354 + if not doc_id: 355 + logger.debug("No doc_id in %s -- skipping", md_file.name) 356 + skipped_count += 1 357 + continue 358 + 359 + current_doc_ids.add(doc_id) 360 + remote_updated_at = str(fm.get("remote_updated_at", "")) 361 + title = fm.get("title", md_file.stem) 362 + duration = fm.get("duration_seconds", 0) 363 + 364 + # Check for transcript content 365 + has_transcript = bool(TRANSCRIPT_RE.search(body)) 366 + if not has_transcript: 367 + if doc_id not in known_files: 368 + known_files[doc_id] = { 369 + "filename": md_file.name, 370 + "title": title, 371 + "remote_updated_at": remote_updated_at, 372 + "status": "skipped", 373 + "skip_reason": "no_transcript", 374 + } 375 + skipped_count += 1 376 + continue 377 + 378 + # Check sync state for this doc 379 + if doc_id in known_files and not force: 380 + existing = known_files[doc_id] 381 + if existing.get("status") == "imported": 382 + existing_updated = 
existing.get("remote_updated_at", "") 383 + if remote_updated_at <= existing_updated: 384 + # Unchanged -- skip 385 + continue 386 + # Updated -- will re-import 387 + 388 + # Mark as available for import 389 + known_files.setdefault(doc_id, {}) 390 + known_files[doc_id].update( 391 + { 392 + "filename": md_file.name, 393 + "title": title, 394 + "remote_updated_at": remote_updated_at, 395 + "duration_seconds": duration, 396 + "status": "available", 397 + } 398 + ) 399 + to_import.append((md_file, fm, doc_id)) 400 + 401 + # Detect deleted files (in state but gone from muesli dir) 402 + for doc_id, info in known_files.items(): 403 + if doc_id not in current_doc_ids and info.get("status") not in ( 404 + "removed", 405 + "skipped", 406 + ): 407 + info["status"] = "removed" 408 + 409 + # Compute summary 410 + total = len(known_files) 411 + imported = sum( 412 + 1 for f in known_files.values() if f.get("status") == "imported" 413 + ) 414 + available = len(to_import) 415 + skipped_total = sum( 416 + 1 for f in known_files.values() if f.get("status") == "skipped" 417 + ) 418 + 419 + result: dict[str, Any] = { 420 + "total": total, 421 + "imported": imported, 422 + "available": available, 423 + "skipped": skipped_total, 424 + "downloaded": 0, 425 + "errors": [], 426 + } 427 + 428 + # Import transcripts if not dry-run 429 + if not dry_run and to_import: 430 + downloaded = 0 431 + errors: list[str] = [] 432 + 433 + for md_file, _fm_partial, doc_id in to_import: 434 + try: 435 + fm_full, body = _parse_muesli_file(md_file) 436 + files, segs, entities = _import_transcript( 437 + md_file, fm_full, body, journal_root 438 + ) 439 + 440 + if files: 441 + known_files[doc_id]["status"] = "imported" 442 + known_files[doc_id][ 443 + "imported_at" 444 + ] = dt.datetime.now().isoformat() 445 + known_files[doc_id]["segments"] = len(segs) 446 + known_files[doc_id]["entities_seeded"] = entities 447 + downloaded += 1 448 + logger.info( 449 + "Imported %s: %d segments, %d entities", 450 + 
md_file.name, 451 + len(segs), 452 + entities, 453 + ) 454 + else: 455 + known_files[doc_id]["status"] = "skipped" 456 + known_files[doc_id]["skip_reason"] = "no_content" 457 + except Exception as exc: 458 + msg = f"{md_file.name}: {exc}" 459 + logger.warning("Import failed: %s", msg) 460 + errors.append(msg) 461 + 462 + result["downloaded"] = downloaded 463 + result["errors"] = errors 464 + result["imported"] = sum( 465 + 1 for f in known_files.values() if f.get("status") == "imported" 466 + ) 467 + result["available"] = sum( 468 + 1 for f in known_files.values() if f.get("status") == "available" 469 + ) 470 + 471 + # Save updated state 472 + state["files"] = known_files 473 + state["source_path"] = str(muesli_dir) 474 + state["last_sync"] = dt.datetime.now().isoformat() 475 + save_sync_state(journal_root, "granola", state) 476 + 477 + return result 478 + 479 + 480 + # Module-level backend instance for discovery 481 + backend = GranolaBackend()

+1

think/importers/sync.py

··· 23 23 24 24 SYNCABLE_REGISTRY: dict[str, str] = { 25 25 "plaud": "think.importers.plaud", 26 + "granola": "think.importers.granola", 26 27 } 27 28 28 29

Configure Feed

Configure Feed