Require explicit sources for all cluster functions

+40 -12

tests/test_cluster.py

··· 19 19 (day_dir / "120000_300" / "audio.jsonl").write_text('{}\n{"text": "hi"}\n') 20 20 (day_dir / "120500_300").mkdir() 21 21 (day_dir / "120500_300" / "screen.md").write_text("screen summary") 22 - result, counts = mod.cluster("20240101") 22 + result, counts = mod.cluster( 23 + "20240101", sources={"audio": True, "screen": False, "agents": True} 24 + ) 23 25 assert counts["audio"] == 1 24 26 assert counts["agents"] == 1 25 27 assert "Audio Transcript" in result ··· 28 30 29 31 30 32 def test_cluster_range(tmp_path, monkeypatch): 31 - """Test cluster_range with audio and agents parameters.""" 33 + """Test cluster_range with audio and agents sources.""" 32 34 monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 33 35 day_dir = day_path("20240101") 34 36 ··· 41 43 ) 42 44 (day_dir / "120000_300" / "screen.md").write_text("screen summary content") 43 45 # Test with agents=True to include *.md files 44 - md = mod.cluster_range("20240101", "120000", "120100", audio=True, agents=True) 46 + md = mod.cluster_range( 47 + "20240101", 48 + "120000", 49 + "120100", 50 + sources={"audio": True, "screen": False, "agents": True}, 51 + ) 45 52 # Check that the function works and includes expected sections 46 53 assert "Audio Transcript" in md 47 54 # Now uses insight rendering: "### {stem} summary" ··· 145 152 # Also create screen.md (insight) to verify it's NOT used by cluster_period 146 153 (segment / "screen.md").write_text("This insight should NOT appear") 147 154 148 - result, counts = mod.cluster_period("20240101", "100000_300") 155 + result, counts = mod.cluster_period( 156 + "20240101", 157 + "100000_300", 158 + sources={"audio": True, "screen": True, "agents": False}, 159 + ) 149 160 150 161 # Should have both audio and screen entries 151 162 assert counts["audio"] == 1 ··· 160 171 161 172 162 173 def test_cluster_range_with_agents(tmp_path, monkeypatch): 163 - """Test cluster_range with agents=True loads all *.md files.""" 174 + """Test cluster_range with agents source loads all *.md files.""" 164 175 monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 165 176 day_dir = day_path("20240101") 166 177 ··· 182 193 183 194 # Test agents=True returns *.md summaries, not raw screen data 184 195 result = mod.cluster_range( 185 - "20240101", "100000", "100500", audio=True, screen=False, agents=True 196 + "20240101", 197 + "100000", 198 + "100500", 199 + sources={"audio": True, "screen": False, "agents": True}, 186 200 ) 187 201 188 202 assert "Audio Transcript" in result ··· 196 210 197 211 198 212 def test_cluster_range_with_screen(tmp_path, monkeypatch): 199 - """Test cluster_range with screen=True loads raw screen.jsonl data.""" 213 + """Test cluster_range with screen source loads raw screen.jsonl data.""" 200 214 monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 201 215 day_dir = day_path("20240101") 202 216 ··· 213 227 214 228 # Test screen=True returns raw screen data, not agent outputs 215 229 result = mod.cluster_range( 216 - "20240101", "100000", "100500", audio=False, screen=True, agents=False 230 + "20240101", 231 + "100000", 232 + "100500", 233 + sources={"audio": False, "screen": True, "agents": False}, 217 234 ) 218 235 219 236 assert "Screen Activity" in result ··· 246 263 247 264 # Test screen=True returns data from both screen files 248 265 result = mod.cluster_range( 249 - "20240101", "100000", "100500", audio=False, screen=True, agents=False 266 + "20240101", 267 + "100000", 268 + "100500", 269 + sources={"audio": False, "screen": True, "agents": False}, 250 270 ) 251 271 252 272 # Should include content from both screen files ··· 318 338 '{"raw": "audio.flac"}\n{"start": "00:00:01", "text": "late morning segment"}\n' 319 339 ) 320 340 321 - # Process only first and third segments as a span 322 - result, counts = mod.cluster_span("20240101", ["090000_300", "110000_300"]) 341 + # Process only first and third segments as a span (audio only, no screen) 342 + result, counts = mod.cluster_span( 343 + "20240101", 344 + ["090000_300", "110000_300"], 345 + sources={"audio": True, "screen": False, "agents": False}, 346 + ) 323 347 324 348 # Should have 2 audio entries (one per segment) 325 349 assert counts["audio"] == 2 ··· 344 368 345 369 # Try to process existing and non-existing segments 346 370 with pytest.raises(ValueError) as exc_info: 347 - mod.cluster_span("20240101", ["090000_300", "100000_300"]) 371 + mod.cluster_span( 372 + "20240101", 373 + ["090000_300", "100000_300"], 374 + sources={"audio": True, "screen": False, "agents": False}, 375 + ) 348 376 349 377 assert "100000_300" in str(exc_info.value) 350 378 assert "not found" in str(exc_info.value)

+3 -1

tests/test_cluster_full.py

··· 28 28 mod = importlib.import_module("think.cluster") 29 29 copy_day(tmp_path) 30 30 monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 31 - md, counts = mod.cluster("20240101") 31 + md, counts = mod.cluster( 32 + "20240101", sources={"audio": True, "screen": False, "agents": True} 33 + ) 32 34 # Count: audio.jsonl (1) + audio.md (1) + screen.md (1) = 3 entries 33 35 assert counts["audio"] == 1 34 36 assert counts["agents"] == 2 # audio.md + screen.md

+26 -46

think/cluster.py

··· 377 377 378 378 def cluster( 379 379 day: str, 380 - sources: Dict[str, bool | str] | None = None, 380 + sources: Dict[str, bool | str], 381 381 ) -> Tuple[str, Dict[str, int]]: 382 382 """Return Markdown summary for one day's JSON files and counts by source. 383 - 384 - By default uses insight summaries (*.md files) rather than raw screen data 385 - for daily view. Override with sources parameter. 386 383 387 384 Args: 388 385 day: Day in YYYYMMDD format 389 - sources: Optional dict with keys "audio", "screen", "agents". 386 + sources: Dict with keys "audio", "screen", "agents". 390 387 Values can be bool or "required" string (see source_is_enabled). 391 - Defaults to {"audio": True, "screen": False, "agents": True}. 392 388 393 389 Returns: 394 390 Tuple of (markdown, source_counts) where source_counts is a dict 395 391 with keys "audio", "screen", "agents" mapping to entry counts. 396 392 """ 397 - # Default sources for daily generators: audio + agent summaries, no raw screen 398 - if sources is None: 399 - sources = {"audio": True, "screen": False, "agents": True} 400 - 401 393 empty_counts = {"audio": 0, "screen": 0, "agents": 0} 402 394 403 395 day_dir = str(day_path(day)) ··· 407 399 408 400 entries = _load_entries( 409 401 day_dir, 410 - audio=sources.get("audio", True), 402 + audio=sources.get("audio", False), 411 403 screen=sources.get("screen", False), 412 - agents=sources.get("agents", True), 404 + agents=sources.get("agents", False), 413 405 ) 414 406 if not entries: 415 407 return ( ··· 425 417 def cluster_period( 426 418 day: str, 427 419 segment: str, 428 - sources: Dict[str, bool | str] | None = None, 420 + sources: Dict[str, bool | str], 429 421 ) -> Tuple[str, Dict[str, int]]: 430 422 """Return Markdown summary for one segment's JSON files and counts by source. 431 423 432 - By default uses raw screen data for segment generators (more granular than summaries). 433 - Override with sources parameter. 434 - 435 424 Args: 436 425 day: Day in YYYYMMDD format 437 426 segment: Segment key in HHMMSS_LEN format (e.g., "163045_300") 438 - sources: Optional dict with keys "audio", "screen", "agents". 427 + sources: Dict with keys "audio", "screen", "agents". 439 428 Values can be bool or "required" string (see source_is_enabled). 440 - Defaults to {"audio": True, "screen": True, "agents": False}. 441 429 442 430 Returns: 443 431 Tuple of (markdown, source_counts) where source_counts is a dict 444 432 with keys "audio", "screen", "agents" mapping to entry counts. 445 433 """ 446 - # Default sources for segment generators: audio + raw screen, no agent summaries 447 - if sources is None: 448 - sources = {"audio": True, "screen": True, "agents": False} 449 - 450 434 empty_counts = {"audio": 0, "screen": 0, "agents": 0} 451 435 452 436 day_dir = str(day_path(day)) ··· 457 441 458 442 entries = _load_entries_from_segment( 459 443 str(segment_dir), 460 - audio=sources.get("audio", True), 461 - screen=sources.get("screen", True), 444 + audio=sources.get("audio", False), 445 + screen=sources.get("screen", False), 462 446 agents=sources.get("agents", False), 463 447 ) 464 448 if not entries: ··· 493 477 def cluster_span( 494 478 day: str, 495 479 span: List[str], 496 - sources: Dict[str, bool | str] | None = None, 480 + sources: Dict[str, bool | str], 497 481 ) -> Tuple[str, Dict[str, int]]: 498 482 """Return Markdown summary for a span of segments and counts by source. 499 483 500 484 A span is a list of sequential segment keys (e.g., from an import that created 501 485 multiple 5-minute segments from one audio file). 502 486 503 - By default uses raw screen data for segment generators (more granular than summaries). 504 487 Validates all segments exist before processing; raises ValueError if any are missing. 505 488 506 489 Args: 507 490 day: Day in YYYYMMDD format 508 491 span: List of segment keys in HHMMSS_LEN format (e.g., ["163045_300", "170000_600"]) 509 - sources: Optional dict with keys "audio", "screen", "agents". 492 + sources: Dict with keys "audio", "screen", "agents". 510 493 Values can be bool or "required" string (see source_is_enabled). 511 - Defaults to {"audio": True, "screen": True, "agents": False}. 512 494 513 495 Returns: 514 496 Tuple of (markdown, source_counts) where source_counts is a dict ··· 517 499 Raises: 518 500 ValueError: If any segment directories are missing 519 501 """ 520 - # Default sources for segment generators: audio + raw screen, no agent summaries 521 - if sources is None: 522 - sources = {"audio": True, "screen": True, "agents": False} 523 - 524 502 empty_counts = {"audio": 0, "screen": 0, "agents": 0} 525 503 day_dir = str(day_path(day)) 526 504 ··· 540 518 segment_dir = Path(day_dir) / segment_key 541 519 segment_entries = _load_entries_from_segment( 542 520 str(segment_dir), 543 - audio=sources.get("audio", True), 544 - screen=sources.get("screen", True), 521 + audio=sources.get("audio", False), 522 + screen=sources.get("screen", False), 545 523 agents=sources.get("agents", False), 546 524 ) 547 525 entries.extend(segment_entries) ··· 573 551 day: str, 574 552 start: str, 575 553 end: str, 576 - audio: bool = True, 577 - screen: bool = False, 578 - agents: bool = True, 554 + sources: Dict[str, bool], 579 555 ) -> str: 580 556 """Return markdown for ``day`` limited to ``start``-``end`` (HHMMSS). 581 557 ··· 586 562 day: Day in YYYYMMDD format 587 563 start: Start time in HHMMSS format 588 564 end: End time in HHMMSS format 589 - audio: Whether to include audio transcripts 590 - screen: Whether to include raw screen data from *screen.jsonl files 591 - agents: Whether to include agent output summaries from *.md files 565 + sources: Dict with keys "audio", "screen", "agents" (all bool). 592 566 """ 593 - 594 567 day_dir = str(day_path(day)) 595 568 date_str = _date_str(day_dir) 596 569 start_dt = datetime.strptime(date_str + start, "%Y%m%d%H%M%S") 597 570 end_dt = datetime.strptime(date_str + end, "%Y%m%d%H%M%S") 598 571 599 - entries = _load_entries(day_dir, audio, screen, agents) 572 + entries = _load_entries( 573 + day_dir, 574 + audio=sources.get("audio", False), 575 + screen=sources.get("screen", False), 576 + agents=sources.get("agents", False), 577 + ) 600 578 # Include segments that overlap with the requested range 601 579 entries = [ 602 580 e ··· 631 609 if args.start and args.length is not None: 632 610 start_dt = datetime.strptime(args.start, "%H%M%S") 633 611 end_dt = start_dt + timedelta(minutes=args.length) 612 + # CLI range view: show raw data (audio + screen, no summaries) 634 613 markdown = cluster_range( 635 614 args.day, 636 615 args.start, 637 616 end_dt.strftime("%H%M%S"), 638 - audio=True, 639 - screen=True, 640 - agents=False, 617 + sources={"audio": True, "screen": True, "agents": False}, 641 618 ) 642 619 print(markdown) 643 620 elif args.start or args.length is not None: 644 621 parser.error("--start and --length must be used together") 645 622 else: 646 - markdown, _counts = cluster(args.day) 623 + # CLI default: show audio + agent summaries (daily view) 624 + markdown, _counts = cluster( 625 + args.day, sources={"audio": True, "screen": False, "agents": True} 626 + ) 647 627 print(markdown) 648 628 649 629

+4 -12

think/resources/transcripts.py

··· 49 49 day=day, 50 50 start=time, 51 51 end=end_time, 52 - audio=True, 53 - screen=True, 54 - agents=False, 52 + sources={"audio": True, "screen": True, "agents": False}, 55 53 ) 56 54 description = f"Raw audio and screencast transcripts from {day} at {time} for {length} minutes" 57 55 elif mode == "audio": ··· 59 57 day=day, 60 58 start=time, 61 59 end=end_time, 62 - audio=True, 63 - screen=False, 64 - agents=False, 60 + sources={"audio": True, "screen": False, "agents": False}, 65 61 ) 66 62 description = ( 67 63 f"Raw audio transcripts from {day} at {time} for {length} minutes" ··· 71 67 day=day, 72 68 start=time, 73 69 end=end_time, 74 - audio=False, 75 - screen=True, 76 - agents=False, 70 + sources={"audio": False, "screen": True, "agents": False}, 77 71 ) 78 72 description = ( 79 73 f"Raw screencast transcripts from {day} at {time} for {length} minutes" ··· 83 77 day=day, 84 78 start=time, 85 79 end=end_time, 86 - audio=False, 87 - screen=False, 88 - agents=True, 80 + sources={"audio": False, "screen": False, "agents": True}, 89 81 ) 90 82 description = ( 91 83 f"AI-generated summaries from {day} at {time} for {length} minutes"

Configure Feed

Configure Feed