Align Claude chat importer with segment output pattern

+136

tests/test_importer.py

def test_claude_chat_importer_segments(tmp_path, monkeypatch):
    """Check that the Claude importer turns message windows into import segments.

    The export holds three conversations: one whose messages straddle the
    300-second window boundary, one whose first message carries no timestamp
    of its own, and one with no messages at all.  The assertions expect three
    ``import.claude`` segments holding five entries in total.
    """
    claude_mod = importlib.import_module("think.importers.claude_chat")

    start = dt.datetime(2026, 1, 15, 12, 0, 0)
    start_iso = start.isoformat()
    half_day_later_iso = (start + dt.timedelta(hours=12)).isoformat()

    def stamp(seconds):
        # ISO timestamp `seconds` after the first message.
        return (start + dt.timedelta(seconds=seconds)).isoformat()

    export = [
        {
            "name": "First",
            "created_at": start_iso,
            "chat_messages": [
                {"sender": "human", "text": "Hello", "created_at": start_iso},
                {"sender": "assistant", "text": "Hi there", "created_at": stamp(60)},
                {"sender": "human", "text": "New topic", "created_at": stamp(301)},
            ],
        },
        {
            "name": "Second",
            "created_at": half_day_later_iso,
            "chat_messages": [
                # No created_at on this message.
                {"sender": "assistant", "text": "Fallback time"},
                {
                    "sender": "assistant",
                    "text": "Next day reply",
                    "created_at": half_day_later_iso,
                },
            ],
        },
        {"name": "Empty", "created_at": start_iso, "chat_messages": []},
    ]

    archive = tmp_path / "claude.zip"
    payload = json.dumps(export)
    with zipfile.ZipFile(archive, "w") as bundle:
        bundle.writestr("conversations.json", payload)

    monkeypatch.setenv("JOURNAL_PATH", str(tmp_path))

    # Freeze "now" so the import id asserted below is deterministic.
    frozen_now = dt.datetime(2026, 1, 20, 8, 30, 0)

    class FrozenDateTime(dt.datetime):
        @classmethod
        def now(cls, tz=None):
            return frozen_now

    monkeypatch.setattr(claude_mod.dt, "datetime", FrozenDateTime)

    importer = claude_mod.ClaudeChatImporter()
    result = importer.process(archive, tmp_path, facet="work")

    expected_segments = [
        ("20260115", "120000_300"),
        ("20260115", "120501_300"),
        ("20260116", "000000_300"),
    ]
    assert result.entries_written == 5
    assert result.errors == []
    assert result.segments == expected_segments
    assert len(result.files_created) == 3

    segment_files = [
        day_path(day) / "import.claude" / key / "imported_audio.jsonl"
        for day, key in expected_segments
    ]
    for segment_file in segment_files:
        assert segment_file.exists()

    def load(segment_file):
        # First JSONL line is the metadata header; the rest are entries.
        header, *rows = segment_file.read_text().strip().split("\n")
        return json.loads(header), [json.loads(row) for row in rows]

    def entry(offset, speaker, text):
        return {"start": offset, "speaker": speaker, "text": text, "source": "import"}

    import_stamp = {"id": "20260120_083000", "facet": "work"}

    first_meta, first_entries = load(segment_files[0])
    assert first_meta["imported"] == import_stamp
    assert first_entries == [
        entry("00:00:00", "Human", "Hello"),
        entry("00:01:00", "Assistant", "Hi there"),
    ]

    second_meta, second_entries = load(segment_files[1])
    assert second_meta == {"imported": import_stamp}
    assert second_entries == [entry("00:00:00", "Human", "New topic")]

    third_meta, third_entries = load(segment_files[2])
    assert third_meta == {"imported": import_stamp}
    assert third_entries == [
        entry("00:00:00", "Assistant", "Fallback time"),
        entry("00:00:00", "Assistant", "Next day reply"),
    ]

+1 -56

think/importers/chatgpt.py

··· 11 11 from typing import Any, Callable 12 12 13 13 from think.importers.file_importer import ImportPreview, ImportResult 14 - from think.importers.shared import write_segment 14 + from think.importers.shared import _window_messages, write_segment 15 15 from think.utils import day_path 16 16 17 17 logger = logging.getLogger(__name__) ··· 106 106 skipped += 1 107 107 108 108 return messages, model_counts, skipped 109 - 110 - 111 - def _window_messages( 112 - messages: list[dict[str, Any]], 113 - window_duration: int = 300, 114 - ) -> list[tuple[str, str, str | None, list[dict[str, Any]]]]: 115 - """Group sorted messages into fixed-duration windows per day.""" 116 - if not messages: 117 - return [] 118 - 119 - windows: list[tuple[str, str, str | None, list[dict[str, Any]]]] = [] 120 - window_start: float | None = None 121 - window_day: str | None = None 122 - window_entries: list[dict[str, Any]] = [] 123 - window_model: str | None = None 124 - 125 - for msg in messages: 126 - msg_dt = dt.datetime.fromtimestamp(msg["create_time"]) 127 - msg_day = msg_dt.strftime("%Y%m%d") 128 - 129 - if ( 130 - window_start is None 131 - or msg_day != window_day 132 - or msg["create_time"] - window_start >= window_duration 133 - ): 134 - if window_entries and window_day and window_start is not None: 135 - start_dt = dt.datetime.fromtimestamp(window_start) 136 - seg_key = f"{start_dt.strftime('%H%M%S')}_{window_duration}" 137 - windows.append((window_day, seg_key, window_model, window_entries)) 138 - 139 - window_start = msg["create_time"] 140 - window_day = msg_day 141 - window_entries = [] 142 - window_model = None 143 - 144 - offset = int(msg["create_time"] - window_start) 145 - h, remainder = divmod(offset, 3600) 146 - m, s = divmod(remainder, 60) 147 - window_entries.append( 148 - { 149 - "start": f"{h:02d}:{m:02d}:{s:02d}", 150 - "speaker": msg["speaker"], 151 - "text": msg["text"], 152 - } 153 - ) 154 - 155 - if msg["model_slug"] and window_model is None: 156 - window_model = 
msg["model_slug"] 157 - 158 - if window_entries and window_day and window_start is not None: 159 - start_dt = dt.datetime.fromtimestamp(window_start) 160 - seg_key = f"{start_dt.strftime('%H%M%S')}_{window_duration}" 161 - windows.append((window_day, seg_key, window_model, window_entries)) 162 - 163 - return windows 164 109 165 110 166 111 class ChatGPTImporter:

+115 -70

think/importers/claude_chat.py

··· 11 11 from typing import Any, Callable 12 12 13 13 from think.importers.file_importer import ImportPreview, ImportResult 14 - from think.importers.shared import write_structured_import 14 + from think.importers.shared import _window_messages, write_segment 15 + from think.utils import day_path 15 16 16 17 logger = logging.getLogger(__name__) 17 18 ··· 27 28 raise ValueError(f"Unsupported file type: {path.suffix}") 28 29 29 30 30 - def _format_messages(messages: list[dict[str, Any]]) -> str: 31 - """Format chat messages into readable text with role prefixes.""" 32 - lines: list[str] = [] 33 - for msg in messages: 34 - sender = msg.get("sender", "unknown") 35 - text = msg.get("text", "") 36 - if not text: 31 + def _extract_messages( 32 + conversations: list[dict[str, Any]], 33 + ) -> tuple[list[dict[str, Any]], int]: 34 + """Extract timestamped messages from Claude conversations.""" 35 + messages: list[dict[str, Any]] = [] 36 + skipped = 0 37 + 38 + for conv in conversations: 39 + chat_messages = conv.get("chat_messages", []) 40 + if not chat_messages: 41 + skipped += 1 37 42 continue 38 - role = "Human" if sender == "human" else "Assistant" 39 - lines.append(f"{role}: {text}") 40 - return "\n\n".join(lines) 43 + 44 + conv_created = conv.get("created_at", "") 45 + conv_ts: float | None = None 46 + if conv_created: 47 + try: 48 + conv_ts = dt.datetime.fromisoformat(conv_created).timestamp() 49 + except (ValueError, TypeError): 50 + pass 51 + 52 + conv_has_content = False 53 + for msg in chat_messages: 54 + sender = msg.get("sender", "") 55 + text = msg.get("text", "") 56 + if not text: 57 + continue 58 + 59 + created_at = msg.get("created_at", "") 60 + create_time: float | None = None 61 + if created_at: 62 + try: 63 + create_time = dt.datetime.fromisoformat(created_at).timestamp() 64 + except (ValueError, TypeError): 65 + pass 66 + if create_time is None: 67 + create_time = conv_ts 68 + if create_time is None: 69 + continue 70 + 71 + messages.append( 72 + { 73 + 
"create_time": create_time, 74 + "speaker": "Human" if sender == "human" else "Assistant", 75 + "text": text, 76 + "model_slug": None, 77 + } 78 + ) 79 + conv_has_content = True 80 + 81 + if not conv_has_content: 82 + skipped += 1 83 + 84 + return messages, skipped 41 85 42 86 43 87 class ClaudeChatImporter: ··· 77 121 ) 78 122 79 123 dates: list[str] = [] 80 - valid_count = 0 124 + message_count = 0 81 125 for conv in conversations: 82 - messages = conv.get("chat_messages", []) 83 - if not messages: 84 - continue 85 - valid_count += 1 86 - created = conv.get("created_at", "") 87 - if created: 88 - try: 89 - day = dt.datetime.fromisoformat(created).strftime("%Y%m%d") 90 - dates.append(day) 91 - except ValueError: 92 - pass 126 + for msg in conv.get("chat_messages", []): 127 + text = msg.get("text", "") 128 + if not text: 129 + continue 130 + message_count += 1 131 + created_at = msg.get("created_at", "") 132 + if not created_at: 133 + created_at = conv.get("created_at", "") 134 + if created_at: 135 + try: 136 + day = dt.datetime.fromisoformat(created_at).strftime("%Y%m%d") 137 + dates.append(day) 138 + except ValueError: 139 + pass 93 140 94 141 dates.sort() 95 142 date_range = (dates[0], dates[-1]) if dates else ("", "") 96 143 97 144 return ImportPreview( 98 145 date_range=date_range, 99 - item_count=valid_count, 146 + item_count=message_count, 100 147 entity_count=0, 101 - summary=f"{valid_count} conversations from Claude chat export", 148 + summary=f"{message_count} messages from Claude chat export", 102 149 ) 103 150 104 151 def process( ··· 111 158 ) -> ImportResult: 112 159 conversations = _open_conversations(path) 113 160 import_id = dt.datetime.now().strftime("%Y%m%d_%H%M%S") 161 + messages, skipped = _extract_messages(conversations) 162 + if not messages: 163 + return ImportResult( 164 + entries_written=0, 165 + entities_seeded=0, 166 + files_created=[], 167 + errors=[], 168 + summary="No messages found in Claude export", 169 + ) 114 170 115 - entries: 
list[dict[str, Any]] = [] 116 - errors: list[str] = [] 117 - skipped = 0 171 + messages.sort(key=lambda m: m["create_time"]) 118 172 119 - for i, conv in enumerate(conversations): 120 - messages = conv.get("chat_messages", []) 121 - if not messages: 122 - skipped += 1 123 - continue 173 + if progress_callback: 174 + progress_callback(len(conversations), len(conversations)) 124 175 125 - title = conv.get("name", "Untitled") 126 - created = conv.get("created_at", "") 176 + windows = _window_messages(messages) 177 + created_files: list[str] = [] 178 + segments: list[tuple[str, str]] = [] 179 + errors: list[str] = [] 180 + written_count = 0 127 181 128 - # Parse timestamp 182 + for day, seg_key, _model, entries in windows: 183 + day_dir = str(day_path(day)) 129 184 try: 130 - ts = dt.datetime.fromisoformat(created).isoformat() 131 - except (ValueError, TypeError): 132 - errors.append(f"Bad timestamp for conversation: {title!r}") 133 - continue 134 - 135 - content = _format_messages(messages) 136 - if not content: 137 - skipped += 1 138 - continue 139 - 140 - entries.append( 141 - { 142 - "type": "ai_chat", 143 - "ts": ts, 144 - "title": title, 145 - "source": "claude", 146 - "message_count": len(messages), 147 - "content": content, 148 - } 149 - ) 150 - 151 - if progress_callback and (i + 1) % 100 == 0: 152 - progress_callback(i + 1, len(conversations)) 153 - 154 - # Write to journal 155 - created_files = write_structured_import( 156 - "claude", 157 - entries, 158 - import_id=import_id, 159 - facet=facet, 160 - ) 185 + json_path = write_segment( 186 + day_dir, 187 + "import.claude", 188 + seg_key, 189 + entries, 190 + import_id=import_id, 191 + facet=facet, 192 + model=None, 193 + ) 194 + created_files.append(json_path) 195 + segments.append((day, seg_key)) 196 + written_count += len(entries) 197 + except Exception as exc: 198 + errors.append(f"Failed to write segment {day}/{seg_key}: {exc}") 199 + logger.warning("Failed to write segment %s/%s: %s", day, seg_key, exc) 
161 200 162 201 if skipped: 163 - logger.info("Skipped %d conversations with no messages", skipped) 202 + logger.info("Skipped %d conversations with no content", skipped) 203 + 204 + days = sorted({day for day, _ in segments}) 164 205 165 206 return ImportResult( 166 - entries_written=len(entries), 207 + entries_written=written_count, 167 208 entities_seeded=0, 168 209 files_created=created_files, 169 210 errors=errors, 170 - summary=f"Imported {len(entries)} Claude conversations across {len(created_files)} days", 211 + summary=( 212 + f"Imported {len(messages)} messages across {len(days)} days into " 213 + f"{len(segments)} segments" 214 + ), 215 + segments=segments, 171 216 ) 172 217 173 218

+56 -1

think/importers/shared.py

··· 10 10 import os 11 11 import shutil 12 12 from pathlib import Path 13 - from typing import TYPE_CHECKING 13 + from typing import TYPE_CHECKING, Any 14 14 15 15 from think.importers.utils import save_import_file, write_import_metadata 16 16 from think.utils import day_path, get_journal, now_ms ··· 121 121 model=model, 122 122 ) 123 123 return json_path 124 + 125 + 126 + def _window_messages( 127 + messages: list[dict[str, Any]], 128 + window_duration: int = 300, 129 + ) -> list[tuple[str, str, str | None, list[dict[str, Any]]]]: 130 + """Group sorted messages into fixed-duration windows per day.""" 131 + if not messages: 132 + return [] 133 + 134 + windows: list[tuple[str, str, str | None, list[dict[str, Any]]]] = [] 135 + window_start: float | None = None 136 + window_day: str | None = None 137 + window_entries: list[dict[str, Any]] = [] 138 + window_model: str | None = None 139 + 140 + for msg in messages: 141 + msg_dt = dt.datetime.fromtimestamp(msg["create_time"]) 142 + msg_day = msg_dt.strftime("%Y%m%d") 143 + 144 + if ( 145 + window_start is None 146 + or msg_day != window_day 147 + or msg["create_time"] - window_start >= window_duration 148 + ): 149 + if window_entries and window_day and window_start is not None: 150 + start_dt = dt.datetime.fromtimestamp(window_start) 151 + seg_key = f"{start_dt.strftime('%H%M%S')}_{window_duration}" 152 + windows.append((window_day, seg_key, window_model, window_entries)) 153 + 154 + window_start = msg["create_time"] 155 + window_day = msg_day 156 + window_entries = [] 157 + window_model = None 158 + 159 + offset = int(msg["create_time"] - window_start) 160 + h, remainder = divmod(offset, 3600) 161 + m, s = divmod(remainder, 60) 162 + window_entries.append( 163 + { 164 + "start": f"{h:02d}:{m:02d}:{s:02d}", 165 + "speaker": msg["speaker"], 166 + "text": msg["text"], 167 + } 168 + ) 169 + 170 + if msg["model_slug"] and window_model is None: 171 + window_model = msg["model_slug"] 172 + 173 + if window_entries and 
window_day and window_start is not None: 174 + start_dt = dt.datetime.fromtimestamp(window_start) 175 + seg_key = f"{start_dt.strftime('%H%M%S')}_{window_duration}" 176 + windows.append((window_day, seg_key, window_model, window_entries)) 177 + 178 + return windows 124 179 125 180 126 181 # MIME type mapping for import metadata

Configure Feed

Configure Feed