personal memory agent

Merge branch 'hopper-v2a2kg6v-ai-chat-importers'

# Conflicts:
# think/importers/gemini.py

+268 -171
+32 -38
tests/test_gemini_importer.py
···
 def test_parse_activity_basic():
     act = _sample_activity()
-    entry = _parse_activity(act)
-    assert entry is not None
-    assert entry["type"] == "ai_chat"
-    assert entry["source"] == "gemini"
-    assert "Python" in entry["content"]
-    assert entry["message_count"] == 2
-
-
-def test_parse_activity_bard():
-    entry = _parse_activity(_bard_activity())
-    assert entry is not None
-    assert entry.get("variant") == "bard"
-    assert entry["source"] == "gemini"
+    messages = _parse_activity(act)
+    assert len(messages) == 2
+    assert set(messages[0]) == {"create_time", "speaker", "text", "model_slug"}
+    assert messages[0]["speaker"] == "Human"
+    assert messages[0]["text"] == "What is Python?"
+    assert messages[0]["model_slug"] is None
+    assert messages[1]["speaker"] == "Assistant"
+    assert "programming language" in messages[1]["text"]
+    assert messages[1]["create_time"] == messages[0]["create_time"]


 def test_parse_activity_no_content():
     act = {"header": "Gemini Apps", "time": "2026-01-15T10:00:00Z"}
-    assert _parse_activity(act) is None
+    assert _parse_activity(act) == []


 def test_parse_activity_no_time():
     act = _sample_activity()
     del act["time"]
-    assert _parse_activity(act) is None
+    assert _parse_activity(act) == []


 def test_parse_activity_prompt_only():
···
         "subtitles": [{"value": "What is the meaning of life?"}],
         "products": ["Gemini"],
     }
-    entry = _parse_activity(act)
-    assert entry is not None
-    assert "meaning of life" in entry["content"]
-    assert entry["message_count"] == 1
-
-
-def test_parse_activity_title_cleanup():
-    act = _sample_activity(title="Asked Gemini: What is Python?")
-    entry = _parse_activity(act)
-    assert entry is not None
-    assert entry["title"] == "What is Python?"
+    messages = _parse_activity(act)
+    assert len(messages) == 1
+    assert messages[0]["speaker"] == "Human"
+    assert "meaning of life" in messages[0]["text"]


 # --- Detection tests ---
···
         with tempfile.TemporaryDirectory() as journal:
             os.environ["JOURNAL_PATH"] = journal
             result = importer.process(Path(f.name), Path(journal))
-            assert result.entries_written == 2
+            assert result.entries_written == 4
             assert result.errors == []
             assert result.segments is not None
             assert len(result.segments) >= 1
-            assert any(Path(p).name == "imported.md" for p in result.files_created)
+            assert any(
+                Path(p).name == "imported_audio.jsonl" for p in result.files_created
+            )

-            md = ""
-            for file_path in result.files_created:
-                md_path = Path(file_path)
-                assert md_path.exists()
-                md += md_path.read_text()
-            assert "Python" in md
-            assert "sorted" in md
+            first_path = Path(result.files_created[0])
+            assert first_path.exists()
+            lines = first_path.read_text().strip().split("\n")
+            metadata = json.loads(lines[0])
+            entries = [json.loads(line) for line in lines[1:]]
+            assert "imported" in metadata
+            assert entries[0]["start"] == "00:00:00"
+            assert entries[0]["speaker"] == "Human"
+            assert entries[0]["source"] == "import"
+            assert entries[1]["speaker"] == "Assistant"
     finally:
         os.unlink(f.name)
         os.environ.pop("JOURNAL_PATH", None)
···
         with tempfile.TemporaryDirectory() as journal:
             os.environ["JOURNAL_PATH"] = journal
             result = importer.process(Path(tmp.name), Path(journal))
-            assert result.entries_written == 1
+            assert result.entries_written == 2
             assert result.segments is not None
             assert len(result.segments) == 1
-            assert any(Path(p).suffix == ".md" for p in result.files_created)
+            assert any(Path(p).suffix == ".jsonl" for p in result.files_created)
     finally:
         os.unlink(tmp.name)
         os.environ.pop("JOURNAL_PATH", None)
···
         with tempfile.TemporaryDirectory() as journal:
             os.environ["JOURNAL_PATH"] = journal
             result = importer.process(Path(f.name), Path(journal))
-            assert result.entries_written == 2
+            assert result.entries_written == 4
             assert result.segments is not None
             assert len(result.segments) == 2
             assert len(result.files_created) == 2
+54 -28
tests/test_import_formatting.py
···
 import tempfile
 from pathlib import Path

-from think.importers.formatting import format_imported
+from think.importers.formatting import format_ai_chat, format_imported


 def _make_entries(source: str, content_entries: list[dict]) -> list[dict]:
···
     assert "Note: My personal note" in md


-def test_ai_chat():
-    entries = _make_entries(
-        "claude",
-        [
-            {
-                "type": "ai_chat",
-                "ts": "2026-01-20T14:00:00",
-                "title": "Help with Python parsing",
-                "source": "claude",
-                "message_count": 12,
-                "content": "Human: How do I parse JSON?\n\nAssistant: Use json.loads().",
-            }
-        ],
-    )
-    chunks, meta = format_imported(entries, None)
-    assert len(chunks) == 1
-    assert meta["indexer"]["agent"] == "import.claude"
-
-    md = chunks[0]["markdown"]
-    assert "Help with Python parsing" in md
-    assert "claude conversation" in md
-    assert "12 messages" in md
-    assert "json.loads()" in md
-
-
 def test_generic_entry():
     entries = _make_entries(
         "custom",
···
 def test_formatter_registration_gemini_segment():
     from think.formatters import get_formatter

-    formatter = get_formatter("20260115/import.gemini/100000_300/imported.md")
+    formatter = get_formatter("20260115/import.gemini/100000_300/imported_audio.jsonl")
     assert formatter is not None
-    assert formatter.__name__ == "format_markdown"
+    assert formatter.__name__ == "format_ai_chat"
+
+
+def test_formatter_registration_chatgpt_segment():
+    from think.formatters import get_formatter
+
+    formatter = get_formatter("20260115/import.chatgpt/100000_300/imported_audio.jsonl")
+    assert formatter is not None
+    assert formatter.__name__ == "format_ai_chat"
+
+
+def test_formatter_registration_claude_segment():
+    from think.formatters import get_formatter
+
+    formatter = get_formatter("20260115/import.claude/100000_300/imported_audio.jsonl")
+    assert formatter is not None
+    assert formatter.__name__ == "format_ai_chat"


 def test_path_metadata_extraction():
···

     files = find_formattable_files(tmpdir)
     assert "20260115/import.ics/120000_300/imported.md" in files
+
+
+def test_format_ai_chat_empty():
+    chunks, meta = format_ai_chat([], None)
+    assert chunks == []
+    assert meta == {}
+
+
+def test_format_ai_chat_basic():
+    entries = [
+        {"imported": {"id": "20260101_120000", "facet": "work"}, "model": "gpt-4o"},
+        {"start": "00:00:00", "speaker": "Human", "text": "Hello", "source": "import"},
+        {
+            "start": "00:00:05",
+            "speaker": "Assistant",
+            "text": "Hi there",
+            "source": "import",
+        },
+    ]
+
+    chunks, meta = format_ai_chat(
+        entries,
+        {
+            "file_path": Path(
+                "/journal/20260115/import.chatgpt/120000_300/imported_audio.jsonl"
+            )
+        },
+    )
+
+    assert len(chunks) == 2
+    assert meta["indexer"]["agent"] == "import.chatgpt"
+    assert meta["header"] == "# ChatGPT conversation\nModel: gpt-4o\nFacet: work"
+    assert chunks[0]["markdown"] == "**Human:** Hello"
+    assert chunks[1]["markdown"] == "**Assistant:** Hi there"
+    assert chunks[1]["timestamp"] > chunks[0]["timestamp"]


 def test_format_file_integration():
+16
think/formatters.py
···
         True,
     ),
     "*/import.*/*/imported.md": ("think.markdown", "format_markdown", True),
+    # AI chat imports — dedicated formatter (must precede generic *_audio.jsonl)
+    "*/import.chatgpt/*/imported_audio.jsonl": (
+        "think.importers.formatting",
+        "format_ai_chat",
+        True,
+    ),
+    "*/import.claude/*/imported_audio.jsonl": (
+        "think.importers.formatting",
+        "format_ai_chat",
+        True,
+    ),
+    "*/import.gemini/*/imported_audio.jsonl": (
+        "think.importers.formatting",
+        "format_ai_chat",
+        True,
+    ),
     # Raw transcripts — formattable but not indexed (agent outputs are more useful)
     # Layout: day/stream/segment/audio.jsonl
     "*/*/*/audio.jsonl": ("observe.hear", "format_audio", False),
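Note: the three new globs register the dedicated formatter ahead of the generic */*/*/audio.jsonl rule, so AI chat segments never fall through to the raw transcript formatter. A minimal lookup sketch, mirroring the registration tests in tests/test_import_formatting.py (the paths are the same illustrative ones used there):

    from think.formatters import get_formatter

    # Each AI chat source should resolve to the dedicated chat formatter.
    for source in ("chatgpt", "claude", "gemini"):
        path = f"20260115/import.{source}/100000_300/imported_audio.jsonl"
        formatter = get_formatter(path)
        assert formatter is not None
        assert formatter.__name__ == "format_ai_chat"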
+90 -21
think/importers/formatting.py
···
 """Formatter for structured import JSONL files.

-Converts entries from file importers (ICS, Obsidian, Kindle, AI chat) into
-markdown chunks for the search index. Each entry type gets a source-appropriate
+Converts entries from file importers (ICS, Obsidian, Kindle) into markdown
+chunks for the search index. Each entry type gets a source-appropriate
 markdown representation.

 Import JSONL layout: YYYYMMDD/import.{source}/imported.jsonl
···

 from __future__ import annotations

+import re
 from datetime import datetime
+from pathlib import Path
 from typing import Any
+
+_SOURCE_LABELS = {"chatgpt": "ChatGPT", "claude": "Claude", "gemini": "Gemini"}


 def format_imported(
···
     return chunks, meta


+def format_ai_chat(
+    entries: list[dict],
+    context: dict | None = None,
+) -> tuple[list[dict], dict]:
+    """Format AI chat import JSONL entries to markdown chunks."""
+    meta: dict[str, Any] = {}
+    chunks: list[dict[str, Any]] = []
+
+    if not entries:
+        return chunks, meta
+
+    metadata = entries[0] if entries and "start" not in entries[0] else {}
+    transcript_entries = [entry for entry in entries if "start" in entry]
+
+    source_key = "ai_chat"
+    source = "AI chat"
+    imported = metadata.get("imported", {})
+
+    ctx = context or {}
+    file_path = ctx.get("file_path")
+    if file_path:
+        parts = Path(str(file_path)).parts
+        for part in parts:
+            if part.startswith("import."):
+                source_key = part.replace("import.", "").lower()
+                source = _SOURCE_LABELS.get(source_key, source_key.capitalize())
+                break
+
+    meta["indexer"] = {"agent": f"import.{source_key}"}
+
+    model = metadata.get("model")
+    header_parts = [f"# {source} conversation"]
+    if model:
+        header_parts.append(f"Model: {model}")
+    if isinstance(imported, dict) and imported.get("facet"):
+        header_parts.append(f"Facet: {imported['facet']}")
+    meta["header"] = (
+        "\n".join(header_parts) if len(header_parts) > 1 else header_parts[0]
+    )
+
+    base_timestamp = 0
+    if file_path:
+        parts = Path(str(file_path)).parts
+        rev_parts = list(reversed(parts))
+        for i, part in enumerate(rev_parts):
+            if re.match(r"^\d{8}$", part):
+                day_str = part
+                from think.utils import segment_parse
+
+                for j in range(1, i):
+                    parsed_time, _ = segment_parse(rev_parts[j])
+                    if parsed_time is not None:
+                        day_date = datetime.strptime(day_str, "%Y%m%d").date()
+                        dt_obj = datetime.combine(day_date, parsed_time)
+                        base_timestamp = int(dt_obj.timestamp() * 1000)
+                        break
+                break
+
+    for entry in transcript_entries:
+        speaker = entry.get("speaker", "")
+        text = entry.get("text", "")
+        if not text:
+            continue
+
+        start = entry.get("start", "")
+        entry_timestamp = base_timestamp
+        if start:
+            try:
+                h, m, s = map(int, start.split(":"))
+                entry_timestamp = base_timestamp + (h * 3600 + m * 60 + s) * 1000
+            except (ValueError, AttributeError):
+                pass
+
+        chunks.append(
+            {
+                "timestamp": entry_timestamp,
+                "markdown": f"**{speaker}:** {text}",
+                "source": entry,
+            }
+        )
+
+    return chunks, meta
+
+
 def _format_entry(entry_type: str, entry: dict) -> str:
     """Dispatch to type-specific formatter."""
     if entry_type == "calendar_event":
···
         return _format_note(entry)
     elif entry_type == "highlight":
         return _format_highlight(entry)
-    elif entry_type == "ai_chat":
-        return _format_ai_chat(entry)
     else:
         return _format_generic(entry)
···
     else:
         lines.append("")
         lines.append(f"> {content}")
-
-    return "\n".join(lines)
-
-
-def _format_ai_chat(entry: dict) -> str:
-    """Format an AI chat conversation entry."""
-    title = entry.get("title", "Untitled conversation")
-    source = entry.get("source", "ai")
-    msg_count = entry.get("message_count", 0)
-
-    lines = [f"## {title}"]
-    lines.append(f"{source} conversation, {msg_count} messages")
-
-    content = entry.get("content", "")
-    if content:
-        lines.append("")
-        lines.append(content)

     return "\n".join(lines)
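Note: format_ai_chat treats the first JSONL record (the one without a "start" key) as segment metadata and every remaining record as a transcript row. A minimal usage sketch based on test_format_ai_chat_basic above; the journal path is illustrative:

    from pathlib import Path

    from think.importers.formatting import format_ai_chat

    entries = [
        # Metadata row: no "start" key, so it never becomes a chunk.
        {"imported": {"id": "20260101_120000", "facet": "work"}, "model": "gpt-4o"},
        {"start": "00:00:00", "speaker": "Human", "text": "Hello", "source": "import"},
        {"start": "00:00:05", "speaker": "Assistant", "text": "Hi there", "source": "import"},
    ]

    chunks, meta = format_ai_chat(
        entries,
        {"file_path": Path("/journal/20260115/import.chatgpt/120000_300/imported_audio.jsonl")},
    )

    # The import.chatgpt path segment drives source detection and the header.
    assert meta["indexer"]["agent"] == "import.chatgpt"
    assert [c["markdown"] for c in chunks] == ["**Human:** Hello", "**Assistant:** Hi there"]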
+76 -84
think/importers/gemini.py
···
 from typing import Any, Callable

 from think.importers.file_importer import ImportPreview, ImportResult
-from think.importers.shared import window_items, write_markdown_segments
+from think.importers.shared import _window_messages, write_segment
+from think.utils import day_path

 logger = logging.getLogger(__name__)

···
     raise ValueError(f"Unsupported file type: {path.suffix}")


-def _parse_activity(activity: dict[str, Any]) -> dict[str, Any] | None:
-    """Parse a single Gemini activity record into an import entry."""
+def _parse_activity(activity: dict[str, Any]) -> list[dict[str, Any]]:
+    """Parse a single Gemini activity record into timestamped messages."""
     # Extract user prompt from subtitles
     subtitles = activity.get("subtitles", [])
     prompt = ""
···

     # Extract AI response from safeHtmlItem
     response = ""
-    products = activity.get("products", [])
     # safeHtmlItem can be at activity level or nested
     safe_html_items = activity.get("safeHtmlItem", [])
     for item in safe_html_items:
···

     # If no content at all, skip
     if not prompt and not response:
-        return None
+        return []

     # Parse timestamp
     time_str = activity.get("time", "")
     if not time_str:
-        return None
+        return []
     try:
-        ts = dt.datetime.fromisoformat(time_str.replace("Z", "+00:00")).isoformat()
+        create_time = dt.datetime.fromisoformat(
+            time_str.replace("Z", "+00:00")
+        ).timestamp()
     except (ValueError, TypeError):
-        return None
+        return []

-    # Build title from prompt (truncated)
-    title = activity.get("title", prompt)
-    # Clean up "Asked Gemini" / "Talked to Bard" prefixes
-    for prefix in ("Asked Gemini", "Talked to Bard", "Asked Bard"):
-        if title.startswith(prefix):
-            title = title[len(prefix) :].strip().lstrip(":").strip()
-            break
-    if not title:
-        title = prompt[:80] if prompt else "Gemini activity"
-
-    # Build readable content
-    parts: list[str] = []
+    messages: list[dict[str, Any]] = []
     if prompt:
-        parts.append(f"Human: {prompt}")
+        messages.append(
+            {
+                "create_time": create_time,
+                "speaker": "Human",
+                "text": prompt,
+                "model_slug": None,
+            }
+        )
     if response:
-        parts.append(f"Assistant: {response}")
-    content = "\n\n".join(parts)
-
-    # Detect source variant
-    source_products = [p.lower() for p in products] if products else []
-    is_bard = any("bard" in p for p in source_products)
-    header = activity.get("header", "")
-    if "bard" in header.lower():
-        is_bard = True
-
-    entry: dict[str, Any] = {
-        "type": "ai_chat",
-        "ts": ts,
-        "title": title,
-        "source": "gemini",
-        "message_count": (1 if prompt else 0) + (1 if response else 0),
-        "content": content,
-    }
-    if is_bard:
-        entry["variant"] = "bard"
-
-    return entry
-
-
-def _render_activity_markdown(activity: dict) -> str:
-    """Render a Gemini activity as markdown."""
-    title = activity.get("title", "Gemini activity")
-    lines = [f"## {title}"]
-
-    content = activity.get("content", "")
-    if content:
-        # Content already has "Human: ..." and "Assistant: ..." format
-        # Convert to bold labels
-        for part in content.split("\n\n"):
-            part = part.strip()
-            if part.startswith("Human: "):
-                lines.append(f"**Human:** {part[7:]}")
-            elif part.startswith("Assistant: "):
-                lines.append(f"**Assistant:** {part[11:]}")
-            elif part:
-                lines.append(part)
+        messages.append(
+            {
+                "create_time": create_time,
+                "speaker": "Assistant",
+                "text": response,
+                "model_slug": None,
+            }
+        )

-    return "\n\n".join(lines)
+    return messages


 class GeminiImporter:
···
         bard_count = 0

         for act in activities:
-            entry = _parse_activity(act)
-            if entry is None:
+            messages = _parse_activity(act)
+            if not messages:
                 continue
             valid_count += 1
-            if entry.get("variant") == "bard":
+            products = [p.lower() for p in act.get("products", [])]
+            header = str(act.get("header", "")).lower()
+            if any("bard" in product for product in products) or "bard" in header:
                 bard_count += 1
             try:
-                day = dt.datetime.fromisoformat(entry["ts"]).strftime("%Y%m%d")
+                day = dt.datetime.fromtimestamp(messages[0]["create_time"]).strftime(
+                    "%Y%m%d"
+                )
                 dates.append(day)
             except (ValueError, OSError):
                 pass
···
         progress_callback: Callable | None = None,
     ) -> ImportResult:
         activities = _load_activities(path)
+        import_id = dt.datetime.now().strftime("%Y%m%d_%H%M%S")

-        entries: list[dict[str, Any]] = []
+        messages: list[dict[str, Any]] = []
         errors: list[str] = []
         skipped = 0
+        bard_count = 0
+        valid_count = 0

         for i, act in enumerate(activities):
-            entry = _parse_activity(act)
-            if entry is None:
+            activity_messages = _parse_activity(act)
+            if not activity_messages:
                 skipped += 1
                 continue
+            valid_count += 1

-            # Add epoch timestamp for windowing
-            entry["create_ts"] = dt.datetime.fromisoformat(entry["ts"]).timestamp()
-            entries.append(entry)
+            products = [p.lower() for p in act.get("products", [])]
+            header = str(act.get("header", "")).lower()
+            if any("bard" in product for product in products) or "bard" in header:
+                bard_count += 1
+
+            messages.extend(activity_messages)

             if progress_callback and (i + 1) % 100 == 0:
                 progress_callback(i + 1, len(activities))

-        if not entries:
+        if not messages:
             return ImportResult(
                 entries_written=0,
                 entities_seeded=0,
···
                 summary="No activities found to import",
             )

-        entries.sort(key=lambda e: e["create_ts"])
+        messages.sort(key=lambda msg: msg["create_time"])

-        windows = window_items(entries, "create_ts")
-        created_files, segments = write_markdown_segments(
-            "gemini",
-            windows,
-            lambda items: "\n\n".join(_render_activity_markdown(a) for a in items),
-        )
+        windows = _window_messages(messages)
+        created_files: list[str] = []
+        segments: list[tuple[str, str]] = []
+        written_count = 0
+
+        for day, seg_key, model_slug, entries in windows:
+            day_dir = str(day_path(day))
+            try:
+                json_path = write_segment(
+                    day_dir,
+                    "import.gemini",
+                    seg_key,
+                    entries,
+                    import_id=import_id,
+                    facet=facet,
+                    model=model_slug,
+                )
+                created_files.append(json_path)
+                segments.append((day, seg_key))
+                written_count += len(entries)
+            except Exception as exc:
+                errors.append(f"Failed to write segment {day}/{seg_key}: {exc}")
+                logger.warning("Failed to write segment %s/%s: %s", day, seg_key, exc)

         segment_days = {day for day, _ in segments}

         if skipped:
             logger.info("Skipped %d activities with no content", skipped)

-        bard_count = sum(1 for e in entries if e.get("variant") == "bard")
         bard_info = f" ({bard_count} Bard-era)" if bard_count else ""

         return ImportResult(
-            entries_written=len(entries),
+            entries_written=written_count,
             entities_seeded=0,
             files_created=created_files,
             errors=errors,
             summary=(
-                f"Imported {len(entries)} Gemini activities{bard_info} across "
+                f"Imported {len(messages)} messages from {valid_count} Gemini activities{bard_info} across "
                 f"{len(segment_days)} days into {len(segments)} segments"
             ),
             segments=segments,
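Note: _parse_activity now emits a flat list of speaker-tagged messages rather than a single ai_chat entry, which is what lets process() feed Gemini data through the shared _window_messages/write_segment path used by the other chat importers. A small sketch of the Takeout-style record it accepts, drawn from the prompt-only test fixture above (a populated safeHtmlItem would add a second, Assistant message with the same timestamp):

    from think.importers.gemini import _parse_activity

    activity = {
        "header": "Gemini Apps",
        "time": "2026-01-15T10:00:00Z",
        "subtitles": [{"value": "What is the meaning of life?"}],
        "products": ["Gemini"],
    }

    messages = _parse_activity(activity)
    assert len(messages) == 1
    assert messages[0]["speaker"] == "Human"
    assert messages[0]["model_slug"] is None

    # Records with neither prompt/response content nor a timestamp are dropped.
    assert _parse_activity({"header": "Gemini Apps"}) == []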