Add Gemini activity importer + post-import enrichment event

+241

tests/test_gemini_importer.py

# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Tests for think.importers.gemini — Gemini/Bard activity importer."""

import contextlib
import json
import os
import tempfile
import zipfile
from collections.abc import Iterator
from pathlib import Path

from think.importers.gemini import GeminiImporter, _parse_activity, _strip_html

importer = GeminiImporter()

# Archive member name used by Google Takeout for the Gemini activity log.
_TAKEOUT_MEMBER = "Takeout/My Activity/Gemini Apps/MyActivity.json"


def _sample_activity(
    prompt: str = "What is Python?",
    response: str = "Python is a programming language.",
    time: str = "2026-01-15T10:30:00Z",
    header: str = "Gemini Apps",
    title: str | None = None,
) -> dict:
    """Build a sample Gemini activity record.

    The response is wrapped in a ``<p>`` element so that every test going
    through ``_parse_activity`` also exercises the HTML-stripping path.
    """
    return {
        "header": header,
        "title": title or f"Asked Gemini: {prompt[:40]}",
        "time": time,
        "products": ["Gemini"],
        "subtitles": [{"value": prompt}],
        "safeHtmlItem": [{"html": f"<p>{response}</p>"}],
    }


def _bard_activity() -> dict:
    """Build a sample record from the Bard era (header and title say Bard)."""
    return _sample_activity(
        prompt="Tell me a joke",
        response="Why did the chicken cross the road?",
        header="Bard",
        title="Talked to Bard",
    )


@contextlib.contextmanager
def _temp_file(suffix: str) -> Iterator[Path]:
    """Yield the path of a closed, empty temp file and remove it afterwards.

    The file is closed before it is handed out so the importer can reopen it
    on platforms that forbid opening a file twice.
    """
    fd, name = tempfile.mkstemp(suffix=suffix)
    os.close(fd)
    path = Path(name)
    try:
        yield path
    finally:
        path.unlink(missing_ok=True)


@contextlib.contextmanager
def _journal_dir() -> Iterator[Path]:
    """Yield a temporary journal directory with JOURNAL_PATH pointing at it.

    Any pre-existing JOURNAL_PATH value is restored on exit instead of being
    dropped, so these tests do not leak environment changes into others.
    """
    previous = os.environ.get("JOURNAL_PATH")
    with tempfile.TemporaryDirectory() as journal:
        os.environ["JOURNAL_PATH"] = journal
        try:
            yield Path(journal)
        finally:
            if previous is None:
                os.environ.pop("JOURNAL_PATH", None)
            else:
                os.environ["JOURNAL_PATH"] = previous


def _write_json(path: Path, payload: list) -> None:
    """Serialize *payload* as JSON into *path*."""
    path.write_text(json.dumps(payload), encoding="utf-8")


def _write_takeout_zip(path: Path, activities: list) -> None:
    """Create a Takeout-shaped ZIP at *path* containing *activities*."""
    with zipfile.ZipFile(path, "w") as zf:
        zf.writestr(_TAKEOUT_MEMBER, json.dumps(activities))


# --- Unit tests for helpers ---


def test_strip_html():
    # Plain text passes through untouched.
    assert _strip_html("Hello world") == "Hello world"
    assert _strip_html("No tags") == "No tags"
    assert _strip_html("& entities <") == "& entities <"
    # Tags are removed and entities decoded.
    assert _strip_html("<p>Hello <b>world</b></p>") == "Hello world"
    assert _strip_html("&amp; entities &lt;") == "& entities <"


def test_parse_activity_basic():
    entry = _parse_activity(_sample_activity())
    assert entry is not None
    assert entry["type"] == "ai_chat"
    assert entry["source"] == "gemini"
    assert "Python" in entry["content"]
    assert "<p>" not in entry["content"]
    assert entry["message_count"] == 2


def test_parse_activity_bard():
    entry = _parse_activity(_bard_activity())
    assert entry is not None
    assert entry.get("variant") == "bard"
    assert entry["source"] == "gemini"


def test_parse_activity_no_content():
    act = {"header": "Gemini Apps", "time": "2026-01-15T10:00:00Z"}
    assert _parse_activity(act) is None


def test_parse_activity_no_time():
    act = _sample_activity()
    del act["time"]
    assert _parse_activity(act) is None


def test_parse_activity_prompt_only():
    act = {
        "header": "Gemini Apps",
        "title": "Asked Gemini",
        "time": "2026-01-15T10:00:00Z",
        "subtitles": [{"value": "What is the meaning of life?"}],
        "products": ["Gemini"],
    }
    entry = _parse_activity(act)
    assert entry is not None
    assert "meaning of life" in entry["content"]
    assert entry["message_count"] == 1


def test_parse_activity_title_cleanup():
    act = _sample_activity(title="Asked Gemini: What is Python?")
    entry = _parse_activity(act)
    assert entry is not None
    assert entry["title"] == "What is Python?"


# --- Detection tests ---


def test_detect_json_file():
    with _temp_file(".json") as export:
        _write_json(export, [_sample_activity()])
        assert importer.detect(export) is True


def test_detect_json_wrong_format():
    with _temp_file(".json") as export:
        _write_json(export, [{"not": "gemini"}])
        assert importer.detect(export) is False


def test_detect_zip():
    with _temp_file(".zip") as export:
        _write_takeout_zip(export, [_sample_activity()])
        assert importer.detect(export) is True


def test_detect_directory():
    with tempfile.TemporaryDirectory() as tmpdir:
        activity_dir = Path(tmpdir) / "My Activity" / "Gemini Apps"
        activity_dir.mkdir(parents=True)
        (activity_dir / "MyActivity.json").write_text(
            json.dumps([_sample_activity()])
        )
        assert importer.detect(Path(tmpdir)) is True


def test_detect_directory_no_activity():
    with tempfile.TemporaryDirectory() as tmpdir:
        assert importer.detect(Path(tmpdir)) is False


# --- Preview tests ---


def test_preview_json():
    activities = [
        _sample_activity(time="2026-01-15T10:00:00Z"),
        _sample_activity(time="2026-02-20T14:00:00Z"),
        _bard_activity(),
    ]
    with _temp_file(".json") as export:
        _write_json(export, activities)
        preview = importer.preview(export)
        assert preview.item_count == 3
        assert preview.date_range[0] == "20260115"
        assert preview.date_range[1] == "20260220"
        assert "bard" in preview.summary.lower()


# --- Process tests ---


def test_process_json():
    activities = [
        _sample_activity(time="2026-01-15T10:00:00Z"),
        _sample_activity(
            prompt="How to sort a list?",
            response="Use sorted().",
            time="2026-01-15T14:00:00Z",
        ),
    ]
    with _temp_file(".json") as export, _journal_dir() as journal:
        _write_json(export, activities)
        result = importer.process(export, journal)
        assert result.entries_written == 2
        assert len(result.files_created) == 1  # same day
        assert result.errors == []

        # Verify JSONL content
        jsonl_path = Path(result.files_created[0])
        assert jsonl_path.exists()
        lines = jsonl_path.read_text().strip().split("\n")
        header = json.loads(lines[0])
        assert header["import"]["source"] == "gemini"
        assert header["entry_count"] == 2

        entry = json.loads(lines[1])
        assert entry["type"] == "ai_chat"
        assert entry["source"] == "gemini"


def test_process_zip():
    activities = [_sample_activity(time="2026-03-01T09:00:00Z")]
    with _temp_file(".zip") as export, _journal_dir() as journal:
        _write_takeout_zip(export, activities)
        result = importer.process(export, journal)
        assert result.entries_written == 1
        assert len(result.files_created) == 1


# --- Registry test ---


def test_registered_in_registry():
    from think.importers.file_importer import FILE_IMPORTER_REGISTRY, get_file_importer

    assert "gemini" in FILE_IMPORTER_REGISTRY
    imp = get_file_importer("gemini")
    assert imp is not None
    assert imp.name == "gemini"

+19

think/importers/cli.py

··· 632 632 ) 633 633 except Exception as exc: 634 634 logger.warning("Failed to index %s: %s", created_file, exc) 635 + 636 + # Emit enrichment event with affected days 637 + days_affected = sorted( 638 + { 639 + os.path.basename(os.path.dirname(os.path.dirname(f))) 640 + for f in result.files_created 641 + if os.path.basename(os.path.dirname(os.path.dirname(f))).isdigit() 642 + } 643 + ) 644 + if days_affected: 645 + _callosum.emit( 646 + "importer", 647 + "enrichment_ready", 648 + import_id=_import_id, 649 + importer=_file_importer.name, 650 + days=days_affected, 651 + entries_written=result.entries_written, 652 + ) 653 + 635 654 if args.json: 636 655 print( 637 656 json.dumps(

+1

think/importers/file_importer.py

··· 59 59 "claude": "think.importers.claude_chat", 60 60 "chatgpt": "think.importers.chatgpt", 61 61 "kindle": "think.importers.kindle", 62 + "gemini": "think.importers.gemini", 62 63 } 63 64 64 65

+275

think/importers/gemini.py

# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Gemini activity importer — imports from Google Takeout My Activity export.

Gemini/Bard exports via Google Takeout are activity logs, NOT threaded
conversations. Each activity is a standalone record with the user prompt
and (often truncated) AI response.

Expected paths in ZIP:
    Takeout/My Activity/Gemini Apps/MyActivity.json
    My Activity/Gemini Apps/MyActivity.json
    Takeout/My Activity/Bard/MyActivity.json
    My Activity/Bard/MyActivity.json
Or a directory/JSON file with the same structure.
"""

import datetime as dt
import html
import json
import logging
import re
import zipfile
from pathlib import Path
from typing import Any, Callable

from think.importers.file_importer import ImportPreview, ImportResult
from think.importers.shared import write_structured_import

logger = logging.getLogger(__name__)

# Paths to look for inside a Takeout ZIP (also tried relative to a directory)
_ACTIVITY_PATHS = [
    "Takeout/My Activity/Gemini Apps/MyActivity.json",
    "My Activity/Gemini Apps/MyActivity.json",
    "Takeout/My Activity/Bard/MyActivity.json",
    "My Activity/Bard/MyActivity.json",
]

_HTML_TAG_RE = re.compile(r"<[^>]+>")

# Boilerplate prefixes removed from activity titles
_TITLE_PREFIXES = ("Asked Gemini", "Talked to Bard", "Asked Bard")


def _strip_html(text: str) -> str:
    """Strip HTML tags and decode entities."""
    text = _HTML_TAG_RE.sub("", text)
    return html.unescape(text).strip()


def _ensure_activity_list(data: Any, origin: str) -> list[dict[str, Any]]:
    """Return *data* if it is a JSON array, otherwise raise ValueError.

    Individual elements are not validated here; ``_parse_activity`` rejects
    anything that is not a usable record.
    """
    if not isinstance(data, list):
        raise ValueError(
            f"Expected a JSON array of activities in {origin}, "
            f"got {type(data).__name__}"
        )
    return data


def _load_activities(path: Path) -> list[dict[str, Any]]:
    """Load Gemini activity records from a ZIP, directory, or JSON file.

    Raises:
        ValueError: if no activity file can be located, the path is of an
            unsupported type, or the activity file is not a JSON array.
    """
    suffix = path.suffix.lower()

    if path.is_file() and suffix == ".zip":
        with zipfile.ZipFile(path, "r") as zf:
            names = set(zf.namelist())
            for activity_path in _ACTIVITY_PATHS:
                if activity_path in names:
                    with zf.open(activity_path) as f:
                        # json.loads accepts bytes and detects the encoding
                        return _ensure_activity_list(
                            json.loads(f.read()), f"{path.name}:{activity_path}"
                        )
        raise ValueError(f"No Gemini activity file found in {path.name}")

    if path.is_file() and suffix == ".json":
        with open(path, "r", encoding="utf-8") as f:
            return _ensure_activity_list(json.load(f), path.name)

    if path.is_dir():
        # Known Takeout layouts first, then MyActivity.json directly in the dir
        candidates = [path / p for p in _ACTIVITY_PATHS]
        candidates.append(path / "MyActivity.json")
        for candidate in candidates:
            if candidate.exists():
                with open(candidate, "r", encoding="utf-8") as f:
                    return _ensure_activity_list(json.load(f), str(candidate))
        raise ValueError(f"No Gemini activity file found in {path}")

    raise ValueError(f"Unsupported file type: {path.suffix}")


def _first_text(items: Any, keys: tuple[str, ...]) -> str:
    """Return the first non-empty string stored under any of *keys*.

    *items* is expected to be a list of dicts; anything else (including JSON
    ``null``) yields an empty string. Keys are tried in order per item, so a
    later key acts as a fallback when an earlier one is missing or empty.
    """
    if not isinstance(items, list):
        return ""
    for item in items:
        if not isinstance(item, dict):
            continue
        for key in keys:
            value = item.get(key)
            if isinstance(value, str) and value:
                return value
    return ""


def _parse_activity(activity: dict[str, Any]) -> dict[str, Any] | None:
    """Parse a single Gemini activity record into an import entry.

    Returns None when the record is not a dict, carries neither a prompt nor
    a response, or has a missing/unparseable ``time`` field. Malformed field
    types (e.g. JSON ``null``) are treated as absent rather than raising.
    """
    if not isinstance(activity, dict):
        return None

    # User prompt lives in the first subtitle with text ("value", else "name")
    prompt = _first_text(activity.get("subtitles"), ("value", "name"))

    # AI response is the first non-empty safeHtmlItem, with markup removed
    response_html = _first_text(activity.get("safeHtmlItem"), ("html",))
    response = _strip_html(response_html) if response_html else ""

    # If no content at all, skip
    if not prompt and not response:
        return None

    # Parse timestamp; a trailing "Z" is rewritten to an explicit UTC offset
    time_str = activity.get("time")
    if not isinstance(time_str, str) or not time_str:
        return None
    try:
        ts = dt.datetime.fromisoformat(time_str.replace("Z", "+00:00")).isoformat()
    except ValueError:
        return None

    # Title: the record's own title when it is a string, else the prompt
    raw_title = activity.get("title")
    title = raw_title if isinstance(raw_title, str) else prompt
    # Clean up "Asked Gemini" / "Talked to Bard" prefixes
    for prefix in _TITLE_PREFIXES:
        if title.startswith(prefix):
            title = title[len(prefix) :].strip().lstrip(":").strip()
            break
    if not title:
        title = prompt[:80] if prompt else "Gemini activity"

    # Build readable content
    parts: list[str] = []
    if prompt:
        parts.append(f"Human: {prompt}")
    if response:
        parts.append(f"Assistant: {response}")
    content = "\n\n".join(parts)

    # Detect source variant from either the products list or the header
    products = activity.get("products")
    if not isinstance(products, list):
        products = []
    header = activity.get("header")
    is_bard = any("bard" in str(p).lower() for p in products) or (
        isinstance(header, str) and "bard" in header.lower()
    )

    entry: dict[str, Any] = {
        "type": "ai_chat",
        "ts": ts,
        "title": title,
        "source": "gemini",
        "message_count": (1 if prompt else 0) + (1 if response else 0),
        "content": content,
    }
    if is_bard:
        entry["variant"] = "bard"

    return entry


class GeminiImporter:
    """File importer for Google Takeout Gemini/Bard activity exports."""

    name = "gemini"
    display_name = "Gemini Activity History"
    file_patterns = ["*.zip", "*.json"]
    description = "Import activity from Google Takeout Gemini/Bard export"

    def detect(self, path: Path) -> bool:
        """Return True if *path* looks like a Gemini/Bard activity export.

        Never raises for unreadable, corrupt, or unrelated files; those
        simply yield False.
        """
        suffix = path.suffix.lower()

        if path.is_file() and suffix == ".zip":
            try:
                with zipfile.ZipFile(path, "r") as zf:
                    names = set(zf.namelist())
            except (zipfile.BadZipFile, OSError):
                return False
            return any(p in names for p in _ACTIVITY_PATHS)

        if path.is_file() and suffix == ".json":
            try:
                with open(path, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except (ValueError, OSError):
                # ValueError covers both JSONDecodeError and UnicodeDecodeError
                return False
            if not isinstance(data, list) or not data:
                return False
            first = data[0]
            # Gemini activity records have "header" and "time" fields
            return isinstance(first, dict) and "header" in first and "time" in first

        if path.is_dir():
            if any((path / p).exists() for p in _ACTIVITY_PATHS):
                return True
            if (path / "MyActivity.json").exists():
                return True

        return False

    def preview(self, path: Path) -> ImportPreview:
        """Summarize what importing *path* would produce, without writing.

        Raises:
            ValueError: propagated from ``_load_activities`` when no usable
                activity file is found.
        """
        activities = _load_activities(path)
        if not activities:
            return ImportPreview(
                date_range=("", ""),
                item_count=0,
                entity_count=0,
                summary="Empty export — no Gemini activity found",
            )

        entries = [e for e in map(_parse_activity, activities) if e is not None]
        valid_count = len(entries)
        bard_count = sum(1 for e in entries if e.get("variant") == "bard")

        # "ts" was produced by isoformat(), so it always parses back. The day
        # is taken in the timestamp's own offset (UTC for "Z" inputs).
        dates = [
            dt.datetime.fromisoformat(e["ts"]).strftime("%Y%m%d") for e in entries
        ]
        date_range = (min(dates), max(dates)) if dates else ("", "")

        bard_info = f" ({bard_count} Bard-era)" if bard_count else ""
        return ImportPreview(
            date_range=date_range,
            item_count=valid_count,
            entity_count=0,
            summary=f"{valid_count} activities from Gemini export{bard_info}",
        )

    def process(
        self,
        path: Path,
        journal_root: Path,
        *,
        facet: str | None = None,
        progress_callback: Callable | None = None,
    ) -> ImportResult:
        """Parse the export at *path* and write its entries to the journal.

        Args:
            path: ZIP, directory, or JSON file to import.
            journal_root: Accepted but not forwarded to the writer.
                NOTE(review): the writer presumably resolves the journal
                location itself (e.g. from the environment) — confirm.
            facet: Forwarded unchanged to ``write_structured_import``.
            progress_callback: If given, called as ``(processed, total)``
                after every 100 records examined, parsed or skipped.

        Raises:
            ValueError: propagated from ``_load_activities``.
        """
        activities = _load_activities(path)
        import_id = dt.datetime.now().strftime("%Y%m%d_%H%M%S")

        entries: list[dict[str, Any]] = []
        errors: list[str] = []
        skipped = 0
        total = len(activities)

        for index, act in enumerate(activities, start=1):
            entry = _parse_activity(act)
            if entry is None:
                skipped += 1
            else:
                entries.append(entry)

            # Report progress on skipped records too, so a skipped 100th
            # record does not silently drop a progress tick.
            if progress_callback and index % 100 == 0:
                progress_callback(index, total)

        # Write to journal
        created_files = write_structured_import(
            "gemini",
            entries,
            import_id=import_id,
            facet=facet,
        )

        if skipped:
            logger.info(
                "Skipped %d activities with no content or no valid timestamp",
                skipped,
            )

        bard_count = sum(1 for e in entries if e.get("variant") == "bard")
        bard_info = f" ({bard_count} Bard-era)" if bard_count else ""

        return ImportResult(
            entries_written=len(entries),
            entities_seeded=0,
            files_created=created_files,
            errors=errors,
            summary=f"Imported {len(entries)} Gemini activities{bard_info} across {len(created_files)} days",
        )


importer = GeminiImporter()

Configure Feed

Configure Feed