Add import formatter + indexer integration for file importers

+295

tests/test_import_formatting.py

# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Tests for think.importers.formatting — the import JSONL formatter."""

import json
import os
import tempfile
from pathlib import Path

from think.importers.formatting import format_imported


def _make_entries(source: str, content_entries: list[dict]) -> list[dict]:
    """Return ``content_entries`` preceded by an import header for ``source``."""
    import_block = {"id": "20260101_120000", "source": source}
    header_line = {"import": import_block, "entry_count": len(content_entries)}
    return [header_line, *content_entries]


def test_empty_entries():
    """No input lines yields no chunks and empty metadata."""
    result_chunks, result_meta = format_imported([], None)
    assert result_chunks == []
    assert result_meta == {}


def test_header_only():
    """A lone header still reports the indexer agent but produces no chunks."""
    header_line = {"import": {"id": "t", "source": "ics"}, "entry_count": 0}
    result_chunks, result_meta = format_imported([header_line], None)
    assert result_chunks == []
    assert result_meta["indexer"]["agent"] == "import.ics"


def test_calendar_event():
    """Calendar events render title, duration, location, attendees and body."""
    event = {
        "type": "calendar_event",
        "ts": "2026-01-15T10:00:00",
        "title": "Team standup",
        "content": "Weekly sync",
        "duration_minutes": 30,
        "location": "Room 4B",
        "attendees": [
            {"name": "Alice", "email": "alice@co.com"},
            {"name": "Bob", "email": "bob@co.com"},
        ],
    }
    result_chunks, result_meta = format_imported(_make_entries("ics", [event]), None)
    assert len(result_chunks) == 1
    assert result_meta["indexer"]["agent"] == "import.ics"

    only_chunk = result_chunks[0]
    rendered = only_chunk["markdown"]
    expected_fragments = (
        "## Team standup",
        "30 min",
        "Room 4B",
        "Alice",
        "Bob",
        "Weekly sync",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
    assert only_chunk["timestamp"] > 0


def test_note():
    """Notes render title, tags, wikilinks and body."""
    note = {
        "type": "note",
        "ts": "2026-01-15T00:00:00",
        "title": "Project ideas",
        "content": "Some thoughts on the new project.",
        "tags": ["work", "ideas"],
        "wikilinks": ["Project Alpha", "Bob"],
    }
    result_chunks, result_meta = format_imported(
        _make_entries("obsidian", [note]), None
    )
    assert len(result_chunks) == 1
    assert result_meta["indexer"]["agent"] == "import.obsidian"

    rendered = result_chunks[0]["markdown"]
    for fragment in (
        "## Project ideas",
        "work, ideas",
        "Project Alpha",
        "Some thoughts",
    ):
        assert fragment in rendered


def test_highlight():
    """Kindle highlights render book, author, page and a quoted passage."""
    clipping = {
        "type": "highlight",
        "ts": "2026-01-10T08:30:00",
        "book_title": "Thinking, Fast and Slow",
        "author": "Daniel Kahneman",
        "content": "Nothing in life is as important as you think it is.",
        "clip_type": "highlight",
        "page": 402,
        "location": "6120-6121",
    }
    result_chunks, result_meta = format_imported(
        _make_entries("kindle", [clipping]), None
    )
    assert len(result_chunks) == 1
    assert result_meta["indexer"]["agent"] == "import.kindle"

    rendered = result_chunks[0]["markdown"]
    for fragment in (
        "Thinking, Fast and Slow",
        "Daniel Kahneman",
        "Page 402",
        "> Nothing in life",
    ):
        assert fragment in rendered


def test_highlight_note_type():
    """A clipping with clip_type "note" is prefixed with "Note:"."""
    clipping = {
        "type": "highlight",
        "ts": "2026-01-10T08:30:00",
        "book_title": "Some Book",
        "author": "",
        "content": "My personal note",
        "clip_type": "note",
    }
    result_chunks, _ = format_imported(_make_entries("kindle", [clipping]), None)
    rendered = result_chunks[0]["markdown"]
    assert "Note: My personal note" in rendered


def test_ai_chat():
    """AI chat entries render title, source, message count and transcript."""
    conversation = {
        "type": "ai_chat",
        "ts": "2026-01-20T14:00:00",
        "title": "Help with Python parsing",
        "source": "claude",
        "message_count": 12,
        "content": "Human: How do I parse JSON?\n\nAssistant: Use json.loads().",
    }
    result_chunks, result_meta = format_imported(
        _make_entries("claude", [conversation]), None
    )
    assert len(result_chunks) == 1
    assert result_meta["indexer"]["agent"] == "import.claude"

    rendered = result_chunks[0]["markdown"]
    for fragment in (
        "Help with Python parsing",
        "claude conversation",
        "12 messages",
        "json.loads()",
    ):
        assert fragment in rendered


def test_generic_entry():
    """Unknown entry types fall back to a title + content rendering."""
    unknown = {
        "type": "something_new",
        "ts": "2026-01-01T00:00:00",
        "title": "Mystery entry",
        "content": "Unknown format content.",
    }
    result_chunks, _ = format_imported(_make_entries("custom", [unknown]), None)
    assert len(result_chunks) == 1
    rendered = result_chunks[0]["markdown"]
    assert "Mystery entry" in rendered
    assert "Unknown format content" in rendered


def test_multiple_entries():
    """Each content entry becomes its own chunk, in input order."""
    morning = {
        "type": "calendar_event",
        "ts": "2026-01-15T09:00:00",
        "title": "Morning standup",
        "content": "",
    }
    afternoon = {
        "type": "calendar_event",
        "ts": "2026-01-15T14:00:00",
        "title": "Design review",
        "content": "Review new mockups",
    }
    result_chunks, _ = format_imported(
        _make_entries("ics", [morning, afternoon]), None
    )
    assert len(result_chunks) == 2

    first, second = result_chunks
    assert "Morning standup" in first["markdown"]
    assert "Design review" in second["markdown"]
    # Timestamps should be ordered
    assert first["timestamp"] < second["timestamp"]


def test_formatter_registration():
    """Verify the formatter is registered and discoverable."""
    from think.formatters import get_formatter

    resolved = get_formatter("20260115/import.ics/imported.jsonl")
    assert resolved is not None
    assert resolved.__name__ == "format_imported"


def test_formatter_registration_obsidian():
    """The registry pattern also matches Obsidian import paths."""
    from think.formatters import get_formatter

    resolved = get_formatter("20260115/import.obsidian/imported.jsonl")
    assert resolved is not None


def test_formatter_registration_kindle():
    """The registry pattern also matches Kindle import paths."""
    from think.formatters import get_formatter

    resolved = get_formatter("20260115/import.kindle/imported.jsonl")
    assert resolved is not None


def test_path_metadata_extraction():
    """Verify day is correctly extracted from import paths."""
    from think.formatters import extract_path_metadata

    ics_meta = extract_path_metadata("20260115/import.ics/imported.jsonl")
    assert ics_meta["day"] == "20260115"
    assert ics_meta["facet"] == ""

    obsidian_meta = extract_path_metadata("20260301/import.obsidian/imported.jsonl")
    assert obsidian_meta["day"] == "20260301"


def test_find_formattable_includes_imports():
    """Verify find_formattable_files picks up import JSONL."""
    from think.formatters import find_formattable_files

    with tempfile.TemporaryDirectory() as journal_root:
        # Create a fake import JSONL file containing only a header line
        target_dir = Path(journal_root) / "20260115" / "import.ics"
        target_dir.mkdir(parents=True)
        header_line = json.dumps(
            {"import": {"id": "t", "source": "ics"}, "entry_count": 0}
        )
        (target_dir / "imported.jsonl").write_text(header_line + "\n")

        discovered = find_formattable_files(journal_root)
        assert "20260115/import.ics/imported.jsonl" in discovered


def test_format_file_integration():
    """End-to-end: write import JSONL, format it, get chunks."""
    from think.formatters import format_file

    with tempfile.TemporaryDirectory() as journal_root:
        # format_file reads JOURNAL_PATH; remember the prior value so it can
        # be restored no matter how the assertions below turn out.
        previous_journal = os.environ.get("JOURNAL_PATH")
        os.environ["JOURNAL_PATH"] = journal_root

        try:
            target_dir = Path(journal_root) / "20260115" / "import.ics"
            target_dir.mkdir(parents=True)
            target_file = target_dir / "imported.jsonl"

            header_line = {"import": {"id": "t", "source": "ics"}, "entry_count": 1}
            event = {
                "type": "calendar_event",
                "ts": "2026-01-15T10:00:00",
                "title": "Lunch with Alice",
                "content": "",
            }
            payload = json.dumps(header_line) + "\n" + json.dumps(event) + "\n"
            target_file.write_text(payload)

            result_chunks, result_meta = format_file(str(target_file))
            assert len(result_chunks) == 1
            assert "Lunch with Alice" in result_chunks[0]["markdown"]
            assert result_meta["indexer"]["agent"] == "import.ics"
        finally:
            if previous_journal is None:
                os.environ.pop("JOURNAL_PATH", None)
            else:
                os.environ["JOURNAL_PATH"] = previous_journal

+2

think/formatters.py

··· 140 140 "facets/*/calendar/*.jsonl": ("think.events", "format_events", True), 141 141 "facets/*/todos/*.jsonl": ("apps.todos.todo", "format_todos", True), 142 142 "facets/*/logs/*.jsonl": ("think.facets", "format_logs", True), 143 + # Structured file imports (indexed) 144 + "*/import.*/imported.jsonl": ("think.importers.formatting", "format_imported", True), 143 145 # Raw transcripts — formattable but not indexed (agent outputs are more useful) 144 146 # Layout: day/stream/segment/audio.jsonl 145 147 "*/*/*/audio.jsonl": ("observe.hear", "format_audio", False),

+15

think/importers/cli.py

··· 617 617 result.entities_seeded, 618 618 len(result.files_created), 619 619 ) 620 + 621 + # Index imported files so they're searchable 622 + if result.files_created: 623 + _set_stage("indexing") 624 + import subprocess 625 + 626 + for created_file in result.files_created: 627 + try: 628 + subprocess.run( 629 + ["sol", "indexer", "--rescan-file", created_file], 630 + capture_output=True, 631 + timeout=60, 632 + ) 633 + except Exception as exc: 634 + logger.warning("Failed to index %s: %s", created_file, exc) 620 635 if args.json: 621 636 print( 622 637 json.dumps(

+208

think/importers/formatting.py

# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Formatter for structured import JSONL files.

Converts entries from file importers (ICS, Obsidian, Kindle, AI chat) into
markdown chunks for the search index. Each entry type gets a source-appropriate
markdown representation.

Import JSONL layout: YYYYMMDD/import.{source}/imported.jsonl
  Line 1: header {"import": {"id": "...", "source": "ics"}, "entry_count": N}
  Line 2+: entries with type, ts, content fields (source-specific extras)
"""

from __future__ import annotations

from datetime import datetime
from typing import Any


def _parse_ts(ts: Any) -> datetime | None:
    """Parse an ISO-8601 ``ts`` value; return None when it cannot be parsed."""
    if not isinstance(ts, str) or not ts:
        return None
    # datetime.fromisoformat() only accepts a trailing "Z" from Python 3.11 on;
    # normalise it to an explicit UTC offset so older interpreters parse it too.
    if ts[-1] in "Zz":
        ts = ts[:-1] + "+00:00"
    try:
        return datetime.fromisoformat(ts)
    except ValueError:
        return None


def _epoch_ms(ts: Any) -> int:
    """Return ``ts`` as epoch milliseconds, or 0 when it is missing/invalid."""
    parsed = _parse_ts(ts)
    if parsed is None:
        return 0
    try:
        return int(parsed.timestamp() * 1000)
    except (ValueError, OSError, OverflowError):
        # Out-of-range dates cannot be converted by the platform; treat them
        # like a missing timestamp instead of failing the whole file.
        return 0


def _text_field(entry: dict, key: str, default: str) -> str:
    """Return ``entry[key]`` as text, using ``default`` if absent or blank."""
    value = entry.get(key)
    if value is None:
        return default
    text = str(value)
    return text if text.strip() else default


def _content_text(entry: dict) -> str:
    """Return the entry's ``content`` as a string ("" when falsy)."""
    value = entry.get("content")
    return str(value) if value else ""


def _as_list(value: Any) -> list:
    """Coerce a scalar/None/sequence field into a list of items."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    # A bare string (or other scalar) is a single item, not a sequence of chars.
    return [value]


def _join_labels(values: Any) -> str:
    """Join tag-like values with ", ", stringifying items and dropping blanks."""
    labels = (str(v).strip() for v in _as_list(values) if v is not None)
    return ", ".join(label for label in labels if label)


def _attendee_names(attendees: Any) -> list[str]:
    """Extract display names from attendees given as dicts and/or strings."""
    names: list[str] = []
    for attendee in _as_list(attendees):
        if isinstance(attendee, dict):
            raw = attendee.get("name") or attendee.get("email") or ""
        else:
            raw = attendee
        name = str(raw).strip() if raw is not None else ""
        if name:
            names.append(name)
    return names


def format_imported(
    entries: list[dict],
    context: dict | None = None,
) -> tuple[list[dict], dict]:
    """Format structured import JSONL entries to markdown chunks.

    Args:
        entries: Raw JSONL entries (header + content entries). The first item
            is always treated as the header and never rendered.
        context: Optional context with file_path (not used by this formatter).

    Returns:
        Tuple of (chunks, meta) per formatter contract. Each chunk is a dict
        with ``markdown``, ``timestamp`` (epoch ms, 0 if ``ts`` is missing or
        unparseable) and ``source`` (the original entry). ``meta`` carries
        ``indexer.agent`` and a ``header`` line; it is empty for empty input.
    """
    meta: dict[str, Any] = {}
    chunks: list[dict[str, Any]] = []

    if not entries:
        return chunks, meta

    # First line is the header; tolerate a malformed one rather than crash.
    header_entry = entries[0] if isinstance(entries[0], dict) else {}
    import_info = header_entry.get("import")
    if not isinstance(import_info, dict):
        import_info = {}
    source = import_info.get("source", "unknown")
    entry_count = header_entry.get("entry_count", 0)

    meta["indexer"] = {"agent": f"import.{source}"}
    meta["header"] = f"# Imported from {source} ({entry_count} entries)"

    # Format each content entry (skip header)
    for entry in entries[1:]:
        if not isinstance(entry, dict):
            # Not a JSON object — nothing we can render.
            continue
        md = _format_entry(entry.get("type", ""), entry)
        if md:
            chunks.append(
                {
                    "markdown": md,
                    "timestamp": _epoch_ms(entry.get("ts")),
                    "source": entry,
                }
            )

    return chunks, meta


def _format_calendar_event(entry: dict) -> str:
    """Format a calendar event entry."""
    lines = [f"## {_text_field(entry, 'title', 'Untitled event')}"]

    # Time info: start time and duration share one " | "-separated line.
    time_parts = []
    start = _parse_ts(entry.get("ts"))
    if start is not None:
        try:
            time_parts.append(start.strftime("%I:%M %p"))
        except (ValueError, OSError):
            # Best effort: some platforms reject very old dates in strftime.
            pass
    duration = entry.get("duration_minutes")
    if duration:
        time_parts.append(f"{duration} min")
    if time_parts:
        lines.append(" | ".join(time_parts))

    location = entry.get("location")
    if location:
        lines.append(f"Location: {location}")

    names = _attendee_names(entry.get("attendees"))
    if names:
        lines.append(f"Attendees: {', '.join(names)}")

    content = _content_text(entry)
    if content:
        lines.append("")
        lines.append(content)

    return "\n".join(lines)


def _format_note(entry: dict) -> str:
    """Format a note entry (Obsidian/Logseq)."""
    lines = [f"## {_text_field(entry, 'title', 'Untitled note')}"]

    tags = _join_labels(entry.get("tags"))
    if tags:
        lines.append(f"Tags: {tags}")

    wikilinks = _join_labels(entry.get("wikilinks"))
    if wikilinks:
        lines.append(f"Links: {wikilinks}")

    content = _content_text(entry)
    if content:
        lines.append("")
        lines.append(content)

    return "\n".join(lines)


def _format_highlight(entry: dict) -> str:
    """Format a Kindle highlight entry."""
    header = f"## {_text_field(entry, 'book_title', 'Unknown book')}"
    author = entry.get("author", "")
    if author:
        header += f" by {author}"
    lines = [header]

    loc_parts = []
    page = entry.get("page")
    location = entry.get("location")
    if page:
        loc_parts.append(f"Page {page}")
    if location:
        loc_parts.append(f"Location {location}")
    if loc_parts:
        lines.append(" | ".join(loc_parts))

    content = _content_text(entry)
    if content:
        lines.append("")
        # The reader's own notes are labelled; highlighted passages are quoted.
        if entry.get("clip_type", "highlight") == "note":
            lines.append(f"Note: {content}")
        else:
            lines.append(f"> {content}")

    return "\n".join(lines)


def _format_ai_chat(entry: dict) -> str:
    """Format an AI chat conversation entry."""
    source = _text_field(entry, "source", "ai")
    msg_count = entry.get("message_count") or 0

    lines = [f"## {_text_field(entry, 'title', 'Untitled conversation')}"]
    lines.append(f"{source} conversation, {msg_count} messages")

    content = _content_text(entry)
    if content:
        lines.append("")
        lines.append(content)

    return "\n".join(lines)


def _format_generic(entry: dict) -> str:
    """Format an unknown entry type; returns "" if it has no title or content."""
    title = _text_field(entry, "title", "")
    content = _content_text(entry)

    lines = []
    if title:
        lines.append(f"## {title}")
    if content:
        if lines:
            lines.append("")
        lines.append(content)

    return "\n".join(lines)


# Entry "type" value -> renderer. Anything not listed uses _format_generic.
_ENTRY_FORMATTERS = {
    "calendar_event": _format_calendar_event,
    "note": _format_note,
    "highlight": _format_highlight,
    "ai_chat": _format_ai_chat,
}


def _format_entry(entry_type: str, entry: dict) -> str:
    """Dispatch to type-specific formatter."""
    formatter = _format_generic
    # Guard the lookup: a non-string (possibly unhashable) type is "unknown".
    if isinstance(entry_type, str):
        formatter = _ENTRY_FORMATTERS.get(entry_type, _format_generic)
    return formatter(entry)

Configure Feed

Configure Feed