personal memory agent
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Add import content manifests and review UI

Generate content_manifest.jsonl in all six file importers and thread import_id through the FileImporter protocol so manifests land in the canonical imports/<timestamp> directory.

Add map_items_to_segments() for segment mapping, lazy manifest backfill via generate_content_manifest(), and two new import content API endpoints for paginated/filterable content lists and item detail.

Extend the import detail view with a Content tab, search debounce, month filter, expand-on-click detail loading, and #content hash activation. Add a "Browse what was imported" CTA to the workspace completion summary.

Add fixture imports plus unit/API coverage for manifest writing, backfill, pagination/filtering, detail lookup, missing-import handling, and importer import_id threading. Update affected API baselines for the new fixture corpus.

+1337 -47
+215
apps/import/_detail.html
··· 50 50 51 51 .no-data { color: #999; font-style: italic; padding: 2em; text-align: center; } 52 52 53 + /* Content tab */ 54 + .import-content-summary { margin-bottom: 1em; padding: 0.75em 1em; background: #f0f7ff; border-radius: 4px; font-size: 0.95em; } 55 + .import-content-controls { display: flex; gap: 0.75em; margin-bottom: 1em; } 56 + .import-content-search { flex: 1; padding: 8px 12px; border: 1px solid #ddd; border-radius: 4px; font-size: 0.9em; } 57 + .import-content-month { padding: 8px 12px; border: 1px solid #ddd; border-radius: 4px; font-size: 0.9em; background: #fff; } 58 + .import-content-items { } 59 + .import-content-item { border: 1px solid #e5e5e5; border-radius: 4px; margin-bottom: 0.5em; cursor: pointer; transition: border-color 0.15s; } 60 + .import-content-item:hover { border-color: #007bff; } 61 + .import-content-item-header { padding: 0.75em 1em; display: flex; align-items: center; gap: 0.5em; } 62 + .import-content-item-title { font-weight: 600; flex: 1; } 63 + .import-content-item-date { color: #888; font-size: 0.85em; white-space: nowrap; } 64 + .import-content-item-type { color: #666; font-size: 0.8em; background: #f0f0f0; padding: 2px 6px; border-radius: 3px; } 65 + .import-content-item-preview { color: #666; font-size: 0.9em; padding: 0 1em 0.5em; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; } 66 + .import-content-item-body { display: none; padding: 0.75em 1em; border-top: 1px solid #e5e5e5; background: #fafafa; } 67 + .import-content-item.expanded .import-content-item-body { display: block; } 68 + .import-content-item-body pre { white-space: pre-wrap; font-size: 0.9em; margin: 0; } 69 + .import-content-item-body .import-content-message { margin: 0.5em 0; } 70 + .import-content-item-body .import-content-speaker { font-weight: 600; margin-right: 0.5em; } 71 + .import-content-item-body .import-content-speaker.human { color: #007bff; } 72 + .import-content-item-body .import-content-speaker.assistant { color: 
#28a745; } 73 + .import-content-pagination { display: flex; gap: 0.5em; justify-content: center; margin-top: 1em; } 74 + .import-content-pagination button { padding: 6px 14px; border: 1px solid #ddd; border-radius: 4px; background: #fff; cursor: pointer; font-size: 0.9em; } 75 + .import-content-pagination button:hover { background: #f5f5f5; } 76 + .import-content-pagination button.active { background: #007bff; color: #fff; border-color: #007bff; } 77 + .import-content-pagination button:disabled { opacity: 0.5; cursor: not-allowed; } 78 + 53 79 </style> 54 80 55 81 <div class="import-detail-content"> ··· 64 90 65 91 <div class="tabs"> 66 92 <div class="tab active" data-target="overview">Overview</div> 93 + <div class="tab" data-target="content">Content</div> 67 94 <div class="tab" data-target="import-json">Upload Metadata</div> 68 95 <div class="tab" data-target="imported-json">Processing Results</div> 69 96 </div> ··· 84 111 </div> 85 112 </div> 86 113 114 + <div class="tab-content" id="content"> 115 + <div id="contentLoading" class="no-data">Select this tab to browse imported content</div> 116 + <div id="contentContainer" style="display:none;"> 117 + <div class="import-content-summary" id="contentSummary"></div> 118 + <div class="import-content-controls"> 119 + <input type="text" id="contentSearch" class="import-content-search" placeholder="Search items..." 
/> 120 + <select id="contentMonth" class="import-content-month"> 121 + <option value="">All months</option> 122 + </select> 123 + </div> 124 + <div id="contentItems" class="import-content-items"></div> 125 + <div id="contentPagination" class="import-content-pagination"></div> 126 + </div> 127 + </div> 128 + 87 129 <div class="tab-content" id="import-json"> 88 130 <div class="json-viewer"> 89 131 <pre id="importJsonContent">Loading...</pre> ··· 102 144 // Tab switching 103 145 const tabs = document.querySelectorAll('.tab'); 104 146 const contents = document.querySelectorAll('.tab-content'); 147 + let contentLoaded = false; 148 + let contentPage = 1; 149 + let contentSearch = ''; 150 + let contentMonth = ''; 151 + let searchDebounce = null; 105 152 106 153 tabs.forEach(tab => { 107 154 tab.addEventListener('click', () => { 108 155 const target = tab.dataset.target; 109 156 tabs.forEach(t => t.classList.toggle('active', t.dataset.target === target)); 110 157 contents.forEach(c => c.classList.toggle('active', c.id === target)); 158 + if (target === 'content' && !contentLoaded) { 159 + loadContent(1); 160 + } 111 161 }); 112 162 }); 113 163 164 + function loadContent(page = 1) { 165 + const container = document.getElementById('contentContainer'); 166 + const loading = document.getElementById('contentLoading'); 167 + loading.textContent = 'Loading content...'; 168 + loading.style.display = 'block'; 169 + 170 + const params = new URLSearchParams({page, per_page: 50}); 171 + if (contentSearch) params.set('q', contentSearch); 172 + if (contentMonth) params.set('month', contentMonth); 173 + 174 + fetch(`/app/import/api/{{ timestamp }}/content?${params}`) 175 + .then(r => r.json()) 176 + .then(data => { 177 + if (data.error) { 178 + loading.textContent = data.error; 179 + return; 180 + } 181 + loading.style.display = 'none'; 182 + container.style.display = 'block'; 183 + contentPage = data.page; 184 + renderContentSummary(data); 185 + renderContentMonths(data.months); 186 + 
renderContentItems(data.items); 187 + renderContentPagination(data); 188 + contentLoaded = true; 189 + }) 190 + .catch(err => { 191 + loading.textContent = 'Error loading content'; 192 + console.error('Content load error:', err); 193 + }); 194 + } 195 + 196 + function renderContentSummary(data) { 197 + const el = document.getElementById('contentSummary'); 198 + const emoji = data.source_emoji || ''; 199 + const display = data.source_display || ''; 200 + el.innerHTML = `${emoji} <strong>${display}</strong> — ${data.total} items`; 201 + } 202 + 203 + function renderContentMonths(months) { 204 + const select = document.getElementById('contentMonth'); 205 + const current = select.value; 206 + select.innerHTML = '<option value="">All months</option>'; 207 + for (const [month, count] of Object.entries(months)) { 208 + const label = month.slice(0, 4) + '-' + month.slice(4, 6); 209 + const opt = document.createElement('option'); 210 + opt.value = month; 211 + opt.textContent = `${label} (${count})`; 212 + select.appendChild(opt); 213 + } 214 + select.value = current; 215 + } 216 + 217 + function renderContentItems(items) { 218 + const el = document.getElementById('contentItems'); 219 + if (!items.length) { 220 + el.innerHTML = '<div class="no-data">No items found</div>'; 221 + return; 222 + } 223 + el.innerHTML = items.map(item => { 224 + const typeLabel = { 225 + conversation: 'Chat', 226 + event: 'Event', 227 + highlight_group: 'Highlights', 228 + note: 'Note', 229 + }[item.type] || item.type; 230 + const date = item.date ? 
`${item.date.slice(0, 4)}-${item.date.slice(4, 6)}-${item.date.slice(6, 8)}` : ''; 231 + return ` 232 + <div class="import-content-item" data-item-id="${item.id}"> 233 + <div class="import-content-item-header" onclick="toggleContentItem(this.parentElement)"> 234 + <span class="import-content-item-type">${typeLabel}</span> 235 + <span class="import-content-item-title">${escapeContentHtml(item.title)}</span> 236 + <span class="import-content-item-date">${date}</span> 237 + </div> 238 + ${item.preview ? `<div class="import-content-item-preview">${escapeContentHtml(item.preview)}</div>` : ''} 239 + <div class="import-content-item-body"><div class="no-data">Loading...</div></div> 240 + </div>`; 241 + }).join(''); 242 + } 243 + 244 + function toggleContentItem(el) { 245 + const wasExpanded = el.classList.contains('expanded'); 246 + document.querySelectorAll('.import-content-item.expanded').forEach(item => { 247 + item.classList.remove('expanded'); 248 + }); 249 + if (wasExpanded) return; 250 + 251 + el.classList.add('expanded'); 252 + const body = el.querySelector('.import-content-item-body'); 253 + const itemId = el.dataset.itemId; 254 + 255 + fetch(`/app/import/api/{{ timestamp }}/content/${itemId}`) 256 + .then(r => r.json()) 257 + .then(data => { 258 + if (data.error) { 259 + body.innerHTML = `<div class="no-data">${data.error}</div>`; 260 + return; 261 + } 262 + body.innerHTML = renderContentDetail(data); 263 + }) 264 + .catch(() => { 265 + body.innerHTML = '<div class="no-data">Error loading content</div>'; 266 + }); 267 + } 268 + 269 + function renderContentDetail(data) { 270 + if (!data.content || !data.content.length) { 271 + return '<div class="no-data">No content available</div>'; 272 + } 273 + 274 + return data.content.map(part => { 275 + if (part.type === 'markdown') { 276 + return `<pre>${escapeContentHtml(part.content)}</pre>`; 277 + } 278 + const speaker = part.speaker || 'Unknown'; 279 + const cls = speaker === 'Human' ? 
'human' : 'assistant'; 280 + return `<div class="import-content-message"><span class="import-content-speaker ${cls}">${speaker}:</span> ${escapeContentHtml(part.text || '')}</div>`; 281 + }).join(''); 282 + } 283 + 284 + function renderContentPagination(data) { 285 + const el = document.getElementById('contentPagination'); 286 + if (data.pages <= 1) { 287 + el.innerHTML = ''; 288 + return; 289 + } 290 + let html = ''; 291 + html += `<button ${data.page <= 1 ? 'disabled' : ''} onclick="loadContent(${data.page - 1})">Prev</button>`; 292 + const start = Math.max(1, data.page - 2); 293 + const end = Math.min(data.pages, data.page + 2); 294 + for (let i = start; i <= end; i++) { 295 + html += `<button class="${i === data.page ? 'active' : ''}" onclick="loadContent(${i})">${i}</button>`; 296 + } 297 + html += `<button ${data.page >= data.pages ? 'disabled' : ''} onclick="loadContent(${data.page + 1})">Next</button>`; 298 + el.innerHTML = html; 299 + } 300 + 301 + function escapeContentHtml(str) { 302 + if (!str) return ''; 303 + return str 304 + .replace(/&/g, '&amp;') 305 + .replace(/</g, '&lt;') 306 + .replace(/>/g, '&gt;') 307 + .replace(/"/g, '&quot;'); 308 + } 309 + 310 + document.getElementById('contentSearch').addEventListener('input', function() { 311 + clearTimeout(searchDebounce); 312 + searchDebounce = setTimeout(() => { 313 + contentSearch = this.value.trim(); 314 + loadContent(1); 315 + }, 300); 316 + }); 317 + 318 + document.getElementById('contentMonth').addEventListener('change', function() { 319 + contentMonth = this.value; 320 + loadContent(1); 321 + }); 322 + 114 323 // Format JSON with syntax highlighting 115 324 function formatJson(obj) { 116 325 if (!obj) return '<span class="json-null">No data available</span>'; ··· 267 476 document.getElementById('importJsonContent').innerHTML = '<span class="no-data">Error loading data</span>'; 268 477 document.getElementById('importedJsonContent').innerHTML = '<span class="no-data">Error loading data</span>'; 
269 478 }); 479 + 480 + if (window.location.hash === '#content') { 481 + tabs.forEach(t => t.classList.toggle('active', t.dataset.target === 'content')); 482 + contents.forEach(c => c.classList.toggle('active', c.id === 'content')); 483 + loadContent(1); 484 + } 270 485 </script>
+185
apps/import/routes.py
··· 3 3 4 4 from __future__ import annotations 5 5 6 + import json 6 7 import re 7 8 import time 8 9 from pathlib import Path ··· 15 16 from think.detect_created import detect_created 16 17 from think.importers.utils import ( 17 18 build_import_info, 19 + generate_content_manifest, 18 20 get_import_details, 19 21 list_import_timestamps, 20 22 read_import_metadata, ··· 464 466 return jsonify(result) 465 467 except FileNotFoundError: 466 468 return jsonify({"error": "Import not found"}), 404 469 + 470 + 471 + @import_bp.route("/api/<timestamp>/content") 472 + def import_content_list(timestamp: str) -> Any: 473 + """Get paginated content items for an import.""" 474 + journal_root = Path(state.journal_root) 475 + import_dir = journal_root / "imports" / timestamp 476 + if not import_dir.exists(): 477 + return jsonify({"error": "Import not found"}), 404 478 + 479 + manifest_path = import_dir / "content_manifest.jsonl" 480 + if ( 481 + not manifest_path.exists() 482 + and generate_content_manifest(journal_root, timestamp) is None 483 + ): 484 + return jsonify({"error": "No content available"}), 404 485 + 486 + items: list[dict] = [] 487 + try: 488 + with open(manifest_path, "r", encoding="utf-8") as f: 489 + for line in f: 490 + line = line.strip() 491 + if not line: 492 + continue 493 + try: 494 + items.append(json.loads(line)) 495 + except json.JSONDecodeError: 496 + continue 497 + except OSError: 498 + return jsonify({"error": "Failed to read manifest"}), 500 499 + 500 + source_type = "" 501 + imported_path = import_dir / "imported.json" 502 + if imported_path.exists(): 503 + try: 504 + imported = json.loads(imported_path.read_text(encoding="utf-8")) 505 + source_type = imported.get("source_type", "") 506 + except (OSError, json.JSONDecodeError): 507 + pass 508 + 509 + source_meta = next((s for s in SOURCE_METADATA if s["name"] == source_type), None) 510 + 511 + month_counts: dict[str, int] = {} 512 + for item in items: 513 + date = item.get("date", "") 514 + if 
len(date) >= 6: 515 + month = date[:6] 516 + month_counts[month] = month_counts.get(month, 0) + 1 517 + 518 + q = request.args.get("q", "").strip().lower() 519 + month = request.args.get("month", "").strip() 520 + 521 + filtered = items 522 + if month: 523 + filtered = [item for item in filtered if item.get("date", "").startswith(month)] 524 + if q: 525 + filtered = [ 526 + item 527 + for item in filtered 528 + if q in item.get("title", "").lower() 529 + or q in item.get("preview", "").lower() 530 + ] 531 + 532 + try: 533 + page = max(1, int(request.args.get("page", 1))) 534 + except ValueError: 535 + page = 1 536 + try: 537 + per_page = min(100, max(1, int(request.args.get("per_page", 50)))) 538 + except ValueError: 539 + per_page = 50 540 + total = len(filtered) 541 + start = (page - 1) * per_page 542 + page_items = filtered[start : start + per_page] 543 + 544 + return jsonify( 545 + { 546 + "items": page_items, 547 + "total": total, 548 + "page": page, 549 + "per_page": per_page, 550 + "pages": (total + per_page - 1) // per_page if total > 0 else 0, 551 + "months": dict(sorted(month_counts.items())), 552 + "source_type": source_type, 553 + "source_display": source_meta["display_name"] 554 + if source_meta 555 + else source_type, 556 + "source_emoji": source_meta["emoji"] if source_meta else "", 557 + } 558 + ) 559 + 560 + 561 + @import_bp.route("/api/<timestamp>/content/<item_id>") 562 + def import_content_detail(timestamp: str, item_id: str) -> Any: 563 + """Get full content for a specific imported item.""" 564 + journal_root = Path(state.journal_root) 565 + import_dir = journal_root / "imports" / timestamp 566 + if not import_dir.exists(): 567 + return jsonify({"error": "Import not found"}), 404 568 + 569 + manifest_path = import_dir / "content_manifest.jsonl" 570 + if not manifest_path.exists(): 571 + generate_content_manifest(journal_root, timestamp) 572 + if not manifest_path.exists(): 573 + return jsonify({"error": "No content available"}), 404 574 + 575 + 
item = None 576 + try: 577 + with open(manifest_path, "r", encoding="utf-8") as f: 578 + for line in f: 579 + line = line.strip() 580 + if not line: 581 + continue 582 + try: 583 + entry = json.loads(line) 584 + except json.JSONDecodeError: 585 + continue 586 + if entry.get("id") == item_id: 587 + item = entry 588 + break 589 + except OSError: 590 + return jsonify({"error": "Failed to read manifest"}), 500 591 + 592 + if item is None: 593 + return jsonify({"error": "Item not found"}), 404 594 + 595 + source_type = "" 596 + imported_path = import_dir / "imported.json" 597 + if imported_path.exists(): 598 + try: 599 + imported = json.loads(imported_path.read_text(encoding="utf-8")) 600 + source_type = imported.get("source_type", "") 601 + except (OSError, json.JSONDecodeError): 602 + pass 603 + 604 + content_parts: list[dict] = [] 605 + for seg in item.get("segments", []): 606 + day = seg.get("day", "") 607 + key = seg.get("key", "") 608 + if not day or not key: 609 + continue 610 + seg_dir = journal_root / day / f"import.{source_type}" / key 611 + if not seg_dir.exists(): 612 + continue 613 + 614 + jsonl_path = seg_dir / "conversation_transcript.jsonl" 615 + if jsonl_path.exists(): 616 + try: 617 + lines = jsonl_path.read_text(encoding="utf-8").strip().split("\n") 618 + except OSError: 619 + continue 620 + for line in lines[1:]: 621 + try: 622 + content_parts.append(json.loads(line)) 623 + except json.JSONDecodeError: 624 + continue 625 + continue 626 + 627 + for md_file in seg_dir.glob("*_transcript.md"): 628 + try: 629 + md_content = md_file.read_text(encoding="utf-8") 630 + except OSError: 631 + continue 632 + title = item.get("title", "") 633 + if title: 634 + sections = re.split(r"(?m)^## ", md_content) 635 + for section in sections: 636 + stripped = section.strip() 637 + if stripped.startswith(title): 638 + content_parts.append( 639 + {"type": "markdown", "content": "## " + stripped} 640 + ) 641 + break 642 + else: 643 + content_parts.append( 644 + {"type": 
"markdown", "content": md_content.strip()} 645 + ) 646 + else: 647 + content_parts.append( 648 + {"type": "markdown", "content": md_content.strip()} 649 + ) 650 + 651 + return jsonify({"item": item, "content": content_parts}) 467 652 468 653 469 654 @import_bp.route("/api/start", methods=["POST"])
+1
apps/import/workspace.html
··· 1352 1352 <div><strong>Date range:</strong> ${escapeHtml(formatDateRange(range) || '-')}</div> 1353 1353 <div><strong>Duration:</strong> ${formatElapsed(completedInfo.duration_ms || 0)}</div> 1354 1354 <div class="import-action-row"> 1355 + <a class="import-secondary-btn" href="/app/import/${completedInfo.processed_timestamp || importId}#content">Browse what was imported →</a> 1355 1356 ${day ? `<a class="import-secondary-btn" href="/app/calendar/${day}">View in calendar</a>` : ''} 1356 1357 <a href="#" class="import-secondary-btn" onclick="showGrid(); return false;">Import another source</a> 1357 1358 </div>
+54 -1
tests/baselines/api/import/list.json
··· 1 - [] 1 + [ 2 + { 3 + "created_at": 0, 4 + "date_range": [ 5 + "20260101", 6 + "20260101" 7 + ], 8 + "entities_seeded": 0, 9 + "entries_written": 18, 10 + "error": null, 11 + "error_stage": null, 12 + "facet": "work", 13 + "file_size": 123456, 14 + "imported_at": 1767261600.0, 15 + "mime_type": "application/zip", 16 + "original_filename": "chatgpt-export.zip", 17 + "processed": true, 18 + "setting": "ai_conversation", 19 + "source_display": "ChatGPT", 20 + "source_type": "chatgpt", 21 + "status": "success", 22 + "target_day": "20260101", 23 + "task_id": null, 24 + "timestamp": "20260101_100000", 25 + "total_files_created": 3, 26 + "user_timestamp": "20260101_100000" 27 + }, 28 + { 29 + "created_at": 0, 30 + "date_range": [ 31 + "20260101", 32 + "20260101" 33 + ], 34 + "entities_seeded": 0, 35 + "entries_written": 5, 36 + "error": null, 37 + "error_stage": null, 38 + "facet": "work", 39 + "file_size": 45678, 40 + "imported_at": 1767258000.0, 41 + "mime_type": "application/zip", 42 + "original_filename": "calendar-export.zip", 43 + "processed": true, 44 + "setting": "calendar", 45 + "source_display": "Calendar", 46 + "source_type": "ics", 47 + "status": "success", 48 + "target_day": "20260101", 49 + "task_id": null, 50 + "timestamp": "20260101_090000", 51 + "total_files_created": 2, 52 + "user_timestamp": "20260101_090000" 53 + } 54 + ]
+1 -1
tests/baselines/api/search/day-results.json
··· 14 14 "id": "20260304/agents/knowledge_graph.md:7", 15 15 "idx": 7, 16 16 "path": "20260304/agents/knowledge_graph.md", 17 - "score": -2.6, 17 + "score": -2.0, 18 18 "stream": null, 19 19 "text": "# Part 1: Entity Extraction and Relationship Mapping\n\n## Relationship Mapping\n\n| Source Name | Target Name | Relationship Type | Context |\n| :--- | :--- | :--- | :--- |\n| **Romeo Montague** | **Juliet Capulet** | `met-at-conference` | First <strong>meeting</strong> at Denver Tech Summit keynote. |\n" 20 20 }
+34 -34
tests/baselines/api/search/search.json
··· 79 79 "id": "20260306/default/093000_300/agents/audio.md:0", 80 80 "idx": 0, 81 81 "path": "20260306/default/093000_300/agents/audio.md", 82 - "score": -2.7, 82 + "score": -1.7, 83 83 "stream": "default", 84 84 "text": "# Audio Summary Morning standup. Benvolio noticed <strong>Romeo</strong>'s late-night GitHub activity and pressed him about API gateway commits. <strong>Romeo</strong> deflected, calling it a personal mesh routing prototype. Mercutio covered for him. Balthasar reported progress on the mesh routing fallback PR with an edge case for <strong>Romeo</strong> to review. Benvolio scheduled..." 85 85 }, ··· 95 95 "id": "facets/montague/entities/20260306.jsonl:0", 96 96 "idx": 0, 97 97 "path": "facets/montague/entities/20260306.jsonl", 98 - "score": -3.3, 98 + "score": -2.1, 99 99 "stream": null, 100 100 "text": "### Person: <strong>Romeo</strong> Montague\n\n\nContinued Verona Platform development\n\n" 101 101 }, ··· 111 111 "id": "facets/montague/entities/20260306.jsonl:3", 112 112 "idx": 3, 113 113 "path": "facets/montague/entities/20260306.jsonl", 114 - "score": -3.1, 114 + "score": -2.0, 115 115 "stream": null, 116 116 "text": "### Person: Balthasar Davi\n\n\nReviewed mesh routing PR with <strong>Romeo</strong>\n\n" 117 117 }, ··· 127 127 "id": "facets/montague/entities/20260306.jsonl:4", 128 128 "idx": 4, 129 129 "path": "facets/montague/entities/20260306.jsonl", 130 - "score": -3.2, 130 + "score": -2.1, 131 131 "stream": null, 132 132 "text": "### Person: Mercutio Escalus\n\n\nCovered for <strong>Romeo</strong> during standup\n\n" 133 133 }, ··· 143 143 "id": "20260306/default/093000_300/agents/screen.md:0", 144 144 "idx": 0, 145 145 "path": "20260306/default/093000_300/agents/screen.md", 146 - "score": -2.9, 146 + "score": -1.9, 147 147 "stream": "default", 148 148 "text": "# Screen Summary\n\nSlack standup channel. 
Benvolio questioning <strong>Romeo</strong> about late-night commits.\n" 149 149 } ··· 168 168 "id": "facets/verona/logs/20260309.jsonl:1", 169 169 "idx": 1, 170 170 "path": "facets/verona/logs/20260309.jsonl", 171 - "score": -2.5, 171 + "score": -1.6, 172 172 "stream": null, 173 173 "text": "### Deploy Complete by <strong>romeo</strong>_montague\n\n**Source:** deploy | **Time:** 13:45:00\n\n**Parameters:**\n- service: verona-gateway\n- version: 0.9.0\n" 174 174 }, ··· 184 184 "id": "20260309/default/090000_300/agents/audio.md:0", 185 185 "idx": 0, 186 186 "path": "20260309/default/090000_300/agents/audio.md", 187 - "score": -2.3, 187 + "score": -1.5, 188 188 "stream": "default", 189 189 "text": "# Audio Summary\n\n<strong>Romeo</strong> confessed the project to Benvolio and asked for infrastructure help. Benvolio agreed to spin up a Kubernetes staging cluster.\n" 190 190 }, ··· 200 200 "id": "facets/montague/entities/20260309.jsonl:0", 201 201 "idx": 0, 202 202 "path": "facets/montague/entities/20260309.jsonl", 203 - "score": -3.1, 203 + "score": -2.0, 204 204 "stream": null, 205 205 "text": "### Person: <strong>Romeo</strong> Montague\n\n\nConfessed project to Benvolio, preparing demo\n\n" 206 206 }, ··· 216 216 "id": "facets/montague/calendar/20260309.jsonl:0", 217 217 "idx": 0, 218 218 "path": "facets/montague/calendar/20260309.jsonl", 219 - "score": -2.7, 219 + "score": -1.7, 220 220 "stream": null, 221 221 "text": "### Event: Team Standup\n\n\n**Time Occurred:** 09:00 - 09:30\n**Participants:** <strong>Romeo</strong> Montague, Benvolio Montague\n\nDaily sync\n" 222 222 }, ··· 232 232 "id": "facets/verona/calendar/20260309.jsonl:0", 233 233 "idx": 0, 234 234 "path": "facets/verona/calendar/20260309.jsonl", 235 - "score": -2.4, 235 + "score": -1.5, 236 236 "stream": null, 237 237 "text": "### Event: Demo Sprint\n\n\n**Time Occurred:** 09:00 - 21:00\n**Participants:** <strong>Romeo</strong> Montague, Juliet Capulet, Benvolio Montague\n\nFull day board 
presentation preparation\n" 238 238 } ··· 257 257 "id": "20260307/default/100000_300/agents/audio.md:0", 258 258 "idx": 0, 259 259 "path": "20260307/default/100000_300/agents/audio.md", 260 - "score": -3.2, 260 + "score": -2.0, 261 261 "stream": "default", 262 262 "text": "# Audio Summary\n\nHeated confrontation. Tybalt Capulet accused <strong>Romeo</strong> of stealing Capulet IP. Mercutio defended <strong>Romeo</strong> and had his Capulet consulting contract terminated by Tybalt.\n" 263 263 }, ··· 273 273 "id": "20260307/default/150000_300/agents/audio.md:0", 274 274 "idx": 0, 275 275 "path": "20260307/default/150000_300/agents/audio.md", 276 - "score": -3.4, 276 + "score": -2.2, 277 277 "stream": "default", 278 278 "text": "# Audio Summary\n\nEmergency meeting at Montague Tech. Benvolio questioned <strong>Romeo</strong> about the secret project. <strong>Romeo</strong> clarified no company IP was shared. Team discussed legal exposure. <strong>Romeo</strong> proposed Professor Lawrence as mediator.\n" 279 279 }, ··· 289 289 "id": "facets/montague/entities/20260307.jsonl:0", 290 290 "idx": 0, 291 291 "path": "facets/montague/entities/20260307.jsonl", 292 - "score": -3.1, 292 + "score": -2.0, 293 293 "stream": null, 294 294 "text": "### Person: <strong>Romeo</strong> Montague\n\n\nConfronted by Tybalt, called emergency meeting\n\n" 295 295 }, ··· 305 305 "id": "facets/montague/calendar/20260307.jsonl:0", 306 306 "idx": 0, 307 307 "path": "facets/montague/calendar/20260307.jsonl", 308 - "score": -2.4, 308 + "score": -1.6, 309 309 "stream": null, 310 310 "text": "### Event: Emergency Team Meeting\n\n\n**Time Occurred:** 15:00 - 16:00\n**Participants:** <strong>Romeo</strong> Montague, Benvolio Montague\n\nCrisis response to Capulet situation\n" 311 311 }, ··· 321 321 "id": "facets/montague/events/20260307.jsonl:0", 322 322 "idx": 0, 323 323 "path": "facets/montague/events/20260307.jsonl", 324 - "score": -3.0, 324 + "score": -1.9, 325 325 "stream": null, 326 326 
"text": "### Meeting: Confrontation with Tybalt\n\n\n**Time Occurred:** 10:00 - 10:30\n**Participants:** <strong>Romeo</strong> Montague, Tybalt Capulet, Mercutio Escalus\n\nTybalt accused <strong>Romeo</strong> of IP theft\n\nMercutio fired from Capulet contract\n" 327 327 } ··· 346 346 "id": "facets/montague/entities/20260308.jsonl:0", 347 347 "idx": 0, 348 348 "path": "facets/montague/entities/20260308.jsonl", 349 - "score": -3.1, 349 + "score": -2.0, 350 350 "stream": null, 351 351 "text": "### Person: <strong>Romeo</strong> Montague\n\n\nUnder board pressure, planning board presentation\n\n" 352 352 }, ··· 362 362 "id": "facets/verona/events/20260308.jsonl:0", 363 363 "idx": 0, 364 364 "path": "facets/verona/events/20260308.jsonl", 365 - "score": -2.1, 365 + "score": -1.4, 366 366 "stream": null, 367 367 "text": "### Meeting: Strategy Call with Professor Lawrence\n\n\n**Time Occurred:** 10:00 - 11:00\n**Participants:** <strong>Romeo</strong> Montague, Juliet Capulet, Friar Lawrence\n\nJoint venture strategy planning\n\nProposed board presentation strategy\n" 368 368 }, ··· 378 378 "id": "20260308/agents/knowledge_graph.md:2", 379 379 "idx": 2, 380 380 "path": "20260308/agents/knowledge_graph.md", 381 - "score": -2.0, 381 + "score": -1.3, 382 382 "stream": null, 383 383 "text": "# Part 1: Entity Extraction and Relationship Mapping ## Entity Profiles | Entity Name | Entity Type | First Appearance | Total Engagement | Context | | :--- | :--- | :--- | :--- | :--- | | **<strong>Romeo</strong> Montague** | Person | 10:00 | High | Under board pressure,..." 
384 384 }, ··· 394 394 "id": "20260308/agents/meetings.md:0", 395 395 "idx": 0, 396 396 "path": "20260308/agents/meetings.md", 397 - "score": -3.0, 397 + "score": -1.9, 398 398 "stream": null, 399 399 "text": "# Meetings\n\n- 10:00 Strategy Call with Professor Lawrence, <strong>Romeo</strong>, and Juliet\n" 400 400 } ··· 419 419 "id": "facets/verona/logs/20260305.jsonl:0", 420 420 "idx": 0, 421 421 "path": "facets/verona/logs/20260305.jsonl", 422 - "score": -2.6, 422 + "score": -1.7, 423 423 "stream": null, 424 424 "text": "### Repo Created by <strong>romeo</strong>_montague\n\n**Source:** github | **Time:** 22:05:00\n\n**Parameters:**\n- repo: balcony-app\n- visibility: private\n" 425 425 }, ··· 435 435 "id": "20260305/default/090000_300/agents/audio.md:0", 436 436 "idx": 0, 437 437 "path": "20260305/default/090000_300/agents/audio.md", 438 - "score": -3.0, 438 + "score": -1.9, 439 439 "stream": "default", 440 440 "text": "# Audio Summary\n\nMorning standup at Montague Tech. Benvolio reported CI pipeline is green. <strong>Romeo</strong> mentioned wanting to explore ideas from the conference. 
Mercutio teased about <strong>Romeo</strong> meeting someone.\n" 441 441 }, ··· 451 451 "id": "facets/montague/entities/20260305.jsonl:0", 452 452 "idx": 0, 453 453 "path": "facets/montague/entities/20260305.jsonl", 454 - "score": -3.1, 454 + "score": -2.0, 455 455 "stream": null, 456 456 "text": "### Person: <strong>Romeo</strong> Montague\n\n\nStarted Balcony App prototype with Juliet\n\n" 457 457 }, ··· 467 467 "id": "facets/verona/entities/20260305.jsonl:0", 468 468 "idx": 0, 469 469 "path": "facets/verona/entities/20260305.jsonl", 470 - "score": -3.1, 470 + "score": -2.0, 471 471 "stream": null, 472 472 "text": "### Person: <strong>Romeo</strong> Montague\n\n\nSet up private repo for collaboration\n\n" 473 473 }, ··· 483 483 "id": "facets/montague/events/20260305.jsonl:0", 484 484 "idx": 0, 485 485 "path": "facets/montague/events/20260305.jsonl", 486 - "score": -3.2, 486 + "score": -2.0, 487 487 "stream": null, 488 488 "text": "### Meeting: Montague Tech Daily Standup\n\n\n**Time Occurred:** 09:00 - 09:30\n**Participants:** <strong>Romeo</strong> Montague, Benvolio Montague, Mercutio Escalus\n\nTeam standup\n\n<strong>Romeo</strong> mentioned conference ideas\n" 489 489 } ··· 508 508 "id": "facets/montague/entities/20260310.jsonl:0", 509 509 "idx": 0, 510 510 "path": "facets/montague/entities/20260310.jsonl", 511 - "score": -3.0, 511 + "score": -1.9, 512 512 "stream": null, 513 513 "text": "### Person: <strong>Romeo</strong> Montague\n\n\nNamed co-lead of Verona Platform joint venture\n\n" 514 514 }, ··· 524 524 "id": "facets/verona/entities/20260310.jsonl:0", 525 525 "idx": 0, 526 526 "path": "facets/verona/entities/20260310.jsonl", 527 - "score": -3.1, 527 + "score": -2.0, 528 528 "stream": null, 529 529 "text": "### Person: <strong>Romeo</strong> Montague\n\n\nNamed co-lead of approved joint venture\n\n" 530 530 }, ··· 540 540 "id": "facets/montague/calendar/20260310.jsonl:0", 541 541 "idx": 0, 542 542 "path": "facets/montague/calendar/20260310.jsonl", 543 
- "score": -2.4, 543 + "score": -1.5, 544 544 "stream": null, 545 545 "text": "### Event: Joint Board Meeting\n\n\n**Time Occurred:** 10:00 - 12:00\n**Participants:** <strong>Romeo</strong> Montague, Benvolio Montague\n\nQuarterly review with Verona Platform presentation\n" 546 546 }, ··· 556 556 "id": "facets/verona/calendar/20260310.jsonl:0", 557 557 "idx": 0, 558 558 "path": "facets/verona/calendar/20260310.jsonl", 559 - "score": -2.4, 559 + "score": -1.5, 560 560 "stream": null, 561 561 "text": "### Event: Board Presentation\n\n\n**Time Occurred:** 10:00 - 12:00\n**Participants:** <strong>Romeo</strong> Montague, Juliet Capulet, Friar Lawrence\n\nVerona Platform joint venture pitch\n" 562 562 }, ··· 572 572 "id": "20260310/agents/meetings.md:0", 573 573 "idx": 0, 574 574 "path": "20260310/agents/meetings.md", 575 - "score": -3.1, 575 + "score": -2.0, 576 576 "stream": null, 577 577 "text": "# Meetings\n\n- 08:30 Pre-Board Meeting Prep (<strong>Romeo</strong>, Juliet, Benvolio)\n" 578 578 } ··· 597 597 "id": "20260304/default/180000_300/agents/audio.md:0", 598 598 "idx": 0, 599 599 "path": "20260304/default/180000_300/agents/audio.md", 600 - "score": -3.0, 600 + "score": -1.9, 601 601 "stream": "default", 602 602 "text": "# Audio Summary\n\nEvening mixer at Denver Tech Summit. <strong>Romeo</strong> and Juliet had their first extended conversation about combining their API approaches. 
Mercutio tried to pull <strong>Romeo</strong> away to karaoke.\n" 603 603 }, ··· 613 613 "id": "facets/capulet/entities/20260304.jsonl:1", 614 614 "idx": 1, 615 615 "path": "facets/capulet/entities/20260304.jsonl", 616 - "score": -3.3, 616 + "score": -2.1, 617 617 "stream": null, 618 618 "text": "### Person: Tybalt Capulet\n\n\nConfronted <strong>Romeo</strong> at hackathon\n\n" 619 619 }, ··· 629 629 "id": "facets/montague/entities/20260304.jsonl:0", 630 630 "idx": 0, 631 631 "path": "facets/montague/entities/20260304.jsonl", 632 - "score": -3.1, 632 + "score": -2.0, 633 633 "stream": null, 634 634 "text": "### Person: <strong>Romeo</strong> Montague\n\n\nAttended Denver Tech Summit, met Juliet Capulet\n\n" 635 635 }, ··· 645 645 "id": "facets/capulet/events/20260304.jsonl:1", 646 646 "idx": 1, 647 647 "path": "facets/capulet/events/20260304.jsonl", 648 - "score": -3.3, 648 + "score": -2.1, 649 649 "stream": null, 650 650 "text": "### Social: Conference Mixer\n\n\n**Time Occurred:** 18:00 - 20:00\n**Participants:** Juliet Capulet, <strong>Romeo</strong> Montague\n\nNetworking event\n\nJuliet and <strong>Romeo</strong> exchanged Signal contacts\n" 651 651 }, ··· 661 661 "id": "facets/montague/events/20260304.jsonl:1", 662 662 "idx": 1, 663 663 "path": "facets/montague/events/20260304.jsonl", 664 - "score": -3.2, 664 + "score": -2.1, 665 665 "stream": null, 666 666 "text": "### Hackathon: Hackathon - API Bridge Challenge\n\n\n**Time Occurred:** 14:00 - 18:00\n**Participants:** <strong>Romeo</strong> Montague, Mercutio Escalus\n\nBuilt API bridge prototype\n\nTybalt confronted <strong>Romeo</strong>\n" 667 667 }
+5
tests/fixtures/journal/imports/20260101_090000/content_manifest.jsonl
··· 1 + {"id": "event-0", "title": "Weekly Engineering Standup", "date": "20260101", "type": "event", "preview": "- **When:** Thursday, January 1, 2026 09:00–09:30 MST", "meta": {}, "segments": [{"day": "20260101", "key": "090000_300"}]} 2 + {"id": "event-1", "title": "1:1 with David Park (Betaworks)", "date": "20260101", "type": "event", "preview": "- **When:** Thursday, January 1, 2026 10:00–10:45 MST", "meta": {}, "segments": [{"day": "20260101", "key": "090000_300"}]} 3 + {"id": "event-2", "title": "Product Review — Solstone v0.4 Roadmap", "date": "20260101", "type": "event", "preview": "- **When:** Thursday, January 1, 2026 11:00–12:00 MST", "meta": {}, "segments": [{"day": "20260101", "key": "090000_300"}]} 4 + {"id": "event-3", "title": "Lunch with Sarah Chen", "date": "20260101", "type": "event", "preview": "- **When:** Thursday, January 1, 2026 12:00–13:00 MST", "meta": {}, "segments": [{"day": "20260101", "key": "093000_300"}]} 5 + {"id": "event-4", "title": "Demo Prep — Betaworks Studio Day", "date": "20260101", "type": "event", "preview": "- **When:** Thursday, January 1, 2026 14:00–15:00 MST", "meta": {}, "segments": [{"day": "20260101", "key": "093000_300"}]}
+11
tests/fixtures/journal/imports/20260101_090000/import.json
··· 1 + { 2 + "original_filename": "calendar-export.zip", 3 + "upload_timestamp": 1767258000000, 4 + "upload_datetime": "2026-01-01T09:00:00", 5 + "user_timestamp": "20260101_090000", 6 + "file_size": 45678, 7 + "mime_type": "application/zip", 8 + "facet": "work", 9 + "setting": "calendar", 10 + "file_path": "imports/20260101_090000/calendar-export.zip" 11 + }
+22
tests/fixtures/journal/imports/20260101_090000/imported.json
··· 1 + { 2 + "processed_timestamp": "20260101_090000", 3 + "processing_completed": "2026-01-01T09:10:00", 4 + "total_files_created": 2, 5 + "all_created_files": [ 6 + "20260101/import.ics/090000_300/event_transcript.md", 7 + "20260101/import.ics/093000_300/event_transcript.md" 8 + ], 9 + "segments": [ 10 + "090000_300", 11 + "093000_300" 12 + ], 13 + "source_type": "ics", 14 + "source_display": "Calendar", 15 + "entries_written": 5, 16 + "entities_seeded": 0, 17 + "date_range": [ 18 + "20260101", 19 + "20260101" 20 + ], 21 + "target_day": "20260101" 22 + }
+3
tests/fixtures/journal/imports/20260101_100000/content_manifest.jsonl
··· 1 + {"id": "conv-0", "title": "Asyncio cancellation debugging", "date": "20260101", "type": "conversation", "preview": "I'm getting really weird behavior with asyncio task cancellation. When I cancel a parent task, sometimes the child coroutines keep running and I end up with dangling connections.", "meta": {"message_count": 6}, "segments": [{"day": "20260101", "key": "100000_300"}]} 2 + {"id": "conv-1", "title": "Postgres event sourcing trade-offs", "date": "20260101", "type": "conversation", "preview": "We're comparing Postgres-backed event sourcing with EventStoreDB for Solstone and trying to keep the stack minimal without painting ourselves into a corner.", "meta": {"message_count": 6}, "segments": [{"day": "20260101", "key": "100500_300"}]} 3 + {"id": "conv-2", "title": "Kubernetes migration options", "date": "20260101", "type": "conversation", "preview": "We're a small team and docker-compose in production is starting to crack. I'm trying to decide between Kubernetes, Nomad, and ECS for the next step.", "meta": {"message_count": 6}, "segments": [{"day": "20260101", "key": "101000_300"}]}
+11
tests/fixtures/journal/imports/20260101_100000/import.json
··· 1 + { 2 + "original_filename": "chatgpt-export.zip", 3 + "upload_timestamp": 1767261600000, 4 + "upload_datetime": "2026-01-01T10:00:00", 5 + "user_timestamp": "20260101_100000", 6 + "file_size": 123456, 7 + "mime_type": "application/zip", 8 + "facet": "work", 9 + "setting": "ai_conversation", 10 + "file_path": "imports/20260101_100000/chatgpt-export.zip" 11 + }
+24
tests/fixtures/journal/imports/20260101_100000/imported.json
··· 1 + { 2 + "processed_timestamp": "20260101_100000", 3 + "processing_completed": "2026-01-01T10:05:00", 4 + "total_files_created": 3, 5 + "all_created_files": [ 6 + "20260101/import.chatgpt/100000_300/conversation_transcript.jsonl", 7 + "20260101/import.chatgpt/100500_300/conversation_transcript.jsonl", 8 + "20260101/import.chatgpt/101000_300/conversation_transcript.jsonl" 9 + ], 10 + "segments": [ 11 + "100000_300", 12 + "100500_300", 13 + "101000_300" 14 + ], 15 + "source_type": "chatgpt", 16 + "source_display": "ChatGPT", 17 + "entries_written": 18, 18 + "entities_seeded": 0, 19 + "date_range": [ 20 + "20260101", 21 + "20260101" 22 + ], 23 + "target_day": "20260101" 24 + }
+130
tests/test_content_api.py
··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + import json 5 + from pathlib import Path 6 + 7 + import pytest 8 + 9 + from convey import create_app 10 + 11 + 12 + @pytest.fixture 13 + def content_client(): 14 + journal = Path(__file__).resolve().parent / "fixtures" / "journal" 15 + app = create_app(str(journal)) 16 + return app.test_client() 17 + 18 + 19 + def test_content_list_endpoint(content_client): 20 + response = content_client.get("/app/import/api/20260101_090000/content") 21 + 22 + assert response.status_code == 200 23 + data = response.get_json() 24 + assert data["total"] == 5 25 + assert data["source_type"] == "ics" 26 + assert data["source_display"] == "Calendar" 27 + assert data["months"] == {"202601": 5} 28 + 29 + 30 + def test_content_list_pagination(content_client): 31 + response = content_client.get( 32 + "/app/import/api/20260101_090000/content?page=2&per_page=2" 33 + ) 34 + 35 + assert response.status_code == 200 36 + data = response.get_json() 37 + assert data["page"] == 2 38 + assert data["per_page"] == 2 39 + assert data["pages"] == 3 40 + assert len(data["items"]) == 2 41 + 42 + 43 + def test_content_list_search_filter(content_client): 44 + response = content_client.get("/app/import/api/20260101_090000/content?q=betaworks") 45 + 46 + assert response.status_code == 200 47 + data = response.get_json() 48 + assert data["total"] == 2 49 + assert all( 50 + "betaworks" in (item["title"] + item["preview"]).lower() 51 + for item in data["items"] 52 + ) 53 + 54 + 55 + def test_content_list_month_filter(content_client): 56 + response = content_client.get( 57 + "/app/import/api/20260101_100000/content?month=202601" 58 + ) 59 + 60 + assert response.status_code == 200 61 + data = response.get_json() 62 + assert data["total"] == 3 63 + assert all(item["date"].startswith("202601") for item in data["items"]) 64 + 65 + 66 + def test_content_detail_endpoint(content_client): 67 + response = 
content_client.get("/app/import/api/20260101_090000/content/event-0") 68 + 69 + assert response.status_code == 200 70 + data = response.get_json() 71 + assert data["item"]["title"] == "Weekly Engineering Standup" 72 + assert data["content"][0]["type"] == "markdown" 73 + assert "Weekly Engineering Standup" in data["content"][0]["content"] 74 + 75 + 76 + def test_content_endpoint_404_for_missing_import(content_client): 77 + response = content_client.get("/app/import/api/20990101_000000/content") 78 + 79 + assert response.status_code == 404 80 + assert response.get_json()["error"] == "Import not found" 81 + 82 + 83 + def test_content_detail_404_for_missing_item(content_client): 84 + response = content_client.get( 85 + "/app/import/api/20260101_090000/content/missing-item" 86 + ) 87 + 88 + assert response.status_code == 404 89 + assert response.get_json()["error"] == "Item not found" 90 + 91 + 92 + def test_content_lazy_backfill(tmp_path): 93 + journal_root = tmp_path 94 + import_dir = journal_root / "imports" / "20260101_120000" 95 + seg_dir = journal_root / "20260101" / "import.chatgpt" / "120000_300" 96 + import_dir.mkdir(parents=True) 97 + seg_dir.mkdir(parents=True) 98 + 99 + (seg_dir / "conversation_transcript.jsonl").write_text( 100 + "\n".join( 101 + [ 102 + json.dumps({"topics": "planning"}), 103 + json.dumps({"speaker": "Human", "text": "hello"}), 104 + json.dumps({"speaker": "Assistant", "text": "hi"}), 105 + ] 106 + ) 107 + + "\n", 108 + encoding="utf-8", 109 + ) 110 + (import_dir / "imported.json").write_text( 111 + json.dumps( 112 + { 113 + "source_type": "chatgpt", 114 + "all_created_files": [ 115 + "20260101/import.chatgpt/120000_300/conversation_transcript.jsonl", 116 + ], 117 + } 118 + ), 119 + encoding="utf-8", 120 + ) 121 + 122 + app = create_app(str(journal_root)) 123 + client = app.test_client() 124 + 125 + response = client.get("/app/import/api/20260101_120000/content") 126 + 127 + assert response.status_code == 200 128 + data = 
response.get_json() 129 + assert data["total"] == 1 130 + assert (import_dir / "content_manifest.jsonl").exists()
+172
tests/test_content_manifest.py
··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + import datetime as dt 5 + import json 6 + import zipfile 7 + 8 + from think.importers.chatgpt import ChatGPTImporter 9 + from think.importers.ics import ICSImporter 10 + from think.importers.shared import map_items_to_segments, write_content_manifest 11 + from think.importers.utils import generate_content_manifest 12 + 13 + 14 + def test_write_content_manifest(tmp_path, monkeypatch): 15 + monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 16 + entries = [ 17 + { 18 + "id": "conv-0", 19 + "title": "Conversation 1", 20 + "date": "20260101", 21 + "type": "conversation", 22 + "preview": "hello", 23 + "segments": [{"day": "20260101", "key": "100000_300"}], 24 + } 25 + ] 26 + 27 + manifest_path = write_content_manifest("20260101_100000", entries) 28 + 29 + assert ( 30 + manifest_path 31 + == tmp_path / "imports" / "20260101_100000" / "content_manifest.jsonl" 32 + ) 33 + lines = manifest_path.read_text(encoding="utf-8").strip().splitlines() 34 + assert [json.loads(line) for line in lines] == entries 35 + 36 + 37 + def test_map_items_to_segments(): 38 + timestamps = [ 39 + dt.datetime(2026, 1, 1, 10, 0, 0).timestamp(), 40 + dt.datetime(2026, 1, 1, 10, 1, 0).timestamp(), 41 + dt.datetime(2026, 1, 1, 10, 10, 0).timestamp(), 42 + ] 43 + 44 + assert map_items_to_segments(timestamps, tz=None) == [ 45 + ("20260101", "100000_300"), 46 + ("20260101", "100000_300"), 47 + ("20260101", "101000_300"), 48 + ] 49 + 50 + 51 + def test_generate_content_manifest_from_segments(tmp_path): 52 + journal_root = tmp_path 53 + import_dir = journal_root / "imports" / "20260101_090000" 54 + segment_dir = journal_root / "20260101" / "import.ics" / "090000_300" 55 + segment_dir.mkdir(parents=True) 56 + import_dir.mkdir(parents=True) 57 + 58 + (segment_dir / "event_transcript.md").write_text( 59 + "## Event One\n\nBody one.\n\n## Event Two\n\nBody two.\n", 60 + encoding="utf-8", 61 + ) 62 + (import_dir / 
"imported.json").write_text( 63 + json.dumps( 64 + { 65 + "source_type": "ics", 66 + "all_created_files": [ 67 + "20260101/import.ics/090000_300/event_transcript.md", 68 + ], 69 + } 70 + ), 71 + encoding="utf-8", 72 + ) 73 + 74 + manifest_path = generate_content_manifest(journal_root, "20260101_090000") 75 + 76 + assert manifest_path == import_dir / "content_manifest.jsonl" 77 + entries = [ 78 + json.loads(line) 79 + for line in manifest_path.read_text(encoding="utf-8").splitlines() 80 + if line.strip() 81 + ] 82 + assert [entry["title"] for entry in entries] == ["Event One", "Event Two"] 83 + assert all(entry["type"] == "event" for entry in entries) 84 + assert all( 85 + entry["segments"] == [{"day": "20260101", "key": "090000_300"}] 86 + for entry in entries 87 + ) 88 + 89 + 90 + def test_chatgpt_importer_writes_content_manifest(tmp_path, monkeypatch): 91 + monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 92 + archive = tmp_path / "chatgpt.zip" 93 + conversations = [ 94 + { 95 + "title": "Async help", 96 + "create_time": dt.datetime(2026, 1, 1, 10, 0, 0).timestamp(), 97 + "current_node": "assistant-node", 98 + "mapping": { 99 + "user-node": { 100 + "parent": None, 101 + "message": { 102 + "author": {"role": "user"}, 103 + "content": {"parts": ["How do I debug asyncio cancellation?"]}, 104 + "create_time": dt.datetime(2026, 1, 1, 10, 0, 0).timestamp(), 105 + }, 106 + }, 107 + "assistant-node": { 108 + "parent": "user-node", 109 + "message": { 110 + "author": {"role": "assistant"}, 111 + "content": {"parts": ["Trace CancelledError propagation."]}, 112 + "create_time": dt.datetime(2026, 1, 1, 10, 1, 0).timestamp(), 113 + "metadata": {"model_slug": "gpt-4o"}, 114 + }, 115 + }, 116 + }, 117 + } 118 + ] 119 + with zipfile.ZipFile(archive, "w") as zf: 120 + zf.writestr("conversations.json", json.dumps(conversations)) 121 + 122 + result = ChatGPTImporter().process( 123 + archive, 124 + tmp_path, 125 + import_id="20260101_100000", 126 + ) 127 + 128 + assert 
result.entries_written == 2 129 + manifest_path = tmp_path / "imports" / "20260101_100000" / "content_manifest.jsonl" 130 + assert manifest_path.exists() 131 + entries = [ 132 + json.loads(line) 133 + for line in manifest_path.read_text(encoding="utf-8").splitlines() 134 + if line.strip() 135 + ] 136 + assert entries[0]["title"] == "Async help" 137 + assert entries[0]["segments"] == [{"day": "20260101", "key": "100000_300"}] 138 + 139 + 140 + def test_ics_importer_writes_content_manifest(tmp_path, monkeypatch): 141 + monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 142 + ics_path = tmp_path / "calendar.ics" 143 + ics_path.write_bytes( 144 + b"""BEGIN:VCALENDAR 145 + VERSION:2.0 146 + BEGIN:VEVENT 147 + DTSTART:20260101T170000Z 148 + DTEND:20260101T173000Z 149 + SUMMARY:Design Review 150 + DESCRIPTION:Review the roadmap. 151 + CREATED:20260101T120000Z 152 + END:VEVENT 153 + END:VCALENDAR""" 154 + ) 155 + 156 + result = ICSImporter().process( 157 + ics_path, 158 + tmp_path, 159 + import_id="20260101_090000", 160 + ) 161 + 162 + assert result.entries_written == 1 163 + manifest_path = tmp_path / "imports" / "20260101_090000" / "content_manifest.jsonl" 164 + assert manifest_path.exists() 165 + entries = [ 166 + json.loads(line) 167 + for line in manifest_path.read_text(encoding="utf-8").splitlines() 168 + if line.strip() 169 + ] 170 + assert entries[0]["title"] == "Design Review" 171 + assert entries[0]["type"] == "event" 172 + assert entries[0]["segments"] == [{"day": "20260101", "key": "120000_300"}]
+2
tests/test_importer.py
··· 946 946 Path(ics_file), 947 947 Path(tmp_path), 948 948 facet=None, 949 + import_id="20260303_123456", 949 950 progress_callback=ANY, 950 951 ) 951 952 mock_call = callosum.emit.call_args_list[0] ··· 991 992 Path(ics_file), 992 993 Path(tmp_path), 993 994 facet=None, 995 + import_id="20260303_120000", 994 996 progress_callback=ANY, 995 997 ) 996 998 mock_call = callosum.emit.call_args_list[0]
+2 -1
tests/verify_api.py
··· 454 454 return { 455 455 item_key: ( 456 456 0 457 - if item_key == "mtime" and isinstance(item_value, (int, float)) 457 + if item_key in {"mtime", "created_at"} 458 + and isinstance(item_value, (int, float)) 458 459 else ( 459 460 round(item_value, 1) 460 461 if item_key == "score" and isinstance(item_value, float)
+61 -3
think/importers/chatgpt.py
··· 11 11 from typing import Any, Callable 12 12 13 13 from think.importers.file_importer import ImportPreview, ImportResult 14 - from think.importers.shared import _window_messages, write_segment 14 + from think.importers.shared import ( 15 + _window_messages, 16 + map_items_to_segments, 17 + write_content_manifest, 18 + write_segment, 19 + ) 15 20 from think.utils import day_path 16 21 17 22 logger = logging.getLogger(__name__) ··· 59 64 model_counts: dict[str, int] = {} 60 65 skipped = 0 61 66 62 - for conv in conversations: 67 + for conv_idx, conv in enumerate(conversations): 63 68 mapping = conv.get("mapping", {}) 64 69 if not mapping: 65 70 skipped += 1 ··· 98 103 "speaker": "Human" if role == "user" else "Assistant", 99 104 "text": text, 100 105 "model_slug": model_slug, 106 + "conv_id": conv_idx, 107 + "conv_title": conv.get("title", ""), 108 + "conv_create_time": conv.get("create_time"), 101 109 } 102 110 ) 103 111 conv_has_content = True ··· 191 199 journal_root: Path, 192 200 *, 193 201 facet: str | None = None, 202 + import_id: str | None = None, 194 203 progress_callback: Callable | None = None, 195 204 ) -> ImportResult: 196 205 conversations = _open_conversations(path) 197 - import_id = dt.datetime.now().strftime("%Y%m%d_%H%M%S") 206 + import_id = import_id or dt.datetime.now().strftime("%Y%m%d_%H%M%S") 198 207 messages, model_counts, skipped = _extract_messages(conversations) 199 208 if not messages: 200 209 return ImportResult( ··· 206 215 ) 207 216 208 217 messages.sort(key=lambda m: m["create_time"]) 218 + conv_meta: dict[int, dict[str, Any]] = {} 219 + for msg in messages: 220 + conv_id = msg.get("conv_id") 221 + if conv_id is None: 222 + continue 223 + meta = conv_meta.setdefault( 224 + conv_id, 225 + { 226 + "title": msg.get("conv_title", ""), 227 + "first_ts": msg["create_time"], 228 + "preview": "", 229 + "message_count": 0, 230 + }, 231 + ) 232 + meta["message_count"] += 1 233 + if not meta["preview"] and msg["speaker"] == "Human": 234 + 
meta["preview"] = msg["text"][:200] 209 235 earliest = dt.datetime.fromtimestamp( 210 236 messages[0]["create_time"], tz=dt.timezone.utc 211 237 ).strftime("%Y%m%d") ··· 249 275 250 276 if skipped: 251 277 logger.info("Skipped %d conversations with no content", skipped) 278 + 279 + conv_segments: dict[int, set[tuple[str, str]]] = {} 280 + msg_segments = map_items_to_segments( 281 + [msg["create_time"] for msg in messages], 282 + tz=None, 283 + ) 284 + for msg, segment in zip(messages, msg_segments, strict=False): 285 + conv_id = msg.get("conv_id") 286 + if conv_id is None: 287 + continue 288 + conv_segments.setdefault(conv_id, set()).add(segment) 289 + 290 + manifest_entries: list[dict[str, Any]] = [] 291 + for conv_id, meta in sorted( 292 + conv_meta.items(), key=lambda item: item[1]["first_ts"] 293 + ): 294 + first_dt = dt.datetime.fromtimestamp(meta["first_ts"], tz=dt.timezone.utc) 295 + manifest_entries.append( 296 + { 297 + "id": f"conv-{conv_id}", 298 + "title": meta["title"] or f"Conversation {conv_id + 1}", 299 + "date": first_dt.strftime("%Y%m%d"), 300 + "type": "conversation", 301 + "preview": meta["preview"], 302 + "meta": {"message_count": meta["message_count"]}, 303 + "segments": [ 304 + {"day": day, "key": key} 305 + for day, key in sorted(conv_segments.get(conv_id, set())) 306 + ], 307 + } 308 + ) 309 + write_content_manifest(import_id, manifest_entries) 252 310 253 311 days = sorted({day for day, _ in segments}) 254 312 model_info = ""
+60 -3
think/importers/claude_chat.py
··· 11 11 from typing import Any, Callable 12 12 13 13 from think.importers.file_importer import ImportPreview, ImportResult 14 - from think.importers.shared import _window_messages, write_segment 14 + from think.importers.shared import ( 15 + _window_messages, 16 + map_items_to_segments, 17 + write_content_manifest, 18 + write_segment, 19 + ) 15 20 from think.utils import day_path 16 21 17 22 logger = logging.getLogger(__name__) ··· 35 40 messages: list[dict[str, Any]] = [] 36 41 skipped = 0 37 42 38 - for conv in conversations: 43 + for conv_idx, conv in enumerate(conversations): 39 44 chat_messages = conv.get("chat_messages", []) 40 45 if not chat_messages: 41 46 skipped += 1 ··· 74 79 "speaker": "Human" if sender == "human" else "Assistant", 75 80 "text": text, 76 81 "model_slug": None, 82 + "conv_id": conv_idx, 83 + "conv_title": conv.get("name", ""), 77 84 } 78 85 ) 79 86 conv_has_content = True ··· 154 161 journal_root: Path, 155 162 *, 156 163 facet: str | None = None, 164 + import_id: str | None = None, 157 165 progress_callback: Callable | None = None, 158 166 ) -> ImportResult: 159 167 conversations = _open_conversations(path) 160 - import_id = dt.datetime.now().strftime("%Y%m%d_%H%M%S") 168 + import_id = import_id or dt.datetime.now().strftime("%Y%m%d_%H%M%S") 161 169 messages, skipped = _extract_messages(conversations) 162 170 if not messages: 163 171 return ImportResult( ··· 169 177 ) 170 178 171 179 messages.sort(key=lambda m: m["create_time"]) 180 + conv_meta: dict[int, dict[str, Any]] = {} 181 + for msg in messages: 182 + conv_id = msg.get("conv_id") 183 + if conv_id is None: 184 + continue 185 + meta = conv_meta.setdefault( 186 + conv_id, 187 + { 188 + "title": msg.get("conv_title", ""), 189 + "first_ts": msg["create_time"], 190 + "preview": "", 191 + "message_count": 0, 192 + }, 193 + ) 194 + meta["message_count"] += 1 195 + if not meta["preview"] and msg["speaker"] == "Human": 196 + meta["preview"] = msg["text"][:200] 172 197 earliest = 
dt.datetime.fromtimestamp( 173 198 messages[0]["create_time"], tz=dt.timezone.utc 174 199 ).strftime("%Y%m%d") ··· 212 237 213 238 if skipped: 214 239 logger.info("Skipped %d conversations with no content", skipped) 240 + 241 + conv_segments: dict[int, set[tuple[str, str]]] = {} 242 + msg_segments = map_items_to_segments( 243 + [msg["create_time"] for msg in messages], 244 + tz=None, 245 + ) 246 + for msg, segment in zip(messages, msg_segments, strict=False): 247 + conv_id = msg.get("conv_id") 248 + if conv_id is None: 249 + continue 250 + conv_segments.setdefault(conv_id, set()).add(segment) 251 + 252 + manifest_entries: list[dict[str, Any]] = [] 253 + for conv_id, meta in sorted( 254 + conv_meta.items(), key=lambda item: item[1]["first_ts"] 255 + ): 256 + first_dt = dt.datetime.fromtimestamp(meta["first_ts"], tz=dt.timezone.utc) 257 + manifest_entries.append( 258 + { 259 + "id": f"conv-{conv_id}", 260 + "title": meta["title"] or f"Conversation {conv_id + 1}", 261 + "date": first_dt.strftime("%Y%m%d"), 262 + "type": "conversation", 263 + "preview": meta["preview"], 264 + "meta": {"message_count": meta["message_count"]}, 265 + "segments": [ 266 + {"day": day, "key": key} 267 + for day, key in sorted(conv_segments.get(conv_id, set())) 268 + ], 269 + } 270 + ) 271 + write_content_manifest(import_id, manifest_entries) 215 272 216 273 days = sorted({day for day, _ in segments}) 217 274
+1
think/importers/cli.py
··· 641 641 Path(args.media), 642 642 journal_root, 643 643 facet=args.facet, 644 + import_id=_import_id, 644 645 progress_callback=_progress_callback, 645 646 ) 646 647
+1
think/importers/file_importer.py
··· 51 51 journal_root: Path, 52 52 *, 53 53 facet: str | None = None, 54 + import_id: str | None = None, 54 55 progress_callback: Callable | None = None, 55 56 ) -> ImportResult: ... 56 57
+57 -2
think/importers/gemini.py
··· 23 23 from typing import Any, Callable 24 24 25 25 from think.importers.file_importer import ImportPreview, ImportResult 26 - from think.importers.shared import _window_messages, write_segment 26 + from think.importers.shared import ( 27 + _window_messages, 28 + map_items_to_segments, 29 + write_content_manifest, 30 + write_segment, 31 + ) 27 32 from think.utils import day_path 28 33 29 34 logger = logging.getLogger(__name__) ··· 220 225 journal_root: Path, 221 226 *, 222 227 facet: str | None = None, 228 + import_id: str | None = None, 223 229 progress_callback: Callable | None = None, 224 230 ) -> ImportResult: 225 231 activities = _load_activities(path) 226 - import_id = dt.datetime.now().strftime("%Y%m%d_%H%M%S") 232 + import_id = import_id or dt.datetime.now().strftime("%Y%m%d_%H%M%S") 227 233 228 234 messages: list[dict[str, Any]] = [] 229 235 errors: list[str] = [] 230 236 skipped = 0 231 237 bard_count = 0 232 238 valid_count = 0 239 + valid_idx = 0 233 240 earliest_so_far: str | None = None 234 241 latest_so_far: str | None = None 242 + activity_meta: dict[int, dict[str, Any]] = {} 235 243 236 244 for i, act in enumerate(activities): 237 245 activity_messages = _parse_activity(act) ··· 239 247 skipped += 1 240 248 continue 241 249 valid_count += 1 250 + 251 + prompt = "" 252 + for activity_message in activity_messages: 253 + activity_message["activity_idx"] = valid_idx 254 + if not prompt and activity_message["speaker"] == "Human": 255 + prompt = activity_message["text"][:200] 256 + activity_meta[valid_idx] = { 257 + "title": prompt[:80] or f"Activity {valid_idx + 1}", 258 + "first_ts": activity_messages[0]["create_time"], 259 + "preview": prompt, 260 + } 261 + valid_idx += 1 242 262 243 263 products = [p.lower() for p in act.get("products", [])] 244 264 header = str(act.get("header", "")).lower() ··· 306 326 except Exception as exc: 307 327 errors.append(f"Failed to write segment {day}/{seg_key}: {exc}") 308 328 logger.warning("Failed to write segment 
%s/%s: %s", day, seg_key, exc) 329 + 330 + activity_segments: dict[int, set[tuple[str, str]]] = {} 331 + msg_segments = map_items_to_segments( 332 + [msg["create_time"] for msg in messages], 333 + tz=None, 334 + ) 335 + for msg, segment in zip(messages, msg_segments, strict=False): 336 + activity_idx = msg.get("activity_idx") 337 + if activity_idx is None: 338 + continue 339 + activity_segments.setdefault(activity_idx, set()).add(segment) 340 + 341 + manifest_entries: list[dict[str, Any]] = [] 342 + for activity_idx, meta in sorted( 343 + activity_meta.items(), 344 + key=lambda item: item[1]["first_ts"], 345 + ): 346 + first_dt = dt.datetime.fromtimestamp(meta["first_ts"], tz=dt.timezone.utc) 347 + manifest_entries.append( 348 + { 349 + "id": f"activity-{activity_idx}", 350 + "title": meta["title"], 351 + "date": first_dt.strftime("%Y%m%d"), 352 + "type": "conversation", 353 + "preview": meta["preview"], 354 + "meta": {}, 355 + "segments": [ 356 + {"day": day, "key": key} 357 + for day, key in sorted( 358 + activity_segments.get(activity_idx, set()) 359 + ) 360 + ], 361 + } 362 + ) 363 + write_content_manifest(import_id, manifest_entries) 309 364 310 365 segment_days = {day for day, _ in segments} 311 366
+43 -1
think/importers/ics.py
··· 10 10 from typing import Any, Callable 11 11 12 12 from think.importers.file_importer import ImportPreview, ImportResult 13 - from think.importers.shared import seed_entities, window_items, write_markdown_segments 13 + from think.importers.shared import ( 14 + map_items_to_segments, 15 + seed_entities, 16 + window_items, 17 + write_content_manifest, 18 + write_markdown_segments, 19 + ) 14 20 15 21 logger = logging.getLogger(__name__) 16 22 ··· 417 423 journal_root: Path, 418 424 *, 419 425 facet: str | None = None, 426 + import_id: str | None = None, 420 427 progress_callback: Callable | None = None, 421 428 ) -> ImportResult: 422 429 ics_blobs = _extract_ics_data(path) 430 + import_id = import_id or dt.datetime.now().strftime("%Y%m%d_%H%M%S") 423 431 424 432 all_entries: list[dict[str, Any]] = [] 425 433 errors: list[str] = [] ··· 463 471 ) 464 472 465 473 all_entries.sort(key=lambda entry: entry["create_ts"]) 474 + manifest_entries: list[dict[str, Any]] = [] 475 + for i, entry in enumerate(all_entries): 476 + create_dt = dt.datetime.fromtimestamp( 477 + entry["create_ts"], tz=dt.timezone.utc 478 + ) 479 + meta: dict[str, Any] = {} 480 + if entry.get("location"): 481 + meta["location"] = entry["location"] 482 + if entry.get("duration_minutes") is not None: 483 + meta["duration_minutes"] = entry["duration_minutes"] 484 + if entry.get("attendees"): 485 + meta["attendee_count"] = len(entry["attendees"]) 486 + manifest_entries.append( 487 + { 488 + "id": f"event-{i}", 489 + "title": entry.get("title", "Untitled event"), 490 + "date": create_dt.strftime("%Y%m%d"), 491 + "type": "event", 492 + "preview": entry.get("content", "")[:200], 493 + "meta": meta, 494 + "segments": [], 495 + } 496 + ) 466 497 earliest = dt.datetime.fromtimestamp( 467 498 all_entries[0]["create_ts"], tz=dt.timezone.utc 468 499 ).strftime("%Y%m%d") ··· 477 508 lambda items: "\n\n".join(_render_event_markdown(e) for e in items), 478 509 filename="event_transcript.md", 479 510 ) 511 + 
item_segments = map_items_to_segments( 512 + [entry["create_ts"] for entry in all_entries], 513 + tz=dt.timezone.utc, 514 + ) 515 + for manifest_entry, (day, key) in zip( 516 + manifest_entries, 517 + item_segments, 518 + strict=False, 519 + ): 520 + manifest_entry["segments"] = [{"day": day, "key": key}] 521 + write_content_manifest(import_id, manifest_entries) 480 522 481 523 segment_days = {day for day, _ in segments} 482 524
+42
think/importers/kindle.py
··· 11 11 12 12 from think.importers.file_importer import ImportPreview, ImportResult 13 13 from think.importers.shared import ( 14 + map_items_to_segments, 14 15 seed_entities, 15 16 window_items, 17 + write_content_manifest, 16 18 write_markdown_segments, 17 19 ) 18 20 ··· 252 254 journal_root: Path, 253 255 *, 254 256 facet: str | None = None, 257 + import_id: str | None = None, 255 258 progress_callback: Callable | None = None, 256 259 ) -> ImportResult: 257 260 text = path.read_text(encoding="utf-8-sig") 258 261 blocks = text.split(DELIMITER) 262 + import_id = import_id or dt.datetime.now().strftime("%Y%m%d_%H%M%S") 259 263 260 264 entries: list[dict] = [] 261 265 errors: list[str] = [] ··· 310 314 if earliest_so_far and latest_so_far 311 315 else None 312 316 ) 317 + books_map: dict[str, list[int]] = {} 318 + for i, entry in enumerate(entries): 319 + books_map.setdefault(entry["book_title"], []).append(i) 313 320 314 321 windows = window_items(entries, "create_ts", tz=None) 315 322 created_files, segments = write_markdown_segments( ··· 318 325 _render_highlight_markdown, 319 326 filename="highlights_transcript.md", 320 327 ) 328 + item_segments = map_items_to_segments( 329 + [entry["create_ts"] for entry in entries], 330 + tz=None, 331 + ) 332 + entry_segment_map = {idx: segment for idx, segment in enumerate(item_segments)} 333 + manifest_entries: list[dict] = [] 334 + for book_idx, (book_title, indices) in enumerate(sorted(books_map.items())): 335 + book_entries = [entries[i] for i in indices] 336 + author = book_entries[0].get("author", "") 337 + first_ts = min(entry["create_ts"] for entry in book_entries) 338 + first_dt = dt.datetime.fromtimestamp(first_ts) 339 + highlight_count = sum( 340 + 1 for entry in book_entries if entry.get("clip_type") != "note" 341 + ) 342 + note_count = sum( 343 + 1 for entry in book_entries if entry.get("clip_type") == "note" 344 + ) 345 + meta = {"author": author, "highlight_count": highlight_count} 346 + if note_count: 347 + 
meta["note_count"] = note_count 348 + segment_set = {entry_segment_map[i] for i in indices} 349 + manifest_entries.append( 350 + { 351 + "id": f"book-{book_idx}", 352 + "title": book_title + (f" by {author}" if author else ""), 353 + "date": first_dt.strftime("%Y%m%d"), 354 + "type": "highlight_group", 355 + "preview": book_entries[0].get("content", "")[:200], 356 + "meta": meta, 357 + "segments": [ 358 + {"day": day, "key": key} for day, key in sorted(segment_set) 359 + ], 360 + } 361 + ) 362 + write_content_manifest(import_id, manifest_entries) 321 363 322 364 segment_days = {day for day, _ in segments} 323 365
+39 -1
think/importers/obsidian.py
··· 11 11 from typing import Any, Callable 12 12 13 13 from think.importers.file_importer import ImportPreview, ImportResult 14 - from think.importers.shared import seed_entities, window_items, write_markdown_segments 14 + from think.importers.shared import ( 15 + map_items_to_segments, 16 + seed_entities, 17 + window_items, 18 + write_content_manifest, 19 + write_markdown_segments, 20 + ) 15 21 16 22 logger = logging.getLogger(__name__) 17 23 ··· 219 225 journal_root: Path, 220 226 *, 221 227 facet: str | None = None, 228 + import_id: str | None = None, 222 229 progress_callback: Callable | None = None, 223 230 ) -> ImportResult: 224 231 md_files = list(self._walk_md_files(path)) 225 232 total = len(md_files) 233 + import_id = import_id or dt.datetime.now().strftime("%Y%m%d_%H%M%S") 226 234 227 235 notes: list[dict[str, Any]] = [] 228 236 all_wikilinks: set[str] = set() ··· 300 308 earliest = latest = dt.datetime.now().strftime("%Y%m%d") 301 309 302 310 notes.sort(key=lambda n: n["mtime"]) 311 + note_manifest: list[dict[str, Any]] = [] 312 + for i, note in enumerate(notes): 313 + note_dt = dt.datetime.fromtimestamp(note["mtime"]) 314 + meta: dict[str, Any] = {} 315 + if note.get("tags"): 316 + meta["tags"] = note["tags"] 317 + if note.get("is_daily"): 318 + meta["is_daily"] = True 319 + note_manifest.append( 320 + { 321 + "id": f"note-{i}", 322 + "title": note["title"], 323 + "date": note_dt.strftime("%Y%m%d"), 324 + "type": "note", 325 + "preview": _strip_frontmatter(note.get("content", "")).strip()[ 326 + :200 327 + ], 328 + "meta": meta, 329 + "segments": [], 330 + } 331 + ) 303 332 304 333 windows = window_items(notes, "mtime", tz=None) 305 334 created_files, segments = write_markdown_segments( ··· 308 337 lambda items: "\n\n".join(_render_note_markdown(n) for n in items), 309 338 filename="note_transcript.md", 310 339 ) 340 + note_segments = map_items_to_segments( 341 + [note["mtime"] for note in notes], 342 + tz=None, 343 + ) 344 + for manifest_entry, (day, 
key) in zip( 345 + note_manifest, note_segments, strict=False 346 + ): 347 + manifest_entry["segments"] = [{"day": day, "key": key}] 348 + write_content_manifest(import_id, note_manifest) 311 349 312 350 # Seed entities from wikilinks 313 351 entities_seeded = 0
+47
think/importers/shared.py
··· 604 604 return "|".join(parts) 605 605 606 606 607 + def write_content_manifest( 608 + import_id: str, 609 + entries: list[dict[str, Any]], 610 + ) -> Path: 611 + """Write content_manifest.jsonl for an import.""" 612 + journal_root = Path(get_journal()) 613 + manifest_dir = journal_root / "imports" / import_id 614 + manifest_dir.mkdir(parents=True, exist_ok=True) 615 + manifest_path = manifest_dir / "content_manifest.jsonl" 616 + 617 + with open(manifest_path, "w", encoding="utf-8") as f: 618 + for entry in entries: 619 + f.write(json.dumps(entry) + "\n") 620 + 621 + return manifest_path 622 + 623 + 624 + def map_items_to_segments( 625 + timestamps: list[float], 626 + *, 627 + window_duration: int = 300, 628 + tz: dt.timezone | None = dt.timezone.utc, 629 + ) -> list[tuple[str, str]]: 630 + """Map sorted timestamps to the segments produced by windowing helpers.""" 631 + result: list[tuple[str, str]] = [] 632 + window_start: float | None = None 633 + window_day: str | None = None 634 + 635 + for ts in timestamps: 636 + ts_dt = dt.datetime.fromtimestamp(ts, tz=tz) 637 + ts_day = ts_dt.strftime("%Y%m%d") 638 + 639 + if ( 640 + window_start is None 641 + or ts_day != window_day 642 + or ts - window_start >= window_duration 643 + ): 644 + window_start = ts 645 + window_day = ts_day 646 + 647 + start_dt = dt.datetime.fromtimestamp(window_start, tz=tz) 648 + seg_key = f"{start_dt.strftime('%H%M%S')}_{window_duration}" 649 + result.append((window_day, seg_key)) 650 + 651 + return result 652 + 653 + 607 654 def seed_entities( 608 655 facet: str, 609 656 day: str,
+114
think/importers/utils.py
··· 10 10 from __future__ import annotations 11 11 12 12 import json 13 + import re 13 14 from pathlib import Path 14 15 15 16 # ============================================================================ ··· 421 422 pass 422 423 423 424 return result 425 + 426 + 427 + def _backfill_item_type(source_type: str) -> str: 428 + """Map source_type to manifest item type for backfill.""" 429 + return { 430 + "ics": "event", 431 + "kindle": "highlight_group", 432 + "obsidian": "note", 433 + }.get(source_type, "conversation") 434 + 435 + 436 + def generate_content_manifest(journal_root: Path, timestamp: str) -> Path | None: 437 + """Generate content_manifest.jsonl by backfilling from segment files.""" 438 + import_dir = journal_root / "imports" / timestamp 439 + imported_path = import_dir / "imported.json" 440 + if not imported_path.exists(): 441 + return None 442 + 443 + imported = json.loads(imported_path.read_text(encoding="utf-8")) 444 + source_type = imported.get("source_type", "") 445 + all_files = imported.get("all_created_files", []) 446 + 447 + entries: list[dict] = [] 448 + entry_idx = 0 449 + 450 + for file_path_str in all_files: 451 + file_path = Path(file_path_str) 452 + if not file_path.exists(): 453 + file_path = journal_root / file_path_str 454 + if not file_path.exists(): 455 + continue 456 + 457 + parts = file_path.parts 458 + try: 459 + seg_key = parts[-2] 460 + day = parts[-4] if len(parts) >= 4 else "" 461 + if not (len(day) == 8 and day.isdigit()): 462 + day = parts[-3] if len(parts) >= 3 else "" 463 + if not (len(day) == 8 and day.isdigit()): 464 + day = "" 465 + except (IndexError, ValueError): 466 + seg_key = "" 467 + day = "" 468 + 469 + segment = {"day": day, "key": seg_key} if day and seg_key else {} 470 + 471 + if file_path.suffix == ".jsonl": 472 + try: 473 + lines = file_path.read_text(encoding="utf-8").strip().split("\n") 474 + if len(lines) < 2: 475 + continue 476 + header = json.loads(lines[0]) 477 + topic = header.get("topics", "") 478 + 
messages = [] 479 + for line in lines[1:]: 480 + try: 481 + messages.append(json.loads(line)) 482 + except json.JSONDecodeError: 483 + continue 484 + if not messages: 485 + continue 486 + preview = "" 487 + for message in messages: 488 + if message.get("speaker") == "Human": 489 + preview = message.get("text", "")[:200] 490 + break 491 + entries.append( 492 + { 493 + "id": f"seg-{entry_idx}", 494 + "title": topic or preview[:80] or "Conversation segment", 495 + "date": day, 496 + "type": "conversation", 497 + "preview": preview, 498 + "meta": {"message_count": len(messages)}, 499 + "segments": [segment] if segment else [], 500 + } 501 + ) 502 + entry_idx += 1 503 + except (OSError, json.JSONDecodeError): 504 + continue 505 + elif file_path.suffix == ".md": 506 + try: 507 + content = file_path.read_text(encoding="utf-8") 508 + except OSError: 509 + continue 510 + sections = re.split(r"(?m)^## ", content) 511 + for section in sections: 512 + section = section.strip() 513 + if not section: 514 + continue 515 + title_line = section.split("\n", 1)[0].strip() 516 + body = section.split("\n", 1)[1].strip() if "\n" in section else "" 517 + entries.append( 518 + { 519 + "id": f"item-{entry_idx}", 520 + "title": title_line, 521 + "date": day, 522 + "type": _backfill_item_type(source_type), 523 + "preview": body[:200], 524 + "meta": {}, 525 + "segments": [segment] if segment else [], 526 + } 527 + ) 528 + entry_idx += 1 529 + 530 + if not entries: 531 + return None 532 + 533 + manifest_path = import_dir / "content_manifest.jsonl" 534 + with open(manifest_path, "w", encoding="utf-8") as f: 535 + for entry in entries: 536 + f.write(json.dumps(entry) + "\n") 537 + return manifest_path 424 538 425 539 426 540 # ============================================================================