# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Formatters framework for JSONL and Markdown files.

This module provides a registry-based system for converting structured files
to markdown chunks. Each formatter is a plain function that lives near its
source domain code.

Supported file types:
  - JSONL (.jsonl): Parsed as JSON lines, passed as list[dict] to formatter
  - JSON (.json): Parsed as a single object, passed as a one-element list[dict]
  - Markdown (.md): Read as text, passed as str to formatter

Output contract: All formatters return tuple[list[dict], dict] where:
  - list[dict]: Chunks, each with:
      - markdown: str (formatted markdown for this chunk)
      - timestamp: int (optional - unix timestamp in milliseconds for ordering)
      - source: dict (optional - original entry from JSONL for enriched streams)
  - dict: Metadata about the formatting with optional keys:
      - header: str - Optional header markdown (metadata summary, context, etc.)
      - error: str - Optional error/warning message (e.g., skipped entries)
      - indexer: dict - Indexing metadata with keys:
          - agent: str - Content type (e.g., "event", "audio", "screen")
            JSONL formatters must provide agent. Markdown agent is path-derived.
            Day and facet are extracted from the path by extract_path_metadata().

JSONL formatters receive list[dict] entries and are responsible for:
  - Extracting metadata from entries (typically the first line)
  - Building a header from that metadata if applicable
  - Formatting content entries into chunks
  - Providing indexer.agent in the meta dict

Markdown formatters receive str text and perform semantic chunking.
"""

import fnmatch
import json
import os
from importlib import import_module
from pathlib import Path
from typing import Any, Callable

from think.utils import CHRONICLE_DIR, DATE_RE, get_journal, journal_relative_path


def extract_path_metadata(rel_path: str) -> dict[str, str]:
    """Extract indexing metadata from a journal-relative path.

    Extracts day and facet from path structure. For markdown files, also
    derives agent from path. For JSONL files, agent should be provided
    by the formatter via meta["indexer"]["agent"].

    Args:
        rel_path: Journal-relative path (e.g., "20240101/talents/flow.md")

    Returns:
        Dict with keys: day, facet, agent
        - day: YYYYMMDD string or empty
        - facet: Facet name or empty
        - agent: Derived agent for .md files, empty for .jsonl
    """
    parts = rel_path.replace("\\", "/").split("/")
    filename = parts[-1]
    basename = os.path.splitext(filename)[0]
    is_markdown = filename.endswith(".md")

    day = ""
    facet = ""
    agent = ""

    # Extract day from YYYYMMDD directory prefix
    if parts[0] and DATE_RE.fullmatch(parts[0]):
        day = parts[0]

    # Extract facet from talents/{facet}/... paths
    try:
        talents_idx = parts.index("talents")
        if talents_idx + 2 < len(parts):
            facet = parts[talents_idx + 1]
    except ValueError:
        pass

    # Extract facet from facets/{facet}/... paths
    if parts[0] == "facets" and len(parts) >= 3:
        facet = parts[1]
        # Day from YYYYMMDD filename (events/entities/todos/news)
        if len(parts) >= 4 and DATE_RE.fullmatch(basename):
            day = basename
        # Day from activities/{YYYYMMDD}/{activity_id}/... directory structure
        elif (
            len(parts) >= 5 and parts[2] == "activities" and DATE_RE.fullmatch(parts[3])
        ):
            day = parts[3]

    # Extract day from imports/YYYYMMDD_HHMMSS/...
    if parts[0] == "imports" and len(parts) >= 2:
        import_id = parts[1]
        day = import_id.split("_")[0] if "_" in import_id else import_id[:8]

    # Extract day from config/actions/YYYYMMDD.jsonl (journal-level logs)
    if parts[0] == "config" and len(parts) >= 3 and parts[1] == "actions":
        if DATE_RE.fullmatch(basename):
            day = basename

    # Derive agent for markdown files only
    if is_markdown:
        if parts[0] == "facets" and len(parts) >= 4 and parts[2] == "news":
            agent = "news"
        elif parts[0] == "imports":
            agent = "import"
        elif parts[0] == "apps" and len(parts) >= 4:
            agent = f"{parts[1]}:{basename}"
        else:
            # Daily agent outputs, segment markdown: use basename
            agent = basename

    return {"day": day, "facet": facet, "agent": agent}
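

# Illustrative examples of the extraction rules above. The paths are
# assumptions chosen to exercise each branch; the results follow the code:
#
#   extract_path_metadata("20240101/talents/flow.md")
#       -> {"day": "20240101", "facet": "", "agent": "flow"}
#   extract_path_metadata("facets/work/events/20240102.jsonl")
#       -> {"day": "20240102", "facet": "work", "agent": ""}
#   extract_path_metadata("imports/20240103_120000/summary.md")
#       -> {"day": "20240103", "facet": "", "agent": "import"}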


# Registry mapping glob patterns to (module_path, function_name, indexed).
# Patterns are matched against journal-relative paths and must be specific
# enough to use as Path.glob() arguments from the journal root. The indexed
# flag controls whether find_formattable_files() collects matching files for
# the search index. Adding a new journal content location requires a new
# entry here — see talent/journal/references/storage.md "Search Index" for details.
#
# Order matters: first match wins, so place specific patterns before general ones.
FORMATTERS: dict[str, tuple[str, str, bool]] = {
    # JSONL formatters (indexed)
    "config/actions/*.jsonl": ("think.facets", "format_logs", True),
    "facets/*/entities/*/observations.jsonl": (
        "think.entities.formatting",
        "format_observations",
        True,
    ),
    "facets/*/entities/*.jsonl": ("think.entities.formatting", "format_entities", True),
    "entities/*/entity.json": (
        "think.entities.formatting",
        "format_entity_identity",
        False,  # Indexed via _index_entity_search_chunks (enriched with relationship data)
    ),
    "facets/*/events/*.jsonl": ("think.event_formatter", "format_events", True),
    "facets/*/activities/*.jsonl": ("think.activities", "format_activities", True),
    "facets/*/todos/*.jsonl": ("apps.todos.todo", "format_todos", True),
    "facets/*/logs/*.jsonl": ("think.facets", "format_logs", True),
    # Structured file imports (indexed)
    "*/import.*/imported.jsonl": (
        "think.importers.formatting",
        "format_imported",
        True,
    ),
    # Markdown transcript imports (new convention + legacy)
    "*/import.*/*/*_transcript.md": ("think.markdown", "format_markdown", True),
    "*/import.*/*/imported.md": ("think.markdown", "format_markdown", True),
    # AI chat imports — dedicated formatter (new + legacy filenames)
    "*/import.chatgpt/*/conversation_transcript.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    "*/import.claude/*/conversation_transcript.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    "*/import.gemini/*/conversation_transcript.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    # Legacy AI chat import filenames (backward compat)
    "*/import.chatgpt/*/imported_audio.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    "*/import.claude/*/imported_audio.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    "*/import.gemini/*/imported_audio.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    # Raw transcripts — formattable but not indexed (agent outputs are more useful)
    # Layout: day/stream/segment/audio.jsonl
    "*/*/*/audio.jsonl": ("observe.hear", "format_audio", False),
    "*/*/*/*_audio.jsonl": ("observe.hear", "format_audio", False),
    "*/*/*/*_transcript.jsonl": ("observe.hear", "format_audio", False),
    "*/*/*/screen.jsonl": ("observe.screen", "format_screen", False),
    "*/*/*/*_screen.jsonl": ("observe.screen", "format_screen", False),
    # Markdown — day-level agent outputs and segment-level markdown
    "*/talents/*.md": ("think.markdown", "format_markdown", True),
    # Layout: day/stream/segment/talents/*.md
    "*/*/*/talents/*.md": ("think.markdown", "format_markdown", True),
    "*/*/*/talents/*/*.md": ("think.markdown", "format_markdown", True),
    "facets/*/activities/*/*/*.md": ("think.markdown", "format_markdown", True),
    "facets/*/news/*.md": ("think.markdown", "format_markdown", True),
    "imports/*/summary.md": ("think.markdown", "format_markdown", True),
    "apps/*/talents/*.md": ("think.markdown", "format_markdown", True),
}

_DAY_ROOTED_PATTERNS = [p for p in FORMATTERS if p.startswith("*/")]
_STRUCTURAL_PATTERNS = [p for p in FORMATTERS if not p.startswith("*/")]
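

# Note on matching semantics (the example path is hypothetical): get_formatter()
# below matches with fnmatch, whose "*" also spans "/", unlike Path.glob()
# used for discovery. For instance:
#
#   fnmatch.fnmatch(
#       "facets/work/entities/acme/observations.jsonl",
#       "facets/*/entities/*.jsonl",
#   )  # True: the second "*" absorbs "acme/observations"
#
# That is why the more specific observations pattern is registered before the
# general entities pattern: first match wins.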


def get_formatter(file_path: str) -> Callable | None:
    """Return formatter function for a journal-relative file path.

    Matches against registered glob patterns (regardless of indexed flag).

    Args:
        file_path: Journal-relative path (e.g., "20240101/talents/flow.md")

    Returns:
        Formatter function or None if no pattern matches
    """
    for pattern, (module_path, func_name, _indexed) in FORMATTERS.items():
        if fnmatch.fnmatch(file_path, pattern):
            module = import_module(module_path)
            return getattr(module, func_name)

    return None


def load_jsonl(file_path: str | Path) -> list[dict[str, Any]]:
    """Load entries from a JSONL file.

    Lines that are empty or fail to parse as JSON are skipped silently.

    Args:
        file_path: Absolute path to JSONL file

    Returns:
        List of parsed JSON objects (one per line)
    """
    entries = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entries.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return entries


def load_markdown(file_path: str | Path) -> str:
    """Load text from a markdown file.

    Args:
        file_path: Absolute path to markdown file

    Returns:
        File contents as string
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()
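

# Illustrative only: a minimal JSONL formatter satisfying the output contract
# documented at the top of this module. The function name and the "text"/"ts"
# entry fields are assumptions for this sketch; real formatters live near
# their domain code and are wired up via FORMATTERS.
def _example_format_notes(
    entries: list[dict[str, Any]], context: dict[str, Any]
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    chunks: list[dict[str, Any]] = []
    for entry in entries:
        chunk: dict[str, Any] = {
            "markdown": f"- {entry.get('text', '')}",
            "source": entry,
        }
        if "ts" in entry:
            chunk["timestamp"] = entry["ts"]  # unix ms, used for ordering
        chunks.append(chunk)
    return chunks, {"indexer": {"agent": "note"}}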


def find_formattable_files(journal: str) -> dict[str, str]:
    """Find all indexable files in the journal.

    Globs each indexed FORMATTERS pattern from the journal root to discover
    files. The registry is the single source of truth for what gets indexed.

    Args:
        journal: Path to journal root directory

    Returns:
        Mapping of journal-relative paths to absolute paths
    """
    files: dict[str, str] = {}
    journal_path = Path(journal)
    day_root = (
        journal_path / CHRONICLE_DIR
        if (journal_path / CHRONICLE_DIR).is_dir()
        else journal_path
    )

    for pattern in _STRUCTURAL_PATTERNS:
        _mod, _func, indexed = FORMATTERS[pattern]
        if not indexed:
            continue
        for match in journal_path.glob(pattern):
            if match.is_file():
                rel = match.relative_to(journal_path).as_posix()
                files[rel] = str(match)

    for pattern in _DAY_ROOTED_PATTERNS:
        _mod, _func, indexed = FORMATTERS[pattern]
        if not indexed:
            continue
        for match in day_root.glob(pattern):
            if match.is_file():
                rel = match.relative_to(day_root).as_posix()
                files[rel] = str(match)

    return files


def format_file(
    file_path: str | Path,
    context: dict[str, Any] | None = None,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Load file, detect formatter, return formatted chunks and metadata.

    File must be under the journal root. Supports JSONL, JSON, and Markdown files.

    Args:
        file_path: Absolute or journal-relative path to file
        context: Optional context dict passed to formatter

    Returns:
        Tuple of (chunks, meta) where:
        - chunks: List of dicts with "markdown" key (and optional "timestamp")
        - meta: Dict with optional "header" and "error" keys

    Raises:
        ValueError: If file is outside journal or no formatter found
        FileNotFoundError: If file doesn't exist
    """
    journal_path = Path(get_journal()).resolve()
    file_path = Path(file_path).resolve()

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    # Require file to be under journal
    if not file_path.is_relative_to(journal_path):
        raise ValueError(f"File is outside journal directory: {file_path}")

    rel_path = journal_relative_path(journal_path, file_path)

    formatter = get_formatter(rel_path)
    if formatter is None:
        raise ValueError(f"No formatter found for: {rel_path}")

    # Load file based on extension
    if file_path.suffix == ".md":
        content = load_markdown(file_path)
    elif file_path.suffix == ".json":
        with open(file_path, encoding="utf-8") as f:
            content = [json.load(f)]
    else:
        content = load_jsonl(file_path)

    # Build context with file path info
    ctx = context or {}
    ctx.setdefault("file_path", file_path)

    return formatter(content, ctx)
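

# Usage sketch (the journal root comes from configuration; the file path is
# hypothetical and assumed to exist):
#
#   journal = Path(get_journal())
#   chunks, meta = format_file(journal / "20240101" / "talents" / "flow.md")
#   for chunk in chunks:
#       print(chunk["markdown"])
#   if meta.get("error"):
#       print("warning:", meta["error"])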


def _format_chunk_summary(chunks: list[dict], raw_chunks: list[dict] | None) -> None:
    """Print human-readable chunk summary (for markdown files with raw chunks)."""
    print(f"Total chunks: {len(chunks)}\n")
    for i, chunk in enumerate(chunks):
        # Use raw chunk data if available, otherwise extract from markdown
        if raw_chunks and i < len(raw_chunks):
            c = raw_chunks[i]
            chunk_type = c.get("type", "unknown")
            header_path = c.get("header_path", [])
            intro = c.get("intro")
            preview = c.get("preview", "")
        else:
            chunk_type = "chunk"
            header_path = []
            intro = None
            preview = chunk.get("markdown", "")[:70]

        path = " > ".join(f"H{h['level']}:{h['text']}" for h in header_path)
        print(f"#{i:3d} [{chunk_type:13s}]")
        if path:
            print(f"     path: {path}")
        if intro:
            print(f'     intro: "{intro[:60]}{"..." if len(intro) > 60 else ""}"')
        print(f"     {preview[:70]}{'...' if len(preview) > 70 else ''}")
        print()
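

# Minimal end-to-end sketch (illustrative, not part of the framework): walk
# every indexable file, format it, and report chunk counts. Guarded so that
# importing this module stays side-effect free.
if __name__ == "__main__":
    journal = get_journal()
    for rel, abs_path in find_formattable_files(journal).items():
        path_meta = extract_path_metadata(rel)
        chunks, meta = format_file(abs_path)
        agent = path_meta["agent"] or meta.get("indexer", {}).get("agent", "")
        print(f"{rel}: {len(chunks)} chunks (agent={agent})")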