# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Formatters framework for JSONL and Markdown files.

This module provides a registry-based system for converting structured files
to markdown chunks. Each formatter is a plain function that lives near its
source domain code.

Supported file types:
    - JSONL (.jsonl): Parsed as JSON lines, passed as list[dict] to formatter
    - JSON (.json): Parsed as a single object, passed as a one-element list[dict]
    - Markdown (.md): Read as text, passed as str to formatter

Output contract: All formatters return tuple[list[dict], dict] where:
    - list[dict]: Chunks, each with:
        - markdown: str (formatted markdown for this chunk)
        - timestamp: int (optional - unix timestamp in milliseconds for ordering)
        - source: dict (optional - original entry from JSONL for enriched streams)
    - dict: Metadata about the formatting with optional keys:
        - header: str - Optional header markdown (metadata summary, context, etc.)
        - error: str - Optional error/warning message (e.g., skipped entries)
        - indexer: dict - Indexing metadata with keys:
            - agent: str - Content type (e.g., "event", "audio", "screen")
              JSONL formatters must provide agent. Markdown agent is path-derived.
              Day and facet are extracted from path by extract_path_metadata().

JSONL formatters receive list[dict] entries and are responsible for:
    - Extracting metadata from entries (typically first line)
    - Building header from metadata if applicable
    - Formatting content entries into chunks
    - Providing indexer.agent in the meta dict

Markdown formatters receive str text and perform semantic chunking.
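
Example JSONL formatter satisfying the contract (a minimal sketch;
format_notes and the entry fields shown are hypothetical):

    def format_notes(entries: list[dict], ctx: dict) -> tuple[list[dict], dict]:
        chunks = [
            {"markdown": f"- {e['text']}", "source": e}
            for e in entries
            if "text" in e
        ]
        return chunks, {"indexer": {"agent": "note"}}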
34"""
35
36import fnmatch
37import json
38import os
39from importlib import import_module
40from pathlib import Path
41from typing import Any, Callable
42
43from think.utils import CHRONICLE_DIR, DATE_RE, get_journal, journal_relative_path
44
45
def extract_path_metadata(rel_path: str) -> dict[str, str]:
    """Extract indexing metadata from a journal-relative path.

    Extracts day and facet from path structure. For markdown files, also
    derives agent from path. For JSONL files, agent should be provided
    by the formatter via meta["indexer"]["agent"].

    Args:
        rel_path: Journal-relative path (e.g., "20240101/talents/flow.md")

    Returns:
        Dict with keys: day, facet, agent
        - day: YYYYMMDD string or empty
        - facet: Facet name or empty
        - agent: Derived agent for .md files, empty for .jsonl
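
    Examples (illustrative paths):
        extract_path_metadata("20240101/talents/flow.md")
        -> {"day": "20240101", "facet": "", "agent": "flow"}
        extract_path_metadata("facets/work/events/20240102.jsonl")
        -> {"day": "20240102", "facet": "work", "agent": ""}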
61 """
62 parts = rel_path.replace("\\", "/").split("/")
63 filename = parts[-1]
64 basename = os.path.splitext(filename)[0]
65 is_markdown = filename.endswith(".md")
66
67 day = ""
68 facet = ""
69 agent = ""
70
71 # Extract day from YYYYMMDD directory prefix
72 if parts[0] and DATE_RE.fullmatch(parts[0]):
73 day = parts[0]
74
75 # Extract facet from talents/{facet}/... paths
76 try:
77 talents_idx = parts.index("talents")
78 if talents_idx + 2 < len(parts):
79 facet = parts[talents_idx + 1]
80 except ValueError:
81 pass
82
83 # Extract facet from facets/{facet}/... paths
84 if parts[0] == "facets" and len(parts) >= 3:
85 facet = parts[1]
86 # Day from YYYYMMDD filename (events/entities/todos/news)
87 if len(parts) >= 4 and DATE_RE.fullmatch(basename):
88 day = basename
89 # Day from activities/{YYYYMMDD}/{activity_id}/... directory structure
90 elif (
91 len(parts) >= 5 and parts[2] == "activities" and DATE_RE.fullmatch(parts[3])
92 ):
93 day = parts[3]
94
95 # Extract day from imports/YYYYMMDD_HHMMSS/...
96 if parts[0] == "imports" and len(parts) >= 2:
97 import_id = parts[1]
98 day = import_id.split("_")[0] if "_" in import_id else import_id[:8]
99
100 # Extract day from config/actions/YYYYMMDD.jsonl (journal-level logs)
101 if parts[0] == "config" and len(parts) >= 3 and parts[1] == "actions":
102 if DATE_RE.fullmatch(basename):
103 day = basename
104
105 # Derive agent for markdown files only
106 if is_markdown:
107 if parts[0] == "facets" and len(parts) >= 4 and parts[2] == "news":
108 agent = "news"
109 elif parts[0] == "imports":
110 agent = "import"
111 elif parts[0] == "apps" and len(parts) >= 4:
112 agent = f"{parts[1]}:{basename}"
113 else:
114 # Daily agent outputs, segment markdown: use basename
115 agent = basename
116
117 return {"day": day, "facet": facet, "agent": agent}
118
119
# Registry mapping glob patterns to (module_path, function_name, indexed).
# Patterns are matched against journal-relative paths and must be specific
# enough to use as Path.glob() arguments from the journal root. The indexed
# flag controls whether find_formattable_files() collects matching files for
# the search index. Adding a new journal content location requires a new entry
# here; see talent/journal/references/storage.md "Search Index" for details.
#
# Order matters: first match wins, so place specific patterns before general ones.
FORMATTERS: dict[str, tuple[str, str, bool]] = {
    # JSONL formatters (indexed)
    "config/actions/*.jsonl": ("think.facets", "format_logs", True),
    "facets/*/entities/*/observations.jsonl": (
        "think.entities.formatting",
        "format_observations",
        True,
    ),
    "facets/*/entities/*.jsonl": ("think.entities.formatting", "format_entities", True),
    "entities/*/entity.json": (
        "think.entities.formatting",
        "format_entity_identity",
        False,  # Indexed via _index_entity_search_chunks (enriched with relationship data)
    ),
    "facets/*/events/*.jsonl": ("think.event_formatter", "format_events", True),
    "facets/*/activities/*.jsonl": ("think.activities", "format_activities", True),
    "facets/*/todos/*.jsonl": ("apps.todos.todo", "format_todos", True),
    "facets/*/logs/*.jsonl": ("think.facets", "format_logs", True),
    # Structured file imports (indexed)
    "*/import.*/imported.jsonl": (
        "think.importers.formatting",
        "format_imported",
        True,
    ),
    # Markdown transcript imports (new convention + legacy)
    "*/import.*/*/*_transcript.md": ("think.markdown", "format_markdown", True),
    "*/import.*/*/imported.md": ("think.markdown", "format_markdown", True),
    # AI chat imports: dedicated formatter (new + legacy filenames)
    "*/import.chatgpt/*/conversation_transcript.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    "*/import.claude/*/conversation_transcript.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    "*/import.gemini/*/conversation_transcript.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    # Legacy AI chat import filenames (backward compat)
    "*/import.chatgpt/*/imported_audio.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    "*/import.claude/*/imported_audio.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    "*/import.gemini/*/imported_audio.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    # Raw transcripts: formattable but not indexed (agent outputs are more useful)
    # Layout: day/stream/segment/audio.jsonl
    "*/*/*/audio.jsonl": ("observe.hear", "format_audio", False),
    "*/*/*/*_audio.jsonl": ("observe.hear", "format_audio", False),
    "*/*/*/*_transcript.jsonl": ("observe.hear", "format_audio", False),
    "*/*/*/screen.jsonl": ("observe.screen", "format_screen", False),
    "*/*/*/*_screen.jsonl": ("observe.screen", "format_screen", False),
    # Markdown: day-level agent outputs and segment-level (day/stream/segment/talents/)
    "*/talents/*.md": ("think.markdown", "format_markdown", True),
    # Layout: day/stream/segment/talents/*.md
    "*/*/*/talents/*.md": ("think.markdown", "format_markdown", True),
    "*/*/*/talents/*/*.md": ("think.markdown", "format_markdown", True),
    "facets/*/activities/*/*/*.md": ("think.markdown", "format_markdown", True),
    "facets/*/news/*.md": ("think.markdown", "format_markdown", True),
    "imports/*/summary.md": ("think.markdown", "format_markdown", True),
    "apps/*/talents/*.md": ("think.markdown", "format_markdown", True),
}
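
# A new content location registers the same way; a hypothetical example entry:
#     "facets/*/notes/*.jsonl": ("think.notes", "format_notes", True),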

_DAY_ROOTED_PATTERNS = [p for p in FORMATTERS if p.startswith("*/")]
_STRUCTURAL_PATTERNS = [p for p in FORMATTERS if not p.startswith("*/")]


def get_formatter(file_path: str) -> Callable | None:
    """Return formatter function for a journal-relative file path.

    Matches against registered glob patterns (regardless of indexed flag).

    Args:
        file_path: Journal-relative path (e.g., "20240101/talents/flow.md")

    Returns:
        Formatter function or None if no pattern matches
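
    Example (an illustrative path): "facets/home/todos/20240101.jsonl" matches
    the "facets/*/todos/*.jsonl" pattern, so apps.todos.todo.format_todos is
    returned.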
219 """
220 for pattern, (module_path, func_name, _indexed) in FORMATTERS.items():
221 if fnmatch.fnmatch(file_path, pattern):
222 module = import_module(module_path)
223 return getattr(module, func_name)
224
225 return None
226
227
def load_jsonl(file_path: str | Path) -> list[dict[str, Any]]:
    """Load entries from a JSONL file.

    Blank lines and lines that fail to parse as JSON are skipped.

    Args:
        file_path: Absolute path to JSONL file

    Returns:
        List of parsed JSON objects (one per line)
    """
    entries = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entries.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return entries


def load_markdown(file_path: str | Path) -> str:
    """Load text from a markdown file.

    Args:
        file_path: Absolute path to markdown file

    Returns:
        File contents as string
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()


def find_formattable_files(journal: str) -> dict[str, str]:
    """Find all indexable files in the journal.

    Globs each indexed FORMATTERS pattern from the journal root to discover
    files. The registry is the single source of truth for what gets indexed.

    Args:
        journal: Path to journal root directory

    Returns:
        Mapping of journal-relative paths to absolute paths
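
    Example (an illustrative journal at /j):
        find_formattable_files("/j") might return
        {"facets/work/events/20240101.jsonl": "/j/facets/work/events/20240101.jsonl"}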
274 """
275 files: dict[str, str] = {}
276 journal_path = Path(journal)
277 day_root = (
278 journal_path / CHRONICLE_DIR
279 if (journal_path / CHRONICLE_DIR).is_dir()
280 else journal_path
281 )
282
283 for pattern in _STRUCTURAL_PATTERNS:
284 _mod, _func, indexed = FORMATTERS[pattern]
285 if not indexed:
286 continue
287 for match in journal_path.glob(pattern):
288 if match.is_file():
289 rel = match.relative_to(journal_path).as_posix()
290 files[rel] = str(match)
291
292 for pattern in _DAY_ROOTED_PATTERNS:
293 _mod, _func, indexed = FORMATTERS[pattern]
294 if not indexed:
295 continue
296 for match in day_root.glob(pattern):
297 if match.is_file():
298 rel = match.relative_to(day_root).as_posix()
299 files[rel] = str(match)
300
301 return files
302
303
def format_file(
    file_path: str | Path,
    context: dict[str, Any] | None = None,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Load file, detect formatter, return formatted chunks and metadata.

    File must be under the journal root. Supports JSONL, JSON, and Markdown files.

    Args:
        file_path: Absolute or journal-relative path to file
        context: Optional context dict passed to formatter

    Returns:
        Tuple of (chunks, meta) where:
        - chunks: List of dicts with "markdown" key (and optional "timestamp")
        - meta: Dict with optional "header" and "error" keys

    Raises:
        ValueError: If file is outside journal or no formatter found
        FileNotFoundError: If file doesn't exist
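
    Example (an illustrative path):
        chunks, meta = format_file("/j/20240101/talents/flow.md")
        for chunk in chunks:
            print(chunk["markdown"])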
324 """
325 journal_path = Path(get_journal()).resolve()
326 file_path = Path(file_path).resolve()
327
328 if not file_path.exists():
329 raise FileNotFoundError(f"File not found: {file_path}")
330
331 # Require file to be under journal
332 if not file_path.is_relative_to(journal_path):
333 raise ValueError(f"File is outside journal directory: {file_path}")
334
335 rel_path = journal_relative_path(journal_path, file_path)
336
337 formatter = get_formatter(rel_path)
338 if formatter is None:
339 raise ValueError(f"No formatter found for: {rel_path}")
340
341 # Load file based on extension
342 if file_path.suffix == ".md":
343 content = load_markdown(file_path)
344 elif file_path.suffix == ".json":
345 with open(file_path, encoding="utf-8") as f:
346 content = [json.load(f)]
347 else:
348 content = load_jsonl(file_path)
349
350 # Build context with file path info
351 ctx = context or {}
352 ctx.setdefault("file_path", file_path)
353
354 return formatter(content, ctx)
355
356
def _format_chunk_summary(chunks: list[dict], raw_chunks: list[dict] | None) -> None:
    """Print human-readable chunk summary (for markdown files with raw chunks)."""
    print(f"Total chunks: {len(chunks)}\n")
    for i, chunk in enumerate(chunks):
        # Use raw chunk data if available, otherwise extract from markdown
        if raw_chunks and i < len(raw_chunks):
            c = raw_chunks[i]
            chunk_type = c.get("type", "unknown")
            header_path = c.get("header_path", [])
            intro = c.get("intro")
            preview = c.get("preview", "")
        else:
            chunk_type = "chunk"
            header_path = []
            intro = None
            preview = chunk.get("markdown", "")[:70]

        path = " > ".join(f"H{h['level']}:{h['text']}" for h in header_path)
        print(f"#{i:3d} [{chunk_type:13s}]")
        if path:
            print(f"     path: {path}")
        if intro:
            print(f'     intro: "{intro[:60]}{"..." if len(intro) > 60 else ""}"')
        print(f"     {preview[:70]}{'...' if len(preview) > 70 else ''}")
        print()