# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Formatters framework for JSONL and Markdown files.

This module provides a registry-based system for converting structured files
to markdown chunks. Each formatter is a plain function that lives near its
source domain code.

Supported file types:
    - JSONL (.jsonl): Parsed as JSON lines, passed as list[dict] to formatter
    - JSON (.json): Parsed as a single object, passed as a one-element list[dict]
    - Markdown (.md): Read as text, passed as str to formatter

Output contract: All formatters return tuple[list[dict], dict] where:
    - list[dict]: Chunks, each with:
        - markdown: str (formatted markdown for this chunk)
        - timestamp: int (optional - unix timestamp in milliseconds for ordering)
        - source: dict (optional - original entry from JSONL for enriched streams)
    - dict: Metadata about the formatting with optional keys:
        - header: str - Optional header markdown (metadata summary, context, etc.)
        - error: str - Optional error/warning message (e.g., skipped entries)
        - indexer: dict - Indexing metadata with keys:
            - agent: str - Content type (e.g., "event", "audio", "screen")
              JSONL formatters must provide agent. Markdown agent is path-derived.
              Day and facet are extracted from path by extract_path_metadata().

JSONL formatters receive list[dict] entries and are responsible for:
    - Extracting metadata from entries (typically first line)
    - Building header from metadata if applicable
    - Formatting content entries into chunks
    - Providing indexer.agent in the meta dict

Markdown formatters receive str text and perform semantic chunking.
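
Example JSONL formatter satisfying the contract (a minimal sketch;
format_notes and the entry fields shown are hypothetical):

    def format_notes(entries: list[dict], ctx: dict) -> tuple[list[dict], dict]:
        chunks = [
            {"markdown": f"- {e['text']}", "source": e}
            for e in entries
            if "text" in e
        ]
        return chunks, {"indexer": {"agent": "note"}}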
34"""
35
36import fnmatch
37import json
38import os
39from importlib import import_module
40from pathlib import Path
41from typing import Any, Callable
42
43from think.utils import CHRONICLE_DIR, DATE_RE, get_journal, journal_relative_path
44
45
def extract_path_metadata(rel_path: str) -> dict[str, str]:
    """Extract indexing metadata from a journal-relative path.

    Extracts day and facet from path structure. For markdown files, also
    derives agent from path. For JSONL files, agent should be provided
    by the formatter via meta["indexer"]["agent"].

    Args:
        rel_path: Journal-relative path (e.g., "20240101/talents/flow.md")

    Returns:
        Dict with keys: day, facet, agent
        - day: YYYYMMDD string or empty
        - facet: Facet name or empty
        - agent: Derived agent for .md files, empty for .jsonl
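
    Examples (illustrative paths):
        extract_path_metadata("20240101/talents/flow.md")
        -> {"day": "20240101", "facet": "", "agent": "flow"}
        extract_path_metadata("facets/work/events/20240102.jsonl")
        -> {"day": "20240102", "facet": "work", "agent": ""}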
61 """
62 parts = rel_path.replace("\\", "/").split("/")
63 filename = parts[-1]
64 basename = os.path.splitext(filename)[0]
65 is_markdown = filename.endswith(".md")
66
67 day = ""
68 facet = ""
69 agent = ""
70
71 # Extract day from YYYYMMDD directory prefix
72 if parts[0] and DATE_RE.fullmatch(parts[0]):
73 day = parts[0]
74
75 # Extract facet from talents/{facet}/... paths
76 try:
77 talents_idx = parts.index("talents")
78 if talents_idx + 2 < len(parts):
79 facet = parts[talents_idx + 1]
80 except ValueError:
81 pass
82
83 # Extract facet from facets/{facet}/... paths
84 if parts[0] == "facets" and len(parts) >= 3:
85 facet = parts[1]
86 # Day from YYYYMMDD filename (events/entities/todos/news)
87 if len(parts) >= 4 and DATE_RE.fullmatch(basename):
88 day = basename
89 # Day from activities/{YYYYMMDD}/{activity_id}/... directory structure
90 elif (
91 len(parts) >= 5 and parts[2] == "activities" and DATE_RE.fullmatch(parts[3])
92 ):
93 day = parts[3]
94
95 # Extract day from imports/YYYYMMDD_HHMMSS/...
96 if parts[0] == "imports" and len(parts) >= 2:
97 import_id = parts[1]
98 day = import_id.split("_")[0] if "_" in import_id else import_id[:8]
99
100 # Extract day from config/actions/YYYYMMDD.jsonl (journal-level logs)
101 if parts[0] == "config" and len(parts) >= 3 and parts[1] == "actions":
102 if DATE_RE.fullmatch(basename):
103 day = basename
104
105 # Derive agent for markdown files only
106 if is_markdown:
107 if parts[0] == "facets" and len(parts) >= 4 and parts[2] == "news":
108 agent = "news"
109 elif parts[0] == "imports":
110 agent = "import"
111 elif parts[0] == "apps" and len(parts) >= 4:
112 agent = f"{parts[1]}:{basename}"
113 else:
114 # Daily agent outputs, segment markdown: use basename
115 agent = basename
116
117 return {"day": day, "facet": facet, "agent": agent}
118
119
# Registry mapping glob patterns to (module_path, function_name, indexed).
# Patterns are matched against journal-relative paths and must be specific
# enough to use as Path.glob() arguments from the journal root. The indexed
# flag controls whether find_formattable_files() collects matching files for
# the search index. Adding a new journal content location requires a new entry
# here; see talent/journal/references/storage.md "Search Index" for details.
#
# Order matters: first match wins, so place specific patterns before general ones.
FORMATTERS: dict[str, tuple[str, str, bool]] = {
    # JSONL formatters (indexed)
    "config/actions/*.jsonl": ("think.facets", "format_logs", True),
    "facets/*/entities/*/observations.jsonl": (
        "think.entities.formatting",
        "format_observations",
        True,
    ),
    "facets/*/entities/*.jsonl": ("think.entities.formatting", "format_entities", True),
    "entities/*/entity.json": (
        "think.entities.formatting",
        "format_entity_identity",
        False,  # Indexed via _index_entity_search_chunks (enriched with relationship data)
    ),
    "facets/*/events/*.jsonl": ("think.event_formatter", "format_events", True),
    "facets/*/activities/*.jsonl": ("think.activities", "format_activities", True),
    "facets/*/todos/*.jsonl": ("apps.todos.todo", "format_todos", True),
    "facets/*/logs/*.jsonl": ("think.facets", "format_logs", True),
    # Structured file imports (indexed)
    "*/import.*/imported.jsonl": (
        "think.importers.formatting",
        "format_imported",
        True,
    ),
    # Markdown transcript imports (new convention + legacy)
    "*/import.*/*/*_transcript.md": ("think.markdown", "format_markdown", True),
    "*/import.*/*/imported.md": ("think.markdown", "format_markdown", True),
    # AI chat imports: dedicated formatter (new + legacy filenames)
    "*/import.chatgpt/*/conversation_transcript.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    "*/import.claude/*/conversation_transcript.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    "*/import.gemini/*/conversation_transcript.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    # Legacy AI chat import filenames (backward compat)
    "*/import.chatgpt/*/imported_audio.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    "*/import.claude/*/imported_audio.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    "*/import.gemini/*/imported_audio.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    # Raw transcripts: formattable but not indexed (agent outputs are more useful)
    # Layout: day/stream/segment/audio.jsonl
    "*/*/*/audio.jsonl": ("observe.hear", "format_audio", False),
    "*/*/*/*_audio.jsonl": ("observe.hear", "format_audio", False),
    "*/*/*/*_transcript.jsonl": ("observe.hear", "format_audio", False),
    "*/*/*/screen.jsonl": ("observe.screen", "format_screen", False),
    "*/*/*/*_screen.jsonl": ("observe.screen", "format_screen", False),
    # Markdown: day-level agent outputs and segment-level (day/stream/segment/talents/)
    "*/talents/*.md": ("think.markdown", "format_markdown", True),
    # Layout: day/stream/segment/talents/*.md
    "*/*/*/talents/*.md": ("think.markdown", "format_markdown", True),
    "*/*/*/talents/*/*.md": ("think.markdown", "format_markdown", True),
    "facets/*/activities/*/*/*.md": ("think.markdown", "format_markdown", True),
    "facets/*/news/*.md": ("think.markdown", "format_markdown", True),
    "imports/*/summary.md": ("think.markdown", "format_markdown", True),
    "apps/*/talents/*.md": ("think.markdown", "format_markdown", True),
}
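
# A new content location registers the same way; a hypothetical example entry:
#     "facets/*/notes/*.jsonl": ("think.notes", "format_notes", True),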

_DAY_ROOTED_PATTERNS = [p for p in FORMATTERS if p.startswith("*/")]
_STRUCTURAL_PATTERNS = [p for p in FORMATTERS if not p.startswith("*/")]


def get_formatter(file_path: str) -> Callable | None:
    """Return formatter function for a journal-relative file path.

    Matches against registered glob patterns (regardless of indexed flag).

    Args:
        file_path: Journal-relative path (e.g., "20240101/talents/flow.md")

    Returns:
        Formatter function or None if no pattern matches
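
    Example (an illustrative path): "facets/home/todos/20240101.jsonl" matches
    the "facets/*/todos/*.jsonl" pattern, so apps.todos.todo.format_todos is
    returned.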
219 """
220 for pattern, (module_path, func_name, _indexed) in FORMATTERS.items():
221 if fnmatch.fnmatch(file_path, pattern):
222 module = import_module(module_path)
223 return getattr(module, func_name)
224
225 return None
226
227
def load_jsonl(file_path: str | Path) -> list[dict[str, Any]]:
    """Load entries from a JSONL file.

    Blank lines and lines that fail to parse as JSON are skipped.

    Args:
        file_path: Absolute path to JSONL file

    Returns:
        List of parsed JSON objects (one per line)
    """
    entries = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entries.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return entries


def load_markdown(file_path: str | Path) -> str:
    """Load text from a markdown file.

    Args:
        file_path: Absolute path to markdown file

    Returns:
        File contents as string
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()


def find_formattable_files(journal: str) -> dict[str, str]:
    """Find all indexable files in the journal.

    Globs each indexed FORMATTERS pattern from the journal root to discover
    files. The registry is the single source of truth for what gets indexed.

    Args:
        journal: Path to journal root directory

    Returns:
        Mapping of journal-relative paths to absolute paths
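
    Example (an illustrative journal at /j):
        find_formattable_files("/j") might return
        {"facets/work/events/20240101.jsonl": "/j/facets/work/events/20240101.jsonl"}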
274 """
275 files: dict[str, str] = {}
276 journal_path = Path(journal)
277 day_root = (
278 journal_path / CHRONICLE_DIR
279 if (journal_path / CHRONICLE_DIR).is_dir()
280 else journal_path
281 )
282
283 for pattern in _STRUCTURAL_PATTERNS:
284 _mod, _func, indexed = FORMATTERS[pattern]
285 if not indexed:
286 continue
287 for match in journal_path.glob(pattern):
288 if match.is_file():
289 rel = match.relative_to(journal_path).as_posix()
290 files[rel] = str(match)
291
292 for pattern in _DAY_ROOTED_PATTERNS:
293 _mod, _func, indexed = FORMATTERS[pattern]
294 if not indexed:
295 continue
296 for match in day_root.glob(pattern):
297 if match.is_file():
298 rel = match.relative_to(day_root).as_posix()
299 files[rel] = str(match)
300
301 return files
302
303
def format_file(
    file_path: str | Path,
    context: dict[str, Any] | None = None,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Load file, detect formatter, return formatted chunks and metadata.

    File must be under the journal root. Supports JSONL, JSON, and Markdown files.

    Args:
        file_path: Absolute or journal-relative path to file
        context: Optional context dict passed to formatter

    Returns:
        Tuple of (chunks, meta) where:
        - chunks: List of dicts with "markdown" key (and optional "timestamp")
        - meta: Dict with optional "header" and "error" keys

    Raises:
        ValueError: If file is outside journal or no formatter found
        FileNotFoundError: If file doesn't exist
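
    Example (an illustrative path):
        chunks, meta = format_file("/j/20240101/talents/flow.md")
        for chunk in chunks:
            print(chunk["markdown"])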
324 """
325 journal_path = Path(get_journal()).resolve()
326 file_path = Path(file_path).resolve()
327
328 if not file_path.exists():
329 raise FileNotFoundError(f"File not found: {file_path}")
330
331 # Require file to be under journal
332 if not file_path.is_relative_to(journal_path):
333 raise ValueError(f"File is outside journal directory: {file_path}")
334
335 rel_path = journal_relative_path(journal_path, file_path)
336
337 formatter = get_formatter(rel_path)
338 if formatter is None:
339 raise ValueError(f"No formatter found for: {rel_path}")
340
341 # Load file based on extension
342 if file_path.suffix == ".md":
343 content = load_markdown(file_path)
344 elif file_path.suffix == ".json":
345 with open(file_path, encoding="utf-8") as f:
346 content = [json.load(f)]
347 else:
348 content = load_jsonl(file_path)
349
350 # Build context with file path info
351 ctx = context or {}
352 ctx.setdefault("file_path", file_path)
353
354 return formatter(content, ctx)
355
356
def _format_chunk_summary(chunks: list[dict], raw_chunks: list[dict] | None) -> None:
    """Print human-readable chunk summary (for markdown files with raw chunks)."""
    print(f"Total chunks: {len(chunks)}\n")
    for i, chunk in enumerate(chunks):
        # Use raw chunk data if available, otherwise extract from markdown
        if raw_chunks and i < len(raw_chunks):
            c = raw_chunks[i]
            chunk_type = c.get("type", "unknown")
            header_path = c.get("header_path", [])
            intro = c.get("intro")
            preview = c.get("preview", "")
        else:
            chunk_type = "chunk"
            header_path = []
            intro = None
            preview = chunk.get("markdown", "")[:70]

        path = " > ".join(f"H{h['level']}:{h['text']}" for h in header_path)
        print(f"#{i:3d} [{chunk_type:13s}]")
        if path:
            print(f"     path: {path}")
        if intro:
            print(f'     intro: "{intro[:60]}{"..." if len(intro) > 60 else ""}"')
        print(f"     {preview[:70]}{'...' if len(preview) > 70 else ''}")
        print()