personal memory agent
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Add entity metadata to SQLite search index

Entity identity records (name, type, aka) were indexed but thin — no
descriptions. Relationship records (descriptions, tags, facet context)
were not indexed into FTS5 at all.

New _index_entity_search_chunks() combines identity + relationship data
into rich, searchable chunks: one per entity-facet relationship with
name, type, aliases, description, and tags, plus a single identity-only
chunk for any entity that has no relationships. Replaces the previous
formatter-based identity-only chunks.

Entity chunks use agent="entity" and path="entity_search:{id}", so they
surface in Convey search alongside prose mentions with the existing
Entity icon and filter.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

+184 -9
tests/fixtures/journal/indexer/journal.sqlite

This is a binary file and will not be displayed.

+47 -8
tests/test_journal_index.py
··· 1229 1229 conn.close() 1230 1230 1231 1231 1232 - def test_entity_identity_fts_chunks_indexed(): 1233 - """Entity identity files produce FTS chunks with agent='entity'.""" 1232 + def test_entity_search_chunks_indexed(): 1233 + """Entity search chunks are generated from identity + relationship data.""" 1234 1234 from think.indexer.journal import scan_journal 1235 1235 1236 1236 os.environ["JOURNAL_PATH"] = "tests/fixtures/journal" ··· 1240 1240 0 1241 1241 ] 1242 1242 conn.close() 1243 - assert count == 33 1243 + # One chunk per entity-facet relationship (33 identities × relationships per facet) 1244 + assert count == 40 1244 1245 1245 1246 1246 - def test_entity_identity_search_by_name(): 1247 - """Entity identity name is searchable via FTS.""" 1247 + def test_entity_search_chunks_use_entity_search_path(): 1248 + """Entity search chunks use entity_search: path prefix.""" 1249 + from think.indexer.journal import scan_journal 1250 + 1251 + os.environ["JOURNAL_PATH"] = "tests/fixtures/journal" 1252 + scan_journal("tests/fixtures/journal", full=True) 1253 + conn, _ = get_journal_index("tests/fixtures/journal") 1254 + rows = conn.execute( 1255 + "SELECT DISTINCT path FROM chunks WHERE agent='entity'" 1256 + ).fetchall() 1257 + conn.close() 1258 + assert all(r[0].startswith("entity_search:") for r in rows) 1259 + 1260 + 1261 + def test_entity_search_by_name(): 1262 + """Entity name is searchable via FTS.""" 1248 1263 from think.indexer.journal import scan_journal 1249 1264 1250 1265 os.environ["JOURNAL_PATH"] = "tests/fixtures/journal" ··· 1254 1269 assert any(r["metadata"]["agent"] == "entity" for r in results) 1255 1270 1256 1271 1257 - def test_entity_identity_search_by_type(): 1272 + def test_entity_search_by_type(): 1258 1273 """Entity type is searchable via FTS.""" 1259 1274 from think.indexer.journal import scan_journal 1260 1275 ··· 1264 1279 assert total >= 1 1265 1280 1266 1281 1267 - def test_entity_identity_fts_idempotent(): 1282 + def 
test_entity_search_includes_description(): 1283 + """Entity search chunks include relationship descriptions.""" 1284 + from think.indexer.journal import scan_journal 1285 + 1286 + os.environ["JOURNAL_PATH"] = "tests/fixtures/journal" 1287 + scan_journal("tests/fixtures/journal", full=True) 1288 + # Alice has description "Close friend from college" in personal facet 1289 + total, results = search_journal("college", agent="entity") 1290 + assert total >= 1 1291 + matched = [r for r in results if "college" in r["text"].lower()] 1292 + assert len(matched) >= 1 1293 + 1294 + 1295 + def test_entity_search_includes_facet(): 1296 + """Entity search chunks have facet metadata from relationships.""" 1297 + from think.indexer.journal import scan_journal 1298 + 1299 + os.environ["JOURNAL_PATH"] = "tests/fixtures/journal" 1300 + scan_journal("tests/fixtures/journal", full=True) 1301 + total, results = search_journal("Alice Johnson", agent="entity", facet="personal") 1302 + assert total >= 1 1303 + assert all(r["metadata"]["facet"] == "personal" for r in results) 1304 + 1305 + 1306 + def test_entity_search_idempotent(): 1268 1307 """Two full scans produce identical entity chunk count (no duplicates).""" 1269 1308 from think.indexer.journal import scan_journal 1270 1309 ··· 1281 1320 "SELECT count(*) FROM chunks WHERE agent='entity'" 1282 1321 ).fetchone()[0] 1283 1322 conn.close() 1284 - assert count1 == count2 == 33 1323 + assert count1 == count2 == 40
+1 -1
think/formatters.py
··· 139 139 "entities/*/entity.json": ( 140 140 "think.entities.formatting", 141 141 "format_entity_identity", 142 - True, 142 + False, # Indexed via _index_entity_search_chunks (enriched with relationship data) 143 143 ), 144 144 "facets/*/events/*.jsonl": ("think.events", "format_events", True), 145 145 "facets/*/calendar/*.jsonl": ("think.events", "format_events", True),
+136
think/indexer/journal.py
··· 1235 1235 return bool(to_index or removed) 1236 1236 1237 1237 1238 + def _ts_to_day(ts_value: str | int | None) -> str: 1239 + """Convert a millisecond timestamp to YYYYMMDD string. 1240 + 1241 + Returns empty string if the value is missing or unparseable. 1242 + """ 1243 + if ts_value is None: 1244 + return "" 1245 + try: 1246 + ms = int(ts_value) 1247 + if ms <= 0: 1248 + return "" 1249 + return date.fromtimestamp(ms / 1000).strftime("%Y%m%d") 1250 + except (ValueError, TypeError, OSError): 1251 + return "" 1252 + 1253 + 1254 + def _index_entity_search_chunks(conn: sqlite3.Connection) -> int: 1255 + """Generate FTS5 search chunks from the entities table. 1256 + 1257 + Combines identity records (name, type, aka) with relationship records 1258 + (description, tags, facet) to create searchable chunks for each entity. 1259 + One chunk per entity-facet relationship, plus one for identity-only entities. 1260 + 1261 + Returns the number of entity chunks indexed. 1262 + """ 1263 + # Clean up: remove previous entity search chunks and legacy formatter chunks 1264 + conn.execute("DELETE FROM chunks WHERE path LIKE 'entity_search:%'") 1265 + conn.execute("DELETE FROM chunks WHERE path LIKE 'entities/%/entity.json'") 1266 + 1267 + # Load all non-blocked identity records 1268 + identities: dict[str, dict[str, Any]] = {} 1269 + for row in conn.execute( 1270 + "SELECT entity_id, name, type, aka, created_at, updated_at " 1271 + "FROM entities WHERE source='identity' AND (blocked IS NULL OR blocked=0)" 1272 + ).fetchall(): 1273 + entity_id, name, etype, aka, created_at, updated_at = row 1274 + identities[entity_id] = { 1275 + "name": name, 1276 + "type": etype, 1277 + "aka": aka, 1278 + "created_at": created_at, 1279 + "updated_at": updated_at, 1280 + } 1281 + 1282 + # Load all non-detached relationship records, grouped by entity_id 1283 + relationships: dict[str, list[dict[str, Any]]] = {} 1284 + for row in conn.execute( 1285 + "SELECT entity_id, facet, description, tags, 
last_seen, updated_at, attached_at " 1286 + "FROM entities " 1287 + "WHERE source='relationship' AND (detached IS NULL OR detached=0)" 1288 + ).fetchall(): 1289 + entity_id, facet, description, tags, last_seen, updated_at, attached_at = row 1290 + relationships.setdefault(entity_id, []).append( 1291 + { 1292 + "facet": facet, 1293 + "description": description, 1294 + "tags": tags, 1295 + "last_seen": last_seen, 1296 + "updated_at": updated_at, 1297 + "attached_at": attached_at, 1298 + } 1299 + ) 1300 + 1301 + count = 0 1302 + for entity_id, identity in identities.items(): 1303 + name = identity["name"] or entity_id.replace("_", " ").title() 1304 + etype = identity["type"] or "Unknown" 1305 + aka_raw = identity["aka"] 1306 + 1307 + # Build common identity lines (included in every chunk for this entity) 1308 + identity_lines = [f"{name} ({etype})"] 1309 + if aka_raw: 1310 + try: 1311 + aka_list = json.loads(aka_raw) 1312 + if aka_list: 1313 + identity_lines.append(f"Also known as: {', '.join(aka_list)}") 1314 + except (json.JSONDecodeError, TypeError): 1315 + pass 1316 + 1317 + path = f"entity_search:{entity_id}" 1318 + rels = relationships.get(entity_id, []) 1319 + 1320 + if rels: 1321 + # One chunk per facet relationship, enriched with identity data 1322 + for idx, rel in enumerate(rels): 1323 + lines = list(identity_lines) 1324 + if rel["description"]: 1325 + lines.append(rel["description"]) 1326 + if rel["tags"]: 1327 + try: 1328 + tags_list = json.loads(rel["tags"]) 1329 + if tags_list: 1330 + lines.append(f"Tags: {', '.join(tags_list)}") 1331 + except (json.JSONDecodeError, TypeError): 1332 + pass 1333 + 1334 + content = "\n".join(lines) 1335 + facet = (rel["facet"] or "").lower() 1336 + 1337 + # Best available day: last_seen > updated_at > attached_at 1338 + day = "" 1339 + if rel["last_seen"] and len(rel["last_seen"]) == 8: 1340 + day = rel["last_seen"] 1341 + else: 1342 + day = _ts_to_day(rel["updated_at"]) or _ts_to_day( 1343 + rel["attached_at"] 1344 + ) 
1345 + 1346 + conn.execute( 1347 + "INSERT INTO chunks(content, path, day, facet, agent, stream, idx) " 1348 + "VALUES (?, ?, ?, ?, ?, ?, ?)", 1349 + (content, path, day, facet, "entity", "", idx), 1350 + ) 1351 + count += 1 1352 + else: 1353 + # Identity-only entity — one chunk with no facet 1354 + content = "\n".join(identity_lines) 1355 + day = _ts_to_day(identity["updated_at"]) or _ts_to_day( 1356 + identity["created_at"] 1357 + ) 1358 + conn.execute( 1359 + "INSERT INTO chunks(content, path, day, facet, agent, stream, idx) " 1360 + "VALUES (?, ?, ?, ?, ?, ?, ?)", 1361 + (content, path, day, "", "entity", "", 0), 1362 + ) 1363 + count += 1 1364 + 1365 + conn.commit() 1366 + logger.info("%s entity search chunks indexed", count) 1367 + return count 1368 + 1369 + 1238 1370 def consolidate_segment_entities(journal: str, full: bool = False) -> int: 1239 1371 """Consolidate per-segment entity detections into the journal entity store. 1240 1372 ··· 1483 1615 consolidate_segment_entities(journal, full=full) 1484 1616 entity_changed = scan_entities(journal, conn, verbose=verbose, full=full) 1485 1617 signal_changed = scan_signals(journal, conn, verbose=verbose, full=full) 1618 + 1619 + # Regenerate entity search chunks when entity data changes 1620 + if entity_changed: 1621 + _index_entity_search_chunks(conn) 1486 1622 1487 1623 conn.close() 1488 1624 return bool(to_index or removed or entity_changed or signal_changed)