Add entity metadata to SQLite search index · solpbc.org/solstone@a28f8d1

tests/fixtures/journal/indexer/journal.sqlite

This is a binary file and will not be displayed.

+47 -8

tests/test_journal_index.py

··· 1229 1229 conn.close() 1230 1230 1231 1231 1232 - def test_entity_identity_fts_chunks_indexed(): 1233 - """Entity identity files produce FTS chunks with agent='entity'.""" 1232 + def test_entity_search_chunks_indexed(): 1233 + """Entity search chunks are generated from identity + relationship data.""" 1234 1234 from think.indexer.journal import scan_journal 1235 1235 1236 1236 os.environ["JOURNAL_PATH"] = "tests/fixtures/journal" ··· 1240 1240 0 1241 1241 ] 1242 1242 conn.close() 1243 - assert count == 33 1243 + # One chunk per entity-facet relationship (33 identities × relationships per facet) 1244 + assert count == 40 1244 1245 1245 1246 1246 - def test_entity_identity_search_by_name(): 1247 - """Entity identity name is searchable via FTS.""" 1247 + def test_entity_search_chunks_use_entity_search_path(): 1248 + """Entity search chunks use entity_search: path prefix.""" 1249 + from think.indexer.journal import scan_journal 1250 + 1251 + os.environ["JOURNAL_PATH"] = "tests/fixtures/journal" 1252 + scan_journal("tests/fixtures/journal", full=True) 1253 + conn, _ = get_journal_index("tests/fixtures/journal") 1254 + rows = conn.execute( 1255 + "SELECT DISTINCT path FROM chunks WHERE agent='entity'" 1256 + ).fetchall() 1257 + conn.close() 1258 + assert all(r[0].startswith("entity_search:") for r in rows) 1259 + 1260 + 1261 + def test_entity_search_by_name(): 1262 + """Entity name is searchable via FTS.""" 1248 1263 from think.indexer.journal import scan_journal 1249 1264 1250 1265 os.environ["JOURNAL_PATH"] = "tests/fixtures/journal" ··· 1254 1269 assert any(r["metadata"]["agent"] == "entity" for r in results) 1255 1270 1256 1271 1257 - def test_entity_identity_search_by_type(): 1272 + def test_entity_search_by_type(): 1258 1273 """Entity type is searchable via FTS.""" 1259 1274 from think.indexer.journal import scan_journal 1260 1275 ··· 1264 1279 assert total >= 1 1265 1280 1266 1281 1267 - def test_entity_identity_fts_idempotent(): 1282 + def test_entity_search_includes_description(): 1283 + """Entity search chunks include relationship descriptions.""" 1284 + from think.indexer.journal import scan_journal 1285 + 1286 + os.environ["JOURNAL_PATH"] = "tests/fixtures/journal" 1287 + scan_journal("tests/fixtures/journal", full=True) 1288 + # Alice has description "Close friend from college" in personal facet 1289 + total, results = search_journal("college", agent="entity") 1290 + assert total >= 1 1291 + matched = [r for r in results if "college" in r["text"].lower()] 1292 + assert len(matched) >= 1 1293 + 1294 + 1295 + def test_entity_search_includes_facet(): 1296 + """Entity search chunks have facet metadata from relationships.""" 1297 + from think.indexer.journal import scan_journal 1298 + 1299 + os.environ["JOURNAL_PATH"] = "tests/fixtures/journal" 1300 + scan_journal("tests/fixtures/journal", full=True) 1301 + total, results = search_journal("Alice Johnson", agent="entity", facet="personal") 1302 + assert total >= 1 1303 + assert all(r["metadata"]["facet"] == "personal" for r in results) 1304 + 1305 + 1306 + def test_entity_search_idempotent(): 1268 1307 """Two full scans produce identical entity chunk count (no duplicates).""" 1269 1308 from think.indexer.journal import scan_journal 1270 1309 ··· 1281 1320 "SELECT count(*) FROM chunks WHERE agent='entity'" 1282 1321 ).fetchone()[0] 1283 1322 conn.close() 1284 - assert count1 == count2 == 33 1323 + assert count1 == count2 == 40

+1 -1

think/formatters.py

··· 139 139 "entities/*/entity.json": ( 140 140 "think.entities.formatting", 141 141 "format_entity_identity", 142 - True, 142 + False, # Indexed via _index_entity_search_chunks (enriched with relationship data) 143 143 ), 144 144 "facets/*/events/*.jsonl": ("think.events", "format_events", True), 145 145 "facets/*/calendar/*.jsonl": ("think.events", "format_events", True),

+136

think/indexer/journal.py

··· 1235 1235 return bool(to_index or removed) 1236 1236 1237 1237 1238 + def _ts_to_day(ts_value: str | int | None) -> str: 1239 + """Convert a millisecond timestamp to YYYYMMDD string. 1240 + 1241 + Returns empty string if the value is missing or unparseable. 1242 + """ 1243 + if ts_value is None: 1244 + return "" 1245 + try: 1246 + ms = int(ts_value) 1247 + if ms <= 0: 1248 + return "" 1249 + return date.fromtimestamp(ms / 1000).strftime("%Y%m%d") 1250 + except (ValueError, TypeError, OSError): 1251 + return "" 1252 + 1253 + 1254 + def _index_entity_search_chunks(conn: sqlite3.Connection) -> int: 1255 + """Generate FTS5 search chunks from the entities table. 1256 + 1257 + Combines identity records (name, type, aka) with relationship records 1258 + (description, tags, facet) to create searchable chunks for each entity. 1259 + One chunk per entity-facet relationship, plus one for identity-only entities. 1260 + 1261 + Returns the number of entity chunks indexed. 1262 + """ 1263 + # Clean up: remove previous entity search chunks and legacy formatter chunks 1264 + conn.execute("DELETE FROM chunks WHERE path LIKE 'entity_search:%'") 1265 + conn.execute("DELETE FROM chunks WHERE path LIKE 'entities/%/entity.json'") 1266 + 1267 + # Load all non-blocked identity records 1268 + identities: dict[str, dict[str, Any]] = {} 1269 + for row in conn.execute( 1270 + "SELECT entity_id, name, type, aka, created_at, updated_at " 1271 + "FROM entities WHERE source='identity' AND (blocked IS NULL OR blocked=0)" 1272 + ).fetchall(): 1273 + entity_id, name, etype, aka, created_at, updated_at = row 1274 + identities[entity_id] = { 1275 + "name": name, 1276 + "type": etype, 1277 + "aka": aka, 1278 + "created_at": created_at, 1279 + "updated_at": updated_at, 1280 + } 1281 + 1282 + # Load all non-detached relationship records, grouped by entity_id 1283 + relationships: dict[str, list[dict[str, Any]]] = {} 1284 + for row in conn.execute( 1285 + "SELECT entity_id, facet, description, tags, last_seen, updated_at, attached_at " 1286 + "FROM entities " 1287 + "WHERE source='relationship' AND (detached IS NULL OR detached=0)" 1288 + ).fetchall(): 1289 + entity_id, facet, description, tags, last_seen, updated_at, attached_at = row 1290 + relationships.setdefault(entity_id, []).append( 1291 + { 1292 + "facet": facet, 1293 + "description": description, 1294 + "tags": tags, 1295 + "last_seen": last_seen, 1296 + "updated_at": updated_at, 1297 + "attached_at": attached_at, 1298 + } 1299 + ) 1300 + 1301 + count = 0 1302 + for entity_id, identity in identities.items(): 1303 + name = identity["name"] or entity_id.replace("_", " ").title() 1304 + etype = identity["type"] or "Unknown" 1305 + aka_raw = identity["aka"] 1306 + 1307 + # Build common identity lines (included in every chunk for this entity) 1308 + identity_lines = [f"{name} ({etype})"] 1309 + if aka_raw: 1310 + try: 1311 + aka_list = json.loads(aka_raw) 1312 + if aka_list: 1313 + identity_lines.append(f"Also known as: {', '.join(aka_list)}") 1314 + except (json.JSONDecodeError, TypeError): 1315 + pass 1316 + 1317 + path = f"entity_search:{entity_id}" 1318 + rels = relationships.get(entity_id, []) 1319 + 1320 + if rels: 1321 + # One chunk per facet relationship, enriched with identity data 1322 + for idx, rel in enumerate(rels): 1323 + lines = list(identity_lines) 1324 + if rel["description"]: 1325 + lines.append(rel["description"]) 1326 + if rel["tags"]: 1327 + try: 1328 + tags_list = json.loads(rel["tags"]) 1329 + if tags_list: 1330 + lines.append(f"Tags: {', '.join(tags_list)}") 1331 + except (json.JSONDecodeError, TypeError): 1332 + pass 1333 + 1334 + content = "\n".join(lines) 1335 + facet = (rel["facet"] or "").lower() 1336 + 1337 + # Best available day: last_seen > updated_at > attached_at 1338 + day = "" 1339 + if rel["last_seen"] and len(rel["last_seen"]) == 8: 1340 + day = rel["last_seen"] 1341 + else: 1342 + day = _ts_to_day(rel["updated_at"]) or _ts_to_day( 1343 + rel["attached_at"] 1344 + ) 1345 + 1346 + conn.execute( 1347 + "INSERT INTO chunks(content, path, day, facet, agent, stream, idx) " 1348 + "VALUES (?, ?, ?, ?, ?, ?, ?)", 1349 + (content, path, day, facet, "entity", "", idx), 1350 + ) 1351 + count += 1 1352 + else: 1353 + # Identity-only entity — one chunk with no facet 1354 + content = "\n".join(identity_lines) 1355 + day = _ts_to_day(identity["updated_at"]) or _ts_to_day( 1356 + identity["created_at"] 1357 + ) 1358 + conn.execute( 1359 + "INSERT INTO chunks(content, path, day, facet, agent, stream, idx) " 1360 + "VALUES (?, ?, ?, ?, ?, ?, ?)", 1361 + (content, path, day, "", "entity", "", 0), 1362 + ) 1363 + count += 1 1364 + 1365 + conn.commit() 1366 + logger.info("%s entity search chunks indexed", count) 1367 + return count 1368 + 1369 + 1238 1370 def consolidate_segment_entities(journal: str, full: bool = False) -> int: 1239 1371 """Consolidate per-segment entity detections into the journal entity store. 1240 1372 ··· 1483 1615 consolidate_segment_entities(journal, full=full) 1484 1616 entity_changed = scan_entities(journal, conn, verbose=verbose, full=full) 1485 1617 signal_changed = scan_signals(journal, conn, verbose=verbose, full=full) 1618 + 1619 + # Regenerate entity search chunks when entity data changes 1620 + if entity_changed: 1621 + _index_entity_search_chunks(conn) 1486 1622 1487 1623 conn.close() 1488 1624 return bool(to_index or removed or entity_changed or signal_changed)

Configure Feed

Configure Feed