personal memory agent
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

importers: manifest + audit log on --force rmtree

Build a per-file SHA-256 manifest before shutil.rmtree in
_setup_import's --force branch and append an
import_force_reimport entry via log_app_action. Thread
dry_run through so --force --dry-run logs the would-be-deleted
manifest without touching imports/.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

+253 -4
+33
tests/test_import_dedup.py
··· 3 3 4 4 """Tests for import deduplication — manifests, source-level dedup, entry merge.""" 5 5 6 + import hashlib 6 7 import json 7 8 import os 8 9 import tempfile 9 10 from pathlib import Path 10 11 11 12 from think.importers.shared import ( 13 + _build_import_manifest, 12 14 _entry_content_key, 13 15 _load_existing_entries, 14 16 find_manifest_by_hash, ··· 66 68 67 69 68 70 # --- manifest tests --- 71 + 72 + 73 + def test_build_import_manifest(): 74 + with tempfile.TemporaryDirectory() as tmpdir: 75 + import_dir = Path(tmpdir) 76 + alpha = import_dir / "alpha.txt" 77 + alpha_bytes = b"alpha payload" 78 + alpha.write_bytes(alpha_bytes) 79 + beta = import_dir / "nested" / "beta.bin" 80 + beta.parent.mkdir(parents=True) 81 + beta_bytes = b"\x00\x01beta" 82 + beta.write_bytes(beta_bytes) 83 + 84 + manifest = _build_import_manifest(import_dir) 85 + 86 + assert manifest["import_dir"] == str(import_dir) 87 + assert manifest["file_count"] == 2 88 + assert manifest["total_bytes"] == len(alpha_bytes) + len(beta_bytes) 89 + assert manifest["files"] == [ 90 + { 91 + "name": "alpha.txt", 92 + "bytes": len(alpha_bytes), 93 + "hash": hashlib.sha256(alpha_bytes).hexdigest(), 94 + }, 95 + { 96 + "name": "nested/beta.bin", 97 + "bytes": len(beta_bytes), 98 + "hash": hashlib.sha256(beta_bytes).hexdigest(), 99 + }, 100 + ] 101 + assert manifest["timestamp"].endswith("+00:00") 69 102 70 103 71 104 def test_write_and_find_manifest():
+159
tests/test_importer.py
··· 2 2 # Copyright (c) 2026 sol pbc 3 3 4 4 import datetime as dt 5 + import hashlib 5 6 import importlib 6 7 import json 7 8 import subprocess 8 9 import zipfile 9 10 from pathlib import Path 10 11 from unittest.mock import ANY, MagicMock, patch 12 + 13 + import pytest 11 14 12 15 from think.importers.file_importer import ImportPreview, ImportResult 13 16 from think.utils import day_path ··· 35 38 summary="Imported 42 events", 36 39 ) 37 40 return mock_imp 41 + 42 + 43 + def _configure_text_import_runtime(monkeypatch, mod): 44 + """Patch text import processing and callosum helpers for CLI tests.""" 45 + text_mod = importlib.import_module("think.importers.text") 46 + 47 + monkeypatch.setattr( 48 + text_mod, 49 + "detect_transcript_segment", 50 + lambda text, start_time: [("12:00:00", text)], 51 + ) 52 + monkeypatch.setattr( 53 + text_mod, 54 + "detect_transcript_json", 55 + lambda text, segment_start: { 56 + "entries": [ 57 + { 58 + "start": segment_start, 59 + "speaker": "Unknown", 60 + "text": text, 61 + } 62 + ], 63 + "topics": "", 64 + "setting": "", 65 + }, 66 + ) 67 + monkeypatch.setattr(mod, "CallosumConnection", lambda **kwargs: MagicMock()) 68 + monkeypatch.setattr(mod, "_status_emitter", lambda: None) 69 + 70 + 71 + def _read_action_entries(journal_root: Path) -> list[dict]: 72 + """Read journal-level app action log entries for today.""" 73 + today = dt.datetime.now().strftime("%Y%m%d") 74 + log_path = journal_root / "config" / "actions" / f"{today}.jsonl" 75 + if not log_path.exists(): 76 + return [] 77 + return [ 78 + json.loads(line) 79 + for line in log_path.read_text(encoding="utf-8").splitlines() 80 + if line.strip() 81 + ] 38 82 39 83 40 84 def test_slice_audio_segment(tmp_path): ··· 922 966 923 967 assert not (tmp_path / "imports").exists() 924 968 assert not (tmp_path / "chronicle" / "20240315").exists() 969 + 970 + 971 + def test_importer_force_reimport_logs_manifest_and_replaces_directory( 972 + tmp_path, monkeypatch 973 + ): 974 + """--force logs a manifest, removes the old import dir, and writes the new file.""" 975 + mod = importlib.import_module("think.importers.cli") 976 + 977 + timestamp = "20240101_120000" 978 + old_import_dir = tmp_path / "imports" / timestamp 979 + old_import_dir.mkdir(parents=True) 980 + stale_file = old_import_dir / "stale.txt" 981 + stale_bytes = b"old import payload" 982 + stale_file.write_bytes(stale_bytes) 983 + nested_file = old_import_dir / "nested" / "extra.bin" 984 + nested_file.parent.mkdir(parents=True) 985 + nested_bytes = b"\x00\x01nested" 986 + nested_file.write_bytes(nested_bytes) 987 + 988 + txt = tmp_path / "replacement.txt" 989 + txt.write_text("replacement transcript") 990 + 991 + monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path)) 992 + _configure_text_import_runtime(monkeypatch, mod) 993 + monkeypatch.setattr( 994 + "sys.argv", 995 + ["sol import", str(txt), "--timestamp", timestamp, "--force"], 996 + ) 997 + 998 + mod.main() 999 + 1000 + action_entries = _read_action_entries(tmp_path) 1001 + assert len(action_entries) == 1 1002 + entry = action_entries[0] 1003 + assert entry["action"] == "import_force_reimport" 1004 + assert entry["actor"] == "import" 1005 + assert entry["params"]["dry_run"] is False 1006 + assert entry["params"]["import_dir"] == str(old_import_dir) 1007 + assert entry["params"]["file_count"] == 2 1008 + assert entry["params"]["total_bytes"] == len(stale_bytes) + len(nested_bytes) 1009 + assert entry["params"]["files"] == [ 1010 + { 1011 + "name": "nested/extra.bin", 1012 + "bytes": len(nested_bytes), 1013 + "hash": hashlib.sha256(nested_bytes).hexdigest(), 1014 + }, 1015 + { 1016 + "name": "stale.txt", 1017 + "bytes": len(stale_bytes), 1018 + "hash": hashlib.sha256(stale_bytes).hexdigest(), 1019 + }, 1020 + ] 1021 + 1022 + assert not stale_file.exists() 1023 + assert not nested_file.exists() 1024 + new_imported_file = old_import_dir / "replacement.txt" 1025 + assert new_imported_file.exists() 1026 + assert new_imported_file.read_text() == "replacement transcript" 1027 + 1028 + 1029 + def test_importer_force_dry_run_logs_manifest_without_deleting(tmp_path, monkeypatch): 1030 + """--force --dry-run logs the manifest but leaves the old import dir untouched.""" 1031 + mod = importlib.import_module("think.importers.cli") 1032 + 1033 + timestamp = "20240101_120000" 1034 + old_import_dir = tmp_path / "imports" / timestamp 1035 + old_import_dir.mkdir(parents=True) 1036 + stale_file = old_import_dir / "stale.txt" 1037 + stale_file.write_text("old import payload") 1038 + 1039 + txt = tmp_path / "replacement.txt" 1040 + txt.write_text("replacement transcript") 1041 + 1042 + monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path)) 1043 + monkeypatch.setattr( 1044 + "sys.argv", 1045 + ["sol import", str(txt), "--timestamp", timestamp, "--force", "--dry-run"], 1046 + ) 1047 + 1048 + mod.main() 1049 + 1050 + action_entries = _read_action_entries(tmp_path) 1051 + assert len(action_entries) == 1 1052 + entry = action_entries[0] 1053 + assert entry["action"] == "import_force_reimport" 1054 + assert entry["params"]["dry_run"] is True 1055 + assert entry["params"]["import_dir"] == str(old_import_dir) 1056 + assert entry["params"]["file_count"] == 1 1057 + assert stale_file.exists() 1058 + assert stale_file.read_text() == "old import payload" 1059 + assert not (old_import_dir / "replacement.txt").exists() 1060 + 1061 + 1062 + def test_importer_existing_import_without_force_still_errors(tmp_path, monkeypatch): 1063 + """Existing imports still error without --force and do not log a reimport action.""" 1064 + mod = importlib.import_module("think.importers.cli") 1065 + 1066 + timestamp = "20240101_120000" 1067 + existing_import_dir = tmp_path / "imports" / timestamp 1068 + existing_import_dir.mkdir(parents=True) 1069 + (existing_import_dir / "stale.txt").write_text("old import payload") 1070 + 1071 + txt = tmp_path / "replacement.txt" 1072 + txt.write_text("replacement transcript") 1073 + 1074 + monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path)) 1075 + monkeypatch.setattr( 1076 + "sys.argv", 1077 + ["sol import", str(txt), "--timestamp", timestamp], 1078 + ) 1079 + 1080 + with pytest.raises(SystemExit, match="Import already exists"): 1081 + mod.main() 1082 + 1083 + assert _read_action_entries(tmp_path) == [] 925 1084 926 1085 927 1086 def test_file_importer_without_timestamp(tmp_path, monkeypatch, capsys):
+17 -3
think/importers/cli.py
··· 497 497 import_source = "audio" 498 498 499 499 stream = stream_name(import_source=import_source) 500 + needs_setup = _file_importer is None and not _is_in_imports(args.media) 501 + force_reimport_preview = ( 502 + args.force 503 + and needs_setup 504 + and (Path(get_journal()) / "imports" / args.timestamp).exists() 505 + ) 500 506 501 507 if args.dry_run and _file_importer is not None: 502 508 preview = _file_importer.preview(Path(args.media)) ··· 618 624 print() 619 625 print(" Duration: unknown (ffprobe failed)") 620 626 print() 627 + if force_reimport_preview: 628 + _setup_import( 629 + args.media, 630 + args.timestamp, 631 + args.facet, 632 + args.setting, 633 + detection_result, 634 + force=True, 635 + dry_run=True, 636 + ) 621 637 return 622 - 623 - # Check if file needs setup (not already in imports/) 624 - needs_setup = _file_importer is None and not _is_in_imports(args.media) 625 638 626 639 # Copy to imports/ if file is not already there 627 640 if needs_setup: ··· 632 645 args.setting, 633 646 detection_result, 634 647 force=args.force, 648 + dry_run=args.dry_run, 635 649 ) 636 650 print("Starting import...") 637 651
+44 -1
think/importers/shared.py
··· 298 298 return abs_media.startswith(abs_imports + os.sep) 299 299 300 300 301 + def _build_import_manifest(import_dir: Path) -> dict[str, Any]: 302 + """Return a hash manifest for files currently stored in an import directory.""" 303 + files = sorted(path for path in import_dir.rglob("*") if path.is_file()) 304 + entries: list[dict[str, Any]] = [] 305 + total_bytes = 0 306 + 307 + for path in files: 308 + size = path.stat().st_size 309 + digest = hashlib.sha256() 310 + with open(path, "rb") as handle: 311 + while chunk := handle.read(64 * 1024): 312 + digest.update(chunk) 313 + entries.append( 314 + { 315 + "name": path.relative_to(import_dir).as_posix(), 316 + "bytes": size, 317 + "hash": digest.hexdigest(), 318 + } 319 + ) 320 + total_bytes += size 321 + 322 + return { 323 + "timestamp": dt.datetime.now(dt.timezone.utc).isoformat(), 324 + "import_dir": str(import_dir), 325 + "total_bytes": total_bytes, 326 + "file_count": len(entries), 327 + "files": entries, 328 + } 329 + 330 + 301 331 def _setup_import( 302 332 media_path: str, 303 333 timestamp: str, ··· 305 335 setting: str | None, 306 336 detection_result: dict | None, 307 337 force: bool = False, 338 + dry_run: bool = False, 308 339 ) -> str: 309 340 """Copy file to imports/ and write metadata. Returns new file path.""" 310 341 journal_root = Path(get_journal()) 311 342 import_dir = journal_root / "imports" / timestamp 343 + filename = os.path.basename(media_path) 344 + dry_run_target_path = str(import_dir / filename) 312 345 313 346 # Check for conflict 314 347 if import_dir.exists(): 315 348 if force: 349 + from apps.utils import log_app_action 350 + 351 + manifest = _build_import_manifest(import_dir) 352 + log_app_action( 353 + app="import", 354 + facet=None, 355 + action="import_force_reimport", 356 + params={**manifest, "dry_run": dry_run}, 357 + ) 358 + if dry_run: 359 + return dry_run_target_path 316 360 logger.info(f"Removing existing import directory: {import_dir}") 317 361 shutil.rmtree(import_dir) 318 362 else: ··· 322 366 ) 323 367 324 368 # Copy file to imports/ 325 - filename = os.path.basename(media_path) 326 369 new_path = save_import_file( 327 370 journal_root=journal_root, 328 371 timestamp=timestamp,