Add provenance fields to retention purge log · solpbc.org/solstone@e6b0e08

+126 -1

2 changed files

expand all

tests

test_retention.py

think

retention.py

+87

tests/test_retention.py

class TestPurgeProvenance:
    """Check the provenance fields that ``purge`` adds to ``result.details``.

    Each detail entry is expected to carry a ``processed_at`` value, and each
    entry in its ``files`` list a ``hash`` (SHA-256 hex digest of the file).
    """

    # Lowercase hex alphabet; hashlib's hexdigest() never emits uppercase.
    _HEX_DIGITS = frozenset("0123456789abcdef")

    def _setup_journal(self, tmp_path, monkeypatch):
        """Reuse the journal fixture builder defined on ``TestPurge``."""
        return TestPurge()._setup_journal(tmp_path, monkeypatch)

    @staticmethod
    def _default_detail(result):
        """Return the detail entry for stream ``default``, segment ``100000_300``."""
        return next(
            detail
            for detail in result.details
            if detail["stream"] == "default" and detail["segment"] == "100000_300"
        )

    def _assert_hash_fields(self, result):
        """Assert every reported file has a SHA-256 hex hash and one known value.

        The known value is the digest of 1000 ``x`` bytes — presumably the
        content the shared fixture writes for the first raw file of the
        default segment (verify against ``TestPurge._setup_journal``).
        """
        expected_hash = hashlib.sha256(b"x" * 1000).hexdigest()

        # Guard against the loops below passing vacuously.
        assert result.details

        for detail in result.details:
            for file_info in detail["files"]:
                file_hash = file_info["hash"]
                assert len(file_hash) == 64
                assert set(file_hash) <= self._HEX_DIGITS

        assert self._default_detail(result)["files"][0]["hash"] == expected_hash

    def test_hash_field_in_dry_run(self, tmp_path, monkeypatch):
        self._setup_journal(tmp_path, monkeypatch)

        result = purge(older_than_days=30, dry_run=True)

        self._assert_hash_fields(result)

    def test_hash_field_in_actual_purge(self, tmp_path, monkeypatch):
        self._setup_journal(tmp_path, monkeypatch)

        result = purge(older_than_days=30, dry_run=False)

        self._assert_hash_fields(result)

    def test_processed_at_field(self, tmp_path, monkeypatch):
        self._setup_journal(tmp_path, monkeypatch)

        result = purge(older_than_days=30, dry_run=True)

        # Without this, an empty details list would make the test pass
        # without checking anything.
        assert result.details

        for detail in result.details:
            assert "processed_at" in detail
            assert isinstance(detail["processed_at"], str)
            # Raises ValueError if the value is not ISO 8601.
            datetime.fromisoformat(detail["processed_at"])

    def test_processed_at_reflects_latest_mtime(self, tmp_path, monkeypatch):
        journal = self._setup_journal(tmp_path, monkeypatch)
        segment = journal / "20260115" / "default" / "100000_300"
        audio_jsonl = segment / "audio.jsonl"
        alternate_audio_jsonl = segment / "meeting_audio.jsonl"
        speaker_labels = segment / "agents" / "speaker_labels.json"

        alternate_audio_jsonl.write_text('{"raw":"audio.flac"}\n')
        speaker_labels.write_text("{}")

        older_ts = datetime(2026, 1, 15, 10, 0, 0).timestamp()
        middle_ts = datetime(2026, 1, 15, 11, 0, 0).timestamp()
        latest_ts = datetime(2026, 1, 15, 12, 0, 0).timestamp()

        # Give the newest mtime to the glob-matched file so the assertion
        # fails if only the fixed-name files are considered.
        os.utime(audio_jsonl, (older_ts, older_ts))
        os.utime(speaker_labels, (middle_ts, middle_ts))
        os.utime(alternate_audio_jsonl, (latest_ts, latest_ts))

        result = purge(older_than_days=30, dry_run=True)

        assert (
            self._default_detail(result)["processed_at"]
            == datetime.fromtimestamp(latest_ts).isoformat()
        )

+39 -1

think/retention.py

def _get_completion_files(segment_path: Path) -> list[Path]:
    """Return existing completion-indicating files for a segment.

    Candidates, in the order they are returned:

    1. ``audio.jsonl`` and ``screen.jsonl`` directly inside the segment,
    2. anything matching ``*_audio.jsonl`` then ``*_screen.jsonl``
       (each group sorted by path),
    3. ``agents/speaker_labels.json``.

    Only regular files are returned; a directory that happens to carry one
    of these names is ignored, as is any candidate that does not exist.

    Args:
        segment_path: Directory of the segment to inspect.

    Returns:
        The candidate paths that exist as files, possibly empty.
    """
    candidates: list[Path] = [
        segment_path / "audio.jsonl",
        segment_path / "screen.jsonl",
    ]

    for pattern in ("*_audio.jsonl", "*_screen.jsonl"):
        # glob() yields entries in filesystem order; sort so the result is
        # reproducible across platforms and runs.
        candidates.extend(sorted(segment_path.glob(pattern)))

    candidates.append(segment_path / "agents" / "speaker_labels.json")

    # One uniform is_file() check, so fixed names and glob matches are
    # filtered by the same rule.
    return [path for path in candidates if path.is_file()]
"segment": seg_key, 328 365 "files": segment_files, 329 366 "bytes_freed": segment_bytes, 367 + "processed_at": processed_at, 330 368 } 331 369 ) 332 370

Configure Feed

Configure Feed