speakers: wipe command also clears legacy agents/ label paths

`sol call speakers wipe` globbed only chronicle/*/*/*/talents/ for
speaker_labels.json + speaker_corrections.json. In practice, every
historical segment stores those files under agents/; talents/ is only the
destination for new writes from apps/speakers/attribution.py. As a result,
a real dry-run on jer's journal reported 0 labels and 0 corrections when
the actual counts are 3360 labels and 96 corrections.

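Cover both paths so the wipe catches historical and new-write artifacts.
The test fixture now creates both layouts and asserts a count of 2 for
each of the speaker_labels and speaker_corrections categories (one file
under agents/ plus one under talents/).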

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Jer Miller 2 months ago 2dd6bf2c 31649dc3

+34 -18

2 changed files

expand all

apps

speakers

tests

test_wipe.py

wipe.py

+14 -10

apps/speakers/tests/test_wipe.py

··· 27 27 28 28 targets = [ 29 29 journal / "chronicle/20240101/test/120000_300/mic_audio.npz", 30 - journal / "chronicle/20240101/test/120000_300/talents/speaker_labels.json", 31 - journal / "chronicle/20240101/test/120000_300/talents/speaker_corrections.json", 30 + # labels + corrections live under both agents/ (historical) and talents/ 31 + # (current writes); the wipe must catch both. 32 + journal / "chronicle/20240101/test/120000_300/agents/speaker_labels.json", 33 + journal / "chronicle/20240101/test/120000_300/agents/speaker_corrections.json", 34 + journal / "chronicle/20240102/test/120000_300/talents/speaker_labels.json", 35 + journal / "chronicle/20240102/test/120000_300/talents/speaker_corrections.json", 32 36 journal / "entities/alice/voiceprints.npz", 33 37 journal / "entities/alice/owner_centroid.npz", 34 38 journal / "entities/bob/owner_centroid.npz", ··· 49 53 report = wipe_speaker_artifacts(dry_run=True) 50 54 51 55 assert report.segment_embeddings.count == 1 52 - assert report.speaker_labels.count == 1 53 - assert report.speaker_corrections.count == 1 56 + assert report.speaker_labels.count == 2 # agents/ + talents/ 57 + assert report.speaker_corrections.count == 2 # agents/ + talents/ 54 58 assert report.entity_voiceprints.count == 1 55 59 assert report.owner_centroids.count == 2 56 60 assert report.owner_candidate.count == 1 57 - assert report.total_files == 7 58 - assert report.total_bytes == 7 * len(FILE_BYTES) 61 + assert report.total_files == 9 62 + assert report.total_bytes == 9 * len(FILE_BYTES) 59 63 60 64 for path in targets: 61 65 assert path.exists() ··· 67 71 report = wipe_speaker_artifacts(dry_run=False) 68 72 69 73 assert report.segment_embeddings.count == 1 70 - assert report.speaker_labels.count == 1 71 - assert report.speaker_corrections.count == 1 74 + assert report.speaker_labels.count == 2 # agents/ + talents/ 75 + assert report.speaker_corrections.count == 2 # agents/ + talents/ 72 76 assert report.entity_voiceprints.count == 
1 73 77 assert report.owner_centroids.count == 2 74 78 assert report.owner_candidate.count == 1 75 - assert report.total_files == 7 76 - assert report.total_bytes == 7 * len(FILE_BYTES) 79 + assert report.total_files == 9 80 + assert report.total_bytes == 9 * len(FILE_BYTES) 77 81 78 82 for path in targets: 79 83 assert not path.exists()

+20 -8

apps/speakers/wipe.py

··· 38 38 journal = Path(get_journal()) 39 39 report = WipeReport() 40 40 41 + # Labels and corrections currently live under agents/ for most historical 42 + # segments; the attribution module now writes new files under talents/. 43 + # Cover both paths so no resemblyzer-space artifacts leak through the wipe. 44 + label_patterns = [ 45 + "chronicle/*/*/*/agents/speaker_labels.json", 46 + "chronicle/*/*/*/talents/speaker_labels.json", 47 + ] 48 + correction_patterns = [ 49 + "chronicle/*/*/*/agents/speaker_corrections.json", 50 + "chronicle/*/*/*/talents/speaker_corrections.json", 51 + ] 52 + 53 + def _expand(patterns: list[str]) -> list[Path]: 54 + paths: list[Path] = [] 55 + for pattern in patterns: 56 + paths.extend(journal.glob(pattern)) 57 + return sorted(paths) 58 + 41 59 categories: list[tuple[WipeCategory, list[Path]]] = [ 42 60 (report.segment_embeddings, sorted(journal.glob("chronicle/*/*/*/*.npz"))), 43 - ( 44 - report.speaker_labels, 45 - sorted(journal.glob("chronicle/*/*/*/talents/speaker_labels.json")), 46 - ), 47 - ( 48 - report.speaker_corrections, 49 - sorted(journal.glob("chronicle/*/*/*/talents/speaker_corrections.json")), 50 - ), 61 + (report.speaker_labels, _expand(label_patterns)), 62 + (report.speaker_corrections, _expand(correction_patterns)), 51 63 (report.entity_voiceprints, sorted(journal.glob("entities/*/voiceprints.npz"))), 52 64 (report.owner_centroids, sorted(journal.glob("entities/*/owner_centroid.npz"))), 53 65 (report.owner_candidate, [journal / "awareness" / "owner_candidate.npz"]),

Configure Feed

Configure Feed