speakers: add `sol call speakers wipe` to clear legacy encoder artifacts

+61

apps/speakers/call.py

··· 13 13 sol call speakers resolve-names [--commit] [--json] 14 14 sol call speakers attribute-segment <day> <stream> <segment> [--commit] [--json] 15 15 sol call speakers backfill [--commit] [--json] 16 + sol call speakers wipe [--commit] [--json] 16 17 sol call speakers discover [--json] 17 18 sol call speakers identify <cluster-id> <name> [--entity-id ID] 18 19 sol call speakers merge-names <alias> <canonical> ··· 337 338 typer.echo(f" {err}", err=True) 338 339 if len(stats["errors"]) > 10: 339 340 typer.echo(f" ... and {len(stats['errors']) - 10} more", err=True) 341 + 342 + 343 + @app.command() 344 + def wipe( 345 + commit: bool = typer.Option( 346 + False, 347 + "--commit", 348 + help="Actually delete files. Without this flag the command only reports what would happen.", 349 + ), 350 + json_output: bool = typer.Option( 351 + False, "--json", help="Output full result as JSON." 352 + ), 353 + ) -> None: 354 + """Remove all legacy speaker artifacts from the journal (DESTRUCTIVE). 355 + 356 + DESTRUCTIVE. Without --commit, prints a report of what would be 357 + removed. With --commit, permanently deletes segment-embedding NPZs, 358 + speaker labels/corrections, per-entity voiceprints, owner centroids, 359 + and the owner-candidate snapshot. 360 + """ 361 + import json as json_mod 362 + 363 + from apps.speakers.wipe import wipe_speaker_artifacts 364 + 365 + if not commit and not json_output: 366 + typer.echo("REPORT ONLY — pass --commit to persist.\n") 367 + 368 + report = wipe_speaker_artifacts(dry_run=not commit) 369 + 370 + if json_output: 371 + typer.echo(json_mod.dumps(report.to_dict(), indent=2, default=str)) 372 + return 373 + 374 + typer.echo( 375 + f"segment_embeddings : {report.segment_embeddings.count} files " 376 + f"({report.segment_embeddings.bytes} B)" 377 + ) 378 + typer.echo( 379 + f"speaker_labels : {report.speaker_labels.count} files " 380 + f"({report.speaker_labels.bytes} B)" 381 + ) 382 + typer.echo( 383 + f"speaker_corrections: {report.speaker_corrections.count} files " 384 + f"({report.speaker_corrections.bytes} B)" 385 + ) 386 + typer.echo( 387 + f"entity_voiceprints : {report.entity_voiceprints.count} files " 388 + f"({report.entity_voiceprints.bytes} B)" 389 + ) 390 + typer.echo( 391 + f"owner_centroids : {report.owner_centroids.count} files " 392 + f"({report.owner_centroids.bytes} B)" 393 + ) 394 + typer.echo( 395 + f"owner_candidate : {report.owner_candidate.count} files " 396 + f"({report.owner_candidate.bytes} B)" 397 + ) 398 + typer.echo( 399 + f"total : {report.total_files} files ({report.total_bytes} B)" 400 + ) 340 401 341 402 342 403 @app.command()

+79

apps/speakers/tests/test_wipe.py

··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + """Tests for speaker artifact wipe reporting.""" 5 + 6 + from __future__ import annotations 7 + 8 + from pathlib import Path 9 + 10 + import pytest 11 + 12 + from apps.speakers.wipe import wipe_speaker_artifacts 13 + 14 + FILE_BYTES = b"test-data" 15 + 16 + 17 + @pytest.fixture 18 + def wipe_journal( 19 + tmp_path: Path, monkeypatch: pytest.MonkeyPatch 20 + ) -> tuple[Path, list[Path]]: 21 + journal = tmp_path / "journal" 22 + monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(journal)) 23 + 24 + import think.utils 25 + 26 + think.utils._journal_path_cache = None 27 + 28 + targets = [ 29 + journal / "chronicle/20240101/test/120000_300/mic_audio.npz", 30 + journal / "chronicle/20240101/test/120000_300/talents/speaker_labels.json", 31 + journal / "chronicle/20240101/test/120000_300/talents/speaker_corrections.json", 32 + journal / "entities/alice/voiceprints.npz", 33 + journal / "entities/alice/owner_centroid.npz", 34 + journal / "entities/bob/owner_centroid.npz", 35 + journal / "awareness/owner_candidate.npz", 36 + ] 37 + 38 + for path in targets: 39 + path.parent.mkdir(parents=True, exist_ok=True) 40 + path.write_bytes(FILE_BYTES) 41 + 42 + yield journal, targets 43 + think.utils._journal_path_cache = None 44 + 45 + 46 + def test_wipe_dry_run(wipe_journal: tuple[Path, list[Path]]) -> None: 47 + _, targets = wipe_journal 48 + 49 + report = wipe_speaker_artifacts(dry_run=True) 50 + 51 + assert report.segment_embeddings.count == 1 52 + assert report.speaker_labels.count == 1 53 + assert report.speaker_corrections.count == 1 54 + assert report.entity_voiceprints.count == 1 55 + assert report.owner_centroids.count == 2 56 + assert report.owner_candidate.count == 1 57 + assert report.total_files == 7 58 + assert report.total_bytes == 7 * len(FILE_BYTES) 59 + 60 + for path in targets: 61 + assert path.exists() 62 + 63 + 64 + def test_wipe_commit(wipe_journal: tuple[Path, list[Path]]) -> None: 65 + _, targets = wipe_journal 66 + 67 + report = wipe_speaker_artifacts(dry_run=False) 68 + 69 + assert report.segment_embeddings.count == 1 70 + assert report.speaker_labels.count == 1 71 + assert report.speaker_corrections.count == 1 72 + assert report.entity_voiceprints.count == 1 73 + assert report.owner_centroids.count == 2 74 + assert report.owner_candidate.count == 1 75 + assert report.total_files == 7 76 + assert report.total_bytes == 7 * len(FILE_BYTES) 77 + 78 + for path in targets: 79 + assert not path.exists()

+76

apps/speakers/wipe.py

··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + """Wipe legacy speaker artifacts from the journal.""" 5 + 6 + from __future__ import annotations 7 + 8 + from dataclasses import asdict, dataclass, field 9 + from pathlib import Path 10 + 11 + from think.utils import get_journal 12 + 13 + 14 + @dataclass 15 + class WipeCategory: 16 + count: int = 0 17 + bytes: int = 0 18 + paths: list[str] = field(default_factory=list) 19 + 20 + 21 + @dataclass 22 + class WipeReport: 23 + segment_embeddings: WipeCategory = field(default_factory=WipeCategory) 24 + speaker_labels: WipeCategory = field(default_factory=WipeCategory) 25 + speaker_corrections: WipeCategory = field(default_factory=WipeCategory) 26 + entity_voiceprints: WipeCategory = field(default_factory=WipeCategory) 27 + owner_centroids: WipeCategory = field(default_factory=WipeCategory) 28 + owner_candidate: WipeCategory = field(default_factory=WipeCategory) 29 + total_files: int = 0 30 + total_bytes: int = 0 31 + 32 + def to_dict(self) -> dict: 33 + return asdict(self) 34 + 35 + 36 + def wipe_speaker_artifacts(dry_run: bool = True) -> WipeReport: 37 + """Remove legacy speaker artifacts from the journal.""" 38 + journal = Path(get_journal()) 39 + report = WipeReport() 40 + 41 + categories: list[tuple[WipeCategory, list[Path]]] = [ 42 + (report.segment_embeddings, sorted(journal.glob("chronicle/*/*/*/*.npz"))), 43 + ( 44 + report.speaker_labels, 45 + sorted(journal.glob("chronicle/*/*/*/talents/speaker_labels.json")), 46 + ), 47 + ( 48 + report.speaker_corrections, 49 + sorted(journal.glob("chronicle/*/*/*/talents/speaker_corrections.json")), 50 + ), 51 + (report.entity_voiceprints, sorted(journal.glob("entities/*/voiceprints.npz"))), 52 + (report.owner_centroids, sorted(journal.glob("entities/*/owner_centroid.npz"))), 53 + (report.owner_candidate, [journal / "awareness" / "owner_candidate.npz"]), 54 + ] 55 + 56 + for category, paths in categories: 57 + for path in paths: 58 + if not path.is_file(): 59 + continue 60 + category.count += 1 61 + category.bytes += path.stat().st_size 62 + category.paths.append(path.relative_to(journal).as_posix()) 63 + if not dry_run: 64 + path.unlink() 65 + 66 + tracked_categories = [ 67 + report.segment_embeddings, 68 + report.speaker_labels, 69 + report.speaker_corrections, 70 + report.entity_voiceprints, 71 + report.owner_centroids, 72 + report.owner_candidate, 73 + ] 74 + report.total_files = sum(category.count for category in tracked_categories) 75 + report.total_bytes = sum(category.bytes for category in tracked_categories) 76 + return report

Configure Feed

Configure Feed