speakers: record voiceprints_removed in attribution-correct audit

+66 -19

2 changed files

expand all

apps

speakers

routes.py

tests

test_routes.py

+21 -10

apps/speakers/routes.py

··· 51 51 from think.utils import ( 52 52 day_dirs, 53 53 day_path, 54 + get_journal, 54 55 iter_segments, 55 56 now_ms, 56 57 segment_parse, ··· 212 213 segment_key: str, 213 214 source: str, 214 215 sentence_id: int, 215 - ) -> bool: 216 + ) -> Path | None: 216 217 """Remove a specific voiceprint entry from an entity's voiceprints.npz. 217 218 218 219 Matches by (day, segment_key, source, sentence_id) metadata key. 219 - Returns True if an entry was removed, False if not found. 220 + Returns the unlinked NPZ path if the file was removed (all entries filtered 221 + out), or None if the entry was rewritten or not found. 220 222 """ 221 223 try: 222 224 folder = journal_entity_memory_path(entity_id) 223 225 except (RuntimeError, ValueError): 224 - return False 226 + return None 225 227 226 228 npz_path = folder / "voiceprints.npz" 227 229 if not npz_path.exists(): 228 - return False 230 + return None 229 231 230 232 try: 231 233 data = np.load(npz_path, allow_pickle=False) 232 234 embeddings = data.get("embeddings") 233 235 metadata_arr = data.get("metadata") 234 236 if embeddings is None or metadata_arr is None: 235 - return False 237 + return None 236 238 except Exception: 237 - return False 239 + return None 238 240 239 241 keep = [] 240 242 for i, m_str in enumerate(metadata_arr): ··· 252 254 keep.append(i) 253 255 254 256 if len(keep) == len(metadata_arr): 255 - return False 257 + return None 256 258 257 259 if not keep: 258 260 npz_path.unlink() 259 - return True 261 + return npz_path 260 262 261 263 new_embeddings = embeddings[keep] 262 264 new_metadata = metadata_arr[keep] 263 265 tmp_path = npz_path.with_name(npz_path.stem + ".tmp.npz") 264 266 np.savez_compressed(tmp_path, embeddings=new_embeddings, metadata=new_metadata) 265 267 tmp_path.rename(npz_path) 266 - return True 268 + return None 267 269 268 270 269 271 def _load_speaker_labels(segment_dir: Path) -> dict | None: ··· 937 939 return error_response("Embedding too similar to owner voice — cannot save", 400) 938 940 939 941 auto_accumulated_methods = {"acoustic", "context", "contextual"} 942 + removed_voiceprint_path = None 940 943 if old_speaker and old_method in auto_accumulated_methods: 941 - _remove_voiceprint(old_speaker, day, segment_key, source, sentence_id) 944 + removed_voiceprint_path = _remove_voiceprint( 945 + old_speaker, day, segment_key, source, sentence_id 946 + ) 947 + 948 + voiceprints_removed: list[str] = [] 949 + if removed_voiceprint_path is not None: 950 + journal_root = Path(get_journal()) 951 + voiceprints_removed = [str(removed_voiceprint_path.relative_to(journal_root))] 942 952 943 953 _save_voiceprint( 944 954 new_speaker, ··· 978 988 "sentence_id": sentence_id, 979 989 "old_speaker": old_speaker, 980 990 "new_speaker": new_speaker, 991 + "voiceprints_removed": voiceprints_removed, 981 992 }, 982 993 ) 983 994

+45 -9

apps/speakers/tests/test_routes.py

··· 3 3 4 4 """Tests for speakers app - sentence-based embeddings.""" 5 5 6 + import json 7 + from datetime import datetime 8 + 6 9 import numpy as np 7 10 from flask import Flask 11 + 12 + 13 + def _read_action_entries(journal_root): 14 + """Read journal-level action log entries for today.""" 15 + today = datetime.now().strftime("%Y%m%d") 16 + log_path = journal_root / "config" / "actions" / f"{today}.jsonl" 17 + if not log_path.exists(): 18 + return [] 19 + return [ 20 + json.loads(line) 21 + for line in log_path.read_text(encoding="utf-8").splitlines() 22 + if line.strip() 23 + ] 8 24 9 25 10 26 def test_normalize_embedding(): ··· 812 828 813 829 def test_api_correct_attribution(speakers_env): 814 830 """Correct changes speaker attribution and manages voiceprints.""" 815 - import json 816 - 817 - from flask import Flask 818 - 819 831 from apps.speakers.routes import speakers_bp 820 832 821 833 env = speakers_env() ··· 872 884 assert bob_vp.exists() 873 885 alice_vp = env.journal / "entities" / "alice_test" / "voiceprints.npz" 874 886 assert not alice_vp.exists() 887 + 888 + action_entries = _read_action_entries(env.journal) 889 + assert len(action_entries) == 1 890 + assert action_entries[0]["action"] == "attribution_correct" 891 + assert action_entries[0]["params"]["voiceprints_removed"] == [ 892 + "entities/alice_test/voiceprints.npz" 893 + ] 875 894 876 895 877 896 def test_api_correct_same_speaker(speakers_env): ··· 1067 1086 from apps.speakers.routes import _remove_voiceprint 1068 1087 1069 1088 removed = _remove_voiceprint("alice_test", "20240101", "143022_300", "mic_audio", 1) 1070 - assert removed is True 1089 + assert removed is None 1071 1090 1072 1091 vp_path = env.journal / "entities" / "alice_test" / "voiceprints.npz" 1073 1092 data = np.load(vp_path, allow_pickle=False) ··· 1075 1094 assert data["metadata"].shape[0] == 1 1076 1095 1077 1096 1097 + def test_remove_voiceprint_unlinks_file_when_last_entry_removed(speakers_env): 1098 + """_remove_voiceprint returns the NPZ path when the final entry is removed.""" 1099 + env = speakers_env() 1100 + env.create_entity( 1101 + "Alice Test", 1102 + voiceprints=[("20240101", "143022_300", "mic_audio", 1)], 1103 + ) 1104 + 1105 + from apps.speakers.routes import _remove_voiceprint 1106 + 1107 + removed = _remove_voiceprint("alice_test", "20240101", "143022_300", "mic_audio", 1) 1108 + 1109 + vp_path = env.journal / "entities" / "alice_test" / "voiceprints.npz" 1110 + assert removed == vp_path 1111 + assert not vp_path.exists() 1112 + 1113 + 1078 1114 def test_remove_voiceprint_not_found(speakers_env): 1079 - """_remove_voiceprint returns False when no matching entry.""" 1115 + """_remove_voiceprint returns None when no matching entry.""" 1080 1116 env = speakers_env() 1081 1117 env.create_entity( 1082 1118 "Alice Test", ··· 1088 1124 removed = _remove_voiceprint( 1089 1125 "alice_test", "20240101", "143022_300", "mic_audio", 999 1090 1126 ) 1091 - assert removed is False 1127 + assert removed is None 1092 1128 1093 1129 1094 1130 def test_remove_voiceprint_no_file(speakers_env): 1095 - """_remove_voiceprint returns False when entity has no voiceprints.""" 1131 + """_remove_voiceprint returns None when entity has no voiceprints.""" 1096 1132 env = speakers_env() 1097 1133 env.create_entity("Alice Test") 1098 1134 1099 1135 from apps.speakers.routes import _remove_voiceprint 1100 1136 1101 1137 removed = _remove_voiceprint("alice_test", "20240101", "143022_300", "mic_audio", 1) 1102 - assert removed is False 1138 + assert removed is None 1103 1139 1104 1140 1105 1141 def test_api_segments_pagination(speakers_env):

Configure Feed

Configure Feed