Merge branch 'hopper-vl4tovh4-attribution-skip-stub'

+27 -3

muse/speaker_attribution.py

··· 36 36 return {"skip_reason": "no_segment_context"} 37 37 38 38 result = attribute_segment(day, stream, segment) 39 + seg_dir = segment_path(day, segment, stream) 39 40 40 41 if result.get("error"): 41 42 logger.info("Attribution skipped: %s", result["error"]) 42 - return {"skip_reason": result["error"]} 43 + reason = result["error"] 44 + if any(seg_dir.glob("*.npz")): 45 + agents_dir = seg_dir / "agents" 46 + agents_dir.mkdir(parents=True, exist_ok=True) 47 + out_path = agents_dir / "speaker_labels.json" 48 + with open(out_path, "w", encoding="utf-8") as fh: 49 + json.dump( 50 + {"labels": [], "skipped": True, "reason": reason}, 51 + fh, 52 + indent=2, 53 + ) 54 + logger.info("Wrote attribution stub: %s (%s)", out_path, reason) 55 + return {"skip_reason": reason} 43 56 44 57 labels = result.get("labels", []) 45 58 if not labels: 46 - return {"skip_reason": "no_embeddings"} 59 + reason = "no_embeddings" 60 + if any(seg_dir.glob("*.npz")): 61 + agents_dir = seg_dir / "agents" 62 + agents_dir.mkdir(parents=True, exist_ok=True) 63 + out_path = agents_dir / "speaker_labels.json" 64 + with open(out_path, "w", encoding="utf-8") as fh: 65 + json.dump( 66 + {"labels": [], "skipped": True, "reason": reason}, 67 + fh, 68 + indent=2, 69 + ) 70 + logger.info("Wrote attribution stub: %s (%s)", out_path, reason) 71 + return {"skip_reason": reason} 47 72 48 73 unmatched = result.get("unmatched", []) 49 74 metadata = result.get("metadata", {}) ··· 55 80 56 81 if not unmatched: 57 82 # All sentences resolved — write output and skip the LLM 58 - seg_dir = segment_path(day, segment, stream) 59 83 save_speaker_labels(seg_dir, labels, metadata) 60 84 61 85 # Voiceprint accumulation

+10

tests/test_retention.py

··· 4 4 """Tests for think.retention — media retention service.""" 5 5 6 6 import hashlib 7 + import json 7 8 import os 8 9 from datetime import datetime 9 10 ··· 159 160 def test_incomplete_missing_speaker_labels(self, tmp_path): 160 161 seg = _make_segment(tmp_path, audio=True, embeddings=True, speaker_labels=False) 161 162 assert not is_segment_complete(seg) 163 + 164 + def test_complete_with_stub_speaker_labels(self, tmp_path): 165 + """Stub speaker_labels.json (skipped=True, labels=[]) unblocks retention.""" 166 + seg = _make_segment(tmp_path, audio=True, embeddings=True, speaker_labels=False) 167 + stub = seg / "agents" / "speaker_labels.json" 168 + stub.write_text( 169 + json.dumps({"labels": [], "skipped": True, "reason": "no_owner_centroid"}) 170 + ) 171 + assert is_segment_complete(seg) 162 172 163 173 def test_incomplete_active_agents(self, tmp_path): 164 174 seg = _make_segment(tmp_path, audio=True, active_agents=True)

+90

tests/test_speaker_attribution_hook.py

# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Unit tests for muse.speaker_attribution pre_process stub-writing behavior."""

import json
from unittest.mock import patch

# Segment context passed to pre_process by every test that expects it to run.
CONTEXT = {"day": "20260318", "segment": "100000_300", "stream": "default"}


def _run_pre_process(context, seg_dir, attribute_result):
    """Call ``pre_process`` with attribution and segment-path lookup mocked.

    ``attribute_segment`` is patched to return *attribute_result* and
    ``segment_path`` is patched to return *seg_dir*; the value returned by
    ``pre_process(context)`` is handed back to the caller.
    """
    attribution_patch = patch(
        "apps.speakers.attribution.attribute_segment",
        return_value=attribute_result,
    )
    path_patch = patch(
        "think.utils.segment_path",
        return_value=seg_dir,
    )
    with attribution_patch, path_patch:
        # The import stays inside the patched scope so it runs while both
        # mocks are installed.
        from muse.speaker_attribution import pre_process

        return pre_process(context)


def _stub_file(seg_dir):
    """Return the path where the stub speaker_labels.json is expected."""
    return seg_dir / "agents" / "speaker_labels.json"


def _add_npz(seg_dir):
    """Drop a placeholder .npz file into *seg_dir*."""
    (seg_dir / "audio.npz").write_bytes(b"x")


class TestPreProcessStub:
    """Checks when pre_process writes a stub speaker_labels.json on skip."""

    def test_error_with_npz_writes_stub(self, tmp_path):
        """An attribution error with a .npz in the segment produces a stub."""
        _add_npz(tmp_path)
        outcome = _run_pre_process(
            CONTEXT,
            tmp_path,
            {"error": "no_owner_centroid"},
        )
        stub_file = _stub_file(tmp_path)
        assert stub_file.exists()
        payload = json.loads(stub_file.read_text())
        assert payload == {
            "labels": [],
            "skipped": True,
            "reason": "no_owner_centroid",
        }
        assert outcome == {"skip_reason": "no_owner_centroid"}

    def test_error_without_npz_no_stub(self, tmp_path):
        """An attribution error without any .npz leaves no stub behind."""
        outcome = _run_pre_process(
            CONTEXT,
            tmp_path,
            {"error": "no_owner_centroid"},
        )
        assert not _stub_file(tmp_path).exists()
        assert outcome == {"skip_reason": "no_owner_centroid"}

    def test_empty_labels_with_npz_writes_stub(self, tmp_path):
        """Empty labels with a .npz in the segment produce a stub."""
        _add_npz(tmp_path)
        outcome = _run_pre_process(
            CONTEXT,
            tmp_path,
            {"labels": []},
        )
        stub_file = _stub_file(tmp_path)
        assert stub_file.exists()
        payload = json.loads(stub_file.read_text())
        assert payload == {
            "labels": [],
            "skipped": True,
            "reason": "no_embeddings",
        }
        assert outcome == {"skip_reason": "no_embeddings"}

    def test_empty_labels_without_npz_no_stub(self, tmp_path):
        """Empty labels without any .npz leave no stub behind."""
        outcome = _run_pre_process(
            CONTEXT,
            tmp_path,
            {"labels": []},
        )
        assert not _stub_file(tmp_path).exists()
        assert outcome == {"skip_reason": "no_embeddings"}

    def test_no_segment_context_no_stub(self, tmp_path):
        """A context lacking day/segment is skipped without writing a stub."""
        _add_npz(tmp_path)
        with patch("think.utils.segment_path", return_value=tmp_path):
            from muse.speaker_attribution import pre_process

            outcome = pre_process({"stream": "default"})
        assert not _stub_file(tmp_path).exists()
        assert outcome == {"skip_reason": "no_segment_context"}

Configure Feed

Configure Feed