personal memory agent
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

talents/speaker_attribution: schema-constrain + drop wrapper tolerance

Apply founder decision #3 from the 2026-04-19 audit by schema-constraining speaker_attribution output to a top-level array, loading that schema from prompt frontmatter, and removing post_process tolerance for legacy wrapper shapes; this keeps the migration a clean break per CLAUDE.md §8, adds focused schema and post-hook coverage, and mirrors the new schema field in the stats API baseline.

+260 -6
+1
talent/speaker_attribution.md
··· 6 6 "schedule": "segment", 7 7 "priority": 40, 8 8 "output": "json", 9 + "schema": "speaker_attribution.schema.json", 9 10 "color": "#d84315", 10 11 "hook": {"pre": "speaker_attribution", "post": "speaker_attribution"}, 11 12 "load": {"transcripts": true, "talents": {"speakers": true, "screen": true}}
+3 -6
talent/speaker_attribution.py
··· 149 149 if result: 150 150 try: 151 151 parsed = json.loads(result) 152 - if isinstance(parsed, list): 153 - items = parsed 154 - elif isinstance(parsed, dict): 155 - items = parsed.get("attributions", parsed.get("labels", [])) 156 - else: 157 - items = [] 152 + if not isinstance(parsed, list): 153 + raise TypeError(f"expected JSON array, got {type(parsed).__name__}") 154 + items = parsed 158 155 159 156 journal_entities = load_all_journal_entities() 160 157 entities_list = [
+14
talent/speaker_attribution.schema.json
··· 1 + { 2 + "$schema": "https://json-schema.org/draft/2020-12/schema", 3 + "type": "array", 4 + "items": { 5 + "type": "object", 6 + "additionalProperties": false, 7 + "required": ["sentence_id", "speaker", "reasoning"], 8 + "properties": { 9 + "sentence_id": {"type": "integer"}, 10 + "speaker": {"type": "string", "minLength": 1}, 11 + "reasoning": {"type": "string", "minLength": 1} 12 + } 13 + } 14 + }
+1
tests/baselines/api/stats/stats.json
··· 281 281 "path": "<PROJECT>/talent/speaker_attribution.md", 282 282 "priority": 40, 283 283 "schedule": "segment", 284 + "schema": "speaker_attribution.schema.json", 284 285 "source": "system", 285 286 "title": "Speaker Attribution", 286 287 "type": "generate"
+148
tests/test_speaker_attribution_hook.py
··· 4 4 """Unit tests for talent.speaker_attribution pre_process stub-writing behavior.""" 5 5 6 6 import json 7 + import logging 7 8 from unittest.mock import patch 8 9 9 10 ··· 140 141 assert "Let me explain" in unmatched_context 141 142 assert "Sentence 1" in unmatched_context 142 143 assert "Sentence 2" in unmatched_context 144 + 145 + 146 + def _post_process_context(): 147 + attribution_result = { 148 + "labels": [ 149 + {"sentence_id": 1, "speaker": None, "confidence": None, "method": None}, 150 + { 151 + "sentence_id": 2, 152 + "speaker": "bob", 153 + "confidence": "high", 154 + "method": "owner", 155 + }, 156 + ], 157 + "metadata": {}, 158 + "source": None, 159 + } 160 + return { 161 + "day": "20260419", 162 + "segment": "000000", 163 + "stream": "default", 164 + "meta": {"attribution_result": attribution_result}, 165 + } 166 + 167 + 168 + def _match_entity(name, _entities): 169 + if name == "Alice": 170 + return {"id": "alice"} 171 + return None 172 + 173 + 174 + class TestPostProcess: 175 + def test_bare_list_merges_layer4_attributions(self, tmp_path): 176 + result = json.dumps( 177 + [{"sentence_id": 1, "speaker": "Alice", "reasoning": "said her name"}] 178 + ) 179 + context = _post_process_context() 180 + 181 + with ( 182 + patch("apps.speakers.attribution.save_speaker_labels") as save_mock, 183 + patch( 184 + "apps.speakers.attribution.accumulate_voiceprints" 185 + ) as accumulate_mock, 186 + patch( 187 + "think.entities.find_matching_entity", 188 + side_effect=_match_entity, 189 + ), 190 + patch( 191 + "think.entities.journal.load_all_journal_entities", 192 + return_value={"alice": {"id": "alice"}}, 193 + ), 194 + patch("think.utils.segment_path", return_value=tmp_path), 195 + ): 196 + from talent.speaker_attribution import post_process 197 + 198 + post_process(result, context) 199 + 200 + saved_labels = save_mock.call_args[0][1] 201 + assert saved_labels[0] == { 202 + "sentence_id": 1, 203 + "speaker": "alice", 204 + "confidence": "medium", 205 + "method": "contextual", 206 + } 207 + assert saved_labels[1] == { 208 + "sentence_id": 2, 209 + "speaker": "bob", 210 + "confidence": "high", 211 + "method": "owner", 212 + } 213 + accumulate_mock.assert_not_called() 214 + 215 + def test_wrapper_shape_yields_zero_merges(self, tmp_path): 216 + result = json.dumps( 217 + {"attributions": [{"sentence_id": 1, "speaker": "Alice", "reasoning": "x"}]} 218 + ) 219 + context = _post_process_context() 220 + 221 + with ( 222 + patch("apps.speakers.attribution.save_speaker_labels") as save_mock, 223 + patch("apps.speakers.attribution.accumulate_voiceprints"), 224 + patch( 225 + "think.entities.find_matching_entity", 226 + side_effect=_match_entity, 227 + ) as match_mock, 228 + patch( 229 + "think.entities.journal.load_all_journal_entities", 230 + return_value={"alice": {"id": "alice"}}, 231 + ) as load_mock, 232 + patch("think.utils.segment_path", return_value=tmp_path), 233 + ): 234 + from talent.speaker_attribution import post_process 235 + 236 + post_process(result, context) 237 + 238 + saved_labels = save_mock.call_args[0][1] 239 + assert saved_labels[0] == { 240 + "sentence_id": 1, 241 + "speaker": None, 242 + "confidence": None, 243 + "method": None, 244 + } 245 + assert saved_labels[1] == { 246 + "sentence_id": 2, 247 + "speaker": "bob", 248 + "confidence": "high", 249 + "method": "owner", 250 + } 251 + load_mock.assert_not_called() 252 + match_mock.assert_not_called() 253 + 254 + def test_non_list_non_dict_yields_zero_merges_and_warns(self, tmp_path, caplog): 255 + context = _post_process_context() 256 + 257 + with ( 258 + patch("apps.speakers.attribution.save_speaker_labels") as save_mock, 259 + patch("apps.speakers.attribution.accumulate_voiceprints"), 260 + patch( 261 + "think.entities.find_matching_entity", 262 + side_effect=_match_entity, 263 + ) as match_mock, 264 + patch( 265 + "think.entities.journal.load_all_journal_entities", 266 + return_value={"alice": {"id": "alice"}}, 267 + ) as load_mock, 268 + patch("think.utils.segment_path", return_value=tmp_path), 269 + caplog.at_level(logging.WARNING), 270 + ): 271 + from talent.speaker_attribution import post_process 272 + 273 + post_process("42", context) 274 + 275 + saved_labels = save_mock.call_args[0][1] 276 + assert saved_labels[0] == { 277 + "sentence_id": 1, 278 + "speaker": None, 279 + "confidence": None, 280 + "method": None, 281 + } 282 + assert saved_labels[1] == { 283 + "sentence_id": 2, 284 + "speaker": "bob", 285 + "confidence": "high", 286 + "method": "owner", 287 + } 288 + assert "expected JSON array, got int" in caplog.text 289 + load_mock.assert_not_called() 290 + match_mock.assert_not_called()
+93
tests/test_speaker_attribution_schema.py
··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + import json 5 + from pathlib import Path 6 + 7 + import pytest 8 + from jsonschema import Draft202012Validator 9 + 10 + from think.talent import get_talent 11 + 12 + SCHEMA_PATH = ( 13 + Path(__file__).parent.parent / "talent" / "speaker_attribution.schema.json" 14 + ) 15 + 16 + 17 + def _load_schema() -> dict: 18 + return json.loads(SCHEMA_PATH.read_text(encoding="utf-8")) 19 + 20 + 21 + def test_speaker_attribution_schema_file_is_valid_draft_2020_12(): 22 + Draft202012Validator.check_schema(_load_schema()) 23 + 24 + 25 + def test_speaker_attribution_talent_loads_schema(): 26 + assert get_talent("speaker_attribution")["json_schema"] == _load_schema() 27 + 28 + 29 + @pytest.mark.parametrize( 30 + "payload", 31 + [ 32 + [{"sentence_id": 1, "speaker": "Alice", "reasoning": "Introduced herself."}], 33 + [ 34 + {"sentence_id": 1, "speaker": "Alice", "reasoning": "Introduced herself."}, 35 + {"sentence_id": 2, "speaker": "Bob", "reasoning": "Replied to Alice."}, 36 + ], 37 + ], 38 + ) 39 + def test_positive_payload_validates(payload): 40 + validator = Draft202012Validator(_load_schema()) 41 + 42 + assert validator.is_valid(payload) 43 + 44 + 45 + def test_negative_wrapper_object_rejected(): 46 + validator = Draft202012Validator(_load_schema()) 47 + 48 + assert not validator.is_valid( 49 + { 50 + "attributions": [ 51 + { 52 + "sentence_id": 1, 53 + "speaker": "Alice", 54 + "reasoning": "Introduced herself.", 55 + } 56 + ] 57 + } 58 + ) 59 + 60 + 61 + def test_negative_missing_required_field_rejected(): 62 + validator = Draft202012Validator(_load_schema()) 63 + 64 + assert not validator.is_valid([{"sentence_id": 1, "speaker": "Alice"}]) 65 + 66 + 67 + def test_negative_empty_string_fields_rejected(): 68 + validator = Draft202012Validator(_load_schema()) 69 + 70 + assert not validator.is_valid([{"sentence_id": 1, "speaker": "", "reasoning": "x"}]) 71 + 72 + 73 + def test_negative_non_integer_sentence_id_rejected(): 74 + validator = Draft202012Validator(_load_schema()) 75 + 76 + assert not validator.is_valid( 77 + [{"sentence_id": "1", "speaker": "Alice", "reasoning": "x"}] 78 + ) 79 + 80 + 81 + def test_negative_additional_properties_rejected(): 82 + validator = Draft202012Validator(_load_schema()) 83 + 84 + assert not validator.is_valid( 85 + [ 86 + { 87 + "sentence_id": 1, 88 + "speaker": "Alice", 89 + "reasoning": "x", 90 + "confidence": "high", 91 + } 92 + ] 93 + )