personal memory agent
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'hopper-pn3qkesk-speakers-suggest'

# Conflicts:
# apps/speakers/suggest.py
# apps/speakers/tests/test_suggest.py

+538 -624
+23 -44
apps/speakers/call.py
··· 10 10 sol call speakers attribute-segment <day> <stream> <segment> [--json] 11 11 sol call speakers backfill [--dry-run] [--json] 12 12 sol call speakers discover [--json] 13 - sol call speakers suggest [--limit N] [--json] 14 13 sol call speakers identify <cluster-id> <name> [--entity-id ID] 15 14 sol call speakers merge-names <alias> <canonical> 15 + sol call speakers suggest [--limit N] [--json] 16 16 """ 17 17 18 18 from __future__ import annotations ··· 347 347 348 348 349 349 @app.command() 350 - def suggest( 351 - limit: int = typer.Option(5, "--limit", help="Maximum suggestions to return."), 352 - json_output: bool = typer.Option( 353 - False, "--json", help="Output full result as JSON." 354 - ), 355 - ) -> None: 356 - """Suggest speaker curation actions.""" 357 - import json as json_mod 358 - 359 - from apps.speakers.suggest import suggest_speakers 360 - 361 - suggestions = suggest_speakers(limit=limit) 362 - if json_output: 363 - typer.echo(json_mod.dumps(suggestions, indent=2, default=str)) 364 - return 365 - if not suggestions: 366 - typer.echo("No suggestions available.") 367 - raise typer.Exit() 368 - for s in suggestions: 369 - stype = s["type"] 370 - if stype == "unknown_recurring": 371 - typer.echo( 372 - f" [{stype}] Cluster {s['cluster_id']}: " 373 - f"{s['size']} samples across {s['segment_count']} segments" 374 - ) 375 - elif stype == "import_linkable": 376 - typer.echo( 377 - f" [{stype}] {s['name']}: " 378 - f"{s['meetings_count']} meetings, no voiceprint" 379 - ) 380 - elif stype == "name_variant": 381 - typer.echo( 382 - f" [{stype}] {s['names'][0]} / {s['names'][1]}: " 383 - f"similarity {s['similarity']:.4f}" 384 - ) 385 - elif stype == "low_confidence_review": 386 - typer.echo( 387 - f" [{stype}] {s['day']}: " 388 - f"{s['medium_count']} medium + {s['null_count']} null" 389 - ) 390 - 391 - 392 - @app.command() 393 350 def identify( 394 351 cluster_id: int = typer.Argument(..., help="Cluster ID from discovery output."), 395 352 name: str = typer.Argument(..., help="Speaker name to assign."), ··· 426 383 typer.echo(output, err=True) 427 384 raise typer.Exit(1) 428 385 typer.echo(output) 386 + 387 + 388 + @app.command() 389 + def suggest( 390 + limit: int = typer.Option( 391 + 5, "--limit", "-n", help="Maximum suggestions to return." 392 + ), 393 + json_output: bool = typer.Option(False, "--json", help="Output as JSON array."), 394 + ) -> None: 395 + """Suggest speaker curation opportunities.""" 396 + import json as json_mod 397 + 398 + from apps.speakers.suggest import suggest_opportunities 399 + 400 + results = suggest_opportunities(limit=limit) 401 + if json_output: 402 + typer.echo(json_mod.dumps(results, indent=2, default=str)) 403 + return 404 + 405 + from apps.speakers.suggest import format_suggestions 406 + 407 + typer.echo(format_suggestions(results))
+303 -307
apps/speakers/suggest.py
··· 1 1 # SPDX-License-Identifier: AGPL-3.0-only 2 2 # Copyright (c) 2026 sol pbc 3 3 4 - """Speaker curation suggestions - computed on the fly from existing data.""" 4 + """Speaker curation suggestion helpers.""" 5 5 6 6 from __future__ import annotations 7 7 8 8 import json 9 9 import logging 10 - from collections import defaultdict 10 + import re 11 11 from datetime import time 12 12 from pathlib import Path 13 13 from typing import Any 14 14 15 - from think.utils import day_dirs, get_journal, segment_parse 15 + from think.utils import day_dirs, get_journal, iter_segments, segment_parse 16 16 17 17 logger = logging.getLogger(__name__) 18 18 19 + _MEETING_LINE_RE = re.compile(r"^-\s+(\d{2}:\d{2})\s+(.*)") 20 + _PARTICIPANTS_RE = re.compile( 21 + r"\*\*Participants?\*\*\s*[:\u2013\u2014\-]\s*(.*)", 22 + re.IGNORECASE, 23 + ) 24 + _PAREN_RE = re.compile(r"\(([^)]+)\)") 25 + _WITH_RE = re.compile(r"\bwith\s+(.+?)(?:\s*\(|$)", re.IGNORECASE) 26 + _SKIP_PARTICIPANT_TERMS = ("presenting", "private", "unscheduled") 27 + 28 + 29 + def _bootstrap_helpers(): 30 + from apps.speakers.bootstrap import resolve_name_variants 31 + 32 + return resolve_name_variants 19 33 20 - def suggest_speakers(limit: int = 5) -> list[dict[str, Any]]: 21 - """Return prioritized speaker curation suggestions. 34 + 35 + def _discovery_helpers(): 36 + from apps.speakers.discovery import discover_unknown_speakers 37 + 38 + return discover_unknown_speakers 39 + 40 + 41 + def _split_participants(text: str) -> list[str]: 42 + parts = re.split(r",|\band\b", text, flags=re.IGNORECASE) 43 + return [part.strip().strip("*").strip() for part in parts if part.strip()] 44 + 22 45 23 - Priority order: unknown_recurring > import_linkable > name_variant > 24 - low_confidence_review. Returns at most ``limit`` suggestions total. 25 - """ 26 - suggestions: list[dict[str, Any]] = [] 27 - suggestions.extend(_suggest_unknown_recurring()) 28 - suggestions.extend(_suggest_import_linkable()) 29 - suggestions.extend(_suggest_name_variants()) 30 - suggestions.extend(_suggest_low_confidence_review()) 31 - return suggestions[:limit] 46 + def _name_matches_entity(participant: str, names: set[str]) -> bool: 47 + participant_lower = participant.strip().lower() 48 + if not participant_lower: 49 + return False 50 + if participant_lower in names: 51 + return True 52 + first_word = participant_lower.split()[0] 53 + return any(name.split()[0] == first_word for name in names if name) 32 54 33 55 34 - def _suggest_unknown_recurring() -> list[dict[str, Any]]: 35 - """Transform the discovery cache into actionable cluster suggestions.""" 36 - cache_path = Path(get_journal()) / "awareness" / "discovery_clusters.json" 37 - if not cache_path.exists(): 56 + def _parse_meetings(day_path: str) -> list[dict[str, Any]]: 57 + meetings_path = Path(day_path) / "agents" / "meetings.md" 58 + if not meetings_path.exists(): 38 59 return [] 39 60 40 61 try: 41 - cache_data = json.loads(cache_path.read_text(encoding="utf-8")) 42 - except (json.JSONDecodeError, OSError): 43 - logger.warning("Failed to read discovery cache", exc_info=True) 62 + content = meetings_path.read_text(encoding="utf-8") 63 + except OSError: 44 64 return [] 45 65 46 - clusters = cache_data.get("clusters") 47 - if not isinstance(clusters, dict): 48 - return [] 66 + meetings: list[dict[str, Any]] = [] 67 + for raw_line in content.splitlines(): 68 + line = raw_line.strip() 69 + if not line or line.startswith("#"): 70 + continue 49 71 50 - suggestions: list[dict[str, Any]] = [] 51 - for cluster_id_str, records in clusters.items(): 52 - if not isinstance(records, list): 72 + participants_match = _PARTICIPANTS_RE.search(line) 73 + if participants_match: 74 + participants = [ 75 + name 76 + for name in _split_participants(participants_match.group(1)) 77 + if len(name) >= 2 78 + ] 79 + meetings.append( 80 + { 81 + "time": None, 82 + "line": raw_line, 83 + "participants": participants, 84 + } 85 + ) 53 86 continue 54 - try: 55 - cluster_id = int(cluster_id_str) 56 - except (TypeError, ValueError): 87 + 88 + match = _MEETING_LINE_RE.match(line) 89 + if not match: 57 90 continue 58 91 59 - segment_keys = { 60 - (record.get("day"), record.get("segment_key")) 61 - for record in records 62 - if isinstance(record, dict) 63 - } 64 - segments = sorted( 65 - { 66 - f"{record['day']}/{record['stream']}/{record['segment_key']}" 67 - for record in records 68 - if isinstance(record, dict) 69 - and record.get("day") 70 - and record.get("stream") 71 - and record.get("segment_key") 72 - } 73 - ) 74 - suggestions.append( 92 + meeting_time, description = match.groups() 93 + participants: list[str] = [] 94 + 95 + for paren_match in _PAREN_RE.findall(description): 96 + for name in _split_participants(paren_match): 97 + if len(name) < 2: 98 + continue 99 + if any(term in name.lower() for term in _SKIP_PARTICIPANT_TERMS): 100 + continue 101 + participants.append(name) 102 + 103 + with_match = _WITH_RE.search(description) 104 + if with_match: 105 + participants.extend( 106 + name 107 + for name in _split_participants(with_match.group(1)) 108 + if len(name) >= 2 109 + ) 110 + 111 + meetings.append( 75 112 { 76 - "type": "unknown_recurring", 77 - "cluster_id": cluster_id, 78 - "size": len(records), 79 - "segment_count": len( 80 - { 81 - (day, segment_key) 82 - for day, segment_key in segment_keys 83 - if day and segment_key 84 - } 85 - ), 86 - "segments": segments, 87 - "samples": records[:3], 88 - "import_hints": { 89 - "calendar_overlap": _calendar_overlap_for_segments(segments) 90 - }, 113 + "time": meeting_time, 114 + "line": raw_line, 115 + "participants": list(dict.fromkeys(participants)), 91 116 } 92 117 ) 93 118 94 - suggestions.sort(key=lambda item: item["size"], reverse=True) 95 - return suggestions 119 + return meetings 96 120 97 121 98 - def _suggest_import_linkable() -> list[dict[str, Any]]: 99 - """Suggest meeting participants who appear in events but lack voiceprints.""" 100 - participant_data: dict[str, dict[str, Any]] = {} 101 - for events_path in Path(get_journal()).glob("facets/*/events/*.jsonl"): 102 - day = events_path.stem 122 + def _meetings_overlapping_segment( 123 + meetings: list[dict[str, Any]], segment_key: str 124 + ) -> list[str]: 125 + start_time, end_time = segment_parse(segment_key) 126 + if start_time is None or end_time is None: 127 + return [] 128 + 129 + overlaps: list[str] = [] 130 + for meeting in meetings: 131 + meeting_time = meeting.get("time") 132 + if not meeting_time: 133 + continue 103 134 try: 104 - with open(events_path, encoding="utf-8") as handle: 105 - for line in handle: 106 - try: 107 - event = json.loads(line) 108 - except json.JSONDecodeError: 109 - continue 110 - if ( 111 - event.get("type") != "meeting" 112 - or event.get("occurred") is not True 113 - ): 114 - continue 115 - participants = event.get("participants") 116 - if not isinstance(participants, list) or not participants: 117 - continue 118 - for name in participants: 119 - if not isinstance(name, str) or not name.strip(): 120 - continue 121 - entry = participant_data.setdefault( 122 - name, 123 - {"count": 0, "day_events": []}, 124 - ) 125 - entry["count"] += 1 126 - entry["day_events"].append((day, event)) 127 - except OSError: 128 - logger.warning("Failed reading event file %s", events_path, exc_info=True) 135 + hour, minute = meeting_time.split(":", 1) 136 + meeting_clock = time(hour=int(hour), minute=int(minute)) 137 + except (TypeError, ValueError): 129 138 continue 139 + if start_time <= meeting_clock <= end_time: 140 + overlaps.append(meeting.get("line", "")) 141 + return overlaps 130 142 131 - if not participant_data: 132 - return [] 133 143 134 - from think.entities.journal import load_journal_entity, scan_journal_entities 144 + def _unknown_recurring() -> list[dict[str, Any]]: 145 + discover_unknown_speakers = _discovery_helpers() 135 146 136 - name_to_entity: dict[str, dict[str, Any]] = {} 137 - for entity_id in scan_journal_entities(): 138 - entity = load_journal_entity(entity_id) 139 - if not entity: 140 - continue 141 - entity_names = [entity.get("name", "")] 142 - entity_names.extend(entity.get("aka", [])) 143 - for name in entity_names: 144 - if isinstance(name, str) and name.strip(): 145 - name_to_entity[name.lower()] = entity 147 + result = discover_unknown_speakers() 148 + clusters = result.get("clusters", []) 146 149 147 150 suggestions: list[dict[str, Any]] = [] 148 - journal = Path(get_journal()) 149 - for name, entry in participant_data.items(): 150 - matched_entity = name_to_entity.get(name.lower()) 151 - has_voiceprint = False 152 - if matched_entity is not None: 153 - voiceprint_path = ( 154 - journal / "entities" / matched_entity["id"] / "voiceprints.npz" 155 - ) 156 - has_voiceprint = voiceprint_path.exists() 157 - if has_voiceprint: 158 - continue 151 + meetings_cache: dict[str, list[dict[str, Any]]] = {} 152 + all_days = day_dirs() 159 153 160 - overlapping_segments: set[str] = set() 161 - segments_by_day: dict[str, list[tuple[str, str, time, time]]] = {} 162 - for day, event in entry["day_events"]: 163 - event_start = _parse_event_time(event.get("start")) 164 - event_end = _parse_event_time(event.get("end")) 165 - if event_start is None or event_end is None: 154 + for cluster in clusters: 155 + samples = cluster.get("samples", []) 156 + segments: list[str] = [] 157 + seen_segments: set[str] = set() 158 + overlap_lines: list[str] = [] 159 + seen_overlap_keys: set[tuple[str, str]] = set() 160 + 161 + for sample in samples: 162 + segment_ref = f"{sample['day']}/{sample['stream']}/{sample['segment_key']}" 163 + if segment_ref not in seen_segments: 164 + seen_segments.add(segment_ref) 165 + segments.append(segment_ref) 166 + 167 + day = sample["day"] 168 + segment_key = sample["segment_key"] 169 + overlap_key = (day, segment_key) 170 + if overlap_key in seen_overlap_keys: 166 171 continue 167 - day_segments = segments_by_day.setdefault(day, _iter_day_segments(day)) 168 - for stream, segment_key, seg_start, seg_end in day_segments: 169 - if _time_overlaps(seg_start, seg_end, event_start, event_end): 170 - overlapping_segments.add(f"{day}/{stream}/{segment_key}") 172 + seen_overlap_keys.add(overlap_key) 173 + 174 + if day not in meetings_cache: 175 + dp = all_days.get(day) 176 + meetings_cache[day] = _parse_meetings(dp) if dp else [] 177 + for line in _meetings_overlapping_segment(meetings_cache[day], segment_key): 178 + if line not in overlap_lines: 179 + overlap_lines.append(line) 171 180 172 181 suggestions.append( 173 182 { 174 - "type": "import_linkable", 175 - "name": name, 176 - "source": "meetings", 177 - "meetings_count": entry["count"], 178 - "has_voiceprint": False, 179 - "overlapping_segments": sorted(overlapping_segments), 183 + "type": "unknown_recurring", 184 + "cluster_id": cluster["cluster_id"], 185 + "size": cluster["size"], 186 + "segment_count": cluster["segment_count"], 187 + "segments": segments, 188 + "samples": samples, 189 + "import_hints": {"calendar_overlap": overlap_lines}, 180 190 } 181 191 ) 182 192 183 - suggestions.sort(key=lambda item: item["meetings_count"], reverse=True) 184 193 return suggestions 185 194 186 195 187 - def _suggest_name_variants() -> list[dict[str, Any]]: 188 - """Return high-similarity speaker name pairs with resolved entity IDs.""" 189 - from apps.speakers.bootstrap import resolve_name_variants 190 - from think.entities.journal import load_journal_entity, scan_journal_entities 196 + def _import_linkable() -> list[dict[str, Any]]: 197 + from think.entities.journal import load_all_journal_entities 191 198 192 - stats = resolve_name_variants(dry_run=True) 193 - matches = stats.get("matches_found", []) 194 - if not isinstance(matches, list): 195 - return [] 199 + entities = load_all_journal_entities() 200 + # Track both the count of meeting lines and which days, per participant name 201 + participant_info: dict[str, dict[str, Any]] = {} 202 + 203 + for day, dp in day_dirs().items(): 204 + for meeting in _parse_meetings(dp): 205 + for participant in meeting.get("participants", []): 206 + key = participant.lower() 207 + info = participant_info.setdefault(key, {"count": 0, "days": set()}) 208 + info["count"] += 1 209 + info["days"].add(day) 210 + 211 + suggestions: list[dict[str, Any]] = [] 212 + journal_path = Path(get_journal()) 196 213 197 - name_to_entity_id: dict[str, str] = {} 198 - for entity_id in scan_journal_entities(): 199 - entity = load_journal_entity(entity_id) 200 - if not entity: 214 + for entity_id, entity in entities.items(): 215 + if entity.get("is_principal") or entity.get("blocked"): 201 216 continue 202 - name = entity.get("name") 203 - if isinstance(name, str) and name.strip(): 204 - name_to_entity_id[name.lower()] = entity_id 205 217 206 - suggestions = [] 207 - for match in matches: 208 - if not isinstance(match, dict): 218 + if (journal_path / "entities" / entity_id / "voiceprints.npz").exists(): 209 219 continue 210 - name_a = match.get("name_a") 211 - name_b = match.get("name_b") 212 - if not isinstance(name_a, str) or not isinstance(name_b, str): 220 + 221 + names = { 222 + str(name).strip().lower() 223 + for name in [entity.get("name"), *(entity.get("aka", []))] 224 + if str(name).strip() 225 + } 226 + if not names: 227 + continue 228 + 229 + mention_count = 0 230 + matched_days: set[str] = set() 231 + for participant, info in participant_info.items(): 232 + if _name_matches_entity(participant, names): 233 + mention_count += info["count"] 234 + matched_days.update(info["days"]) 235 + 236 + if not matched_days: 213 237 continue 238 + 214 239 suggestions.append( 215 240 { 216 - "type": "name_variant", 217 - "names": [name_a, name_b], 218 - "entity_ids": [ 219 - name_to_entity_id.get(name_a.lower()), 220 - name_to_entity_id.get(name_b.lower()), 221 - ], 222 - "similarity": match.get("similarity"), 241 + "type": "import_linkable", 242 + "entity_id": entity_id, 243 + "name": entity["name"], 244 + "meetings_mentioned": mention_count, 245 + "meeting_days": sorted(matched_days), 223 246 } 224 247 ) 225 248 249 + suggestions.sort( 250 + key=lambda item: item["meetings_mentioned"], 251 + reverse=True, 252 + ) 226 253 return suggestions 227 254 228 255 229 - def _suggest_low_confidence_review() -> list[dict[str, Any]]: 230 - """Suggest days with a large number of medium or null speaker labels.""" 231 - journal = Path(get_journal()) 232 - by_day: dict[str, dict[str, Any]] = defaultdict( 233 - lambda: {"medium_count": 0, "null_count": 0, "segments": set()} 234 - ) 256 + def _name_variant() -> list[dict[str, Any]]: 257 + resolve_name_variants = _bootstrap_helpers() 258 + from think.entities.journal import load_all_journal_entities 235 259 236 - for day in day_dirs().keys(): 237 - day_dir = journal / day 238 - if not day_dir.is_dir(): 260 + result = resolve_name_variants(dry_run=True) 261 + entities = load_all_journal_entities() 262 + name_to_id = { 263 + entity.get("name", "").strip().lower(): entity_id 264 + for entity_id, entity in entities.items() 265 + if entity.get("name") 266 + } 267 + 268 + suggestions: list[dict[str, Any]] = [] 269 + for pair in result.get("matches_found", []): 270 + name_a = pair["name_a"] 271 + name_b = pair["name_b"] 272 + lower_a = name_a.lower() 273 + lower_b = name_b.lower() 274 + first_word_match = ( 275 + lower_a.split()[0] == lower_b or lower_b.split()[0] == lower_a 276 + ) 277 + substring_match = lower_a in lower_b or lower_b in lower_a 278 + if not (first_word_match or substring_match): 239 279 continue 240 - for stream_dir in sorted(day_dir.iterdir()): 241 - if not stream_dir.is_dir(): 242 - continue 243 - for seg_dir in sorted(stream_dir.iterdir()): 244 - if not seg_dir.is_dir(): 245 - continue 246 - labels_path = seg_dir / "agents" / "speaker_labels.json" 247 - if not labels_path.exists(): 248 - continue 249 - try: 250 - data = json.loads(labels_path.read_text(encoding="utf-8")) 251 - except (json.JSONDecodeError, OSError): 252 - continue 253 - needs_review = False 254 - for label in data.get("labels", []): 255 - if label.get("confidence") == "medium": 256 - by_day[day]["medium_count"] += 1 257 - needs_review = True 258 - if label.get("speaker") is None: 259 - by_day[day]["null_count"] += 1 260 - needs_review = True 261 - if needs_review: 262 - by_day[day]["segments"].add( 263 - f"{day}/{stream_dir.name}/{seg_dir.name}" 264 - ) 265 280 266 - suggestions = [] 267 - for day, info in by_day.items(): 268 - total = info["medium_count"] + info["null_count"] 269 - if total <= 10: 270 - continue 271 281 suggestions.append( 272 282 { 273 - "type": "low_confidence_review", 274 - "day": day, 275 - "medium_count": info["medium_count"], 276 - "null_count": info["null_count"], 277 - "segments_needing_review": sorted(info["segments"]), 283 + "type": "name_variant", 284 + "entity_a": {"id": name_to_id.get(lower_a), "name": name_a}, 285 + "entity_b": {"id": name_to_id.get(lower_b), "name": name_b}, 286 + "similarity": pair["similarity"], 278 287 } 279 288 ) 280 289 281 - suggestions.sort( 282 - key=lambda item: item["medium_count"] + item["null_count"], 283 - reverse=True, 284 - ) 290 + suggestions.sort(key=lambda item: item["similarity"], reverse=True) 285 291 return suggestions 286 292 287 293 288 - def _calendar_overlap_for_segments(segments: list[str]) -> list[dict[str, Any]]: 289 - """Find meeting events that overlap with the given segment strings. 290 - 291 - Args: 292 - segments: list of "day/stream/segment_key" strings 293 - """ 294 - segments_by_day: dict[str, list[tuple[str, str, time, time]]] = defaultdict(list) 295 - for segment in segments: 296 - parts = segment.split("/") 297 - if len(parts) != 3: 298 - continue 299 - day, stream, segment_key = parts 300 - seg_start, seg_end = segment_parse(segment_key) 301 - if seg_start is None or seg_end is None: 302 - continue 303 - segments_by_day[day].append((stream, segment_key, seg_start, seg_end)) 294 + def _low_confidence_review() -> list[dict[str, Any]]: 295 + day_totals: dict[str, dict[str, int]] = {} 304 296 305 - overlaps: list[dict[str, Any]] = [] 306 - for day, segment_entries in segments_by_day.items(): 307 - for event in _load_day_events(day): 308 - if event.get("type") != "meeting": 297 + for day in sorted(day_dirs().keys()): 298 + for _stream, _segment_key, seg_path in iter_segments(day): 299 + labels_path = seg_path / "agents" / "speaker_labels.json" 300 + if not labels_path.exists(): 309 301 continue 310 - participants = event.get("participants") 311 - if not isinstance(participants, list) or not participants: 302 + try: 303 + data = json.loads(labels_path.read_text(encoding="utf-8")) 304 + except (OSError, json.JSONDecodeError): 312 305 continue 313 - event_start = _parse_event_time(event.get("start")) 314 - event_end = _parse_event_time(event.get("end")) 315 - if event_start is None or event_end is None: 306 + 307 + labels = data.get("labels", []) 308 + if not isinstance(labels, list): 316 309 continue 317 310 318 - overlapping_segments = [] 319 - for stream, segment_key, seg_start, seg_end in segment_entries: 320 - segment_label = f"{day}/{stream}/{segment_key}" 321 - if _time_overlaps(seg_start, seg_end, event_start, event_end): 322 - overlapping_segments.append(segment_label) 323 - 324 - if overlapping_segments: 325 - overlaps.append( 326 - { 327 - "day": day, 328 - "title": event.get("title", ""), 329 - "facet": event.get("facet", ""), 330 - "participants": participants, 331 - "start": event.get("start"), 332 - "end": event.get("end"), 333 - "segments": sorted(set(overlapping_segments)), 334 - } 335 - ) 311 + counts = day_totals.setdefault(day, {"medium_or_null": 0, "total": 0}) 312 + for label in labels: 313 + if not isinstance(label, dict): 314 + continue 315 + counts["total"] += 1 316 + if label.get("confidence") != "high": 317 + counts["medium_or_null"] += 1 336 318 337 - return overlaps 338 - 339 - 340 - def _time_overlaps( 341 - seg_start: time, seg_end: time, event_start: time, event_end: time 342 - ) -> bool: 343 - """Return True if two time ranges overlap.""" 344 - return seg_start < event_end and event_start < seg_end 319 + suggestions = [ 320 + { 321 + "type": "low_confidence_review", 322 + "day": day, 323 + "medium_or_null_count": counts["medium_or_null"], 324 + "total_labels": counts["total"], 325 + } 326 + for day, counts in day_totals.items() 327 + if counts["medium_or_null"] > 10 328 + ] 329 + suggestions.sort(key=lambda item: item["medium_or_null_count"], reverse=True) 330 + return suggestions 345 331 346 332 347 - def _load_day_events(day: str) -> list[dict[str, Any]]: 348 - """Load all events for a day from facets/*/events/{day}.jsonl.""" 349 - events: list[dict[str, Any]] = [] 350 - for events_path in Path(get_journal()).glob(f"facets/*/events/{day}.jsonl"): 333 + def suggest_opportunities(limit: int = 5) -> list[dict[str, Any]]: 334 + suggestions: list[dict[str, Any]] = [] 335 + for generator in [ 336 + _unknown_recurring, 337 + _import_linkable, 338 + _name_variant, 339 + _low_confidence_review, 340 + ]: 341 + if len(suggestions) >= limit: 342 + break 351 343 try: 352 - with open(events_path, encoding="utf-8") as handle: 353 - for line in handle: 354 - try: 355 - event = json.loads(line) 356 - except json.JSONDecodeError: 357 - continue 358 - if not isinstance(event, dict): 359 - continue 360 - event.setdefault("facet", events_path.parent.parent.name) 361 - events.append(event) 362 - except OSError: 363 - logger.warning( 364 - "Failed reading day events from %s", events_path, exc_info=True 365 - ) 366 - continue 367 - return events 344 + suggestions.extend(generator()) 345 + except Exception: 346 + logger.exception("Suggestion generator %s failed", generator.__name__) 347 + return suggestions[:limit] 368 348 369 349 370 - def _iter_day_segments(day: str) -> list[tuple[str, str, time, time]]: 371 - """Return (stream, segment_key, start_time, end_time) for all segments on a day.""" 372 - day_dir = Path(get_journal()) / day 373 - if not day_dir.is_dir(): 374 - return [] 375 - 376 - segments = [] 377 - for stream_dir in sorted(day_dir.iterdir()): 378 - if not stream_dir.is_dir(): 379 - continue 380 - for segment_dir in sorted(stream_dir.iterdir()): 381 - if not segment_dir.is_dir(): 382 - continue 383 - start_time, end_time = segment_parse(segment_dir.name) 384 - if start_time is None or end_time is None: 385 - continue 386 - segments.append((stream_dir.name, segment_dir.name, start_time, end_time)) 387 - return segments 350 + def format_suggestions(suggestions: list[dict[str, Any]]) -> str: 351 + if not suggestions: 352 + return "No speaker curation suggestions found." 388 353 354 + lines: list[str] = [] 355 + for suggestion in suggestions: 356 + suggestion_type = suggestion.get("type") 357 + if suggestion_type == "unknown_recurring": 358 + lines.append( 359 + "Unknown recurring speaker " 360 + f"(cluster {suggestion['cluster_id']}): " 361 + f"{suggestion['size']} samples across " 362 + f"{suggestion['segment_count']} segments" 363 + ) 364 + segments = suggestion.get("segments", []) 365 + if segments: 366 + lines.append(f" Segments: {', '.join(segments)}") 367 + for meeting_line in suggestion.get("import_hints", {}).get( 368 + "calendar_overlap", [] 369 + ): 370 + lines.append(f" Calendar overlap: {meeting_line.strip()}") 371 + elif suggestion_type == "import_linkable": 372 + lines.append( 373 + "Import linkable: " 374 + f"{suggestion['name']} ({suggestion['entity_id']}) " 375 + f"\u2014 mentioned in {suggestion['meetings_mentioned']} meetings" 376 + ) 377 + lines.append(f" Days: {', '.join(suggestion['meeting_days'])}") 378 + elif suggestion_type == "name_variant": 379 + lines.append( 380 + "Name variant: " 381 + f'"{suggestion["entity_a"]["name"]}" ' 382 + f'\u2194 "{suggestion["entity_b"]["name"]}" ' 383 + f"(similarity: {suggestion['similarity']:.2f})" 384 + ) 385 + elif suggestion_type == "low_confidence_review": 386 + lines.append( 387 + "Low confidence review: " 388 + f"{suggestion['day']} \u2014 " 389 + f"{suggestion['medium_or_null_count']} of " 390 + f"{suggestion['total_labels']} labels are medium/unresolved" 391 + ) 389 392 390 - def _parse_event_time(value: Any) -> time | None: 391 - """Parse an event time string if valid.""" 392 - if not isinstance(value, str) or not value: 393 - return None 394 - try: 395 - return time.fromisoformat(value) 396 - except ValueError: 397 - return None 393 + return "\n".join(lines)
+212 -273
apps/speakers/tests/test_suggest.py
··· 1 1 # SPDX-License-Identifier: AGPL-3.0-only 2 2 # Copyright (c) 2026 sol pbc 3 3 4 - """Tests for speaker suggestion generation.""" 5 - 6 4 from __future__ import annotations 7 5 8 6 import json 9 7 from pathlib import Path 10 8 11 9 import numpy as np 10 + from typer.testing import CliRunner 12 11 12 + from apps.speakers.call import app 13 + from apps.speakers.suggest import ( 14 + _parse_meetings, 15 + format_suggestions, 16 + suggest_opportunities, 17 + ) 13 18 14 - def _write_discovery_cache(journal: Path, clusters: dict[str, list[dict]]) -> None: 15 - cache_dir = journal / "awareness" 16 - cache_dir.mkdir(parents=True, exist_ok=True) 17 - (cache_dir / "discovery_clusters.json").write_text( 18 - json.dumps({"version": "2026-03-19T00:00:00", "clusters": clusters}), 19 - encoding="utf-8", 20 - ) 21 19 20 + def create_meetings_md(env, day: str, content: str) -> Path: 21 + meetings_path = env.journal / day / "agents" / "meetings.md" 22 + meetings_path.parent.mkdir(parents=True, exist_ok=True) 23 + meetings_path.write_text(content, encoding="utf-8") 24 + return meetings_path 22 25 23 - def _write_events(journal: Path, facet: str, day: str, events: list[dict]) -> None: 24 - events_dir = journal / "facets" / facet / "events" 25 - events_dir.mkdir(parents=True, exist_ok=True) 26 - lines = [json.dumps(event) for event in events] 27 - (events_dir / f"{day}.jsonl").write_text("\n".join(lines) + "\n", encoding="utf-8") 28 26 29 - 30 - def _write_voiceprints( 31 - journal: Path, entity_id: str, vectors: list[np.ndarray] 32 - ) -> None: 33 - entity_dir = journal / "entities" / entity_id 34 - entity_dir.mkdir(parents=True, exist_ok=True) 27 + def _write_voiceprints(entity_dir: Path, embeddings: list[np.ndarray]) -> None: 35 28 metadata = np.array( 36 29 [ 37 30 json.dumps( 38 31 { 39 32 "day": "20240101", 40 - "segment_key": f"0900{idx:02d}_300", 41 - "source": "audio", 42 - "sentence_id": 1, 33 + "segment_key": f"10000{i}_300", 34 + "source": "mic_audio", 35 + "sentence_id": i + 1, 43 36 "added_at": 1700000000000, 44 37 } 45 38 ) 46 - for idx, _ in enumerate(vectors, start=1) 39 + for i in range(len(embeddings)) 47 40 ], 48 41 dtype=str, 49 42 ) 50 43 np.savez_compressed( 51 44 entity_dir / "voiceprints.npz", 52 - embeddings=np.array(vectors, dtype=np.float32), 45 + embeddings=np.array(embeddings, dtype=np.float32), 53 46 metadata=metadata, 54 47 ) 55 48 56 49 57 - def test_suggest_empty(speakers_env): 58 - """No suggestions when journal has no relevant data.""" 50 + def test_suggest_empty_journal(speakers_env): 59 51 speakers_env() 60 52 61 - from apps.speakers.suggest import suggest_speakers 62 - 63 - result = suggest_speakers() 64 - 65 - assert result == [] 53 + assert suggest_opportunities() == [] 66 54 67 55 68 - def test_suggest_unknown_recurring_from_cache(speakers_env): 56 + def test_suggest_low_confidence_review(speakers_env): 69 57 env = speakers_env() 70 - _write_discovery_cache( 71 - env.journal, 72 - { 73 - "7": [ 58 + for idx in range(4): 59 + segment_key = f"1000{idx:02d}_300" 60 + env.create_segment("20240101", segment_key, ["mic_audio"]) 61 + env.create_speaker_labels( 62 + "20240101", 63 + segment_key, 64 + [ 74 65 { 75 - "day": "20240101", 76 - "stream": "test", 77 - "segment_key": "090000_1800", 78 - "source": "audio", 79 66 "sentence_id": 1, 67 + "speaker": "alice_test", 68 + "confidence": "medium", 69 + "method": "voiceprint", 80 70 }, 81 71 { 82 - "day": "20240101", 83 - "stream": "test", 84 - "segment_key": "090000_1800", 85 - "source": "audio", 86 72 "sentence_id": 2, 73 + "speaker": None, 74 + "confidence": None, 75 + "method": None, 87 76 }, 88 77 { 89 - "day": "20240101", 90 - "stream": "test", 91 - "segment_key": "100000_1800", 92 - "source": "audio", 93 78 "sentence_id": 3, 79 + "speaker": "alice_test", 80 + "confidence": "medium", 81 + "method": "voiceprint", 94 82 }, 95 - ] 96 - }, 83 + ], 84 + ) 85 + 86 + results = suggest_opportunities() 87 + 88 + suggestion = next( 89 + item for item in results if item["type"] == "low_confidence_review" 97 90 ) 91 + assert suggestion["day"] == "20240101" 92 + assert suggestion["medium_or_null_count"] == 12 93 + assert suggestion["total_labels"] == 12 98 94 99 - from apps.speakers.suggest import suggest_speakers 95 + 96 + def test_suggest_low_confidence_below_threshold(speakers_env): 97 + env = speakers_env() 98 + for idx in range(2): 99 + segment_key = f"1100{idx:02d}_300" 100 + env.create_segment("20240101", segment_key, ["mic_audio"]) 101 + env.create_speaker_labels( 102 + "20240101", 103 + segment_key, 104 + [ 105 + { 106 + "sentence_id": 1, 107 + "speaker": "alice_test", 108 + "confidence": "medium", 109 + "method": "voiceprint", 110 + }, 111 + { 112 + "sentence_id": 2, 113 + "speaker": None, 114 + "confidence": None, 115 + "method": None, 116 + }, 117 + ], 118 + ) 100 119 101 - result = suggest_speakers(limit=10) 120 + results = suggest_opportunities() 102 121 103 - assert result[0]["type"] == "unknown_recurring" 104 - assert result[0]["cluster_id"] == 7 105 - assert result[0]["size"] == 3 106 - assert result[0]["segment_count"] == 2 107 - assert result[0]["segments"] == [ 108 - "20240101/test/090000_1800", 109 - "20240101/test/100000_1800", 110 - ] 111 - assert "calendar_overlap" in result[0]["import_hints"] 122 + assert all(item["type"] != "low_confidence_review" for item in results) 112 123 113 124 114 - def test_suggest_unknown_recurring_calendar_overlap(speakers_env): 125 + def test_suggest_name_variant(speakers_env): 115 126 env = speakers_env() 116 - env.create_segment("20240101", "090000_1800", ["audio"]) 117 - _write_discovery_cache( 118 - env.journal, 119 - { 120 - "2": [ 121 - { 122 - "day": "20240101", 123 - "stream": "test", 124 - "segment_key": "090000_1800", 125 - "source": "audio", 126 - "sentence_id": 1, 127 - } 128 - ] 129 - }, 130 - ) 131 - _write_events( 132 - env.journal, 133 - "testfacet", 134 - "20240101", 135 - [ 136 - { 137 - "type": "meeting", 138 - "start": "09:15:00", 139 - "end": "09:45:00", 140 - "title": "Design Sync", 141 - "facet": "testfacet", 142 - "participants": ["Alice", "Bob"], 143 - "occurred": True, 144 - } 145 - ], 146 - ) 127 + alice_dir = env.create_entity("Alice") 128 + alice_test_dir = env.create_entity("Alice Test") 147 129 148 - from apps.speakers.suggest import suggest_speakers 130 + base = env.create_embedding([1.0, 0.0, 0.0]) 131 + similar = env.create_embedding([1.0, 0.01, 0.0]) 132 + _write_voiceprints(alice_dir, [base, similar]) 133 + _write_voiceprints(alice_test_dir, [similar, base]) 149 134 150 - result = suggest_speakers(limit=10) 151 - overlap = result[0]["import_hints"]["calendar_overlap"][0] 135 + results = suggest_opportunities() 152 136 153 - assert overlap["title"] == "Design Sync" 154 - assert overlap["facet"] == "testfacet" 155 - assert overlap["participants"] == ["Alice", "Bob"] 156 - assert overlap["segments"] == ["20240101/test/090000_1800"] 137 + suggestion = next(item for item in results if item["type"] == "name_variant") 138 + assert suggestion["entity_a"]["id"] in {"alice", "alice_test"} 139 + assert suggestion["entity_b"]["id"] in {"alice", "alice_test"} 140 + assert suggestion["entity_a"]["id"] != suggestion["entity_b"]["id"] 141 + assert suggestion["similarity"] > 0.90 157 142 158 143 159 144 def test_suggest_import_linkable(speakers_env): 160 145 env = speakers_env() 161 - env.create_segment("20240101", "090000_1800", ["audio"]) 162 - env.create_segment("20240101", "120000_1800", ["audio"]) 163 - env.create_entity("Has Voiceprint") 164 - env.create_entity("Needs Import") 165 - _write_voiceprints( 166 - env.journal, 167 - "has_voiceprint", 168 - [env.create_embedding([1.0, 0.0])], 169 - ) 170 - _write_events( 171 - env.journal, 172 - "work", 146 + env.create_entity("Romeo Montague") 147 + create_meetings_md( 148 + env, 173 149 "20240101", 174 - [ 175 - { 176 - "type": "meeting", 177 - "start": "09:15:00", 178 - "end": "09:45:00", 179 - "title": "Planning", 180 - "facet": "work", 181 - "participants": ["Has Voiceprint", "Needs Import"], 182 - "occurred": True, 183 - }, 184 - { 185 - "type": "meeting", 186 - "start": "12:15:00", 187 - "end": "12:45:00", 188 - "title": "Followup", 189 - "facet": "work", 190 - "participants": ["Needs Import"], 191 - "occurred": True, 192 - }, 193 - ], 150 + "# Meetings\n\n- 10:00 Strategy Call with Romeo and Juliet\n", 194 151 ) 195 152 196 - from apps.speakers.suggest import suggest_speakers 197 - 198 - result = suggest_speakers(limit=10) 199 - import_linkable = [s for s in result if s["type"] == "import_linkable"] 153 + results = suggest_opportunities() 200 154 201 - assert len(import_linkable) == 1 202 - assert import_linkable[0]["name"] == "Needs Import" 203 - assert import_linkable[0]["meetings_count"] == 2 204 - assert import_linkable[0]["has_voiceprint"] is False 205 - assert import_linkable[0]["overlapping_segments"] == [ 206 - "20240101/test/090000_1800", 207 - "20240101/test/120000_1800", 208 - ] 155 + suggestion = next(item for item in results if item["type"] == "import_linkable") 156 + assert suggestion["entity_id"] == "romeo_montague" 157 + assert suggestion["name"] == "Romeo Montague" 158 + assert suggestion["meetings_mentioned"] == 1 159 + assert suggestion["meeting_days"] == ["20240101"] 209 160 210 161 211 - def test_suggest_name_variant(speakers_env): 162 + def test_suggest_import_linkable_with_voiceprint_excluded(speakers_env): 212 163 env = speakers_env() 213 - env.create_entity("Owner Person", is_principal=True) 214 - env.create_entity("Alice", voiceprints=[("20240101", "090000_300", "audio", 1)]) 215 - env.create_entity( 216 - "Alice Johnson", 217 - voiceprints=[("20240101", "090000_300", "audio", 1)], 164 + entity_dir = env.create_entity("Romeo Montague") 165 + _write_voiceprints(entity_dir, [env.create_embedding([1.0, 0.0, 0.0])]) 166 + create_meetings_md( 167 + env, 168 + "20240101", 169 + "# Meetings\n\n- 10:00 Strategy Call with Romeo and Juliet\n", 218 170 ) 219 - shared = env.create_embedding([1.0, 0.0, 0.0]) 220 - _write_voiceprints(env.journal, "alice", [shared, shared]) 221 - _write_voiceprints(env.journal, "alice_johnson", [shared, shared]) 222 171 223 - from apps.speakers.suggest import suggest_speakers 172 + results = suggest_opportunities() 224 173 225 - result = suggest_speakers(limit=10) 226 - variants = [s for s in result if s["type"] == "name_variant"] 174 + assert all( 175 + not ( 176 + item["type"] == "import_linkable" and item["entity_id"] == "romeo_montague" 177 + ) 178 + for item in results 179 + ) 227 180 228 - assert len(variants) == 1 229 - assert variants[0]["names"] == ["Alice", "Alice Johnson"] 230 - assert variants[0]["entity_ids"] == ["alice", "alice_johnson"] 231 - assert variants[0]["similarity"] >= 0.9 232 181 233 - 234 - def test_suggest_low_confidence_review(speakers_env): 182 + def test_suggest_limit(speakers_env): 235 183 env = speakers_env() 236 - env.create_segment("20240102", "090000_1800", ["audio"], num_sentences=12) 237 - labels = [] 238 - for idx in range(1, 7): 239 - labels.append( 240 - { 241 - "sentence_id": idx, 242 - "speaker": f"speaker_{idx}", 243 - "confidence": "medium", 244 - "method": "acoustic", 245 - } 184 + env.create_entity("Romeo Montague") 185 + alice_dir = env.create_entity("Alice") 186 + alice_test_dir = env.create_entity("Alice Test") 187 + _write_voiceprints(alice_dir, [env.create_embedding([1.0, 0.0, 0.0])]) 188 + _write_voiceprints(alice_test_dir, [env.create_embedding([1.0, 0.01, 0.0])]) 189 + create_meetings_md( 190 + env, 191 + "20240101", 192 + "# Meetings\n\n- 10:00 Strategy Call with Romeo and Juliet\n", 193 + ) 194 + for idx in range(4): 195 + segment_key = f"1200{idx:02d}_300" 196 + env.create_segment("20240101", segment_key, ["mic_audio"]) 197 + env.create_speaker_labels( 198 + "20240101", 199 + segment_key, 200 + [ 201 + { 202 + "sentence_id": sid, 203 + "speaker": None, 204 + "confidence": None, 205 + "method": None, 206 + } 207 + for sid in range(1, 4) 208 + ], 246 209 ) 247 - for idx in range(7, 13): 248 - labels.append( 249 - { 250 - "sentence_id": idx, 251 - "speaker": None, 252 - "confidence": None, 253 - "method": None, 254 - } 255 - ) 256 - env.create_speaker_labels("20240102", "090000_1800", labels) 257 210 258 - from apps.speakers.suggest import suggest_speakers 211 + results = suggest_opportunities(limit=1) 259 212 260 - result = suggest_speakers(limit=10) 261 - review = [s for s in result if s["type"] == "low_confidence_review"] 213 + assert len(results) == 1 262 214 263 - assert len(review) == 1 264 - assert review[0]["day"] == "20240102" 265 - assert review[0]["medium_count"] == 6 266 - assert review[0]["null_count"] == 6 267 - assert review[0]["segments_needing_review"] == ["20240102/test/090000_1800"] 268 215 269 - 270 - def test_suggest_limit_truncation(speakers_env): 216 + def test_suggest_priority_order(speakers_env): 271 217 env = speakers_env() 272 - _write_discovery_cache( 273 - env.journal, 274 - { 275 - "1": [ 218 + env.create_entity("Romeo Montague") 219 + alice_dir = env.create_entity("Alice") 220 + alice_test_dir = env.create_entity("Alice Test") 221 + _write_voiceprints(alice_dir, [env.create_embedding([1.0, 0.0, 0.0])]) 222 + _write_voiceprints(alice_test_dir, [env.create_embedding([1.0, 0.01, 0.0])]) 223 + create_meetings_md( 224 + env, 225 + "20240101", 226 + "# Meetings\n\n- 10:00 Strategy Call with Romeo and Juliet\n", 227 + ) 228 + for idx in range(4): 229 + segment_key = f"1300{idx:02d}_300" 230 + env.create_segment("20240101", segment_key, ["mic_audio"]) 231 + env.create_speaker_labels( 232 + "20240101", 233 + segment_key, 234 + [ 276 235 { 277 - "day": "20240101", 278 - "stream": "test", 279 - "segment_key": "090000_1800", 280 - "source": "audio", 281 - "sentence_id": 1, 236 + "sentence_id": sid, 237 + "speaker": None, 238 + "confidence": None, 239 + "method": None, 282 240 } 283 - ] 284 - }, 241 + for sid in range(1, 4) 242 + ], 243 + ) 244 + 245 + results = suggest_opportunities(limit=3) 246 + 247 + assert [item["type"] for item in results] == [ 248 + "import_linkable", 249 + "name_variant", 250 + "low_confidence_review", 251 + ] 252 + 253 + 254 + def test_parse_meetings_parenthesized(speakers_env): 255 + env = speakers_env() 256 + meetings_path = create_meetings_md( 257 + env, 258 + "20240101", 259 + "# Meetings\n\n- 08:30 Pre-Board Meeting Prep (Romeo, Juliet, Benvolio)\n", 285 260 ) 286 - env.create_segment("20240102", "090000_1800", ["audio"], num_sentences=12) 287 - labels = [ 261 + 262 + meetings = _parse_meetings(str(meetings_path.parent.parent)) 263 + 264 + assert meetings == [ 288 265 { 289 - "sentence_id": idx, 290 - "speaker": None, 291 - "confidence": None, 292 - "method": None, 266 + "time": "08:30", 267 + "line": "- 08:30 Pre-Board Meeting Prep (Romeo, Juliet, Benvolio)", 268 + "participants": ["Romeo", "Juliet", "Benvolio"], 293 269 } 294 - for idx in range(1, 13) 295 270 ] 296 - env.create_speaker_labels("20240102", "090000_1800", labels) 297 271 298 - from apps.speakers.suggest import suggest_speakers 299 272 300 - result = suggest_speakers(limit=1) 301 - 302 - assert len(result) == 1 303 - assert result[0]["type"] == "unknown_recurring" 304 - 305 - 306 - def test_suggest_priority_ordering(speakers_env): 273 + def test_parse_meetings_with_keyword(speakers_env): 307 274 env = speakers_env() 308 - env.create_segment("20240101", "090000_1800", ["audio"]) 309 - _write_discovery_cache( 310 - env.journal, 311 - { 312 - "4": [ 313 - { 314 - "day": "20240101", 315 - "stream": "test", 316 - "segment_key": "090000_1800", 317 - "source": "audio", 318 - "sentence_id": 1, 319 - } 320 - ] 321 - }, 322 - ) 323 - env.create_entity("Import Target") 324 - _write_events( 325 - env.journal, 326 - "work", 275 + meetings_path = create_meetings_md( 276 + env, 327 277 "20240101", 328 - [ 329 - { 330 - "type": "meeting", 331 - "start": "09:15:00", 332 - "end": "09:45:00", 333 - "title": "Planning", 334 - "facet": "work", 335 - "participants": ["Import Target"], 336 - "occurred": True, 337 - } 338 - ], 339 - ) 340 - env.create_entity("Owner Person", is_principal=True) 341 - env.create_entity("Bob", voiceprints=[("20240101", "090000_300", "audio", 1)]) 342 - env.create_entity( 343 - "Bob Smith", 344 - voiceprints=[("20240101", "090000_300", "audio", 1)], 278 + "# Meetings\n\n- 10:00 Strategy Call with Professor Lawrence, Romeo, and Juliet\n", 345 279 ) 346 - shared = env.create_embedding([0.0, 1.0, 0.0]) 347 - _write_voiceprints(env.journal, "bob", [shared, shared]) 348 - _write_voiceprints(env.journal, "bob_smith", [shared, shared]) 349 - env.create_segment("20240103", "090000_1800", ["audio"], num_sentences=12) 350 - review_labels = [ 280 + 281 + meetings = _parse_meetings(str(meetings_path.parent.parent)) 282 + 283 + assert meetings == [ 351 284 { 352 - "sentence_id": idx, 353 - "speaker": None, 354 - "confidence": None, 355 - "method": None, 285 + "time": "10:00", 286 + "line": "- 10:00 Strategy Call with Professor Lawrence, Romeo, and Juliet", 287 + "participants": ["Professor Lawrence", "Romeo", "Juliet"], 356 288 } 357 - for idx in range(1, 13) 358 289 ] 359 - env.create_speaker_labels("20240103", "090000_1800", review_labels) 360 290 361 - from apps.speakers.suggest import suggest_speakers 362 291 363 - result = suggest_speakers(limit=50) 364 - types = [item["type"] for item in result] 292 + def test_parse_meetings_missing_file(tmp_path): 293 + assert _parse_meetings(str(tmp_path)) == [] 365 294 366 - assert types.index("unknown_recurring") < types.index("import_linkable") 367 - assert types.index("import_linkable") < types.index("name_variant") 368 - assert types.index("name_variant") < types.index("low_confidence_review") 295 + 296 + def test_format_suggestions_empty(): 297 + assert format_suggestions([]) == "No speaker curation suggestions found." 298 + 299 + 300 + def test_suggest_cli_json(speakers_env): 301 + speakers_env() 302 + runner = CliRunner() 303 + 304 + result = runner.invoke(app, ["suggest", "--json"]) 305 + 306 + assert result.exit_code == 0 307 + assert json.loads(result.stdout) == []