Merge branch 'hopper-6hwyzh6f-transcripts-resilience'

+26 -19

apps/transcripts/routes.py

··· 6 6 from __future__ import annotations 7 7 8 8 import json 9 + import logging 9 10 import os 10 11 import re 11 12 import shutil ··· 35 36 from think.utils import day_dirs, day_path, segment_path 36 37 from think.utils import segment_key as validate_segment_key 37 38 38 - # Regex for HHMMSS time format validation 39 - TIME_RE = re.compile(r"\d{6}") 39 + logger = logging.getLogger(__name__) 40 + 41 + # Regex for YYYYMM month format validation 42 + MONTH_RE = re.compile(r"\d{6}") 40 43 41 44 transcripts_bp = Blueprint( 42 45 "app:transcripts", ··· 49 52 def index() -> Any: 50 53 """Redirect to the most recent day with segments, falling back to today.""" 51 54 today = date.today().strftime("%Y%m%d") 52 - if cluster_segments(today): 53 - return redirect(url_for("app:transcripts.transcripts_day", day=today)) 54 55 for day in sorted(day_dirs().keys(), reverse=True): 55 56 if cluster_segments(day): 56 57 return redirect(url_for("app:transcripts.transcripts_day", day=day)) ··· 61 62 def transcripts_day(day: str) -> str: 62 63 """Render transcript viewer for a specific day.""" 63 64 if not DATE_RE.fullmatch(day): 64 - return "", 404 65 + return error_response("Day not found", 404) 65 66 66 67 title = format_date(day) 67 68 ··· 72 73 def transcript_ranges(day: str) -> Any: 73 74 """Return available transcript ranges for a day.""" 74 75 if not DATE_RE.fullmatch(day): 75 - return "", 404 76 + return error_response("Day not found", 404) 76 77 77 78 audio_ranges, screen_ranges = cluster_scan(day) 78 79 return jsonify({"audio": audio_ranges, "screen": screen_ranges}) ··· 85 86 Returns list of segments with their content types for the segment selector UI. 86 87 """ 87 88 if not DATE_RE.fullmatch(day): 88 - return "", 404 89 + return error_response("Day not found", 404) 89 90 90 91 segments = cluster_segments(day) 91 92 return jsonify({"segments": segments}) ··· 95 96 def transcript_day_data(day: str) -> Any: 96 97 """Return combined ranges and segments for a day in a single response.""" 97 98 if not DATE_RE.fullmatch(day): 98 - return "", 404 99 + return error_response("Day not found", 404) 99 100 100 101 audio_ranges, screen_ranges, segments = scan_day(day) 101 102 return jsonify({"audio": audio_ranges, "screen": screen_ranges, "segments": segments}) ··· 105 106 def serve_file(day: str, encoded_path: str) -> Any: 106 107 """Serve actual media files for embedding.""" 107 108 if not DATE_RE.fullmatch(day): 108 - return "", 404 109 + return error_response("Day not found", 404) 109 110 110 111 try: 111 112 rel_path = encoded_path.replace("__", "/") ··· 113 114 114 115 day_dir = str(day_path(day, create=False)) 115 116 if not os.path.commonpath([full_path, day_dir]) == day_dir: 116 - return "", 403 117 + return error_response("Invalid file path", 403) 117 118 118 119 if not os.path.isfile(full_path): 119 - return "", 404 120 + return error_response("File not found", 404) 120 121 121 122 return send_file(full_path) 122 123 123 124 except Exception: 124 - return "", 404 125 + return error_response("Failed to serve file", 404) 125 126 126 127 127 128 @transcripts_bp.route("/api/stats/<month>") ··· 135 136 JSON dict mapping day (YYYYMMDD) to transcript range count. 136 137 Transcripts app is not facet-aware, so returns simple {day: count} mapping. 137 138 """ 138 - if not TIME_RE.fullmatch(month): 139 - return jsonify({"error": "Invalid month format, expected YYYYMM"}), 400 139 + if not MONTH_RE.fullmatch(month): 140 + return error_response("Invalid month format", 400) 140 141 141 142 stats: dict[str, int] = {} 142 143 ··· 203 204 - media_sizes: dict with audio/screen byte counts for raw media files 204 205 """ 205 206 if not DATE_RE.fullmatch(day): 206 - return "", 404 207 + return error_response("Invalid day format", 404) 207 208 208 209 if not validate_segment_key(segment_key): 209 - return "", 404 210 + return error_response("Invalid segment key format", 404) 210 211 211 212 segment_dir = str(segment_path(day, segment_key, stream)) 212 213 if not os.path.isdir(segment_dir): 213 - return "", 404 214 + return error_response("Segment directory not found", 404) 214 215 215 216 chunks: list[dict] = [] 216 217 audio_file_url = None ··· 218 219 media_sizes: dict[str, int] = {"audio": 0, "screen": 0} 219 220 has_raw_reference = False 220 221 has_raw_file = False 222 + warnings = 0 221 223 222 224 # Load speaker labels if available. 223 225 speaker_labels_path = Path(segment_dir) / "agents" / "speaker_labels.json" ··· 306 308 if speaker_label: 307 309 chunk_data["speaker_label"] = speaker_label 308 310 chunks.append(chunk_data) 309 - except Exception: 311 + except Exception as e: 312 + logger.warning("Failed to parse audio segment %s: %s", audio_path, e) 313 + warnings += 1 310 314 continue 311 315 312 316 # Process screen files and collect video URLs for client-side decoding ··· 396 400 "basic": is_basic, 397 401 } 398 402 ) 399 - except Exception: 403 + except Exception as e: 404 + logger.warning("Failed to parse screen segment %s: %s", screen_path, e) 405 + warnings += 1 400 406 continue 401 407 402 408 # Sort all chunks by timestamp ··· 427 433 "cost": cost_data["cost"], 428 434 "media_sizes": media_sizes, 429 435 "media_purged": media_purged, 436 + "warnings": warnings, 430 437 } 431 438 ) 432 439

-2

convey/root.py

··· 117 117 def app_today() -> Any: 118 118 """Redirect /app/today to the most recent day with journal data.""" 119 119 today = date.today().strftime("%Y%m%d") 120 - if cluster_segments(today): 121 - return redirect(url_for("app:transcripts.transcripts_day", day=today)) 122 120 for day in sorted(day_dirs().keys(), reverse=True): 123 121 if cluster_segments(day): 124 122 return redirect(url_for("app:transcripts.transcripts_day", day=day))

+1

tests/baselines/api/transcripts/segment-detail.json

··· 138 138 "screen": 0 139 139 }, 140 140 "segment_key": "090000_300", 141 + "warnings": 0, 141 142 "video_files": {} 142 143 }

tests/fixtures/journal/indexer/journal.sqlite

This is a binary file and will not be displayed.

+8

tests/test_think_utils.py

··· 8 8 import os 9 9 import sys 10 10 import tempfile 11 + from datetime import time 11 12 from pathlib import Path 12 13 13 14 import pytest ··· 17 18 DEFAULT_STREAM, 18 19 day_from_path, 19 20 iter_segments, 21 + segment_parse, 20 22 segment_key, 21 23 setup_cli, 22 24 ) ··· 641 643 assert segment_key("prefix 143022_300 suffix") == "143022_300" 642 644 # Multiple potential matches (should match first) 643 645 assert segment_key("143022_300 and 150000_600") == "143022_300" 646 + 647 + 648 + def test_segment_parse_clamps_midnight_crossing(): 649 + """Test segment_parse clamps end time when a segment crosses midnight.""" 650 + assert segment_parse("235900_300") == (time(23, 59, 0), time(23, 59, 59)) 651 + assert segment_parse("143022_300") == (time(14, 30, 22), time(14, 35, 22)) 644 652 645 653 646 654 class TestSetupCliConfigEnv:

+4 -1

think/utils.py

··· 413 413 # Compute end time by adding duration 414 414 start_dt = datetime.combine(datetime.today(), start_time) 415 415 end_dt = start_dt + timedelta(seconds=length_seconds) 416 - end_time = end_dt.time() 416 + if end_dt.date() > start_dt.date(): 417 + end_time = time(23, 59, 59) 418 + else: 419 + end_time = end_dt.time() 417 420 return (start_time, end_time) 418 421 except ValueError: 419 422 return (None, None)

Configure Feed

Configure Feed