vod frog, frog with the vods
5
fork

Configure Feed

Select the types of activity you want to include in your feed.

captions v3: offline pre-computation with incremental sentence splitting, debug overlay (Shift+D), cc loading animation, model persistence, processed range timeline

+468 -180
spec/cc_load_1.png

This is a binary file and will not be displayed.

spec/cc_load_2.png

This is a binary file and will not be displayed.

+171 -7
src/lib/VideoPlayer.svelte
··· 3 3 import Hls from "hls.js"; 4 4 import WavyBorder from "./WavyBorder.svelte"; 5 5 import { playCroak } from "./croak"; 6 - import { getModelStatus, getCaptionsEnabled, getCurrentCaption, attachVideo, detachVideo, toggleCaptionsDisplay, updateCaptionForTime, destroyCaptions } from "./captions.svelte"; 6 + import { getModelStatus, getCaptionsEnabled, getCurrentCaption, toggleCaptionsDisplay, updateCaptionForTime, destroyCaptions, getCaptionCount, getDebugState, precomputeCaptions, getIsProcessing, getProcessProgress } from "./captions.svelte"; 7 7 8 - // HLS video source URL (m3u8 playlist) 9 - let { src }: { src: string } = $props(); 8 + // HLS video source URL (m3u8 playlist) and AT URI for caption pre-computation 9 + let { src, atUri = '' }: { src: string; atUri?: string } = $props(); 10 10 11 11 let videoEl: HTMLVideoElement | undefined = $state(); 12 12 let hls: Hls | null = null; 13 13 let errorMsg = $state(""); 14 14 15 + // CC debug overlay — toggle with Shift+D 16 + let ccDebug = $state(false); 17 + let debugInfo = $state<any>({}); 18 + let debugTimer: ReturnType<typeof setInterval> | null = null; 19 + 20 + function toggleDebug() { 21 + ccDebug = !ccDebug; 22 + if (ccDebug && !debugTimer) { 23 + debugTimer = setInterval(() => { debugInfo = getDebugState(currentTime); }, 250); 24 + } else if (!ccDebug && debugTimer) { 25 + clearInterval(debugTimer); 26 + debugTimer = null; 27 + } 28 + } 29 + 30 + // CC loading animation 31 + let ccLoadFrame = $state(1); 32 + let ccLoadTimer: ReturnType<typeof setInterval> | null = null; 33 + let ccLoading = $state(false); 34 + 35 + function startCcLoadAnim() { 36 + if (ccLoadTimer) return; 37 + ccLoading = true; 38 + ccLoadTimer = setInterval(() => { ccLoadFrame = ccLoadFrame === 1 ? 2 : 1; }, 500); 39 + } 40 + function stopCcLoadAnim() { 41 + if (ccLoadTimer) { clearInterval(ccLoadTimer); ccLoadTimer = null; } 42 + ccLoading = false; 43 + } 44 + 45 + // Stop loading animation once first captions arrive 46 + let prevCaptionCount = 0; 47 + $effect(() => { 48 + const count = getCaptionCount(); 49 + if (getCaptionsEnabled() && ccLoading && count > prevCaptionCount) { 50 + stopCcLoadAnim(); 51 + } 52 + prevCaptionCount = count; 53 + }); 54 + 15 55 // Playback state 16 56 let playing = $state(false); 17 57 let currentTime = $state(0); ··· 52 92 hls.attachMedia(videoEl); 53 93 hls.on(Hls.Events.MANIFEST_PARSED, () => { 54 94 videoEl?.play().catch(() => {}); 55 - // Attach video for background caption capture 56 - if (videoEl) attachVideo(videoEl); 95 + if (atUri) precomputeCaptions(atUri); 57 96 }); 58 97 hls.on(Hls.Events.ERROR, (_event, data) => { 59 98 console.error("HLS error:", data); ··· 129 168 function toggleCaptions() { 130 169 playCroak(); 131 170 toggleCaptionsDisplay(); 171 + if (getCaptionsEnabled()) { 172 + startCcLoadAnim(); 173 + } else { 174 + stopCcLoadAnim(); 175 + } 132 176 } 133 177 134 178 function togglePlay() { ··· 243 287 e.preventDefault(); 244 288 togglePlay(); 245 289 } 290 + if (e.code === "KeyD" && e.shiftKey) { 291 + toggleDebug(); 292 + } 246 293 } 247 294 248 295 $effect(() => { ··· 274 321 <video 275 322 bind:this={videoEl} 276 323 playsinline 277 - crossorigin="anonymous" 278 324 ontimeupdate={onTimeUpdate} 279 325 onplay={onPlay} 280 326 onpause={onPause} ··· 348 394 onclick={toggleCaptions} 349 395 title={getCaptionsEnabled() ? "Disable captions" : "Enable captions"} 350 396 > 351 - <img src={getCaptionsEnabled() ? "/cc_on.png" : "/cc_off.png"} alt="captions" class="cc-icon" /> 397 + <img src={getCaptionsEnabled() 398 + ? (ccLoading ? `/cc_load_${ccLoadFrame}.png` : "/cc_on.png") 399 + : "/cc_off.png"} alt="captions" class="cc-icon" /> 352 400 </button> 353 401 {/if} 354 402 ··· 359 407 </div> 360 408 {/if} 361 409 410 + {#if ccDebug} 411 + <div class="cc-debug"> 412 + <div class="cc-debug-header">CC Debug (Shift+D) | {debugInfo.isProcessing ? `Processing ${debugInfo.processProgress}%` : `Ready`}</div> 413 + <div>model: {debugInfo.modelStatus} | enabled: {debugInfo.captionsEnabled} | video: {debugInfo.videoTime?.toFixed(1)}s</div> 414 + <div>showing: "{debugInfo.currentCaption?.substring(0, 60)}"</div> 415 + 416 + <!-- Processed ranges timeline --> 417 + {#if duration > 0} 418 + <div class="cc-debug-timeline"> 419 + {#each (debugInfo.processedRanges ?? []) as range} 420 + <div class="cc-debug-range" style="left: {(range.start / duration) * 100}%; width: {((range.end - range.start) / duration) * 100}%;"></div> 421 + {/each} 422 + <div class="cc-debug-playhead" style="left: {(debugInfo.videoTime / duration) * 100}%;"></div> 423 + </div> 424 + {/if} 425 + 426 + <div class="cc-debug-header" style="margin-top: 4px;">Captions ({debugInfo.totalCaptions}):</div> 427 + <div class="cc-debug-chunks"> 428 + {#each (debugInfo.captions ?? []) as cap} 429 + {@const vt = debugInfo.videoTime ?? 0} 430 + {@const active = vt >= cap.start && vt <= cap.end} 431 + <div class="cc-debug-chunk" class:active> 432 + <span class="cc-debug-time">{cap.start?.toFixed(1)}s-{cap.end?.toFixed(1)}s</span> 433 + <span class="cc-debug-text">{cap.text}</span> 434 + {#if active}<span class="cc-debug-active">NOW</span>{/if} 435 + </div> 436 + {/each} 437 + </div> 438 + </div> 439 + {/if} 440 + 362 441 {#if errorMsg} 363 442 <div class="error-overlay">{errorMsg}</div> 364 443 {/if} ··· 531 610 padding: 4px 12px; 532 611 border-radius: 4px; 533 612 line-height: 1.5; 613 + } 614 + 615 + .cc-debug { 616 + position: absolute; 617 + top: 6%; 618 + left: 6%; 619 + right: 6%; 620 + bottom: 6%; 621 + background: rgba(0, 0, 0, 0.85); 622 + color: #39FF44; 623 + font-family: monospace; 624 + font-size: 11px; 625 + padding: 12px; 626 + overflow-y: auto; 627 + z-index: 20; 628 + line-height: 1.6; 629 + border-radius: 8px; 630 + } 631 + 632 + .cc-debug-header { 633 + color: #FFDEED; 634 + font-weight: bold; 635 + font-size: 12px; 636 + } 637 + 638 + .cc-debug-timeline { 639 + position: relative; 640 + height: 12px; 641 + background: rgba(255, 255, 255, 0.1); 642 + border-radius: 3px; 643 + margin: 6px 0; 644 + overflow: hidden; 645 + } 646 + 647 + .cc-debug-range { 648 + position: absolute; 649 + top: 0; 650 + height: 100%; 651 + background: #39FF44; 652 + opacity: 0.5; 653 + } 654 + 655 + .cc-debug-playhead { 656 + position: absolute; 657 + top: 0; 658 + width: 2px; 659 + height: 100%; 660 + background: #FF3992; 661 + z-index: 1; 662 + } 663 + 664 + .cc-debug-chunks { 665 + max-height: 50%; 666 + overflow-y: auto; 667 + } 668 + 669 + .cc-debug-chunk { 670 + display: flex; 671 + gap: 8px; 672 + padding: 1px 0; 673 + border-bottom: 1px solid rgba(57, 255, 68, 0.1); 674 + } 675 + 676 + .cc-debug-chunk.active { 677 + background: rgba(57, 255, 68, 0.15); 678 + color: #FFDEED; 679 + } 680 + 681 + .cc-debug-time { 682 + flex-shrink: 0; 683 + width: 100px; 684 + color: #FFA639; 685 + } 686 + 687 + .cc-debug-text { 688 + flex: 1; 689 + overflow: hidden; 690 + text-overflow: ellipsis; 691 + white-space: nowrap; 692 + } 693 + 694 + .cc-debug-active { 695 + color: #FF3992; 696 + font-weight: bold; 697 + flex-shrink: 0; 534 698 } 535 699 536 700 .error-overlay {
+293 -171
src/lib/captions.svelte.ts
··· 1 1 /** 2 - * Captions manager — captures audio from a video element in the background, 3 - * sends chunks to a Whisper Web Worker, and provides reactive caption state. 2 + * Captions v3 — Full pre-computation with sentence splitting. 3 + * 4 + * 1. Fetches all HLS audio segments for the video 5 + * 2. Decodes to PCM in large chunks (30s) 6 + * 3. Sends each chunk to Whisper for transcription 7 + * 4. Collects all sentence-level results with timestamps 8 + * 5. Post-processes: merges fragments, splits on sentence boundaries 9 + * 6. Displays captions based on video playback time 4 10 * 5 - * Audio capture starts automatically when a video is attached (if the model is loaded). 6 - * The CC toggle only controls whether captions are *displayed* — transcription 7 - * happens continuously in the background so captions are pre-computed. 11 + * No real-time audio capture — everything is pre-computed. 8 12 */ 9 13 10 - const SEND_INTERVAL = 4; // send audio every N seconds 11 - const CHUNK_DURATION = 5; // label duration for display 12 - const SAMPLE_RATE = 16000; // Whisper expects 16kHz mono 14 + const PLAYBACK_BASE = 'https://vod-beta.stream.place/xrpc/place.stream.playback'; 15 + const TARGET_SAMPLE_RATE = 16000; 16 + const CHUNK_SECONDS = 30; // Large chunks for best Whisper context 17 + const MODEL_DOWNLOADED_KEY = 'vodfrog-whisper-downloaded'; 13 18 14 19 type ModelStatus = 'idle' | 'loading' | 'ready' | 'error'; 15 20 16 - interface CaptionChunk { 21 + interface Caption { 17 22 text: string; 18 23 start: number; 19 24 end: number; 20 - receivedAt: number; 21 25 } 22 26 27 + interface AudioSegment { 28 + url: string; 29 + byteRange?: { start: number; length: number }; 30 + duration: number; 31 + startTime: number; 32 + } 33 + 34 + // ---- Reactive state ---- 23 35 let modelStatus = $state<ModelStatus>('idle'); 24 36 let modelProgress = $state(0); 25 37 let modelError = $state(''); 26 38 let worker: Worker | null = null; 27 39 28 - // Display toggle — does NOT control capture, only visibility 29 40 let captionsEnabled = $state(false); 30 41 let currentCaption = $state(''); 31 - let captionChunks = $state<CaptionChunk[]>([]); 42 + let captions = $state<Caption[]>([]); 43 + let isProcessing = $state(false); 44 + let processProgress = $state(0); 32 45 33 - // Audio capture state (runs in background regardless of captionsEnabled) 34 - let capturing = false; 35 - let audioContext: AudioContext | null = null; 36 - let sourceNode: MediaElementAudioSourceNode | null = null; 37 - let processorNode: ScriptProcessorNode | null = null; 38 - let audioBuffer: Float32Array[] = []; 39 - let bufferSamples = 0; 40 - let attachedVideoEl: HTMLVideoElement | null = null; 41 - let chunkTimer: ReturnType<typeof setInterval> | null = null; 42 - let chunkTimeOffset = 0; 46 + let pendingAtUri = ''; 47 + let abortCtrl: AbortController | null = null; 48 + let rawCaptionBuffer: Caption[] = []; 49 + let processedRanges = $state<{ start: number; end: number }[]>([]); 43 50 51 + // ---- Exports ---- 44 52 export function getModelStatus() { return modelStatus; } 45 53 export function getModelProgress() { return modelProgress; } 46 54 export function getModelError() { return modelError; } 47 55 export function getCaptionsEnabled() { return captionsEnabled; } 48 56 export function getCurrentCaption() { return currentCaption; } 57 + export function getCaptionCount() { return captions.length; } 58 + export function getIsProcessing() { return isProcessing; } 59 + export function getProcessProgress() { return processProgress; } 60 + export function getAllCaptions() { return captions; } 61 + export function getProcessedRanges() { return processedRanges; } 49 62 50 - /** Load the Whisper model in the Web Worker */ 63 + export function getDebugState(videoTime: number) { 64 + return { 65 + modelStatus, 66 + captionsEnabled, 67 + isProcessing, 68 + processProgress: processProgress.toFixed(0), 69 + totalCaptions: captions.length, 70 + currentCaption, 71 + videoTime, 72 + processedRanges, 73 + captions: captions.map(c => ({ 74 + text: c.text.substring(0, 50), 75 + start: c.start, 76 + end: c.end 77 + })) 78 + }; 79 + } 80 + 81 + // ---- Model ---- 82 + export function initCaptions() { 83 + if (typeof localStorage === 'undefined') return; 84 + if (localStorage.getItem(MODEL_DOWNLOADED_KEY) === 'true') loadModel(); 85 + } 86 + 51 87 export function loadModel() { 52 88 if (modelStatus === 'loading' || modelStatus === 'ready') return; 53 - 54 89 modelStatus = 'loading'; 55 90 modelProgress = 0; 56 91 modelError = ''; 57 92 58 - worker = new Worker( 59 - new URL('./whisper-worker.ts', import.meta.url), 60 - { type: 'module' } 61 - ); 93 + worker = new Worker(new URL('./whisper-worker.ts', import.meta.url), { type: 'module' }); 94 + worker.onmessage = onWorkerMessage; 95 + worker.postMessage({ type: 'load' }); 96 + } 62 97 63 - worker.onmessage = (e: MessageEvent) => { 64 - const { type } = e.data; 98 + let resolveTranscription: ((caps: Caption[]) => void) | null = null; 99 + 100 + function onWorkerMessage(e: MessageEvent) { 101 + const { type } = e.data; 65 102 66 - if (type === 'status') { 67 - if (e.data.status === 'ready') { 68 - modelStatus = 'ready'; 69 - modelProgress = 100; 70 - // If a video is already attached, start capturing immediately 71 - if (attachedVideoEl && !capturing) { 72 - beginCapture(attachedVideoEl); 103 + if (type === 'status') { 104 + if (e.data.status === 'ready') { 105 + modelStatus = 'ready'; 106 + modelProgress = 100; 107 + try { localStorage.setItem(MODEL_DOWNLOADED_KEY, 'true'); } catch {} 108 + if (pendingAtUri) { 109 + const uri = pendingAtUri; 110 + pendingAtUri = ''; 111 + precomputeCaptions(uri); 112 + } 113 + } else if (e.data.status === 'error') { 114 + modelStatus = 'error'; 115 + modelError = e.data.error; 116 + } 117 + } 118 + if (type === 'progress') modelProgress = e.data.progress; 119 + 120 + if (type === 'result') { 121 + const results: Caption[] = []; 122 + if (e.data.chunks?.length) { 123 + for (const c of e.data.chunks) { 124 + const text = c.text?.trim(); 125 + if (text && text.length > 1) { 126 + results.push({ text, start: c.start ?? 0, end: c.end ?? 0 }); 73 127 } 74 - } else if (e.data.status === 'error') { 75 - modelStatus = 'error'; 76 - modelError = e.data.error; 77 - } else if (e.data.status === 'loading') { 78 - modelStatus = 'loading'; 79 128 } 129 + } else if (e.data.text?.trim()) { 130 + results.push({ text: e.data.text.trim(), start: 0, end: 0 }); 80 131 } 132 + if (resolveTranscription) { 133 + const r = resolveTranscription; 134 + resolveTranscription = null; 135 + r(results); 136 + } 137 + } 138 + } 81 139 82 - if (type === 'progress') { 83 - modelProgress = e.data.progress; 140 + function transcribe(audio: Float32Array): Promise<Caption[]> { 141 + return new Promise(resolve => { 142 + if (!worker) { resolve([]); return; } 143 + resolveTranscription = resolve; 144 + worker.postMessage({ type: 'transcribe', audio, sampleRate: TARGET_SAMPLE_RATE }); 145 + }); 146 + } 147 + 148 + // ---- HLS parsing ---- 149 + function parseM3u8(text: string, baseUrl: string) { 150 + const lines = text.split('\n'); 151 + let initUrl: string | null = null; 152 + let initBR: { start: number; length: number } | null = null; 153 + const segs: AudioSegment[] = []; 154 + let dur = 0, cum = 0; 155 + let pBR: { start: number; length: number } | null = null; 156 + 157 + for (const l of lines) { 158 + const t = l.trim(); 159 + if (t.startsWith('#EXT-X-MAP:')) { 160 + const m = t.match(/URI="([^"]+)"/); if (m) initUrl = new URL(m[1], baseUrl).href; 161 + const b = t.match(/BYTERANGE="(\d+)@(\d+)"/); if (b) initBR = { length: +b[1], start: +b[2] }; 84 162 } 163 + if (t.startsWith('#EXTINF:')) dur = parseFloat(t.split(':')[1]); 164 + if (t.startsWith('#EXT-X-BYTERANGE:')) { const [l, o] = t.split(':')[1].split('@'); pBR = { length: +l, start: +o }; } 165 + if (!t.startsWith('#') && t.length > 0 && dur > 0) { 166 + segs.push({ url: new URL(t, baseUrl).href, byteRange: pBR || undefined, duration: dur, startTime: cum }); 167 + cum += dur; dur = 0; pBR = null; 168 + } 169 + } 170 + return { initUrl, initBR, segs }; 171 + } 85 172 86 - if (type === 'result') { 87 - console.log('[CC] Worker result:', e.data); 88 - if (e.data.error) { 89 - console.warn('[CC] Whisper error:', e.data.error); 90 - return; 91 - } 92 - const now = Date.now(); 93 - if (e.data.chunks?.length) { 94 - const newChunks: CaptionChunk[] = e.data.chunks.map((c: CaptionChunk) => ({ 95 - text: c.text, 96 - start: c.start + chunkTimeOffset, 97 - end: c.end + chunkTimeOffset, 98 - receivedAt: now 99 - })); 100 - captionChunks = [...captionChunks, ...newChunks]; 101 - } else if (e.data.text) { 102 - captionChunks = [...captionChunks, { 103 - text: e.data.text, 104 - start: chunkTimeOffset, 105 - end: chunkTimeOffset + CHUNK_DURATION, 106 - receivedAt: now 107 - }]; 108 - } 173 + function findAudioUrl(master: string, base: string): string | null { 174 + for (const l of master.split('\n')) { 175 + if (l.includes('EXT-X-MEDIA') && l.includes('TYPE=AUDIO') && l.includes('mp4a')) { 176 + const m = l.match(/URI="([^"]+)"/); if (m) return new URL(m[1], base).href; 109 177 } 110 - }; 178 + } 179 + for (const l of master.split('\n')) { 180 + if (l.includes('EXT-X-MEDIA') && l.includes('TYPE=AUDIO')) { 181 + const m = l.match(/URI="([^"]+)"/); if (m) return new URL(m[1], base).href; 182 + } 183 + } 184 + return null; 185 + } 111 186 112 - worker.postMessage({ type: 'load' }); 187 + async function fetchRange(url: string, br: { start: number; length: number }, signal?: AbortSignal) { 188 + return (await fetch(url, { headers: { Range: `bytes=${br.start}-${br.start + br.length - 1}` }, signal })).arrayBuffer(); 113 189 } 114 190 115 - /** 116 - * Attach a video element — starts background audio capture immediately 117 - * if the model is loaded. Call this whenever a new video starts playing. 118 - */ 119 - export function attachVideo(videoEl: HTMLVideoElement) { 120 - if (videoEl === attachedVideoEl) return; 191 + async function decodeToMono16k(data: ArrayBuffer): Promise<Float32Array> { 192 + try { 193 + const ctx = new OfflineAudioContext(1, 1, TARGET_SAMPLE_RATE); 194 + const decoded = await ctx.decodeAudioData(data.slice(0)); 195 + const n = Math.ceil(decoded.duration * TARGET_SAMPLE_RATE); 196 + const off = new OfflineAudioContext(1, n, TARGET_SAMPLE_RATE); 197 + const s = off.createBufferSource(); s.buffer = decoded; s.connect(off.destination); s.start(0); 198 + return (await off.startRendering()).getChannelData(0); 199 + } catch { return new Float32Array(0); } 200 + } 121 201 122 - // Stop any existing capture 123 - stopCapture(); 202 + // ---- Sentence splitting ---- 203 + function splitIntoSentences(rawCaptions: Caption[]): Caption[] { 204 + if (rawCaptions.length === 0) return []; 124 205 125 - attachedVideoEl = videoEl; 126 - captionChunks = []; 127 - currentCaption = ''; 206 + // Merge all raw captions into one stream, preserving timestamps 207 + const merged: { text: string; start: number; end: number }[] = []; 208 + for (const c of rawCaptions) { 209 + merged.push({ text: c.text, start: c.start, end: c.end }); 210 + } 211 + 212 + // Now split on sentence boundaries 213 + const result: Caption[] = []; 214 + let accText = ''; 215 + let accStart = merged[0].start; 216 + 217 + for (const m of merged) { 218 + accText += (accText ? ' ' : '') + m.text; 128 219 129 - // Start capturing if model is ready 130 - if (modelStatus === 'ready' && worker) { 131 - beginCapture(videoEl); 220 + // Check if accumulated text ends with sentence-ending punctuation 221 + if (/[.!?][\s]*$/.test(accText.trim())) { 222 + result.push({ 223 + text: accText.trim(), 224 + start: accStart, 225 + end: m.end > 0 ? m.end : m.start + estimateDuration(accText) 226 + }); 227 + accText = ''; 228 + accStart = m.end > 0 ? m.end : m.start + estimateDuration(accText); 229 + } 132 230 } 231 + 232 + // Flush remaining 233 + if (accText.trim()) { 234 + const lastEnd = merged[merged.length - 1].end; 235 + result.push({ 236 + text: accText.trim(), 237 + start: accStart, 238 + end: lastEnd > 0 ? lastEnd : accStart + estimateDuration(accText) 239 + }); 240 + } 241 + 242 + return result; 133 243 } 134 244 135 - /** Detach from the current video and stop capture */ 136 - export function detachVideo() { 137 - stopCapture(); 138 - attachedVideoEl = null; 139 - captionChunks = []; 140 - currentCaption = ''; 245 + function estimateDuration(text: string): number { 246 + return Math.max(2, text.split(/\s+/).length * 0.35); 141 247 } 142 248 143 - /** Toggle caption display on/off (capture continues in background) */ 144 - export function toggleCaptionsDisplay() { 145 - captionsEnabled = !captionsEnabled; 146 - if (!captionsEnabled) { 147 - currentCaption = ''; 249 + // ---- Pre-computation ---- 250 + export async function precomputeCaptions(atUri: string) { 251 + if (modelStatus !== 'ready' || !worker) { 252 + pendingAtUri = atUri; 253 + console.log('[CC] Model not ready, queuing:', atUri); 254 + return; 148 255 } 149 - } 150 256 151 - /** Internal: start background audio capture */ 152 - function beginCapture(videoEl: HTMLVideoElement) { 153 - if (capturing || !worker) return; 257 + abortCtrl?.abort(); 258 + abortCtrl = new AbortController(); 259 + const signal = abortCtrl.signal; 154 260 155 - console.log('[CC] Starting background audio capture'); 156 - capturing = true; 157 - audioBuffer = []; 158 - bufferSamples = 0; 159 - chunkTimeOffset = videoEl.currentTime; 261 + isProcessing = true; 262 + processProgress = 0; 263 + captions = []; 264 + rawCaptionBuffer = []; 265 + processedRanges = []; 266 + currentCaption = ''; 160 267 161 268 try { 162 - if (!audioContext) { 163 - audioContext = new AudioContext({ sampleRate: SAMPLE_RATE }); 164 - } 165 - 166 - if (!sourceNode) { 167 - sourceNode = audioContext.createMediaElementSource(videoEl); 168 - sourceNode.connect(audioContext.destination); 169 - } 269 + const url = `${PLAYBACK_BASE}.getVideoPlaylist?uri=${encodeURIComponent(atUri)}`; 270 + console.log('[CC] Fetching manifest...'); 271 + const masterM3u8 = await (await fetch(url, { signal })).text(); 272 + const audioUrl = findAudioUrl(masterM3u8, url); 273 + if (!audioUrl) { console.warn('[CC] No audio track'); isProcessing = false; return; } 170 274 171 - processorNode = audioContext.createScriptProcessor(4096, 1, 1); 172 - processorNode.onaudioprocess = (e) => { 173 - if (!capturing) return; 174 - const input = e.inputBuffer.getChannelData(0); 175 - audioBuffer.push(new Float32Array(input)); 176 - bufferSamples += input.length; 177 - }; 275 + const audioM3u8 = await (await fetch(audioUrl, { signal })).text(); 276 + const { initUrl, initBR, segs } = parseM3u8(audioM3u8, audioUrl); 277 + if (segs.length === 0) { console.warn('[CC] No segments'); isProcessing = false; return; } 178 278 179 - sourceNode.connect(processorNode); 180 - processorNode.connect(audioContext.destination); 279 + let initData: ArrayBuffer | null = null; 280 + if (initUrl) { 281 + initData = initBR ? await fetchRange(initUrl, initBR, signal) : await (await fetch(initUrl, { signal })).arrayBuffer(); 282 + } 181 283 182 - chunkTimer = setInterval(() => { 183 - if (!worker || bufferSamples < SAMPLE_RATE * 1.5) return; 284 + const totalDuration = segs[segs.length - 1].startTime + segs[segs.length - 1].duration; 285 + console.log(`[CC] ${segs.length} segments, ${totalDuration.toFixed(0)}s total`); 184 286 185 - const totalSamples = audioBuffer.reduce((acc, b) => acc + b.length, 0); 186 - const merged = new Float32Array(totalSamples); 187 - let offset = 0; 188 - for (const buf of audioBuffer) { 189 - merged.set(buf, offset); 190 - offset += buf.length; 287 + // Group segments into ~30s chunks 288 + let i = 0; 289 + while (i < segs.length) { 290 + if (signal.aborted) break; 291 + const chunkStart = segs[i].startTime; 292 + const groupSegs: AudioSegment[] = []; 293 + let groupDur = 0; 294 + while (i < segs.length && groupDur < CHUNK_SECONDS) { 295 + groupSegs.push(segs[i]); 296 + groupDur += segs[i].duration; 297 + i++; 191 298 } 299 + const chunkEnd = groupSegs[groupSegs.length - 1].startTime + groupSegs[groupSegs.length - 1].duration; 192 300 193 - chunkTimeOffset = (attachedVideoEl?.currentTime ?? 0) - (totalSamples / SAMPLE_RATE); 301 + // Fetch segments 302 + const bufs: ArrayBuffer[] = []; 303 + for (const seg of groupSegs) { 304 + if (signal.aborted) break; 305 + bufs.push(seg.byteRange 306 + ? await fetchRange(seg.url, seg.byteRange, signal) 307 + : await (await fetch(seg.url, { signal })).arrayBuffer()); 308 + } 309 + if (signal.aborted) break; 194 310 195 - console.log(`[CC] Sending ${(totalSamples / SAMPLE_RATE).toFixed(1)}s audio to worker`); 311 + // Concat init + segments 312 + const totalSize = (initData?.byteLength ?? 0) + bufs.reduce((a, b) => a + b.byteLength, 0); 313 + const combined = new Uint8Array(totalSize); 314 + let off = 0; 315 + if (initData) { combined.set(new Uint8Array(initData), 0); off = initData.byteLength; } 316 + for (const b of bufs) { combined.set(new Uint8Array(b), off); off += b.byteLength; } 196 317 197 - worker!.postMessage({ 198 - type: 'transcribe', 199 - audio: merged, 200 - sampleRate: SAMPLE_RATE 201 - }); 318 + // Decode + transcribe 319 + const pcm = await decodeToMono16k(combined.buffer); 320 + if (pcm.length === 0 || signal.aborted) continue; 202 321 203 - audioBuffer = []; 204 - bufferSamples = 0; 205 - }, SEND_INTERVAL * 1000); 322 + console.log(`[CC] Transcribing ${chunkStart.toFixed(0)}s-${chunkEnd.toFixed(0)}s...`); 323 + const results = await transcribe(pcm); 324 + if (signal.aborted) break; 206 325 207 - console.log('[CC] Audio capture started, sampleRate:', audioContext.sampleRate); 326 + // Offset timestamps to absolute video time 327 + const chunkCaptions: Caption[] = []; 328 + for (const c of results) { 329 + chunkCaptions.push({ 330 + text: c.text, 331 + start: chunkStart + c.start, 332 + end: chunkStart + (c.end > 0 ? c.end : chunkEnd - chunkStart) 333 + }); 334 + } 208 335 209 - } catch (err: any) { 210 - console.error('[CC] Audio capture failed:', err); 211 - capturing = false; 212 - } 213 - } 336 + // Incrementally coalesce into sentence captions and update live 337 + if (chunkCaptions.length > 0) { 338 + rawCaptionBuffer = [...rawCaptionBuffer, ...chunkCaptions]; 339 + captions = splitIntoSentences(rawCaptionBuffer); 340 + processedRanges = [...processedRanges, { start: chunkStart, end: chunkEnd }]; 341 + } 214 342 215 - /** Internal: stop background audio capture */ 216 - function stopCapture() { 217 - capturing = false; 343 + processProgress = (i / segs.length) * 100; 344 + console.log(`[CC] Progress: ${processProgress.toFixed(0)}% — ${captions.length} captions ready`); 345 + } 218 346 219 - if (chunkTimer) { 220 - clearInterval(chunkTimer); 221 - chunkTimer = null; 347 + if (!signal.aborted) { 348 + console.log(`[CC] Done! ${captions.length} sentence captions`); 349 + } 350 + } catch (err: any) { 351 + if (err.name !== 'AbortError') console.error('[CC] Failed:', err); 222 352 } 223 353 224 - if (processorNode) { 225 - processorNode.disconnect(); 226 - processorNode = null; 227 - } 354 + isProcessing = false; 355 + processProgress = 100; 356 + } 228 357 229 - audioBuffer = []; 230 - bufferSamples = 0; 358 + // ---- Display ---- 359 + export function toggleCaptionsDisplay() { 360 + captionsEnabled = !captionsEnabled; 361 + if (!captionsEnabled) currentCaption = ''; 231 362 } 232 363 233 - /** Update the current visible caption based on video time */ 234 364 export function updateCaptionForTime(time: number) { 235 - if (!captionsEnabled || captionChunks.length === 0) { 365 + if (!captionsEnabled || captions.length === 0) { 236 366 currentCaption = ''; 237 367 return; 238 368 } 239 369 240 - // Show each chunk for 5 seconds from when it was received 241 - const now = Date.now(); 242 - for (let i = captionChunks.length - 1; i >= 0; i--) { 243 - const c = captionChunks[i]; 244 - if (c.text && c.receivedAt && (now - c.receivedAt) < 5000) { 370 + for (const c of captions) { 371 + if (time >= c.start && time <= c.end) { 245 372 currentCaption = c.text; 246 373 return; 247 374 } ··· 249 376 currentCaption = ''; 250 377 } 251 378 252 - /** Clean up everything */ 253 379 export function destroyCaptions() { 254 - stopCapture(); 380 + abortCtrl?.abort(); 255 381 captionsEnabled = false; 256 382 currentCaption = ''; 257 - attachedVideoEl = null; 258 - if (audioContext) { 259 - audioContext.close().catch(() => {}); 260 - audioContext = null; 261 - sourceNode = null; 262 - } 383 + captions = []; 384 + isProcessing = false; 263 385 }
+2
src/routes/+layout.svelte
··· 5 5 import FlySpawner from '$lib/FlySpawner.svelte'; 6 6 7 7 import { initAuth } from '$lib/auth.svelte'; 8 + import { initCaptions } from '$lib/captions.svelte'; 8 9 9 10 let { children } = $props(); 10 11 11 12 onMount(() => { 12 13 initAuth(); 14 + initCaptions(); 13 15 }); 14 16 </script> 15 17
+1 -1
src/routes/+page.svelte
··· 129 129 130 130 {#if selectedVideo} 131 131 <section class="player-section"> 132 - <VideoPlayer src={getPlaylistUrl(selectedVideo.uri)} /> 132 + <VideoPlayer src={getPlaylistUrl(selectedVideo.uri)} atUri={selectedVideo.uri} /> 133 133 <div class="player-info"> 134 134 <WavyBorder seed="player-info" fill="#39FF44" strokeColor="#0A182B" strokeWidth={1.8} padding="32px clamp(40px, 6vw, 80px)"> 135 135 <h2 class="player-title">{selectedVideo.value.title}</h2>
+1 -1
src/routes/profile/[handle]/+page.svelte
··· 153 153 <!-- Selected video player --> 154 154 {#if selectedVideo} 155 155 <section class="player-section"> 156 - <VideoPlayer src={getPlaylistUrl(selectedVideo.uri)} /> 156 + <VideoPlayer src={getPlaylistUrl(selectedVideo.uri)} atUri={selectedVideo.uri} /> 157 157 <div class="player-info"> 158 158 <WavyBorder seed="profile-player-info" fill="#39FF44" strokeColor="#0A182B" strokeWidth={1.8} padding={48}> 159 159 <h2 class="player-title">{selectedVideo.value.title}</h2>
static/cc_load_1.png

This is a binary file and will not be displayed.

static/cc_load_2.png

This is a binary file and will not be displayed.