feat: HLS-inspired chunked transcript endpoint for track pages

+138 -1

2 changed files

expand all

apps

ionosphere-appview

src

routes.ts

tracks.ts

+29 -1

apps/ionosphere-appview/src/routes.ts

import { Hono } from "hono";
import { buildConcordance } from "./concordance.js";
import { getTracksIndex, getTrackData, getTranscriptManifest, getTranscriptChunk, STREAMS } from "./tracks.js";
import type Database from "better-sqlite3";
import { readFileSync, existsSync, writeFileSync, mkdirSync } from "node:fs";
import path from "node:path";

// ··· (the rest of routes.ts up to this point is outside this excerpt) ···

// --- Chunked track transcript (HLS-inspired) ---

/**
 * GET /xrpc/tv.ionosphere.getTrackTranscript
 *
 * Query parameters:
 *   - `stream` (required): stream slug, passed through to the tracks helpers.
 *   - `chunk`  (optional): zero-based chunk index, plain decimal digits only.
 *
 * Without `chunk` the response is the manifest (chunk list without text);
 * with `chunk` it is that single chunk's text and facets.
 *
 * Responses: 400 for a missing `stream` or malformed `chunk`, 404 when the
 * manifest or chunk lookup returns nothing, otherwise 200 with JSON.
 */
app.get("/xrpc/tv.ionosphere.getTrackTranscript", (c) => {
  const stream = c.req.query("stream");
  if (!stream) return c.json({ error: "missing stream parameter" }, 400);

  const chunkParam = c.req.query("chunk");

  // No `chunk` parameter: return the manifest (chunk list without data).
  if (chunkParam == null) {
    const manifest = getTranscriptManifest(db, stream);
    if (!manifest) return c.json({ error: "no transcript" }, 404);
    c.header("Cache-Control", "public, max-age=3600, stale-while-revalidate=86400");
    return c.json(manifest);
  }

  // Validate strictly. parseInt() parses a numeric prefix, so "3abc", "1.9"
  // and " 3" would all be served as a real chunk, and every such spelling
  // would become its own long-lived cache entry under the header below.
  if (!/^\d+$/.test(chunkParam)) return c.json({ error: "invalid chunk" }, 400);
  const chunkIndex = Number(chunkParam);
  // Digit strings too long to be represented exactly are rejected as well.
  if (!Number.isSafeInteger(chunkIndex)) return c.json({ error: "invalid chunk" }, 400);

  const chunk = getTranscriptChunk(db, stream, chunkIndex);
  if (!chunk) return c.json({ error: "chunk not found" }, 404);

  // Chunks are immutable once built — cache aggressively.
  // NOTE(review): the chunk is recomputed from the database on every request;
  // if transcript rows can change after publication, `immutable` will keep
  // clients on the old text for up to a day — confirm this is intended.
  c.header("Cache-Control", "public, max-age=86400, immutable");
  return c.json(chunk);
});

// --- Corrections sidecar ---

+109

apps/ionosphere-appview/src/tracks.ts

/** Chunk duration for HLS-inspired transcript streaming (5 minutes in ms) */
const CHUNK_DURATION_MS = 300_000;

/** Index of the time-based chunks available for one stream (no transcript text). */
export interface TranscriptManifest {
  totalDurationMs: number;
  chunkDurationMs: number;
  chunkCount: number;
  chunks: Array<{ index: number; startMs: number; endMs: number; wordCount: number }>;
}

/** Transcript text and facets for a single time window of a stream. */
export interface TranscriptChunk {
  index: number;
  startMs: number;
  endMs: number;
  /** Text covered by this chunk's facets; empty when no word starts in the window. */
  text: string;
  /** Timestamp facets whose byte offsets are relative to `text`. */
  facets: any[];
}

/**
 * Resolve a slug to its stream URI and total duration in milliseconds.
 * A database row takes precedence over the hardcoded STREAMS entry.
 * Returns null when neither source yields a URI.
 */
function resolveStreamTiming(db: Database.Database, slug: string) {
  const dbStream = getStreamFromDb(db, slug);
  const hardcoded = STREAMS.find((s) => s.slug === slug);
  const streamUri = dbStream?.stream_video_uri ?? hardcoded?.uri;
  if (!streamUri) return null;

  // NOTE(review): an unknown duration falls back to 0, which yields an empty
  // manifest and no retrievable chunks even when transcript rows exist.
  const durationSeconds = dbStream?.duration_seconds ?? hardcoded?.durationSeconds ?? 0;
  return { streamUri, totalDurationMs: durationSeconds * 1000 };
}

/** Load the stored transcript rows for a stream, ordered by chunk_index. */
function loadTranscriptRows(db: Database.Database, streamUri: unknown): any[] {
  return db.prepare(
    "SELECT * FROM stream_transcripts WHERE stream_uri = ? ORDER BY chunk_index ASC"
  ).all(streamUri) as any[];
}

/**
 * Build a manifest of time-based transcript chunks for a stream.
 * Each chunk covers CHUNK_DURATION_MS of the stream; the last one is clamped
 * to the stream's total duration.
 *
 * @returns the manifest, or null when the slug cannot be resolved to a stream
 *          URI or the stream has no stored transcript rows.
 */
export function getTranscriptManifest(db: Database.Database, slug: string): TranscriptManifest | null {
  const stream = resolveStreamTiming(db, slug);
  if (!stream) return null;
  const { streamUri, totalDurationMs } = stream;

  const chunkCount = Math.ceil(totalDurationMs / CHUNK_DURATION_MS);

  // Decode full transcript to get word timings (we need these for the manifest)
  const dbChunks = loadTranscriptRows(db, streamUri);
  if (dbChunks.length === 0) return null;

  const { words } = decodeChunkedTranscript(dbChunks);

  // Lay out the empty time windows first ...
  const chunks: TranscriptManifest["chunks"] = [];
  for (let i = 0; i < chunkCount; i++) {
    const startMs = i * CHUNK_DURATION_MS;
    const endMs = Math.min((i + 1) * CHUNK_DURATION_MS, totalDurationMs);
    chunks.push({ index: i, startMs, endMs, wordCount: 0 });
  }

  // ... then bucket every word in one pass instead of rescanning the whole
  // word list once per chunk. `start` is treated as seconds here.
  for (const w of words) {
    const wordStartMs = w.start * 1000;
    // Words starting before 0 or at/after the declared duration are not counted.
    if (!(wordStartMs >= 0 && wordStartMs < totalDurationMs)) continue;
    const bucket = chunks[Math.floor(wordStartMs / CHUNK_DURATION_MS)];
    if (bucket) bucket.wordCount += 1;
  }

  return { totalDurationMs, chunkDurationMs: CHUNK_DURATION_MS, chunkCount, chunks };
}

/**
 * Get a single time-based transcript chunk for a stream.
 * Returns text + facets for words whose start time falls within the window
 * [chunkIndex * CHUNK_DURATION_MS, min(next boundary, total duration)).
 *
 * @param chunkIndex zero-based chunk number; must be a non-negative integer.
 * @returns the chunk (with empty text/facets when no word starts in the
 *          window), or null when the index is invalid or past the end of the
 *          stream, the slug cannot be resolved, or no transcript rows exist.
 */
export function getTranscriptChunk(db: Database.Database, slug: string, chunkIndex: number): TranscriptChunk | null {
  // A negative or fractional index would otherwise produce a window that
  // matches no manifest entry.
  if (!Number.isInteger(chunkIndex) || chunkIndex < 0) return null;

  const stream = resolveStreamTiming(db, slug);
  if (!stream) return null;
  const { streamUri, totalDurationMs } = stream;

  const startMs = chunkIndex * CHUNK_DURATION_MS;
  const endMs = Math.min((chunkIndex + 1) * CHUNK_DURATION_MS, totalDurationMs);

  // Past the end of the stream: bail out before touching the transcript table.
  if (startMs >= totalDurationMs) return null;

  // Decode full transcript and slice by time window
  const dbChunks = loadTranscriptRows(db, streamUri);
  if (dbChunks.length === 0) return null;

  const { text: fullText, facets: allFacets } = decodeChunkedTranscript(dbChunks);

  // Keep only timestamp facets that start inside this window.
  const chunkFacets = allFacets.filter((f: any) => {
    const ts = f.features?.[0];
    if (!ts || ts.$type !== "tv.ionosphere.facet#timestamp") return false;
    // startTime is divided by 1e6 to compare against milliseconds.
    const wordStartMs = ts.startTime / 1e6;
    return wordStartMs >= startMs && wordStartMs < endMs;
  });

  if (chunkFacets.length === 0) return { index: chunkIndex, startMs, endMs, text: "", facets: [] };

  // Byte span covered by the selected facets. Use min/max rather than the
  // first/last array element so the result does not depend on facet order.
  let firstByte = Infinity;
  let lastByte = -Infinity;
  for (const f of chunkFacets) {
    if (f.index.byteStart < firstByte) firstByte = f.index.byteStart;
    if (f.index.byteEnd > lastByte) lastByte = f.index.byteEnd;
  }

  // Facet offsets are byte offsets, so slice the UTF-8 encoding, not the string.
  const fullBytes = new TextEncoder().encode(fullText);
  const chunkText = new TextDecoder().decode(fullBytes.subarray(firstByte, lastByte));

  // Rebase facet byte offsets to be relative to chunkText
  const rebasedFacets = chunkFacets.map((f: any) => ({
    ...f,
    index: {
      byteStart: f.index.byteStart - firstByte,
      byteEnd: f.index.byteEnd - firstByte,
    },
  }));

  return { index: chunkIndex, startMs, endMs, text: chunkText, facets: rebasedFacets };
}

Configure Feed

Configure Feed