chore: update the vtt cleaner · dunkirk.sh/thistle@1720635

+240 -52

1 changed file

expand all

src

lib

+240 -52

src/lib/vtt-cleaner.ts

··· 103 103 } 104 104 105 105 /** 106 - * Clean VTT text using AI to create paragraph-separated VTT file. 107 - * Uses OpenRouter API to intelligently group segments into paragraphs 108 - * while preserving timing information. 106 + * Chunk size for VTT processing 109 107 */ 110 - export async function cleanVTT( 111 - transcriptionId: string, 112 - vttContent: string, 113 - ): Promise<string> { 114 - const segments = parseVTT(vttContent); 108 + const CHUNK_SIZE = 40; // Segments per chunk 115 109 116 - if (segments.length === 0) { 117 - return vttContent; 110 + /** 111 + * Find paragraph boundaries in processed VTT content 112 + * Returns the segments in the last paragraph and highest paragraph number found 113 + */ 114 + function extractLastParagraphAndHighestNumber(vttContent: string): { 115 + segments: string, 116 + paragraphNumber: string | null, 117 + highestParagraphNumber: number 118 + } { 119 + if (!vttContent) return { segments: '', paragraphNumber: null, highestParagraphNumber: 0 }; 120 + 121 + // Split into segments (separated by double newline) 122 + const segments = vttContent.split('\n\n').filter(Boolean); 123 + if (segments.length === 0) return { segments: '', paragraphNumber: null, highestParagraphNumber: 0 }; 124 + 125 + // Get all segments from the last paragraph number 126 + const lastSegments: string[] = []; 127 + let currentParagraphNumber: string | null = null; 128 + let highestParagraphNumber = 0; 129 + 130 + // First, scan through all segments to find the highest paragraph number 131 + for (const segment of segments) { 132 + if (!segment) continue; 133 + 134 + const lines = segment.split('\n'); 135 + const firstLine = lines[0] || ''; 136 + 137 + // Check for paragraph number pattern 138 + const paragraphMatch = /Paragraph (\d+)-\d+/.exec(firstLine); 139 + if (paragraphMatch?.[1]) { 140 + const paragraphNum = parseInt(paragraphMatch[1], 10); 141 + if (!Number.isNaN(paragraphNum) && paragraphNum > highestParagraphNumber) { 142 + highestParagraphNumber = paragraphNum; 143 + } 144 + } 118 145 } 119 - 120 - console.log( 121 - `[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`, 122 - ); 123 - 124 - const apiKey = process.env.LLM_API_KEY; 125 - const apiBaseUrl = process.env.LLM_API_BASE_URL; 126 - const model = process.env.LLM_MODEL; 127 146 128 - if (!apiKey || !apiBaseUrl || !model) { 129 - console.warn("[VTTCleaner] LLM configuration incomplete (need LLM_API_KEY, LLM_API_BASE_URL, LLM_MODEL), returning uncleaned VTT"); 130 - return vttContent; 147 + // Start from the end and work backwards to find the last paragraph 148 + for (let i = segments.length - 1; i >= 0; i--) { 149 + const segment = segments[i]; 150 + if (!segment) continue; 151 + 152 + const lines = segment.split('\n'); 153 + const firstLine = lines[0] || ''; 154 + 155 + // Check for paragraph number pattern 156 + const paragraphMatch = /Paragraph (\d+)-\d+/.exec(firstLine); 157 + if (paragraphMatch?.[1]) { 158 + const paragraphNumber = paragraphMatch[1]; 159 + 160 + if (!currentParagraphNumber) { 161 + // This is the first paragraph number we've found working backwards 162 + currentParagraphNumber = paragraphNumber; 163 + lastSegments.unshift(segment); 164 + } else if (paragraphNumber === currentParagraphNumber) { 165 + // Same paragraph, add it 166 + lastSegments.unshift(segment); 167 + } else { 168 + // Different paragraph, we're done 169 + break; 170 + } 171 + } else { 172 + // No paragraph number, but might be part of current paragraph 173 + // Add it if we've already started collecting segments 174 + if (currentParagraphNumber) { 175 + lastSegments.unshift(segment); 176 + } 177 + } 131 178 } 179 + 180 + return { 181 + segments: lastSegments.join('\n\n'), 182 + paragraphNumber: currentParagraphNumber, 183 + highestParagraphNumber 184 + }; 185 + } 132 186 133 - try { 134 - // Build the input for the AI 135 - const inputSegments = segments.map((seg, idx) => ({ 136 - index: idx, 137 - timestamp: seg.timestamp, 138 - text: seg.text, 139 - })); 140 - 141 - const prompt = `Can you turn this into a paragraph separated vtt file? 187 + /** 188 + * Process a chunk of VTT segments using AI 189 + */ 190 + async function processVTTChunk( 191 + transcriptionId: string, 192 + inputSegments: Array<{index: number, timestamp: string, text: string}>, 193 + chunkIndex: number, 194 + previousParagraphNumber: string | null, 195 + apiKey: string, 196 + apiBaseUrl: string, 197 + model: string, 198 + previousParagraphText?: string, 199 + ): Promise<string> { 200 + const chunkId = `${transcriptionId}-chunk${chunkIndex}`; 201 + 202 + const hasTextContext = !!previousParagraphText; 203 + 204 + console.log(`[VTTCleaner] Processing chunk ${chunkIndex} with ${inputSegments.length} segments${hasTextContext ? ' and previous paragraph text context' : ''}`); 205 + 206 + const nextParagraphNumber = previousParagraphNumber ? String(parseInt(previousParagraphNumber, 10) + 1) : '1'; 207 + 208 + const prompt = `Can you turn this into a paragraph separated vtt file? 142 209 143 210 Use the format "Paragraph X-Y" where X is the paragraph number and Y is the segment number within that paragraph: 144 211 ··· 160 227 161 228 I want you to preserve sentences across paragraph breaks moving whatever is the smallest amount out to its own segment block. 162 229 230 + Here are important guidelines for forming paragraphs: 231 + 1. Create a new paragraph when there's a change in topic or speaker. 232 + 2. Don't make paragraphs too long - aim for 4-5 sentences per paragraph maximum. 233 + 3. Group related thoughts together in the same paragraph. 234 + 4. Start a new paragraph when a sentence introduces a completely new idea. 235 + 5. Focus on the number of sentences, not segments, when creating paragraphs. 236 + 6. The number of segments in a paragraph may vary, but keep paragraphs to a reasonable length. 237 + 163 238 Also go through and rewrite the words to extract the meaning and not necessarily the exact phrasing if it sounds unnatural when written. I want the text to remain lined up with the original though so don't rewrite entire paragraphs but you can remove ums, alrights, and similar. Also remove all contextual tags like [background noise]. Add punctuation if it's missing to make the text readable. If there is no more context to fit a segment then just skip it and move to the next one. 164 239 165 - Input segments: 240 + ${hasTextContext ? 241 + `The following is the last paragraph from the previous chunk and is provided for context only. DO NOT include it in your output - it's already in the transcript: 242 + 243 + ${previousParagraphText} 244 + 245 + Now process the following new segments, continuing from the previous paragraph. ${previousParagraphNumber ? `Start your paragraphs with number ${nextParagraphNumber} (unless you're continuing the previous paragraph).` : ''}` 246 + : 'Process the following segments:'} 247 + 166 248 ${JSON.stringify(inputSegments, null, 2)} 167 249 168 - Return ONLY the VTT content starting with "WEBVTT" and nothing else. No explanations or additional text.`; 250 + Return ONLY the VTT content WITHOUT the "WEBVTT" header and nothing else. No explanations or additional text.`; 169 251 252 + try { 170 253 const response = await fetch( 171 254 `${apiBaseUrl}/chat/completions`, 172 255 { ··· 175 258 "Content-Type": "application/json", 176 259 "Authorization": `Bearer ${apiKey}`, 177 260 "HTTP-Referer": "https://thistle.app", 178 - "X-Title": "Thistle Transcription", 261 + "X-Title": `Thistle Transcription Chunk ${chunkIndex}`, 179 262 }, 180 263 body: JSON.stringify({ 181 264 model, ··· 183 266 { role: "user", content: prompt }, 184 267 ], 185 268 temperature: 0.3, 186 - max_tokens: 16384, 269 + max_tokens: 8192, // Reduced for chunks 187 270 }), 188 271 }, 189 272 ); 190 273 191 274 if (!response.ok) { 192 275 const errorText = await response.text(); 193 - console.error(`[VTTCleaner] OpenRouter error for ${transcriptionId}:`, errorText); 194 - console.warn("[VTTCleaner] Falling back to uncleaned VTT"); 195 - return vttContent; 276 + console.error(`[VTTCleaner] OpenRouter error for ${chunkId}:`, errorText); 277 + throw new Error(`API error: ${response.status}`); 196 278 } 197 279 198 280 const result = await response.json(); 199 281 const cleanedVTT = result.choices?.[0]?.message?.content?.trim(); 200 282 201 283 if (!cleanedVTT) { 202 - console.warn("[VTTCleaner] Empty response from AI, returning uncleaned VTT"); 203 - return vttContent; 284 + throw new Error("Empty response from AI"); 204 285 } 205 286 206 287 // Extract VTT content if the model wrapped it in markdown 207 - let finalVTT = cleanedVTT; 288 + let chunkVTT = cleanedVTT; 208 289 if (cleanedVTT.includes("```")) { 209 290 const vttMatch = cleanedVTT.match(/```(?:vtt)?\n([\s\S]*?)```/); 210 291 if (vttMatch?.[1]) { 211 - finalVTT = vttMatch[1].trim(); 292 + chunkVTT = vttMatch[1].trim(); 212 293 } 213 294 } 214 295 215 - // Ensure it starts with WEBVTT 216 - if (!finalVTT.startsWith("WEBVTT")) { 217 - const webvttIndex = finalVTT.indexOf("WEBVTT"); 218 - if (webvttIndex !== -1) { 219 - finalVTT = finalVTT.substring(webvttIndex); 220 - } else { 221 - finalVTT = `WEBVTT\n\n${finalVTT}`; 296 + // Remove WEBVTT header if present (we'll add it once at the end) 297 + if (chunkVTT.startsWith("WEBVTT")) { 298 + const lines = chunkVTT.split("\n"); 299 + // Skip WEBVTT line and any blank lines that follow 300 + let i = 1; 301 + while (i < lines.length && !lines[i]?.trim()) { 302 + i++; 222 303 } 304 + chunkVTT = lines.slice(i).join("\n"); 223 305 } 224 306 307 + console.log(`[VTTCleaner] Successfully processed chunk ${chunkIndex}`); 308 + return chunkVTT; 309 + } catch (error) { 310 + console.error(`[VTTCleaner] Exception in chunk ${chunkIndex}:`, error); 311 + throw error; 312 + } 313 + } 314 + 315 + /** 316 + * Clean VTT text using AI to create paragraph-separated VTT file. 317 + * Uses OpenRouter API to intelligently group segments into paragraphs 318 + * while preserving timing information. Processes sequentially in chunks 319 + * with context from previous chunks to maintain paragraph continuity. 320 + */ 321 + export async function cleanVTT( 322 + transcriptionId: string, 323 + vttContent: string, 324 + ): Promise<string> { 325 + const segments = parseVTT(vttContent); 326 + 327 + if (segments.length === 0) { 328 + return vttContent; 329 + } 330 + 331 + console.log( 332 + `[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`, 333 + ); 334 + 335 + const apiKey = process.env.LLM_API_KEY; 336 + const apiBaseUrl = process.env.LLM_API_BASE_URL; 337 + const model = process.env.LLM_MODEL; 338 + 339 + if (!apiKey || !apiBaseUrl || !model) { 340 + console.warn("[VTTCleaner] LLM configuration incomplete (need LLM_API_KEY, LLM_API_BASE_URL, LLM_MODEL), returning uncleaned VTT"); 341 + return vttContent; 342 + } 343 + 344 + try { 345 + // Build the input segments 346 + const inputSegments = segments.map((seg, idx) => ({ 347 + index: idx, 348 + timestamp: seg.timestamp, 349 + text: seg.text, 350 + })); 351 + 352 + // Prepare chunks for sequential processing 353 + const chunks: Array<typeof inputSegments> = []; 354 + for (let i = 0; i < inputSegments.length; i += CHUNK_SIZE) { 355 + // Don't go beyond array bounds 356 + const end = Math.min(i + CHUNK_SIZE, inputSegments.length); 357 + chunks.push(inputSegments.slice(i, end)); 358 + } 359 + 360 + console.log(`[VTTCleaner] Split into ${chunks.length} chunks for sequential processing with paragraph context`); 361 + 362 + // Process chunks sequentially with context from previous chunk 363 + const processedChunks: string[] = []; 364 + let previousParagraphText: string | undefined; 365 + let previousParagraphNumber: string | null = null; 366 + 367 + for (let i = 0; i < chunks.length; i++) { 368 + const chunk = chunks[i]; 369 + if (!chunk || chunk.length === 0) continue; 370 + 371 + try { 372 + const processedChunk = await processVTTChunk( 373 + transcriptionId, 374 + chunk, 375 + i, 376 + previousParagraphNumber, 377 + apiKey, 378 + apiBaseUrl, 379 + model, 380 + previousParagraphText 381 + ); 382 + processedChunks.push(processedChunk); 383 + console.log(`[VTTCleaner] Completed chunk ${i}/${chunks.length - 1}${previousParagraphText ? ' (with context)' : ''}`); 384 + 385 + // Extract context for the next chunk 386 + if (i < chunks.length - 1) { 387 + const { segments: lastParagraphText, paragraphNumber, highestParagraphNumber } = extractLastParagraphAndHighestNumber(processedChunk); 388 + 389 + if (lastParagraphText) { 390 + console.log(`[VTTCleaner] Using paragraph ${paragraphNumber || 'unknown'} as context for next chunk (highest paragraph: ${highestParagraphNumber})`); 391 + previousParagraphText = lastParagraphText; 392 + previousParagraphNumber = highestParagraphNumber.toString(); 393 + } else { 394 + previousParagraphText = undefined; 395 + previousParagraphNumber = null; 396 + } 397 + } 398 + } catch (error) { 399 + console.error(`[VTTCleaner] Chunk ${i} failed:`, error); 400 + // Return the original segments for this chunk if processing fails 401 + const fallbackChunk = chunk.map(seg => 402 + `${seg.index || ''}\n${seg.timestamp}\n${seg.text}` 403 + ).join('\n\n'); 404 + processedChunks.push(fallbackChunk); 405 + previousParagraphText = undefined; 406 + previousParagraphNumber = null; 407 + } 408 + } 409 + 410 + // Combine all processed chunks 411 + const finalVTT = `WEBVTT\n\n${processedChunks.join('\n\n')}`; 412 + 225 413 console.log( 226 - `[VTTCleaner] Successfully cleaned ${segments.length} segments using AI`, 414 + `[VTTCleaner] Successfully cleaned ${segments.length} segments in ${chunks.length} sequential chunks with paragraph context`, 227 415 ); 228 416 229 417 return finalVTT; 230 - } catch (err) { 231 - console.error("[VTTCleaner] Exception:", err); 418 + } catch (error) { 419 + console.error("[VTTCleaner] Exception:", error); 232 420 console.warn("[VTTCleaner] Falling back to uncleaned VTT"); 233 421 return vttContent; 234 422 } 235 - } 423 + }

Configure Feed

Configure Feed