···103103}
104104105105/**
106106- * Clean VTT text using AI to create paragraph-separated VTT file.
107107- * Uses OpenRouter API to intelligently group segments into paragraphs
108108- * while preserving timing information.
106106+ * Chunk size for VTT processing
109107 */
110110-export async function cleanVTT(
111111- transcriptionId: string,
112112- vttContent: string,
113113-): Promise<string> {
114114- const segments = parseVTT(vttContent);
108108+const CHUNK_SIZE = 40; // Segments per chunk
115109116116- if (segments.length === 0) {
117117- return vttContent;
/**
 * Locate the final paragraph of an already-processed VTT chunk.
 *
 * The content is split into cue blocks on blank lines. A block belongs to a
 * paragraph when its first line contains a "Paragraph X-Y" label; X is the
 * paragraph number.
 *
 * @param vttContent Processed cue text to scan.
 * @returns An object with:
 *   - `segments`: the cue blocks of the last labelled paragraph, joined by a
 *     blank line (empty string when no label is found);
 *   - `paragraphNumber`: the X of that last paragraph as it appears in the
 *     label, or `null` when no label is found;
 *   - `highestParagraphNumber`: the largest X seen anywhere, or 0 if none.
 */
function extractLastParagraphAndHighestNumber(vttContent: string): {
  segments: string,
  paragraphNumber: string | null,
  highestParagraphNumber: number
} {
  if (!vttContent) return { segments: '', paragraphNumber: null, highestParagraphNumber: 0 };

  // Split on any run of blank lines. Accepting CRLF, whitespace-only lines and
  // more than one blank line keeps a block from starting with an empty line,
  // which would hide its "Paragraph X-Y" label from the first-line check below.
  const segments = vttContent
    .split(/\r?\n(?:[ \t]*\r?\n)+/)
    .map((block) => block.trim())
    .filter(Boolean);
  if (segments.length === 0) return { segments: '', paragraphNumber: null, highestParagraphNumber: 0 };

  // Paragraph label (the X of "Paragraph X-Y") of each block, or null.
  const labelPattern = /Paragraph (\d+)-\d+/;
  const labels = segments.map((block) => {
    const firstLine = block.split(/\r?\n/, 1)[0] ?? '';
    return labelPattern.exec(firstLine)?.[1] ?? null;
  });

  // Highest paragraph number across the whole content.
  let highestParagraphNumber = 0;
  for (const label of labels) {
    if (label === null) continue;
    const paragraphNum = parseInt(label, 10);
    if (!Number.isNaN(paragraphNum) && paragraphNum > highestParagraphNumber) {
      highestParagraphNumber = paragraphNum;
    }
  }

  // Walk backwards collecting the blocks of the last labelled paragraph.
  // Unlabelled blocks after the last label are ignored; unlabelled blocks
  // encountered once collection has started are kept with that paragraph.
  const collected: string[] = [];
  let currentParagraphNumber: string | null = null;
  for (let i = segments.length - 1; i >= 0; i--) {
    const segment = segments[i];
    if (segment === undefined) continue;
    const label = labels[i] ?? null;

    if (label !== null) {
      if (currentParagraphNumber === null) {
        currentParagraphNumber = label;
      } else if (label !== currentParagraphNumber) {
        // Reached the preceding paragraph: done.
        break;
      }
      collected.push(segment);
    } else if (currentParagraphNumber !== null) {
      collected.push(segment);
    }
  }
  collected.reverse();

  return {
    segments: collected.join('\n\n'),
    paragraphNumber: currentParagraphNumber,
    highestParagraphNumber
  };
}
132186133133- try {
134134- // Build the input for the AI
135135- const inputSegments = segments.map((seg, idx) => ({
136136- index: idx,
137137- timestamp: seg.timestamp,
138138- text: seg.text,
139139- }));
140140-141141- const prompt = `Can you turn this into a paragraph separated vtt file?
187187+/**
188188+ * Process a chunk of VTT segments using AI
189189+ */
190190+async function processVTTChunk(
191191+ transcriptionId: string,
192192+ inputSegments: Array<{index: number, timestamp: string, text: string}>,
193193+ chunkIndex: number,
194194+ previousParagraphNumber: string | null,
195195+ apiKey: string,
196196+ apiBaseUrl: string,
197197+ model: string,
198198+ previousParagraphText?: string,
199199+): Promise<string> {
200200+ const chunkId = `${transcriptionId}-chunk${chunkIndex}`;
201201+202202+ const hasTextContext = !!previousParagraphText;
203203+204204+ console.log(`[VTTCleaner] Processing chunk ${chunkIndex} with ${inputSegments.length} segments${hasTextContext ? ' and previous paragraph text context' : ''}`);
205205+206206+ const nextParagraphNumber = previousParagraphNumber ? String(parseInt(previousParagraphNumber, 10) + 1) : '1';
207207+208208+ const prompt = `Can you turn this into a paragraph separated vtt file?
142209143210Use the format "Paragraph X-Y" where X is the paragraph number and Y is the segment number within that paragraph:
144211···160227161228I want you to preserve sentences across paragraph breaks moving whatever is the smallest amount out to its own segment block.
162229230230+Here are important guidelines for forming paragraphs:
231231+1. Create a new paragraph when there's a change in topic or speaker.
232232+2. Don't make paragraphs too long - aim for 4-5 sentences per paragraph maximum.
233233+3. Group related thoughts together in the same paragraph.
234234+4. Start a new paragraph when a sentence introduces a completely new idea.
235235+5. Focus on the number of sentences, not segments, when creating paragraphs.
236236+6. The number of segments in a paragraph may vary, but keep paragraphs to a reasonable length.
237237+163238Also go through and rewrite the words to extract the meaning and not necessarily the exact phrasing if it sounds unnatural when written. I want the text to remain lined up with the original though so don't rewrite entire paragraphs but you can remove ums, alrights, and similar. Also remove all contextual tags like [background noise]. Add punctuation if it's missing to make the text readable. If there is no more context to fit a segment then just skip it and move to the next one.
164239165165-Input segments:
240240+${hasTextContext ?
241241+`The following is the last paragraph from the previous chunk and is provided for context only. DO NOT include it in your output - it's already in the transcript:
242242+243243+${previousParagraphText}
244244+245245+Now process the following new segments, continuing from the previous paragraph. ${previousParagraphNumber ? `Start your paragraphs with number ${nextParagraphNumber} (unless you're continuing the previous paragraph).` : ''}`
246246+: 'Process the following segments:'}
247247+166248${JSON.stringify(inputSegments, null, 2)}
167249168168-Return ONLY the VTT content starting with "WEBVTT" and nothing else. No explanations or additional text.`;
250250+Return ONLY the VTT content WITHOUT the "WEBVTT" header and nothing else. No explanations or additional text.`;
169251252252+ try {
170253 const response = await fetch(
171254 `${apiBaseUrl}/chat/completions`,
172255 {
···175258 "Content-Type": "application/json",
176259 "Authorization": `Bearer ${apiKey}`,
177260 "HTTP-Referer": "https://thistle.app",
178178- "X-Title": "Thistle Transcription",
261261+ "X-Title": `Thistle Transcription Chunk ${chunkIndex}`,
179262 },
180263 body: JSON.stringify({
181264 model,
···183266 { role: "user", content: prompt },
184267 ],
185268 temperature: 0.3,
186186- max_tokens: 16384,
269269+ max_tokens: 8192, // Reduced for chunks
187270 }),
188271 },
189272 );
190273191274 if (!response.ok) {
192275 const errorText = await response.text();
193193- console.error(`[VTTCleaner] OpenRouter error for ${transcriptionId}:`, errorText);
194194- console.warn("[VTTCleaner] Falling back to uncleaned VTT");
195195- return vttContent;
276276+ console.error(`[VTTCleaner] OpenRouter error for ${chunkId}:`, errorText);
277277+ throw new Error(`API error: ${response.status}`);
196278 }
197279198280 const result = await response.json();
199281 const cleanedVTT = result.choices?.[0]?.message?.content?.trim();
200282201283 if (!cleanedVTT) {
202202- console.warn("[VTTCleaner] Empty response from AI, returning uncleaned VTT");
203203- return vttContent;
284284+ throw new Error("Empty response from AI");
204285 }
205286206287 // Extract VTT content if the model wrapped it in markdown
207207- let finalVTT = cleanedVTT;
288288+ let chunkVTT = cleanedVTT;
208289 if (cleanedVTT.includes("```")) {
209290 const vttMatch = cleanedVTT.match(/```(?:vtt)?\n([\s\S]*?)```/);
210291 if (vttMatch?.[1]) {
211211- finalVTT = vttMatch[1].trim();
292292+ chunkVTT = vttMatch[1].trim();
212293 }
213294 }
214295215215- // Ensure it starts with WEBVTT
216216- if (!finalVTT.startsWith("WEBVTT")) {
217217- const webvttIndex = finalVTT.indexOf("WEBVTT");
218218- if (webvttIndex !== -1) {
219219- finalVTT = finalVTT.substring(webvttIndex);
220220- } else {
221221- finalVTT = `WEBVTT\n\n${finalVTT}`;
296296+ // Remove WEBVTT header if present (we'll add it once at the end)
297297+ if (chunkVTT.startsWith("WEBVTT")) {
298298+ const lines = chunkVTT.split("\n");
299299+ // Skip WEBVTT line and any blank lines that follow
300300+ let i = 1;
301301+ while (i < lines.length && !lines[i]?.trim()) {
302302+ i++;
222303 }
304304+ chunkVTT = lines.slice(i).join("\n");
223305 }
224306307307+ console.log(`[VTTCleaner] Successfully processed chunk ${chunkIndex}`);
308308+ return chunkVTT;
309309+ } catch (error) {
310310+ console.error(`[VTTCleaner] Exception in chunk ${chunkIndex}:`, error);
311311+ throw error;
312312+ }
313313+}

/**
 * Clean VTT text using AI to create a paragraph-separated VTT file.
 *
 * The parsed segments are split into chunks of CHUNK_SIZE and sent to the
 * model one chunk at a time. After each successful chunk, its last paragraph
 * and highest paragraph number are passed to the next chunk as context so
 * paragraph numbering can continue across chunk boundaries.
 *
 * Fallbacks:
 * - no parsed segments, or LLM_API_KEY / LLM_API_BASE_URL / LLM_MODEL missing:
 *   the input is returned unchanged;
 * - a single chunk fails: that chunk's original segments are emitted as-is and
 *   the context for the next chunk is reset;
 * - every chunk fails, or an unexpected exception occurs: the input is
 *   returned unchanged.
 *
 * @param transcriptionId Identifier used in log messages and chunk ids.
 * @param vttContent Raw VTT text to clean.
 * @returns The cleaned VTT (starting with "WEBVTT"), or `vttContent` on fallback.
 */
export async function cleanVTT(
  transcriptionId: string,
  vttContent: string,
): Promise<string> {
  const segments = parseVTT(vttContent);

  if (segments.length === 0) {
    return vttContent;
  }

  console.log(
    `[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
  );

  const apiKey = process.env.LLM_API_KEY;
  const apiBaseUrl = process.env.LLM_API_BASE_URL;
  const model = process.env.LLM_MODEL;

  if (!apiKey || !apiBaseUrl || !model) {
    console.warn("[VTTCleaner] LLM configuration incomplete (need LLM_API_KEY, LLM_API_BASE_URL, LLM_MODEL), returning uncleaned VTT");
    return vttContent;
  }

  try {
    // Build the input segments
    const inputSegments = segments.map((seg, idx) => ({
      index: idx,
      timestamp: seg.timestamp,
      text: seg.text,
    }));

    // Prepare chunks for sequential processing (slice clamps at the array end)
    const chunks: Array<typeof inputSegments> = [];
    for (let start = 0; start < inputSegments.length; start += CHUNK_SIZE) {
      chunks.push(inputSegments.slice(start, start + CHUNK_SIZE));
    }

    console.log(`[VTTCleaner] Split into ${chunks.length} chunks for sequential processing with paragraph context`);

    // Process chunks sequentially with context from the previous chunk
    const processedChunks: string[] = [];
    let previousParagraphText: string | undefined;
    let previousParagraphNumber: string | null = null;
    let failedChunks = 0;

    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i];
      if (!chunk || chunk.length === 0) continue;

      try {
        const processedChunk = await processVTTChunk(
          transcriptionId,
          chunk,
          i,
          previousParagraphNumber,
          apiKey,
          apiBaseUrl,
          model,
          previousParagraphText
        );
        processedChunks.push(processedChunk);
        console.log(`[VTTCleaner] Completed chunk ${i}/${chunks.length - 1}${previousParagraphText ? ' (with context)' : ''}`);

        // Extract context for the next chunk (not needed after the last one)
        if (i < chunks.length - 1) {
          const { segments: lastParagraphText, paragraphNumber, highestParagraphNumber } = extractLastParagraphAndHighestNumber(processedChunk);

          if (lastParagraphText) {
            console.log(`[VTTCleaner] Using paragraph ${paragraphNumber || 'unknown'} as context for next chunk (highest paragraph: ${highestParagraphNumber})`);
            previousParagraphText = lastParagraphText;
            previousParagraphNumber = highestParagraphNumber.toString();
          } else {
            previousParagraphText = undefined;
            previousParagraphNumber = null;
          }
        }
      } catch (error) {
        console.error(`[VTTCleaner] Chunk ${i} failed:`, error);
        failedChunks++;
        // Emit the original segments for this chunk. The index is written
        // unconditionally so that segment 0 keeps its identifier instead of
        // producing an empty first line.
        const fallbackChunk = chunk.map(seg =>
          `${seg.index}\n${seg.timestamp}\n${seg.text}`
        ).join('\n\n');
        processedChunks.push(fallbackChunk);
        previousParagraphText = undefined;
        previousParagraphNumber = null;
      }
    }

    // Nothing was actually cleaned: hand back the untouched input rather than
    // a rebuilt copy of it reported as a success.
    if (failedChunks === chunks.length) {
      console.warn("[VTTCleaner] All chunks failed, falling back to uncleaned VTT");
      return vttContent;
    }

    // Combine all processed chunks under a single header
    const finalVTT = `WEBVTT\n\n${processedChunks.join('\n\n')}`;

    console.log(
      `[VTTCleaner] Successfully cleaned ${segments.length} segments in ${chunks.length} sequential chunks with paragraph context`,
    );

    return finalVTT;
  } catch (error) {
    console.error("[VTTCleaner] Exception:", error);
    console.warn("[VTTCleaner] Falling back to uncleaned VTT");
    return vttContent;
  }
}