The AtmosphereConf talks your skyline missed
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 115 lines 3.4 kB view raw
import type { Utterance, TranscriptSegment } from "./types";

/** Sentence boundary: whitespace that directly follows `.`, `!` or `?`. */
const SENTENCE_BOUNDARY = /(?<=[.!?])\s+/;

/** Token boundary inside a sentence. */
const TOKEN_BOUNDARY = /\s+/;

/** How many timed words may be skipped while re-synchronising after a mismatch. */
const MAX_WORD_SKIP = 2;

/**
 * Strip trailing punctuation and leading quotes/brackets, then lowercase,
 * so that text tokens and timed words can be compared loosely.
 */
function normalize(s: string): string {
  return s
    .replace(/[.,!?;:'")\]]+$/, "")
    .replace(/^['"(\[]+/, "")
    .toLowerCase();
}

/**
 * Walk `tokens` against `words` starting at `startWord` and return the index
 * of the first word that was NOT consumed by the sentence.
 *
 * A token matches the current word when both normalize to the same string or
 * one is a prefix of the other. On a mismatch, up to MAX_WORD_SKIP words ahead
 * are checked for an exact match; if none is found the current word is
 * consumed anyway so the walk keeps moving.
 */
function matchTokens(
  tokens: readonly string[],
  words: ReadonlyArray<{ text: string }>,
  startWord: number,
): number {
  let wordPtr = startWord;

  for (const token of tokens) {
    const word = words[wordPtr];
    if (word === undefined) break; // ran out of timed words

    const normalizedToken = normalize(token);
    const normalizedWord = normalize(word.text);

    // A punctuation-only token (e.g. "...") normalizes to "". It must not act
    // as a wildcard: only consume a word that is itself punctuation-only.
    if (normalizedToken === "") {
      if (normalizedWord === "") wordPtr++;
      continue;
    }

    // Prefix matching requires a non-empty word, since every string starts
    // with "" and an empty word would otherwise match any token.
    const matches =
      normalizedToken === normalizedWord ||
      (normalizedWord !== "" &&
        (normalizedWord.startsWith(normalizedToken) ||
          normalizedToken.startsWith(normalizedWord)));

    if (matches) {
      wordPtr++;
      continue;
    }

    // Skip ahead a little to handle minor mismatches between text and words.
    let found = false;
    for (
      let skip = 1;
      skip <= MAX_WORD_SKIP && wordPtr + skip < words.length;
      skip++
    ) {
      const candidate = words[wordPtr + skip];
      if (
        candidate !== undefined &&
        normalize(candidate.text) === normalizedToken
      ) {
        wordPtr += skip + 1;
        found = true;
        break;
      }
    }
    if (!found) {
      wordPtr++;
    }
  }

  return wordPtr;
}

/**
 * Split utterances into sentence-level segments with per-sentence timestamps.
 *
 * Each utterance's text is split into sentences. When the utterance has a
 * `words` array and more than one sentence, the words are walked sequentially
 * and each sentence takes its start/end from the first and last word it
 * consumed. A sentence that consumed no words (for example because the words
 * array is shorter than the text) gets timestamps interpolated from its
 * character position inside the utterance text.
 *
 * Utterances with a single sentence, or without words, become one segment
 * carrying the utterance's own text, start and end unchanged.
 *
 * @param utterances Utterances to split; a missing or empty array yields `[]`.
 * @returns Segments in input order, with ids of the form `u<utterance>-s<sentence>`.
 *   Timestamps of split sentences are rounded to integers; they are in the
 *   same unit as `utterance.start` / `utterance.end` (presumably milliseconds,
 *   given the `startMs` / `endMs` field names — confirm against the types).
 */
export function splitUtterances(
  utterances: Utterance[],
): TranscriptSegment[] {
  const segments: TranscriptSegment[] = [];

  if (!utterances || utterances.length === 0) return segments;

  for (const [uIdx, utterance] of utterances.entries()) {
    const sentences = utterance.text.split(SENTENCE_BOUNDARY).filter(Boolean);

    if (sentences.length === 0) continue;

    const words = utterance.words;

    // If only one sentence or no words, treat entire utterance as one segment
    if (sentences.length === 1 || !words || words.length === 0) {
      segments.push({
        id: `u${uIdx}-s0`,
        speaker: utterance.speaker,
        text: utterance.text,
        startMs: utterance.start,
        endMs: utterance.end,
      });
      continue;
    }

    // Non-zero here: more than one sentence implies non-empty text.
    const totalChars = utterance.text.length;
    const duration = utterance.end - utterance.start;

    let wordPtr = 0;
    // Position in utterance.text just past the previous sentence; used to
    // locate each sentence's true character offset (including separators).
    let textCursor = 0;

    for (const [sIdx, sentence] of sentences.entries()) {
      const located = utterance.text.indexOf(sentence, textCursor);
      const charStart = located === -1 ? textCursor : located;
      const charEnd = Math.min(charStart + sentence.length, totalChars);
      textCursor = charEnd;

      const tokens = sentence.split(TOKEN_BOUNDARY).filter(Boolean);
      const startWord = wordPtr;
      wordPtr = matchTokens(tokens, words, startWord);

      let startMs: number;
      let endMs: number;

      if (startWord < words.length && wordPtr > startWord) {
        // Derive timestamps from the words this sentence consumed.
        startMs = words[startWord].start;
        endMs = words[Math.min(wordPtr - 1, words.length - 1)].end;
      } else {
        // Fallback: interpolate proportionally to the sentence's real
        // character span within the utterance text.
        startMs = utterance.start + (charStart / totalChars) * duration;
        endMs = utterance.start + (charEnd / totalChars) * duration;
      }

      segments.push({
        id: `u${uIdx}-s${sIdx}`,
        speaker: utterance.speaker,
        text: sentence,
        startMs: Math.round(startMs),
        endMs: Math.round(endMs),
      });
    }
  }

  return segments;
}