The AtmosphereConf talks your skyline missed
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 208 lines 5.8 kB view raw
import "dotenv/config";
import { AssemblyAI } from "assemblyai";
import { execFile } from "child_process";
import * as fs from "fs";
import * as os from "os";
import * as path from "path";
import { promisify } from "util";

/** Promise-returning `execFile`, so waiting on ffmpeg does not block the event loop. */
const execFileAsync = promisify(execFile);

const REPO_DID = "did:plc:rbvrr34edl5ddpuwcubjiost";
const PDS_HOST = "https://iameli.com";
const VOD_HOST = "https://vod-beta.stream.place";
const COLLECTION = "place.stream.video";
const TRANSCRIPT_DIR = path.resolve(__dirname, "../data/transcripts");
const BATCH_SIZE = 10;

/** Upper bound on a single ffmpeg run; the child is killed when it is exceeded. */
const FFMPEG_TIMEOUT_MS = 600_000;
/** Room for ffmpeg's captured stdout/stderr so a chatty log does not abort the run. */
const FFMPEG_MAX_BUFFER_BYTES = 64 * 1024 * 1024;

/** One entry of the `place.stream.video` collection as returned by `listRecords`. */
interface VideoRecord {
  uri: string;
  cid: string;
  value: {
    $type: string;
    title: string;
    source: {
      ref: string;
      size: number;
      $type: string;
      mimeType: string;
      start?: number;
      end?: number;
    };
    creator: string;
    duration: number;
    createdAt: string;
  };
}

/** One page of a `com.atproto.repo.listRecords` response. */
interface ListRecordsResponse {
  records: VideoRecord[];
  cursor?: string;
}

/**
 * Pages through `com.atproto.repo.listRecords` on the PDS until no cursor is
 * returned and collects every record of the video collection.
 *
 * @returns All records, in the order the server returned them.
 * @throws Error when a page request fails or its body has no `records` array.
 */
async function fetchAllVideoRecords(): Promise<VideoRecord[]> {
  const records: VideoRecord[] = [];
  let cursor: string | undefined;

  do {
    const params = new URLSearchParams({
      repo: REPO_DID,
      collection: COLLECTION,
      limit: "100",
    });
    if (cursor) params.set("cursor", cursor);

    const url = `${PDS_HOST}/xrpc/com.atproto.repo.listRecords?${params.toString()}`;
    const res = await fetch(url);
    if (!res.ok) throw new Error(`Failed to fetch records: ${res.status}`);

    // The body is external input: the cast only names the expected shape, so
    // check the one field we iterate over before trusting it.
    const data = (await res.json()) as ListRecordsResponse;
    if (!Array.isArray(data.records)) {
      throw new Error("Failed to fetch records: response has no records array");
    }

    records.push(...data.records);
    cursor = data.cursor;
  } while (cursor);

  return records;
}

/**
 * Tells individual talk VODs apart from full-day room streams.
 *
 * Full-day room streams are created by the repo owner (Streamplace itself).
 * Individual talk VODs are created by different DIDs (room-specific stream accounts).
 */
function isIndividualTalkVOD(record: VideoRecord): boolean {
  return record.value.creator !== REPO_DID;
}

/** Returns the last `/`-separated segment of a record URI (the whole string if it has no `/`). */
function getRkey(uri: string): string {
  return uri.slice(uri.lastIndexOf("/") + 1);
}

/** Builds the VOD host's `getVideoPlaylist` URL for a record URI. */
function getPlaylistUrl(uri: string): string {
  return `${VOD_HOST}/xrpc/place.stream.playback.getVideoPlaylist?uri=${encodeURIComponent(uri)}`;
}

/**
 * Runs ffmpeg to pull the audio track of a playlist into `outputPath`.
 *
 * The child process is awaited asynchronously, so several extractions started
 * from the same batch really do overlap.
 *
 * @param playlistUrl Playlist URL handed to ffmpeg as its input.
 * @param outputPath Destination file; overwritten if present (`-y`).
 * @throws Error when ffmpeg cannot be started, exits non-zero, or times out.
 */
async function extractAudio(
  playlistUrl: string,
  outputPath: string,
): Promise<void> {
  console.log(` Extracting audio to ${path.basename(outputPath)}...`);
  await execFileAsync(
    "ffmpeg",
    // -nostdin: the child's stdin pipe is never written to, so disable
    // ffmpeg's interactive key handling explicitly.
    ["-nostdin", "-y", "-i", playlistUrl, "-vn", "-q:a", "2", outputPath],
    { timeout: FFMPEG_TIMEOUT_MS, maxBuffer: FFMPEG_MAX_BUFFER_BYTES },
  );
}

/**
 * Extracts, transcribes and stores a single talk.
 *
 * Does nothing when `<rkey>.json` already exists in the transcript directory.
 * The temporary mp3 is removed whether or not the talk succeeds.
 *
 * @throws Error when extraction fails, the transcript comes back with status
 *   `"error"`, or the result cannot be written.
 */
async function transcribeTalk(
  client: AssemblyAI,
  record: VideoRecord,
): Promise<void> {
  const { title } = record.value;
  const rkey = getRkey(record.uri);
  const outputPath = path.join(TRANSCRIPT_DIR, `${rkey}.json`);

  // Skip if already transcribed
  if (fs.existsSync(outputPath)) {
    console.log(` [skip] ${title} — already transcribed`);
    return;
  }

  const mp3Path = path.join(os.tmpdir(), `${rkey}.mp3`);

  try {
    await extractAudio(getPlaylistUrl(record.uri), mp3Path);

    console.log(` Transcribing: ${title}...`);
    // NOTE(review): the cast bypasses the SDK's parameter typing, presumably
    // because `speech_models` is missing from the installed typings — confirm
    // and drop the cast once the SDK types cover it.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const transcript = await client.transcripts.transcribe({
      audio: mp3Path,
      speaker_labels: true,
      speech_models: ["universal-3-pro", "universal-2"],
    } as any);

    // Surface the failure to the caller instead of returning as if the talk
    // had been handled; no output file is written, so a re-run retries it.
    if (transcript.status === "error") {
      throw new Error(`transcription failed: ${transcript.error}`);
    }

    const result = {
      uri: record.uri,
      cid: record.cid,
      title,
      creator: record.value.creator,
      duration: record.value.duration,
      createdAt: record.value.createdAt,
      transcription: {
        id: transcript.id,
        status: transcript.status,
        text: transcript.text,
        utterances: transcript.utterances,
        words: transcript.words,
        audio_duration: transcript.audio_duration,
      },
    };

    // Write to a sibling file and rename, so an interrupted write never leaves
    // a truncated `<rkey>.json` that the skip check would treat as finished.
    const partialPath = `${outputPath}.tmp`;
    fs.writeFileSync(partialPath, JSON.stringify(result, null, 2));
    fs.renameSync(partialPath, outputPath);

    console.log(` [done] ${title} (${transcript.audio_duration}s)`);
  } finally {
    // Clean up mp3 (force: no error if extraction never created it)
    fs.rmSync(mp3Path, { force: true });
  }
}

/**
 * Transcribes every record of a batch concurrently.
 *
 * A failing talk is logged and counted; it does not stop the other talks of
 * the batch.
 *
 * @returns The number of talks in the batch that failed.
 */
async function processBatch(
  client: AssemblyAI,
  records: VideoRecord[],
): Promise<number> {
  const succeeded = await Promise.all(
    records.map(async (record) => {
      try {
        await transcribeTalk(client, record);
        return true;
      } catch (err: unknown) {
        const reason = err instanceof Error ? err.message : String(err);
        console.error(` [error] ${record.value.title}: ${reason}`);
        return false;
      }
    }),
  );
  return succeeded.filter((ok) => !ok).length;
}

/**
 * Entry point: lists the video records, keeps the individual talks that have
 * no transcript file yet, and transcribes them in batches of `BATCH_SIZE`.
 * Sets a non-zero exit code when any talk failed.
 */
async function main(): Promise<void> {
  const apiKey = process.env.ASSEMBLYAI_API_KEY;
  if (!apiKey) {
    console.error("ASSEMBLYAI_API_KEY not set");
    process.exit(1);
  }

  const client = new AssemblyAI({ apiKey });

  // Ensure output directory exists
  fs.mkdirSync(TRANSCRIPT_DIR, { recursive: true });

  // Fetch all video records
  console.log("Fetching video records...");
  const allRecords = await fetchAllVideoRecords();
  console.log(`Found ${allRecords.length} total video records`);

  // Filter to individual talk VODs
  const talks = allRecords.filter(isIndividualTalkVOD);
  console.log(`Filtered to ${talks.length} individual talk VODs`);
  console.log(
    `Skipped ${allRecords.length - talks.length} full-day room streams`,
  );

  // Check which ones are already done
  const remaining = talks.filter(
    (t) => !fs.existsSync(path.join(TRANSCRIPT_DIR, `${getRkey(t.uri)}.json`)),
  );
  console.log(
    `${talks.length - remaining.length} already transcribed, ${remaining.length} remaining\n`,
  );

  // Process in batches of BATCH_SIZE
  const totalBatches = Math.ceil(remaining.length / BATCH_SIZE);
  let failures = 0;
  for (let i = 0; i < remaining.length; i += BATCH_SIZE) {
    const batch = remaining.slice(i, i + BATCH_SIZE);
    const batchNum = Math.floor(i / BATCH_SIZE) + 1;
    console.log(
      `\n=== Batch ${batchNum}/${totalBatches} (${batch.length} talks) ===`,
    );
    batch.forEach((r) => console.log(` - ${r.value.title}`));
    console.log();

    failures += await processBatch(client, batch);
  }

  if (failures > 0) {
    // Failed talks have no output file, so the next run picks them up again.
    console.error(
      `\nDone with errors: ${failures} of ${remaining.length} talks failed; re-run to retry them.`,
    );
    process.exitCode = 1;
    return;
  }

  console.log("\nDone! All talks transcribed.");
}

main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});