The AtmosphereConf talks your skyline missed
import "dotenv/config";
import { AssemblyAI } from "assemblyai";
import { execFile, execFileSync } from "child_process";
import * as fs from "fs";
import * as os from "os";
import * as path from "path";
import { promisify } from "util";
6
// ---------------------------------------------------------------------------
// Configuration
// ---------------------------------------------------------------------------

/** DID passed as `repo` to listRecords; also compared against each record's creator. */
const REPO_DID = "did:plc:rbvrr34edl5ddpuwcubjiost";

/** Base URL of the server that answers `com.atproto.repo.listRecords`. */
const PDS_HOST = "https://iameli.com";

/** Base URL of the server that answers `place.stream.playback.getVideoPlaylist`. */
const VOD_HOST = "https://vod-beta.stream.place";

/** Collection name passed to listRecords. */
const COLLECTION = "place.stream.video";

/** Directory that receives one `<rkey>.json` transcript file per talk. */
const TRANSCRIPT_DIR = path.resolve(__dirname, "..", "data", "transcripts");

/** Number of talks handed to one batch. */
const BATCH_SIZE = 10;
13
/** The `source` object nested inside a video record's value. */
interface VideoSource {
  $type: string;
  ref: string;
  mimeType: string;
  size: number;
  /** Optional; not present on every record. */
  start?: number;
  /** Optional; not present on every record. */
  end?: number;
}

/** The `value` payload of a video record. */
interface VideoRecordValue {
  $type: string;
  title: string;
  creator: string;
  duration: number;
  createdAt: string;
  source: VideoSource;
}

/** One entry of the `records` array returned by listRecords. */
interface VideoRecord {
  uri: string;
  cid: string;
  value: VideoRecordValue;
}
33
/** JSON body of one listRecords page. */
interface ListRecordsResponse {
  /** Pagination token; when absent, the caller stops requesting pages. */
  cursor?: string;
  /** Records contained in this page. */
  records: VideoRecord[];
}
38
/**
 * Downloads every record of COLLECTION from REPO_DID, following the
 * pagination cursor until the server stops returning one.
 *
 * @returns All records from all pages, in the order they were received.
 * @throws Error when any page request answers with a non-OK HTTP status.
 */
async function fetchAllVideoRecords(): Promise<VideoRecord[]> {
  const endpoint = `${PDS_HOST}/xrpc/com.atproto.repo.listRecords`;
  const collected: VideoRecord[] = [];
  let cursor: string | undefined;

  for (;;) {
    const query = new URLSearchParams({
      repo: REPO_DID,
      collection: COLLECTION,
      limit: "100",
    });
    // The first request carries no cursor; later ones resume where the
    // previous page ended.
    if (cursor) query.set("cursor", cursor);

    const response = await fetch(`${endpoint}?${query}`);
    if (!response.ok) {
      throw new Error(`Failed to fetch records: ${response.status}`);
    }

    const page: ListRecordsResponse = await response.json();
    collected.push(...page.records);

    cursor = page.cursor;
    if (!cursor) break;
  }

  return collected;
}
62
/**
 * Tells individual talk recordings apart from full-day room streams.
 *
 * Full-day room streams are created by the repo owner itself, whereas
 * individual talk VODs come from other DIDs, so a record counts as a talk
 * exactly when its creator is not REPO_DID.
 */
function isIndividualTalkVOD(record: VideoRecord): boolean {
  return record.value.creator !== REPO_DID;
}
69
/**
 * Returns the part of `uri` that follows its last "/".
 *
 * A string without any "/" is returned unchanged; a string ending in "/"
 * yields the empty string.
 */
function getRkey(uri: string): string {
  const lastSlash = uri.lastIndexOf("/");
  return uri.slice(lastSlash + 1);
}
73
/**
 * Builds the getVideoPlaylist URL on VOD_HOST for the given record URI.
 * The URI is percent-encoded before being placed in the query string.
 */
function getPlaylistUrl(uri: string): string {
  const endpoint = `${VOD_HOST}/xrpc/place.stream.playback.getVideoPlaylist`;
  const encodedUri = encodeURIComponent(uri);
  return `${endpoint}?uri=${encodedUri}`;
}
77
/**
 * Runs ffmpeg to save the audio of `playlistUrl` into `outputPath`.
 *
 * ffmpeg is started asynchronously so that several extractions (and any
 * other pending work) can make progress at the same time; a synchronous
 * spawn here would block the event loop for the whole duration of each run.
 *
 * @param playlistUrl Input handed to ffmpeg via `-i`.
 * @param outputPath  File ffmpeg writes; an existing file is overwritten (`-y`).
 * @throws When ffmpeg cannot be started, exits with a non-zero status, or
 *         runs longer than the 10-minute timeout.
 */
async function extractAudio(
  playlistUrl: string,
  outputPath: string,
): Promise<void> {
  console.log(` Extracting audio to ${path.basename(outputPath)}...`);

  const runFile = promisify(execFile);
  await runFile(
    "ffmpeg",
    // -vn drops the video stream; -q:a 2 selects the audio quality level.
    ["-y", "-i", playlistUrl, "-vn", "-q:a", "2", outputPath],
    { timeout: 600_000 },
  );
}
89
/**
 * Transcribes a single talk and stores the result as
 * `<TRANSCRIPT_DIR>/<rkey>.json`.
 *
 * Steps: skip if the output file already exists, extract the audio to a
 * temporary mp3, submit it for transcription, then persist the record
 * metadata together with the transcript. The temporary mp3 is removed on
 * every exit path.
 *
 * A transcript that comes back with status "error" is logged and no file is
 * written, so the talk is retried on the next run.
 *
 * @param client Transcription client used to submit the audio.
 * @param record Video record describing the talk.
 * @throws Whatever audio extraction, the transcription request, or the file
 *         writes throw.
 */
async function transcribeTalk(
  client: AssemblyAI,
  record: VideoRecord,
): Promise<void> {
  const rkey = getRkey(record.uri);
  const outputPath = path.join(TRANSCRIPT_DIR, `${rkey}.json`);

  // Skip if already transcribed
  if (fs.existsSync(outputPath)) {
    console.log(` [skip] ${record.value.title} — already transcribed`);
    return;
  }

  // Use the platform's temp directory instead of assuming /tmp exists.
  const mp3Path = path.join(os.tmpdir(), `${rkey}.mp3`);
  const playlistUrl = getPlaylistUrl(record.uri);

  try {
    // Extract audio
    await extractAudio(playlistUrl, mp3Path);

    // Transcribe with AssemblyAI
    console.log(` Transcribing: ${record.value.title}...`);
    // NOTE(review): the `as any` cast bypasses the SDK's parameter typing,
    // presumably because `speech_models` is missing from the installed
    // typings — confirm and drop the cast when possible.
    const transcript = await client.transcripts.transcribe({
      audio: mp3Path,
      speaker_labels: true,
      speech_models: ["universal-3-pro", "universal-2"],
    } as any);

    if (transcript.status === "error") {
      console.error(` [error] ${record.value.title}: ${transcript.error}`);
      return;
    }

    // Save result
    const result = {
      uri: record.uri,
      cid: record.cid,
      title: record.value.title,
      creator: record.value.creator,
      duration: record.value.duration,
      createdAt: record.value.createdAt,
      transcription: {
        id: transcript.id,
        status: transcript.status,
        text: transcript.text,
        utterances: transcript.utterances,
        words: transcript.words,
        audio_duration: transcript.audio_duration,
      },
    };

    // Write under a temporary name and rename afterwards: the skip check
    // above only tests for existence, so a partially written file at
    // outputPath would otherwise be mistaken for a finished transcript.
    const partialPath = `${outputPath}.tmp`;
    fs.writeFileSync(partialPath, JSON.stringify(result, null, 2));
    fs.renameSync(partialPath, outputPath);

    console.log(
      ` [done] ${record.value.title} (${transcript.audio_duration}s)`,
    );
  } finally {
    // Clean up mp3; `force` makes this a no-op when extraction never
    // produced the file.
    fs.rmSync(mp3Path, { force: true });
  }
}
150
/**
 * Transcribes all talks of one batch concurrently and resolves once every
 * talk has either finished or failed.
 *
 * A talk that throws (for example because audio extraction failed) is logged
 * with its title and does not affect the other talks in the batch or any
 * later batch; since no transcript file is written for it, it is picked up
 * again on the next run.
 *
 * @param client  Transcription client shared by all talks.
 * @param records Talks belonging to this batch.
 */
async function processBatch(
  client: AssemblyAI,
  records: VideoRecord[],
): Promise<void> {
  await Promise.all(
    records.map(async (record) => {
      try {
        await transcribeTalk(client, record);
      } catch (err: unknown) {
        const reason = err instanceof Error ? err.message : String(err);
        console.error(` [error] ${record.value.title}: ${reason}`);
      }
    }),
  );
}
157
/**
 * Script entry point: lists all video records, keeps the individual talk
 * VODs, drops those that already have a transcript file, and transcribes the
 * rest in batches of BATCH_SIZE.
 *
 * Exits the process with status 1 when ASSEMBLYAI_API_KEY is not set.
 */
async function main() {
  const { ASSEMBLYAI_API_KEY: apiKey } = process.env;
  if (!apiKey) {
    console.error("ASSEMBLYAI_API_KEY not set");
    process.exit(1);
  }

  const client = new AssemblyAI({ apiKey });

  // The transcript directory may not exist on a first run.
  fs.mkdirSync(TRANSCRIPT_DIR, { recursive: true });

  // Gather every record, then narrow down to individual talks.
  console.log("Fetching video records...");
  const everyRecord = await fetchAllVideoRecords();
  console.log(`Found ${everyRecord.length} total video records`);

  const talkRecords = everyRecord.filter(isIndividualTalkVOD);
  const skippedStreams = everyRecord.length - talkRecords.length;
  console.log(`Filtered to ${talkRecords.length} individual talk VODs`);
  console.log(`Skipped ${skippedStreams} full-day room streams`);

  // Leave out talks whose transcript file is already on disk.
  const pending = talkRecords.filter((talk) => {
    const transcriptFile = path.join(
      TRANSCRIPT_DIR,
      `${getRkey(talk.uri)}.json`,
    );
    return !fs.existsSync(transcriptFile);
  });
  const alreadyDone = talkRecords.length - pending.length;
  console.log(
    `${alreadyDone} already transcribed, ${pending.length} remaining\n`,
  );

  // Work through the pending talks one batch at a time.
  const totalBatches = Math.ceil(pending.length / BATCH_SIZE);
  let batchNum = 0;
  for (let offset = 0; offset < pending.length; offset += BATCH_SIZE) {
    batchNum += 1;
    const batch = pending.slice(offset, offset + BATCH_SIZE);

    console.log(
      `\n=== Batch ${batchNum}/${totalBatches} (${batch.length} talks) ===`,
    );
    for (const talk of batch) {
      console.log(` - ${talk.value.title}`);
    }
    console.log();

    await processBatch(client, batch);
  }

  console.log("\nDone! All talks transcribed.");
}
207
// Run the script. A fatal error is printed and also reflected in the exit
// status, so shells and CI jobs can tell a failed run from a successful one.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});