// A simple, clean, fast browser for the AtmosphereConf(2026) VODs
1import { readFile, writeFile } from 'node:fs/promises'
2import { existsSync } from 'node:fs'
3import path from 'node:path'
4
// DID of the AT Protocol repo that holds the conference VOD records.
const REPO_DID = 'did:plc:rbvrr34edl5ddpuwcubjiost'
// PLC directory used to resolve the DID document (and thus the PDS host).
const PLC_DIRECTORY_URL = 'https://plc.directory'
const OPENROUTER_API_URL = 'https://openrouter.ai/api/v1'
// Chat model used to classify talks; overridable via environment variable.
const CATEGORIZER_MODEL = process.env.OPENROUTER_TAXONOMY_MODEL ?? 'google/gemini-3.1-flash-lite-preview'
// Embedding model used for nearest-neighbor keyword enrichment; also overridable.
const EMBEDDING_MODEL = process.env.OPENROUTER_EMBEDDING_MODEL ?? 'openai/text-embedding-3-small'
// The generated taxonomy JSON is written here, relative to the working directory.
const OUTPUT_PATH = path.resolve(process.cwd(), 'src/lib/video-taxonomy.json')
11
// Tokens too generic to be useful as keywords: English filler words plus
// conference boilerplate ('talk', 'vod', 'atmosphere', '2026', ...).
// Checked against normalized (lowercase kebab-case) tokens in titleKeywords.
const STOPWORDS = new Set([
  'the', 'and', 'for', 'with', 'from', 'that', 'this', 'your', 'into', 'about', 'what', 'when',
  'where', 'have', 'will', 'just', 'talk', 'video', 'stream', 'conference', 'atmosphere', 'place',
  'vod', 'beta', '2026', 'how', 'why', 'can', 'you', 'all', 'are', 'its', 'our', 'new', 'more',
  'using', 'use', 'intro', 'introduction', 'deep', 'dive',
])
18
/**
 * Parse the contents of a .env file into a plain key/value object.
 *
 * Supports blank lines, `#` comments, optional `export ` prefixes
 * (standard dotenv convention), and values wrapped in matching single
 * or double quotes (the quotes are stripped).
 *
 * @param {string} content - Raw text of the .env file.
 * @returns {Record<string, string>} Parsed variable names to values.
 */
function parseEnvFile(content) {
  const vars = {}

  for (const rawLine of content.split(/\r?\n/)) {
    let line = rawLine.trim()
    if (!line || line.startsWith('#')) {
      continue
    }

    // dotenv convention: allow `export KEY=value` lines.
    if (line.startsWith('export ')) {
      line = line.slice('export '.length).trim()
    }

    const idx = line.indexOf('=')
    if (idx <= 0) {
      // No '=' or an empty key: skip rather than produce a bogus entry.
      continue
    }

    const key = line.slice(0, idx).trim()
    let value = line.slice(idx + 1).trim()
    if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) {
      value = value.slice(1, -1)
    }

    vars[key] = value
  }

  return vars
}
44
/**
 * Load environment variables from a local .env file in the current
 * directory and/or its parent, without clobbering variables that are
 * already set (and non-empty) on process.env.
 */
async function loadEnv() {
  const candidates = ['.env', path.join('..', '.env')]
    .map((relative) => path.resolve(process.cwd(), relative))

  for (const envPath of candidates) {
    if (!existsSync(envPath)) {
      continue
    }

    const parsed = parseEnvFile(await readFile(envPath, 'utf8'))

    for (const [name, value] of Object.entries(parsed)) {
      // Real (non-empty) environment always wins over .env files.
      if (!process.env[name]) {
        process.env[name] = value
      }
    }
  }
}
66
/**
 * Convert an arbitrary label into a lowercase kebab-case token.
 *
 * Ampersands become "and", any other non-alphanumeric characters are
 * treated as separators, whitespace runs collapse to single hyphens,
 * and leading/trailing hyphens are trimmed.
 *
 * @param {string} value - Raw label text.
 * @returns {string} Kebab-case token; may be empty if nothing survives.
 */
function normalizeToken(value) {
  const lowered = value.toLowerCase().replace(/&/g, ' and ')
  const alphanumeric = lowered.replace(/[^a-z0-9\s-]+/g, ' ')
  const hyphenated = alphanumeric.replace(/\s+/g, '-').replace(/-+/g, '-')
  return hyphenated.replace(/^-|-$/g, '')
}
76
/**
 * Normalize a list of raw labels into at most `max` unique kebab-case
 * tokens, preserving first-seen order and dropping empties/duplicates.
 *
 * @param {string[]} values - Raw labels to normalize.
 * @param {number} [max=4] - Maximum number of tokens to return.
 * @returns {string[]} Unique normalized tokens.
 */
function uniqueTokens(values, max = 4) {
  const seen = new Set()
  const tokens = []

  for (const value of values) {
    if (tokens.length >= max) {
      break
    }

    const token = normalizeToken(value)
    if (token && !seen.has(token)) {
      seen.add(token)
      tokens.push(token)
    }
  }

  return tokens
}
96
/**
 * Pull the first complete JSON object out of a model response, handling
 * an optional ``` / ```json fence around the payload.
 *
 * @param {string} text - Raw completion text from the model.
 * @returns {any} The parsed JSON object.
 * @throws {Error} When no `{...}` span exists; JSON.parse errors propagate.
 */
function extractJsonObject(text) {
  const fenced = /```(?:json)?\s*([\s\S]*?)```/i.exec(text)
  const candidate = fenced ? fenced[1] : text
  const open = candidate.indexOf('{')
  const close = candidate.lastIndexOf('}')

  if (open < 0 || close <= open) {
    throw new Error('No JSON object found in model response')
  }

  return JSON.parse(candidate.slice(open, close + 1))
}
109
/**
 * GET a URL and parse the response body as JSON.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<any>} Parsed JSON body.
 * @throws {Error} When the response status is not ok (non-2xx).
 */
async function fetchJson(url) {
  const response = await fetch(url)
  if (response.ok) {
    return response.json()
  }
  throw new Error(`Request failed (${response.status}) for ${url}`)
}
117
/**
 * Resolve the repo DID through the PLC directory and return its PDS
 * base URL with any trailing slash removed.
 *
 * @returns {Promise<string>} PDS service endpoint base URL.
 * @throws {Error} When the DID document has no #atproto_pds service entry.
 */
async function resolvePdsUrl() {
  const didDoc = await fetchJson(`${PLC_DIRECTORY_URL}/${REPO_DID}`)
  const endpoint = didDoc.service?.find((service) => service.id === '#atproto_pds')?.serviceEndpoint

  if (!endpoint) {
    throw new Error('Could not resolve PDS endpoint from PLC DID document')
  }

  return endpoint.replace(/\/$/, '')
}
128
/**
 * Page through every `place.stream.video` record in the conference repo
 * and project each record down to the fields the taxonomy needs.
 *
 * @returns {Promise<Array<{uri: string, title: string, description?: string, createdAt?: string}>>}
 */
async function fetchAllTalkRecords() {
  const pdsUrl = await resolvePdsUrl()
  const rawRecords = []
  let cursor

  // Follow the cursor until the PDS stops returning one.
  while (true) {
    const params = new URLSearchParams({
      repo: REPO_DID,
      collection: 'place.stream.video',
      limit: '100',
    })
    if (cursor) {
      params.set('cursor', cursor)
    }

    const page = await fetchJson(`${pdsUrl}/xrpc/com.atproto.repo.listRecords?${params.toString()}`)
    rawRecords.push(...(page.records ?? []))
    cursor = page.cursor
    if (!cursor) {
      break
    }
  }

  return rawRecords.map(({ uri, value }) => ({
    uri,
    title: value?.title ?? 'Untitled',
    description: value?.description,
    createdAt: value?.createdAt,
  }))
}
156
/**
 * POST a JSON payload to an OpenRouter endpoint and return the parsed
 * response body.
 *
 * @param {string} pathname - Endpoint path, e.g. '/chat/completions'.
 * @param {object} body - JSON-serializable request payload.
 * @returns {Promise<any>} Parsed JSON response.
 * @throws {Error} If OPENROUTER_API_KEY is unset or the request fails.
 */
async function callOpenRouter(pathname, body) {
  const apiKey = process.env.OPENROUTER_API_KEY
  if (!apiKey) {
    throw new Error('OPENROUTER_API_KEY is missing')
  }

  const headers = {
    Authorization: `Bearer ${apiKey}`,
    'Content-Type': 'application/json',
    'HTTP-Referer': 'https://vods.j4ck.xyz',
    'X-OpenRouter-Title': 'Atmosphere VODs taxonomy generation',
  }

  const response = await fetch(`${OPENROUTER_API_URL}${pathname}`, {
    method: 'POST',
    headers,
    body: JSON.stringify(body),
  })

  if (response.ok) {
    return response.json()
  }

  // Surface the API's error body to make failures debuggable.
  const detail = await response.text()
  throw new Error(`OpenRouter request failed (${response.status}): ${detail}`)
}
181
/**
 * Request one embedding vector per talk, embedding each talk's title and
 * description (when present) joined as a single document.
 *
 * @param {Array<{title: string, description?: string}>} talks
 * @returns {Promise<number[][]>} Embedding vectors in talk order.
 */
async function buildEmbeddingVectors(talks) {
  const documents = talks.map(({ title, description }) =>
    [title, description].filter(Boolean).join('\n\n'),
  )

  const { data } = await callOpenRouter('/embeddings', {
    model: EMBEDDING_MODEL,
    input: documents,
    input_type: 'search_document',
  })

  return data.map(({ embedding }) => embedding)
}
192
/**
 * Cosine similarity between two equal-length numeric vectors.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} Similarity in [-1, 1]; 0 when either vector is all zeros.
 * @throws {TypeError} When the vectors differ in length. (Previously a
 *   mismatch read undefined elements and silently produced NaN, which
 *   would corrupt the neighbor ranking sort.)
 */
function cosineSimilarity(a, b) {
  if (a.length !== b.length) {
    throw new TypeError(`Vector length mismatch: ${a.length} vs ${b.length}`)
  }

  let dot = 0
  let magA = 0
  let magB = 0

  for (let i = 0; i < a.length; i += 1) {
    dot += a[i] * b[i]
    magA += a[i] * a[i]
    magB += b[i] * b[i]
  }

  // Zero-magnitude vectors have no direction; avoid dividing by zero.
  if (magA === 0 || magB === 0) {
    return 0
  }

  return dot / (Math.sqrt(magA) * Math.sqrt(magB))
}
210
/**
 * Extract candidate keyword tokens from a talk title: words of three or
 * more characters, normalized to kebab-case, minus stopwords.
 *
 * @param {string} title - Talk title.
 * @returns {string[]} Keyword tokens in order of appearance.
 */
function titleKeywords(title) {
  const words = title.toLowerCase().match(/[a-z0-9][a-z0-9-]{2,}/g) ?? []
  const keywords = []

  for (const word of words) {
    const token = normalizeToken(word)
    if (token && !STOPWORDS.has(token)) {
      keywords.push(token)
    }
  }

  return keywords
}
216
/**
 * Augment each taxonomy entry with keywords borrowed from its nearest
 * neighbors in embedding space.
 *
 * For every talk, ranks all other talks by cosine similarity (O(n^2)
 * pairwise comparisons), takes the top 3, tallies their title keywords,
 * and merges the 2 most frequent into the entry's keyword list (capped
 * at 6 unique tokens). Mutates the entries stored in `entriesByUri` in
 * place; returns nothing.
 *
 * @param {Map<string, object>} entriesByUri - Taxonomy entries keyed by talk URI.
 * @param {Array<{uri: string, title: string}>} talks - Talks aligned index-for-index with `embeddings`.
 * @param {number[][]} embeddings - One embedding vector per talk.
 */
function enrichWithEmbeddingNeighbors(entriesByUri, talks, embeddings) {
  for (let i = 0; i < talks.length; i += 1) {
    const talk = talks[i]
    const entry = entriesByUri.get(talk.uri)
    if (!entry) {
      continue
    }

    // Score this talk against every other talk.
    const scores = []
    for (let j = 0; j < talks.length; j += 1) {
      if (i === j) {
        continue
      }

      scores.push({
        index: j,
        score: cosineSimilarity(embeddings[i], embeddings[j]),
      })
    }

    // Keep only the three most similar talks.
    scores.sort((a, b) => b.score - a.score)
    const neighbors = scores.slice(0, 3)
    const keywordCounts = new Map()

    for (const neighbor of neighbors) {
      for (const keyword of titleKeywords(talks[neighbor.index].title)) {
        keywordCounts.set(keyword, (keywordCounts.get(keyword) ?? 0) + 1)
      }
    }

    // Most frequent first; ties broken alphabetically for determinism.
    const neighborKeywords = [...keywordCounts.entries()]
      .sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]))
      .map(([word]) => word)
      .slice(0, 2)

    // Existing keywords keep priority; the cap of 6 may drop neighbor picks.
    entry.keywords = uniqueTokens([...(entry.keywords ?? []), ...neighborKeywords], 6)
  }
}
255
/**
 * Build a minimal taxonomy entry from the title alone; used when the LLM
 * response did not include a classification for a talk.
 *
 * @param {string} uri - AT URI of the talk record.
 * @param {string} title - Talk title.
 * @returns {{uri: string, group: string, tags: string[], topics: string[], keywords: string[]}}
 */
function fallbackEntry(uri, title) {
  const tokens = titleKeywords(title)
  const tags = uniqueTokens(tokens.slice(0, 2), 3)
  const topics = uniqueTokens(tokens.slice(0, 1), 2)
  const keywords = uniqueTokens(tokens.slice(0, 4), 6)

  return { uri, group: 'general', tags, topics, keywords }
}
266
/**
 * Ask the categorizer model to assign each talk a group plus tags,
 * topics, and keywords, constrained by a strict JSON schema.
 *
 * @param {Array<{uri: string, title: string, description?: string}>} talks
 * @returns {Promise<Array<{uri: string, group: string, tags: string[], topics: string[], keywords: string[]}>>}
 *   Raw entries from the model; callers normalize/validate the tokens.
 * @throws {Error} When the model returns empty content or unparseable JSON.
 */
async function categorizeTalks(talks) {
  // Trim each talk down to only the fields the model needs to see.
  const compactInput = talks.map((talk) => ({
    uri: talk.uri,
    title: talk.title,
    description: talk.description ?? null,
  }))

  const response = await callOpenRouter('/chat/completions', {
    model: CATEGORIZER_MODEL,
    temperature: 0.2,
    max_tokens: 12_000,
    messages: [
      {
        role: 'system',
        content:
          'You classify conference talk metadata into a compact, useful taxonomy. Use lowercase kebab-case tokens only. Keep tags and topics broad and reusable. Output valid JSON only.',
      },
      {
        role: 'user',
        content: `Classify each talk. Constraints:\n- 8-14 groups total across all talks\n- each talk: group (1), tags (2-4), topics (1-2), keywords (2-5)\n- tags/topics must be concise and reusable\n- if description is null, classify from title only\n\nTalks JSON:\n${JSON.stringify(compactInput)}`,
      },
    ],
    // Strict JSON schema keeps the output machine-parseable: one entry
    // per talk, with exactly the uri/group/tags/topics/keywords fields.
    response_format: {
      type: 'json_schema',
      json_schema: {
        name: 'video_taxonomy',
        strict: true,
        schema: {
          type: 'object',
          properties: {
            entries: {
              type: 'array',
              items: {
                type: 'object',
                properties: {
                  uri: { type: 'string' },
                  group: { type: 'string' },
                  tags: { type: 'array', items: { type: 'string' } },
                  topics: { type: 'array', items: { type: 'string' } },
                  keywords: { type: 'array', items: { type: 'string' } },
                },
                required: ['uri', 'group', 'tags', 'topics', 'keywords'],
                additionalProperties: false,
              },
            },
          },
          required: ['entries'],
          additionalProperties: false,
        },
      },
    },
  })

  const content = response.choices?.[0]?.message?.content
  if (typeof content !== 'string') {
    throw new Error('OpenRouter categorizer returned empty content')
  }

  // Parse defensively via extractJsonObject, which tolerates markdown
  // fences around the JSON payload.
  const parsed = extractJsonObject(content)
  return parsed.entries
}
328
/**
 * Entry point: fetch all conference talks, classify them with an LLM,
 * enrich the result with embedding-nearest-neighbor keywords, and write
 * the taxonomy JSON to OUTPUT_PATH.
 *
 * @throws {Error} When no talks are found or any upstream request fails.
 */
async function main() {
  await loadEnv()

  const talks = await fetchAllTalkRecords()
  // Newest talks first; missing timestamps sort as the epoch (oldest).
  talks.sort((a, b) => new Date(b.createdAt ?? 0).getTime() - new Date(a.createdAt ?? 0).getTime())

  if (talks.length === 0) {
    throw new Error('No talks found to categorize')
  }

  console.log(`Fetched ${talks.length} talks`)

  // The embedding and categorization requests are independent network
  // calls, so run them concurrently instead of serially.
  const [embeddings, llmEntries] = await Promise.all([
    buildEmbeddingVectors(talks),
    categorizeTalks(talks),
  ])
  console.log(`Generated ${embeddings.length} embeddings with ${EMBEDDING_MODEL}`)

  // Normalize the model's output; bad/missing token lists become empty.
  const entriesByUri = new Map()
  for (const entry of llmEntries) {
    entriesByUri.set(entry.uri, {
      uri: entry.uri,
      group: normalizeToken(entry.group) || 'general',
      tags: uniqueTokens(entry.tags ?? [], 4),
      topics: uniqueTokens(entry.topics ?? [], 3),
      keywords: uniqueTokens(entry.keywords ?? [], 6),
    })
  }

  // Any talk the model skipped still gets a title-derived entry.
  for (const talk of talks) {
    if (!entriesByUri.has(talk.uri)) {
      entriesByUri.set(talk.uri, fallbackEntry(talk.uri, talk.title))
    }
  }

  enrichWithEmbeddingNeighbors(entriesByUri, talks, embeddings)

  const output = {
    version: 1,
    generatedAt: new Date().toISOString(),
    models: {
      categorizer: CATEGORIZER_MODEL,
      embeddings: EMBEDDING_MODEL,
    },
    // Entries are emitted in the same (newest-first) order as the talks.
    entries: talks.map((talk) => entriesByUri.get(talk.uri)),
  }

  await writeFile(OUTPUT_PATH, `${JSON.stringify(output, null, 2)}\n`, 'utf8')
  console.log(`Wrote taxonomy to ${OUTPUT_PATH}`)
}
378
// Top-level runner: report any failure and exit non-zero. Setting
// process.exitCode (instead of calling process.exit) lets pending
// stdout/stderr writes flush before the process terminates.
main().catch((error) => {
  console.error(error)
  process.exitCode = 1
})