A simple, clean, fast browser for the AtmosphereConf (2026) VODs
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 382 lines 11 kB view raw
import { readFile, writeFile } from 'node:fs/promises'
import { existsSync } from 'node:fs'
import path from 'node:path'

// AT Protocol repository that holds the conference VOD records.
const REPO_DID = 'did:plc:rbvrr34edl5ddpuwcubjiost'
const PLC_DIRECTORY_URL = 'https://plc.directory'
const OPENROUTER_API_URL = 'https://openrouter.ai/api/v1'
// Both models can be pinned via the environment so runs are reproducible.
const CATEGORIZER_MODEL = process.env.OPENROUTER_TAXONOMY_MODEL ?? 'google/gemini-3.1-flash-lite-preview'
const EMBEDDING_MODEL = process.env.OPENROUTER_EMBEDDING_MODEL ?? 'openai/text-embedding-3-small'
const OUTPUT_PATH = path.resolve(process.cwd(), 'src/lib/video-taxonomy.json')

// Tokens too generic for this corpus to be useful as keywords.
const STOPWORDS = new Set([
  'the', 'and', 'for', 'with', 'from', 'that', 'this', 'your', 'into', 'about', 'what', 'when',
  'where', 'have', 'will', 'just', 'talk', 'video', 'stream', 'conference', 'atmosphere', 'place',
  'vod', 'beta', '2026', 'how', 'why', 'can', 'you', 'all', 'are', 'its', 'our', 'new', 'more',
  'using', 'use', 'intro', 'introduction', 'deep', 'dive',
])

/**
 * Parse dotenv-style file contents into a plain key/value map.
 *
 * Supports blank lines, `#` comments, an optional shell-style `export `
 * prefix, and values wrapped in matching single or double quotes (the
 * quotes are stripped). Lines without a `=` after a non-empty key are
 * skipped.
 *
 * @param {string} content - Raw text of a `.env` file.
 * @returns {Record<string, string>} Parsed variables.
 */
function parseEnvFile(content) {
  const vars = {}

  for (const rawLine of content.split(/\r?\n/)) {
    let line = rawLine.trim()
    if (!line || line.startsWith('#')) {
      continue
    }

    // Tolerate `export KEY=value`, which many hand-written .env files use.
    if (line.startsWith('export ')) {
      line = line.slice('export '.length).trim()
    }

    const idx = line.indexOf('=')
    if (idx <= 0) {
      continue
    }

    const key = line.slice(0, idx).trim()
    let value = line.slice(idx + 1).trim()
    if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) {
      value = value.slice(1, -1)
    }

    vars[key] = value
  }

  return vars
}

/**
 * Load `.env` files from the working directory and its parent into
 * `process.env`. Variables already present in the environment are never
 * overwritten, so real environment settings win over file contents.
 */
async function loadEnv() {
  const candidatePaths = [
    path.resolve(process.cwd(), '.env'),
    path.resolve(process.cwd(), '..', '.env'),
  ]

  for (const filePath of candidatePaths) {
    if (!existsSync(filePath)) {
      continue
    }

    const content = await readFile(filePath, 'utf8')
    const parsed = parseEnvFile(content)

    for (const [key, value] of Object.entries(parsed)) {
      if (!process.env[key]) {
        process.env[key] = value
      }
    }
  }
}
/**
 * Normalize an arbitrary string into a lowercase kebab-case token, e.g.
 * "AT Proto & PDS" -> "at-proto-and-pds". Returns '' when nothing usable
 * remains after stripping.
 *
 * @param {string} value
 * @returns {string}
 */
function normalizeToken(value) {
  return value
    .toLowerCase()
    .replace(/&/g, ' and ')
    .replace(/[^a-z0-9\s-]+/g, ' ')
    .replace(/\s+/g, '-')
    .replace(/-+/g, '-')
    .replace(/^-|-$/g, '')
}

/**
 * Normalize and deduplicate a list of strings, preserving first-seen order
 * and dropping values that normalize to ''.
 *
 * @param {string[]} values
 * @param {number} [max=4] - Maximum number of tokens to return.
 * @returns {string[]}
 */
function uniqueTokens(values, max = 4) {
  const output = []
  const seen = new Set()

  for (const raw of values) {
    const token = normalizeToken(raw)
    if (!token || seen.has(token)) {
      continue
    }

    seen.add(token)
    output.push(token)
    if (output.length >= max) {
      break
    }
  }

  return output
}

/**
 * Extract and parse the first JSON object from model output, tolerating
 * ```json fences and surrounding prose.
 *
 * @param {string} text - Raw model response content.
 * @returns {object} Parsed JSON object.
 * @throws {Error} When no `{...}` span can be located.
 */
function extractJsonObject(text) {
  const fencedMatch = text.match(/```(?:json)?\s*([\s\S]*?)```/i)
  const source = fencedMatch ? fencedMatch[1] : text
  const start = source.indexOf('{')
  const end = source.lastIndexOf('}')

  if (start < 0 || end < 0 || end <= start) {
    throw new Error('No JSON object found in model response')
  }

  return JSON.parse(source.slice(start, end + 1))
}

/**
 * GET a URL and parse the JSON response body.
 *
 * @param {string} url
 * @returns {Promise<any>}
 * @throws {Error} On any non-2xx status.
 */
async function fetchJson(url) {
  const response = await fetch(url)
  if (!response.ok) {
    throw new Error(`Request failed (${response.status}) for ${url}`)
  }
  return response.json()
}

/**
 * Resolve the repo's PDS base URL (no trailing slash) from its PLC DID
 * document's `#atproto_pds` service entry.
 *
 * @returns {Promise<string>}
 * @throws {Error} When the DID document has no PDS service endpoint.
 */
async function resolvePdsUrl() {
  const didDoc = await fetchJson(`${PLC_DIRECTORY_URL}/${REPO_DID}`)
  const pdsService = didDoc.service?.find((entry) => entry.id === '#atproto_pds')

  if (!pdsService?.serviceEndpoint) {
    throw new Error('Could not resolve PDS endpoint from PLC DID document')
  }

  return pdsService.serviceEndpoint.replace(/\/$/, '')
}

/**
 * Page through every `place.stream.video` record in the repo (cursor-based
 * pagination, 100 per page) and project just the fields the taxonomy needs.
 *
 * @returns {Promise<Array<{uri: string, title: string, description: string|undefined, createdAt: string|undefined}>>}
 */
async function fetchAllTalkRecords() {
  const pdsUrl = await resolvePdsUrl()
  const records = []
  let cursor = undefined

  do {
    const query = new URLSearchParams({
      repo: REPO_DID,
      collection: 'place.stream.video',
      limit: '100',
    })
    if (cursor) {
      query.set('cursor', cursor)
    }

    const page = await fetchJson(`${pdsUrl}/xrpc/com.atproto.repo.listRecords?${query.toString()}`)
    records.push(...(page.records ?? []))
    cursor = page.cursor
  } while (cursor)

  return records.map((record) => ({
    uri: record.uri,
    title: record.value?.title ?? 'Untitled',
    description: record.value?.description,
    createdAt: record.value?.createdAt,
  }))
}

/**
 * POST a JSON body to an OpenRouter API endpoint.
 *
 * @param {string} pathname - Path appended to the API base, e.g. '/embeddings'.
 * @param {object} body - JSON-serializable request payload.
 * @returns {Promise<any>} Parsed JSON response.
 * @throws {Error} When OPENROUTER_API_KEY is unset or the request fails.
 */
async function callOpenRouter(pathname, body) {
  const apiKey = process.env.OPENROUTER_API_KEY
  if (!apiKey) {
    throw new Error('OPENROUTER_API_KEY is missing')
  }

  const response = await fetch(`${OPENROUTER_API_URL}${pathname}`, {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${apiKey}`,
      'Content-Type': 'application/json',
      // OpenRouter's documented app-attribution headers are
      // HTTP-Referer and X-Title (not X-OpenRouter-Title).
      'HTTP-Referer': 'https://vods.j4ck.xyz',
      'X-Title': 'Atmosphere VODs taxonomy generation',
    },
    body: JSON.stringify(body),
  })

  if (!response.ok) {
    const errorText = await response.text()
    throw new Error(`OpenRouter request failed (${response.status}): ${errorText}`)
  }

  return response.json()
}

/**
 * Embed every talk's title + description (joined with a blank line) in one
 * batched request.
 *
 * @param {Array<{title: string, description?: string}>} talks
 * @returns {Promise<number[][]>} One embedding vector per talk, in talk order.
 */
async function buildEmbeddingVectors(talks) {
  const input = talks.map((talk) => [talk.title, talk.description].filter(Boolean).join('\n\n'))
  const response = await callOpenRouter('/embeddings', {
    model: EMBEDDING_MODEL,
    input,
    input_type: 'search_document',
  })

  // The embeddings API tags each entry with its input `index`; sort
  // defensively so vectors line up with `talks` even if the provider
  // returns entries out of order.
  return [...response.data]
    .sort((a, b) => (a.index ?? 0) - (b.index ?? 0))
    .map((entry) => entry.embedding)
}

/**
 * Cosine similarity of two equal-length numeric vectors; returns 0 when
 * either vector has zero magnitude.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number}
 */
function cosineSimilarity(a, b) {
  let dot = 0
  let magA = 0
  let magB = 0

  for (let i = 0; i < a.length; i += 1) {
    dot += a[i] * b[i]
    magA += a[i] * a[i]
    magB += b[i] * b[i]
  }

  if (magA === 0 || magB === 0) {
    return 0
  }

  return dot / (Math.sqrt(magA) * Math.sqrt(magB))
}

/**
 * Pull candidate keyword tokens (3+ chars) out of a talk title, dropping
 * stopwords.
 *
 * @param {string} title
 * @returns {string[]}
 */
function titleKeywords(title) {
  return (title.toLowerCase().match(/[a-z0-9][a-z0-9-]{2,}/g) ?? [])
    .map((word) => normalizeToken(word))
    .filter((word) => word && !STOPWORDS.has(word))
}

/**
 * For each talk, find its 3 nearest neighbors by embedding similarity and
 * merge the most frequent neighbor-title keywords (up to 2) into that
 * talk's keyword list, capped at 6 total. Mutates entries in place.
 *
 * @param {Map<string, {keywords?: string[]}>} entriesByUri - Taxonomy entries keyed by talk URI.
 * @param {Array<{uri: string, title: string}>} talks
 * @param {number[][]} embeddings - Parallel to `talks`.
 */
function enrichWithEmbeddingNeighbors(entriesByUri, talks, embeddings) {
  for (let i = 0; i < talks.length; i += 1) {
    const talk = talks[i]
    const entry = entriesByUri.get(talk.uri)
    if (!entry) {
      continue
    }

    const scores = []
    for (let j = 0; j < talks.length; j += 1) {
      if (i === j) {
        continue
      }

      scores.push({
        index: j,
        score: cosineSimilarity(embeddings[i], embeddings[j]),
      })
    }

    scores.sort((a, b) => b.score - a.score)
    const neighbors = scores.slice(0, 3)
    const keywordCounts = new Map()

    for (const neighbor of neighbors) {
      for (const keyword of titleKeywords(talks[neighbor.index].title)) {
        keywordCounts.set(keyword, (keywordCounts.get(keyword) ?? 0) + 1)
      }
    }

    // Rank by frequency, then alphabetically for deterministic output.
    const neighborKeywords = [...keywordCounts.entries()]
      .sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]))
      .map(([word]) => word)
      .slice(0, 2)

    entry.keywords = uniqueTokens([...(entry.keywords ?? []), ...neighborKeywords], 6)
  }
}

/**
 * Build a minimal taxonomy entry from title keywords alone, used when the
 * categorizer model did not return an entry for a talk.
 *
 * @param {string} uri
 * @param {string} title
 * @returns {{uri: string, group: string, tags: string[], topics: string[], keywords: string[]}}
 */
function fallbackEntry(uri, title) {
  const tokens = titleKeywords(title)
  return {
    uri,
    group: 'general',
    tags: uniqueTokens(tokens.slice(0, 2), 3),
    topics: uniqueTokens(tokens.slice(0, 1), 2),
    keywords: uniqueTokens(tokens.slice(0, 4), 6),
  }
}

/**
 * Ask the categorizer model to assign group/tags/topics/keywords to every
 * talk in a single structured-output chat request.
 *
 * @param {Array<{uri: string, title: string, description?: string}>} talks
 * @returns {Promise<Array<{uri: string, group: string, tags: string[], topics: string[], keywords: string[]}>>}
 * @throws {Error} When the model returns no content or unparseable JSON.
 */
async function categorizeTalks(talks) {
  const compactInput = talks.map((talk) => ({
    uri: talk.uri,
    title: talk.title,
    description: talk.description ?? null,
  }))

  const response = await callOpenRouter('/chat/completions', {
    model: CATEGORIZER_MODEL,
    temperature: 0.2,
    max_tokens: 12_000,
    messages: [
      {
        role: 'system',
        content:
          'You classify conference talk metadata into a compact, useful taxonomy. Use lowercase kebab-case tokens only. Keep tags and topics broad and reusable. Output valid JSON only.',
      },
      {
        role: 'user',
        content: `Classify each talk. Constraints:\n- 8-14 groups total across all talks\n- each talk: group (1), tags (2-4), topics (1-2), keywords (2-5)\n- tags/topics must be concise and reusable\n- if description is null, classify from title only\n\nTalks JSON:\n${JSON.stringify(compactInput)}`,
      },
    ],
    response_format: {
      type: 'json_schema',
      json_schema: {
        name: 'video_taxonomy',
        strict: true,
        schema: {
          type: 'object',
          properties: {
            entries: {
              type: 'array',
              items: {
                type: 'object',
                properties: {
                  uri: { type: 'string' },
                  group: { type: 'string' },
                  tags: { type: 'array', items: { type: 'string' } },
                  topics: { type: 'array', items: { type: 'string' } },
                  keywords: { type: 'array', items: { type: 'string' } },
                },
                required: ['uri', 'group', 'tags', 'topics', 'keywords'],
                additionalProperties: false,
              },
            },
          },
          required: ['entries'],
          additionalProperties: false,
        },
      },
    },
  })

  const content = response.choices?.[0]?.message?.content
  if (typeof content !== 'string') {
    throw new Error('OpenRouter categorizer returned empty content')
  }

  // Parse defensively even with structured outputs, in case the provider
  // wraps the JSON in fences or prose.
  const parsed = extractJsonObject(content)
  return parsed.entries
}

/**
 * Pipeline entry point: fetch talks, embed them, categorize them with the
 * LLM (falling back to title keywords for talks the model missed), enrich
 * keywords from embedding neighbors, and write the taxonomy JSON to
 * OUTPUT_PATH.
 */
async function main() {
  await loadEnv()

  const talks = await fetchAllTalkRecords()
  // Newest first; missing createdAt sorts as the epoch (oldest).
  talks.sort((a, b) => new Date(b.createdAt ?? 0).getTime() - new Date(a.createdAt ?? 0).getTime())

  if (talks.length === 0) {
    throw new Error('No talks found to categorize')
  }

  console.log(`Fetched ${talks.length} talks`)

  const embeddings = await buildEmbeddingVectors(talks)
  console.log(`Generated ${embeddings.length} embeddings with ${EMBEDDING_MODEL}`)

  const llmEntries = await categorizeTalks(talks)
  const entriesByUri = new Map()

  for (const entry of llmEntries) {
    entriesByUri.set(entry.uri, {
      uri: entry.uri,
      group: normalizeToken(entry.group) || 'general',
      tags: uniqueTokens(entry.tags ?? [], 4),
      topics: uniqueTokens(entry.topics ?? [], 3),
      keywords: uniqueTokens(entry.keywords ?? [], 6),
    })
  }

  // Ensure every talk has an entry even if the model skipped it.
  for (const talk of talks) {
    if (!entriesByUri.has(talk.uri)) {
      entriesByUri.set(talk.uri, fallbackEntry(talk.uri, talk.title))
    }
  }

  enrichWithEmbeddingNeighbors(entriesByUri, talks, embeddings)

  const output = {
    version: 1,
    generatedAt: new Date().toISOString(),
    models: {
      categorizer: CATEGORIZER_MODEL,
      embeddings: EMBEDDING_MODEL,
    },
    entries: talks.map((talk) => entriesByUri.get(talk.uri)),
  }

  await writeFile(OUTPUT_PATH, `${JSON.stringify(output, null, 2)}\n`, 'utf8')
  console.log(`Wrote taxonomy to ${OUTPUT_PATH}`)
}

main().catch((error) => {
  console.error(error)
  process.exit(1)
})