A simple, clean, fast browser for the AtmosphereConf (2026) VODs
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 382 lines 11 kB view raw
import { readFile, writeFile } from 'node:fs/promises'
import { existsSync } from 'node:fs'
import path from 'node:path'

// AT Protocol repository that holds the conference VOD records.
const REPO_DID = 'did:plc:rbvrr34edl5ddpuwcubjiost'
const PLC_DIRECTORY_URL = 'https://plc.directory'
const OPENROUTER_API_URL = 'https://openrouter.ai/api/v1'
// Both models can be pinned via the environment so runs are reproducible.
const CATEGORIZER_MODEL = process.env.OPENROUTER_TAXONOMY_MODEL ?? 'google/gemini-3.1-flash-lite-preview'
const EMBEDDING_MODEL = process.env.OPENROUTER_EMBEDDING_MODEL ?? 'openai/text-embedding-3-small'
const OUTPUT_PATH = path.resolve(process.cwd(), 'src/lib/video-taxonomy.json')

// Tokens too generic for this corpus to be useful as keywords.
const STOPWORDS = new Set([
  'the', 'and', 'for', 'with', 'from', 'that', 'this', 'your', 'into', 'about', 'what', 'when',
  'where', 'have', 'will', 'just', 'talk', 'video', 'stream', 'conference', 'atmosphere', 'place',
  'vod', 'beta', '2026', 'how', 'why', 'can', 'you', 'all', 'are', 'its', 'our', 'new', 'more',
  'using', 'use', 'intro', 'introduction', 'deep', 'dive',
])

/**
 * Parse dotenv-style file contents into a plain key/value map.
 *
 * Supports blank lines, `#` comments, an optional shell-style `export `
 * prefix, and values wrapped in matching single or double quotes (the
 * quotes are stripped). Lines without a `=` after a non-empty key are
 * skipped.
 *
 * @param {string} content - Raw text of a `.env` file.
 * @returns {Record<string, string>} Parsed variables.
 */
function parseEnvFile(content) {
  const vars = {}

  for (const rawLine of content.split(/\r?\n/)) {
    let line = rawLine.trim()
    if (!line || line.startsWith('#')) {
      continue
    }

    // Tolerate `export KEY=value`, which many hand-written .env files use.
    if (line.startsWith('export ')) {
      line = line.slice('export '.length).trim()
    }

    const idx = line.indexOf('=')
    if (idx <= 0) {
      continue
    }

    const key = line.slice(0, idx).trim()
    let value = line.slice(idx + 1).trim()
    if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) {
      value = value.slice(1, -1)
    }

    vars[key] = value
  }

  return vars
}

/**
 * Load `.env` files from the working directory and its parent into
 * `process.env`. Variables already present in the environment are never
 * overwritten, so real environment settings win over file contents.
 */
async function loadEnv() {
  const candidatePaths = [
    path.resolve(process.cwd(), '.env'),
    path.resolve(process.cwd(), '..', '.env'),
  ]

  for (const filePath of candidatePaths) {
    if (!existsSync(filePath)) {
      continue
    }

    const content = await readFile(filePath, 'utf8')
    const parsed = parseEnvFile(content)

    for (const [key, value] of Object.entries(parsed)) {
      if (!process.env[key]) {
        process.env[key] = value
      }
    }
  }
}
/**
 * Normalize an arbitrary string into a lowercase kebab-case token, e.g.
 * "AT Proto & PDS" -> "at-proto-and-pds". Returns '' when nothing usable
 * remains after stripping.
 *
 * @param {string} value
 * @returns {string}
 */
function normalizeToken(value) {
  return value
    .toLowerCase()
    .replace(/&/g, ' and ')
    .replace(/[^a-z0-9\s-]+/g, ' ')
    .replace(/\s+/g, '-')
    .replace(/-+/g, '-')
    .replace(/^-|-$/g, '')
}

/**
 * Normalize and deduplicate a list of strings, preserving first-seen order
 * and dropping values that normalize to ''.
 *
 * @param {string[]} values
 * @param {number} [max=4] - Maximum number of tokens to return.
 * @returns {string[]}
 */
function uniqueTokens(values, max = 4) {
  const output = []
  const seen = new Set()

  for (const raw of values) {
    const token = normalizeToken(raw)
    if (!token || seen.has(token)) {
      continue
    }

    seen.add(token)
    output.push(token)
    if (output.length >= max) {
      break
    }
  }

  return output
}

/**
 * Extract and parse the first JSON object from model output, tolerating
 * ```json fences and surrounding prose.
 *
 * @param {string} text - Raw model response content.
 * @returns {object} Parsed JSON object.
 * @throws {Error} When no `{...}` span can be located.
 */
function extractJsonObject(text) {
  const fencedMatch = text.match(/```(?:json)?\s*([\s\S]*?)```/i)
  const source = fencedMatch ? fencedMatch[1] : text
  const start = source.indexOf('{')
  const end = source.lastIndexOf('}')

  if (start < 0 || end < 0 || end <= start) {
    throw new Error('No JSON object found in model response')
  }

  return JSON.parse(source.slice(start, end + 1))
}

/**
 * GET a URL and parse the JSON response body.
 *
 * @param {string} url
 * @returns {Promise<any>}
 * @throws {Error} On any non-2xx status.
 */
async function fetchJson(url) {
  const response = await fetch(url)
  if (!response.ok) {
    throw new Error(`Request failed (${response.status}) for ${url}`)
  }
  return response.json()
}

/**
 * Resolve the repo's PDS base URL (no trailing slash) from its PLC DID
 * document's `#atproto_pds` service entry.
 *
 * @returns {Promise<string>}
 * @throws {Error} When the DID document has no PDS service endpoint.
 */
async function resolvePdsUrl() {
  const didDoc = await fetchJson(`${PLC_DIRECTORY_URL}/${REPO_DID}`)
  const pdsService = didDoc.service?.find((entry) => entry.id === '#atproto_pds')

  if (!pdsService?.serviceEndpoint) {
    throw new Error('Could not resolve PDS endpoint from PLC DID document')
  }

  return pdsService.serviceEndpoint.replace(/\/$/, '')
}

/**
 * Page through every `place.stream.video` record in the repo (cursor-based
 * pagination, 100 per page) and project just the fields the taxonomy needs.
 *
 * @returns {Promise<Array<{uri: string, title: string, description: string|undefined, createdAt: string|undefined}>>}
 */
async function fetchAllTalkRecords() {
  const pdsUrl = await resolvePdsUrl()
  const records = []
  let cursor = undefined

  do {
    const query = new URLSearchParams({
      repo: REPO_DID,
      collection: 'place.stream.video',
      limit: '100',
    })
    if (cursor) {
      query.set('cursor', cursor)
    }

    const page = await fetchJson(`${pdsUrl}/xrpc/com.atproto.repo.listRecords?${query.toString()}`)
    records.push(...(page.records ?? []))
    cursor = page.cursor
  } while (cursor)

  return records.map((record) => ({
    uri: record.uri,
    title: record.value?.title ?? 'Untitled',
    description: record.value?.description,
    createdAt: record.value?.createdAt,
  }))
}

/**
 * POST a JSON body to an OpenRouter API endpoint.
 *
 * @param {string} pathname - Path appended to the API base, e.g. '/embeddings'.
 * @param {object} body - JSON-serializable request payload.
 * @returns {Promise<any>} Parsed JSON response.
 * @throws {Error} When OPENROUTER_API_KEY is unset or the request fails.
 */
async function callOpenRouter(pathname, body) {
  const apiKey = process.env.OPENROUTER_API_KEY
  if (!apiKey) {
    throw new Error('OPENROUTER_API_KEY is missing')
  }

  const response = await fetch(`${OPENROUTER_API_URL}${pathname}`, {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${apiKey}`,
      'Content-Type': 'application/json',
      // OpenRouter's documented app-attribution headers are
      // HTTP-Referer and X-Title (not X-OpenRouter-Title).
      'HTTP-Referer': 'https://vods.j4ck.xyz',
      'X-Title': 'Atmosphere VODs taxonomy generation',
    },
    body: JSON.stringify(body),
  })

  if (!response.ok) {
    const errorText = await response.text()
    throw new Error(`OpenRouter request failed (${response.status}): ${errorText}`)
  }

  return response.json()
}

/**
 * Embed every talk's title + description (joined with a blank line) in one
 * batched request.
 *
 * @param {Array<{title: string, description?: string}>} talks
 * @returns {Promise<number[][]>} One embedding vector per talk, in talk order.
 */
async function buildEmbeddingVectors(talks) {
  const input = talks.map((talk) => [talk.title, talk.description].filter(Boolean).join('\n\n'))
  const response = await callOpenRouter('/embeddings', {
    model: EMBEDDING_MODEL,
    input,
    input_type: 'search_document',
  })

  // The embeddings API tags each entry with its input `index`; sort
  // defensively so vectors line up with `talks` even if the provider
  // returns entries out of order.
  return [...response.data]
    .sort((a, b) => (a.index ?? 0) - (b.index ?? 0))
    .map((entry) => entry.embedding)
}

/**
 * Cosine similarity of two equal-length numeric vectors; returns 0 when
 * either vector has zero magnitude.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number}
 */
function cosineSimilarity(a, b) {
  let dot = 0
  let magA = 0
  let magB = 0

  for (let i = 0; i < a.length; i += 1) {
    dot += a[i] * b[i]
    magA += a[i] * a[i]
    magB += b[i] * b[i]
  }

  if (magA === 0 || magB === 0) {
    return 0
  }

  return dot / (Math.sqrt(magA) * Math.sqrt(magB))
}

/**
 * Pull candidate keyword tokens (3+ chars) out of a talk title, dropping
 * stopwords.
 *
 * @param {string} title
 * @returns {string[]}
 */
function titleKeywords(title) {
  return (title.toLowerCase().match(/[a-z0-9][a-z0-9-]{2,}/g) ?? [])
    .map((word) => normalizeToken(word))
    .filter((word) => word && !STOPWORDS.has(word))
}

/**
 * For each talk, find its 3 nearest neighbors by embedding similarity and
 * merge the most frequent neighbor-title keywords (up to 2) into that
 * talk's keyword list, capped at 6 total. Mutates entries in place.
 *
 * @param {Map<string, {keywords?: string[]}>} entriesByUri - Taxonomy entries keyed by talk URI.
 * @param {Array<{uri: string, title: string}>} talks
 * @param {number[][]} embeddings - Parallel to `talks`.
 */
function enrichWithEmbeddingNeighbors(entriesByUri, talks, embeddings) {
  for (let i = 0; i < talks.length; i += 1) {
    const talk = talks[i]
    const entry = entriesByUri.get(talk.uri)
    if (!entry) {
      continue
    }

    const scores = []
    for (let j = 0; j < talks.length; j += 1) {
      if (i === j) {
        continue
      }

      scores.push({
        index: j,
        score: cosineSimilarity(embeddings[i], embeddings[j]),
      })
    }

    scores.sort((a, b) => b.score - a.score)
    const neighbors = scores.slice(0, 3)
    const keywordCounts = new Map()

    for (const neighbor of neighbors) {
      for (const keyword of titleKeywords(talks[neighbor.index].title)) {
        keywordCounts.set(keyword, (keywordCounts.get(keyword) ?? 0) + 1)
      }
    }

    // Rank by frequency, then alphabetically for deterministic output.
    const neighborKeywords = [...keywordCounts.entries()]
      .sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]))
      .map(([word]) => word)
      .slice(0, 2)

    entry.keywords = uniqueTokens([...(entry.keywords ?? []), ...neighborKeywords], 6)
  }
}

/**
 * Build a minimal taxonomy entry from title keywords alone, used when the
 * categorizer model did not return an entry for a talk.
 *
 * @param {string} uri
 * @param {string} title
 * @returns {{uri: string, group: string, tags: string[], topics: string[], keywords: string[]}}
 */
function fallbackEntry(uri, title) {
  const tokens = titleKeywords(title)
  return {
    uri,
    group: 'general',
    tags: uniqueTokens(tokens.slice(0, 2), 3),
    topics: uniqueTokens(tokens.slice(0, 1), 2),
    keywords: uniqueTokens(tokens.slice(0, 4), 6),
  }
}

/**
 * Ask the categorizer model to assign group/tags/topics/keywords to every
 * talk in a single structured-output chat request.
 *
 * @param {Array<{uri: string, title: string, description?: string}>} talks
 * @returns {Promise<Array<{uri: string, group: string, tags: string[], topics: string[], keywords: string[]}>>}
 * @throws {Error} When the model returns no content or unparseable JSON.
 */
async function categorizeTalks(talks) {
  const compactInput = talks.map((talk) => ({
    uri: talk.uri,
    title: talk.title,
    description: talk.description ?? null,
  }))

  const response = await callOpenRouter('/chat/completions', {
    model: CATEGORIZER_MODEL,
    temperature: 0.2,
    max_tokens: 12_000,
    messages: [
      {
        role: 'system',
        content:
          'You classify conference talk metadata into a compact, useful taxonomy. Use lowercase kebab-case tokens only. Keep tags and topics broad and reusable. Output valid JSON only.',
      },
      {
        role: 'user',
        content: `Classify each talk. Constraints:\n- 8-14 groups total across all talks\n- each talk: group (1), tags (2-4), topics (1-2), keywords (2-5)\n- tags/topics must be concise and reusable\n- if description is null, classify from title only\n\nTalks JSON:\n${JSON.stringify(compactInput)}`,
      },
    ],
    response_format: {
      type: 'json_schema',
      json_schema: {
        name: 'video_taxonomy',
        strict: true,
        schema: {
          type: 'object',
          properties: {
            entries: {
              type: 'array',
              items: {
                type: 'object',
                properties: {
                  uri: { type: 'string' },
                  group: { type: 'string' },
                  tags: { type: 'array', items: { type: 'string' } },
                  topics: { type: 'array', items: { type: 'string' } },
                  keywords: { type: 'array', items: { type: 'string' } },
                },
                required: ['uri', 'group', 'tags', 'topics', 'keywords'],
                additionalProperties: false,
              },
            },
          },
          required: ['entries'],
          additionalProperties: false,
        },
      },
    },
  })

  const content = response.choices?.[0]?.message?.content
  if (typeof content !== 'string') {
    throw new Error('OpenRouter categorizer returned empty content')
  }

  // Parse defensively even with structured outputs, in case the provider
  // wraps the JSON in fences or prose.
  const parsed = extractJsonObject(content)
  return parsed.entries
}

/**
 * Pipeline entry point: fetch talks, embed them, categorize them with the
 * LLM (falling back to title keywords for talks the model missed), enrich
 * keywords from embedding neighbors, and write the taxonomy JSON to
 * OUTPUT_PATH.
 */
async function main() {
  await loadEnv()

  const talks = await fetchAllTalkRecords()
  // Newest first; missing createdAt sorts as the epoch (oldest).
  talks.sort((a, b) => new Date(b.createdAt ?? 0).getTime() - new Date(a.createdAt ?? 0).getTime())

  if (talks.length === 0) {
    throw new Error('No talks found to categorize')
  }

  console.log(`Fetched ${talks.length} talks`)

  const embeddings = await buildEmbeddingVectors(talks)
  console.log(`Generated ${embeddings.length} embeddings with ${EMBEDDING_MODEL}`)

  const llmEntries = await categorizeTalks(talks)
  const entriesByUri = new Map()

  for (const entry of llmEntries) {
    entriesByUri.set(entry.uri, {
      uri: entry.uri,
      group: normalizeToken(entry.group) || 'general',
      tags: uniqueTokens(entry.tags ?? [], 4),
      topics: uniqueTokens(entry.topics ?? [], 3),
      keywords: uniqueTokens(entry.keywords ?? [], 6),
    })
  }

  // Ensure every talk has an entry even if the model skipped it.
  for (const talk of talks) {
    if (!entriesByUri.has(talk.uri)) {
      entriesByUri.set(talk.uri, fallbackEntry(talk.uri, talk.title))
    }
  }

  enrichWithEmbeddingNeighbors(entriesByUri, talks, embeddings)

  const output = {
    version: 1,
    generatedAt: new Date().toISOString(),
    models: {
      categorizer: CATEGORIZER_MODEL,
      embeddings: EMBEDDING_MODEL,
    },
    entries: talks.map((talk) => entriesByUri.get(talk.uri)),
  }

  await writeFile(OUTPUT_PATH, `${JSON.stringify(output, null, 2)}\n`, 'utf8')
  console.log(`Wrote taxonomy to ${OUTPUT_PATH}`)
}

main().catch((error) => {
  console.error(error)
  process.exit(1)
})