data/cli/utils/subjects.ts at main

bpev.me / hanzi
fork
this repo has no description
fork
hanzi / data / cli / utils / subjects.ts
at main 247 lines 9.1 kB view raw
wrap content
Ben Pevsner feat: performance improvements 6w ago
33f9dd9d
  1/**
  2 * Subject utilities: reading/writing compiled subject JSON files, and creating
  3 * new Subject objects from dictionary + curriculum data.
  4 *
  5 * Compiled subject files live at www/static/gen/lang/{userLang}/{targetLang}.json.
  6 */
  7
  8import { distinct } from '@std/collections/distinct'
  9import { dirname } from '@std/path'
 10import stringifyJSON from 'json-stringify-pretty-compact'
 11import { Locale, SubjectType } from '$/enums.ts'
 12import type { Audio, Subject } from '$/models/subjects.ts'
 13import { APP_ROOT } from './fs.ts'
 14import type { Definition } from './dict.ts'
 15import type { Sentences } from './sentences.ts'
 16
// Local aliases for the two subject types this module assigns in createSubject.
const { Character, Vocabulary } = SubjectType
 18
 19// ---------------------------------------------------------------------------
 20// Subject I/O
 21// ---------------------------------------------------------------------------
 22
 23/** Reads compiled subject JSON from `www/static/gen/`. Returns an empty array on error. */
 24export function readSubjects(input: string): Subject[] {
 25  try {
 26    return JSON.parse(Deno.readTextFileSync(APP_ROOT + input))
 27  } catch {
 28    return []
 29  }
 30}
 31
 32/**
 33 * Reads compiled subject JSON and returns a map keyed by `data.slug`.
 34 * Subjects with a missing id or slug are skipped with a warning.
 35 */
 36export function readSubjectsMap(input: string): Record<string, Subject> {
 37  const map: Record<string, Subject> = {}
 38  readSubjects(input).forEach((subject) => {
 39    if (!subject.id || !subject.data?.slug) {
 40      console.warn(
 41        `Skipping subject with missing id/slug in ${input}:`,
 42        JSON.stringify(subject).slice(0, 120),
 43      )
 44      return
 45    }
 46    map[subject.data.slug] = subject
 47  })
 48  return map
 49}
 50
 51/**
 52 * Writes compiled subjects to `www/static/gen/`. Before writing, subjects are:
 53 *   - Filtered to require id, slug, and type (corrupt entries are dropped)
 54 *   - Remapped with a stable property order for consistent diffs
 55 *   - Sorted by level → type (Radical < Character < Vocabulary) → position
 56 */
 57export function writeSubjects(output: string, subjects: Subject[], minify = false): void {
 58  const levelAndPosition = new Set<string>()
 59
 60  const toWrite = subjects
 61    .filter((subject) => {
 62      if (!subject.id || !subject.data?.slug || !subject.data?.type) {
 63        console.warn(
 64          'Dropping invalid subject (missing id/slug/type):',
 65          JSON.stringify(subject).slice(0, 120),
 66        )
 67        return false
 68      }
 69      return true
 70    })
 71    .map((subject) => {
 72      const { data } = subject
 73      const levelPosition = `${data.type}-${data.level}-${data.position}`
 74      if (levelAndPosition.has(levelPosition) && levelPosition !== `${data.type}-0-0`) {
 75        console.warn(`Two subjects at same position ${levelPosition}: ${data.slug}`)
 76      } else {
 77        levelAndPosition.add(levelPosition)
 78      }
 79      // Explicit property order for stable JSON diffs
 80      return {
 81        id: subject.id,
 82        hiddenAt: subject.hiddenAt,
 83        learnCards: subject.learnCards?.length ? subject.learnCards : ['meanings'],
 84        quizCards: subject.quizCards?.length ? subject.quizCards : ['meanings', 'readings'],
 85        data: {
 86          audios: data.audios,
 87          character: data.character,
 88          requiredSubjects: data.requiredSubjects,
 89          examples: data.examples,
 90          level: data.level,
 91          meanings: data.meanings,
 92          meaningHint: data.meaningHint,
 93          meaningMnemonic: data.meaningMnemonic,
 94          position: data.position,
 95          readings: data.readings,
 96          readingHint: data.readingHint,
 97          readingMnemonic: data.readingMnemonic,
 98          slug: data.slug,
 99          srsId: data.srsId,
100          type: data.type,
101        },
102      } as Subject
103    })
104    .sort((a, b) => {
105      if (!a.data.level || !a.data.position) return 1
106      if (!b.data.level || !b.data.position) return -1
107      const levelDiff = a.data.level - b.data.level
108      if (levelDiff) return levelDiff
109      const typePriority: Record<string, number> = { Radical: 0, Character: 1, Vocabulary: 2 }
110      const typeDiff = (typePriority[a.data.type] ?? 0) - (typePriority[b.data.type] ?? 0)
111      if (typeDiff) return typeDiff
112      return a.data.position - b.data.position
113    })
114
115  const outPath = APP_ROOT + output
116  Deno.mkdirSync(dirname(outPath), { recursive: true })
117  Deno.writeTextFileSync(outPath, minify ? JSON.stringify(toWrite) : stringifyJSON(toWrite))
118}
119
120// ---------------------------------------------------------------------------
121// Subject creation
122// ---------------------------------------------------------------------------
123
/**
 * Indexes for fast slug/hans/ja lookups. Built lazily on first call to createSubject.
 * We defer loading so that commands that don't need subject creation (gen-progress,
 * gen-licenses) don't pay the startup cost of reading the dictionary files.
 *
 * Every index stays `null` until `initDicts` has run.
 */
/** Character definitions keyed by their `hant` form. */
let charBySlug: Record<string, Definition> | null = null
/** Character definitions keyed by their `hans` form. */
let charByHans: Record<string, Definition> | null = null
/** Character definitions keyed by their `ja` form; definitions without one are omitted. */
let charByJa: Record<string, Definition> | null = null
/** Vocabulary definitions keyed by their `hant` form. */
let vocabBySlug: Record<string, Definition> | null = null
/** Vocabulary definitions keyed by their `ja` form; definitions without one are omitted. */
let vocabByJa: Record<string, Definition> | null = null
/** Audio index produced by `buildAudioIndex`: locale → id → Audio[]. */
let audioMeta: Record<string, Record<string, Audio[]>> | null = null
135
/**
 * Builds an audio index from a list of audio filenames.
 * Handles both filename formats:
 *   {id}_{locale}_{voiceId}.mp3
 *   {id}_{locale}_{voiceId}_{col}_{reading}.mp3
 *
 * The locale segment has its first `-` replaced by `_` (e.g. `zh-CN` → `zh_CN`)
 * before it is used as a key. Filenames with fewer than three `_`-separated
 * segments, or with an empty locale or voice id, are ignored.
 *
 * @param audioFiles - Audio filenames. Each one is stored unchanged as the entry's `url`.
 * @returns A nested map of locale → id → Audio[]. `reading` is `undefined` when
 *   the filename has no reading segment.
 */
export function buildAudioIndex(audioFiles: string[]): Record<string, Record<string, Audio[]>> {
  const extension = '.mp3'
  const meta: Record<string, Record<string, Audio[]>> = {}
  for (const filename of audioFiles) {
    // Strip only a trailing extension. A plain `replace('.mp3', '')` would remove
    // the first occurrence anywhere in the name and corrupt the parsed segments.
    const stem = filename.endsWith(extension) ? filename.slice(0, -extension.length) : filename
    const parts = stem.split('_')
    if (parts.length < 3) continue
    // The fourth segment ({col}) is not needed for the index.
    const [idStr, localeHyphen, voiceId, , reading] = parts
    const locale = localeHyphen?.replace('-', '_')
    if (idStr === undefined || !locale || !voiceId) continue
    const byId = (meta[locale] ??= {})
    const audios = (byId[idStr] ??= [])
    audios.push({ url: filename, voiceId, reading: reading || undefined })
  }
  return meta
}
157
/**
 * Fills the module-level lookup indexes the first time it is called.
 * Once `charBySlug` is populated, later calls return immediately and their
 * arguments are ignored.
 *
 * @param charDefs - Character definitions to index by `hant`, `hans` and `ja`.
 * @param vocabDefs - Vocabulary definitions to index by `hant` and `ja`.
 * @param audioFiles - Audio filenames handed to `buildAudioIndex`.
 */
function initDicts(
  charDefs: Definition[],
  vocabDefs: Definition[],
  audioFiles: string[],
): void {
  // A populated charBySlug marks the indexes as already built.
  if (charBySlug !== null) return

  // Index by a field every definition carries; later duplicates overwrite earlier ones.
  const keyedBy = (defs: Definition[], field: 'hant' | 'hans'): Record<string, Definition> =>
    Object.fromEntries(defs.map((def) => [def[field], def]))

  // Index by the optional `ja` form, leaving out definitions that have none.
  const keyedByJa = (defs: Definition[]): Record<string, Definition> =>
    Object.fromEntries(defs.filter((def) => def.ja).map((def) => [def.ja!, def]))

  charBySlug = keyedBy(charDefs, 'hant')
  charByHans = keyedBy(charDefs, 'hans')
  charByJa = keyedByJa(charDefs)
  vocabBySlug = keyedBy(vocabDefs, 'hant')
  vocabByJa = keyedByJa(vocabDefs)

  audioMeta = buildAudioIndex(audioFiles)
}
172
/**
 * Chooses which written form of an entry to use for the target language:
 *   - `'ja'` → the `ja` form, or `hant` when `ja` is missing or empty
 *   - `Locale.zh_CN` → `hans`
 *   - anything else → `hant`
 */
function getCharForLocale(targetLang: string, hans: string, hant: string, ja?: string): string {
  switch (targetLang) {
    case 'ja':
      return ja || hant
    case Locale.zh_CN:
      return hans
    default:
      return hant
  }
}
177
/**
 * Creates a new Subject from dictionary and curriculum data.
 * Used when a slug has no existing entry in the output JSON.
 *
 * The first call builds the module-level lookup indexes from `charDefs`,
 * `vocabDefs` and `audioFiles`; later calls reuse those indexes.
 *
 * A slug with more than one Unicode code point is treated as vocabulary and
 * looked up by `hant`, then `ja`. A single-code-point slug is treated as a
 * character and looked up by `hant`, then `hans`, then `ja`.
 *
 * @param slug - The character or word to create a subject for.
 * @param level - Level stored on the subject.
 * @param position - Position stored on the subject.
 * @param targetLang - Target language; selects the displayed character form,
 *   the sentence form, and the locale key used in the audio index.
 * @param charMeanings - Character meanings keyed by definition id, `;`-separated.
 * @param vocabMeanings - Vocabulary meanings keyed by definition id, `;`-separated.
 * @param sentences - Example sentence index (`byChar`) and translations (`bySentence`).
 * @param charDefs - All character definitions (from lang/characters.tsv)
 * @param vocabDefs - All vocabulary definitions (from lang/vocabulary.tsv)
 * @param audioFiles - List of existing audio filenames (from listAudioFiles)
 * @returns The new Subject. When no dictionary entry matches the slug, an error
 *   is logged and a placeholder `{ data: {} }` without an id is returned.
 */
export function createSubject(
  slug: string,
  level: number,
  position: number,
  targetLang: string,
  charMeanings: Record<string, string>,
  vocabMeanings: Record<string, string>,
  sentences: Sentences,
  charDefs: Definition[],
  vocabDefs: Definition[],
  audioFiles: string[],
): Subject {
  initDicts(charDefs, vocabDefs, audioFiles)

  // Split by code point, not UTF-16 code unit. Supplementary-plane Han characters
  // have `length` 2, so `slug.length` / `slug.split('')` would classify a single
  // such character as vocabulary and break it into surrogate halves that never
  // match a dictionary key.
  const slugChars = Array.from(slug)
  const isVocab = slugChars.length > 1
  const dictEntry = isVocab
    ? (vocabBySlug![slug] || vocabByJa![slug])
    : (charBySlug![slug] || charByHans![slug] || charByJa![slug])

  if (!dictEntry) {
    console.error(`No valid dictionary entry for slug: ${slug}`)
    return { data: {} } as Subject
  }

  const { id, hans, hant, ja } = dictEntry
  const en = isVocab ? (vocabMeanings[id] || '') : (charMeanings[id] || '')
  const character = getCharForLocale(targetLang, hans, hant, ja)
  // Sentences are matched on the hans or hant form only, even for other target languages.
  const charForSentences = targetLang === Locale.zh_CN ? hans : hant
  // A single character's own id must not be listed as one of its requirements.
  const selfCharId = charBySlug![slug]?.id

  return {
    id,
    learnCards: ['meanings'],
    quizCards: ['meanings', 'readings'],
    data: {
      audios: audioMeta![targetLang]?.[id] ?? [],
      character,
      // Up to three example sentences. Words use the sentences indexed under their
      // first character, narrowed to those containing the whole word.
      // NOTE(review): this lookup still works on UTF-16 code units; whether
      // `sentences.byChar` is keyed by code unit or code point is not visible
      // here, so it is left unchanged — confirm before adjusting.
      examples: (
        charForSentences.length === 1
          ? (sentences.byChar.get(charForSentences) ?? [])
          : (sentences.byChar.get(charForSentences[0]) ?? []).filter((key) =>
            key.includes(charForSentences)
          )
      )
        .slice(0, 3)
        .map((value) => ({ value, translation: sentences.bySentence[value] })),
      level,
      // The first `;`-separated meaning is primary. A missing meaning yields a
      // single entry with an empty value.
      meanings: en.split(';').map((def, i) => ({
        value: def.trim(),
        isPrimary: i === 0,
        isAcceptedAnswer: true,
      })),
      position,
      readings: [],
      // Ids of the slug's component characters found in the `hant` index,
      // de-duplicated, with unknown characters and the subject itself removed.
      requiredSubjects: distinct(
        slugChars.map((c) => charBySlug![c]?.id ?? ''),
      ).filter((reqId) => reqId && reqId !== selfCharId),
      slug,
      // NOTE(review): the meaning of the srsId values is not visible in this file.
      srsId: level > 2 ? 1 : 2,
      type: isVocab ? Vocabulary : Character,
    },
  } as Subject
}
Configure Feed

Configure Feed