data/cli/utils/sentences.ts at main

bpev.me / hanzi
fork
this repo has no description
fork
hanzi / data / cli / utils / sentences.ts
at main 90 lines 3.0 kB view raw
wrap content
Ben Pevsner feat: add games 6w ago
07f4fe71
 1/**
 2 * Sentence utilities: reading pre-processed example sentence files and building
 3 * the character-level indexes used when generating subject data.
 4 *
 5 * Sentence files live at data/lang/{userLang}/sentences/{targetLang}.tsv.
 6 */
 7
 8import { distinct } from '@std/collections/distinct'
 9import { parse } from '@std/csv/parse'
10import type { Locale } from '$/enums.ts'
11import { DATA_ROOT } from './fs.ts'
12
/** Indexed view of the example sentences for one user + target language pair. */
export interface Sentences {
  /** Lookup from a sentence's text to its translation in the user's language. */
  bySentence: Record<string, string>
  /** Lookup from a single character to the texts of the sentences that contain it. */
  byChar: Map<string, string[]>
  /**
   * Every distinct sentence text, in the order the sentence file lists them —
   * the file is expected to be curriculum-sorted (simplest first).
   */
  sorted: string[]
}
21
/**
 * Reads the pre-processed sentence TSV for a given user + target language pair
 * from `${DATA_ROOT}lang/{userLang}/sentences/{locale}.tsv`.
 *
 * Only the second column (sentence text) and fourth column (translation) of each
 * row are used. Rows without a sentence (short or malformed lines) are skipped so
 * they cannot produce an `"undefined"` key, and a missing translation is stored
 * as an empty string.
 *
 * @param userLang Language of the translations; selects the `lang/{userLang}` folder.
 * @param locale Target language; selects the `{locale}.tsv` file.
 * @returns `bySentence` (sentence text → translation; the last row wins when a
 * sentence is repeated) and `keys` (distinct sentence texts in file order).
 * Both are empty if the file doesn't exist.
 * @throws Any read error other than "file not found", and any TSV parse error.
 */
export function readSentences(
  userLang: string,
  locale: Locale,
): { bySentence: Record<string, string>; keys: string[] } {
  let text = ''
  try {
    text = Deno.readTextFileSync(`${DATA_ROOT}lang/${userLang}/sentences/${locale}.tsv`)
  } catch (err) {
    // A missing file simply means there are no sentences for this pair; any
    // other failure (permissions, path is a directory, ...) should surface.
    if (err instanceof Deno.errors.NotFound) return { bySentence: {}, keys: [] }
    throw err
  }
  const rows = parse(text, { separator: '\t', lazyQuotes: true })
  const bySentence: Record<string, string> = {}
  const values: string[] = []
  for (const [_id, value, _enId, translation] of rows) {
    // Skip rows with no sentence column instead of indexing them under "undefined".
    if (!value) continue
    bySentence[value] = translation ?? ''
    values.push(value)
  }
  return { bySentence, keys: distinct(values) }
}
46
/** One levelled sentence row, as produced by `readSentencesForGame`. */
export interface SentenceEntry {
  /** Sentence text in the target language (second TSV column). */
  value: string
  /** Translation of the sentence (fourth TSV column). */
  translation: string
  /** Integer level parsed from the optional fifth TSV column. */
  level: number
}
52
/**
 * Reads the pre-processed sentence TSV (with optional level column) for a given
 * user + target language pair, returning only sentences that have a level assigned.
 * Used to generate the sentence game data files.
 *
 * A row is kept only when its sentence (column 2) and translation (column 4) are
 * non-empty and its level (column 5) parses as a decimal integer.
 *
 * @param userLang Language of the translations; selects the `lang/{userLang}` folder.
 * @param locale Target language; selects the `{locale}.tsv` file.
 * @returns The levelled sentences in file order, or `[]` if the file doesn't exist.
 * @throws Any read error other than "file not found", and any TSV parse error.
 */
export function readSentencesForGame(userLang: string, locale: string): SentenceEntry[] {
  let text = ''
  try {
    text = Deno.readTextFileSync(`${DATA_ROOT}lang/${userLang}/sentences/${locale}.tsv`)
  } catch (err) {
    // Only a missing file is treated as "no data"; other read failures propagate.
    if (err instanceof Deno.errors.NotFound) return []
    throw err
  }
  const rows = parse(text, { separator: '\t', lazyQuotes: true })
  const result: SentenceEntry[] = []
  for (const [_id, value, _enId, translation, levelStr] of rows) {
    // Explicit radix: levels are decimal, so a "0x…" cell must not be read as hex.
    // A missing level column yields NaN and the row is dropped below.
    const level = Number.parseInt(levelStr ?? '', 10)
    if (value && translation && !Number.isNaN(level)) {
      result.push({ value, translation, level })
    }
  }
  return result
}
75
/**
 * Loads and indexes sentences for a user + target language pair.
 * Builds `byChar` for fast per-character lookup when generating subject examples.
 *
 * Every character (code point) of a sentence is indexed, and each sentence is
 * listed at most once per character even if the character repeats in it.
 *
 * @param userLang Language of the translations.
 * @param targetLang Target language; passed to `readSentences` as its locale.
 * @returns The translation lookup, the per-character index, and the sentence
 * texts in the order `readSentences` returned them.
 */
export function loadSentences(userLang: string, targetLang: string): Sentences {
  const raw = readSentences(userLang, targetLang as Locale)
  const byChar = new Map<string, string[]>()
  for (const key of raw.keys) {
    // A Set over the string yields its distinct code points, so a sentence with
    // a repeated character is not pushed into that character's list twice.
    for (const char of new Set(key)) {
      let sentences = byChar.get(char)
      if (sentences === undefined) {
        sentences = []
        byChar.set(char, sentences)
      }
      sentences.push(key)
    }
  }
  return { bySentence: raw.bySentence, byChar, sorted: raw.keys }
}
Configure Feed

Configure Feed