this repo has no description
1/**
2 * Sentence utilities: reading pre-processed example sentence files and building
3 * the character-level indexes used when generating subject data.
4 *
5 * Sentence files live at data/lang/{userLang}/sentences/{targetLang}.tsv.
6 */
7
8import { distinct } from '@std/collections/distinct'
9import { parse } from '@std/csv/parse'
10import type { Locale } from '$/enums.ts'
11import { DATA_ROOT } from './fs.ts'
12
/** Indexed view of one sentence file, in the shape returned by `loadSentences`. */
export interface Sentences {
  /** Maps sentence text → user-language translation. */
  bySentence: Record<string, string>
  /** Maps each individual character → all sentence texts that contain it. */
  byChar: Map<string, string[]>
  /**
   * All sentence texts, in curriculum-sorted order (simplest first).
   * NOTE(review): `loadSentences` passes the file's row order through without
   * sorting, so this presumably relies on the TSV being pre-sorted — confirm
   * against the preprocessing step.
   */
  sorted: string[]
}
21
/**
 * Reads the pre-processed sentence TSV for a given user + target language pair.
 *
 * Rows are read as `id, sentence, enId, translation[, ...]`; only the sentence
 * text and its translation are used. Rows without sentence text are skipped.
 *
 * @param userLang Names the `lang/{userLang}` directory under `DATA_ROOT`.
 * @param locale Names the `{locale}.tsv` file inside that directory's `sentences/` folder.
 * @returns `bySentence` (sentence text → translation; when a sentence repeats,
 *   the last row wins) and `keys` (the distinct sentence texts). Both are empty
 *   when the file cannot be read, e.g. because it does not exist.
 */
export function readSentences(
  userLang: string,
  locale: Locale,
): { bySentence: Record<string, string>; keys: string[] } {
  let text = ''
  try {
    text = Deno.readTextFileSync(`${DATA_ROOT}lang/${userLang}/sentences/${locale}.tsv`)
  } catch {
    // Best effort: any read failure is treated as "no sentences available".
    return { bySentence: {}, keys: [] }
  }
  const rows = parse(text, { separator: '\t', lazyQuotes: true })
  const bySentence: Record<string, string> = {}
  const values: string[] = []
  for (const [_id, value, _enId, translation] of rows) {
    // No column count is enforced on the parser, so a row may be short. Skip
    // rows with no sentence text instead of indexing them under "undefined".
    if (!value) continue
    // Keep the declared `string` value type even when the translation column is absent.
    bySentence[value] = translation ?? ''
    values.push(value)
  }
  return { bySentence, keys: distinct(values) }
}
46
/** One level-tagged sentence row, in the shape returned by `readSentencesForGame`. */
export interface SentenceEntry {
  /** Sentence text (second TSV column). */
  value: string
  /** Translation of the sentence (fourth TSV column). */
  translation: string
  /** Level parsed as an integer from the fifth TSV column. */
  level: number
}
52
/**
 * Collects the level-tagged sentences for a user + target language pair; the
 * result is used to generate the sentence game data files.
 *
 * Reads `lang/{userLang}/sentences/{locale}.tsv` under `DATA_ROOT`, taking each
 * row as `id, sentence, enId, translation, level`. A row is kept only when its
 * sentence and translation are non-empty and its level column parses as an
 * integer.
 *
 * @returns The kept rows in file order, or `[]` when the file cannot be read.
 */
export function readSentencesForGame(userLang: string, locale: string): SentenceEntry[] {
  let contents: string
  try {
    contents = Deno.readTextFileSync(`${DATA_ROOT}lang/${userLang}/sentences/${locale}.tsv`)
  } catch {
    return []
  }

  const entries: SentenceEntry[] = []
  for (const columns of parse(contents, { separator: '\t', lazyQuotes: true })) {
    const value = columns[1]
    const translation = columns[3]
    // The level column is optional; a missing one parses to NaN and drops the row.
    const level = Number.parseInt(columns[4] ?? '')
    if (!value || !translation || Number.isNaN(level)) continue
    entries.push({ value, translation, level })
  }
  return entries
}
75
/**
 * Loads and indexes sentences for a user + target language pair.
 * Builds `byChar` for fast per-character lookup when generating subject examples.
 *
 * @param userLang Language the translations are written in; forwarded to `readSentences`.
 * @param targetLang Language of the sentences; forwarded to `readSentences` as its locale.
 * @returns The translation map and sentence list from `readSentences`, plus
 *   `byChar`, which lists each sentence once under every distinct character
 *   (Unicode code point) it contains.
 */
export function loadSentences(userLang: string, targetLang: string): Sentences {
  const raw = readSentences(userLang, targetLang as Locale)
  const byChar = new Map<string, string[]>()
  for (const key of raw.keys) {
    // Walk the distinct characters only, so a sentence that repeats a character
    // is not pushed into that character's list more than once.
    for (const char of new Set(key)) {
      const bucket = byChar.get(char)
      if (bucket) {
        bucket.push(key)
      } else {
        byChar.set(char, [key])
      }
    }
  }
  return { bySentence: raw.bySentence, byChar, sorted: raw.keys }
}
90}