feat: add zhuyin audio · bpev.me/hanzi@52d7da8

+1

.gitignore

··· 2 2 .env 3 3 node_modules 4 4 5 + .tmp 5 6 coverage 6 7 dist 7 8 gen

+243

data/cli/commands/games/gen_audio_zhuyin.ts

/**
 * Generates TTS audio sprite files for the zhuyin (Bopomofo) game.
 *
 * For each flashcard TSV in data/games/zhuyin/flashcards/, reads cards in order,
 * synthesizes via Azure zh-TW voice (using hant characters for words so they're
 * read naturally, and raw zhuyin for bare phonemes), splits by silence, pads each
 * clip to CLIP_LENGTH_MS, and concatenates into a sprite MP3 at
 * data/games/zhuyin/audio/{filename}.mp3.
 *
 * CLIP_LENGTH_MS must match the constant in www/utils/games/zhuyin.ts.
 *
 * Requirements:
 *   - AZURE_SPEECH_KEY and AZURE_SPEECH_REGION in .env
 *   - ffmpeg on $PATH
 *
 * Usage:
 *   hanzi games gen-audio:zhuyin
 *   hanzi games gen-audio:zhuyin --force
 *
 * Exits with status 1 if any sprite could not be generated, 0 otherwise.
 */

import { Command } from '@cliffy/command'
import { load } from '@std/dotenv'
import { ensureDir } from '@std/fs'
import { join, basename } from '@std/path'
import { parse as parseTsv } from '@std/csv/parse'

const VOICE_ID = 'zh-TW-YunJheNeural'
const LOCALE = 'zh-TW'
// Must match CLIP_LENGTH_MS in www/utils/games/zhuyin.ts
const CLIP_LENGTH_MS = 1500
// Break inserted between items in SSML
const SILENCE_BETWEEN_S = 1
// Minimum silence duration for ffmpeg silencedetect — must be < SILENCE_BETWEEN_S
const SILENCE_DETECT_D = 0.5
// Each clip starts this long before the detected end of the preceding silence,
// so the onset of the next item is not cut off.
const CLIP_PREROLL_S = 0.1
const FLASHCARDS_DIR = 'data/games/zhuyin/flashcards'
const AUDIO_DIR = 'data/games/zhuyin/audio'
const TEMP_DIR = 'data/games/zhuyin/.tmp'

interface ZhuyinCard {
	zhuyin: string
	hant: string
}

interface SilenceGap {
	startMS: number
	endMS: number
}

/** Reads cards from a flashcard TSV. Returns zhuyin + hant (may be empty) per card. */
function readFlashcardTsv(tsvPath: string): ZhuyinCard[] {
	const text = Deno.readTextFileSync(tsvPath)
	const rows = parseTsv(text, { separator: '\t', skipFirstRow: true }) as Record<string, string>[]
	return rows
		.map((r) => ({ zhuyin: r['zhuyin']?.trim() ?? '', hant: r['hant']?.trim() ?? '' }))
		// Rows without a zhuyin value are not cards
		.filter((c) => c.zhuyin)
}

/**
 * Returns the TTS text for a card.
 * Uses the hant character for words so Azure reads them naturally.
 * Falls back to zhuyin for bare phonemes (no hant, or whitespace-only hant —
 * readFlashcardTsv has already trimmed both fields).
 */
function ttsText(card: ZhuyinCard): string {
	return card.hant || card.zhuyin
}

/** Escapes the five XML special characters so arbitrary card text is safe inside SSML. */
function escapeXml(text: string): string {
	return text
		.replace(/&/g, '&amp;')
		.replace(/</g, '&lt;')
		.replace(/>/g, '&gt;')
		.replace(/"/g, '&quot;')
		.replace(/'/g, '&apos;')
}

/**
 * Calls Azure TTS with a short break between items.
 *
 * @param texts Items to synthesize, in order; each is XML-escaped before being placed in the SSML.
 * @param env Parsed .env values; must contain AZURE_SPEECH_KEY and AZURE_SPEECH_REGION.
 * @returns Raw MP3 bytes, or null on error (missing credentials, network failure, or HTTP error).
 */
async function azureTts(
	texts: string[],
	env: Record<string, string>,
): Promise<Uint8Array | null> {
	const region = env['AZURE_SPEECH_REGION']
	const key = env['AZURE_SPEECH_KEY']
	// Record<string, string> hides the fact that either key may be absent at runtime
	if (!region || !key) {
		console.warn('AZURE_SPEECH_KEY and AZURE_SPEECH_REGION must be set in .env')
		return null
	}

	const url = `https://${region}.tts.speech.microsoft.com/cognitiveservices/v1`
	const items = texts.map(escapeXml).join(`, <break time="${SILENCE_BETWEEN_S}s"/> `)
	const ssml = `
		<speak version='1.0' xml:lang='${LOCALE}'>
			<voice name='${VOICE_ID}' xml:lang='${LOCALE}'>
				<prosody rate="-20.00%">
					${items}
				</prosody>
			</voice>
		</speak>
	`

	try {
		const response = await fetch(url, {
			method: 'POST',
			headers: {
				'Ocp-Apim-Subscription-Key': key,
				'Content-Type': 'application/ssml+xml',
				'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3',
				'User-Agent': 'curl',
			},
			body: ssml,
		})
		if (!response.ok) {
			console.warn(`Azure error ${response.status}:`, await response.text())
			return null
		}
		return new Uint8Array(await response.arrayBuffer())
	} catch (err) {
		// fetch rejects on DNS/connection failures; honour the "null on error" contract
		console.warn('Azure request failed:', err instanceof Error ? err.message : err)
		return null
	}
}

/** Runs ffmpeg with the given args, capturing output. Returns the exit status and decoded stderr. */
async function runFfmpeg(args: string[]): Promise<{ success: boolean; stderr: string }> {
	const { success, stderr } = await new Deno.Command('ffmpeg', {
		stdout: 'piped',
		stderr: 'piped',
		args,
	}).output()
	return { success, stderr: new TextDecoder().decode(stderr) }
}

/**
 * Parses ffmpeg silencedetect log output into silence gaps, in order.
 * Stops after maxGaps entries so trailing silence is ignored.
 */
function parseSilenceGaps(detected: string, maxGaps: number): SilenceGap[] {
	const gaps: SilenceGap[] = []
	const re = /silence_start: ([\w.]+)[\s\S]+?silence_end: ([\w.]+)/g
	let m = re.exec(detected)
	while (m && gaps.length < maxGaps) {
		gaps.push({
			startMS: Math.round(1000 * parseFloat(m[1])),
			endMS: Math.round(1000 * (parseFloat(m[2]) - CLIP_PREROLL_S)),
		})
		m = re.exec(detected)
	}
	return gaps
}

/**
 * Splits a combined TTS audio file by silence, pads each clip to CLIP_LENGTH_MS,
 * and concatenates them into a single sprite file at outputPath.
 *
 * Azure may add trailing silence creating an extra boundary — we cap at
 * count - 1 gaps so any trailing silence is absorbed into the last clip.
 *
 * @param sourceFile Path to the combined TTS MP3.
 * @param count Number of items expected in sourceFile.
 * @param outputPath Destination of the sprite MP3.
 * @param tempDir Directory for intermediate clips and the concat list.
 * @returns true if the sprite was written; false if any ffmpeg step failed or
 *   too few silence gaps were found to separate `count` items.
 */
async function writeSpriteFile(
	sourceFile: string,
	count: number,
	outputPath: string,
	tempDir: string,
): Promise<boolean> {
	const detect = await runFfmpeg([
		'-i', sourceFile, '-af', `silencedetect=noise=-40dB:d=${SILENCE_DETECT_D}`, '-f', 'null', '-',
	])
	if (!detect.success) {
		console.error('  ffmpeg silencedetect failed:', detect.stderr)
		return false
	}

	// Collect silence gaps; cap at count - 1 to ignore trailing silence from Azure
	const gaps = parseSilenceGaps(detect.stderr, count - 1)
	if (gaps.length < count - 1) {
		// Without enough boundaries every remaining clip would repeat the tail of the
		// audio, producing a sprite whose offsets do not line up with the cards.
		console.error(
			`  expected ${count - 1} silence gap(s) but found ${gaps.length}; not writing ${outputPath}`,
		)
		return false
	}

	// Extract each speech clip and pad to CLIP_LENGTH_MS
	// Use bare filenames so the concat list (in the same dir) resolves them correctly
	const clipNames: string[] = []
	let clipStartMS = 0

	for (let i = 0; i < count; i++) {
		const rawName = `raw_${i}.mp3`
		const paddedName = `padded_${i}.mp3`
		const rawPath = join(tempDir, rawName)
		const paddedPath = join(tempDir, paddedName)
		// Undefined for the last clip, which runs to the end of the source
		const gap = gaps[i]

		const extract = await runFfmpeg([
			'-y', '-ss', `${clipStartMS}ms`,
			...(gap ? ['-t', `${gap.startMS - clipStartMS}ms`] : []),
			'-i', sourceFile, '-c:a', 'copy', rawPath,
		])
		if (!extract.success) {
			console.error(`  ffmpeg extract failed for clip ${i}:`, extract.stderr)
			return false
		}

		// apad appends silence; -t then cuts the result to exactly CLIP_LENGTH_MS
		const pad = await runFfmpeg([
			'-y', '-i', rawPath, '-af', 'apad', '-t', `${CLIP_LENGTH_MS / 1000}`, paddedPath,
		])
		if (!pad.success) {
			console.error(`  ffmpeg pad failed for clip ${i}:`, pad.stderr)
			return false
		}

		clipNames.push(paddedName)
		if (gap) clipStartMS = gap.endMS
	}

	// Concat list uses bare filenames — ffmpeg resolves them relative to the list's directory
	const concatListPath = join(tempDir, 'concat.txt')
	await Deno.writeTextFile(concatListPath, clipNames.map((n) => `file '${n}'`).join('\n'))

	const concat = await runFfmpeg([
		'-y', '-f', 'concat', '-safe', '0', '-i', concatListPath,
		'-acodec', 'libmp3lame', '-q:a', '2', outputPath,
	])
	if (!concat.success) {
		console.error('  ffmpeg concat failed:', concat.stderr)
		return false
	}

	console.log(`  wrote ${outputPath} (${clipNames.length} clips × ${CLIP_LENGTH_MS}ms)`)
	return true
}

/** Returns whether a file exists at path. Errors other than NotFound are rethrown. */
async function fileExists(path: string): Promise<boolean> {
	try {
		await Deno.stat(path)
		return true
	} catch (err) {
		if (err instanceof Deno.errors.NotFound) return false
		throw err
	}
}

/**
 * CLI command: generates one sprite MP3 per flashcard TSV.
 * Existing outputs are skipped unless --force is given. Processing continues
 * past a failing file; the process exit code reports whether any file failed.
 */
export const genAudioZhuyinCmd = new Command()
	.description(
		'Generate TTS audio sprite files for the zhuyin game. ' +
		'Reads flashcard TSVs, synthesizes via Azure zh-TW voice (hant for words, ' +
		'zhuyin for bare phonemes), splits by silence, pads each clip to CLIP_LENGTH_MS, ' +
		'and writes sprite MP3s to data/games/zhuyin/audio/. ' +
		'Requires AZURE_SPEECH_KEY and AZURE_SPEECH_REGION in .env, and ffmpeg on $PATH.',
	)
	.option('--force', 'Regenerate audio even if the output file already exists.')
	.action(async ({ force }) => {
		const env = await load()

		await ensureDir(AUDIO_DIR)
		await ensureDir(TEMP_DIR)

		const entries: Deno.DirEntry[] = []
		for await (const entry of Deno.readDir(FLASHCARDS_DIR)) {
			if (entry.isFile && entry.name.endsWith('.tsv')) entries.push(entry)
		}
		// readDir order is not guaranteed; sort for stable progress output
		entries.sort((a, b) => a.name.localeCompare(b.name))

		let processed = 0
		let failed = 0

		for (const [i, entry] of entries.entries()) {
			const filename = basename(entry.name, '.tsv')
			const outputPath = join(AUDIO_DIR, `${filename}.mp3`)

			if (!force && await fileExists(outputPath)) {
				console.log(`skipping (exists): ${filename}.mp3`)
				continue
			}

			const cards = readFlashcardTsv(join(FLASHCARDS_DIR, entry.name))
			if (!cards.length) {
				console.warn(`no cards found in ${entry.name}, skipping`)
				continue
			}

			const texts = cards.map(ttsText)
			console.log(`\n[${i + 1}/${entries.length}] ${entry.name} — ${cards.length} items`)
			console.log(`  tts: ${texts.join(', ')}`)

			const bytes = await azureTts(texts, env)
			if (!bytes) {
				console.warn(`  Azure TTS failed for ${entry.name}`)
				failed++
				continue
			}

			const sourcePath = join(TEMP_DIR, `${filename}.mp3`)
			await Deno.writeFile(sourcePath, bytes)

			// Only count sprites that were actually written
			if (await writeSpriteFile(sourcePath, cards.length, outputPath, TEMP_DIR)) {
				processed++
			} else {
				failed++
			}
		}

		console.log(`\nDone — ${processed} sprite(s) generated.`)
		if (failed > 0) console.error(`${failed} file(s) failed.`)
		Deno.exit(failed > 0 ? 1 : 0)
	})

+3 -1

data/cli/main.ts

··· 32 32 import { renameAudioCmd } from './commands/studio/rename_audio.ts' 33 33 import { audioMatrixCmd } from './commands/games/audio_matrix.ts' 34 34 import { buildGamesCmd } from './commands/games/build.ts' 35 + import { genAudioZhuyinCmd } from './commands/games/gen_audio_zhuyin.ts' 35 36 36 37 const studioCmd = new Command() 37 38 .description( ··· 74 75 .command('games', new Command() 75 76 .description('Tools for managing game data assets.') 76 77 .command('build', buildGamesCmd) 77 - .command('audio-matrix', audioMatrixCmd), 78 + .command('audio-matrix', audioMatrixCmd) 79 + .command('gen-audio:zhuyin', genAudioZhuyinCmd), 78 80 ) 79 81 .parse(Deno.args)

data/games/zhuyin/audio/01_ㄅㄆㄇㄚㄧ.mp3