Converts Pleco .xml export into an Anki-importable .tsv file
0
pleco_to_anki.ts
214 lines 7.5 kB view raw
/**
 * Pleco to Anki
 * @description Converts Pleco .xml export into an Anki-importable .tsv file
 *
 * @reference
 * `convert` Adapted From Quizlet's: https://github.com/quizlet/pinyin-converter
 * by David Chanin and Jen Liu @quizlet 2013
 *
 * @example deno run --allow-read --allow-write main.ts [-infile] [-outdir]
 * @example deno run --allow-read --allow-write main.ts ../flash-2308211824.xml ../results
 */
import { parse } from "https://deno.land/x/xml/mod.ts";
import { dirname, join } from "https://deno.land/std/path/mod.ts";
import { ensureDir } from "https://deno.land/std@0.199.0/fs/ensure_dir.ts";
// NOTE(review): this file used to also import `AnkiCardData`, `PlecoCard`,
// `PlecoResponse` (from "./types.ts") and `convert` (from
// "./pinyin_converter.ts"). All four names are declared below, so those imports
// were duplicate-identifier errors and have been dropped in favour of the local
// definitions.

/** Shape of the object produced by parsing a Pleco flashcard export. */
interface PlecoResponse {
  xml: {
    "@version": number;
    "@encoding": string;
  };
  plecoflash: {
    "@formatversion": number;
    "@creator": string;
    "@generator": string;
    "@platform": string;
    "@created": number;
    categories: {
      category: PlecoCategory | PlecoCategory[];
    };
    // A lone <card> is presumably parsed as a single object rather than a
    // one-element array, the same way the script already treats `catassign`.
    cards?: {
      card?: PlecoCard | PlecoCard[];
    } | null;
  };
}

interface PlecoCategory {
  "@name": string;
  "#text": string | null;
}

interface Catassign {
  "@category": string;
  "#text": string | null;
}

interface PlecoHeadword {
  "@charset": "sc" | "tc";
  "#text": string; // chinese str
}

interface PlecoCard {
  "@language": string;
  entry: {
    headword: PlecoHeadword | PlecoHeadword[];
    pron?: {
      "@type": string;
      "@tones": string;
      "#text": string;
    };
    defn: string;
  };
  dictref: {
    "@dictid": string;
    "@entryid": number;
    text: string | null;
  };
  catassign?: Catassign | Catassign[];
}

/** One row of the generated .tsv file. */
interface AnkiCardData {
  pinyin?: string;
  definition?: string;
  simplifiedText?: string;
  traditionalText?: string;
}

/** Matches one pinyin syllable (optionally with erhua "r") followed by a tone digit 1-5. */
const pinyinRegex =
  /(shuang|chuang|zhuang|xiang|qiong|shuai|niang|guang|sheng|kuang|shang|jiong|huang|jiang|shuan|xiong|zhang|zheng|zhong|zhuai|zhuan|qiang|chang|liang|chuan|cheng|chong|chuai|hang|peng|chuo|piao|pian|chua|ping|yang|pang|chui|chun|chen|chan|chou|chao|chai|zhun|mang|meng|weng|shai|shei|miao|zhui|mian|yong|ming|wang|zhuo|zhua|shao|yuan|bing|zhen|fang|feng|zhan|zhou|zhao|zhei|zhai|rang|suan|reng|song|seng|dang|deng|dong|xuan|sang|rong|duan|cuan|cong|ceng|cang|diao|ruan|dian|ding|shou|xing|zuan|jiao|zong|zeng|zang|jian|tang|teng|tong|bian|biao|shan|tuan|huan|xian|huai|tiao|tian|hong|xiao|heng|ying|jing|shen|beng|kuan|kuai|nang|neng|nong|juan|kong|nuan|keng|kang|shua|niao|guan|nian|ting|shuo|guai|ning|quan|qiao|shui|gong|geng|gang|qian|bang|lang|leng|long|qing|ling|luan|shun|lian|liao|zhi|lia|liu|qin|lun|lin|luo|lan|lou|qiu|gai|gei|gao|gou|gan|gen|lao|lei|lai|que|gua|guo|nin|gui|niu|nie|gun|qie|qia|jun|kai|kei|kao|kou|kan|ken|qun|nun|nuo|xia|kua|kuo|nen|kui|nan|nou|kun|jue|nao|nei|hai|hei|hao|hou|han|hen|nai|rou|xiu|jin|hua|huo|tie|hui|tun|tui|hun|tuo|tan|jiu|zai|zei|zao|zou|zan|zen|eng|tou|tao|tei|tai|zuo|zui|xin|zun|jie|jia|run|diu|cai|cao|cou|can|cen|die|dia|xue|rui|cuo|cui|dun|cun|cin|ruo|rua|dui|sai|sao|sou|san|sen|duo|den|dan|dou|suo|sui|dao|sun|dei|zha|zhe|dai|xun|ang|ong|wai|fen|fan|fou|fei|zhu|wei|wan|min|miu|mie|wen|men|lie|chi|cha|che|man|mou|mao|mei|mai|yao|you|yan|chu|pin|pie|yin|pen|pan|pou|pao|shi|sha|she|pei|pai|yue|bin|bie|yun|nüe|lve|shu|ben|ban|bao|bei|bai|lüe|nve|ren|ran|rao|xie|re|ri|si|su|se|ru|sa|cu|ce|ca|ji|ci|zi|zu|ze|za|hu|he|ha|ju|ku|ke|qi|ka|gu|ge|ga|li|lu|le|qu|la|ni|xi|nu|ne|na|ti|tu|te|ta|xu|di|du|de|bo|lv|ba|ai|ei|ao|ou|an|en|er|da|wu|wa|wo|fu|fo|fa|nv|mi|mu|yi|ya|ye|me|mo|ma|pi|pu|po|yu|pa|bi|nü|bu|lü|e|o|a)r?[1-5]/gi;

/** Column index into each `pinyin[tone]` row for a starred vowel. */
const vowels: { [vowel: string]: number } = {
  "a*": 0,
  "e*": 1,
  "i*": 2,
  "o*": 3,
  "u*": 4,
  "ü*": 5,
  "A*": 6,
  "E*": 7,
  "I*": 8,
  "O*": 9,
  "U*": 10,
  "Ü*": 11,
};

/** Accented vowels per tone number, in the column order defined by `vowels`. */
const pinyin: { [tone: number]: string[] } = {
  [1]: ["ā", "ē", "ī", "ō", "ū", "ǖ", "Ā", "Ē", "Ī", "Ō", "Ū", "Ǖ"],
  [2]: ["á", "é", "í", "ó", "ú", "ǘ", "Á", "É", "Í", "Ó", "Ú", "Ǘ"],
  [3]: ["ǎ", "ě", "ǐ", "ǒ", "ǔ", "ǚ", "Ǎ", "Ě", "Ǐ", "Ǒ", "Ǔ", "Ǚ"],
  [4]: ["à", "è", "ì", "ò", "ù", "ǜ", "À", "È", "Ì", "Ò", "Ù", "Ǜ"],
  [5]: ["a", "e", "i", "o", "u", "ü", "A", "E", "I", "O", "U", "Ü"],
};

// Vowel groups with a "*" placed after the vowel that carries the tone mark.
const stars = "a*i a*o e*i ia* ia*o ie* io* iu* " +
  "A*I A*O E*I IA* IA*O IE* IO* IU* " +
  "o*u ua* ua*i ue* ui* uo* üe* " +
  "O*U UA* UA*I UE* UI* UO* ÜE* " +
  "A* E* I* O* U* Ü* " +
  "a* e* i* o* u* ü*";

/**
 * `[base, starred]` pairs derived from `stars`, e.g. `["ao", "a*o"]`.
 * Kept as an array because lookup order matters: the first base found in a
 * syllable wins, so vowel groups must be tried before single vowels.
 */
const accentPatterns: ReadonlyArray<readonly [string, string]> = stars
  .split(" ")
  .map((starred) => [starred.replace(/\*/g, ""), starred] as const);

/**
 * Replaces every numbered pinyin syllable in `str` with its tone-marked form.
 *
 * @param str Text containing numbered pinyin such as `"ni3hao3"`.
 * @returns `str` with each matched syllable accented; other text is untouched.
 */
function convert(str: string): string {
  return str.replace(pinyinRegex, getReplacement);
}

/**
 * Converts a single numbered syllable (e.g. `"hao3"`) to its accented form.
 *
 * @param match A syllable ending in a tone digit, as matched by `pinyinRegex`.
 * @returns The syllable without the digit and with the tone mark applied; if
 *   no vowel pattern applies, the bare syllable is returned.
 */
function getReplacement(match: string): string {
  const tone = Number(match.slice(-1));
  // "v" is the usual keyboard stand-in for "ü".
  const word = match.slice(0, -1).replace("v", "ü").replace("V", "Ü");
  const accents = pinyin[tone];
  if (!accents) return word;

  for (const [base, starred] of accentPatterns) {
    if (!word.includes(base)) continue;
    const vowelChar = starred.match(/.\*/)?.[0];
    if (vowelChar === undefined) continue;
    // Compare against undefined explicitly: index 0 ("a*") is a valid column
    // and a truthiness check would skip every syllable accented on "a".
    const vowelNum = vowels[vowelChar];
    if (vowelNum === undefined) continue;
    const accented = accents[vowelNum];
    if (accented === undefined) continue;
    return word.replace(base, starred).replace(vowelChar, accented);
  }
  return word;
}

/** Wraps a value that may be absent, a single item, or an array into an array. */
function toArray<T>(value: T | T[] | null | undefined): T[] {
  if (value == null) return [];
  return Array.isArray(value) ? value : [value];
}

/**
 * Makes a field safe for a TSV cell: tabs become spaces and line breaks
 * become `<br/>`.
 */
function format(str: string): string {
  return str.replace(/\t/g, " ").replace(/\r?\n/g, " <br/> ");
}

/**
 * Serialises cards into TSV text with a header row.
 *
 * @param cards Cards to write; missing fields are emitted as empty cells.
 * @returns Header line plus one line per card, each terminated by `\n`.
 */
function cardsToTSV(cards: AnkiCardData[]): string {
  const header = "traditionalText\tsimplifiedText\tpinyin\tdefinition";
  const rows = cards.map((card) =>
    [card.traditionalText, card.simplifiedText, card.pinyin, card.definition]
      .map((field) => format(field ?? ""))
      .join("\t")
  );
  return [header, ...rows].map((line) => `${line}\n`).join("");
}

// Cards without any <catassign> would otherwise be lost (or crash the export).
const UNCATEGORIZED = "uncategorized";

const [inFile, outDir = "./"] = Deno.args;
if (!inFile) {
  console.error(
    "usage: deno run --allow-read --allow-write main.ts <infile> [outdir]",
  );
  Deno.exit(1);
}

const text = await Deno.readTextFile(inFile);

// deno-lint-ignore no-explicit-any
const xml: PlecoResponse = parse(text) as any;
if (!xml.plecoflash) {
  throw new Error(`${inFile} does not contain a <plecoflash> root element`);
}
const cards = toArray(xml.plecoflash.cards?.card);

// A Map avoids collisions between category names and Object.prototype keys.
const categoryExports = new Map<string, AnkiCardData[]>();

for (const card of cards) {
  const { headword, pron, defn: definition } = card.entry;
  const headwords = toArray(headword);
  const numberedPinyin = pron?.["#text"];
  const data: AnkiCardData = {
    pinyin: numberedPinyin == null ? undefined : convert(numberedPinyin),
    definition,
    simplifiedText: headwords.find((h) => h["@charset"] === "sc")?.["#text"],
    traditionalText: headwords.find((h) => h["@charset"] === "tc")?.["#text"],
  };

  const categories = toArray(card.catassign)
    .map((catassign) => catassign["@category"]);
  if (categories.length === 0) categories.push(UNCATEGORIZED);

  for (const category of categories) {
    const bucket = categoryExports.get(category);
    if (bucket) {
      bucket.push(data);
    } else {
      categoryExports.set(category, [data]);
    }
  }
}

// Category names may contain "/" and then map to nested output directories.
await Promise.all(
  [...categoryExports].map(async ([categoryName, categoryCards]) => {
    const outFile = join(outDir, `${categoryName}.tsv`);
    await ensureDir(dirname(outFile));
    await Deno.writeTextFile(outFile, cardsToTSV(categoryCards));
  }),
);
console.log("complete");