// pleco_to_anki.ts — Converts Pleco .xml export into an Anki-importable .tsv file
/**
 * Pleco to Anki
 * @description Converts a Pleco .xml export into Anki-importable .tsv files
 * (one file per flashcard category).
 *
 * @reference
 * `convert` is adapted from Quizlet's pinyin-converter:
 * https://github.com/quizlet/pinyin-converter
 * by David Chanin and Jen Liu @quizlet 2013
 *
 * @example deno run --allow-read --allow-write main.ts <infile> [outdir]
 * @example deno run --allow-read --allow-write main.ts ../flash-2308211824.xml ../results
 */
12import type { AnkiCardData, PlecoCard, PlecoResponse } from "./types.ts";
13import { parse } from "https://deno.land/x/xml/mod.ts";
14import { join } from "https://deno.land/std/path/mod.ts";
15import { ensureDir } from "https://deno.land/std@0.199.0/fs/ensure_dir.ts";
16import { convert } from "./pinyin_converter.ts";
17
/**
 * Shape this script expects from parsing a Pleco flashcard XML export.
 *
 * Keys beginning with `@` appear to mirror XML attributes (assumed from the
 * XML parser's output convention — confirm against the parser in use).
 */
interface PlecoResponse {
  /** The `<?xml ...?>` declaration. */
  xml: {
    "@version": number;
    // Was the literal type `"string"`, which only accepts the exact text
    // "string"; real declarations carry values such as "UTF-8".
    "@encoding": string;
  };
  /** Root element holding the category list and the flashcards. */
  plecoflash: {
    "@formatversion": number;
    "@creator": string;
    "@generator": string;
    "@platform": string;
    "@created": number;
    categories: {
      category: PlecoCategory[];
    };
    cards: {
      card: PlecoCard[];
    };
  };
}
37
/** One `category` entry from the export's category list. */
interface PlecoCategory {
  /** Category name. */
  "@name": string;
  /** Text content of the element, or `null` when it has none. */
  "#text": string | null;
}
42
/** A card's assignment to a category (`catassign` element). */
interface Catassign {
  /** Name of the category the card is assigned to. */
  "@category": string;
  /** Text content of the element, or `null` when it has none. */
  "#text": string | null;
}
47
/** One flashcard (`card` element) from a Pleco export. */
interface PlecoCard {
  "@language": string;
  /** The dictionary entry shown on the card. */
  entry: {
    /** Headword text, one element per character set. */
    headword: Array<{
      /** "sc" = simplified, "tc" = traditional. */
      "@charset": "sc" | "tc";
      "#text": string; // chinese str
    }>;
    /** Pronunciation of the headword. */
    pron: {
      "@type": string;
      "@tones": string;
      "#text": string;
    };
    /** Definition text. */
    defn: string;
  };
  /** Reference to the dictionary entry the card came from. */
  dictref: {
    "@dictid": string;
    "@entryid": number;
    // NOTE(review): every other text node in these types is keyed "#text";
    // confirm whether this key should be "#text" as well.
    text: string | null;
  };
  /** A single assignment, or an array when the card is in several categories. */
  catassign: Catassign | Catassign[];
}
69
/**
 * Data for one row of the generated Anki .tsv file.
 * Every field is optional; absent fields are written as empty columns.
 */
interface AnkiCardData {
  /** Pronunciation with tone marks. */
  pinyin?: string;
  /** Definition text. */
  definition?: string;
  /** Headword in simplified characters. */
  simplifiedText?: string;
  /** Headword in traditional characters. */
  traditionalText?: string;
}
76
/**
 * Matches one numbered-pinyin syllable: a known syllable, an optional
 * trailing "r", and a tone digit 1-5. Case-insensitive and global.
 * Longer syllables are listed before their prefixes so that the alternation
 * prefers the longest spelling at a given position.
 */
const pinyinRegex =
  /(shuang|chuang|zhuang|xiang|qiong|shuai|niang|guang|sheng|kuang|shang|jiong|huang|jiang|shuan|xiong|zhang|zheng|zhong|zhuai|zhuan|qiang|chang|liang|chuan|cheng|chong|chuai|hang|peng|chuo|piao|pian|chua|ping|yang|pang|chui|chun|chen|chan|chou|chao|chai|zhun|mang|meng|weng|shai|shei|miao|zhui|mian|yong|ming|wang|zhuo|zhua|shao|yuan|bing|zhen|fang|feng|zhan|zhou|zhao|zhei|zhai|rang|suan|reng|song|seng|dang|deng|dong|xuan|sang|rong|duan|cuan|cong|ceng|cang|diao|ruan|dian|ding|shou|xing|zuan|jiao|zong|zeng|zang|jian|tang|teng|tong|bian|biao|shan|tuan|huan|xian|huai|tiao|tian|hong|xiao|heng|ying|jing|shen|beng|kuan|kuai|nang|neng|nong|juan|kong|nuan|keng|kang|shua|niao|guan|nian|ting|shuo|guai|ning|quan|qiao|shui|gong|geng|gang|qian|bang|lang|leng|long|qing|ling|luan|shun|lian|liao|zhi|lia|liu|qin|lun|lin|luo|lan|lou|qiu|gai|gei|gao|gou|gan|gen|lao|lei|lai|que|gua|guo|nin|gui|niu|nie|gun|qie|qia|jun|kai|kei|kao|kou|kan|ken|qun|nun|nuo|xia|kua|kuo|nen|kui|nan|nou|kun|jue|nao|nei|hai|hei|hao|hou|han|hen|nai|rou|xiu|jin|hua|huo|tie|hui|tun|tui|hun|tuo|tan|jiu|zai|zei|zao|zou|zan|zen|eng|tou|tao|tei|tai|zuo|zui|xin|zun|jie|jia|run|diu|cai|cao|cou|can|cen|die|dia|xue|rui|cuo|cui|dun|cun|cin|ruo|rua|dui|sai|sao|sou|san|sen|duo|den|dan|dou|suo|sui|dao|sun|dei|zha|zhe|dai|xun|ang|ong|wai|fen|fan|fou|fei|zhu|wei|wan|min|miu|mie|wen|men|lie|chi|cha|che|man|mou|mao|mei|mai|yao|you|yan|chu|pin|pie|yin|pen|pan|pou|pao|shi|sha|she|pei|pai|yue|bin|bie|yun|nüe|lve|shu|ben|ban|bao|bei|bai|lüe|nve|ren|ran|rao|xie|re|ri|si|su|se|ru|sa|cu|ce|ca|ji|ci|zi|zu|ze|za|hu|he|ha|ju|ku|ke|qi|ka|gu|ge|ga|li|lu|le|qu|la|ni|xi|nu|ne|na|ti|tu|te|ta|xu|di|du|de|bo|lv|ba|ai|ei|ao|ou|an|en|er|da|wu|wa|wo|fu|fo|fa|nv|mi|mu|yi|ya|ye|me|mo|ma|pi|pu|po|yu|pa|bi|nü|bu|lü|e|o|a)r?[1-5]/gi;

/**
 * Maps a starred vowel (the vowel followed by "*") to its column index in
 * each row of `pinyin`. Note that "a*" maps to index 0.
 */
const vowels: { [vowel: string]: number } = {
  "a*": 0,
  "e*": 1,
  "i*": 2,
  "o*": 3,
  "u*": 4,
  "ü*": 5,
  "A*": 6,
  "E*": 7,
  "I*": 8,
  "O*": 9,
  "U*": 10,
  "Ü*": 11,
};

/**
 * Accented vowels keyed by tone number (1-5); each row is ordered by the
 * indices in `vowels`. Tone 5 holds the unaccented letters.
 */
const pinyin: { [vowel: number]: string[] } = {
  [1]: ["ā", "ē", "ī", "ō", "ū", "ǖ", "Ā", "Ē", "Ī", "Ō", "Ū", "Ǖ"],
  [2]: ["á", "é", "í", "ó", "ú", "ǘ", "Á", "É", "Í", "Ó", "Ú", "Ǘ"],
  [3]: ["ǎ", "ě", "ǐ", "ǒ", "ǔ", "ǚ", "Ǎ", "Ě", "Ǐ", "Ǒ", "Ǔ", "Ǚ"],
  [4]: ["à", "è", "ì", "ò", "ù", "ǜ", "À", "È", "Ì", "Ò", "Ù", "Ǜ"],
  [5]: ["a", "e", "i", "o", "u", "ü", "A", "E", "I", "O", "U", "Ü"],
};

/**
 * Space-separated vowel groups in which "*" follows the vowel that carries
 * the tone mark. Multi-vowel groups come before single vowels; consumers
 * walk the list in this order.
 */
const stars = "a*i a*o e*i ia* ia*o ie* io* iu* " +
  "A*I A*O E*I IA* IA*O IE* IO* IU* " +
  "o*u ua* ua*i ue* ui* uo* üe* " +
  "O*U UA* UA*I UE* UI* UO* ÜE* " +
  "A* E* I* O* U* Ü* " +
  "a* e* i* o* u* ü*";

/** Lazily built cache of plain vowel group -> starred group (see `stars`). */
let accentMap: { [base: string]: string };
111
/**
 * Converts numbered pinyin ("ni3 hao3") into tone-marked pinyin.
 *
 * Each syllable matched by `pinyinRegex` is substituted in place in a single
 * pass. The previous implementation collected the matches first and then
 * replaced the first textual occurrence of each one, which could hit text
 * left over from an earlier substitution (for example "ma55 ma5").
 *
 * @param str Text containing numbered-pinyin syllables.
 * @returns `str` with every matched syllable replaced; other text is untouched.
 */
function convert(str: string): string {
  return str.replace(pinyinRegex, (syllable: string) => getReplacement(syllable));
}
121
/**
 * Turns one numbered-pinyin syllable (e.g. "hao3") into its tone-marked form
 * (e.g. "hǎo").
 *
 * @param match A syllable whose last character is the tone digit.
 * @returns The syllable without the digit and with "v" written as "ü"; the
 *   tone mark is applied when a vowel group from the accent map is found,
 *   otherwise the unaccented syllable is returned.
 */
function getReplacement(match: string): string {
  const accentMap = getAccentMap();
  const tone = Number(match.slice(-1));
  const word = match.slice(0, -1).replace("v", "ü").replace("V", "Ü");

  // Walk the groups in insertion order: multi-vowel groups are listed before
  // single vowels, so "ao" is tried before "a" or "o".
  for (const [base, starred] of Object.entries(accentMap)) {
    if (!word.includes(base)) continue;

    // The starred vowel ("a*") marks which letter of the group takes the tone.
    const vowelChar = starred.match(/.\*/)?.[0];
    if (vowelChar === undefined) continue;

    // Compare against undefined rather than truthiness: "a*" has index 0,
    // and a truthiness check skipped every syllable whose tone vowel is "a".
    const vowelNum = vowels[vowelChar];
    if (vowelNum === undefined) continue;

    const accentedVowelChar = pinyin[tone]?.[vowelNum];
    if (accentedVowelChar === undefined) continue;

    return word
      .replace(base, starred)
      .replace(vowelChar, accentedVowelChar);
  }
  return word;
}
141
/**
 * Returns the lookup from a plain vowel group ("iao") to its starred form
 * ("ia*o"), building it from `stars` on first use and caching it in
 * `accentMap`.
 *
 * Keys are inserted in the order the groups appear in `stars`; callers that
 * iterate the map rely on that order.
 *
 * @returns The cached map; the same object is returned on every call.
 */
function getAccentMap(): { [base: string]: string } {
  if (accentMap) return accentMap;

  // Build into a local first so the cache is only ever a complete map.
  const built: { [base: string]: string } = {};
  for (const starred of stars.split(" ")) {
    built[starred.replace(/\*/g, "")] = starred;
  }
  accentMap = built;
  return built;
}
154
155
// ---------------------------------------------------------------------------
// Script entry point: read the Pleco export named by the first CLI argument,
// group its cards by category, and write one .tsv file per category into the
// output directory (second CLI argument, default "./").
// ---------------------------------------------------------------------------

/**
 * Normalizes a value that is a single item when it occurs once and an array
 * when it repeats (the original code already treated `catassign` this way).
 * `null`/`undefined` becomes an empty array.
 */
function asArray<T>(value: T | T[] | null | undefined): T[] {
  if (value == null) return [];
  return Array.isArray(value) ? value : [value];
}

const [inFile, outDir = "./"] = Deno.args;

if (!inFile) {
  // Fail with a usage hint instead of an opaque error from readTextFile.
  console.error(
    "usage: deno run --allow-read --allow-write main.ts <infile> [outdir]",
  );
  Deno.exit(1);
}

const text = await Deno.readTextFile(inFile);

// deno-lint-ignore no-explicit-any
const xml: PlecoResponse = parse(text) as any;
// A file with exactly one card yields a bare object rather than an array.
const cards = asArray<PlecoCard>(xml.plecoflash?.cards?.card);

const categoryExports: {
  [categoryName: string]: AnkiCardData[];
} = {};

// Cards that belong to no category have no output file to go to.
let uncategorized = 0;

for (const card of cards) {
  const { headword, pron, defn: definition } = card.entry;
  // A card with only one character set yields a single headword object.
  const headwords = asArray(headword);
  const reading = pron?.["#text"];
  const data: AnkiCardData = {
    pinyin: reading == null ? undefined : convert(reading),
    definition,
    simplifiedText: headwords.find((h) => h["@charset"] === "sc")?.["#text"],
    traditionalText: headwords.find((h) => h["@charset"] === "tc")?.["#text"],
  };

  const categories = asArray(card.catassign)
    .map((catassign) => catassign["@category"]);
  if (categories.length === 0) {
    uncategorized += 1;
    continue;
  }
  // A card assigned to several categories is written to each of them.
  for (const category of categories) {
    (categoryExports[category] ??= []).push(data);
  }
}

if (uncategorized > 0) {
  console.warn(`skipped ${uncategorized} card(s) without a category`);
}

const respPromises = Object.entries(categoryExports)
  .map(async ([categoryName, categoryCards]) => {
    const tsvText = cardsToTSV(categoryCards);
    // Category names may contain "/" (nested categories); create the parent
    // directories for everything before the last segment.
    await ensureDir(
      join(outDir, categoryName.split("/").slice(0, -1).join("/")),
    );
    return Deno.writeTextFile(join(outDir, `${categoryName}.tsv`), tsvText);
  });

await Promise.all(respPromises);
console.log("complete");
196
/**
 * Makes a value safe to place in a single .tsv cell: tabs become spaces and
 * line breaks become ` <br/> `.
 *
 * Handles "\r\n" and lone "\r" as well as "\n"; previously a carriage return
 * was left behind in the cell.
 *
 * @param str Raw field text.
 * @returns The text on one line with no tab characters.
 */
function format(str: string): string {
  return str.replace(/\t/g, " ").replace(/\r\n|\r|\n/g, " <br/> ");
}
200
/**
 * Serializes cards as tab-separated text.
 *
 * The first line is the column header; each card follows as one line with
 * the columns traditionalText, simplifiedText, pinyin, definition. Missing
 * fields are written as empty columns and every value is passed through
 * `format`.
 *
 * @param cards Cards to serialize.
 * @returns The header line followed by one newline-terminated line per card.
 */
function cardsToTSV(cards: AnkiCardData[]): string {
  const header = "traditionalText\tsimplifiedText\tpinyin\tdefinition\n";
  const rows = cards.map((card: AnkiCardData) => {
    const { definition, pinyin, simplifiedText, traditionalText } = card;
    const columns = [traditionalText, simplifiedText, pinyin, definition]
      .map((value) => format(value || ""));
    return `${columns.join("\t")}\n`;
  });
  return header + rows.join("");
}