this string has no description
0
parse_cedict.ts
1/**
2 * @description
3 * This is a simplified version of cedict2json.js by Kevin Yang, modified to be
4 * run with deno, only using std dependencies.
5 *
6 * @reference https://github.com/kevb34ns/CEDICT2JSON/blob/master/cedict2json.js
7 * @reference https://www.mdbg.net/chinese/dictionary?page=cc-cedict
8 *
9 * CC-CEDICT is licensed under the Creative Commons Attribution-Share
10 * Alike 3.0 License (https://creativecommons.org/licenses/by-sa/3.0/).
11 * You must give proper attribution and license any changes or
12 * improvements to the data under the same license.
13 */
14import { TextLineStream } from "jsr:@std/streams/text-line-stream";
15import { toTransformStream } from "jsr:@std/streams/to-transform-stream";
16
/**
 * One parsed dictionary record, as collected by this script and
 * serialized into the JSON output.
 */
interface Entry {
  /** Headword in traditional characters (first space-separated field). */
  traditional: string;
  /** Headword in simplified characters (second space-separated field). */
  simplified: string;
  /** Text found inside the first pair of square brackets on the line. */
  pinyin: string;
  /** Slash-delimited definitions of the line, with the slashes removed. */
  definitions: Array<string>;
}
23
// Open the raw dictionary file and decode it into individual text lines.
const dictionaryFile = await Deno.open("cedict_ts.u8");
const lineStream = dictionaryFile.readable
  .pipeThrough(new TextDecoderStream())
  .pipeThrough(new TextLineStream());

// Skip lines whose first non-whitespace character is "#", and drop every
// line that parseEntry rejects (it returns null for those).
const utf8Stream = lineStream.pipeThrough(
  toTransformStream(async function* (src) {
    for await (const line of src) {
      if (line.trim().startsWith("#")) continue;
      const entry = parseEntry(line);
      if (entry === null) continue;
      yield entry;
    }
  }),
);

// Collect the whole stream in memory so it can be serialized in one piece.
const entryArray: Entry[] = await Array.fromAsync(utf8Stream);

// Write the result as tab-indented JSON.
const serialized = JSON.stringify(entryArray, null, "\t");
await Deno.writeTextFile("cedict.json", serialized);
39
/**
 * Parses one dictionary line of the form
 * `TRADITIONAL SIMPLIFIED [pin1 yin1] /definition 1/definition 2/`.
 *
 * A line is considered invalid when it lacks the two leading space-separated
 * fields, a `[...]` group, or at least one non-empty definition enclosed in
 * slashes. Invalid lines are logged to the console and skipped.
 *
 * @param entry A single line of the dictionary file.
 * @returns The parsed entry, or `null` when the line is invalid.
 */
function parseEntry(entry: string): Entry | null {
  const firstSpace = entry.indexOf(" ");
  const secondSpace = entry.indexOf(" ", firstSpace + 1);
  const bracketsMatch = entry.match(/\[(.*?)\]/);

  // The definition section runs from the first "/" after the pinyin bracket
  // to the last "/" of the line. It is split on "/" rather than matched with
  // a global /\/(.*?)\//g regex: adjacent definitions share one delimiter, so
  // non-overlapping regex matches would silently drop every second one.
  const searchFrom = bracketsMatch === null
    ? 0
    : (bracketsMatch.index ?? 0) + bracketsMatch[0].length;
  const openingSlash = entry.indexOf("/", searchFrom);
  const closingSlash = entry.lastIndexOf("/");
  const definitions = openingSlash !== -1 && closingSlash > openingSlash
    ? entry
      .slice(openingSlash + 1, closingSlash)
      .split("/")
      // Empty segments (e.g. from "//") carry no definition text.
      .filter((definition) => definition.length > 0)
    : [];

  if (
    firstSpace <= 0 ||
    secondSpace <= 0 ||
    bracketsMatch === null ||
    definitions.length === 0
  ) {
    console.log("Invalid entry: " + entry);
    return null;
  }

  return {
    traditional: entry.substring(0, firstSpace),
    simplified: entry.substring(firstSpace + 1, secondSpace),
    pinyin: bracketsMatch[1] ?? "",
    definitions,
  };
}