this string has no description
0
parse_cedict.ts
63 lines 2.0 kB view raw
/**
 * @description
 * This is a simplified version of cedict2json.js by Kevin Yang, modified to be
 * run with deno, only using std dependencies.
 *
 * Reads `cedict_ts.u8` from the current working directory line by line and
 * writes every successfully parsed entry to `cedict.json` as a tab-indented
 * JSON array.
 *
 * @reference https://github.com/kevb34ns/CEDICT2JSON/blob/master/cedict2json.js
 * @reference https://www.mdbg.net/chinese/dictionary?page=cc-cedict
 *
 * CC-CEDICT is licensed under the Creative Commons Attribution-Share
 * Alike 3.0 License (https://creativecommons.org/licenses/by-sa/3.0/).
 * You must give proper attribution and license any changes or
 * improvements to the data under the same license.
 */
import { TextLineStream } from "jsr:@std/streams/text-line-stream";
import { toTransformStream } from "jsr:@std/streams/to-transform-stream";

/** One dictionary entry as it is written to the output JSON. */
interface Entry {
  /** First space-delimited field of the line. */
  traditional: string;
  /** Second space-delimited field of the line. */
  simplified: string;
  /** Text between the first pair of square brackets. */
  pinyin: string;
  /** The slash-delimited definitions, without the slashes. */
  definitions: string[];
}

// bytes -> text -> lines -> parsed entries
const entryStream = (await Deno.open("cedict_ts.u8")).readable
  .pipeThrough(new TextDecoderStream())
  .pipeThrough(new TextLineStream())
  .pipeThrough(toTransformStream(async function* (src) {
    for await (const line of src) {
      const trimmed = line.trim();
      // Blank lines and "#" comment lines carry no entry; skip them without
      // reporting them as invalid.
      if (trimmed === "" || trimmed.charAt(0) === "#") continue;

      const entry = parseEntry(line);
      if (entry !== null) yield entry;
    }
  }));

const entryArray: Entry[] = await Array.fromAsync(entryStream);

await Deno.writeTextFile("cedict.json", JSON.stringify(entryArray, null, "\t"));

/**
 * Parses a single dictionary line of the form
 * `traditional simplified [pinyin] /definition 1/definition 2/.../`.
 *
 * @param entry One line of the dictionary file (not a comment line).
 * @returns The parsed entry, or `null` when the line lacks any of the four
 * fields; in that case the offending line is logged to the console.
 */
function parseEntry(entry: string): Entry | null {
  const firstSpace = entry.indexOf(" ");
  const secondSpace = entry.indexOf(" ", firstSpace + 1);
  const bracketsMatch = entry.match(/\[(.*?)\]/);

  // Take everything between the first and the last slash and split it.
  // Matching with a global /\/(.*?)\//g pattern is wrong here: each match
  // consumes the closing slash, which is also the opening slash of the next
  // definition, so every second definition would be skipped.
  const firstSlash = entry.indexOf("/");
  const lastSlash = entry.lastIndexOf("/");
  const definitions = lastSlash > firstSlash
    ? entry
      .substring(firstSlash + 1, lastSlash)
      .split("/")
      .filter((def) => def !== "")
    : [];

  if (
    firstSpace <= 0 ||
    secondSpace <= 0 ||
    bracketsMatch === null ||
    definitions.length === 0
  ) {
    console.log("Invalid entry: " + entry);
    return null;
  }

  return {
    traditional: entry.substring(0, firstSpace),
    simplified: entry.substring(firstSpace + 1, secondSpace),
    pinyin: bracketsMatch[1],
    definitions,
  };
}