/**
 * @description
 * This is a simplified version of cedict2json.js by Kevin Yang, modified to be
 * run with deno, only using std dependencies.
 *
 * @reference https://github.com/kevb34ns/CEDICT2JSON/blob/master/cedict2json.js
 * @reference https://www.mdbg.net/chinese/dictionary?page=cc-cedict
 *
 * CC-CEDICT is licensed under the Creative Commons Attribution-Share
 * Alike 3.0 License (https://creativecommons.org/licenses/by-sa/3.0/).
 * You must give proper attribution and license any changes or
 * improvements to the data under the same license.
 */

import { TextLineStream } from "jsr:@std/streams/text-line-stream";
import { toTransformStream } from "jsr:@std/streams/to-transform-stream";

/** One parsed dictionary line, as written to `cedict.json`. */
interface Entry {
  /** First space-delimited token of the line. */
  traditional: string;
  /** Second space-delimited token of the line. */
  simplified: string;
  /** Contents of the first `[...]` group on the line. */
  pinyin: string;
  /** The `/`-delimited segments of the line, without the slashes. */
  definitions: string[];
}

// Read "cedict_ts.u8" line by line and turn every parseable line into an Entry.
const entryStream = (await Deno.open("cedict_ts.u8")).readable
  .pipeThrough(new TextDecoderStream())
  .pipeThrough(new TextLineStream())
  .pipeThrough(toTransformStream(async function* (src) {
    for await (const line of src) {
      const trimmed = line.trim();
      // Skip blank lines and "#" comment lines without reporting them.
      if (trimmed === "" || trimmed.charAt(0) === "#") continue;

      const entry = parseEntry(line);
      if (entry !== null) yield entry;
    }
  }));

// Collect everything in memory, then write tab-indented JSON.
const entryArray: Entry[] = await Array.fromAsync(entryStream);

await Deno.writeTextFile("cedict.json", JSON.stringify(entryArray, null, "\t"));

/**
 * Parses a single dictionary line of the shape
 * `traditional simplified [pinyin] /definition 1/definition 2/`.
 *
 * @param entry The raw line to parse.
 * @returns The parsed entry, or `null` (after logging the line) when the line
 * lacks two leading space-delimited tokens, a `[...]` group, or a pair of
 * slashes enclosing the definitions.
 */
function parseEntry(entry: string): Entry | null {
  const firstSpace = entry.indexOf(" ");
  const secondSpace = entry.indexOf(" ", firstSpace + 1);
  const bracketsMatch = entry.match(/\[(.*?)\]/);

  // The definitions live between the first and the last slash. Splitting that
  // span keeps every definition; a global /\/(.*?)\//g match would consume each
  // closing slash and therefore skip every second definition.
  const firstSlash = entry.indexOf("/");
  const lastSlash = entry.lastIndexOf("/");

  if (
    firstSpace <= 0 ||
    secondSpace <= 0 ||
    bracketsMatch === null ||
    firstSlash === -1 ||
    lastSlash === firstSlash
  ) {
    console.log("Invalid entry: " + entry);
    return null;
  }

  return {
    traditional: entry.substring(0, firstSpace),
    simplified: entry.substring(firstSpace + 1, secondSpace),
    pinyin: bracketsMatch[1],
    definitions: entry.substring(firstSlash + 1, lastSlash).split("/"),
  };
}