MIRROR: javascript for 🐜's, a tiny runtime with big ambitions
1#include <stdlib.h>
2#include <string.h>
3#include <stdint.h>
4
5#include "ant.h"
6#include "errors.h"
7#include "runtime.h"
8#include "internal.h"
9#include "descriptors.h"
10#include "utf8.h"
11
12#include "modules/textcodec.h"
13#include "modules/buffer.h"
14#include "modules/symbol.h"
15
/* Prototype object for TextEncoder instances; assigned in init_textcodec_module(). */
static ant_value_t g_textencoder_proto = 0;
/* Prototype object for TextDecoder instances; assigned in init_textcodec_module(). */
static ant_value_t g_textdecoder_proto = 0;
18
19td_state_t *td_state_new(td_encoding_t enc, bool fatal, bool ignore_bom) {
20 td_state_t *st = calloc(1, sizeof(td_state_t));
21 if (!st) return NULL;
22 st->encoding = enc;
23 st->fatal = fatal;
24 st->ignore_bom = ignore_bom;
25 return st;
26}
27
28static td_state_t *td_get_state(ant_value_t obj) {
29 ant_value_t s = js_get_slot(obj, SLOT_DATA);
30 if (vtype(s) != T_NUM) return NULL;
31 return (td_state_t *)(uintptr_t)(size_t)js_getnum(s);
32}
33
34static void td_finalize(ant_t *js, ant_object_t *obj) {
35 if (!obj->extra_slots) return;
36 ant_extra_slot_t *entries = (ant_extra_slot_t *)obj->extra_slots;
37
38 for (uint8_t i = 0; i < obj->extra_count; i++) {
39 if (entries[i].slot == SLOT_DATA && vtype(entries[i].value) == T_NUM) {
40 free((td_state_t *)(uintptr_t)(size_t)js_getnum(entries[i].value));
41 return;
42 }}
43}
44
45static int resolve_encoding(const char *s, size_t len) {
46 static const struct { const char *label; uint8_t len; td_encoding_t enc; } map[] = {
47 {"unicode-1-1-utf-8", 17, TD_ENC_UTF8}, {"unicode11utf8", 13, TD_ENC_UTF8},
48 {"unicode20utf8", 13, TD_ENC_UTF8}, {"utf-8", 5, TD_ENC_UTF8},
49 {"utf8", 4, TD_ENC_UTF8}, {"x-unicode20utf8",17, TD_ENC_UTF8},
50 {"windows-1252", 12, TD_ENC_WINDOWS_1252}, {"ascii", 5, TD_ENC_WINDOWS_1252},
51 {"unicodefffe", 11, TD_ENC_UTF16BE}, {"utf-16be", 8, TD_ENC_UTF16BE},
52 {"csunicode", 9, TD_ENC_UTF16LE}, {"iso-10646-ucs-2",16, TD_ENC_UTF16LE},
53 {"ucs-2", 5, TD_ENC_UTF16LE}, {"unicode", 7, TD_ENC_UTF16LE},
54 {"unicodefeff", 11, TD_ENC_UTF16LE}, {"utf-16", 6, TD_ENC_UTF16LE},
55 {"utf-16le", 8, TD_ENC_UTF16LE},
56 {"iso-8859-2", 10, TD_ENC_ISO_8859_2},
57 {NULL, 0, 0}
58 };
59 for (int i = 0; map[i].label; i++) {
60 if (len == map[i].len && strncasecmp(s, map[i].label, len) == 0) return (int)map[i].enc;
61 }
62 return -1;
63}
64
65static const char *encoding_name(td_encoding_t enc) {
66switch (enc) {
67 case TD_ENC_UTF16LE: return "utf-16le";
68 case TD_ENC_UTF16BE: return "utf-16be";
69 case TD_ENC_WINDOWS_1252: return "windows-1252";
70 case TD_ENC_ISO_8859_2: return "iso-8859-2";
71 default: return "utf-8";
72}}
73
/* True for the five ASCII whitespace bytes: TAB, LF, FF, CR and SPACE. */
static inline bool td_is_ascii_whitespace(unsigned char c) {
  return c == 0x09 || c == 0x0A || c == 0x0C || c == 0x0D || c == 0x20;
}

/*
 * Strips leading and trailing ASCII whitespace from the label s[0..len).
 *
 * Only TAB, LF, FF, CR and SPACE are removed. Other control bytes (NUL,
 * VT, ...) are kept so that labels containing them are still rejected by
 * the lookup that follows.
 *
 * Stores the trimmed length in *out_len and returns a pointer to the first
 * kept byte. The result points into s; nothing is copied.
 */
static const char *trim_label(const char *s, size_t len, size_t *out_len) {
  while (len > 0 && td_is_ascii_whitespace((unsigned char)s[0])) { s++; len--; }
  while (len > 0 && td_is_ascii_whitespace((unsigned char)s[len - 1])) { len--; }
  *out_len = len;
  return s;
}
80
81static ant_value_t js_textencoder_get_encoding(ant_t *js, ant_value_t *args, int nargs) {
82 return js_mkstr(js, "utf-8", 5);
83}
84
85ant_value_t te_encode(ant_t *js, const char *str, size_t str_len) {
86 ArrayBufferData *ab = create_array_buffer_data(str_len);
87 if (!ab) return js_mkerr(js, "out of memory");
88
89 if (str_len > 0) {
90 const uint8_t *s = (const uint8_t *)str;
91 uint8_t *d = ab->data; size_t i = 0;
92
93 while (i < str_len) {
94 if (s[i] == 0xED && i + 2 < str_len && s[i+1] >= 0xA0 && s[i+1] <= 0xBF) {
95 d[i] = 0xEF; d[i+1] = 0xBF; d[i+2] = 0xBD;
96 i += 3;
97 } else { d[i] = s[i]; i++; }}
98 }
99
100 return create_typed_array(js, TYPED_ARRAY_UINT8, ab, 0, str_len, "Uint8Array");
101}
102
103static ant_value_t js_textencoder_encode(ant_t *js, ant_value_t *args, int nargs) {
104 size_t str_len = 0;
105 const char *str = "";
106
107 if (nargs > 0 && vtype(args[0]) == T_STR) {
108 str = js_getstr(js, args[0], &str_len);
109 if (!str) { str = ""; str_len = 0; }
110 } else if (nargs > 0 && vtype(args[0]) != T_UNDEF) {
111 ant_value_t sv = js_tostring_val(js, args[0]);
112 if (is_err(sv)) return sv;
113 str = js_getstr(js, sv, &str_len);
114 if (!str) { str = ""; str_len = 0; }
115 }
116
117 return te_encode(js, str, str_len);
118}
119
120static ant_value_t js_textencoder_encode_into(ant_t *js, ant_value_t *args, int nargs) {
121 if (nargs < 2) return js_mkerr_typed(js, JS_ERR_TYPE, "encodeInto requires 2 arguments");
122
123 size_t str_len = 0;
124 const char *str = "";
125 if (vtype(args[0]) == T_STR) {
126 str = js_getstr(js, args[0], &str_len);
127 if (!str) { str = ""; str_len = 0; }
128 } else if (vtype(args[0]) != T_UNDEF) {
129 ant_value_t sv = js_tostring_val(js, args[0]);
130 if (is_err(sv)) return sv;
131 str = js_getstr(js, sv, &str_len);
132 if (!str) { str = ""; str_len = 0; }
133 }
134
135 TypedArrayData *ta = buffer_get_typedarray_data(args[1]);
136 if (!ta) return js_mkerr_typed(js, JS_ERR_TYPE, "Second argument must be a Uint8Array");
137
138 uint8_t *dest = (ta->buffer && !ta->buffer->is_detached)
139 ? ta->buffer->data + ta->byte_offset : NULL;
140 size_t available = ta->byte_length;
141
142 const utf8proc_uint8_t *src = (const utf8proc_uint8_t *)str;
143 utf8proc_ssize_t src_len = (utf8proc_ssize_t)str_len;
144 utf8proc_ssize_t pos = 0;
145
146 size_t written = 0;
147 size_t read_units = 0;
148
149 while (pos < src_len) {
150 utf8proc_int32_t cp;
151 utf8proc_ssize_t n = utf8_next(src + pos, src_len - pos, &cp);
152 utf8proc_uint8_t tmp[4];
153 utf8proc_ssize_t enc_len;
154
155 if (cp >= 0xD800 && cp <= 0xDFFF) {
156 tmp[0] = 0xEF; tmp[1] = 0xBF; tmp[2] = 0xBD;
157 enc_len = 3;
158 } else {
159 enc_len = (cp >= 0) ? utf8proc_encode_char(cp, tmp) : 0;
160 if (enc_len <= 0) { tmp[0] = 0xEF; tmp[1] = 0xBF; tmp[2] = 0xBD; enc_len = 3; }
161 }
162
163 if (written + (size_t)enc_len > available) break;
164 if (dest) memcpy(dest + written, tmp, (size_t)enc_len);
165
166 written += (size_t)enc_len;
167 pos += n;
168 read_units += (cp >= 0x10000 && cp <= 0x10FFFF) ? 2 : 1;
169 }
170
171 ant_value_t result = js_mkobj(js);
172 js_set(js, result, "read", js_mknum((double)read_units));
173 js_set(js, result, "written", js_mknum((double)written));
174
175 return result;
176}
177
178static ant_value_t js_textencoder_ctor(ant_t *js, ant_value_t *args, int nargs) {
179 if (vtype(js->new_target) == T_UNDEF)
180 return js_mkerr_typed(js, JS_ERR_TYPE, "TextEncoder constructor requires 'new'");
181 ant_value_t obj = js_mkobj(js);
182 ant_value_t proto = js_instance_proto_from_new_target(js, g_textencoder_proto);
183 if (is_object_type(proto)) js_set_proto_init(obj, proto);
184 return obj;
185}
186
187static ant_value_t js_textdecoder_get_encoding(ant_t *js, ant_value_t *args, int nargs) {
188 td_state_t *st = td_get_state(js->this_val);
189 const char *name = encoding_name(st ? st->encoding : TD_ENC_UTF8);
190 return js_mkstr(js, name, strlen(name));
191}
192
193static ant_value_t js_textdecoder_get_fatal(ant_t *js, ant_value_t *args, int nargs) {
194 td_state_t *st = td_get_state(js->this_val);
195 return (st && st->fatal) ? js_true : js_false;
196}
197
198static ant_value_t js_textdecoder_get_ignore_bom(ant_t *js, ant_value_t *args, int nargs) {
199 td_state_t *st = td_get_state(js->this_val);
200 return (st && st->ignore_bom) ? js_true : js_false;
201}
202
/*
 * Reads one 16-bit code unit from the two bytes at p.
 * be selects big-endian order (p[0] is the high byte); otherwise the bytes
 * are taken as little-endian.
 */
static inline uint16_t u16_read(const uint8_t *p, bool be) {
  const uint8_t high = be ? p[0] : p[1];
  const uint8_t low = be ? p[1] : p[0];
  return (uint16_t)(((unsigned)high << 8) | low);
}
208
209static inline size_t u8_emit(char *out, size_t o, utf8proc_int32_t cp) {
210 utf8proc_ssize_t n = utf8proc_encode_char(cp, (utf8proc_uint8_t *)(out + o));
211 return n > 0 ? o + (size_t)n : o;
212}
213
/*
 * Writes the three bytes EF BF BD (U+FFFD in UTF-8) to out at offset o and
 * returns the offset just past them.
 */
static inline size_t u8_fffd(char *out, size_t o) {
  static const unsigned char replacement[3] = { 0xEF, 0xBF, 0xBD };

  memcpy(out + o, replacement, sizeof replacement);
  return o + sizeof replacement;
}
218
// UTF-16 surrogate helpers. Arguments must be 16-bit code units: the top
// six bits select the high (0xD800..0xDBFF) or low (0xDC00..0xDFFF) range.
#define U16_IS_HIGH(cu) (((cu) & 0xFC00) == 0xD800)
#define U16_IS_LOW(cu) (((cu) & 0xFC00) == 0xDC00)
// Combines a high/low surrogate pair into its code point (U+10000 and up).
#define U16_PAIR(hi,lo) ((((uint32_t)(hi) - 0xD800u) << 10) + ((uint32_t)(lo) - 0xDC00u) + 0x10000u)
222
/*
 * Maps one windows-1252 byte to its code point. Bytes 0x80..0x9F go through
 * the table below; every other byte maps to the code point of the same value.
 */
static uint32_t decode_windows_1252_byte(uint8_t byte) {
  static const uint16_t c1_map[32] = {
    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, /* 0x80 */
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, /* 0x88 */
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, /* 0x90 */
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, /* 0x98 */
  };
  const int in_table = (byte >= 0x80 && byte <= 0x9F);

  return in_table ? (uint32_t)c1_map[byte - 0x80] : (uint32_t)byte;
}
234
/*
 * Maps one ISO-8859-2 byte to its code point. Bytes below 0xA0 map to the
 * code point of the same value; 0xA0..0xFF go through the table below.
 */
static uint32_t decode_iso_8859_2_byte(uint8_t byte) {
  static const uint16_t high_half[96] = {
    0x00A0, 0x0104, 0x02D8, 0x0141, 0x00A4, 0x013D, 0x015A, 0x00A7, /* 0xA0 */
    0x00A8, 0x0160, 0x015E, 0x0164, 0x0179, 0x00AD, 0x017D, 0x017B, /* 0xA8 */
    0x00B0, 0x0105, 0x02DB, 0x0142, 0x00B4, 0x013E, 0x015B, 0x02C7, /* 0xB0 */
    0x00B8, 0x0161, 0x015F, 0x0165, 0x017A, 0x02DD, 0x017E, 0x017C, /* 0xB8 */
    0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7, /* 0xC0 */
    0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E, /* 0xC8 */
    0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7, /* 0xD0 */
    0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162, 0x00DF, /* 0xD8 */
    0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7, /* 0xE0 */
    0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F, /* 0xE8 */
    0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7, /* 0xF0 */
    0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9, /* 0xF8 */
  };

  return (byte >= 0xA0) ? (uint32_t)high_half[byte - 0xA0] : (uint32_t)byte;
}
253
254static utf8proc_ssize_t decode_single_byte(td_state_t *st, const uint8_t *src, size_t len, char *out) {
255 size_t o = 0;
256 for (size_t i = 0; i < len; i++) {
257 uint32_t cp = (st->encoding == TD_ENC_WINDOWS_1252)
258 ? decode_windows_1252_byte(src[i])
259 : decode_iso_8859_2_byte(src[i]);
260 o = u8_emit(out, o, (utf8proc_int32_t)cp);
261 }
262 return (utf8proc_ssize_t)o;
263}
264
265static utf8proc_ssize_t utf16_decode(td_state_t *st, const uint8_t *src, size_t len, char *out, bool stream) {
266 bool be = (st->encoding == TD_ENC_UTF16BE);
267 size_t i = 0, o = 0;
268 size_t avail;
269
270 if (!st->bom_seen) {
271 if (len < 2) goto pend_tail;
272 if (u16_read(src, be) == 0xFEFF && !st->ignore_bom) i = 2;
273 st->bom_seen = true;
274 }
275
276 while (i < len) {
277 avail = len - i;
278
279 if (avail < 2) goto pend_tail;
280 uint16_t cu = u16_read(src + i, be);
281 i += 2;
282
283 if (!U16_IS_HIGH(cu) && !U16_IS_LOW(cu)) {
284 o = u8_emit(out, o, (utf8proc_int32_t)cu);
285 continue;
286 }
287
288 if (U16_IS_LOW(cu)) goto err;
289
290 avail = len - i;
291 if (avail < 2) goto pend_hi;
292
293 uint16_t lo = u16_read(src + i, be);
294 if (U16_IS_LOW(lo)) { i += 2; o = u8_emit(out, o, (utf8proc_int32_t)U16_PAIR(cu, lo)); continue; }
295
296 goto err;
297
298 pend_tail:
299 if (stream) { st->pending[0] = src[i]; st->pending_len = 1; }
300 else { if (st->fatal) return -1; o = u8_fffd(out, o); }
301 break;
302
303 pend_hi:
304 if (stream) { st->pending_len = (int)(len - (i - 2)); memcpy(st->pending, src + i - 2, (size_t)st->pending_len); }
305 else { if (st->fatal) return -1; o = u8_fffd(out, o); if (avail == 1) o = u8_fffd(out, o); }
306 break;
307
308 err:
309 if (st->fatal) return -1;
310 o = u8_fffd(out, o);
311 continue;
312 }
313
314 return (utf8proc_ssize_t)o;
315}
316
317#undef U16_IS_HIGH
318#undef U16_IS_LOW
319#undef U16_PAIR
320
321ant_value_t td_decode(ant_t *js, td_state_t *st, const uint8_t *input, size_t input_len, bool stream_mode) {
322 size_t total = (size_t)st->pending_len + input_len;
323 if (total == 0) {
324 if (!stream_mode) st->bom_seen = false;
325 return js_mkstr(js, "", 0);
326 }
327
328 uint8_t *work = NULL;
329 const uint8_t *src;
330 if (st->pending_len > 0) {
331 work = malloc(total);
332 if (!work) return js_mkerr(js, "out of memory");
333 memcpy(work, st->pending, (size_t)st->pending_len);
334 if (input && input_len > 0) memcpy(work + st->pending_len, input, input_len);
335 src = work;
336 } else src = input;
337 st->pending_len = 0;
338
339 char *out = malloc(total * 3 + 1);
340 if (!out) { free(work); return js_mkerr(js, "out of memory"); }
341
342 utf8proc_ssize_t n;
343 if (st->encoding == TD_ENC_UTF16LE || st->encoding == TD_ENC_UTF16BE) {
344 n = utf16_decode(st, src, total, out, stream_mode);
345 } else if (st->encoding == TD_ENC_WINDOWS_1252 || st->encoding == TD_ENC_ISO_8859_2) {
346 n = decode_single_byte(st, src, total, out);
347 st->pending_len = 0;
348 st->bom_seen = false;
349 } else {
350 utf8_dec_t dec = { .ignore_bom = st->ignore_bom, .bom_seen = st->bom_seen };
351 n = utf8_whatwg_decode(&dec, src, total, out, st->fatal, stream_mode);
352 st->pending_len = dec.pend_pos;
353 memcpy(st->pending, dec.pend_buf, (size_t)dec.pend_pos);
354 st->bom_seen = stream_mode ? dec.bom_seen : false;
355 }
356
357 if (n < 0) {
358 free(work); free(out);
359 return js_mkerr_typed(js, JS_ERR_TYPE, "The encoded data was not valid.");
360 }
361
362 if (st->encoding != TD_ENC_UTF8) {
363 if (!stream_mode) st->bom_seen = false;
364 }
365
366 ant_value_t result = js_mkstr(js, out, (size_t)n);
367 free(work);
368 free(out);
369
370 return result;
371}
372
373static ant_value_t js_textdecoder_decode(ant_t *js, ant_value_t *args, int nargs) {
374 td_state_t *st = td_get_state(js->this_val);
375 if (!st) return js_mkerr_typed(js, JS_ERR_TYPE, "Invalid TextDecoder");
376
377 bool stream_mode = false;
378 if (nargs > 1 && is_object_type(args[1])) {
379 ant_value_t sv = js_get(js, args[1], "stream");
380 stream_mode = js_truthy(js, sv);
381 }
382
383 const uint8_t *input = NULL;
384 size_t input_len = 0;
385 if (nargs > 0 && is_object_type(args[0]))
386 buffer_source_get_bytes(js, args[0], &input, &input_len);
387
388 return td_decode(js, st, input, input_len, stream_mode);
389}
390
391static ant_value_t js_textdecoder_ctor(ant_t *js, ant_value_t *args, int nargs) {
392 if (vtype(js->new_target) == T_UNDEF)
393 return js_mkerr_typed(js, JS_ERR_TYPE, "TextDecoder constructor requires 'new'");
394
395 td_encoding_t enc = TD_ENC_UTF8;
396 if (nargs > 0 && !is_undefined(args[0])) {
397 ant_value_t label = (vtype(args[0]) == T_STR) ? args[0] : coerce_to_str(js, args[0]);
398 if (is_err(label)) return label;
399
400 size_t llen;
401 const char *raw = js_getstr(js, label, &llen);
402 if (raw) {
403 size_t tlen;
404 const char *trimmed = trim_label(raw, llen, &tlen);
405 int resolved = resolve_encoding(trimmed, tlen);
406
407 if (resolved < 0) return js_mkerr_typed(
408 js, JS_ERR_RANGE, "Failed to construct 'TextDecoder': The encoding label provided ('%.*s') is invalid.",
409 (int)tlen, trimmed
410 );
411
412 enc = (td_encoding_t)resolved;
413 }}
414
415 bool fatal = false;
416 bool ignore_bom = false;
417
418 if (nargs > 1 && is_object_type(args[1])) {
419 ant_value_t fv = js_getprop_fallback(js, args[1], "fatal");
420 if (is_err(fv)) return fv;
421 if (vtype(fv) != T_UNDEF) fatal = js_truthy(js, fv);
422 ant_value_t bv = js_getprop_fallback(js, args[1], "ignoreBOM");
423 if (is_err(bv)) return bv;
424 if (vtype(bv) != T_UNDEF) ignore_bom = js_truthy(js, bv);
425 }
426
427 td_state_t *st = td_state_new(enc, fatal, ignore_bom);
428 if (!st) return js_mkerr(js, "out of memory");
429
430 ant_value_t obj = js_mkobj(js);
431 ant_value_t proto = js_instance_proto_from_new_target(js, g_textdecoder_proto);
432
433 if (is_object_type(proto)) js_set_proto_init(obj, proto);
434 js_set_slot(obj, SLOT_DATA, ANT_PTR(st));
435 js_set_finalizer(obj, td_finalize);
436
437 return obj;
438}
439
440void init_textcodec_module(void) {
441 ant_t *js = rt->js;
442 ant_value_t g = js_glob(js);
443
444 g_textencoder_proto = js_mkobj(js);
445 js_set_getter_desc(js, g_textencoder_proto, "encoding", 8, js_mkfun(js_textencoder_get_encoding), JS_DESC_C);
446 js_set(js, g_textencoder_proto, "encode", js_mkfun(js_textencoder_encode));
447 js_set(js, g_textencoder_proto, "encodeInto", js_mkfun(js_textencoder_encode_into));
448 js_set_sym(js, g_textencoder_proto, get_toStringTag_sym(), js_mkstr(js, "TextEncoder", 11));
449
450 ant_value_t te_ctor = js_make_ctor(js, js_textencoder_ctor, g_textencoder_proto, "TextEncoder", 11);
451 js_set(js, g, "TextEncoder", te_ctor);
452 js_set_descriptor(js, g, "TextEncoder", 11, JS_DESC_W | JS_DESC_C);
453
454 g_textdecoder_proto = js_mkobj(js);
455 js_set_getter_desc(js, g_textdecoder_proto, "encoding", 8, js_mkfun(js_textdecoder_get_encoding), JS_DESC_C);
456 js_set_getter_desc(js, g_textdecoder_proto, "fatal", 5, js_mkfun(js_textdecoder_get_fatal), JS_DESC_C);
457 js_set_getter_desc(js, g_textdecoder_proto, "ignoreBOM", 9, js_mkfun(js_textdecoder_get_ignore_bom), JS_DESC_C);
458 js_set(js, g_textdecoder_proto, "decode", js_mkfun(js_textdecoder_decode));
459 js_set_sym(js, g_textdecoder_proto, get_toStringTag_sym(), js_mkstr(js, "TextDecoder", 11));
460
461 ant_value_t td_ctor = js_make_ctor(js, js_textdecoder_ctor, g_textdecoder_proto, "TextDecoder", 11);
462 js_set(js, g, "TextDecoder", td_ctor);
463 js_set_descriptor(js, g, "TextDecoder", 11, JS_DESC_W | JS_DESC_C);
464}