MIRROR: javascript for 🐜's, a tiny runtime with big ambitions
1#include <stdlib.h>
2#include <string.h>
3#include <stdint.h>
4
5#include "ant.h"
6#include "ptr.h"
7#include "errors.h"
8#include "runtime.h"
9#include "internal.h"
10#include "descriptors.h"
11#include "utf8.h"
12
13#include "modules/textcodec.h"
14#include "modules/buffer.h"
15#include "modules/symbol.h"
16
17static ant_value_t g_textencoder_proto = 0;
18static ant_value_t g_textdecoder_proto = 0;
19
20enum { TEXT_DECODER_NATIVE_TAG = 0x54444543u }; // TDEC
21
22td_state_t *td_state_new(td_encoding_t enc, bool fatal, bool ignore_bom) {
23 td_state_t *st = calloc(1, sizeof(td_state_t));
24 if (!st) return NULL;
25 st->encoding = enc;
26 st->fatal = fatal;
27 st->ignore_bom = ignore_bom;
28 return st;
29}
30
31static td_state_t *td_get_state(ant_value_t obj) {
32 return (td_state_t *)js_get_native(obj, TEXT_DECODER_NATIVE_TAG);
33}
34
35static void td_finalize(ant_t *js, ant_object_t *obj) {
36 ant_value_t value = js_obj_from_ptr(obj);
37 free(js_get_native(value, TEXT_DECODER_NATIVE_TAG));
38 js_clear_native(value, TEXT_DECODER_NATIVE_TAG);
39}
40
41static int resolve_encoding(const char *s, size_t len) {
42 static const struct { const char *label; uint8_t len; td_encoding_t enc; } map[] = {
43 {"unicode-1-1-utf-8", 17, TD_ENC_UTF8}, {"unicode11utf8", 13, TD_ENC_UTF8},
44 {"unicode20utf8", 13, TD_ENC_UTF8}, {"utf-8", 5, TD_ENC_UTF8},
45 {"utf8", 4, TD_ENC_UTF8}, {"x-unicode20utf8",17, TD_ENC_UTF8},
46 {"windows-1252", 12, TD_ENC_WINDOWS_1252}, {"ascii", 5, TD_ENC_WINDOWS_1252},
47 {"unicodefffe", 11, TD_ENC_UTF16BE}, {"utf-16be", 8, TD_ENC_UTF16BE},
48 {"csunicode", 9, TD_ENC_UTF16LE}, {"iso-10646-ucs-2",16, TD_ENC_UTF16LE},
49 {"ucs-2", 5, TD_ENC_UTF16LE}, {"unicode", 7, TD_ENC_UTF16LE},
50 {"unicodefeff", 11, TD_ENC_UTF16LE}, {"utf-16", 6, TD_ENC_UTF16LE},
51 {"utf-16le", 8, TD_ENC_UTF16LE},
52 {"iso-8859-2", 10, TD_ENC_ISO_8859_2},
53 {NULL, 0, 0}
54 };
55 for (int i = 0; map[i].label; i++) {
56 if (len == map[i].len && strncasecmp(s, map[i].label, len) == 0) return (int)map[i].enc;
57 }
58 return -1;
59}
60
61static const char *encoding_name(td_encoding_t enc) {
62switch (enc) {
63 case TD_ENC_UTF16LE: return "utf-16le";
64 case TD_ENC_UTF16BE: return "utf-16be";
65 case TD_ENC_WINDOWS_1252: return "windows-1252";
66 case TD_ENC_ISO_8859_2: return "iso-8859-2";
67 default: return "utf-8";
68}}
69
// True for the five ASCII whitespace bytes: TAB, LF, FF, CR and SPACE.
static int td_label_is_space(unsigned char c) {
  return c == 0x09 || c == 0x0A || c == 0x0C || c == 0x0D || c == 0x20;
}

// Strips leading and trailing ASCII whitespace from the len-byte label at s.
// Returns a pointer into s at the first kept byte and stores the kept length in
// *out_len; nothing is copied or modified.
//
// Only TAB/LF/FF/CR/SPACE are removed. Other control bytes (NUL, VT, ...) are
// part of the label, so a label such as "\vutf-8" stays intact and is later
// rejected as unknown instead of being silently accepted.
static const char *trim_label(const char *s, size_t len, size_t *out_len) {
  while (len > 0 && td_label_is_space((unsigned char)s[0])) { s++; len--; }
  while (len > 0 && td_label_is_space((unsigned char)s[len - 1])) { len--; }
  *out_len = len;
  return s;
}
76
77static ant_value_t js_textencoder_get_encoding(ant_t *js, ant_value_t *args, int nargs) {
78 return js_mkstr(js, "utf-8", 5);
79}
80
81ant_value_t te_encode(ant_t *js, const char *str, size_t str_len) {
82 ArrayBufferData *ab = create_array_buffer_data(str_len);
83 if (!ab) return js_mkerr(js, "out of memory");
84
85 if (str_len > 0) {
86 const uint8_t *s = (const uint8_t *)str;
87 uint8_t *d = ab->data; size_t i = 0;
88
89 while (i < str_len) {
90 if (s[i] == 0xED && i + 2 < str_len && s[i+1] >= 0xA0 && s[i+1] <= 0xBF) {
91 d[i] = 0xEF; d[i+1] = 0xBF; d[i+2] = 0xBD;
92 i += 3;
93 } else { d[i] = s[i]; i++; }}
94 }
95
96 return create_typed_array(js, TYPED_ARRAY_UINT8, ab, 0, str_len, "Uint8Array");
97}
98
99static ant_value_t js_textencoder_encode(ant_t *js, ant_value_t *args, int nargs) {
100 size_t str_len = 0;
101 const char *str = "";
102
103 if (nargs > 0 && vtype(args[0]) == T_STR) {
104 str = js_getstr(js, args[0], &str_len);
105 if (!str) { str = ""; str_len = 0; }
106 } else if (nargs > 0 && vtype(args[0]) != T_UNDEF) {
107 ant_value_t sv = js_tostring_val(js, args[0]);
108 if (is_err(sv)) return sv;
109 str = js_getstr(js, sv, &str_len);
110 if (!str) { str = ""; str_len = 0; }
111 }
112
113 return te_encode(js, str, str_len);
114}
115
116static ant_value_t js_textencoder_encode_into(ant_t *js, ant_value_t *args, int nargs) {
117 if (nargs < 2) return js_mkerr_typed(js, JS_ERR_TYPE, "encodeInto requires 2 arguments");
118
119 size_t str_len = 0;
120 const char *str = "";
121 if (vtype(args[0]) == T_STR) {
122 str = js_getstr(js, args[0], &str_len);
123 if (!str) { str = ""; str_len = 0; }
124 } else if (vtype(args[0]) != T_UNDEF) {
125 ant_value_t sv = js_tostring_val(js, args[0]);
126 if (is_err(sv)) return sv;
127 str = js_getstr(js, sv, &str_len);
128 if (!str) { str = ""; str_len = 0; }
129 }
130
131 TypedArrayData *ta = buffer_get_typedarray_data(args[1]);
132 if (!ta) return js_mkerr_typed(js, JS_ERR_TYPE, "Second argument must be a Uint8Array");
133
134 uint8_t *dest = (ta->buffer && !ta->buffer->is_detached)
135 ? ta->buffer->data + ta->byte_offset : NULL;
136 size_t available = ta->byte_length;
137
138 const utf8proc_uint8_t *src = (const utf8proc_uint8_t *)str;
139 utf8proc_ssize_t src_len = (utf8proc_ssize_t)str_len;
140 utf8proc_ssize_t pos = 0;
141
142 size_t written = 0;
143 size_t read_units = 0;
144
145 while (pos < src_len) {
146 utf8proc_int32_t cp;
147 utf8proc_ssize_t n = utf8_next(src + pos, src_len - pos, &cp);
148 utf8proc_uint8_t tmp[4];
149 utf8proc_ssize_t enc_len;
150
151 if (cp >= 0xD800 && cp <= 0xDFFF) {
152 tmp[0] = 0xEF; tmp[1] = 0xBF; tmp[2] = 0xBD;
153 enc_len = 3;
154 } else {
155 enc_len = (cp >= 0) ? utf8proc_encode_char(cp, tmp) : 0;
156 if (enc_len <= 0) { tmp[0] = 0xEF; tmp[1] = 0xBF; tmp[2] = 0xBD; enc_len = 3; }
157 }
158
159 if (written + (size_t)enc_len > available) break;
160 if (dest) memcpy(dest + written, tmp, (size_t)enc_len);
161
162 written += (size_t)enc_len;
163 pos += n;
164 read_units += (cp >= 0x10000 && cp <= 0x10FFFF) ? 2 : 1;
165 }
166
167 ant_value_t result = js_mkobj(js);
168 js_set(js, result, "read", js_mknum((double)read_units));
169 js_set(js, result, "written", js_mknum((double)written));
170
171 return result;
172}
173
174static ant_value_t js_textencoder_ctor(ant_t *js, ant_value_t *args, int nargs) {
175 if (vtype(js->new_target) == T_UNDEF)
176 return js_mkerr_typed(js, JS_ERR_TYPE, "TextEncoder constructor requires 'new'");
177 ant_value_t obj = js_mkobj(js);
178 ant_value_t proto = js_instance_proto_from_new_target(js, g_textencoder_proto);
179 if (is_object_type(proto)) js_set_proto_init(obj, proto);
180 return obj;
181}
182
183static ant_value_t js_textdecoder_get_encoding(ant_t *js, ant_value_t *args, int nargs) {
184 td_state_t *st = td_get_state(js->this_val);
185 const char *name = encoding_name(st ? st->encoding : TD_ENC_UTF8);
186 return js_mkstr(js, name, strlen(name));
187}
188
189static ant_value_t js_textdecoder_get_fatal(ant_t *js, ant_value_t *args, int nargs) {
190 td_state_t *st = td_get_state(js->this_val);
191 return (st && st->fatal) ? js_true : js_false;
192}
193
194static ant_value_t js_textdecoder_get_ignore_bom(ant_t *js, ant_value_t *args, int nargs) {
195 td_state_t *st = td_get_state(js->this_val);
196 return (st && st->ignore_bom) ? js_true : js_false;
197}
198
// Reads one 16-bit code unit from the two bytes at p.
// be selects big-endian byte order (p[0] is the high byte); otherwise the
// bytes are little-endian (p[1] is the high byte).
static inline uint16_t u16_read(const uint8_t *p, bool be) {
  const uint8_t high = be ? p[0] : p[1];
  const uint8_t low  = be ? p[1] : p[0];
  return (uint16_t)(((unsigned)high << 8) | low);
}
204
205static inline size_t u8_emit(char *out, size_t o, utf8proc_int32_t cp) {
206 utf8proc_ssize_t n = utf8proc_encode_char(cp, (utf8proc_uint8_t *)(out + o));
207 return n > 0 ? o + (size_t)n : o;
208}
209
// Writes the three-byte UTF-8 encoding of U+FFFD (EF BF BD) at out + o and
// returns the offset just past it. The caller must provide room for 3 bytes.
static inline size_t u8_fffd(char *out, size_t o) {
  static const char replacement[3] = { (char)0xEF, (char)0xBF, (char)0xBD };
  memcpy(out + o, replacement, sizeof replacement);
  return o + sizeof replacement;
}
214
// UTF-16 surrogate helpers for utf16_decode(); they are #undef'd right after it.
//   U16_IS_HIGH - code unit is a high (leading) surrogate, 0xD800..0xDBFF
//   U16_IS_LOW  - code unit is a low (trailing) surrogate, 0xDC00..0xDFFF
//   U16_PAIR    - combines a high/low surrogate pair into its code point
// U16_IS_HIGH / U16_IS_LOW evaluate their argument twice: pass plain values only.
#define U16_IS_HIGH(cu)  (0xD800 <= (cu) && (cu) <= 0xDBFF)
#define U16_IS_LOW(cu)   (0xDC00 <= (cu) && (cu) <= 0xDFFF)
#define U16_PAIR(hi, lo) (0x10000 + ((uint32_t)((hi) - 0xD800) << 10) + ((lo) - 0xDC00))
218
// Maps one windows-1252 byte to its Unicode code point.
// Bytes 0x80..0x9F go through the table below; every other byte maps to the
// code point with the same numeric value.
static uint32_t decode_windows_1252_byte(uint8_t byte) {
  static const uint16_t c1_map[32] = {
    /* 0x80 */ 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    /* 0x88 */ 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
    /* 0x90 */ 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    /* 0x98 */ 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178,
  };

  if (byte >= 0x80 && byte <= 0x9F) return c1_map[byte - 0x80];
  return byte;
}
230
// Maps one ISO-8859-2 byte to its Unicode code point.
// Bytes below 0xA0 map to the code point with the same value; bytes 0xA0..0xFF
// are looked up in the table below.
static uint32_t decode_iso_8859_2_byte(uint8_t byte) {
  enum { HIGH_BASE = 0xA0 };
  static const uint16_t high_half[0x100 - HIGH_BASE] = {
    /* 0xA0 */ 0x00A0, 0x0104, 0x02D8, 0x0141, 0x00A4, 0x013D, 0x015A, 0x00A7,
    /* 0xA8 */ 0x00A8, 0x0160, 0x015E, 0x0164, 0x0179, 0x00AD, 0x017D, 0x017B,
    /* 0xB0 */ 0x00B0, 0x0105, 0x02DB, 0x0142, 0x00B4, 0x013E, 0x015B, 0x02C7,
    /* 0xB8 */ 0x00B8, 0x0161, 0x015F, 0x0165, 0x017A, 0x02DD, 0x017E, 0x017C,
    /* 0xC0 */ 0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7,
    /* 0xC8 */ 0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E,
    /* 0xD0 */ 0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7,
    /* 0xD8 */ 0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162, 0x00DF,
    /* 0xE0 */ 0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7,
    /* 0xE8 */ 0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F,
    /* 0xF0 */ 0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7,
    /* 0xF8 */ 0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9,
  };

  if (byte >= HIGH_BASE) return high_half[byte - HIGH_BASE];
  return byte;
}
249
250static utf8proc_ssize_t decode_single_byte(td_state_t *st, const uint8_t *src, size_t len, char *out) {
251 size_t o = 0;
252 for (size_t i = 0; i < len; i++) {
253 uint32_t cp = (st->encoding == TD_ENC_WINDOWS_1252)
254 ? decode_windows_1252_byte(src[i])
255 : decode_iso_8859_2_byte(src[i]);
256 o = u8_emit(out, o, (utf8proc_int32_t)cp);
257 }
258 return (utf8proc_ssize_t)o;
259}
260
261static utf8proc_ssize_t utf16_decode(td_state_t *st, const uint8_t *src, size_t len, char *out, bool stream) {
262 bool be = (st->encoding == TD_ENC_UTF16BE);
263 size_t i = 0, o = 0;
264 size_t avail;
265
266 if (!st->bom_seen) {
267 if (len < 2) goto pend_tail;
268 if (u16_read(src, be) == 0xFEFF && !st->ignore_bom) i = 2;
269 st->bom_seen = true;
270 }
271
272 while (i < len) {
273 avail = len - i;
274
275 if (avail < 2) goto pend_tail;
276 uint16_t cu = u16_read(src + i, be);
277 i += 2;
278
279 if (!U16_IS_HIGH(cu) && !U16_IS_LOW(cu)) {
280 o = u8_emit(out, o, (utf8proc_int32_t)cu);
281 continue;
282 }
283
284 if (U16_IS_LOW(cu)) goto err;
285
286 avail = len - i;
287 if (avail < 2) goto pend_hi;
288
289 uint16_t lo = u16_read(src + i, be);
290 if (U16_IS_LOW(lo)) { i += 2; o = u8_emit(out, o, (utf8proc_int32_t)U16_PAIR(cu, lo)); continue; }
291
292 goto err;
293
294 pend_tail:
295 if (stream) { st->pending[0] = src[i]; st->pending_len = 1; }
296 else { if (st->fatal) return -1; o = u8_fffd(out, o); }
297 break;
298
299 pend_hi:
300 if (stream) { st->pending_len = (int)(len - (i - 2)); memcpy(st->pending, src + i - 2, (size_t)st->pending_len); }
301 else { if (st->fatal) return -1; o = u8_fffd(out, o); if (avail == 1) o = u8_fffd(out, o); }
302 break;
303
304 err:
305 if (st->fatal) return -1;
306 o = u8_fffd(out, o);
307 continue;
308 }
309
310 return (utf8proc_ssize_t)o;
311}
312
313#undef U16_IS_HIGH
314#undef U16_IS_LOW
315#undef U16_PAIR
316
317ant_value_t td_decode(ant_t *js, td_state_t *st, const uint8_t *input, size_t input_len, bool stream_mode) {
318 size_t total = (size_t)st->pending_len + input_len;
319 if (total == 0) {
320 if (!stream_mode) st->bom_seen = false;
321 return js_mkstr(js, "", 0);
322 }
323
324 uint8_t *work = NULL;
325 const uint8_t *src;
326 if (st->pending_len > 0) {
327 work = malloc(total);
328 if (!work) return js_mkerr(js, "out of memory");
329 memcpy(work, st->pending, (size_t)st->pending_len);
330 if (input && input_len > 0) memcpy(work + st->pending_len, input, input_len);
331 src = work;
332 } else src = input;
333 st->pending_len = 0;
334
335 char *out = malloc(total * 3 + 1);
336 if (!out) { free(work); return js_mkerr(js, "out of memory"); }
337
338 utf8proc_ssize_t n;
339 if (st->encoding == TD_ENC_UTF16LE || st->encoding == TD_ENC_UTF16BE) {
340 n = utf16_decode(st, src, total, out, stream_mode);
341 } else if (st->encoding == TD_ENC_WINDOWS_1252 || st->encoding == TD_ENC_ISO_8859_2) {
342 n = decode_single_byte(st, src, total, out);
343 st->pending_len = 0;
344 st->bom_seen = false;
345 } else {
346 utf8_dec_t dec = { .ignore_bom = st->ignore_bom, .bom_seen = st->bom_seen };
347 n = utf8_whatwg_decode(&dec, src, total, out, st->fatal, stream_mode);
348 st->pending_len = dec.pend_pos;
349 memcpy(st->pending, dec.pend_buf, (size_t)dec.pend_pos);
350 st->bom_seen = stream_mode ? dec.bom_seen : false;
351 }
352
353 if (n < 0) {
354 free(work); free(out);
355 return js_mkerr_typed(js, JS_ERR_TYPE, "The encoded data was not valid.");
356 }
357
358 if (st->encoding != TD_ENC_UTF8) {
359 if (!stream_mode) st->bom_seen = false;
360 }
361
362 ant_value_t result = js_mkstr(js, out, (size_t)n);
363 free(work);
364 free(out);
365
366 return result;
367}
368
369static ant_value_t js_textdecoder_decode(ant_t *js, ant_value_t *args, int nargs) {
370 td_state_t *st = td_get_state(js->this_val);
371 if (!st) return js_mkerr_typed(js, JS_ERR_TYPE, "Invalid TextDecoder");
372
373 bool stream_mode = false;
374 if (nargs > 1 && is_object_type(args[1])) {
375 ant_value_t sv = js_get(js, args[1], "stream");
376 stream_mode = js_truthy(js, sv);
377 }
378
379 const uint8_t *input = NULL;
380 size_t input_len = 0;
381 if (nargs > 0 && is_object_type(args[0]))
382 buffer_source_get_bytes(js, args[0], &input, &input_len);
383
384 return td_decode(js, st, input, input_len, stream_mode);
385}
386
387static ant_value_t js_textdecoder_ctor(ant_t *js, ant_value_t *args, int nargs) {
388 if (vtype(js->new_target) == T_UNDEF)
389 return js_mkerr_typed(js, JS_ERR_TYPE, "TextDecoder constructor requires 'new'");
390
391 td_encoding_t enc = TD_ENC_UTF8;
392 if (nargs > 0 && !is_undefined(args[0])) {
393 ant_value_t label = (vtype(args[0]) == T_STR) ? args[0] : coerce_to_str(js, args[0]);
394 if (is_err(label)) return label;
395
396 size_t llen;
397 const char *raw = js_getstr(js, label, &llen);
398 if (raw) {
399 size_t tlen;
400 const char *trimmed = trim_label(raw, llen, &tlen);
401 int resolved = resolve_encoding(trimmed, tlen);
402
403 if (resolved < 0) return js_mkerr_typed(
404 js, JS_ERR_RANGE, "Failed to construct 'TextDecoder': The encoding label provided ('%.*s') is invalid.",
405 (int)tlen, trimmed
406 );
407
408 enc = (td_encoding_t)resolved;
409 }}
410
411 bool fatal = false;
412 bool ignore_bom = false;
413
414 if (nargs > 1 && is_object_type(args[1])) {
415 ant_value_t fv = js_getprop_fallback(js, args[1], "fatal");
416 if (is_err(fv)) return fv;
417 if (vtype(fv) != T_UNDEF) fatal = js_truthy(js, fv);
418 ant_value_t bv = js_getprop_fallback(js, args[1], "ignoreBOM");
419 if (is_err(bv)) return bv;
420 if (vtype(bv) != T_UNDEF) ignore_bom = js_truthy(js, bv);
421 }
422
423 td_state_t *st = td_state_new(enc, fatal, ignore_bom);
424 if (!st) return js_mkerr(js, "out of memory");
425
426 ant_value_t obj = js_mkobj(js);
427 ant_value_t proto = js_instance_proto_from_new_target(js, g_textdecoder_proto);
428
429 if (is_object_type(proto)) js_set_proto_init(obj, proto);
430 js_set_native(obj, st, TEXT_DECODER_NATIVE_TAG);
431 js_set_finalizer(obj, td_finalize);
432
433 return obj;
434}
435
436void init_textcodec_module(void) {
437 ant_t *js = rt->js;
438 ant_value_t g = js_glob(js);
439
440 g_textencoder_proto = js_mkobj(js);
441 js_set_getter_desc(js, g_textencoder_proto, "encoding", 8, js_mkfun(js_textencoder_get_encoding), JS_DESC_C);
442 js_set(js, g_textencoder_proto, "encode", js_mkfun(js_textencoder_encode));
443 js_set(js, g_textencoder_proto, "encodeInto", js_mkfun(js_textencoder_encode_into));
444 js_set_sym(js, g_textencoder_proto, get_toStringTag_sym(), js_mkstr(js, "TextEncoder", 11));
445
446 ant_value_t te_ctor = js_make_ctor(js, js_textencoder_ctor, g_textencoder_proto, "TextEncoder", 11);
447 js_set(js, g, "TextEncoder", te_ctor);
448 js_set_descriptor(js, g, "TextEncoder", 11, JS_DESC_W | JS_DESC_C);
449
450 g_textdecoder_proto = js_mkobj(js);
451 js_set_getter_desc(js, g_textdecoder_proto, "encoding", 8, js_mkfun(js_textdecoder_get_encoding), JS_DESC_C);
452 js_set_getter_desc(js, g_textdecoder_proto, "fatal", 5, js_mkfun(js_textdecoder_get_fatal), JS_DESC_C);
453 js_set_getter_desc(js, g_textdecoder_proto, "ignoreBOM", 9, js_mkfun(js_textdecoder_get_ignore_bom), JS_DESC_C);
454 js_set(js, g_textdecoder_proto, "decode", js_mkfun(js_textdecoder_decode));
455 js_set_sym(js, g_textdecoder_proto, get_toStringTag_sym(), js_mkstr(js, "TextDecoder", 11));
456
457 ant_value_t td_ctor = js_make_ctor(js, js_textdecoder_ctor, g_textdecoder_proto, "TextDecoder", 11);
458 js_set(js, g, "TextDecoder", td_ctor);
459 js_set_descriptor(js, g, "TextDecoder", 11, JS_DESC_W | JS_DESC_C);
460}