MIRROR: javascript for 🐜's, a tiny runtime with big ambitions
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at mir/inline-method 464 lines 16 kB view raw
1#include <stdlib.h> 2#include <string.h> 3#include <stdint.h> 4 5#include "ant.h" 6#include "errors.h" 7#include "runtime.h" 8#include "internal.h" 9#include "descriptors.h" 10#include "utf8.h" 11 12#include "modules/textcodec.h" 13#include "modules/buffer.h" 14#include "modules/symbol.h" 15 16static ant_value_t g_textencoder_proto = 0; 17static ant_value_t g_textdecoder_proto = 0; 18 19td_state_t *td_state_new(td_encoding_t enc, bool fatal, bool ignore_bom) { 20 td_state_t *st = calloc(1, sizeof(td_state_t)); 21 if (!st) return NULL; 22 st->encoding = enc; 23 st->fatal = fatal; 24 st->ignore_bom = ignore_bom; 25 return st; 26} 27 28static td_state_t *td_get_state(ant_value_t obj) { 29 ant_value_t s = js_get_slot(obj, SLOT_DATA); 30 if (vtype(s) != T_NUM) return NULL; 31 return (td_state_t *)(uintptr_t)(size_t)js_getnum(s); 32} 33 34static void td_finalize(ant_t *js, ant_object_t *obj) { 35 if (!obj->extra_slots) return; 36 ant_extra_slot_t *entries = (ant_extra_slot_t *)obj->extra_slots; 37 38 for (uint8_t i = 0; i < obj->extra_count; i++) { 39 if (entries[i].slot == SLOT_DATA && vtype(entries[i].value) == T_NUM) { 40 free((td_state_t *)(uintptr_t)(size_t)js_getnum(entries[i].value)); 41 return; 42 }} 43} 44 45static int resolve_encoding(const char *s, size_t len) { 46 static const struct { const char *label; uint8_t len; td_encoding_t enc; } map[] = { 47 {"unicode-1-1-utf-8", 17, TD_ENC_UTF8}, {"unicode11utf8", 13, TD_ENC_UTF8}, 48 {"unicode20utf8", 13, TD_ENC_UTF8}, {"utf-8", 5, TD_ENC_UTF8}, 49 {"utf8", 4, TD_ENC_UTF8}, {"x-unicode20utf8",17, TD_ENC_UTF8}, 50 {"windows-1252", 12, TD_ENC_WINDOWS_1252}, {"ascii", 5, TD_ENC_WINDOWS_1252}, 51 {"unicodefffe", 11, TD_ENC_UTF16BE}, {"utf-16be", 8, TD_ENC_UTF16BE}, 52 {"csunicode", 9, TD_ENC_UTF16LE}, {"iso-10646-ucs-2",16, TD_ENC_UTF16LE}, 53 {"ucs-2", 5, TD_ENC_UTF16LE}, {"unicode", 7, TD_ENC_UTF16LE}, 54 {"unicodefeff", 11, TD_ENC_UTF16LE}, {"utf-16", 6, TD_ENC_UTF16LE}, 55 {"utf-16le", 8, TD_ENC_UTF16LE}, 56 
{"iso-8859-2", 10, TD_ENC_ISO_8859_2}, 57 {NULL, 0, 0} 58 }; 59 for (int i = 0; map[i].label; i++) { 60 if (len == map[i].len && strncasecmp(s, map[i].label, len) == 0) return (int)map[i].enc; 61 } 62 return -1; 63} 64 65static const char *encoding_name(td_encoding_t enc) { 66switch (enc) { 67 case TD_ENC_UTF16LE: return "utf-16le"; 68 case TD_ENC_UTF16BE: return "utf-16be"; 69 case TD_ENC_WINDOWS_1252: return "windows-1252"; 70 case TD_ENC_ISO_8859_2: return "iso-8859-2"; 71 default: return "utf-8"; 72}} 73 74static const char *trim_label(const char *s, size_t len, size_t *out_len) { 75 while (len > 0 && (unsigned char)*s <= 0x20) { s++; len--; } 76 while (len > 0 && (unsigned char)s[len - 1] <= 0x20) { len--; } 77 *out_len = len; 78 return s; 79} 80 81static ant_value_t js_textencoder_get_encoding(ant_t *js, ant_value_t *args, int nargs) { 82 return js_mkstr(js, "utf-8", 5); 83} 84 85ant_value_t te_encode(ant_t *js, const char *str, size_t str_len) { 86 ArrayBufferData *ab = create_array_buffer_data(str_len); 87 if (!ab) return js_mkerr(js, "out of memory"); 88 89 if (str_len > 0) { 90 const uint8_t *s = (const uint8_t *)str; 91 uint8_t *d = ab->data; size_t i = 0; 92 93 while (i < str_len) { 94 if (s[i] == 0xED && i + 2 < str_len && s[i+1] >= 0xA0 && s[i+1] <= 0xBF) { 95 d[i] = 0xEF; d[i+1] = 0xBF; d[i+2] = 0xBD; 96 i += 3; 97 } else { d[i] = s[i]; i++; }} 98 } 99 100 return create_typed_array(js, TYPED_ARRAY_UINT8, ab, 0, str_len, "Uint8Array"); 101} 102 103static ant_value_t js_textencoder_encode(ant_t *js, ant_value_t *args, int nargs) { 104 size_t str_len = 0; 105 const char *str = ""; 106 107 if (nargs > 0 && vtype(args[0]) == T_STR) { 108 str = js_getstr(js, args[0], &str_len); 109 if (!str) { str = ""; str_len = 0; } 110 } else if (nargs > 0 && vtype(args[0]) != T_UNDEF) { 111 ant_value_t sv = js_tostring_val(js, args[0]); 112 if (is_err(sv)) return sv; 113 str = js_getstr(js, sv, &str_len); 114 if (!str) { str = ""; str_len = 0; } 115 } 116 117 return 
te_encode(js, str, str_len); 118} 119 120static ant_value_t js_textencoder_encode_into(ant_t *js, ant_value_t *args, int nargs) { 121 if (nargs < 2) return js_mkerr_typed(js, JS_ERR_TYPE, "encodeInto requires 2 arguments"); 122 123 size_t str_len = 0; 124 const char *str = ""; 125 if (vtype(args[0]) == T_STR) { 126 str = js_getstr(js, args[0], &str_len); 127 if (!str) { str = ""; str_len = 0; } 128 } else if (vtype(args[0]) != T_UNDEF) { 129 ant_value_t sv = js_tostring_val(js, args[0]); 130 if (is_err(sv)) return sv; 131 str = js_getstr(js, sv, &str_len); 132 if (!str) { str = ""; str_len = 0; } 133 } 134 135 TypedArrayData *ta = buffer_get_typedarray_data(args[1]); 136 if (!ta) return js_mkerr_typed(js, JS_ERR_TYPE, "Second argument must be a Uint8Array"); 137 138 uint8_t *dest = (ta->buffer && !ta->buffer->is_detached) 139 ? ta->buffer->data + ta->byte_offset : NULL; 140 size_t available = ta->byte_length; 141 142 const utf8proc_uint8_t *src = (const utf8proc_uint8_t *)str; 143 utf8proc_ssize_t src_len = (utf8proc_ssize_t)str_len; 144 utf8proc_ssize_t pos = 0; 145 146 size_t written = 0; 147 size_t read_units = 0; 148 149 while (pos < src_len) { 150 utf8proc_int32_t cp; 151 utf8proc_ssize_t n = utf8_next(src + pos, src_len - pos, &cp); 152 utf8proc_uint8_t tmp[4]; 153 utf8proc_ssize_t enc_len; 154 155 if (cp >= 0xD800 && cp <= 0xDFFF) { 156 tmp[0] = 0xEF; tmp[1] = 0xBF; tmp[2] = 0xBD; 157 enc_len = 3; 158 } else { 159 enc_len = (cp >= 0) ? utf8proc_encode_char(cp, tmp) : 0; 160 if (enc_len <= 0) { tmp[0] = 0xEF; tmp[1] = 0xBF; tmp[2] = 0xBD; enc_len = 3; } 161 } 162 163 if (written + (size_t)enc_len > available) break; 164 if (dest) memcpy(dest + written, tmp, (size_t)enc_len); 165 166 written += (size_t)enc_len; 167 pos += n; 168 read_units += (cp >= 0x10000 && cp <= 0x10FFFF) ? 
2 : 1; 169 } 170 171 ant_value_t result = js_mkobj(js); 172 js_set(js, result, "read", js_mknum((double)read_units)); 173 js_set(js, result, "written", js_mknum((double)written)); 174 175 return result; 176} 177 178static ant_value_t js_textencoder_ctor(ant_t *js, ant_value_t *args, int nargs) { 179 if (vtype(js->new_target) == T_UNDEF) 180 return js_mkerr_typed(js, JS_ERR_TYPE, "TextEncoder constructor requires 'new'"); 181 ant_value_t obj = js_mkobj(js); 182 ant_value_t proto = js_instance_proto_from_new_target(js, g_textencoder_proto); 183 if (is_object_type(proto)) js_set_proto_init(obj, proto); 184 return obj; 185} 186 187static ant_value_t js_textdecoder_get_encoding(ant_t *js, ant_value_t *args, int nargs) { 188 td_state_t *st = td_get_state(js->this_val); 189 const char *name = encoding_name(st ? st->encoding : TD_ENC_UTF8); 190 return js_mkstr(js, name, strlen(name)); 191} 192 193static ant_value_t js_textdecoder_get_fatal(ant_t *js, ant_value_t *args, int nargs) { 194 td_state_t *st = td_get_state(js->this_val); 195 return (st && st->fatal) ? js_true : js_false; 196} 197 198static ant_value_t js_textdecoder_get_ignore_bom(ant_t *js, ant_value_t *args, int nargs) { 199 td_state_t *st = td_get_state(js->this_val); 200 return (st && st->ignore_bom) ? js_true : js_false; 201} 202 203static inline uint16_t u16_read(const uint8_t *p, bool be) { 204 return be 205 ? (uint16_t)((uint16_t)p[0] << 8 | p[1]) 206 : (uint16_t)((uint16_t)p[1] << 8 | p[0]); 207} 208 209static inline size_t u8_emit(char *out, size_t o, utf8proc_int32_t cp) { 210 utf8proc_ssize_t n = utf8proc_encode_char(cp, (utf8proc_uint8_t *)(out + o)); 211 return n > 0 ? 
o + (size_t)n : o; 212} 213 214static inline size_t u8_fffd(char *out, size_t o) { 215 out[o] = (char)0xEF; out[o+1] = (char)0xBF; out[o+2] = (char)0xBD; 216 return o + 3; 217} 218 219#define U16_IS_HIGH(cu) ((cu) >= 0xD800 && (cu) <= 0xDBFF) 220#define U16_IS_LOW(cu) ((cu) >= 0xDC00 && (cu) <= 0xDFFF) 221#define U16_PAIR(hi,lo) (0x10000 + ((uint32_t)((hi) - 0xD800) << 10) + ((lo) - 0xDC00)) 222 223static uint32_t decode_windows_1252_byte(uint8_t byte) { 224 static const uint16_t specials[32] = { 225 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 226 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, 227 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 228 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, 229 }; 230 if (byte < 0x80) return byte; 231 if (byte < 0xA0) return specials[byte - 0x80]; 232 return byte; 233} 234 235static uint32_t decode_iso_8859_2_byte(uint8_t byte) { 236 static const uint16_t upper[96] = { 237 0x00A0, 0x0104, 0x02D8, 0x0141, 0x00A4, 0x013D, 0x015A, 0x00A7, 238 0x00A8, 0x0160, 0x015E, 0x0164, 0x0179, 0x00AD, 0x017D, 0x017B, 239 0x00B0, 0x0105, 0x02DB, 0x0142, 0x00B4, 0x013E, 0x015B, 0x02C7, 240 0x00B8, 0x0161, 0x015F, 0x0165, 0x017A, 0x02DD, 0x017E, 0x017C, 241 0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7, 242 0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E, 243 0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7, 244 0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162, 0x00DF, 245 0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7, 246 0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F, 247 0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7, 248 0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9, 249 }; 250 if (byte < 0xA0) return byte; 251 return upper[byte - 0xA0]; 252} 253 254static utf8proc_ssize_t decode_single_byte(td_state_t *st, const uint8_t *src, size_t len, char *out) { 255 
size_t o = 0; 256 for (size_t i = 0; i < len; i++) { 257 uint32_t cp = (st->encoding == TD_ENC_WINDOWS_1252) 258 ? decode_windows_1252_byte(src[i]) 259 : decode_iso_8859_2_byte(src[i]); 260 o = u8_emit(out, o, (utf8proc_int32_t)cp); 261 } 262 return (utf8proc_ssize_t)o; 263} 264 265static utf8proc_ssize_t utf16_decode(td_state_t *st, const uint8_t *src, size_t len, char *out, bool stream) { 266 bool be = (st->encoding == TD_ENC_UTF16BE); 267 size_t i = 0, o = 0; 268 size_t avail; 269 270 if (!st->bom_seen) { 271 if (len < 2) goto pend_tail; 272 if (u16_read(src, be) == 0xFEFF && !st->ignore_bom) i = 2; 273 st->bom_seen = true; 274 } 275 276 while (i < len) { 277 avail = len - i; 278 279 if (avail < 2) goto pend_tail; 280 uint16_t cu = u16_read(src + i, be); 281 i += 2; 282 283 if (!U16_IS_HIGH(cu) && !U16_IS_LOW(cu)) { 284 o = u8_emit(out, o, (utf8proc_int32_t)cu); 285 continue; 286 } 287 288 if (U16_IS_LOW(cu)) goto err; 289 290 avail = len - i; 291 if (avail < 2) goto pend_hi; 292 293 uint16_t lo = u16_read(src + i, be); 294 if (U16_IS_LOW(lo)) { i += 2; o = u8_emit(out, o, (utf8proc_int32_t)U16_PAIR(cu, lo)); continue; } 295 296 goto err; 297 298 pend_tail: 299 if (stream) { st->pending[0] = src[i]; st->pending_len = 1; } 300 else { if (st->fatal) return -1; o = u8_fffd(out, o); } 301 break; 302 303 pend_hi: 304 if (stream) { st->pending_len = (int)(len - (i - 2)); memcpy(st->pending, src + i - 2, (size_t)st->pending_len); } 305 else { if (st->fatal) return -1; o = u8_fffd(out, o); if (avail == 1) o = u8_fffd(out, o); } 306 break; 307 308 err: 309 if (st->fatal) return -1; 310 o = u8_fffd(out, o); 311 continue; 312 } 313 314 return (utf8proc_ssize_t)o; 315} 316 317#undef U16_IS_HIGH 318#undef U16_IS_LOW 319#undef U16_PAIR 320 321ant_value_t td_decode(ant_t *js, td_state_t *st, const uint8_t *input, size_t input_len, bool stream_mode) { 322 size_t total = (size_t)st->pending_len + input_len; 323 if (total == 0) { 324 if (!stream_mode) st->bom_seen = false; 325 
return js_mkstr(js, "", 0); 326 } 327 328 uint8_t *work = NULL; 329 const uint8_t *src; 330 if (st->pending_len > 0) { 331 work = malloc(total); 332 if (!work) return js_mkerr(js, "out of memory"); 333 memcpy(work, st->pending, (size_t)st->pending_len); 334 if (input && input_len > 0) memcpy(work + st->pending_len, input, input_len); 335 src = work; 336 } else src = input; 337 st->pending_len = 0; 338 339 char *out = malloc(total * 3 + 1); 340 if (!out) { free(work); return js_mkerr(js, "out of memory"); } 341 342 utf8proc_ssize_t n; 343 if (st->encoding == TD_ENC_UTF16LE || st->encoding == TD_ENC_UTF16BE) { 344 n = utf16_decode(st, src, total, out, stream_mode); 345 } else if (st->encoding == TD_ENC_WINDOWS_1252 || st->encoding == TD_ENC_ISO_8859_2) { 346 n = decode_single_byte(st, src, total, out); 347 st->pending_len = 0; 348 st->bom_seen = false; 349 } else { 350 utf8_dec_t dec = { .ignore_bom = st->ignore_bom, .bom_seen = st->bom_seen }; 351 n = utf8_whatwg_decode(&dec, src, total, out, st->fatal, stream_mode); 352 st->pending_len = dec.pend_pos; 353 memcpy(st->pending, dec.pend_buf, (size_t)dec.pend_pos); 354 st->bom_seen = stream_mode ? 
dec.bom_seen : false; 355 } 356 357 if (n < 0) { 358 free(work); free(out); 359 return js_mkerr_typed(js, JS_ERR_TYPE, "The encoded data was not valid."); 360 } 361 362 if (st->encoding != TD_ENC_UTF8) { 363 if (!stream_mode) st->bom_seen = false; 364 } 365 366 ant_value_t result = js_mkstr(js, out, (size_t)n); 367 free(work); 368 free(out); 369 370 return result; 371} 372 373static ant_value_t js_textdecoder_decode(ant_t *js, ant_value_t *args, int nargs) { 374 td_state_t *st = td_get_state(js->this_val); 375 if (!st) return js_mkerr_typed(js, JS_ERR_TYPE, "Invalid TextDecoder"); 376 377 bool stream_mode = false; 378 if (nargs > 1 && is_object_type(args[1])) { 379 ant_value_t sv = js_get(js, args[1], "stream"); 380 stream_mode = js_truthy(js, sv); 381 } 382 383 const uint8_t *input = NULL; 384 size_t input_len = 0; 385 if (nargs > 0 && is_object_type(args[0])) 386 buffer_source_get_bytes(js, args[0], &input, &input_len); 387 388 return td_decode(js, st, input, input_len, stream_mode); 389} 390 391static ant_value_t js_textdecoder_ctor(ant_t *js, ant_value_t *args, int nargs) { 392 if (vtype(js->new_target) == T_UNDEF) 393 return js_mkerr_typed(js, JS_ERR_TYPE, "TextDecoder constructor requires 'new'"); 394 395 td_encoding_t enc = TD_ENC_UTF8; 396 if (nargs > 0 && !is_undefined(args[0])) { 397 ant_value_t label = (vtype(args[0]) == T_STR) ? 
args[0] : coerce_to_str(js, args[0]); 398 if (is_err(label)) return label; 399 400 size_t llen; 401 const char *raw = js_getstr(js, label, &llen); 402 if (raw) { 403 size_t tlen; 404 const char *trimmed = trim_label(raw, llen, &tlen); 405 int resolved = resolve_encoding(trimmed, tlen); 406 407 if (resolved < 0) return js_mkerr_typed( 408 js, JS_ERR_RANGE, "Failed to construct 'TextDecoder': The encoding label provided ('%.*s') is invalid.", 409 (int)tlen, trimmed 410 ); 411 412 enc = (td_encoding_t)resolved; 413 }} 414 415 bool fatal = false; 416 bool ignore_bom = false; 417 418 if (nargs > 1 && is_object_type(args[1])) { 419 ant_value_t fv = js_getprop_fallback(js, args[1], "fatal"); 420 if (is_err(fv)) return fv; 421 if (vtype(fv) != T_UNDEF) fatal = js_truthy(js, fv); 422 ant_value_t bv = js_getprop_fallback(js, args[1], "ignoreBOM"); 423 if (is_err(bv)) return bv; 424 if (vtype(bv) != T_UNDEF) ignore_bom = js_truthy(js, bv); 425 } 426 427 td_state_t *st = td_state_new(enc, fatal, ignore_bom); 428 if (!st) return js_mkerr(js, "out of memory"); 429 430 ant_value_t obj = js_mkobj(js); 431 ant_value_t proto = js_instance_proto_from_new_target(js, g_textdecoder_proto); 432 433 if (is_object_type(proto)) js_set_proto_init(obj, proto); 434 js_set_slot(obj, SLOT_DATA, ANT_PTR(st)); 435 js_set_finalizer(obj, td_finalize); 436 437 return obj; 438} 439 440void init_textcodec_module(void) { 441 ant_t *js = rt->js; 442 ant_value_t g = js_glob(js); 443 444 g_textencoder_proto = js_mkobj(js); 445 js_set_getter_desc(js, g_textencoder_proto, "encoding", 8, js_mkfun(js_textencoder_get_encoding), JS_DESC_C); 446 js_set(js, g_textencoder_proto, "encode", js_mkfun(js_textencoder_encode)); 447 js_set(js, g_textencoder_proto, "encodeInto", js_mkfun(js_textencoder_encode_into)); 448 js_set_sym(js, g_textencoder_proto, get_toStringTag_sym(), js_mkstr(js, "TextEncoder", 11)); 449 450 ant_value_t te_ctor = js_make_ctor(js, js_textencoder_ctor, g_textencoder_proto, "TextEncoder", 11); 451 
js_set(js, g, "TextEncoder", te_ctor); 452 js_set_descriptor(js, g, "TextEncoder", 11, JS_DESC_W | JS_DESC_C); 453 454 g_textdecoder_proto = js_mkobj(js); 455 js_set_getter_desc(js, g_textdecoder_proto, "encoding", 8, js_mkfun(js_textdecoder_get_encoding), JS_DESC_C); 456 js_set_getter_desc(js, g_textdecoder_proto, "fatal", 5, js_mkfun(js_textdecoder_get_fatal), JS_DESC_C); 457 js_set_getter_desc(js, g_textdecoder_proto, "ignoreBOM", 9, js_mkfun(js_textdecoder_get_ignore_bom), JS_DESC_C); 458 js_set(js, g_textdecoder_proto, "decode", js_mkfun(js_textdecoder_decode)); 459 js_set_sym(js, g_textdecoder_proto, get_toStringTag_sym(), js_mkstr(js, "TextDecoder", 11)); 460 461 ant_value_t td_ctor = js_make_ctor(js, js_textdecoder_ctor, g_textdecoder_proto, "TextDecoder", 11); 462 js_set(js, g, "TextDecoder", td_ctor); 463 js_set_descriptor(js, g, "TextDecoder", 11, JS_DESC_W | JS_DESC_C); 464}