MIRROR: javascript for 🐜's, a tiny runtime with big ambitions
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at master 460 lines 16 kB view raw
1#include <stdlib.h> 2#include <string.h> 3#include <stdint.h> 4 5#include "ant.h" 6#include "ptr.h" 7#include "errors.h" 8#include "runtime.h" 9#include "internal.h" 10#include "descriptors.h" 11#include "utf8.h" 12 13#include "modules/textcodec.h" 14#include "modules/buffer.h" 15#include "modules/symbol.h" 16 17static ant_value_t g_textencoder_proto = 0; 18static ant_value_t g_textdecoder_proto = 0; 19 20enum { TEXT_DECODER_NATIVE_TAG = 0x54444543u }; // TDEC 21 22td_state_t *td_state_new(td_encoding_t enc, bool fatal, bool ignore_bom) { 23 td_state_t *st = calloc(1, sizeof(td_state_t)); 24 if (!st) return NULL; 25 st->encoding = enc; 26 st->fatal = fatal; 27 st->ignore_bom = ignore_bom; 28 return st; 29} 30 31static td_state_t *td_get_state(ant_value_t obj) { 32 return (td_state_t *)js_get_native(obj, TEXT_DECODER_NATIVE_TAG); 33} 34 35static void td_finalize(ant_t *js, ant_object_t *obj) { 36 ant_value_t value = js_obj_from_ptr(obj); 37 free(js_get_native(value, TEXT_DECODER_NATIVE_TAG)); 38 js_clear_native(value, TEXT_DECODER_NATIVE_TAG); 39} 40 41static int resolve_encoding(const char *s, size_t len) { 42 static const struct { const char *label; uint8_t len; td_encoding_t enc; } map[] = { 43 {"unicode-1-1-utf-8", 17, TD_ENC_UTF8}, {"unicode11utf8", 13, TD_ENC_UTF8}, 44 {"unicode20utf8", 13, TD_ENC_UTF8}, {"utf-8", 5, TD_ENC_UTF8}, 45 {"utf8", 4, TD_ENC_UTF8}, {"x-unicode20utf8",17, TD_ENC_UTF8}, 46 {"windows-1252", 12, TD_ENC_WINDOWS_1252}, {"ascii", 5, TD_ENC_WINDOWS_1252}, 47 {"unicodefffe", 11, TD_ENC_UTF16BE}, {"utf-16be", 8, TD_ENC_UTF16BE}, 48 {"csunicode", 9, TD_ENC_UTF16LE}, {"iso-10646-ucs-2",16, TD_ENC_UTF16LE}, 49 {"ucs-2", 5, TD_ENC_UTF16LE}, {"unicode", 7, TD_ENC_UTF16LE}, 50 {"unicodefeff", 11, TD_ENC_UTF16LE}, {"utf-16", 6, TD_ENC_UTF16LE}, 51 {"utf-16le", 8, TD_ENC_UTF16LE}, 52 {"iso-8859-2", 10, TD_ENC_ISO_8859_2}, 53 {NULL, 0, 0} 54 }; 55 for (int i = 0; map[i].label; i++) { 56 if (len == map[i].len && strncasecmp(s, map[i].label, len) == 0) 
return (int)map[i].enc; 57 } 58 return -1; 59} 60 61static const char *encoding_name(td_encoding_t enc) { 62switch (enc) { 63 case TD_ENC_UTF16LE: return "utf-16le"; 64 case TD_ENC_UTF16BE: return "utf-16be"; 65 case TD_ENC_WINDOWS_1252: return "windows-1252"; 66 case TD_ENC_ISO_8859_2: return "iso-8859-2"; 67 default: return "utf-8"; 68}} 69 70static const char *trim_label(const char *s, size_t len, size_t *out_len) { 71 while (len > 0 && (unsigned char)*s <= 0x20) { s++; len--; } 72 while (len > 0 && (unsigned char)s[len - 1] <= 0x20) { len--; } 73 *out_len = len; 74 return s; 75} 76 77static ant_value_t js_textencoder_get_encoding(ant_t *js, ant_value_t *args, int nargs) { 78 return js_mkstr(js, "utf-8", 5); 79} 80 81ant_value_t te_encode(ant_t *js, const char *str, size_t str_len) { 82 ArrayBufferData *ab = create_array_buffer_data(str_len); 83 if (!ab) return js_mkerr(js, "out of memory"); 84 85 if (str_len > 0) { 86 const uint8_t *s = (const uint8_t *)str; 87 uint8_t *d = ab->data; size_t i = 0; 88 89 while (i < str_len) { 90 if (s[i] == 0xED && i + 2 < str_len && s[i+1] >= 0xA0 && s[i+1] <= 0xBF) { 91 d[i] = 0xEF; d[i+1] = 0xBF; d[i+2] = 0xBD; 92 i += 3; 93 } else { d[i] = s[i]; i++; }} 94 } 95 96 return create_typed_array(js, TYPED_ARRAY_UINT8, ab, 0, str_len, "Uint8Array"); 97} 98 99static ant_value_t js_textencoder_encode(ant_t *js, ant_value_t *args, int nargs) { 100 size_t str_len = 0; 101 const char *str = ""; 102 103 if (nargs > 0 && vtype(args[0]) == T_STR) { 104 str = js_getstr(js, args[0], &str_len); 105 if (!str) { str = ""; str_len = 0; } 106 } else if (nargs > 0 && vtype(args[0]) != T_UNDEF) { 107 ant_value_t sv = js_tostring_val(js, args[0]); 108 if (is_err(sv)) return sv; 109 str = js_getstr(js, sv, &str_len); 110 if (!str) { str = ""; str_len = 0; } 111 } 112 113 return te_encode(js, str, str_len); 114} 115 116static ant_value_t js_textencoder_encode_into(ant_t *js, ant_value_t *args, int nargs) { 117 if (nargs < 2) return js_mkerr_typed(js, 
JS_ERR_TYPE, "encodeInto requires 2 arguments"); 118 119 size_t str_len = 0; 120 const char *str = ""; 121 if (vtype(args[0]) == T_STR) { 122 str = js_getstr(js, args[0], &str_len); 123 if (!str) { str = ""; str_len = 0; } 124 } else if (vtype(args[0]) != T_UNDEF) { 125 ant_value_t sv = js_tostring_val(js, args[0]); 126 if (is_err(sv)) return sv; 127 str = js_getstr(js, sv, &str_len); 128 if (!str) { str = ""; str_len = 0; } 129 } 130 131 TypedArrayData *ta = buffer_get_typedarray_data(args[1]); 132 if (!ta) return js_mkerr_typed(js, JS_ERR_TYPE, "Second argument must be a Uint8Array"); 133 134 uint8_t *dest = (ta->buffer && !ta->buffer->is_detached) 135 ? ta->buffer->data + ta->byte_offset : NULL; 136 size_t available = ta->byte_length; 137 138 const utf8proc_uint8_t *src = (const utf8proc_uint8_t *)str; 139 utf8proc_ssize_t src_len = (utf8proc_ssize_t)str_len; 140 utf8proc_ssize_t pos = 0; 141 142 size_t written = 0; 143 size_t read_units = 0; 144 145 while (pos < src_len) { 146 utf8proc_int32_t cp; 147 utf8proc_ssize_t n = utf8_next(src + pos, src_len - pos, &cp); 148 utf8proc_uint8_t tmp[4]; 149 utf8proc_ssize_t enc_len; 150 151 if (cp >= 0xD800 && cp <= 0xDFFF) { 152 tmp[0] = 0xEF; tmp[1] = 0xBF; tmp[2] = 0xBD; 153 enc_len = 3; 154 } else { 155 enc_len = (cp >= 0) ? utf8proc_encode_char(cp, tmp) : 0; 156 if (enc_len <= 0) { tmp[0] = 0xEF; tmp[1] = 0xBF; tmp[2] = 0xBD; enc_len = 3; } 157 } 158 159 if (written + (size_t)enc_len > available) break; 160 if (dest) memcpy(dest + written, tmp, (size_t)enc_len); 161 162 written += (size_t)enc_len; 163 pos += n; 164 read_units += (cp >= 0x10000 && cp <= 0x10FFFF) ? 
2 : 1; 165 } 166 167 ant_value_t result = js_mkobj(js); 168 js_set(js, result, "read", js_mknum((double)read_units)); 169 js_set(js, result, "written", js_mknum((double)written)); 170 171 return result; 172} 173 174static ant_value_t js_textencoder_ctor(ant_t *js, ant_value_t *args, int nargs) { 175 if (vtype(js->new_target) == T_UNDEF) 176 return js_mkerr_typed(js, JS_ERR_TYPE, "TextEncoder constructor requires 'new'"); 177 ant_value_t obj = js_mkobj(js); 178 ant_value_t proto = js_instance_proto_from_new_target(js, g_textencoder_proto); 179 if (is_object_type(proto)) js_set_proto_init(obj, proto); 180 return obj; 181} 182 183static ant_value_t js_textdecoder_get_encoding(ant_t *js, ant_value_t *args, int nargs) { 184 td_state_t *st = td_get_state(js->this_val); 185 const char *name = encoding_name(st ? st->encoding : TD_ENC_UTF8); 186 return js_mkstr(js, name, strlen(name)); 187} 188 189static ant_value_t js_textdecoder_get_fatal(ant_t *js, ant_value_t *args, int nargs) { 190 td_state_t *st = td_get_state(js->this_val); 191 return (st && st->fatal) ? js_true : js_false; 192} 193 194static ant_value_t js_textdecoder_get_ignore_bom(ant_t *js, ant_value_t *args, int nargs) { 195 td_state_t *st = td_get_state(js->this_val); 196 return (st && st->ignore_bom) ? js_true : js_false; 197} 198 199static inline uint16_t u16_read(const uint8_t *p, bool be) { 200 return be 201 ? (uint16_t)((uint16_t)p[0] << 8 | p[1]) 202 : (uint16_t)((uint16_t)p[1] << 8 | p[0]); 203} 204 205static inline size_t u8_emit(char *out, size_t o, utf8proc_int32_t cp) { 206 utf8proc_ssize_t n = utf8proc_encode_char(cp, (utf8proc_uint8_t *)(out + o)); 207 return n > 0 ? 
o + (size_t)n : o; 208} 209 210static inline size_t u8_fffd(char *out, size_t o) { 211 out[o] = (char)0xEF; out[o+1] = (char)0xBF; out[o+2] = (char)0xBD; 212 return o + 3; 213} 214 215#define U16_IS_HIGH(cu) ((cu) >= 0xD800 && (cu) <= 0xDBFF) 216#define U16_IS_LOW(cu) ((cu) >= 0xDC00 && (cu) <= 0xDFFF) 217#define U16_PAIR(hi,lo) (0x10000 + ((uint32_t)((hi) - 0xD800) << 10) + ((lo) - 0xDC00)) 218 219static uint32_t decode_windows_1252_byte(uint8_t byte) { 220 static const uint16_t specials[32] = { 221 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 222 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, 223 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 224 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, 225 }; 226 if (byte < 0x80) return byte; 227 if (byte < 0xA0) return specials[byte - 0x80]; 228 return byte; 229} 230 231static uint32_t decode_iso_8859_2_byte(uint8_t byte) { 232 static const uint16_t upper[96] = { 233 0x00A0, 0x0104, 0x02D8, 0x0141, 0x00A4, 0x013D, 0x015A, 0x00A7, 234 0x00A8, 0x0160, 0x015E, 0x0164, 0x0179, 0x00AD, 0x017D, 0x017B, 235 0x00B0, 0x0105, 0x02DB, 0x0142, 0x00B4, 0x013E, 0x015B, 0x02C7, 236 0x00B8, 0x0161, 0x015F, 0x0165, 0x017A, 0x02DD, 0x017E, 0x017C, 237 0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7, 238 0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E, 239 0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7, 240 0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162, 0x00DF, 241 0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7, 242 0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F, 243 0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7, 244 0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9, 245 }; 246 if (byte < 0xA0) return byte; 247 return upper[byte - 0xA0]; 248} 249 250static utf8proc_ssize_t decode_single_byte(td_state_t *st, const uint8_t *src, size_t len, char *out) { 251 
size_t o = 0; 252 for (size_t i = 0; i < len; i++) { 253 uint32_t cp = (st->encoding == TD_ENC_WINDOWS_1252) 254 ? decode_windows_1252_byte(src[i]) 255 : decode_iso_8859_2_byte(src[i]); 256 o = u8_emit(out, o, (utf8proc_int32_t)cp); 257 } 258 return (utf8proc_ssize_t)o; 259} 260 261static utf8proc_ssize_t utf16_decode(td_state_t *st, const uint8_t *src, size_t len, char *out, bool stream) { 262 bool be = (st->encoding == TD_ENC_UTF16BE); 263 size_t i = 0, o = 0; 264 size_t avail; 265 266 if (!st->bom_seen) { 267 if (len < 2) goto pend_tail; 268 if (u16_read(src, be) == 0xFEFF && !st->ignore_bom) i = 2; 269 st->bom_seen = true; 270 } 271 272 while (i < len) { 273 avail = len - i; 274 275 if (avail < 2) goto pend_tail; 276 uint16_t cu = u16_read(src + i, be); 277 i += 2; 278 279 if (!U16_IS_HIGH(cu) && !U16_IS_LOW(cu)) { 280 o = u8_emit(out, o, (utf8proc_int32_t)cu); 281 continue; 282 } 283 284 if (U16_IS_LOW(cu)) goto err; 285 286 avail = len - i; 287 if (avail < 2) goto pend_hi; 288 289 uint16_t lo = u16_read(src + i, be); 290 if (U16_IS_LOW(lo)) { i += 2; o = u8_emit(out, o, (utf8proc_int32_t)U16_PAIR(cu, lo)); continue; } 291 292 goto err; 293 294 pend_tail: 295 if (stream) { st->pending[0] = src[i]; st->pending_len = 1; } 296 else { if (st->fatal) return -1; o = u8_fffd(out, o); } 297 break; 298 299 pend_hi: 300 if (stream) { st->pending_len = (int)(len - (i - 2)); memcpy(st->pending, src + i - 2, (size_t)st->pending_len); } 301 else { if (st->fatal) return -1; o = u8_fffd(out, o); if (avail == 1) o = u8_fffd(out, o); } 302 break; 303 304 err: 305 if (st->fatal) return -1; 306 o = u8_fffd(out, o); 307 continue; 308 } 309 310 return (utf8proc_ssize_t)o; 311} 312 313#undef U16_IS_HIGH 314#undef U16_IS_LOW 315#undef U16_PAIR 316 317ant_value_t td_decode(ant_t *js, td_state_t *st, const uint8_t *input, size_t input_len, bool stream_mode) { 318 size_t total = (size_t)st->pending_len + input_len; 319 if (total == 0) { 320 if (!stream_mode) st->bom_seen = false; 321 
return js_mkstr(js, "", 0); 322 } 323 324 uint8_t *work = NULL; 325 const uint8_t *src; 326 if (st->pending_len > 0) { 327 work = malloc(total); 328 if (!work) return js_mkerr(js, "out of memory"); 329 memcpy(work, st->pending, (size_t)st->pending_len); 330 if (input && input_len > 0) memcpy(work + st->pending_len, input, input_len); 331 src = work; 332 } else src = input; 333 st->pending_len = 0; 334 335 char *out = malloc(total * 3 + 1); 336 if (!out) { free(work); return js_mkerr(js, "out of memory"); } 337 338 utf8proc_ssize_t n; 339 if (st->encoding == TD_ENC_UTF16LE || st->encoding == TD_ENC_UTF16BE) { 340 n = utf16_decode(st, src, total, out, stream_mode); 341 } else if (st->encoding == TD_ENC_WINDOWS_1252 || st->encoding == TD_ENC_ISO_8859_2) { 342 n = decode_single_byte(st, src, total, out); 343 st->pending_len = 0; 344 st->bom_seen = false; 345 } else { 346 utf8_dec_t dec = { .ignore_bom = st->ignore_bom, .bom_seen = st->bom_seen }; 347 n = utf8_whatwg_decode(&dec, src, total, out, st->fatal, stream_mode); 348 st->pending_len = dec.pend_pos; 349 memcpy(st->pending, dec.pend_buf, (size_t)dec.pend_pos); 350 st->bom_seen = stream_mode ? 
dec.bom_seen : false; 351 } 352 353 if (n < 0) { 354 free(work); free(out); 355 return js_mkerr_typed(js, JS_ERR_TYPE, "The encoded data was not valid."); 356 } 357 358 if (st->encoding != TD_ENC_UTF8) { 359 if (!stream_mode) st->bom_seen = false; 360 } 361 362 ant_value_t result = js_mkstr(js, out, (size_t)n); 363 free(work); 364 free(out); 365 366 return result; 367} 368 369static ant_value_t js_textdecoder_decode(ant_t *js, ant_value_t *args, int nargs) { 370 td_state_t *st = td_get_state(js->this_val); 371 if (!st) return js_mkerr_typed(js, JS_ERR_TYPE, "Invalid TextDecoder"); 372 373 bool stream_mode = false; 374 if (nargs > 1 && is_object_type(args[1])) { 375 ant_value_t sv = js_get(js, args[1], "stream"); 376 stream_mode = js_truthy(js, sv); 377 } 378 379 const uint8_t *input = NULL; 380 size_t input_len = 0; 381 if (nargs > 0 && is_object_type(args[0])) 382 buffer_source_get_bytes(js, args[0], &input, &input_len); 383 384 return td_decode(js, st, input, input_len, stream_mode); 385} 386 387static ant_value_t js_textdecoder_ctor(ant_t *js, ant_value_t *args, int nargs) { 388 if (vtype(js->new_target) == T_UNDEF) 389 return js_mkerr_typed(js, JS_ERR_TYPE, "TextDecoder constructor requires 'new'"); 390 391 td_encoding_t enc = TD_ENC_UTF8; 392 if (nargs > 0 && !is_undefined(args[0])) { 393 ant_value_t label = (vtype(args[0]) == T_STR) ? 
args[0] : coerce_to_str(js, args[0]); 394 if (is_err(label)) return label; 395 396 size_t llen; 397 const char *raw = js_getstr(js, label, &llen); 398 if (raw) { 399 size_t tlen; 400 const char *trimmed = trim_label(raw, llen, &tlen); 401 int resolved = resolve_encoding(trimmed, tlen); 402 403 if (resolved < 0) return js_mkerr_typed( 404 js, JS_ERR_RANGE, "Failed to construct 'TextDecoder': The encoding label provided ('%.*s') is invalid.", 405 (int)tlen, trimmed 406 ); 407 408 enc = (td_encoding_t)resolved; 409 }} 410 411 bool fatal = false; 412 bool ignore_bom = false; 413 414 if (nargs > 1 && is_object_type(args[1])) { 415 ant_value_t fv = js_getprop_fallback(js, args[1], "fatal"); 416 if (is_err(fv)) return fv; 417 if (vtype(fv) != T_UNDEF) fatal = js_truthy(js, fv); 418 ant_value_t bv = js_getprop_fallback(js, args[1], "ignoreBOM"); 419 if (is_err(bv)) return bv; 420 if (vtype(bv) != T_UNDEF) ignore_bom = js_truthy(js, bv); 421 } 422 423 td_state_t *st = td_state_new(enc, fatal, ignore_bom); 424 if (!st) return js_mkerr(js, "out of memory"); 425 426 ant_value_t obj = js_mkobj(js); 427 ant_value_t proto = js_instance_proto_from_new_target(js, g_textdecoder_proto); 428 429 if (is_object_type(proto)) js_set_proto_init(obj, proto); 430 js_set_native(obj, st, TEXT_DECODER_NATIVE_TAG); 431 js_set_finalizer(obj, td_finalize); 432 433 return obj; 434} 435 436void init_textcodec_module(void) { 437 ant_t *js = rt->js; 438 ant_value_t g = js_glob(js); 439 440 g_textencoder_proto = js_mkobj(js); 441 js_set_getter_desc(js, g_textencoder_proto, "encoding", 8, js_mkfun(js_textencoder_get_encoding), JS_DESC_C); 442 js_set(js, g_textencoder_proto, "encode", js_mkfun(js_textencoder_encode)); 443 js_set(js, g_textencoder_proto, "encodeInto", js_mkfun(js_textencoder_encode_into)); 444 js_set_sym(js, g_textencoder_proto, get_toStringTag_sym(), js_mkstr(js, "TextEncoder", 11)); 445 446 ant_value_t te_ctor = js_make_ctor(js, js_textencoder_ctor, g_textencoder_proto, "TextEncoder", 
11); 447 js_set(js, g, "TextEncoder", te_ctor); 448 js_set_descriptor(js, g, "TextEncoder", 11, JS_DESC_W | JS_DESC_C); 449 450 g_textdecoder_proto = js_mkobj(js); 451 js_set_getter_desc(js, g_textdecoder_proto, "encoding", 8, js_mkfun(js_textdecoder_get_encoding), JS_DESC_C); 452 js_set_getter_desc(js, g_textdecoder_proto, "fatal", 5, js_mkfun(js_textdecoder_get_fatal), JS_DESC_C); 453 js_set_getter_desc(js, g_textdecoder_proto, "ignoreBOM", 9, js_mkfun(js_textdecoder_get_ignore_bom), JS_DESC_C); 454 js_set(js, g_textdecoder_proto, "decode", js_mkfun(js_textdecoder_decode)); 455 js_set_sym(js, g_textdecoder_proto, get_toStringTag_sym(), js_mkstr(js, "TextDecoder", 11)); 456 457 ant_value_t td_ctor = js_make_ctor(js, js_textdecoder_ctor, g_textdecoder_proto, "TextDecoder", 11); 458 js_set(js, g, "TextDecoder", td_ctor); 459 js_set_descriptor(js, g, "TextDecoder", 11, JS_DESC_W | JS_DESC_C); 460}