MIRROR: javascript for 🐜's, a tiny runtime with big ambitions
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

improve text encoding

+70 -20
+4
examples/spec/strings.js
··· 64 64 test('codePointAt utf8 2-byte', 'é'.codePointAt(0), 233); 65 65 test('codePointAt utf8 3-byte', '中'.codePointAt(0), 20013); 66 66 test('codePointAt utf8 4-byte', '😀'.codePointAt(0), 128512); 67 + test('charAt astral leading surrogate', '💙'.charAt(0).charCodeAt(0), 0xD83D); 68 + test('charAt astral trailing surrogate', '💙'.charAt(1).charCodeAt(0), 0xDC99); 67 69 68 70 test('replace', 'hello world'.replace('world', 'there'), 'hello there'); 69 71 test('replaceAll', 'a-b-c'.replaceAll('-', '_'), 'a_b_c'); ··· 75 77 76 78 test('length', 'hello'.length, 5); 77 79 test('bracket access', 'hello'[0], 'h'); 80 + test('bracket access astral leading surrogate', '💙'[0].charCodeAt(0), 0xD83D); 81 + test('bracket access astral trailing surrogate', '💙'[1].charCodeAt(0), 0xDC99); 78 82 79 83 test('concat', 'hello'.concat(' ', 'world'), 'hello world'); 80 84
+61 -18
src/ant.c
··· 3787 3787 return 0; 3788 3788 } 3789 3789 3790 + static ant_value_t js_string_from_utf16_code_unit(ant_t *js, uint32_t code_unit) { 3791 + char buf[4]; 3792 + size_t out_len = 0; 3793 + 3794 + if (code_unit >= 0xD800 && code_unit <= 0xDFFF) { 3795 + buf[0] = (char)(0xE0 | (code_unit >> 12)); 3796 + buf[1] = (char)(0x80 | ((code_unit >> 6) & 0x3F)); 3797 + buf[2] = (char)(0x80 | (code_unit & 0x3F)); 3798 + out_len = 3; 3799 + } else out_len = (size_t)utf8_encode(code_unit, buf); 3800 + 3801 + return js_mkstr(js, buf, out_len); 3802 + } 3803 + 3804 + static bool js_try_get_string_index( 3805 + ant_t *js, ant_value_t str, 3806 + const char *key, size_t key_len, ant_value_t *out 3807 + ) { 3808 + if (!is_array_index(key, (ant_offset_t)key_len)) return false; 3809 + 3810 + unsigned long idx = 0; 3811 + ant_offset_t byte_len = 0; 3812 + ant_offset_t str_off = vstr(js, str, &byte_len); 3813 + const char *str_data = (const char *)(uintptr_t)(str_off); 3814 + ant_offset_t str_len = (ant_offset_t)utf16_strlen(str_data, byte_len); 3815 + if (!parse_array_index(key, key_len, str_len, &idx)) return false; 3816 + 3817 + uint32_t code_unit = utf16_code_unit_at(str_data, byte_len, (ant_offset_t)idx); 3818 + if (code_unit == 0xFFFFFFFF) return false; 3819 + 3820 + *out = js_string_from_utf16_code_unit(js, code_unit); 3821 + return true; 3822 + } 3823 + 3790 3824 static ant_value_t getprop_any(ant_t *js, ant_value_t obj, const char *key, size_t key_len) { 3791 3825 uint8_t t = vtype(obj); 3792 3826 ··· 3794 3828 ant_offset_t byte_len; 3795 3829 ant_offset_t str_off = vstr(js, obj, &byte_len); 3796 3830 return tov(D(utf16_strlen((const char *)(uintptr_t)(str_off), byte_len))); 3831 + } 3832 + 3833 + if (t == T_STR) { 3834 + ant_value_t indexed = js_mkundef(); 3835 + if (js_try_get_string_index(js, obj, key, key_len, &indexed)) return indexed; 3797 3836 } 3798 3837 3799 3838 if (t == T_STR || t == T_NUM || t == T_BOOL || t == T_BIGINT) { ··· 4353 4392 static ant_value_t 
builtin_String(ant_t *js, ant_value_t *args, int nargs) { 4354 4393 ant_value_t sval; 4355 4394 4356 - if (nargs == 0) { 4357 - sval = js_mkstr(js, "", 0); 4358 - } else if (vtype(args[0]) == T_STR) { 4359 - sval = args[0]; 4360 - } else if (vtype(args[0]) == T_SYMBOL) { 4395 + if (nargs == 0) sval = js_mkstr(js, "", 0); 4396 + else if (vtype(args[0]) == T_STR) sval = args[0]; 4397 + 4398 + else if (vtype(args[0]) == T_SYMBOL) { 4361 4399 sval = js_symbol_to_string(js, args[0]); 4362 4400 if (is_err(sval)) return sval; 4363 4401 } else { ··· 4368 4406 ant_value_t string_proto = js_get_ctor_proto(js, "String", 6); 4369 4407 if (is_wrapper_ctor_target(js, js->this_val, string_proto)) { 4370 4408 set_slot(js->this_val, SLOT_PRIMITIVE, sval); 4371 - ant_offset_t slen; 4372 - vstr(js, sval, &slen); 4373 - js_setprop(js, js->this_val, js->length_str, tov((double)slen)); 4409 + 4410 + ant_offset_t byte_len; 4411 + ant_offset_t str_off = vstr(js, sval, &byte_len); 4412 + const char *str_data = (const char *)(uintptr_t)(str_off); 4413 + 4414 + js_setprop(js, js->this_val, js->length_str, tov((double)utf16_strlen(str_data, byte_len))); 4374 4415 js_set_descriptor(js, js_as_obj(js->this_val), "length", 6, 0); 4375 4416 } 4417 + 4376 4418 return sval; 4377 4419 } 4378 4420 ··· 8801 8843 ant_offset_t idx = (ant_offset_t) idx_d; 8802 8844 ant_offset_t byte_len; 8803 8845 ant_offset_t str_off = vstr(js, str, &byte_len); 8846 + 8804 8847 const char *str_data = (const char *)(uintptr_t)(str_off); 8848 + uint32_t code_unit = utf16_code_unit_at(str_data, byte_len, idx); 8849 + if (code_unit == 0xFFFFFFFF) return js_mkstr(js, "", 0); 8805 8850 8806 - size_t char_bytes; 8807 - int byte_offset = utf16_index_to_byte_offset(str_data, byte_len, idx, &char_bytes); 8808 - if (byte_offset < 0) return js_mkstr(js, "", 0); 8809 - 8810 - return js_mkstr(js, str_data + byte_offset, char_bytes); 8851 + return js_string_from_utf16_code_unit(js, code_unit); 8811 8852 } 8812 8853 8813 8854 static 
ant_value_t builtin_string_at(ant_t *js, ant_value_t *args, int nargs) { ··· 8824 8865 long idx = (long) idx_d; 8825 8866 if (idx < 0) idx += (long) utf16_len; 8826 8867 if (idx < 0 || idx >= (long) utf16_len) return js_mkundef(); 8827 - 8828 - size_t char_bytes; 8829 - int byte_offset = utf16_index_to_byte_offset(str_data, byte_len, idx, &char_bytes); 8830 - if (byte_offset < 0) return js_mkundef(); 8868 + 8869 + uint32_t code_unit = utf16_code_unit_at(str_data, byte_len, idx); 8870 + if (code_unit == 0xFFFFFFFF) return js_mkundef(); 8831 8871 8832 - return js_mkstr(js, str_data + byte_offset, char_bytes); 8872 + return js_string_from_utf16_code_unit(js, code_unit); 8833 8873 } 8834 8874 8835 8875 static ant_value_t builtin_string_localeCompare(ant_t *js, ant_value_t *args, int nargs) { ··· 12180 12220 *out = tov((double)utf16_strlen(str_data, byte_len)); 12181 12221 return true; 12182 12222 } 12223 + 12224 + if (t == T_STR && js_try_get_string_index(js, obj, key, key_len, out)) return true; 12183 12225 ant_value_t boxed = mkobj(js, 0); 12226 + 12184 12227 js_set_slot(js_as_obj(boxed), SLOT_PRIMITIVE, obj); 12185 12228 obj = boxed; t = T_OBJ; 12186 12229 }
+5 -2
src/modules/textcodec.c
··· 304 304 size_t i = 0, o = 0; 305 305 size_t avail; 306 306 307 - if (!st->bom_seen && len >= 2 && u16_read(src, be) == 0xFEFF && !st->ignore_bom) i = 2; 308 - st->bom_seen = true; 307 + if (!st->bom_seen) { 308 + if (len < 2) goto pend_tail; 309 + if (u16_read(src, be) == 0xFEFF && !st->ignore_bom) i = 2; 310 + st->bom_seen = true; 311 + } 309 312 310 313 while (i < len) { 311 314 avail = len - i;