MIRROR: javascript for 🐜's, a tiny runtime with big ambitions
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

add utf16 index and range functions

+232 -87
+19
include/utf8.h
··· 11 11 size_t utf8_strlen(const char *str, size_t byte_len); 12 12 size_t utf16_strlen(const char *str, size_t byte_len); 13 13 14 + int utf16_index_to_byte_offset( 15 + const char *str, 16 + size_t byte_len, 17 + size_t utf16_idx, 18 + size_t *out_char_bytes 19 + ); 20 + 21 + int utf16_range_to_byte_range( 22 + const char *str, 23 + size_t byte_len, 24 + size_t utf16_start, 25 + size_t utf16_end, 26 + size_t *byte_start, 27 + size_t *byte_end 28 + ); 29 + 30 + uint32_t utf16_code_unit_at(const char *str, size_t byte_len, size_t utf16_idx); 31 + uint32_t utf16_codepoint_at(const char *str, size_t byte_len, size_t utf16_idx); 32 + 14 33 #endif
+61 -67
src/ant.c
··· 5612 5612 } 5613 5613 if (!isnan(idx_d) && idx_d >= 0 && idx_d == (double)(long)idx_d) { 5614 5614 jsoff_t idx = (jsoff_t) idx_d; 5615 - jsoff_t str_len = offtolen(loadoff(js, (jsoff_t) vdata(obj))); 5616 - if (idx < str_len) { 5617 - jsoff_t str_off = (jsoff_t) vdata(obj) + sizeof(jsoff_t); 5618 - char ch[2] = {js->mem[str_off + idx], 0}; 5619 - return js_mkstr(js, ch, 1); 5615 + jsoff_t byte_len = offtolen(loadoff(js, (jsoff_t) vdata(obj))); 5616 + jsoff_t str_off = (jsoff_t) vdata(obj) + sizeof(jsoff_t); 5617 + const char *str_data = (const char *)&js->mem[str_off]; 5618 + size_t char_bytes; 5619 + int byte_offset = utf16_index_to_byte_offset(str_data, byte_len, idx, &char_bytes); 5620 + if (byte_offset >= 0) { 5621 + return js_mkstr(js, str_data + byte_offset, char_bytes); 5620 5622 } 5621 5623 } 5622 5624 jsoff_t off = lkp_proto(js, obj, keystr, keylen); ··· 17137 17139 static jsval_t builtin_string_substring(struct js *js, jsval_t *args, int nargs) { 17138 17140 jsval_t str = to_string_val(js, js->this_val); 17139 17141 if (vtype(str) != T_STR) return js_mkerr(js, "substring called on non-string"); 17140 - jsoff_t str_len, str_off = vstr(js, str, &str_len); 17142 + jsoff_t byte_len, str_off = vstr(js, str, &byte_len); 17141 17143 const char *str_ptr = (char *) &js->mem[str_off]; 17142 - jsoff_t start = 0, end = str_len; 17143 - double dstr_len2 = D(str_len); 17144 + size_t utf16_len = utf16_strlen(str_ptr, byte_len); 17145 + jsoff_t start = 0, end = (jsoff_t)utf16_len; 17146 + double dstr_len2 = D(utf16_len); 17144 17147 17145 17148 if (nargs >= 1 && vtype(args[0]) == T_NUM) { 17146 17149 double d = tod(args[0]); ··· 17158 17161 end = tmp; 17159 17162 } 17160 17163 17161 - jsoff_t sub_len = end - start; 17162 - return js_mkstr(js, str_ptr + start, sub_len); 17164 + size_t byte_start, byte_end; 17165 + utf16_range_to_byte_range(str_ptr, byte_len, start, end, &byte_start, &byte_end); 17166 + return js_mkstr(js, str_ptr + byte_start, byte_end - byte_start); 
17163 17167 } 17164 17168 17165 17169 static jsval_t builtin_string_substr(struct js *js, jsval_t *args, int nargs) { 17166 17170 jsval_t str = to_string_val(js, js->this_val); 17167 17171 if (vtype(str) != T_STR) return js_mkerr(js, "substr called on non-string"); 17168 - jsoff_t str_len, str_off = vstr(js, str, &str_len); 17172 + jsoff_t byte_len, str_off = vstr(js, str, &byte_len); 17169 17173 const char *str_ptr = (char *) &js->mem[str_off]; 17174 + size_t utf16_len = utf16_strlen(str_ptr, byte_len); 17170 17175 17171 - if (nargs < 1) return js_mkstr(js, str_ptr, str_len); 17176 + if (nargs < 1) return js_mkstr(js, str_ptr, byte_len); 17172 17177 17173 17178 double d_start = tod(args[0]); 17174 17179 jsoff_t start; 17175 17180 if (d_start < 0) { 17176 - start = (jsoff_t)((double)str_len + d_start); 17181 + start = (jsoff_t)((double)utf16_len + d_start); 17177 17182 if ((int)start < 0) start = 0; 17178 17183 } else { 17179 17184 start = (jsoff_t)d_start; 17180 17185 } 17181 - if (start > str_len) start = str_len; 17186 + if (start > (jsoff_t)utf16_len) start = (jsoff_t)utf16_len; 17182 17187 17183 - jsoff_t len = str_len - start; 17188 + jsoff_t len = (jsoff_t)utf16_len - start; 17184 17189 if (nargs >= 2 && vtype(args[1]) == T_NUM) { 17185 17190 double d = tod(args[1]); 17186 17191 if (d < 0) d = 0; 17187 17192 len = (jsoff_t)d; 17188 17193 } 17189 - if (start + len > str_len) len = str_len - start; 17194 + if (start + len > (jsoff_t)utf16_len) len = (jsoff_t)utf16_len - start; 17190 17195 17191 - return js_mkstr(js, str_ptr + start, len); 17196 + size_t byte_start, byte_end; 17197 + utf16_range_to_byte_range(str_ptr, byte_len, start, start + len, &byte_start, &byte_end); 17198 + return js_mkstr(js, str_ptr + byte_start, byte_end - byte_start); 17192 17199 } 17193 17200 17194 17201 static jsval_t builtin_string_split(struct js *js, jsval_t *args, int nargs) { ··· 17409 17416 jsval_t this_unwrapped = unwrap_primitive(js, js->this_val); 17410 17417 jsval_t str = 
js_tostring_val(js, this_unwrapped); 17411 17418 if (is_err(str)) return str; 17412 - jsoff_t str_len, str_off = vstr(js, str, &str_len); 17419 + jsoff_t byte_len, str_off = vstr(js, str, &byte_len); 17413 17420 const char *str_ptr = (char *) &js->mem[str_off]; 17414 - jsoff_t start = 0, end = str_len; 17415 - double dstr_len = D(str_len); 17421 + size_t utf16_len = utf16_strlen(str_ptr, byte_len); 17422 + jsoff_t start = 0, end = (jsoff_t)utf16_len; 17423 + double dstr_len = D(utf16_len); 17416 17424 17417 17425 if (nargs >= 1 && vtype(args[0]) == T_NUM) { 17418 17426 double d = tod(args[0]); ··· 17432 17440 } 17433 17441 17434 17442 if (start > end) start = end; 17435 - jsoff_t sub_len = end - start; 17436 - return js_mkstr(js, str_ptr + start, sub_len); 17443 + size_t byte_start, byte_end; 17444 + utf16_range_to_byte_range(str_ptr, byte_len, start, end, &byte_start, &byte_end); 17445 + return js_mkstr(js, str_ptr + byte_start, byte_end - byte_start); 17437 17446 } 17438 17447 17439 17448 static jsval_t builtin_string_includes(struct js *js, jsval_t *args, int nargs) { ··· 18130 18139 long idx_l = (long) idx_d; 18131 18140 if (idx_l < 0) return tov(JS_NAN); 18132 18141 18133 - jsoff_t idx = (jsoff_t) idx_l; 18134 - jsoff_t str_len = offtolen(loadoff(js, (jsoff_t) vdata(str))); 18135 - 18136 - if (idx >= str_len) return tov(JS_NAN); 18137 - 18142 + jsoff_t byte_len = offtolen(loadoff(js, (jsoff_t) vdata(str))); 18138 18143 jsoff_t str_off = (jsoff_t) vdata(str) + sizeof(jsoff_t); 18139 - unsigned char ch = (unsigned char) js->mem[str_off + idx]; 18144 + const char *str_data = (const char *)&js->mem[str_off]; 18140 18145 18141 - return tov((double) ch); 18146 + uint32_t code_unit = utf16_code_unit_at(str_data, byte_len, idx_l); 18147 + if (code_unit == 0xFFFFFFFF) return tov(JS_NAN); 18148 + 18149 + return tov((double) code_unit); 18142 18150 } 18143 18151 18144 18152 static jsval_t builtin_string_codePointAt(struct js *js, jsval_t *args, int nargs) { ··· 18152 
18160 long idx_l = (long) idx_d; 18153 18161 if (idx_l < 0) return js_mkundef(); 18154 18162 18155 - jsoff_t idx = (jsoff_t) idx_l; 18156 - jsoff_t str_len = offtolen(loadoff(js, (jsoff_t) vdata(str))); 18157 - 18158 - if (idx >= str_len) return js_mkundef(); 18159 - 18163 + jsoff_t byte_len = offtolen(loadoff(js, (jsoff_t) vdata(str))); 18160 18164 jsoff_t str_off = (jsoff_t) vdata(str) + sizeof(jsoff_t); 18161 - const unsigned char *s = &js->mem[str_off + idx]; 18162 - jsoff_t remaining = str_len - idx; 18165 + const char *str_data = (const char *)&js->mem[str_off]; 18163 18166 18164 - unsigned char c0 = s[0]; 18165 - if (c0 < 0x80) return tov((double) c0); 18167 + uint32_t cp = utf16_codepoint_at(str_data, byte_len, idx_l); 18168 + if (cp == 0xFFFFFFFF) return js_mkundef(); 18166 18169 18167 - if ((c0 & 0xE0) == 0xC0 && remaining >= 2 && (s[1] & 0xC0) == 0x80) { 18168 - uint32_t cp = ((c0 & 0x1F) << 6) | (s[1] & 0x3F); 18169 - return tov((double) cp); 18170 - } 18171 - 18172 - if ((c0 & 0xF0) == 0xE0 && remaining >= 3 && (s[1] & 0xC0) == 0x80 && (s[2] & 0xC0) == 0x80) { 18173 - uint32_t cp = ((c0 & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F); 18174 - return tov((double) cp); 18175 - } 18176 - 18177 - if ((c0 & 0xF8) == 0xF0 && remaining >= 4 && (s[1] & 0xC0) == 0x80 && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) { 18178 - uint32_t cp = ((c0 & 0x07) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F); 18179 - return tov((double) cp); 18180 - } 18181 - 18182 - return tov((double) c0); 18170 + return tov((double) cp); 18183 18171 } 18184 18172 18185 18173 static jsval_t builtin_string_toLowerCase(struct js *js, jsval_t *args, int nargs) { ··· 18382 18370 if (idx_d < 0 || isinf(idx_d)) return js_mkstr(js, "", 0); 18383 18371 18384 18372 jsoff_t idx = (jsoff_t) idx_d; 18385 - jsoff_t str_len = offtolen(loadoff(js, (jsoff_t) vdata(str))); 18386 - 18387 - if (idx >= str_len) return js_mkstr(js, "", 0); 18388 - 18373 + jsoff_t byte_len = 
offtolen(loadoff(js, (jsoff_t) vdata(str))); 18389 18374 jsoff_t str_off = (jsoff_t) vdata(str) + sizeof(jsoff_t); 18390 - char ch[2] = { js->mem[str_off + idx], '\0' }; 18375 + const char *str_data = (const char *)&js->mem[str_off]; 18391 18376 18392 - return js_mkstr(js, ch, 1); 18377 + size_t char_bytes; 18378 + int byte_offset = utf16_index_to_byte_offset(str_data, byte_len, idx, &char_bytes); 18379 + if (byte_offset < 0) return js_mkstr(js, "", 0); 18380 + 18381 + return js_mkstr(js, str_data + byte_offset, char_bytes); 18393 18382 } 18394 18383 18395 18384 static jsval_t builtin_string_at(struct js *js, jsval_t *args, int nargs) { ··· 18399 18388 double idx_d = nargs < 1 ? 0.0 : js_to_number(js, args[0]); 18400 18389 if (isnan(idx_d) || isinf(idx_d)) return js_mkundef(); 18401 18390 18402 - jsoff_t str_len = offtolen(loadoff(js, (jsoff_t) vdata(str))); 18391 + jsoff_t byte_len = offtolen(loadoff(js, (jsoff_t) vdata(str))); 18392 + jsoff_t str_off = (jsoff_t) vdata(str) + sizeof(jsoff_t); 18393 + const char *str_data = (const char *)&js->mem[str_off]; 18394 + size_t utf16_len = utf16_strlen(str_data, byte_len); 18395 + 18403 18396 long idx = (long) idx_d; 18404 - 18405 - if (idx < 0) idx += (long) str_len; 18406 - if (idx < 0 || idx >= (long) str_len) return js_mkundef(); 18397 + if (idx < 0) idx += (long) utf16_len; 18398 + if (idx < 0 || idx >= (long) utf16_len) return js_mkundef(); 18407 18399 18408 - jsoff_t str_off = (jsoff_t) vdata(str) + sizeof(jsoff_t); 18409 - char ch[2] = { js->mem[str_off + idx], '\0' }; 18410 - return js_mkstr(js, ch, 1); 18400 + size_t char_bytes; 18401 + int byte_offset = utf16_index_to_byte_offset(str_data, byte_len, idx, &char_bytes); 18402 + if (byte_offset < 0) return js_mkundef(); 18403 + 18404 + return js_mkstr(js, str_data + byte_offset, char_bytes); 18411 18405 } 18412 18406 18413 18407 static jsval_t builtin_string_localeCompare(struct js *js, jsval_t *args, int nargs) {
+152 -20
src/utf8.c
··· 34 34 if (len == 0) { *seq_len = 0; return 0; } 35 35 36 36 unsigned char first = buf[0]; 37 - int slen = utf8_sequence_length(first); 38 37 39 - if (slen < 0 || (size_t)slen > len) { 40 - *seq_len = 1; 41 - return 0xFFFD; 38 + if (first < 0x80) { *seq_len = 1; return first; } 39 + if ((first & 0xE0) == 0xC0 && len >= 2) { 40 + *seq_len = 2; 41 + return ((first & 0x1F) << 6) | (buf[1] & 0x3F); 42 42 } 43 - 44 - *seq_len = slen; 43 + if ((first & 0xF0) == 0xE0 && len >= 3) { 44 + *seq_len = 3; 45 + return ((first & 0x0F) << 12) | ((buf[1] & 0x3F) << 6) | (buf[2] & 0x3F); 46 + } 47 + if ((first & 0xF8) == 0xF0 && len >= 4) { 48 + *seq_len = 4; 49 + return ((first & 0x07) << 18) | ((buf[1] & 0x3F) << 12) | ((buf[2] & 0x3F) << 6) | (buf[3] & 0x3F); 50 + } 45 51 46 - if (slen == 1) return first; 47 - if (slen == 2) return ((first & 0x1F) << 6) | (buf[1] & 0x3F); 48 - if (slen == 3) return ((first & 0x0F) << 12) | ((buf[1] & 0x3F) << 6) | (buf[2] & 0x3F); 49 - return ((first & 0x07) << 18) | ((buf[1] & 0x3F) << 12) | ((buf[2] & 0x3F) << 6) | (buf[3] & 0x3F); 52 + *seq_len = 1; 53 + return 0xFFFD; 50 54 } 51 55 52 56 size_t utf8_strlen(const char *str, size_t byte_len) { 53 57 size_t count = 0; 54 58 const unsigned char *p = (const unsigned char *)str; 55 59 const unsigned char *end = p + byte_len; 56 - 57 60 while (p < end) { 58 - int seq_len = utf8_sequence_length(*p); 59 - if (seq_len < 0) { p++; count++; continue; } 60 - p += seq_len; 61 - count++; 61 + if ((*p & 0xC0) != 0x80) count++; 62 + p++; 62 63 } 63 64 return count; 64 65 } ··· 67 68 size_t count = 0; 68 69 const unsigned char *p = (const unsigned char *)str; 69 70 const unsigned char *end = p + byte_len; 70 - 71 71 while (p < end) { 72 - int seq_len; 73 - uint32_t cp = utf8_decode(p, end - p, &seq_len); 74 - p += seq_len; 75 - count += (cp >= 0x10000) ? 
2 : 1; 72 + unsigned char c = *p; 73 + if ((c & 0xC0) != 0x80) { 74 + count++; 75 + if ((c & 0xF8) == 0xF0) count++; 76 + } 77 + p++; 76 78 } 77 79 return count; 78 80 } 81 + 82 + int utf16_index_to_byte_offset( 83 + const char *str, 84 + size_t byte_len, 85 + size_t utf16_idx, 86 + size_t *out_char_bytes 87 + ) { 88 + const unsigned char *p = (const unsigned char *)str; 89 + const unsigned char *end = p + byte_len; 90 + size_t utf16_pos = 0; 91 + 92 + while (p < end && utf16_pos < utf16_idx) { 93 + unsigned char c = *p; 94 + if (c < 0x80) { p++; utf16_pos++; } 95 + else if ((c & 0xE0) == 0xC0) { p += 2; utf16_pos++; } 96 + else if ((c & 0xF0) == 0xE0) { p += 3; utf16_pos++; } 97 + else if ((c & 0xF8) == 0xF0) { p += 4; utf16_pos += 2; } 98 + else { p++; utf16_pos++; } 99 + if (p > end) p = end; 100 + } 101 + 102 + if (p >= end) return -1; 103 + 104 + unsigned char c = *p; 105 + size_t slen = (c < 0x80) ? 1 : ((c & 0xE0) == 0xC0) ? 2 : ((c & 0xF0) == 0xE0) ? 3 : ((c & 0xF8) == 0xF0) ? 4 : 1; 106 + if (out_char_bytes) *out_char_bytes = slen; 107 + return (int)(p - (const unsigned char *)str); 108 + } 109 + 110 + int utf16_range_to_byte_range( 111 + const char *str, 112 + size_t byte_len, 113 + size_t utf16_start, 114 + size_t utf16_end, 115 + size_t *byte_start, 116 + size_t *byte_end 117 + ) { 118 + const unsigned char *p = (const unsigned char *)str; 119 + const unsigned char *end = p + byte_len; 120 + size_t utf16_pos = 0; 121 + size_t b_start = 0, b_end = byte_len; 122 + int found_start = 0, found_end = 0; 123 + 124 + while (p < end) { 125 + if (utf16_pos == utf16_start) { b_start = p - (const unsigned char *)str; found_start = 1; } 126 + if (utf16_pos == utf16_end) { b_end = p - (const unsigned char *)str; found_end = 1; break; } 127 + 128 + unsigned char c = *p; 129 + if (c < 0x80) { p++; utf16_pos++; } 130 + else if ((c & 0xE0) == 0xC0) { p += 2; utf16_pos++; } 131 + else if ((c & 0xF0) == 0xE0) { p += 3; utf16_pos++; } 132 + else if ((c & 0xF8) == 0xF0) { p 
+= 4; utf16_pos += 2; } 133 + else { p++; utf16_pos++; } 134 + if (p > end) p = end; 135 + } 136 + 137 + if (!found_start && utf16_start >= utf16_pos) b_start = byte_len; 138 + if (!found_end && utf16_end >= utf16_pos) b_end = byte_len; 139 + 140 + *byte_start = b_start; 141 + *byte_end = b_end; 142 + return 0; 143 + } 144 + 145 + uint32_t utf16_code_unit_at(const char *str, size_t byte_len, size_t utf16_idx) { 146 + const unsigned char *p = (const unsigned char *)str; 147 + const unsigned char *end = p + byte_len; 148 + size_t utf16_pos = 0; 149 + 150 + while (p < end) { 151 + unsigned char c = *p; 152 + size_t units, slen; 153 + uint32_t cp; 154 + 155 + if (c < 0x80) { cp = c; slen = 1; units = 1; } 156 + else if ((c & 0xE0) == 0xC0 && p + 1 < end) { 157 + cp = ((c & 0x1F) << 6) | (p[1] & 0x3F); slen = 2; units = 1; 158 + } 159 + else if ((c & 0xF0) == 0xE0 && p + 2 < end) { 160 + cp = ((c & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F); slen = 3; units = 1; 161 + } 162 + else if ((c & 0xF8) == 0xF0 && p + 3 < end) { 163 + cp = ((c & 0x07) << 18) | ((p[1] & 0x3F) << 12) | ((p[2] & 0x3F) << 6) | (p[3] & 0x3F); slen = 4; units = 2; 164 + } 165 + else { cp = c; slen = 1; units = 1; } 166 + 167 + if (utf16_pos == utf16_idx) { 168 + if (units == 2) return 0xD800 + ((cp - 0x10000) >> 10); 169 + return cp; 170 + } 171 + if (units == 2 && utf16_pos + 1 == utf16_idx) { 172 + return 0xDC00 + ((cp - 0x10000) & 0x3FF); 173 + } 174 + p += slen; 175 + utf16_pos += units; 176 + } 177 + return 0xFFFFFFFF; 178 + } 179 + 180 + uint32_t utf16_codepoint_at(const char *str, size_t byte_len, size_t utf16_idx) { 181 + const unsigned char *p = (const unsigned char *)str; 182 + const unsigned char *end = p + byte_len; 183 + size_t utf16_pos = 0; 184 + 185 + while (p < end) { 186 + unsigned char c = *p; 187 + size_t units, slen; 188 + uint32_t cp; 189 + 190 + if (c < 0x80) { cp = c; slen = 1; units = 1; } 191 + else if ((c & 0xE0) == 0xC0 && p + 1 < end) { 192 + cp = ((c & 0x1F) << 
6) | (p[1] & 0x3F); slen = 2; units = 1; 193 + } 194 + else if ((c & 0xF0) == 0xE0 && p + 2 < end) { 195 + cp = ((c & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F); slen = 3; units = 1; 196 + } 197 + else if ((c & 0xF8) == 0xF0 && p + 3 < end) { 198 + cp = ((c & 0x07) << 18) | ((p[1] & 0x3F) << 12) | ((p[2] & 0x3F) << 6) | (p[3] & 0x3F); slen = 4; units = 2; 199 + } 200 + else { cp = c; slen = 1; units = 1; } 201 + 202 + if (utf16_pos == utf16_idx) return cp; 203 + if (units == 2 && utf16_pos + 1 == utf16_idx) { 204 + return 0xDC00 + ((cp - 0x10000) & 0x3FF); 205 + } 206 + p += slen; 207 + utf16_pos += units; 208 + } 209 + return 0xFFFFFFFF; 210 + }