Reduce UTF-16 index scanning of strings from O(n^2) to O(n) by caching the last scan position and each flat string's ASCII state

+32

include/internal.h

··· 227 227 228 228 typedef struct { 229 229 ant_offset_t len; 230 + uint8_t is_ascii; 230 231 char bytes[]; 231 232 } ant_flat_string_t; 233 + 234 + enum { 235 + STR_ASCII_UNKNOWN = 0, 236 + STR_ASCII_YES = 1, 237 + STR_ASCII_NO = 2, 238 + }; 232 239 233 240 typedef struct { 234 241 ant_offset_t len; ··· 436 443 ant_value_t k = js_mkstr(js, name, len); 437 444 if (is_err(k)) return k; 438 445 return mkprop(js, obj, k, fn, ANT_PROP_ATTR_WRITABLE | ANT_PROP_ATTR_CONFIGURABLE); 446 + } 447 + 448 + static inline ant_flat_string_t *str_flat_from_bytes(const char *str) { 449 + return (ant_flat_string_t *)((char *)str - offsetof(ant_flat_string_t, bytes)); 450 + } 451 + 452 + static inline uint8_t str_detect_ascii_bytes(const char *str, size_t len) { 453 + const unsigned char *s = (const unsigned char *)str; 454 + for (size_t i = 0; i < len; i++) { 455 + if (s[i] >= 0x80) return STR_ASCII_NO; 456 + } 457 + return STR_ASCII_YES; 458 + } 459 + 460 + static inline void str_set_ascii_state(const char *str, uint8_t state) { 461 + ant_flat_string_t *flat = str_flat_from_bytes(str); 462 + flat->is_ascii = state; 463 + } 464 + 465 + static inline bool str_is_ascii(const char *str) { 466 + ant_flat_string_t *flat = str_flat_from_bytes(str); 467 + if (flat->is_ascii == STR_ASCII_UNKNOWN) { 468 + flat->is_ascii = str_detect_ascii_bytes(flat->bytes, (size_t)flat->len); 469 + } 470 + return flat->is_ascii == STR_ASCII_YES; 439 471 } 440 472 441 473 static inline void js_set_module_default(ant_t *js, ant_value_t lib, ant_value_t ctor_fn, const char *name) {

+58 -8

src/ant.c

··· 2307 2307 2308 2308 flat->len = (ant_offset_t)len; 2309 2309 if (ptr && len > 0) memcpy(flat->bytes, ptr, len); 2310 + 2310 2311 flat->bytes[len] = '\0'; 2312 + flat->is_ascii = (ptr || len == 0) 2313 + ? str_detect_ascii_bytes(flat->bytes, len) 2314 + : STR_ASCII_UNKNOWN; 2315 + 2311 2316 return mkval(T_STR, (uintptr_t)flat); 2312 2317 } 2313 2318 ··· 2324 2329 2325 2330 flat->len = (ant_offset_t)len; 2326 2331 if (ptr && len > 0) memcpy(flat->bytes, ptr, len); 2332 + 2327 2333 flat->bytes[len] = '\0'; 2334 + flat->is_ascii = (ptr || len == 0) 2335 + ? str_detect_ascii_bytes(flat->bytes, len) 2336 + : STR_ASCII_UNKNOWN; 2337 + 2328 2338 return mkval(T_STR, (uintptr_t)flat); 2329 2339 } 2330 2340 ··· 8832 8842 } 8833 8843 8834 8844 static ant_value_t builtin_string_toLowerCase(ant_t *js, ant_value_t *args, int nargs) { 8835 - (void) args; (void) nargs; 8836 8845 ant_value_t str = to_string_val(js, js->this_val); 8837 8846 if (vtype(str) != T_STR) return js_mkerr(js, "toLowerCase called on non-string"); 8838 8847 ··· 8845 8854 8846 8855 ant_offset_t out_len = 0; 8847 8856 utf8proc_ssize_t pos = 0; 8857 + 8848 8858 while (pos < src_len) { 8849 8859 utf8proc_int32_t cp; 8850 8860 utf8proc_ssize_t n = utf8_next(src + pos, src_len - pos, &cp); ··· 8856 8866 8857 8867 ant_value_t result = js_mkstr(js, NULL, out_len); 8858 8868 if (is_err(result)) return result; 8869 + 8859 8870 ant_offset_t result_len, result_off = vstr(js, result, &result_len); 8860 8871 char *result_ptr = (char *)(uintptr_t)(result_off); 8872 + uint8_t ascii_state = STR_ASCII_YES; 8861 8873 8862 8874 pos = 0; 8863 8875 ant_offset_t wpos = 0; 8876 + 8864 8877 while (pos < src_len) { 8865 8878 utf8proc_int32_t cp; 8866 8879 utf8proc_ssize_t n = utf8_next(src + pos, src_len - pos, &cp); 8867 - if (cp < 0) { result_ptr[wpos++] = (char)src[pos]; pos++; continue; } 8868 - wpos += (ant_offset_t)utf8proc_encode_char(utf8proc_tolower(cp), (utf8proc_uint8_t *)(result_ptr + wpos)); 8880 + 8881 + if (cp < 0) { 
8882 + unsigned char byte = src[pos]; 8883 + if (byte >= 0x80) ascii_state = STR_ASCII_NO; 8884 + result_ptr[wpos++] = (char)byte; 8885 + pos++; continue; 8886 + } 8887 + 8888 + utf8proc_int32_t mapped = utf8proc_tolower(cp); 8889 + if (mapped >= 0x80) ascii_state = STR_ASCII_NO; 8890 + 8891 + wpos += (ant_offset_t)utf8proc_encode_char(mapped, (utf8proc_uint8_t *)(result_ptr + wpos)); 8869 8892 pos += n; 8870 8893 } 8871 - 8894 + 8895 + str_set_ascii_state(result_ptr, ascii_state); 8872 8896 return result; 8873 8897 } 8874 8898 8875 8899 static ant_value_t builtin_string_toUpperCase(ant_t *js, ant_value_t *args, int nargs) { 8876 - (void) args; (void) nargs; 8877 8900 ant_value_t str = to_string_val(js, js->this_val); 8878 8901 if (vtype(str) != T_STR) return js_mkerr(js, "toUpperCase called on non-string"); 8879 8902 ··· 8886 8909 8887 8910 ant_offset_t out_len = 0; 8888 8911 utf8proc_ssize_t pos = 0; 8912 + 8889 8913 while (pos < src_len) { 8890 8914 utf8proc_int32_t cp; 8891 8915 utf8proc_ssize_t n = utf8_next(src + pos, src_len - pos, &cp); ··· 8897 8921 8898 8922 ant_value_t result = js_mkstr(js, NULL, out_len); 8899 8923 if (is_err(result)) return result; 8924 + 8900 8925 ant_offset_t result_len, result_off = vstr(js, result, &result_len); 8901 8926 char *result_ptr = (char *)(uintptr_t)(result_off); 8927 + uint8_t ascii_state = STR_ASCII_YES; 8902 8928 8903 8929 pos = 0; 8904 8930 ant_offset_t wpos = 0; 8931 + 8905 8932 while (pos < src_len) { 8906 8933 utf8proc_int32_t cp; 8907 8934 utf8proc_ssize_t n = utf8_next(src + pos, src_len - pos, &cp); 8908 - if (cp < 0) { result_ptr[wpos++] = (char)src[pos]; pos++; continue; } 8909 - wpos += (ant_offset_t)utf8proc_encode_char(utf8proc_toupper(cp), (utf8proc_uint8_t *)(result_ptr + wpos)); 8935 + 8936 + if (cp < 0) { 8937 + unsigned char byte = src[pos]; 8938 + if (byte >= 0x80) ascii_state = STR_ASCII_NO; 8939 + result_ptr[wpos++] = (char)byte; 8940 + pos++; continue; 8941 + } 8942 + 8943 + utf8proc_int32_t mapped 
= utf8proc_toupper(cp); 8944 + if (mapped >= 0x80) ascii_state = STR_ASCII_NO; 8945 + 8946 + wpos += (ant_offset_t)utf8proc_encode_char(mapped, (utf8proc_uint8_t *)(result_ptr + wpos)); 8910 8947 pos += n; 8911 8948 } 8912 - 8949 + 8950 + str_set_ascii_state(result_ptr, ascii_state); 8913 8951 return result; 8914 8952 } 8915 8953 ··· 8979 9017 for (ant_offset_t i = 0; i < count; i++) { 8980 9018 memcpy(result_ptr + i * str_len, str_ptr, str_len); 8981 9019 } 9020 + str_set_ascii_state( 9021 + result_ptr, 9022 + str_is_ascii(str_ptr) ? STR_ASCII_YES : STR_ASCII_NO 9023 + ); 8982 9024 8983 9025 return result; 8984 9026 } ··· 9035 9077 pos += rem_bytes; 9036 9078 } 9037 9079 memcpy(result_ptr + pos, str_ptr, (size_t)str_len); 9080 + str_set_ascii_state( 9081 + result_ptr, 9082 + (str_is_ascii(pad_str) && str_is_ascii(str_ptr)) ? STR_ASCII_YES : STR_ASCII_NO 9083 + ); 9038 9084 9039 9085 return result; 9040 9086 } ··· 9091 9137 memcpy(result_ptr + pos, pad_str, rem_bytes); 9092 9138 pos += rem_bytes; 9093 9139 } 9140 + str_set_ascii_state( 9141 + result_ptr, 9142 + (str_is_ascii(str_ptr) && str_is_ascii(pad_str)) ? STR_ASCII_YES : STR_ASCII_NO 9143 + ); 9094 9144 9095 9145 return result; 9096 9146 }

+273 -135

src/utf8.c

··· 1 1 #include "utf8.h" 2 2 #include "utils.h" 3 + #include "internal.h" 4 + #include "gc/objects.h" 5 + 3 6 #include <stdlib.h> 4 7 #include <string.h> 5 8 #include <stdbool.h> 9 + #include <stddef.h> 10 + 11 + typedef struct { 12 + uint64_t epoch; 13 + const char *str; 14 + size_t byte_len; 15 + size_t byte_pos; 16 + size_t utf16_pos; 17 + } utf16_scan_cache_t; 18 + 19 + typedef struct { 20 + const char *str; 21 + size_t byte_len; 22 + const unsigned char *start; 23 + const unsigned char *end; 24 + const unsigned char *p; 25 + size_t utf16_pos; 26 + } utf16_scan_cursor_t; 27 + 28 + static _Thread_local utf16_scan_cache_t utf16_scan_cache = { 0 }; 29 + 30 + static inline void utf16_scan_cache_sync_epoch(void) { 31 + uint64_t epoch = gc_get_epoch(); 32 + if (utf16_scan_cache.epoch == epoch) return; 33 + utf16_scan_cache = (utf16_scan_cache_t){ .epoch = epoch }; 34 + } 35 + 36 + static inline void utf16_scan_cursor_init( 37 + utf16_scan_cursor_t *cursor, 38 + const char *str, 39 + size_t byte_len 40 + ) { 41 + utf16_scan_cache_sync_epoch(); 42 + cursor->str = str; 43 + cursor->byte_len = byte_len; 44 + cursor->start = (const unsigned char *)str; 45 + cursor->end = cursor->start + byte_len; 46 + cursor->p = cursor->start; 47 + cursor->utf16_pos = 0; 48 + } 49 + 50 + static inline bool utf16_scan_cache_matches(const utf16_scan_cursor_t *cursor) { 51 + return utf16_scan_cache.str == cursor->str 52 + && utf16_scan_cache.byte_pos <= cursor->byte_len; 53 + } 54 + 55 + static inline void utf16_scan_cursor_resume_cached(utf16_scan_cursor_t *cursor) { 56 + if (!utf16_scan_cache_matches(cursor)) return; 57 + cursor->p = cursor->start + utf16_scan_cache.byte_pos; 58 + cursor->utf16_pos = utf16_scan_cache.utf16_pos; 59 + } 60 + 61 + static inline void utf16_scan_cursor_resume_utf16( 62 + utf16_scan_cursor_t *cursor, 63 + size_t target_utf16 64 + ) { 65 + if (!utf16_scan_cache_matches(cursor)) return; 66 + if (target_utf16 < utf16_scan_cache.utf16_pos) return; 67 + cursor->p = 
cursor->start + utf16_scan_cache.byte_pos; 68 + cursor->utf16_pos = utf16_scan_cache.utf16_pos; 69 + } 70 + 71 + static inline void utf16_scan_cursor_resume_byte( 72 + utf16_scan_cursor_t *cursor, 73 + size_t target_byte 74 + ) { 75 + if (!utf16_scan_cache_matches(cursor)) return; 76 + if (target_byte < utf16_scan_cache.byte_pos) return; 77 + cursor->p = cursor->start + utf16_scan_cache.byte_pos; 78 + cursor->utf16_pos = utf16_scan_cache.utf16_pos; 79 + } 80 + 81 + static inline void utf16_scan_cursor_store(const utf16_scan_cursor_t *cursor) { 82 + utf16_scan_cache.str = cursor->str; 83 + utf16_scan_cache.byte_len = cursor->byte_len; 84 + utf16_scan_cache.byte_pos = (size_t)(cursor->p - cursor->start); 85 + utf16_scan_cache.utf16_pos = cursor->utf16_pos; 86 + } 87 + 88 + static inline void utf16_scan_decode( 89 + const unsigned char *p, 90 + const unsigned char *end, 91 + size_t *slen_out, 92 + size_t *units_out, 93 + uint32_t *cp_out 94 + ) { 95 + unsigned char c = *p; 96 + if (c < 0x80) { 97 + if (cp_out) *cp_out = c; 98 + *slen_out = 1; 99 + *units_out = 1; 100 + return; 101 + } 102 + 103 + if ((c & 0xE0) == 0xC0) { 104 + if (cp_out && p + 1 < end) { 105 + *cp_out = ((uint32_t)(c & 0x1F) << 6) | (uint32_t)(p[1] & 0x3F); 106 + *slen_out = 2; 107 + *units_out = 1; 108 + return; 109 + } 110 + if (!cp_out) { 111 + *slen_out = 2; 112 + *units_out = 1; 113 + return; 114 + } 115 + } else if ((c & 0xF0) == 0xE0) { 116 + if (cp_out && p + 2 < end) { 117 + *cp_out = ((uint32_t)(c & 0x0F) << 12) 118 + | ((uint32_t)(p[1] & 0x3F) << 6) 119 + | (uint32_t)(p[2] & 0x3F); 120 + *slen_out = 3; 121 + *units_out = 1; 122 + return; 123 + } 124 + if (!cp_out) { 125 + *slen_out = 3; 126 + *units_out = 1; 127 + return; 128 + } 129 + } else if ((c & 0xF8) == 0xF0) { 130 + if (cp_out && p + 3 < end) { 131 + *cp_out = ((uint32_t)(c & 0x07) << 18) 132 + | ((uint32_t)(p[1] & 0x3F) << 12) 133 + | ((uint32_t)(p[2] & 0x3F) << 6) 134 + | (uint32_t)(p[3] & 0x3F); 135 + *slen_out = 4; 136 + 
*units_out = 2; 137 + return; 138 + } 139 + if (!cp_out) { 140 + *slen_out = 4; 141 + *units_out = 2; 142 + return; 143 + } 144 + } 145 + 146 + if (cp_out) *cp_out = c; 147 + *slen_out = 1; 148 + *units_out = 1; 149 + } 150 + 151 + static inline bool utf16_scan_cursor_advance( 152 + utf16_scan_cursor_t *cursor, 153 + const unsigned char *bound_end 154 + ) { 155 + size_t slen, units; 156 + const unsigned char *next; 157 + 158 + utf16_scan_decode(cursor->p, cursor->end, &slen, &units, NULL); 159 + next = cursor->p + slen; 160 + cursor->utf16_pos += units; 161 + if (next > bound_end) { 162 + cursor->p = bound_end; 163 + return false; 164 + } 165 + cursor->p = next; 166 + return true; 167 + } 6 168 7 169 static uint32_t utf8_decode(const unsigned char *buf, size_t len, int *seq_len) { 8 170 if (len == 0) { *seq_len = 0; return 0; } ··· 133 295 } 134 296 135 297 size_t utf16_strlen(const char *str, size_t byte_len) { 136 - const unsigned char *p = (const unsigned char *)str; 137 - const unsigned char *end = p + byte_len; 138 - 139 - size_t i = 0; 140 - for (; i + 8 <= byte_len; i += 8) { 141 - uint64_t chunk; 142 - memcpy(&chunk, p + i, 8); 143 - if (chunk & 0x8080808080808080ULL) goto slow_path; 144 - } 145 - for (; i < byte_len; i++) { 146 - if (p[i] & 0x80) goto slow_path; 147 - } 148 - return byte_len; 149 - 150 - slow_path:; 151 - size_t count = i; 152 - p += i; 153 - while (p < end) { 154 - unsigned char c = *p; 155 - if ((c & 0xC0) != 0x80) { 156 - count++; 157 - if ((c & 0xF8) == 0xF0) count++; 158 - } 159 - p++; 298 + if (str_is_ascii(str)) return byte_len; 299 + 300 + utf16_scan_cursor_t cursor; 301 + utf16_scan_cursor_init(&cursor, str, byte_len); 302 + utf16_scan_cursor_resume_cached(&cursor); 303 + 304 + while (cursor.p < cursor.end) { 305 + utf16_scan_cursor_advance(&cursor, cursor.end); 160 306 } 161 - return count; 307 + 308 + utf16_scan_cursor_store(&cursor); 309 + return cursor.utf16_pos; 162 310 } 163 311 164 312 int utf16_index_to_byte_offset( ··· 
167 315 size_t utf16_idx, 168 316 size_t *out_char_bytes 169 317 ) { 170 - const unsigned char *p = (const unsigned char *)str; 171 - const unsigned char *end = p + byte_len; 172 - size_t utf16_pos = 0; 318 + if (str_is_ascii(str)) { 319 + if (utf16_idx > byte_len) return -1; 320 + if (out_char_bytes) *out_char_bytes = (utf16_idx < byte_len) ? 1 : 0; 321 + return (int)utf16_idx; 322 + } 323 + 324 + utf16_scan_cursor_t cursor; 325 + utf16_scan_cursor_init(&cursor, str, byte_len); 326 + utf16_scan_cursor_resume_utf16(&cursor, utf16_idx); 173 327 174 - while (p < end && utf16_pos < utf16_idx) { 175 - unsigned char c = *p; 176 - if (c < 0x80) { p++; utf16_pos++; } 177 - else if ((c & 0xE0) == 0xC0) { p += 2; utf16_pos++; } 178 - else if ((c & 0xF0) == 0xE0) { p += 3; utf16_pos++; } 179 - else if ((c & 0xF8) == 0xF0) { p += 4; utf16_pos += 2; } 180 - else { p++; utf16_pos++; } 181 - if (p > end) p = end; 328 + while (cursor.p < cursor.end && cursor.utf16_pos < utf16_idx) { 329 + utf16_scan_cursor_advance(&cursor, cursor.end); 182 330 } 183 331 184 - if (p >= end) { 185 - if (utf16_pos == utf16_idx) { 332 + if (cursor.p >= cursor.end) { 333 + if (cursor.utf16_pos == utf16_idx) { 186 334 if (out_char_bytes) *out_char_bytes = 0; 335 + utf16_scan_cursor_store(&cursor); 187 336 return (int)byte_len; 188 - } return -1; 337 + } 338 + utf16_scan_cursor_store(&cursor); 339 + return -1; 189 340 } 190 341 191 - unsigned char c = *p; 192 - size_t slen = (c < 0x80) 193 - ? 1 : ((c & 0xE0) == 0xC0) 194 - ? 2 : ((c & 0xF0) == 0xE0) 195 - ? 3 : ((c & 0xF8) == 0xF0) 196 - ? 
4 : 1; 342 + size_t slen, units; 343 + utf16_scan_decode(cursor.p, cursor.end, &slen, &units, NULL); 197 344 198 345 if (out_char_bytes) *out_char_bytes = slen; 199 - return (int)(p - (const unsigned char *)str); 346 + utf16_scan_cursor_store(&cursor); 347 + return (int)(cursor.p - cursor.start); 200 348 } 201 349 202 350 int utf16_range_to_byte_range( ··· 207 355 size_t *byte_start, 208 356 size_t *byte_end 209 357 ) { 210 - const unsigned char *p = (const unsigned char *)str; 211 - const unsigned char *end = p + byte_len; 212 - 213 - size_t utf16_pos = 0; 358 + if (str_is_ascii(str)) { 359 + *byte_start = (utf16_start <= byte_len) ? utf16_start : byte_len; 360 + *byte_end = (utf16_end <= byte_len) ? utf16_end : byte_len; 361 + return 0; 362 + } 363 + 364 + utf16_scan_cursor_t cursor; 365 + utf16_scan_cursor_init(&cursor, str, byte_len); 366 + utf16_scan_cursor_resume_utf16(&cursor, utf16_start); 367 + 214 368 size_t b_start = 0, b_end = byte_len; 215 369 int found_start = 0, found_end = 0; 216 370 217 - while (p < end) { 218 - if (utf16_pos == utf16_start) { b_start = p - (const unsigned char *)str; found_start = 1; } 219 - if (utf16_pos == utf16_end) { b_end = p - (const unsigned char *)str; found_end = 1; break; } 220 - 221 - unsigned char c = *p; 222 - if (c < 0x80) { p++; utf16_pos++; } 223 - else if ((c & 0xE0) == 0xC0) { p += 2; utf16_pos++; } 224 - else if ((c & 0xF0) == 0xE0) { p += 3; utf16_pos++; } 225 - else if ((c & 0xF8) == 0xF0) { p += 4; utf16_pos += 2; } 226 - else { p++; utf16_pos++; } 227 - if (p > end) p = end; 371 + while (cursor.p < cursor.end) { 372 + if (cursor.utf16_pos == utf16_start) { 373 + b_start = (size_t)(cursor.p - cursor.start); 374 + found_start = 1; 375 + } 376 + if (cursor.utf16_pos == utf16_end) { 377 + b_end = (size_t)(cursor.p - cursor.start); 378 + found_end = 1; 379 + break; 380 + } 381 + utf16_scan_cursor_advance(&cursor, cursor.end); 228 382 } 229 383 230 - if (!found_start && utf16_start >= utf16_pos) b_start = 
byte_len; 231 - if (!found_end && utf16_end >= utf16_pos) b_end = byte_len; 384 + if (!found_start && utf16_start >= cursor.utf16_pos) b_start = byte_len; 385 + if (!found_end && utf16_end >= cursor.utf16_pos) b_end = byte_len; 232 386 233 387 *byte_start = b_start; 234 388 *byte_end = b_end; 389 + utf16_scan_cursor_store(&cursor); 235 390 236 391 return 0; 237 392 } 238 393 239 394 size_t byte_offset_to_utf16(const char *str, size_t byte_off) { 240 - const unsigned char *p = (const unsigned char *)str; 241 - const unsigned char *end = p + byte_off; 242 - size_t utf16_pos = 0; 395 + if (str_is_ascii(str)) return byte_off; 396 + 397 + utf16_scan_cursor_t cursor; 398 + const unsigned char *bound_end; 399 + bool ended_on_boundary = true; 400 + 401 + utf16_scan_cursor_init(&cursor, str, byte_off); 402 + utf16_scan_cursor_resume_byte(&cursor, byte_off); 403 + bound_end = cursor.start + byte_off; 243 404 244 - while (p < end) { 245 - unsigned char c = *p; 246 - if (c < 0x80) { p++; utf16_pos++; } 247 - else if ((c & 0xE0) == 0xC0) { p += 2; utf16_pos++; } 248 - else if ((c & 0xF0) == 0xE0) { p += 3; utf16_pos++; } 249 - else if ((c & 0xF8) == 0xF0) { p += 4; utf16_pos += 2; } 250 - else { p++; utf16_pos++; } 251 - if (p > end) p = end; 405 + while (cursor.p < bound_end) { 406 + if (!utf16_scan_cursor_advance(&cursor, bound_end)) { 407 + ended_on_boundary = false; 408 + break; 409 + } 252 410 } 253 - return utf16_pos; 411 + 412 + if (ended_on_boundary) utf16_scan_cursor_store(&cursor); 413 + return cursor.utf16_pos; 254 414 } 255 415 256 416 uint32_t utf16_code_unit_at(const char *str, size_t byte_len, size_t utf16_idx) { 257 - const unsigned char *p = (const unsigned char *)str; 258 - const unsigned char *end = p + byte_len; 259 - size_t utf16_pos = 0; 417 + if (str_is_ascii(str)) { 418 + if (utf16_idx >= byte_len) return 0xFFFFFFFF; 419 + return (unsigned char)str[utf16_idx]; 420 + } 421 + 422 + utf16_scan_cursor_t cursor; 423 + utf16_scan_cursor_init(&cursor, str, 
byte_len); 424 + utf16_scan_cursor_resume_utf16(&cursor, utf16_idx); 260 425 261 - while (p < end) { 262 - unsigned char c = *p; 263 - size_t units, slen; 426 + while (cursor.p < cursor.end) { 427 + size_t slen, units; 264 428 uint32_t cp; 265 429 266 - if (c < 0x80) { cp = c; slen = 1; units = 1; } 267 - else if ((c & 0xE0) == 0xC0 && p + 1 < end) { 268 - cp = ((c & 0x1F) << 6) 269 - | (p[1] & 0x3F); 270 - slen = 2; units = 1; 271 - } 272 - else if ((c & 0xF0) == 0xE0 && p + 2 < end) { 273 - cp = ((c & 0x0F) << 12) 274 - | ((p[1] & 0x3F) << 6) 275 - | (p[2] & 0x3F); 276 - slen = 3; units = 1; 277 - } 278 - else if ((c & 0xF8) == 0xF0 && p + 3 < end) { 279 - cp = ((c & 0x07) << 18) 280 - | ((p[1] & 0x3F) << 12) 281 - | ((p[2] & 0x3F) << 6) 282 - | (p[3] & 0x3F); 283 - slen = 4; units = 2; 284 - } 285 - else { cp = c; slen = 1; units = 1; } 430 + utf16_scan_decode(cursor.p, cursor.end, &slen, &units, &cp); 286 431 287 - if (utf16_pos == utf16_idx) { 432 + if (cursor.utf16_pos == utf16_idx) { 433 + utf16_scan_cursor_store(&cursor); 288 434 if (units == 2) return 0xD800 + ((cp - 0x10000) >> 10); 289 435 return cp; 290 436 } 291 - if (units == 2 && utf16_pos + 1 == utf16_idx) { 437 + if (units == 2 && cursor.utf16_pos + 1 == utf16_idx) { 438 + utf16_scan_cursor_store(&cursor); 292 439 return 0xDC00 + ((cp - 0x10000) & 0x3FF); 293 440 } 294 - p += slen; 295 - utf16_pos += units; 441 + cursor.p += slen; 442 + cursor.utf16_pos += units; 296 443 } 297 444 445 + utf16_scan_cursor_store(&cursor); 298 446 return 0xFFFFFFFF; 299 447 } 300 448 ··· 384 532 } 385 533 386 534 uint32_t utf16_codepoint_at(const char *str, size_t byte_len, size_t utf16_idx) { 387 - const unsigned char *p = (const unsigned char *)str; 388 - const unsigned char *end = p + byte_len; 389 - size_t utf16_pos = 0; 535 + if (str_is_ascii(str)) { 536 + if (utf16_idx >= byte_len) return 0xFFFFFFFF; 537 + return (unsigned char)str[utf16_idx]; 538 + } 539 + 540 + utf16_scan_cursor_t cursor; 541 + 
utf16_scan_cursor_init(&cursor, str, byte_len); 542 + utf16_scan_cursor_resume_utf16(&cursor, utf16_idx); 390 543 391 - while (p < end) { 392 - unsigned char c = *p; 393 - size_t units, slen; 544 + while (cursor.p < cursor.end) { 545 + size_t slen, units; 394 546 uint32_t cp; 395 547 396 - if (c < 0x80) { cp = c; slen = 1; units = 1; } 397 - else if ((c & 0xE0) == 0xC0 && p + 1 < end) { 398 - cp = ((c & 0x1F) << 6) 399 - | (p[1] & 0x3F); 400 - slen = 2; units = 1; 401 - } 402 - else if ((c & 0xF0) == 0xE0 && p + 2 < end) { 403 - cp = ((c & 0x0F) << 12) 404 - | ((p[1] & 0x3F) << 6) 405 - | (p[2] & 0x3F); 406 - slen = 3; units = 1; 548 + utf16_scan_decode(cursor.p, cursor.end, &slen, &units, &cp); 549 + 550 + if (cursor.utf16_pos == utf16_idx) { 551 + utf16_scan_cursor_store(&cursor); 552 + return cp; 407 553 } 408 - else if ((c & 0xF8) == 0xF0 && p + 3 < end) { 409 - cp = ((c & 0x07) << 18) 410 - | ((p[1] & 0x3F) << 12) 411 - | ((p[2] & 0x3F) << 6) 412 - | (p[3] & 0x3F); 413 - slen = 4; units = 2; 414 - } 415 - else { cp = c; slen = 1; units = 1; } 416 - 417 - if (utf16_pos == utf16_idx) return cp; 418 - if (units == 2 && utf16_pos + 1 == utf16_idx) { 554 + if (units == 2 && cursor.utf16_pos + 1 == utf16_idx) { 555 + utf16_scan_cursor_store(&cursor); 419 556 return 0xDC00 + ((cp - 0x10000) & 0x3FF); 420 557 } 421 558 422 - p += slen; 423 - utf16_pos += units; 559 + cursor.p += slen; 560 + cursor.utf16_pos += units; 424 561 } 425 562 563 + utf16_scan_cursor_store(&cursor); 426 564 return 0xFFFFFFFF; 427 565 }

Configure Feed

Configure Feed