Add ASCII single-character string cache and optimize UTF-16 length calculation for UTF-8 strings

+3

include/internal.h

··· 65 65 jsval_t *gc_roots; 66 66 int gc_roots_len; 67 67 int gc_roots_cap; 68 + 69 + jsval_t ascii_char_cache[128]; 70 + bool ascii_cache_init; 68 71 }; 69 72 70 73 enum {

+21 -8

src/ant.c

··· 10414 10414 return js_mkundef(); 10415 10415 } 10416 10416 10417 + static void init_ascii_cache(struct js *js) { 10418 + if (js->ascii_cache_init) return; 10419 + for (int i = 0; i < 128; i++) { 10420 + char c = (char)i; 10421 + js->ascii_char_cache[i] = js_mkstr(js, &c, 1); 10422 + } 10423 + js->ascii_cache_init = true; 10424 + } 10425 + 10417 10426 static jsval_t for_of_iter_string(struct js *js, for_iter_ctx_t *ctx, jsval_t iterable) { 10418 10427 jshdl_t h_iterable = js_root(js, iterable); 10419 10428 size_t byte_pos = 0; 10429 + 10430 + if (!js->ascii_cache_init) init_ascii_cache(js); 10420 10431 10421 10432 for (;;) { 10422 10433 jsval_t cur = js_deref(js, h_iterable); ··· 10428 10439 const char *cur_str = (char *) &js->mem[cur_soff]; 10429 10440 unsigned char c = (unsigned char)cur_str[byte_pos]; 10430 10441 size_t char_bytes; 10431 - if (c < 0x80) char_bytes = 1; 10432 - else if ((c & 0xE0) == 0xC0) char_bytes = 2; 10433 - else if ((c & 0xF0) == 0xE0) char_bytes = 3; 10434 - else if ((c & 0xF8) == 0xF0) char_bytes = 4; 10435 - else char_bytes = 1; 10442 + jsval_t char_str; 10436 10443 10437 - if (byte_pos + char_bytes > cur_byte_len) char_bytes = cur_byte_len - byte_pos; 10438 - jsval_t char_str = js_mkstr(js, cur_str + byte_pos, char_bytes); 10439 - byte_pos += char_bytes; 10444 + if (c < 0x80) { char_bytes = 1; char_str = js->ascii_char_cache[c]; } else { 10445 + if ((c & 0xE0) == 0xC0) char_bytes = 2; 10446 + else if ((c & 0xF0) == 0xE0) char_bytes = 3; 10447 + else if ((c & 0xF8) == 0xF0) char_bytes = 4; 10448 + else char_bytes = 1; 10449 + if (byte_pos + char_bytes > cur_byte_len) char_bytes = cur_byte_len - byte_pos; 10450 + char_str = js_mkstr(js, cur_str + byte_pos, char_bytes); 10451 + } byte_pos += char_bytes; 10440 10452 10441 10453 jsval_t err = for_iter_bind_var(js, ctx, char_str); 10442 10454 if (is_err(err)) { js_unroot(js, h_iterable); return err; } ··· 22823 22835 } 22824 22836 22825 22837 for (int i = 0; i < c->js->gc_roots_len; i++) 
op_val(c, &c->js->gc_roots[i]); 22838 + if (c->js->ascii_cache_init) for (int i = 0; i < 128; i++) op_val(c, &c->js->ascii_char_cache[i]); 22826 22839 } 22827 22840 22828 22841 void js_gc_reserve_roots(GC_UPDATE_ARGS) {

+21 -3

src/utf8.c

··· 1 1 #include "utf8.h" 2 + #include <string.h> 2 3 3 4 int utf8_sequence_length(unsigned char first_byte) { 4 5 if ((first_byte & 0x80) == 0) return 1; ··· 70 71 } 71 72 72 73 size_t utf16_strlen(const char *str, size_t byte_len) { 73 - size_t count = 0; 74 74 const unsigned char *p = (const unsigned char *)str; 75 75 const unsigned char *end = p + byte_len; 76 + 77 + // Fast path: check if string is ASCII-only (very common case) 78 + size_t i = 0; 79 + for (; i + 8 <= byte_len; i += 8) { 80 + uint64_t chunk; 81 + memcpy(&chunk, p + i, 8); 82 + if (chunk & 0x8080808080808080ULL) goto slow_path; 83 + } 84 + for (; i < byte_len; i++) { 85 + if (p[i] & 0x80) goto slow_path; 86 + } 87 + return byte_len; // All ASCII: UTF-16 length == byte length 88 + 89 + slow_path:; 90 + size_t count = i; // ASCII chars counted before first non-ASCII 91 + p += i; 76 92 while (p < end) { 77 93 unsigned char c = *p; 78 94 if ((c & 0xC0) != 0x80) { 79 - count++; if ((c & 0xF8) == 0xF0) count++; 80 - } p++; 95 + count++; 96 + if ((c & 0xF8) == 0xF0) count++; // 4-byte UTF-8 = 2 UTF-16 units 97 + } 98 + p++; 81 99 } 82 100 return count; 83 101 }

Configure Feed

Configure Feed