improve utf8 safety · themackabu.com/ant@3f81bdd

+31 -17

2 changed files

expand all

src

ant.c

utf8.c

+15 -10

src/ant.c

··· 10416 10416 10417 10417 static jsval_t for_of_iter_string(struct js *js, for_iter_ctx_t *ctx, jsval_t iterable) { 10418 10418 jshdl_t h_iterable = js_root(js, iterable); 10419 - jsoff_t byte_len; 10420 - jsoff_t soff = vstr(js, iterable, &byte_len); 10419 + size_t byte_pos = 0; 10421 10420 10422 - const char *str = (char *) &js->mem[soff]; 10423 - size_t utf16_len = utf16_strlen(str, byte_len); 10424 - 10425 - for (size_t i = 0; i < utf16_len; i++) { 10421 + for (;;) { 10426 10422 jsval_t cur = js_deref(js, h_iterable); 10427 10423 jsoff_t cur_byte_len; 10428 10424 jsoff_t cur_soff = vstr(js, cur, &cur_byte_len); 10425 + 10426 + if (byte_pos >= cur_byte_len) break; 10427 + 10429 10428 const char *cur_str = (char *) &js->mem[cur_soff]; 10429 + unsigned char c = (unsigned char)cur_str[byte_pos]; 10430 + size_t char_bytes; 10431 + if (c < 0x80) char_bytes = 1; 10432 + else if ((c & 0xE0) == 0xC0) char_bytes = 2; 10433 + else if ((c & 0xF0) == 0xE0) char_bytes = 3; 10434 + else if ((c & 0xF8) == 0xF0) char_bytes = 4; 10435 + else char_bytes = 1; 10430 10436 10431 - size_t char_bytes; 10432 - int byte_offset = utf16_index_to_byte_offset(cur_str, cur_byte_len, i, &char_bytes); 10433 - if (byte_offset < 0) break; 10434 - jsval_t char_str = js_mkstr(js, cur_str + byte_offset, char_bytes); 10437 + if (byte_pos + char_bytes > cur_byte_len) char_bytes = cur_byte_len - byte_pos; 10438 + jsval_t char_str = js_mkstr(js, cur_str + byte_pos, char_bytes); 10439 + byte_pos += char_bytes; 10435 10440 10436 10441 jsval_t err = for_iter_bind_var(js, ctx, char_str); 10437 10442 if (is_err(err)) { js_unroot(js, h_iterable); return err; }

+16 -7

src/utf8.c

··· 37 37 38 38 if (first < 0x80) { *seq_len = 1; return first; } 39 39 if ((first & 0xE0) == 0xC0 && len >= 2) { 40 + if ((buf[1] & 0xC0) != 0x80) { *seq_len = 1; return 0xFFFD; } 40 41 *seq_len = 2; 41 42 return ((first & 0x1F) << 6) | (buf[1] & 0x3F); 42 43 } 43 44 if ((first & 0xF0) == 0xE0 && len >= 3) { 45 + if ((buf[1] & 0xC0) != 0x80 || (buf[2] & 0xC0) != 0x80) { *seq_len = 1; return 0xFFFD; } 44 46 *seq_len = 3; 45 47 return ((first & 0x0F) << 12) | ((buf[1] & 0x3F) << 6) | (buf[2] & 0x3F); 46 48 } 47 49 if ((first & 0xF8) == 0xF0 && len >= 4) { 50 + if ((buf[1] & 0xC0) != 0x80 || (buf[2] & 0xC0) != 0x80 || (buf[3] & 0xC0) != 0x80) { *seq_len = 1; return 0xFFFD; } 48 51 *seq_len = 4; 49 52 return ((first & 0x07) << 18) | ((buf[1] & 0x3F) << 12) | ((buf[2] & 0x3F) << 6) | (buf[3] & 0x3F); 50 53 } ··· 58 61 const unsigned char *p = (const unsigned char *)str; 59 62 const unsigned char *end = p + byte_len; 60 63 while (p < end) { 61 - if ((*p & 0xC0) != 0x80) count++; 62 - p++; 64 + int seq_len = utf8_sequence_length(*p); 65 + if (seq_len <= 0 || (size_t)seq_len > (size_t)(end - p)) { 66 + count++; p++; 67 + } else { count++; p += seq_len; } 63 68 } 64 69 return count; 65 70 } ··· 71 76 while (p < end) { 72 77 unsigned char c = *p; 73 78 if ((c & 0xC0) != 0x80) { 74 - count++; 75 - if ((c & 0xF8) == 0xF0) count++; 76 - } 77 - p++; 79 + count++; if ((c & 0xF8) == 0xF0) count++; 80 + } p++; 78 81 } 79 82 return count; 80 83 } ··· 99 102 if (p > end) p = end; 100 103 } 101 104 102 - if (p >= end) return -1; 105 + if (p >= end) { 106 + if (utf16_pos == utf16_idx) { 107 + if (out_char_bytes) *out_char_bytes = 0; 108 + return (int)byte_len; 109 + } 110 + return -1; 111 + } 103 112 104 113 unsigned char c = *p; 105 114 size_t slen = (c < 0x80) ? 1 : ((c & 0xE0) == 0xC0) ? 2 : ((c & 0xF0) == 0xE0) ? 3 : ((c & 0xF8) == 0xF0) ? 4 : 1;

Configure Feed

Configure Feed