refactor string utilities into shared header · themackabu.com/ant@4c5e7a1

+25

include/strings.h

#ifndef STRINGS_H
#define STRINGS_H

/*
 * Shared helpers for decoding escape sequences in string literals.
 *
 * NOTE(review): this file shares its name with the POSIX <strings.h> header;
 * confirm the include-path order keeps the two from shadowing each other.
 */

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Converts one ASCII hexadecimal digit to its numeric value (0-15).
 *
 * '0'-'9' keep their low nibble as-is; for letters the low nibble is 1-6 and
 * bit 6 is set, so adding 9 lands on 10-15. No validation is performed:
 * the result is only meaningful for bytes that is_xdigit() accepts.
 */
static inline uint8_t unhex(uint8_t c) {
  uint8_t low_nibble = (uint8_t)(c & 0x0F);
  uint8_t letter_bias = (uint8_t)((c >> 6) * 9);
  return (uint8_t)(low_nibble + letter_bias);
}

/*
 * Reports whether c is an ASCII hexadecimal digit (0-9, a-f, A-F).
 * Values outside 0-255 are rejected.
 */
static inline bool is_xdigit(int c) {
  if (c < 0 || c > 255) return false;
  if (c >= '0' && c <= '9') return true;

  /* Setting bit 5 folds 'A'-'F' onto 'a'-'f'. */
  int folded = c | 0x20;
  return folded >= 'a' && folded <= 'f';
}

/*
 * Decodes the escape sequence that starts at in[pos] and appends the decoded
 * bytes to out, advancing *out_pos past them.
 *
 *   in, pos  input buffer and index where the escape sequence begins
 *   end      upper bound used when validating multi-byte escapes
 *   out      output buffer; *out_pos is the next write index
 *   quote    the quote byte of the literal being decoded
 *
 * Returns an advance count for the caller (0 for single-letter escapes).
 */
size_t decode_escape(
  const uint8_t *in, size_t pos, size_t end,
  uint8_t *out, size_t *out_pos, uint8_t quote
);

#endif

+1

libant/meson.build

··· 25 25 '../src/roots.c', 26 26 '../src/utils.c', 27 27 '../src/utf8.c', 28 + '../src/strings.c', 28 29 '../src/reactor.c', 29 30 '../src/sugar.c', 30 31 '../src/ant.c',

+1

meson.build

··· 24 24 'src/roots.c', 25 25 'src/utils.c', 26 26 'src/utf8.c', 27 + 'src/strings.c', 27 28 'src/reactor.c', 28 29 'src/sugar.c', 29 30 'src/ant.c',

+1 -75

src/ant.c

··· 16 16 #include "stack.h" 17 17 #include "errors.h" 18 18 #include "utf8.h" 19 + #include "strings.h" 19 20 #include "esm/remote.h" 20 21 21 22 #include <uv.h> ··· 567 568 568 569 inline size_t js_getbrk(struct js *js) { 569 570 return (size_t) js->brk; 570 - } 571 - 572 - static inline uint8_t unhex(uint8_t c) { 573 - return (c & 0xF) + (c >> 6) * 9; 574 571 } 575 572 576 573 static inline int is_body_end_tok(int tok) { ··· 4182 4179 static bool is_digit(int c) { 4183 4180 if (c < 0 || c >= 256) return false; 4184 4181 return (char_type[(uint8_t)c] & CHAR_DIGIT) != 0; 4185 - } 4186 - 4187 - static bool is_xdigit(int c) { 4188 - if (c < 0 || c >= 256) return false; 4189 - return (char_type[(uint8_t)c] & CHAR_XDIGIT) != 0; 4190 4182 } 4191 4183 4192 4184 static bool is_alpha(int c) { ··· 8224 8216 L_default: 8225 8217 if (is_assign(op)) return do_assign_op(js, op, lhs, r); 8226 8218 return js_mkerr(js, "unknown op %d", (int)op); 8227 - } 8228 - 8229 - static size_t decode_escape(const uint8_t *in, size_t pos, size_t end, uint8_t *out, size_t *out_pos, uint8_t quote) { 8230 - size_t n2 = pos; 8231 - uint8_t c = in[n2 + 1]; 8232 - size_t n1 = *out_pos; 8233 - 8234 - if (c == quote) { 8235 - out[n1++] = quote; 8236 - } else if (c == 'n') { 8237 - out[n1++] = '\n'; 8238 - } else if (c == 't') { 8239 - out[n1++] = '\t'; 8240 - } else if (c == 'r') { 8241 - out[n1++] = '\r'; 8242 - } else if (c == '0' && !(in[n2 + 2] >= '0' && in[n2 + 2] <= '7')) { 8243 - out[n1++] = '\0'; 8244 - } else if (c >= '1' && c <= '7') { 8245 - int val = c - '0'; 8246 - int extra = 0; 8247 - if (in[n2 + 2] >= '0' && in[n2 + 2] <= '7') { 8248 - val = val * 8 + (in[n2 + 2] - '0'); 8249 - extra++; 8250 - if (in[n2 + 3] >= '0' && in[n2 + 3] <= '7' && val * 8 + (in[n2 + 3] - '0') <= 255) { 8251 - val = val * 8 + (in[n2 + 3] - '0'); 8252 - extra++; 8253 - } 8254 - } 8255 - n2 += extra; 8256 - out[n1++] = (uint8_t)val; 8257 - } else if (c == 'v') { 8258 - out[n1++] = '\v'; 8259 - } else if (c == 
'f') { 8260 - out[n1++] = '\f'; 8261 - } else if (c == 'b') { 8262 - out[n1++] = '\b'; 8263 - } else if (c == 'x' && n2 + 3 < end && is_xdigit(in[n2 + 2]) && is_xdigit(in[n2 + 3])) { 8264 - out[n1++] = (uint8_t)((unhex(in[n2 + 2]) << 4U) | unhex(in[n2 + 3])); 8265 - n2 += 2; 8266 - } else if (c == 'u' && n2 + 2 < end && in[n2 + 2] == '{') { 8267 - uint32_t cp = 0; 8268 - size_t i = n2 + 3; 8269 - while (i < end && is_xdigit(in[i])) { cp = (cp << 4) | unhex(in[i]); i++; } 8270 - if (i < end && in[i] == '}') { 8271 - if (cp < 0x80) { out[n1++] = (uint8_t)cp; } 8272 - else if (cp < 0x800) { out[n1++] = (uint8_t)(0xC0 | (cp >> 6)); out[n1++] = (uint8_t)(0x80 | (cp & 0x3F)); } 8273 - else if (cp < 0x10000) { out[n1++] = (uint8_t)(0xE0 | (cp >> 12)); out[n1++] = (uint8_t)(0x80 | ((cp >> 6) & 0x3F)); out[n1++] = (uint8_t)(0x80 | (cp & 0x3F)); } 8274 - else { out[n1++] = (uint8_t)(0xF0 | (cp >> 18)); out[n1++] = (uint8_t)(0x80 | ((cp >> 12) & 0x3F)); out[n1++] = (uint8_t)(0x80 | ((cp >> 6) & 0x3F)); out[n1++] = (uint8_t)(0x80 | (cp & 0x3F)); } 8275 - n2 = i; 8276 - } else { 8277 - out[n1++] = c; 8278 - } 8279 - } else if (c == 'u' && n2 + 5 < end && is_xdigit(in[n2 + 2]) && is_xdigit(in[n2 + 3]) && is_xdigit(in[n2 + 4]) && is_xdigit(in[n2 + 5])) { 8280 - uint32_t cp = (unhex(in[n2 + 2]) << 12U) | (unhex(in[n2 + 3]) << 8U) | (unhex(in[n2 + 4]) << 4U) | unhex(in[n2 + 5]); 8281 - if (cp < 0x80) { out[n1++] = (uint8_t)cp; } 8282 - else if (cp < 0x800) { out[n1++] = (uint8_t)(0xC0 | (cp >> 6)); out[n1++] = (uint8_t)(0x80 | (cp & 0x3F)); } 8283 - else { out[n1++] = (uint8_t)(0xE0 | (cp >> 12)); out[n1++] = (uint8_t)(0x80 | ((cp >> 6) & 0x3F)); out[n1++] = (uint8_t)(0x80 | (cp & 0x3F)); } 8284 - n2 += 4; 8285 - } else if (c == '\\') { 8286 - out[n1++] = '\\'; 8287 - } else { 8288 - out[n1++] = c; 8289 - } 8290 - 8291 - *out_pos = n1; 8292 - return n2 - pos; 8293 8219 } 8294 8220 8295 8221 static jsval_t js_template_literal(struct js *js) {

+86

src/strings.c

··· 1 + #include "utf8.h" 2 + #include "strings.h" 3 + 4 + static inline size_t decode_hex_escape(const uint8_t *in, size_t pos, uint8_t *out, size_t *out_pos) { 5 + out[(*out_pos)++] = (uint8_t)((unhex(in[pos + 2]) << 4U) | unhex(in[pos + 3])); 6 + return 2; 7 + } 8 + 9 + static size_t decode_octal_escape(const uint8_t *in, size_t pos, uint8_t *out, size_t *out_pos) { 10 + uint8_t c = in[pos + 1]; 11 + size_t extra = 0; 12 + int val = c - '0'; 13 + 14 + if (in[pos + 2] >= '0' && in[pos + 2] <= '7') { 15 + val = val * 8 + (in[pos + 2] - '0'); extra++; 16 + if (in[pos + 3] >= '0' && in[pos + 3] <= '7' && val * 8 + (in[pos + 3] - '0') <= 255) { 17 + val = val * 8 + (in[pos + 3] - '0'); extra++; 18 + } 19 + } 20 + 21 + out[(*out_pos)++] = (uint8_t)val; 22 + return extra; 23 + } 24 + 25 + static size_t decode_unicode_braced(const uint8_t *in, size_t pos, size_t end, uint8_t *out, size_t *out_pos) { 26 + uint32_t cp = 0; 27 + size_t i = pos + 3; 28 + 29 + while (i < end && is_xdigit(in[i])) { cp = (cp << 4) | unhex(in[i]); i++; } 30 + if (i < end && in[i] == '}') { 31 + *out_pos += utf8_encode(cp, (char *)&out[*out_pos]); 32 + return i - pos; 33 + } 34 + 35 + out[(*out_pos)++] = 'u'; 36 + return 0; 37 + } 38 + 39 + static size_t decode_unicode_fixed(const uint8_t *in, size_t pos, uint8_t *out, size_t *out_pos) { 40 + uint32_t cp = 41 + (unhex(in[pos + 2]) << 12U) | (unhex(in[pos + 3]) << 8U) | 42 + (unhex(in[pos + 4]) << 4U) | unhex(in[pos + 5]); 43 + 44 + *out_pos += utf8_encode(cp, (char *)&out[*out_pos]); 45 + return 4; 46 + } 47 + 48 + size_t decode_escape(const uint8_t *in, size_t pos, size_t end, uint8_t *out, size_t *out_pos, uint8_t quote) { 49 + uint8_t c = in[pos + 1]; 50 + size_t advance = 0; 51 + 52 + switch (c) { 53 + case 'n': out[(*out_pos)++] = '\n'; break; 54 + case 't': out[(*out_pos)++] = '\t'; break; 55 + case 'r': out[(*out_pos)++] = '\r'; break; 56 + case 'v': out[(*out_pos)++] = '\v'; break; 57 + case 'f': out[(*out_pos)++] = '\f'; break; 58 + 
case 'b': out[(*out_pos)++] = '\b'; break; 59 + case '\\': out[(*out_pos)++] = '\\'; break; 60 + case '0': 61 + if (!(in[pos + 2] >= '0' && in[pos + 2] <= '7')) { out[(*out_pos)++] = '\0'; break; } 62 + __attribute__((fallthrough)); 63 + case '1': case '2': case '3': case '4': case '5': case '6': case '7': 64 + advance = decode_octal_escape(in, pos, out, out_pos); 65 + break; 66 + case 'x': 67 + if (pos + 3 < end && is_xdigit(in[pos + 2]) && is_xdigit(in[pos + 3])) { 68 + advance = decode_hex_escape(in, pos, out, out_pos); 69 + } else out[(*out_pos)++] = c; 70 + break; 71 + case 'u': 72 + if (pos + 2 < end && in[pos + 2] == '{') { 73 + advance = decode_unicode_braced(in, pos, end, out, out_pos); 74 + } else if ( 75 + pos + 5 < end && is_xdigit(in[pos + 2]) && is_xdigit(in[pos + 3]) && 76 + is_xdigit(in[pos + 4]) && is_xdigit(in[pos + 5]) 77 + ) advance = decode_unicode_fixed(in, pos, out, out_pos); 78 + else out[(*out_pos)++] = c; 79 + break; 80 + default: 81 + out[(*out_pos)++] = (c == quote) ? quote : c; 82 + break; 83 + } 84 + 85 + return advance; 86 + }

+95

tests/bench_string_escape.js

// Micro-benchmarks for string-literal escape decoding.
// Each case assigns a literal containing one family of escape sequences inside
// a loop and reports the elapsed time. The literals themselves are the
// workload, so their source text must stay exactly as written.

// Millisecond timestamp: performance.now() when available, Date.now() otherwise.
const now = () => typeof performance !== 'undefined' && performance.now ? performance.now() : Date.now();

// Runs fn once untimed, then `iters` timed runs, and prints the total and
// per-iteration time in milliseconds under the label `name`.
function bench(name, fn, iters = 1) {
  fn();
  const t0 = now();
  for (let i = 0; i < iters; i++) fn();
  const dt = now() - t0;
  const per = (dt / iters).toFixed(3);
  console.log(`${name}: ${dt.toFixed(2)} ms total, ${per} ms/iter (${iters} iters)`);
}

// simple escape sequences
bench('simple escapes (\\n\\t\\r)', () => {
  let s = '';
  for (let i = 0; i < 5000; i++) s = "hello\tworld\nfoo\rbar";
}, 50);

// escaped single quote, double quote and backslash
bench('backslash + quote escapes', () => {
  let s = '';
  for (let i = 0; i < 5000; i++) s = "it\'s a \"test\" with \\backslash";
}, 50);

// hex escapes \xNN
bench('hex escapes (\\xNN)', () => {
  let s = '';
  for (let i = 0; i < 5000; i++) s = "\x41\x42\x43\x61\x62\x63\x00\xff";
}, 50);

// unicode escapes \uNNNN
bench('unicode 4-digit (\\uNNNN)', () => {
  let s = '';
  for (let i = 0; i < 5000; i++) s = "\u0041\u00e9\u4e16\u754c\u0048\u0065";
}, 50);

// unicode braced escapes \u{NNNNN}
bench('unicode braced (\\u{N+})', () => {
  let s = '';
  for (let i = 0; i < 5000; i++) s = "\u{41}\u{e9}\u{4e16}\u{1f600}\u{10ffff}";
}, 50);

// null escape
bench('null escape (\\0)', () => {
  let s = '';
  for (let i = 0; i < 5000; i++) s = "abc\0def\0ghi";
}, 50);

// mixed escapes in one string
bench('mixed escapes', () => {
  let s = '';
  for (let i = 0; i < 5000; i++) s = "tab\there\nnewline\r\x41\u0042\u{43}end\\done";
}, 50);

// template literals with escapes
bench('template literal escapes', () => {
  let s = '';
  const x = 42;
  for (let i = 0; i < 5000; i++) s = `\t\n\x41\u0042\u{43} val=${x}`;
}, 50);

// template literal with many interpolations
bench('template interpolation heavy', () => {
  let s = '';
  const a = 1, b = 2, c = 3, d = 4, e = 5;
  for (let i = 0; i < 5000; i++) s = `${a}-${b}-${c}-${d}-${e}`;
}, 50);

// long string with scattered escapes
bench('long string scattered escapes', () => {
  let s = '';
  for (let i = 0; i < 2000; i++) s = "aaaaaaaaaaaaaaaa\nbbbbbbbbbbbbbbbb\tcccccccccccccccc\rdddddddddddddddd\x41eeeeeeeeeeeeeeee\u0042ffffffffffffffff";
}, 50);

// string concatenation with escapes
// (uses += so the string grows across the 2000 inner iterations; fewer outer iterations)
bench('concat with escapes', () => {
  let s = '';
  for (let i = 0; i < 2000; i++) s += "\n\t\x41\u0042";
}, 20);

// octal escapes (legacy)
bench('octal escapes', () => {
  let s = '';
  for (let i = 0; i < 5000; i++) s = "\101\102\103\141\142\143";
}, 50);

// form feed, vertical tab, backspace
bench('rare escapes (\\v\\f\\b)', () => {
  let s = '';
  for (let i = 0; i < 5000; i++) s = "a\vb\fc\bd";
}, 50);

// no escapes baseline
bench('no escapes (baseline)', () => {
  let s = '';
  for (let i = 0; i < 5000; i++) s = "hello world this is a plain string with no escapes at all";
}, 50);

Configure Feed

Configure Feed