MIRROR: javascript for 🐜's, a tiny runtime with big ambitions
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at master 565 lines 16 kB view raw
1#include "utf8.h" 2#include "utils.h" 3#include "internal.h" 4#include "gc/objects.h" 5 6#include <stdlib.h> 7#include <string.h> 8#include <stdbool.h> 9#include <stddef.h> 10 11typedef struct { 12 uint64_t epoch; 13 const char *str; 14 size_t byte_len; 15 size_t byte_pos; 16 size_t utf16_pos; 17} utf16_scan_cache_t; 18 19typedef struct { 20 const char *str; 21 size_t byte_len; 22 const unsigned char *start; 23 const unsigned char *end; 24 const unsigned char *p; 25 size_t utf16_pos; 26} utf16_scan_cursor_t; 27 28static _Thread_local utf16_scan_cache_t utf16_scan_cache = { 0 }; 29 30static inline void utf16_scan_cache_sync_epoch(void) { 31 uint64_t epoch = gc_get_epoch(); 32 if (utf16_scan_cache.epoch == epoch) return; 33 utf16_scan_cache = (utf16_scan_cache_t){ .epoch = epoch }; 34} 35 36static inline void utf16_scan_cursor_init( 37 utf16_scan_cursor_t *cursor, 38 const char *str, 39 size_t byte_len 40) { 41 utf16_scan_cache_sync_epoch(); 42 cursor->str = str; 43 cursor->byte_len = byte_len; 44 cursor->start = (const unsigned char *)str; 45 cursor->end = cursor->start + byte_len; 46 cursor->p = cursor->start; 47 cursor->utf16_pos = 0; 48} 49 50static inline bool utf16_scan_cache_matches(const utf16_scan_cursor_t *cursor) { 51 return utf16_scan_cache.str == cursor->str 52 && utf16_scan_cache.byte_pos <= cursor->byte_len; 53} 54 55static inline void utf16_scan_cursor_resume_cached(utf16_scan_cursor_t *cursor) { 56 if (!utf16_scan_cache_matches(cursor)) return; 57 cursor->p = cursor->start + utf16_scan_cache.byte_pos; 58 cursor->utf16_pos = utf16_scan_cache.utf16_pos; 59} 60 61static inline void utf16_scan_cursor_resume_utf16( 62 utf16_scan_cursor_t *cursor, 63 size_t target_utf16 64) { 65 if (!utf16_scan_cache_matches(cursor)) return; 66 if (target_utf16 < utf16_scan_cache.utf16_pos) return; 67 cursor->p = cursor->start + utf16_scan_cache.byte_pos; 68 cursor->utf16_pos = utf16_scan_cache.utf16_pos; 69} 70 71static inline void utf16_scan_cursor_resume_byte( 72 
utf16_scan_cursor_t *cursor, 73 size_t target_byte 74) { 75 if (!utf16_scan_cache_matches(cursor)) return; 76 if (target_byte < utf16_scan_cache.byte_pos) return; 77 cursor->p = cursor->start + utf16_scan_cache.byte_pos; 78 cursor->utf16_pos = utf16_scan_cache.utf16_pos; 79} 80 81static inline void utf16_scan_cursor_store(const utf16_scan_cursor_t *cursor) { 82 utf16_scan_cache.str = cursor->str; 83 utf16_scan_cache.byte_len = cursor->byte_len; 84 utf16_scan_cache.byte_pos = (size_t)(cursor->p - cursor->start); 85 utf16_scan_cache.utf16_pos = cursor->utf16_pos; 86} 87 88static inline void utf16_scan_decode( 89 const unsigned char *p, 90 const unsigned char *end, 91 size_t *slen_out, 92 size_t *units_out, 93 uint32_t *cp_out 94) { 95 unsigned char c = *p; 96 if (c < 0x80) { 97 if (cp_out) *cp_out = c; 98 *slen_out = 1; 99 *units_out = 1; 100 return; 101 } 102 103 if ((c & 0xE0) == 0xC0) { 104 if (cp_out && p + 1 < end) { 105 *cp_out = ((uint32_t)(c & 0x1F) << 6) | (uint32_t)(p[1] & 0x3F); 106 *slen_out = 2; 107 *units_out = 1; 108 return; 109 } 110 if (!cp_out) { 111 *slen_out = 2; 112 *units_out = 1; 113 return; 114 } 115 } else if ((c & 0xF0) == 0xE0) { 116 if (cp_out && p + 2 < end) { 117 *cp_out = ((uint32_t)(c & 0x0F) << 12) 118 | ((uint32_t)(p[1] & 0x3F) << 6) 119 | (uint32_t)(p[2] & 0x3F); 120 *slen_out = 3; 121 *units_out = 1; 122 return; 123 } 124 if (!cp_out) { 125 *slen_out = 3; 126 *units_out = 1; 127 return; 128 } 129 } else if ((c & 0xF8) == 0xF0) { 130 if (cp_out && p + 3 < end) { 131 *cp_out = ((uint32_t)(c & 0x07) << 18) 132 | ((uint32_t)(p[1] & 0x3F) << 12) 133 | ((uint32_t)(p[2] & 0x3F) << 6) 134 | (uint32_t)(p[3] & 0x3F); 135 *slen_out = 4; 136 *units_out = 2; 137 return; 138 } 139 if (!cp_out) { 140 *slen_out = 4; 141 *units_out = 2; 142 return; 143 } 144 } 145 146 if (cp_out) *cp_out = c; 147 *slen_out = 1; 148 *units_out = 1; 149} 150 151static inline bool utf16_scan_cursor_advance( 152 utf16_scan_cursor_t *cursor, 153 const unsigned char 
*bound_end 154) { 155 size_t slen, units; 156 const unsigned char *next; 157 158 utf16_scan_decode(cursor->p, cursor->end, &slen, &units, NULL); 159 next = cursor->p + slen; 160 cursor->utf16_pos += units; 161 if (next > bound_end) { 162 cursor->p = bound_end; 163 return false; 164 } 165 cursor->p = next; 166 return true; 167} 168 169static uint32_t utf8_decode(const unsigned char *buf, size_t len, int *seq_len) { 170 if (len == 0) { *seq_len = 0; return 0; } 171 utf8proc_int32_t cp; 172 *seq_len = (int)utf8_next(buf, (utf8proc_ssize_t)len, &cp); 173 return cp < 0 ? 0xFFFD : (uint32_t)cp; 174} 175 176static bool utf8_json_quote_reserve(char **buf, size_t *cap, size_t need) { 177 if (need <= *cap) return true; 178 179 size_t next = *cap ? *cap * 2 : 64; 180 while (next < need) next *= 2; 181 182 char *tmp = realloc(*buf, next); 183 if (!tmp) return false; 184 *buf = tmp; 185 *cap = next; 186 return true; 187} 188 189static bool utf8_json_quote_append( 190 char **buf, size_t *len, size_t *cap, const void *src, size_t src_len 191) { 192 if (!utf8_json_quote_reserve(buf, cap, *len + src_len + 1)) return false; 193 memcpy(*buf + *len, src, src_len); 194 *len += src_len; 195 (*buf)[*len] = '\0'; 196 return true; 197} 198 199static bool utf8_json_quote_append_char(char **buf, size_t *len, size_t *cap, char ch) { 200 return utf8_json_quote_append(buf, len, cap, &ch, 1); 201} 202 203static bool utf8_json_quote_append_u_escape( 204 char **buf, size_t *len, size_t *cap, uint32_t code_unit 205) { 206 char escape[6] = { 207 '\\', 'u', 208 hex_char((int)(code_unit >> 12)), 209 hex_char((int)(code_unit >> 8)), 210 hex_char((int)(code_unit >> 4)), 211 hex_char((int)code_unit), 212 }; 213 return utf8_json_quote_append(buf, len, cap, escape, sizeof(escape)); 214} 215 216char *utf8_json_quote(const char *str, size_t byte_len, size_t *out_len) { 217 size_t utf16_len = utf16_strlen(str, byte_len); 218 size_t raw_len = 0; 219 size_t raw_cap = byte_len + 4; 220 221 char *raw = 
malloc(raw_cap); 222 if (!raw) return NULL; 223 224 if (!utf8_json_quote_append_char(&raw, &raw_len, &raw_cap, '"')) goto oom; 225 226 for (size_t i = 0; i < utf16_len; i++) { 227 uint32_t cu = utf16_code_unit_at(str, byte_len, i); 228 229 if (cu >= 0xD800 && cu <= 0xDBFF && i + 1 < utf16_len) { 230 uint32_t cu2 = utf16_code_unit_at(str, byte_len, i + 1); 231 if (cu2 >= 0xDC00 && cu2 <= 0xDFFF) { 232 uint32_t cp = utf16_codepoint_at(str, byte_len, i); 233 char utf8[4]; 234 int n = utf8_encode(cp, utf8); 235 if (n <= 0 || !utf8_json_quote_append(&raw, &raw_len, &raw_cap, utf8, (size_t)n)) goto oom; 236 i++; 237 continue; 238 }} 239 240 if (cu >= 0xD800 && cu <= 0xDFFF) { 241 if (!utf8_json_quote_append_u_escape(&raw, &raw_len, &raw_cap, cu)) goto oom; 242 continue; 243 } 244 245 switch (cu) { 246 case '"': if (!utf8_json_quote_append(&raw, &raw_len, &raw_cap, "\\\"", 2)) goto oom; continue; 247 case '\\': if (!utf8_json_quote_append(&raw, &raw_len, &raw_cap, "\\\\", 2)) goto oom; continue; 248 case '\b': if (!utf8_json_quote_append(&raw, &raw_len, &raw_cap, "\\b", 2)) goto oom; continue; 249 case '\f': if (!utf8_json_quote_append(&raw, &raw_len, &raw_cap, "\\f", 2)) goto oom; continue; 250 case '\n': if (!utf8_json_quote_append(&raw, &raw_len, &raw_cap, "\\n", 2)) goto oom; continue; 251 case '\r': if (!utf8_json_quote_append(&raw, &raw_len, &raw_cap, "\\r", 2)) goto oom; continue; 252 case '\t': if (!utf8_json_quote_append(&raw, &raw_len, &raw_cap, "\\t", 2)) goto oom; continue; 253 default: break; 254 } 255 256 if (cu < 0x20) { 257 if (!utf8_json_quote_append_u_escape(&raw, &raw_len, &raw_cap, cu)) goto oom; 258 continue; 259 } 260 261 char utf8[4]; 262 int n = utf8_encode(cu, utf8); 263 if (n <= 0 || !utf8_json_quote_append(&raw, &raw_len, &raw_cap, utf8, (size_t)n)) goto oom; 264 } 265 266 if (!utf8_json_quote_append_char(&raw, &raw_len, &raw_cap, '"')) goto oom; 267 if (out_len) *out_len = raw_len; 268 return raw; 269 270oom: 271 free(raw); 272 if (out_len) 
*out_len = 0; 273 return NULL; 274} 275 276size_t utf8_char_len_at(const char *str, size_t byte_len, size_t pos) { 277 if (pos >= byte_len) return 1; 278 int seq = utf8_sequence_length((unsigned char)str[pos]); 279 if (seq <= 0) return 1; 280 if (pos + (size_t)seq > byte_len) return byte_len - pos; 281 return (size_t)seq; 282} 283 284size_t utf8_strlen(const char *str, size_t byte_len) { 285 size_t count = 0; 286 const unsigned char *p = (const unsigned char *)str; 287 const unsigned char *end = p + byte_len; 288 while (p < end) { 289 int seq_len = utf8_sequence_length(*p); 290 if (seq_len <= 0 || (size_t)seq_len > (size_t)(end - p)) { 291 count++; p++; 292 } else { count++; p += seq_len; } 293 } 294 return count; 295} 296 297size_t utf16_strlen(const char *str, size_t byte_len) { 298 if (str_is_ascii(str)) return byte_len; 299 300 utf16_scan_cursor_t cursor; 301 utf16_scan_cursor_init(&cursor, str, byte_len); 302 utf16_scan_cursor_resume_cached(&cursor); 303 304 while (cursor.p < cursor.end) { 305 utf16_scan_cursor_advance(&cursor, cursor.end); 306 } 307 308 utf16_scan_cursor_store(&cursor); 309 return cursor.utf16_pos; 310} 311 312int utf16_index_to_byte_offset( 313 const char *str, 314 size_t byte_len, 315 size_t utf16_idx, 316 size_t *out_char_bytes 317) { 318 if (str_is_ascii(str)) { 319 if (utf16_idx > byte_len) return -1; 320 if (out_char_bytes) *out_char_bytes = (utf16_idx < byte_len) ? 
1 : 0; 321 return (int)utf16_idx; 322 } 323 324 utf16_scan_cursor_t cursor; 325 utf16_scan_cursor_init(&cursor, str, byte_len); 326 utf16_scan_cursor_resume_utf16(&cursor, utf16_idx); 327 328 while (cursor.p < cursor.end && cursor.utf16_pos < utf16_idx) { 329 utf16_scan_cursor_advance(&cursor, cursor.end); 330 } 331 332 if (cursor.p >= cursor.end) { 333 if (cursor.utf16_pos == utf16_idx) { 334 if (out_char_bytes) *out_char_bytes = 0; 335 utf16_scan_cursor_store(&cursor); 336 return (int)byte_len; 337 } 338 utf16_scan_cursor_store(&cursor); 339 return -1; 340 } 341 342 size_t slen, units; 343 utf16_scan_decode(cursor.p, cursor.end, &slen, &units, NULL); 344 345 if (out_char_bytes) *out_char_bytes = slen; 346 utf16_scan_cursor_store(&cursor); 347 return (int)(cursor.p - cursor.start); 348} 349 350int utf16_range_to_byte_range( 351 const char *str, 352 size_t byte_len, 353 size_t utf16_start, 354 size_t utf16_end, 355 size_t *byte_start, 356 size_t *byte_end 357) { 358 if (str_is_ascii(str)) { 359 *byte_start = (utf16_start <= byte_len) ? utf16_start : byte_len; 360 *byte_end = (utf16_end <= byte_len) ? 
utf16_end : byte_len; 361 return 0; 362 } 363 364 utf16_scan_cursor_t cursor; 365 utf16_scan_cursor_init(&cursor, str, byte_len); 366 utf16_scan_cursor_resume_utf16(&cursor, utf16_start); 367 368 size_t b_start = 0, b_end = byte_len; 369 int found_start = 0, found_end = 0; 370 371 while (cursor.p < cursor.end) { 372 if (cursor.utf16_pos == utf16_start) { 373 b_start = (size_t)(cursor.p - cursor.start); 374 found_start = 1; 375 } 376 if (cursor.utf16_pos == utf16_end) { 377 b_end = (size_t)(cursor.p - cursor.start); 378 found_end = 1; 379 break; 380 } 381 utf16_scan_cursor_advance(&cursor, cursor.end); 382 } 383 384 if (!found_start && utf16_start >= cursor.utf16_pos) b_start = byte_len; 385 if (!found_end && utf16_end >= cursor.utf16_pos) b_end = byte_len; 386 387 *byte_start = b_start; 388 *byte_end = b_end; 389 utf16_scan_cursor_store(&cursor); 390 391 return 0; 392} 393 394size_t byte_offset_to_utf16(const char *str, size_t byte_off) { 395 if (str_is_ascii(str)) return byte_off; 396 397 utf16_scan_cursor_t cursor; 398 const unsigned char *bound_end; 399 bool ended_on_boundary = true; 400 401 utf16_scan_cursor_init(&cursor, str, byte_off); 402 utf16_scan_cursor_resume_byte(&cursor, byte_off); 403 bound_end = cursor.start + byte_off; 404 405 while (cursor.p < bound_end) { 406 if (!utf16_scan_cursor_advance(&cursor, bound_end)) { 407 ended_on_boundary = false; 408 break; 409 } 410 } 411 412 if (ended_on_boundary) utf16_scan_cursor_store(&cursor); 413 return cursor.utf16_pos; 414} 415 416uint32_t utf16_code_unit_at(const char *str, size_t byte_len, size_t utf16_idx) { 417 if (str_is_ascii(str)) { 418 if (utf16_idx >= byte_len) return 0xFFFFFFFF; 419 return (unsigned char)str[utf16_idx]; 420 } 421 422 utf16_scan_cursor_t cursor; 423 utf16_scan_cursor_init(&cursor, str, byte_len); 424 utf16_scan_cursor_resume_utf16(&cursor, utf16_idx); 425 426 while (cursor.p < cursor.end) { 427 size_t slen, units; 428 uint32_t cp; 429 430 utf16_scan_decode(cursor.p, cursor.end, 
&slen, &units, &cp); 431 432 if (cursor.utf16_pos == utf16_idx) { 433 utf16_scan_cursor_store(&cursor); 434 if (units == 2) return 0xD800 + ((cp - 0x10000) >> 10); 435 return cp; 436 } 437 if (units == 2 && cursor.utf16_pos + 1 == utf16_idx) { 438 utf16_scan_cursor_store(&cursor); 439 return 0xDC00 + ((cp - 0x10000) & 0x3FF); 440 } 441 cursor.p += slen; 442 cursor.utf16_pos += units; 443 } 444 445 utf16_scan_cursor_store(&cursor); 446 return 0xFFFFFFFF; 447} 448 449utf8proc_ssize_t utf8_whatwg_decode( 450 utf8_dec_t *dec, const uint8_t *src, size_t len, 451 char *out, bool fatal, bool stream 452) { 453 static const void *tbl[256] = { 454 [0x00 ... 0x7F] = &&L_ASCII, 455 [0x80 ... 0xBF] = &&L_LONE, 456 [0xC0 ... 0xC1] = &&L_BAD, 457 [0xC2 ... 0xDF] = &&L_2, 458 [0xE0] = &&L_E0, 459 [0xE1 ... 0xEC] = &&L_3, 460 [0xED] = &&L_ED, 461 [0xEE ... 0xEF] = &&L_3, 462 [0xF0] = &&L_F0, 463 [0xF1 ... 0xF3] = &&L_4, 464 [0xF4] = &&L_F4, 465 [0xF5 ... 0xFF] = &&L_BAD, 466 }; 467 468 size_t i = 0, o = 0; 469 int bc = 0; 470 471 uint8_t lo = 0x80, hi = 0xBF; 472 utf8proc_int32_t cp = 0; 473 uint8_t pb[4]; int pp = 0; 474 475#define FFFD() do { out[o++]=(char)0xEF; out[o++]=(char)0xBF; out[o++]=(char)0xBD; } while(0) 476#define NEXT() do { i++; if (i < len) goto *tbl[src[i]]; goto done; } while(0) 477 478 if (!len) goto done; 479 goto *tbl[src[0]]; 480 481L_ASCII: 482 dec->bom_seen = true; 483 out[o++] = (char)src[i]; 484 NEXT(); 485 486L_LONE: 487L_BAD: 488 if (fatal) return -1; 489 FFFD(); dec->bom_seen = true; 490 NEXT(); 491 492L_E0: bc=2; lo=0xA0; hi=0xBF; cp=src[i]&0x0F; pb[0]=src[i]; pp=1; i++; goto cont; 493L_ED: bc=2; lo=0x80; hi=0x9F; cp=src[i]&0x0F; pb[0]=src[i]; pp=1; i++; goto cont; 494L_3: bc=2; lo=0x80; hi=0xBF; cp=src[i]&0x0F; pb[0]=src[i]; pp=1; i++; goto cont; 495L_F0: bc=3; lo=0x90; hi=0xBF; cp=src[i]&0x07; pb[0]=src[i]; pp=1; i++; goto cont; 496L_F4: bc=3; lo=0x80; hi=0x8F; cp=src[i]&0x07; pb[0]=src[i]; pp=1; i++; goto cont; 497L_4: bc=3; lo=0x80; hi=0xBF; 
cp=src[i]&0x07; pb[0]=src[i]; pp=1; i++; goto cont; 498L_2: bc=1; lo=0x80; hi=0xBF; cp=src[i]&0x1F; pb[0]=src[i]; pp=1; i++; goto cont; 499 500cont: 501 while (bc > 0) { 502 if (i >= len) { 503 if (stream) { dec->pend_pos = pp; memcpy(dec->pend_buf, pb, pp); } 504 else { if (fatal) return -1; FFFD(); } 505 goto done; 506 } 507 uint8_t b = src[i]; 508 if (b < lo || b > hi) { 509 bc = 0; cp = 0; pp = 0; 510 if (fatal) return -1; 511 FFFD(); dec->bom_seen = true; 512 goto *tbl[b]; 513 } 514 lo = 0x80; hi = 0xBF; 515 cp = (cp << 6) | (b & 0x3F); 516 pb[pp++] = b; bc--; i++; 517 } 518 pp = 0; 519 if (!dec->bom_seen && cp == 0xFEFF && !dec->ignore_bom) dec->bom_seen = true; 520 else { 521 dec->bom_seen = true; 522 utf8proc_ssize_t n = utf8proc_encode_char(cp, (utf8proc_uint8_t *)(out + o)); 523 if (n > 0) o += (size_t)n; 524 } 525 cp = 0; 526 if (i < len) goto *tbl[src[i]]; 527 528done: 529#undef FFFD 530#undef NEXT 531 return (utf8proc_ssize_t)o; 532} 533 534uint32_t utf16_codepoint_at(const char *str, size_t byte_len, size_t utf16_idx) { 535 if (str_is_ascii(str)) { 536 if (utf16_idx >= byte_len) return 0xFFFFFFFF; 537 return (unsigned char)str[utf16_idx]; 538 } 539 540 utf16_scan_cursor_t cursor; 541 utf16_scan_cursor_init(&cursor, str, byte_len); 542 utf16_scan_cursor_resume_utf16(&cursor, utf16_idx); 543 544 while (cursor.p < cursor.end) { 545 size_t slen, units; 546 uint32_t cp; 547 548 utf16_scan_decode(cursor.p, cursor.end, &slen, &units, &cp); 549 550 if (cursor.utf16_pos == utf16_idx) { 551 utf16_scan_cursor_store(&cursor); 552 return cp; 553 } 554 if (units == 2 && cursor.utf16_pos + 1 == utf16_idx) { 555 utf16_scan_cursor_store(&cursor); 556 return 0xDC00 + ((cp - 0x10000) & 0x3FF); 557 } 558 559 cursor.p += slen; 560 cursor.utf16_pos += units; 561 } 562 563 utf16_scan_cursor_store(&cursor); 564 return 0xFFFFFFFF; 565}