MIRROR: javascript for 🐜's, a tiny runtime with big ambitions
1#include "utf8.h"
2#include "utils.h"
3#include "internal.h"
4#include "gc/objects.h"
5
6#include <stdlib.h>
7#include <string.h>
8#include <stdbool.h>
9#include <stddef.h>
10
/*
 * Remembered position of the most recent UTF-16 scan, so that a later scan
 * over the same string pointer can pick up where the previous one stopped
 * instead of restarting at byte 0.
 */
typedef struct {
  uint64_t    epoch;      // gc_get_epoch() value this entry was recorded under
  const char *str;        // string the position belongs to (compared by pointer)
  size_t      byte_len;   // byte length the recording scan was started with
  size_t      byte_pos;   // byte offset reached inside str
  size_t      utf16_pos;  // UTF-16 code units counted up to byte_pos
} utf16_scan_cache_t;
18
/*
 * Forward-only scan state over one UTF-8 buffer, tracking the byte position
 * and the number of UTF-16 code units consumed so far.
 */
typedef struct {
  const char          *str;        // buffer as handed in by the caller
  size_t               byte_len;   // number of bytes the scan may cover
  const unsigned char *start;      // str viewed as unsigned bytes
  const unsigned char *end;        // start + byte_len (one past the last byte)
  const unsigned char *p;          // current read position, start <= p <= end
  size_t               utf16_pos;  // UTF-16 code units counted before p
} utf16_scan_cursor_t;
27
28static _Thread_local utf16_scan_cache_t utf16_scan_cache = { 0 };
29
30static inline void utf16_scan_cache_sync_epoch(void) {
31 uint64_t epoch = gc_get_epoch();
32 if (utf16_scan_cache.epoch == epoch) return;
33 utf16_scan_cache = (utf16_scan_cache_t){ .epoch = epoch };
34}
35
36static inline void utf16_scan_cursor_init(
37 utf16_scan_cursor_t *cursor,
38 const char *str,
39 size_t byte_len
40) {
41 utf16_scan_cache_sync_epoch();
42 cursor->str = str;
43 cursor->byte_len = byte_len;
44 cursor->start = (const unsigned char *)str;
45 cursor->end = cursor->start + byte_len;
46 cursor->p = cursor->start;
47 cursor->utf16_pos = 0;
48}
49
50static inline bool utf16_scan_cache_matches(const utf16_scan_cursor_t *cursor) {
51 return utf16_scan_cache.str == cursor->str
52 && utf16_scan_cache.byte_pos <= cursor->byte_len;
53}
54
55static inline void utf16_scan_cursor_resume_cached(utf16_scan_cursor_t *cursor) {
56 if (!utf16_scan_cache_matches(cursor)) return;
57 cursor->p = cursor->start + utf16_scan_cache.byte_pos;
58 cursor->utf16_pos = utf16_scan_cache.utf16_pos;
59}
60
61static inline void utf16_scan_cursor_resume_utf16(
62 utf16_scan_cursor_t *cursor,
63 size_t target_utf16
64) {
65 if (!utf16_scan_cache_matches(cursor)) return;
66 if (target_utf16 < utf16_scan_cache.utf16_pos) return;
67 cursor->p = cursor->start + utf16_scan_cache.byte_pos;
68 cursor->utf16_pos = utf16_scan_cache.utf16_pos;
69}
70
71static inline void utf16_scan_cursor_resume_byte(
72 utf16_scan_cursor_t *cursor,
73 size_t target_byte
74) {
75 if (!utf16_scan_cache_matches(cursor)) return;
76 if (target_byte < utf16_scan_cache.byte_pos) return;
77 cursor->p = cursor->start + utf16_scan_cache.byte_pos;
78 cursor->utf16_pos = utf16_scan_cache.utf16_pos;
79}
80
81static inline void utf16_scan_cursor_store(const utf16_scan_cursor_t *cursor) {
82 utf16_scan_cache.str = cursor->str;
83 utf16_scan_cache.byte_len = cursor->byte_len;
84 utf16_scan_cache.byte_pos = (size_t)(cursor->p - cursor->start);
85 utf16_scan_cache.utf16_pos = cursor->utf16_pos;
86}
87
/*
 * Classify the UTF-8 sequence starting at `p`.
 *
 *   slen_out   bytes the sequence occupies (1..4)
 *   units_out  UTF-16 code units it maps to (2 for a 4-byte sequence, else 1)
 *   cp_out     optional; receives the decoded value when non-NULL
 *
 * Two modes, selected by `cp_out`:
 *   - cp_out == NULL: the length is taken from the lead byte alone and is NOT
 *     checked against `end`, so it may reach past the buffer. Callers compare
 *     the resulting position against their own bound.
 *   - cp_out != NULL: a multi-byte sequence that does not fit before `end` is
 *     reported as a single byte whose value is the lead byte itself.
 *
 * Bytes that are not a valid lead (0x80..0xBF, 0xF8..0xFF) are always
 * reported as one byte / one unit. Continuation bytes are not validated.
 * `p` must be dereferenceable.
 */
static inline void utf16_scan_decode(
  const unsigned char *p,
  const unsigned char *end,
  size_t *slen_out,
  size_t *units_out,
  uint32_t *cp_out
) {
  const unsigned char lead = *p;
  size_t seq_bytes = 1;
  size_t seq_units = 1;

  // Nominal length implied by the lead byte.
  if (lead >= 0x80) {
    if ((lead & 0xE0) == 0xC0) {
      seq_bytes = 2;
    } else if ((lead & 0xF0) == 0xE0) {
      seq_bytes = 3;
    } else if ((lead & 0xF8) == 0xF0) {
      seq_bytes = 4;
      seq_units = 2;
    }
  }

  // Only the decoding mode falls back to a single byte on truncation.
  if (cp_out && seq_bytes > 1 && (end - p) < (ptrdiff_t)seq_bytes) {
    seq_bytes = 1;
    seq_units = 1;
  }

  if (cp_out) {
    switch (seq_bytes) {
      case 2:
        *cp_out = ((uint32_t)(lead & 0x1F) << 6)
                | (uint32_t)(p[1] & 0x3F);
        break;
      case 3:
        *cp_out = ((uint32_t)(lead & 0x0F) << 12)
                | ((uint32_t)(p[1] & 0x3F) << 6)
                | (uint32_t)(p[2] & 0x3F);
        break;
      case 4:
        *cp_out = ((uint32_t)(lead & 0x07) << 18)
                | ((uint32_t)(p[1] & 0x3F) << 12)
                | ((uint32_t)(p[2] & 0x3F) << 6)
                | (uint32_t)(p[3] & 0x3F);
        break;
      default:
        *cp_out = lead;
        break;
    }
  }

  *slen_out = seq_bytes;
  *units_out = seq_units;
}
150
151static inline bool utf16_scan_cursor_advance(
152 utf16_scan_cursor_t *cursor,
153 const unsigned char *bound_end
154) {
155 size_t slen, units;
156 const unsigned char *next;
157
158 utf16_scan_decode(cursor->p, cursor->end, &slen, &units, NULL);
159 next = cursor->p + slen;
160 cursor->utf16_pos += units;
161 if (next > bound_end) {
162 cursor->p = bound_end;
163 return false;
164 }
165 cursor->p = next;
166 return true;
167}
168
169static uint32_t utf8_decode(const unsigned char *buf, size_t len, int *seq_len) {
170 if (len == 0) { *seq_len = 0; return 0; }
171 utf8proc_int32_t cp;
172 *seq_len = (int)utf8_next(buf, (utf8proc_ssize_t)len, &cp);
173 return cp < 0 ? 0xFFFD : (uint32_t)cp;
174}
175
/*
 * Make sure the quote buffer can hold at least `need` bytes.
 *
 * Capacity grows by doubling, starting at 64 when the buffer is empty. On
 * success *buf / *cap describe the (possibly moved) buffer and true is
 * returned. On failure false is returned and *buf / *cap are left untouched,
 * so the caller still owns the previous allocation.
 *
 * A request that cannot be reached by doubling without wrapping size_t is
 * rejected up front; previously the doubling loop wrapped to 0 and never
 * terminated for such a request.
 */
static bool utf8_json_quote_reserve(char **buf, size_t *cap, size_t need) {
  if (need <= *cap) return true;

  const size_t max_size = (size_t)-1;
  size_t next = 64;

  if (*cap) {
    if (*cap > max_size / 2) return false;
    next = *cap * 2;
  }

  while (next < need) {
    if (next > max_size / 2) return false;
    next *= 2;
  }

  // Keep the old pointer until realloc has succeeded.
  char *grown = realloc(*buf, next);
  if (!grown) return false;

  *buf = grown;
  *cap = next;
  return true;
}
188
/*
 * Append `src_len` bytes from `src` to the growing buffer and keep it
 * NUL-terminated. *len counts the payload only, not the terminator.
 * Returns false (buffer unchanged) when the buffer could not be grown.
 */
static bool utf8_json_quote_append(
  char **buf, size_t *len, size_t *cap, const void *src, size_t src_len
) {
  const size_t needed = *len + src_len + 1;  // +1 leaves room for the '\0'
  if (!utf8_json_quote_reserve(buf, cap, needed)) return false;

  char *dst = *buf + *len;
  memcpy(dst, src, src_len);
  dst[src_len] = '\0';

  *len += src_len;
  return true;
}
198
/* Append a single byte to the growing buffer; false when it could not grow. */
static bool utf8_json_quote_append_char(char **buf, size_t *len, size_t *cap, char ch) {
  const char one[1] = { ch };
  return utf8_json_quote_append(buf, len, cap, one, sizeof one);
}
202
/*
 * Append `code_unit` as a six-character escape: backslash, 'u', then four
 * hex digits, most significant first.
 *
 * The shifted value is handed to hex_char() unmasked.
 * NOTE(review): this relies on hex_char() looking only at the low four bits
 * of its argument - confirm against its definition.
 */
static bool utf8_json_quote_append_u_escape(
  char **buf, size_t *len, size_t *cap, uint32_t code_unit
) {
  char escape[6];
  escape[0] = '\\';
  escape[1] = 'u';

  for (int digit = 0; digit < 4; digit++) {
    const unsigned shift = 12u - 4u * (unsigned)digit;
    escape[2 + digit] = hex_char((int)(code_unit >> shift));
  }

  return utf8_json_quote_append(buf, len, cap, escape, sizeof(escape));
}
215
/* Two-character JSON escape for `cu`, or NULL when it has no short form. */
static const char *utf8_json_quote_short_escape(uint32_t cu) {
  switch (cu) {
    case '"':  return "\\\"";
    case '\\': return "\\\\";
    case '\b': return "\\b";
    case '\f': return "\\f";
    case '\n': return "\\n";
    case '\r': return "\\r";
    case '\t': return "\\t";
    default:   return NULL;
  }
}

/*
 * Produce a JSON string literal (including the surrounding double quotes)
 * for the UTF-8 buffer `str` of `byte_len` bytes.
 *
 *   - '"', '\\' and the control characters \b \f \n \r \t use their short
 *     escapes; other code units below 0x20 become \u00XX.
 *   - A high surrogate directly followed by a low surrogate is written as
 *     the UTF-8 encoding of the combined code point.
 *   - Any other surrogate code unit is written as a \uXXXX escape.
 *   - Everything else is re-encoded with utf8_encode().
 *
 * Returns a malloc-family buffer that is NUL-terminated; the caller is
 * responsible for releasing it. When `out_len` is non-NULL it receives the
 * length in bytes excluding the terminator. On failure NULL is returned and
 * *out_len (when given) is set to 0.
 */
char *utf8_json_quote(const char *str, size_t byte_len, size_t *out_len) {
  size_t utf16_len = utf16_strlen(str, byte_len);
  size_t raw_len = 0;
  size_t raw_cap = byte_len + 4;

  char *raw = malloc(raw_cap);
  if (!raw) goto oom;  // free(NULL) is a no-op; this also clears *out_len

  if (!utf8_json_quote_append_char(&raw, &raw_len, &raw_cap, '"')) goto oom;

  for (size_t i = 0; i < utf16_len; i++) {
    uint32_t cu = utf16_code_unit_at(str, byte_len, i);

    // utf16_code_unit_at() signals "no such index" with 0xFFFFFFFF. For a
    // truncated trailing sequence utf16_strlen() can count more units than
    // can actually be indexed, so treat the sentinel as end of input rather
    // than passing it on to utf8_encode().
    if (cu == 0xFFFFFFFFu) break;

    if (cu >= 0xD800 && cu <= 0xDBFF && i + 1 < utf16_len) {
      uint32_t cu2 = utf16_code_unit_at(str, byte_len, i + 1);
      if (cu2 >= 0xDC00 && cu2 <= 0xDFFF) {
        // Combine the two halves directly. Re-reading the code point from
        // the string only works when the pair is stored as one 4-byte
        // sequence; two adjacent 3-byte surrogates would yield just the
        // high half.
        uint32_t cp = 0x10000u + ((cu - 0xD800u) << 10) + (cu2 - 0xDC00u);
        char pair_utf8[4];
        int pair_n = utf8_encode(cp, pair_utf8);
        if (pair_n <= 0
            || !utf8_json_quote_append(&raw, &raw_len, &raw_cap, pair_utf8, (size_t)pair_n)) {
          goto oom;
        }
        i++;  // the low surrogate has been consumed as well
        continue;
      }
    }

    // Unpaired surrogate: keep it representable by escaping it.
    if (cu >= 0xD800 && cu <= 0xDFFF) {
      if (!utf8_json_quote_append_u_escape(&raw, &raw_len, &raw_cap, cu)) goto oom;
      continue;
    }

    const char *short_escape = utf8_json_quote_short_escape(cu);
    if (short_escape) {
      if (!utf8_json_quote_append(&raw, &raw_len, &raw_cap, short_escape, 2)) goto oom;
      continue;
    }

    if (cu < 0x20) {
      if (!utf8_json_quote_append_u_escape(&raw, &raw_len, &raw_cap, cu)) goto oom;
      continue;
    }

    char utf8[4];
    int n = utf8_encode(cu, utf8);
    if (n <= 0 || !utf8_json_quote_append(&raw, &raw_len, &raw_cap, utf8, (size_t)n)) goto oom;
  }

  if (!utf8_json_quote_append_char(&raw, &raw_len, &raw_cap, '"')) goto oom;
  if (out_len) *out_len = raw_len;
  return raw;

oom:
  free(raw);
  if (out_len) *out_len = 0;
  return NULL;
}
275
/*
 * Byte length of the UTF-8 sequence starting at `pos`.
 *
 * Returns 1 when `pos` is at or past the end, or when
 * utf8_sequence_length() reports a non-positive length for the byte there.
 * A sequence that would run past the end is clamped to the bytes that remain.
 */
size_t utf8_char_len_at(const char *str, size_t byte_len, size_t pos) {
  if (pos >= byte_len) return 1;

  const int seq = utf8_sequence_length((unsigned char)str[pos]);
  if (seq <= 0) return 1;

  const size_t remaining = byte_len - pos;
  const size_t wanted = (size_t)seq;
  return wanted > remaining ? remaining : wanted;
}
283
/*
 * Count the UTF-8 sequences in the first `byte_len` bytes of `str`.
 *
 * A byte for which utf8_sequence_length() reports a non-positive length, or
 * a sequence that does not fit in the remaining bytes, is counted as one
 * item and skipped one byte at a time.
 */
size_t utf8_strlen(const char *str, size_t byte_len) {
  const unsigned char *cur = (const unsigned char *)str;
  const unsigned char *const stop = cur + byte_len;
  size_t total = 0;

  while (cur < stop) {
    const int seq = utf8_sequence_length(*cur);
    const size_t left = (size_t)(stop - cur);
    const bool usable = seq > 0 && (size_t)seq <= left;

    cur += usable ? (size_t)seq : 1;
    total++;
  }

  return total;
}
296
297size_t utf16_strlen(const char *str, size_t byte_len) {
298 if (str_is_ascii(str)) return byte_len;
299
300 utf16_scan_cursor_t cursor;
301 utf16_scan_cursor_init(&cursor, str, byte_len);
302 utf16_scan_cursor_resume_cached(&cursor);
303
304 while (cursor.p < cursor.end) {
305 utf16_scan_cursor_advance(&cursor, cursor.end);
306 }
307
308 utf16_scan_cursor_store(&cursor);
309 return cursor.utf16_pos;
310}
311
312int utf16_index_to_byte_offset(
313 const char *str,
314 size_t byte_len,
315 size_t utf16_idx,
316 size_t *out_char_bytes
317) {
318 if (str_is_ascii(str)) {
319 if (utf16_idx > byte_len) return -1;
320 if (out_char_bytes) *out_char_bytes = (utf16_idx < byte_len) ? 1 : 0;
321 return (int)utf16_idx;
322 }
323
324 utf16_scan_cursor_t cursor;
325 utf16_scan_cursor_init(&cursor, str, byte_len);
326 utf16_scan_cursor_resume_utf16(&cursor, utf16_idx);
327
328 while (cursor.p < cursor.end && cursor.utf16_pos < utf16_idx) {
329 utf16_scan_cursor_advance(&cursor, cursor.end);
330 }
331
332 if (cursor.p >= cursor.end) {
333 if (cursor.utf16_pos == utf16_idx) {
334 if (out_char_bytes) *out_char_bytes = 0;
335 utf16_scan_cursor_store(&cursor);
336 return (int)byte_len;
337 }
338 utf16_scan_cursor_store(&cursor);
339 return -1;
340 }
341
342 size_t slen, units;
343 utf16_scan_decode(cursor.p, cursor.end, &slen, &units, NULL);
344
345 if (out_char_bytes) *out_char_bytes = slen;
346 utf16_scan_cursor_store(&cursor);
347 return (int)(cursor.p - cursor.start);
348}
349
350int utf16_range_to_byte_range(
351 const char *str,
352 size_t byte_len,
353 size_t utf16_start,
354 size_t utf16_end,
355 size_t *byte_start,
356 size_t *byte_end
357) {
358 if (str_is_ascii(str)) {
359 *byte_start = (utf16_start <= byte_len) ? utf16_start : byte_len;
360 *byte_end = (utf16_end <= byte_len) ? utf16_end : byte_len;
361 return 0;
362 }
363
364 utf16_scan_cursor_t cursor;
365 utf16_scan_cursor_init(&cursor, str, byte_len);
366 utf16_scan_cursor_resume_utf16(&cursor, utf16_start);
367
368 size_t b_start = 0, b_end = byte_len;
369 int found_start = 0, found_end = 0;
370
371 while (cursor.p < cursor.end) {
372 if (cursor.utf16_pos == utf16_start) {
373 b_start = (size_t)(cursor.p - cursor.start);
374 found_start = 1;
375 }
376 if (cursor.utf16_pos == utf16_end) {
377 b_end = (size_t)(cursor.p - cursor.start);
378 found_end = 1;
379 break;
380 }
381 utf16_scan_cursor_advance(&cursor, cursor.end);
382 }
383
384 if (!found_start && utf16_start >= cursor.utf16_pos) b_start = byte_len;
385 if (!found_end && utf16_end >= cursor.utf16_pos) b_end = byte_len;
386
387 *byte_start = b_start;
388 *byte_end = b_end;
389 utf16_scan_cursor_store(&cursor);
390
391 return 0;
392}
393
394size_t byte_offset_to_utf16(const char *str, size_t byte_off) {
395 if (str_is_ascii(str)) return byte_off;
396
397 utf16_scan_cursor_t cursor;
398 const unsigned char *bound_end;
399 bool ended_on_boundary = true;
400
401 utf16_scan_cursor_init(&cursor, str, byte_off);
402 utf16_scan_cursor_resume_byte(&cursor, byte_off);
403 bound_end = cursor.start + byte_off;
404
405 while (cursor.p < bound_end) {
406 if (!utf16_scan_cursor_advance(&cursor, bound_end)) {
407 ended_on_boundary = false;
408 break;
409 }
410 }
411
412 if (ended_on_boundary) utf16_scan_cursor_store(&cursor);
413 return cursor.utf16_pos;
414}
415
416uint32_t utf16_code_unit_at(const char *str, size_t byte_len, size_t utf16_idx) {
417 if (str_is_ascii(str)) {
418 if (utf16_idx >= byte_len) return 0xFFFFFFFF;
419 return (unsigned char)str[utf16_idx];
420 }
421
422 utf16_scan_cursor_t cursor;
423 utf16_scan_cursor_init(&cursor, str, byte_len);
424 utf16_scan_cursor_resume_utf16(&cursor, utf16_idx);
425
426 while (cursor.p < cursor.end) {
427 size_t slen, units;
428 uint32_t cp;
429
430 utf16_scan_decode(cursor.p, cursor.end, &slen, &units, &cp);
431
432 if (cursor.utf16_pos == utf16_idx) {
433 utf16_scan_cursor_store(&cursor);
434 if (units == 2) return 0xD800 + ((cp - 0x10000) >> 10);
435 return cp;
436 }
437 if (units == 2 && cursor.utf16_pos + 1 == utf16_idx) {
438 utf16_scan_cursor_store(&cursor);
439 return 0xDC00 + ((cp - 0x10000) & 0x3FF);
440 }
441 cursor.p += slen;
442 cursor.utf16_pos += units;
443 }
444
445 utf16_scan_cursor_store(&cursor);
446 return 0xFFFFFFFF;
447}
448
/*
 * Decode `len` bytes of `src` as UTF-8 into `out`, re-encoding each accepted
 * code point with utf8proc_encode_char().
 *
 * Returns the number of bytes written to `out`, or -1 when `fatal` is set
 * and malformed input is encountered.
 *
 *   fatal   malformed input aborts with -1 instead of producing U+FFFD
 *           (written as the bytes EF BF BD).
 *   stream  when the input ends in the middle of a sequence, the bytes seen
 *           so far are copied to dec->pend_buf and their count to
 *           dec->pend_pos instead of being reported as malformed.
 *
 * The first decoded multi-byte code point is dropped when it is U+FEFF,
 * dec->bom_seen is still false and dec->ignore_bom is false; any other
 * output sets dec->bom_seen.
 *
 * NOTE(review): no bound on `out` is checked here, and each malformed byte
 * can produce three output bytes - presumably the caller sizes `out` for
 * that; confirm. This function also never reads dec->pend_buf back, so the
 * caller presumably re-feeds pending bytes on the next call; confirm.
 *
 * Dispatch uses GNU C extensions: labels as values (&&label, goto *ptr) and
 * range designators ([a ... b]).
 */
449utf8proc_ssize_t utf8_whatwg_decode(
450 utf8_dec_t *dec, const uint8_t *src, size_t len,
451 char *out, bool fatal, bool stream
452) {
  /* Jump target for every possible first byte of a sequence. */
453 static const void *tbl[256] = {
454 [0x00 ... 0x7F] = &&L_ASCII,
455 [0x80 ... 0xBF] = &&L_LONE,
456 [0xC0 ... 0xC1] = &&L_BAD,
457 [0xC2 ... 0xDF] = &&L_2,
458 [0xE0] = &&L_E0,
459 [0xE1 ... 0xEC] = &&L_3,
460 [0xED] = &&L_ED,
461 [0xEE ... 0xEF] = &&L_3,
462 [0xF0] = &&L_F0,
463 [0xF1 ... 0xF3] = &&L_4,
464 [0xF4] = &&L_F4,
465 [0xF5 ... 0xFF] = &&L_BAD,
466 };
467
  /* i: read index into src, o: write index into out. */
468 size_t i = 0, o = 0;
  /* bc: continuation bytes still expected for the current sequence. */
469 int bc = 0;
470
  /* lo/hi: inclusive range the next continuation byte must fall in. */
471 uint8_t lo = 0x80, hi = 0xBF;
  /* cp: code point being assembled. */
472 utf8proc_int32_t cp = 0;
  /* pb/pp: raw bytes of the current sequence and how many are held. */
473 uint8_t pb[4]; int pp = 0;
474
  /* FFFD(): write U+FFFD as UTF-8. NEXT(): step to the next byte and dispatch on it, or finish. */
475#define FFFD() do { out[o++]=(char)0xEF; out[o++]=(char)0xBF; out[o++]=(char)0xBD; } while(0)
476#define NEXT() do { i++; if (i < len) goto *tbl[src[i]]; goto done; } while(0)
477
478 if (!len) goto done;
479 goto *tbl[src[0]];
480
  /* Single-byte character: copied through unchanged. */
481L_ASCII:
482 dec->bom_seen = true;
483 out[o++] = (char)src[i];
484 NEXT();
485
  /* Stray continuation byte or a byte that can never start a sequence. */
486L_LONE:
487L_BAD:
488 if (fatal) return -1;
489 FFFD(); dec->bom_seen = true;
490 NEXT();
491
  /*
   * Lead bytes: set the number of continuation bytes (bc), the allowed range
   * for the first continuation byte (lo..hi), and the payload bits of the
   * lead byte. E0 / F0 narrow the range from below (this excludes overlong
   * forms), ED narrows it from above (excludes the surrogate range) and F4
   * narrows it from above (excludes values beyond U+10FFFF).
   */
492L_E0: bc=2; lo=0xA0; hi=0xBF; cp=src[i]&0x0F; pb[0]=src[i]; pp=1; i++; goto cont;
493L_ED: bc=2; lo=0x80; hi=0x9F; cp=src[i]&0x0F; pb[0]=src[i]; pp=1; i++; goto cont;
494L_3: bc=2; lo=0x80; hi=0xBF; cp=src[i]&0x0F; pb[0]=src[i]; pp=1; i++; goto cont;
495L_F0: bc=3; lo=0x90; hi=0xBF; cp=src[i]&0x07; pb[0]=src[i]; pp=1; i++; goto cont;
496L_F4: bc=3; lo=0x80; hi=0x8F; cp=src[i]&0x07; pb[0]=src[i]; pp=1; i++; goto cont;
497L_4: bc=3; lo=0x80; hi=0xBF; cp=src[i]&0x07; pb[0]=src[i]; pp=1; i++; goto cont;
498L_2: bc=1; lo=0x80; hi=0xBF; cp=src[i]&0x1F; pb[0]=src[i]; pp=1; i++; goto cont;
499
  /* Consume the continuation bytes of the sequence started above. */
500cont:
501 while (bc > 0) {
    /* Input ended mid-sequence: stash the partial bytes (stream) or report it. */
502 if (i >= len) {
503 if (stream) { dec->pend_pos = pp; memcpy(dec->pend_buf, pb, pp); }
504 else { if (fatal) return -1; FFFD(); }
505 goto done;
506 }
507 uint8_t b = src[i];
    /*
     * Byte outside the allowed range: the sequence so far is malformed. i is
     * not advanced, so the offending byte is dispatched again as the start
     * of a new sequence.
     */
508 if (b < lo || b > hi) {
509 bc = 0; cp = 0; pp = 0;
510 if (fatal) return -1;
511 FFFD(); dec->bom_seen = true;
512 goto *tbl[b];
513 }
    /* Only the first continuation byte has a narrowed range. */
514 lo = 0x80; hi = 0xBF;
515 cp = (cp << 6) | (b & 0x3F);
516 pb[pp++] = b; bc--; i++;
517 }
  /* Sequence complete: drop a leading U+FEFF if applicable, else emit it. */
518 pp = 0;
519 if (!dec->bom_seen && cp == 0xFEFF && !dec->ignore_bom) dec->bom_seen = true;
520 else {
521 dec->bom_seen = true;
522 utf8proc_ssize_t n = utf8proc_encode_char(cp, (utf8proc_uint8_t *)(out + o));
523 if (n > 0) o += (size_t)n;
524 }
525 cp = 0;
526 if (i < len) goto *tbl[src[i]];
527
528done:
529#undef FFFD
530#undef NEXT
531 return (utf8proc_ssize_t)o;
532}
533
534uint32_t utf16_codepoint_at(const char *str, size_t byte_len, size_t utf16_idx) {
535 if (str_is_ascii(str)) {
536 if (utf16_idx >= byte_len) return 0xFFFFFFFF;
537 return (unsigned char)str[utf16_idx];
538 }
539
540 utf16_scan_cursor_t cursor;
541 utf16_scan_cursor_init(&cursor, str, byte_len);
542 utf16_scan_cursor_resume_utf16(&cursor, utf16_idx);
543
544 while (cursor.p < cursor.end) {
545 size_t slen, units;
546 uint32_t cp;
547
548 utf16_scan_decode(cursor.p, cursor.end, &slen, &units, &cp);
549
550 if (cursor.utf16_pos == utf16_idx) {
551 utf16_scan_cursor_store(&cursor);
552 return cp;
553 }
554 if (units == 2 && cursor.utf16_pos + 1 == utf16_idx) {
555 utf16_scan_cursor_store(&cursor);
556 return 0xDC00 + ((cp - 0x10000) & 0x3FF);
557 }
558
559 cursor.p += slen;
560 cursor.utf16_pos += units;
561 }
562
563 utf16_scan_cursor_store(&cursor);
564 return 0xFFFFFFFF;
565}