bring text codecs up to spec · themackabu.com/ant@6965524

+131 -5

examples/spec/textcodec.js

··· 1 - import { test, summary } from './helpers.js'; 1 + import { test, testDeep, testThrows, summary } from './helpers.js'; 2 2 3 3 console.log('TextEncoder/TextDecoder Tests\n'); 4 4 ··· 16 16 17 17 const utf8 = encoder.encode('日本語'); 18 18 test('UTF-8 encode length', utf8.length, 9); 19 - const utf8Decoded = decoder.decode(utf8); 20 - test('UTF-8 decode', utf8Decoded, '日本語'); 19 + test('UTF-8 decode', decoder.decode(utf8), '日本語'); 21 20 22 21 const emoji = encoder.encode('😀'); 23 22 test('emoji encode length', emoji.length, 4); ··· 28 27 test('empty decode', decoder.decode(empty), ''); 29 28 30 29 const roundtrip = 'Hello, 世界! 🎉'; 31 - const rt = decoder.decode(encoder.encode(roundtrip)); 32 - test('roundtrip', rt, roundtrip); 30 + test('roundtrip', decoder.decode(encoder.encode(roundtrip)), roundtrip); 31 + 32 + test('TextEncoder.encoding', encoder.encoding, 'utf-8'); 33 + test('TextEncoder requires new', typeof TextEncoder, 'function'); 34 + testThrows('TextEncoder without new throws', () => TextEncoder()); 35 + 36 + testDeep('encode lone high surrogate', [...encoder.encode('\uD800')], [0xef, 0xbf, 0xbd]); 37 + testDeep('encode lone low surrogate', [...encoder.encode('\uDC00')], [0xef, 0xbf, 0xbd]); 38 + testDeep('encode surrogate in string', [...encoder.encode('a\uD800b')], [0x61, 0xef, 0xbf, 0xbd, 0x62]); 39 + testDeep('encode reversed surrogates', [...encoder.encode('\uDC00\uD800')], [0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd]); 40 + test('encode valid surrogate pair', encoder.encode('\uD834\uDD1E').length, 4); // U+1D11E 𝄞 41 + 42 + const dest = new Uint8Array(10); 43 + const result = encoder.encodeInto('hello', dest); 44 + test('encodeInto read', result.read, 5); 45 + test('encodeInto written', result.written, 5); 46 + test('encodeInto data', decoder.decode(dest.subarray(0, result.written)), 'hello'); 47 + 48 + const small = new Uint8Array(2); 49 + const partial = encoder.encodeInto('hello', small); 50 + test('encodeInto partial written', partial.written, 2); 51 + test('encodeInto partial read', partial.read, 2); 52 + 53 + test('TextDecoder default encoding', new TextDecoder().encoding, 'utf-8'); 54 + test('TextDecoder utf8 alias', new TextDecoder('utf8').encoding, 'utf-8'); 55 + test('TextDecoder case insensitive', new TextDecoder('UTF-8').encoding, 'utf-8'); 56 + test('TextDecoder utf-16le label', new TextDecoder('utf-16le').encoding, 'utf-16le'); 57 + test('TextDecoder utf-16be label', new TextDecoder('utf-16be').encoding, 'utf-16be'); 58 + test('TextDecoder utf-16 alias', new TextDecoder('utf-16').encoding, 'utf-16le'); 59 + testThrows('TextDecoder invalid label', () => new TextDecoder('bogus')); 60 + testThrows('TextDecoder without new throws', () => TextDecoder()); 61 + 62 + test('fatal defaults false', new TextDecoder().fatal, false); 63 + test('fatal option true', new TextDecoder('utf-8', { fatal: true }).fatal, true); 64 + test('ignoreBOM defaults false', new TextDecoder().ignoreBOM, false); 65 + test('ignoreBOM option true', new TextDecoder('utf-8', { ignoreBOM: true }).ignoreBOM, true); 66 + 67 + testThrows('fatal on invalid UTF-8', () => { 68 + new TextDecoder('utf-8', { fatal: true }).decode(new Uint8Array([0xff])); 69 + }); 70 + testThrows('fatal on truncated sequence', () => { 71 + new TextDecoder('utf-8', { fatal: true }).decode(new Uint8Array([0xc0])); 72 + }); 73 + testThrows('fatal on overlong', () => { 74 + new TextDecoder('utf-8', { fatal: true }).decode(new Uint8Array([0xc0, 0x80])); 75 + }); 76 + test('non-fatal replacement', new TextDecoder().decode(new Uint8Array([0xff])), '\uFFFD'); 77 + test('non-fatal truncated', new TextDecoder().decode(new Uint8Array([0xc0])), '\uFFFD'); 78 + 79 + test('UTF-8 BOM stripped by default', new TextDecoder().decode(new Uint8Array([0xef, 0xbb, 0xbf, 0x41])), 'A'); 80 + test('UTF-8 BOM kept with ignoreBOM', new TextDecoder('utf-8', { ignoreBOM: true }).decode(new Uint8Array([0xef, 0xbb, 0xbf, 0x41])), '\uFEFFA'); 81 + 82 + { 83 + const sd = new TextDecoder(); 84 + let out = ''; 85 + out += sd.decode(new Uint8Array([0xf0, 0x9f, 0x92]), { stream: true }); 86 + out += sd.decode(new Uint8Array([0xa9])); 87 + test('streaming UTF-8 multi-byte', out, '\u{1F4A9}'); 88 + } 89 + 90 + { 91 + const sd = new TextDecoder(); 92 + let out = ''; 93 + out += sd.decode(new Uint8Array([0xf0]), { stream: true }); 94 + out += sd.decode(new Uint8Array([0x9f]), { stream: true }); 95 + out += sd.decode(new Uint8Array([0x92]), { stream: true }); 96 + out += sd.decode(new Uint8Array([0xa9])); 97 + test('streaming UTF-8 byte-at-a-time', out, '\u{1F4A9}'); 98 + } 99 + 100 + { 101 + const sd = new TextDecoder(); 102 + let out = ''; 103 + out += sd.decode(new Uint8Array([0xf0, 0x9f]), { stream: true }); 104 + out += sd.decode(); 105 + test('streaming flush incomplete sequence', out, '\uFFFD'); 106 + } 107 + 108 + test('UTF-16LE basic', new TextDecoder('utf-16le').decode(new Uint8Array([0x41, 0x00, 0x42, 0x00])), 'AB'); 109 + test('UTF-16LE surrogate pair', new TextDecoder('utf-16le').decode(new Uint8Array([0x34, 0xd8, 0x1e, 0xdd])), '\uD834\uDD1E'); 110 + test('UTF-16LE BOM stripped', new TextDecoder('utf-16le').decode(new Uint8Array([0xff, 0xfe, 0x41, 0x00])), 'A'); 111 + 112 + test( 113 + 'UTF-16LE BOM kept with ignoreBOM', 114 + new TextDecoder('utf-16le', { ignoreBOM: true }).decode(new Uint8Array([0xff, 0xfe, 0x41, 0x00])), 115 + '\uFEFFA' 116 + ); 117 + 118 + testThrows('UTF-16LE fatal on odd byte', () => { 119 + new TextDecoder('utf-16le', { fatal: true }).decode(new Uint8Array([0x00])); 120 + }); 121 + 122 + test('UTF-16LE non-fatal odd byte', new TextDecoder('utf-16le').decode(new Uint8Array([0x00])), '\uFFFD'); 123 + 124 + { 125 + const sd = new TextDecoder('utf-16le'); 126 + let out = ''; 127 + out += sd.decode(new Uint8Array([0x41]), { stream: true }); 128 + out += sd.decode(new Uint8Array([0x00])); 129 + test('UTF-16LE streaming split code unit', out, 'A'); 130 + } 131 + 132 + { 133 + const sd = new TextDecoder('utf-16le'); 134 + let out = ''; 135 + out += sd.decode(new Uint8Array([0x34, 0xd8]), { stream: true }); 136 + out += sd.decode(new Uint8Array([0x1e, 0xdd])); 137 + test('UTF-16LE streaming split surrogate pair', out, '\uD834\uDD1E'); 138 + } 139 + 140 + test('UTF-16BE basic', new TextDecoder('utf-16be').decode(new Uint8Array([0x00, 0x41, 0x00, 0x42])), 'AB'); 141 + test('UTF-16BE surrogate pair', new TextDecoder('utf-16be').decode(new Uint8Array([0xd8, 0x34, 0xdd, 0x1e])), '\uD834\uDD1E'); 142 + test('UTF-16BE BOM stripped', new TextDecoder('utf-16be').decode(new Uint8Array([0xfe, 0xff, 0x00, 0x41])), 'A'); 143 + 144 + { 145 + const buf = new Uint8Array([0x68, 0x69]).buffer; 146 + test('decode ArrayBuffer', new TextDecoder().decode(buf), 'hi'); 147 + } 148 + 149 + { 150 + const d = new TextDecoder(); 151 + d.decode(new Uint8Array([0xf0, 0x9f]), { stream: true }); 152 + const fresh = d.decode(new Uint8Array([0x41])); 153 + test('decoder reuse resets', fresh, '\uFFFDA'); 154 + } 155 + 156 + testDeep('encode undefined', [...encoder.encode(undefined)], []); 157 + testDeep('encode no args', [...encoder.encode()], []); 158 + test('decode no args', decoder.decode(), ''); 33 159 34 160 summary();

+26

include/modules/textcodec.h

··· 1 1 #ifndef TEXTCODEC_H 2 2 #define TEXTCODEC_H 3 3 4 + #include <stdint.h> 5 + #include <stddef.h> 6 + #include <stdbool.h> 7 + 8 + #include "types.h" 9 + 10 + typedef enum { 11 + TD_ENC_UTF8 = 0, 12 + TD_ENC_UTF16LE, 13 + TD_ENC_UTF16BE, 14 + } td_encoding_t; 15 + 16 + typedef struct { 17 + td_encoding_t encoding; 18 + uint8_t pending[4]; 19 + int pending_len; 20 + bool fatal; 21 + bool ignore_bom; 22 + bool bom_seen; 23 + } td_state_t; 24 + 4 25 void init_textcodec_module(void); 26 + td_state_t *td_state_new(td_encoding_t enc, bool fatal, bool ignore_bom); 27 + 28 + ant_value_t td_decode(ant_t *js, td_state_t *st, const uint8_t *input, size_t input_len, bool stream); 29 + ant_value_t te_encode(ant_t *js, const char *str, size_t str_len); 30 + 5 31 6 32 #endif

+13

include/utf8.h

··· 3 3 4 4 #include <stddef.h> 5 5 #include <stdint.h> 6 + #include <stdbool.h> 6 7 #include <utf8proc.h> 8 + 9 + typedef struct { 10 + bool ignore_bom; 11 + bool bom_seen; 12 + uint8_t pend_buf[3]; 13 + int pend_pos; 14 + } utf8_dec_t; 15 + 16 + utf8proc_ssize_t utf8_whatwg_decode( 17 + utf8_dec_t *dec, const uint8_t *src, size_t len, 18 + char *out, bool fatal, bool stream 19 + ); 7 20 8 21 size_t utf8_char_len_at(const char *str, size_t byte_len, size_t pos); 9 22 size_t utf8_strlen(const char *str, size_t byte_len);

+4 -1

src/modules/buffer.c

··· 685 685 snprintf(idx_str, sizeof(idx_str), "%zu", i); 686 686 elem = js_get(js, args[0], idx_str); 687 687 } 688 - double val = vtype(elem) == T_NUM ? js_getnum(elem) : 0; 688 + 689 + double val = vtype(elem) == T_NUM 690 + ? js_getnum(elem) 691 + : js_to_number(js, elem); 689 692 690 693 if (type > TYPED_ARRAY_BIGUINT64) goto W_DONE; 691 694 goto *write_dispatch[type];

+398 -110

src/modules/textcodec.c

··· 1 1 #include <stdlib.h> 2 - #include <stdio.h> 3 2 #include <string.h> 3 + #include <stdint.h> 4 4 5 - #include "runtime.h" 5 + #include "ant.h" 6 6 #include "errors.h" 7 + #include "runtime.h" 7 8 #include "internal.h" 8 - #include "silver/engine.h" 9 + #include "descriptors.h" 10 + #include "utf8.h" 9 11 10 12 #include "modules/textcodec.h" 11 13 #include "modules/buffer.h" 12 14 #include "modules/symbol.h" 13 15 14 - // TextEncoder.prototype.encode(string) 15 - static ant_value_t js_textencoder_encode(ant_t *js, ant_value_t *args, int nargs) { 16 - size_t str_len = 0; 17 - const char *str = ""; 16 + static ant_value_t g_textencoder_proto = 0; 17 + static ant_value_t g_textdecoder_proto = 0; 18 + 19 + td_state_t *td_state_new(td_encoding_t enc, bool fatal, bool ignore_bom) { 20 + td_state_t *st = calloc(1, sizeof(td_state_t)); 21 + if (!st) return NULL; 22 + st->encoding = enc; 23 + st->fatal = fatal; 24 + st->ignore_bom = ignore_bom; 25 + return st; 26 + } 27 + 28 + static td_state_t *td_get_state(ant_value_t obj) { 29 + ant_value_t s = js_get_slot(obj, SLOT_DATA); 30 + if (vtype(s) != T_NUM) return NULL; 31 + return (td_state_t *)(uintptr_t)(size_t)js_getnum(s); 32 + } 33 + 34 + static void td_finalize(ant_t *js, ant_object_t *obj) { 35 + if (!obj->extra_slots) return; 36 + ant_extra_slot_t *entries = (ant_extra_slot_t *)obj->extra_slots; 18 37 19 - if (nargs > 0 && vtype(args[0]) == T_STR) { 20 - str = js_getstr(js, args[0], &str_len); 21 - if (!str) { str = ""; str_len = 0; } 38 + for (uint8_t i = 0; i < obj->extra_count; i++) { 39 + if (entries[i].slot == SLOT_DATA && vtype(entries[i].value) == T_NUM) { 40 + free((td_state_t *)(uintptr_t)(size_t)js_getnum(entries[i].value)); 41 + return; 42 + }} 43 + } 44 + 45 + static int resolve_encoding(const char *s, size_t len) { 46 + static const struct { const char *label; uint8_t len; td_encoding_t enc; } map[] = { 47 + {"unicode-1-1-utf-8", 18, TD_ENC_UTF8}, {"unicode11utf8", 13, TD_ENC_UTF8}, 48 + {"unicode20utf8", 13, TD_ENC_UTF8}, {"utf-8", 5, TD_ENC_UTF8}, 49 + {"utf8", 4, TD_ENC_UTF8}, {"x-unicode20utf8",17, TD_ENC_UTF8}, 50 + {"unicodefffe", 11, TD_ENC_UTF16BE}, {"utf-16be", 8, TD_ENC_UTF16BE}, 51 + {"csunicode", 9, TD_ENC_UTF16LE}, {"iso-10646-ucs-2",16, TD_ENC_UTF16LE}, 52 + {"ucs-2", 5, TD_ENC_UTF16LE}, {"unicode", 7, TD_ENC_UTF16LE}, 53 + {"unicodefeff", 11, TD_ENC_UTF16LE}, {"utf-16", 6, TD_ENC_UTF16LE}, 54 + {"utf-16le", 8, TD_ENC_UTF16LE}, 55 + {NULL, 0, 0} 56 + }; 57 + for (int i = 0; map[i].label; i++) { 58 + if (len == map[i].len && strncasecmp(s, map[i].label, len) == 0) return (int)map[i].enc; 59 + } 60 + return -1; 61 + } 62 + 63 + static const char *encoding_name(td_encoding_t enc) { 64 + switch (enc) { 65 + case TD_ENC_UTF16LE: return "utf-16le"; 66 + case TD_ENC_UTF16BE: return "utf-16be"; 67 + default: return "utf-8"; 68 + }} 69 + 70 + static const char *trim_label(const char *s, size_t len, size_t *out_len) { 71 + while (len > 0 && (unsigned char)*s <= 0x20) { s++; len--; } 72 + while (len > 0 && (unsigned char)s[len - 1] <= 0x20) { len--; } 73 + *out_len = len; 74 + return s; 75 + } 76 + 77 + static bool get_buffer_source(ant_t *js, ant_value_t arg, const uint8_t **out, size_t *len) { 78 + ant_value_t slot = js_get_slot(arg, SLOT_BUFFER); 79 + TypedArrayData *ta = (TypedArrayData *)js_gettypedarray(slot); 80 + if (ta) { 81 + if (!ta->buffer || ta->buffer->is_detached) { *out = NULL; *len = 0; return true; } 82 + *out = ta->buffer->data + ta->byte_offset; 83 + *len = ta->byte_length; 84 + return true; 22 85 } 23 86 24 - ant_value_t glob = js_glob(js); 25 - ant_value_t uint8array_ctor = js_get(js, glob, "Uint8Array"); 87 + if (vtype(slot) == T_NUM) { 88 + ArrayBufferData *ab = (ArrayBufferData *)(uintptr_t)(size_t)js_getnum(slot); 89 + if (!ab || ab->is_detached) { *out = NULL; *len = 0; return true; } 90 + *out = ab->data; 91 + *len = ab->length; 92 + return true; 93 + } 26 94 27 - if (vtype(uint8array_ctor) != T_FUNC && vtype(uint8array_ctor) != T_CFUNC) { 28 - return js_mkerr_typed(js, JS_ERR_TYPE, "Uint8Array constructor missing"); 29 - } 95 + ant_value_t buf_prop = js_get(js, arg, "buffer"); 96 + if (is_object_type(buf_prop)) { 97 + ant_value_t buf_slot = js_get_slot(buf_prop, SLOT_BUFFER); 30 98 31 - ant_value_t len_arg = js_mknum((double)str_len); 32 - ant_value_t saved_new_target = js->new_target; 99 + if (vtype(buf_slot) == T_NUM) { 100 + ArrayBufferData *ab = (ArrayBufferData *)(uintptr_t)(size_t)js_getnum(buf_slot); 101 + if (!ab || ab->is_detached) { *out = NULL; *len = 0; return true; } 102 + ant_value_t off_v = js_get(js, arg, "byteOffset"); 103 + ant_value_t len_v = js_get(js, arg, "byteLength"); 104 + size_t off = (vtype(off_v) == T_NUM) ? (size_t)js_getnum(off_v) : 0; 105 + size_t blen = (vtype(len_v) == T_NUM) ? (size_t)js_getnum(len_v) : ab->length - off; 106 + *out = ab->data + off; 107 + *len = blen; 108 + return true; 109 + }} 33 110 34 - js->new_target = uint8array_ctor; 35 - ant_value_t arr = sv_vm_call(js->vm, js, uint8array_ctor, js_mkundef(), &len_arg, 1, NULL, true); 111 + return false; 112 + } 113 + 114 + static ant_value_t make_ctor(ant_t *js, ant_cfunc_t fn, ant_value_t proto, const char *name, size_t nlen) { 115 + ant_value_t obj = js_mkobj(js); 116 + js_set_slot(obj, SLOT_CFUNC, js_mkfun(fn)); 117 + js_mkprop_fast(js, obj, "prototype", 9, proto); 118 + js_mkprop_fast(js, obj, "name", 4, js_mkstr(js, name, nlen)); 119 + js_set_descriptor(js, obj, "name", 4, 0); 36 120 37 - js->new_target = saved_new_target; 38 - if (vtype(arr) == T_ERR) return arr; 121 + ant_value_t fn_val = js_obj_to_func(obj); 122 + js_set(js, proto, "constructor", fn_val); 123 + js_set_descriptor(js, proto, "constructor", 11, JS_DESC_W | JS_DESC_C); 124 + 125 + return fn_val; 126 + } 127 + 128 + static ant_value_t js_textencoder_get_encoding(ant_t *js, ant_value_t *args, int nargs) { 129 + return js_mkstr(js, "utf-8", 5); 130 + } 131 + 132 + ant_value_t te_encode(ant_t *js, const char *str, size_t str_len) { 133 + ArrayBufferData *ab = create_array_buffer_data(str_len); 134 + if (!ab) return js_mkerr(js, "out of memory"); 39 135 40 136 if (str_len > 0) { 41 - ant_value_t ta_data_val = js_get_slot(arr, SLOT_BUFFER); 42 - TypedArrayData *ta_data = (TypedArrayData *)js_gettypedarray(ta_data_val); 43 - if (ta_data && ta_data->buffer && ta_data->buffer->data) memcpy(ta_data->buffer->data, str, str_len); 137 + const uint8_t *s = (const uint8_t *)str; 138 + uint8_t *d = ab->data; size_t i = 0; 139 + 140 + while (i < str_len) { 141 + if (s[i] == 0xED && i + 2 < str_len && s[i+1] >= 0xA0 && s[i+1] <= 0xBF) { 142 + d[i] = 0xEF; d[i+1] = 0xBF; d[i+2] = 0xBD; 143 + i += 3; 144 + } else { d[i] = s[i]; i++; }} 44 145 } 45 146 46 - return arr; 147 + return create_typed_array(js, TYPED_ARRAY_UINT8, ab, 0, str_len, "Uint8Array"); 47 148 } 48 149 49 - // TextEncoder.prototype.encodeInto(string, uint8array) 50 - static ant_value_t js_textencoder_encodeInto(ant_t *js, ant_value_t *args, int nargs) { 51 - if (nargs < 2) { 52 - return js_mkerr(js, "encodeInto requires string and Uint8Array arguments"); 150 + static ant_value_t js_textencoder_encode(ant_t *js, ant_value_t *args, int nargs) { 151 + size_t str_len = 0; 152 + const char *str = ""; 153 + 154 + if (nargs > 0 && vtype(args[0]) == T_STR) { 155 + str = js_getstr(js, args[0], &str_len); 156 + if (!str) { str = ""; str_len = 0; } 157 + } else if (nargs > 0 && vtype(args[0]) != T_UNDEF) { 158 + ant_value_t sv = js_tostring_val(js, args[0]); 159 + if (is_err(sv)) return sv; 160 + str = js_getstr(js, sv, &str_len); 161 + if (!str) { str = ""; str_len = 0; } 53 162 } 54 163 164 + return te_encode(js, str, str_len); 165 + } 166 + 167 + static ant_value_t js_textencoder_encode_into(ant_t *js, ant_value_t *args, int nargs) { 168 + if (nargs < 2) return js_mkerr_typed(js, JS_ERR_TYPE, "encodeInto requires 2 arguments"); 169 + 55 170 size_t str_len = 0; 56 171 const char *str = ""; 57 - 58 172 if (vtype(args[0]) == T_STR) { 59 173 str = js_getstr(js, args[0], &str_len); 60 - if (!str) { 61 - str = ""; 62 - str_len = 0; 63 - } 174 + if (!str) { str = ""; str_len = 0; } 175 + } else if (vtype(args[0]) != T_UNDEF) { 176 + ant_value_t sv = js_tostring_val(js, args[0]); 177 + if (is_err(sv)) return sv; 178 + str = js_getstr(js, sv, &str_len); 179 + if (!str) { str = ""; str_len = 0; } 64 180 } 65 - 66 - ant_value_t ta_data_val = js_get_slot(args[1], SLOT_BUFFER); 67 - TypedArrayData *ta_data = (TypedArrayData *)js_gettypedarray(ta_data_val); 68 - if (!ta_data) return js_mkerr(js, "Second argument must be a Uint8Array"); 181 + 182 + TypedArrayData *ta = (TypedArrayData *)js_gettypedarray(js_get_slot(args[1], SLOT_BUFFER)); 183 + if (!ta) return js_mkerr_typed(js, JS_ERR_TYPE, "Second argument must be a Uint8Array"); 184 + 185 + uint8_t *dest = (ta->buffer && !ta->buffer->is_detached) 186 + ? ta->buffer->data + ta->byte_offset : NULL; 187 + size_t available = ta->byte_length; 188 + 189 + const utf8proc_uint8_t *src = (const utf8proc_uint8_t *)str; 190 + utf8proc_ssize_t src_len = (utf8proc_ssize_t)str_len; 191 + utf8proc_ssize_t pos = 0; 69 192 70 - size_t available = ta_data->byte_length; 71 - size_t to_write = str_len < available ? str_len : available; 72 - 73 - if (to_write > 0) { 74 - memcpy(ta_data->buffer->data + ta_data->byte_offset, str, to_write); 193 + size_t written = 0; 194 + size_t read_units = 0; 195 + 196 + while (pos < src_len) { 197 + utf8proc_int32_t cp; 198 + utf8proc_ssize_t n = utf8_next(src + pos, src_len - pos, &cp); 199 + utf8proc_uint8_t tmp[4]; 200 + utf8proc_ssize_t enc_len; 201 + 202 + if (cp >= 0xD800 && cp <= 0xDFFF) { 203 + tmp[0] = 0xEF; tmp[1] = 0xBF; tmp[2] = 0xBD; 204 + enc_len = 3; 205 + } else { 206 + enc_len = (cp >= 0) ? utf8proc_encode_char(cp, tmp) : 0; 207 + if (enc_len <= 0) { tmp[0] = 0xEF; tmp[1] = 0xBF; tmp[2] = 0xBD; enc_len = 3; } 208 + } 209 + 210 + if (written + (size_t)enc_len > available) break; 211 + if (dest) memcpy(dest + written, tmp, (size_t)enc_len); 212 + 213 + written += (size_t)enc_len; 214 + pos += n; 215 + read_units += (cp >= 0x10000 && cp <= 0x10FFFF) ? 2 : 1; 75 216 } 76 - 217 + 77 218 ant_value_t result = js_mkobj(js); 78 - js_set(js, result, "read", js_mknum((double)to_write)); 79 - js_set(js, result, "written", js_mknum((double)to_write)); 219 + js_set(js, result, "read", js_mknum((double)read_units)); 220 + js_set(js, result, "written", js_mknum((double)written)); 80 221 81 222 return result; 82 223 } 83 224 84 - static ant_value_t js_textencoder_constructor(ant_t *js, ant_value_t *args, int nargs) { 85 - (void)args; 86 - (void)nargs; 87 - 225 + static ant_value_t js_textencoder_ctor(ant_t *js, ant_value_t *args, int nargs) { 226 + if (vtype(js->new_target) == T_UNDEF) 227 + return js_mkerr_typed(js, JS_ERR_TYPE, "TextEncoder constructor requires 'new'"); 88 228 ant_value_t obj = js_mkobj(js); 89 - js_set(js, obj, "encoding", js_mkstr(js, "utf-8", 5)); 90 - js_set(js, obj, "encode", js_mkfun(js_textencoder_encode)); 91 - js_set(js, obj, "encodeInto", js_mkfun(js_textencoder_encodeInto)); 92 - js_set_sym(js, obj, get_toStringTag_sym(), js_mkstr(js, "TextEncoder", 11)); 93 - 229 + ant_value_t proto = js_instance_proto_from_new_target(js, g_textencoder_proto); 230 + if (is_object_type(proto)) js_set_proto_init(obj, proto); 94 231 return obj; 95 232 } 96 233 97 - // TextDecoder.prototype.decode(bufferSource) 98 - static ant_value_t js_textdecoder_decode(ant_t *js, ant_value_t *args, int nargs) { 99 - if (nargs < 1) { 100 - return js_mkstr(js, "", 0); 234 + static ant_value_t js_textdecoder_get_encoding(ant_t *js, ant_value_t *args, int nargs) { 235 + td_state_t *st = td_get_state(js->this_val); 236 + const char *name = encoding_name(st ? st->encoding : TD_ENC_UTF8); 237 + return js_mkstr(js, name, strlen(name)); 238 + } 239 + 240 + static ant_value_t js_textdecoder_get_fatal(ant_t *js, ant_value_t *args, int nargs) { 241 + td_state_t *st = td_get_state(js->this_val); 242 + return (st && st->fatal) ? js_true : js_false; 243 + } 244 + 245 + static ant_value_t js_textdecoder_get_ignore_bom(ant_t *js, ant_value_t *args, int nargs) { 246 + td_state_t *st = td_get_state(js->this_val); 247 + return (st && st->ignore_bom) ? js_true : js_false; 248 + } 249 + 250 + static inline uint16_t u16_read(const uint8_t *p, bool be) { 251 + return be 252 + ? (uint16_t)((uint16_t)p[0] << 8 | p[1]) 253 + : (uint16_t)((uint16_t)p[1] << 8 | p[0]); 254 + } 255 + 256 + static inline size_t u8_emit(char *out, size_t o, utf8proc_int32_t cp) { 257 + utf8proc_ssize_t n = utf8proc_encode_char(cp, (utf8proc_uint8_t *)(out + o)); 258 + return n > 0 ? o + (size_t)n : o; 259 + } 260 + 261 + static inline size_t u8_fffd(char *out, size_t o) { 262 + out[o] = (char)0xEF; out[o+1] = (char)0xBF; out[o+2] = (char)0xBD; 263 + return o + 3; 264 + } 265 + 266 + #define U16_IS_HIGH(cu) ((cu) >= 0xD800 && (cu) <= 0xDBFF) 267 + #define U16_IS_LOW(cu) ((cu) >= 0xDC00 && (cu) <= 0xDFFF) 268 + #define U16_PAIR(hi,lo) (0x10000 + ((uint32_t)((hi) - 0xD800) << 10) + ((lo) - 0xDC00)) 269 + 270 + static utf8proc_ssize_t utf16_decode(td_state_t *st, const uint8_t *src, size_t len, char *out, bool stream) { 271 + bool be = (st->encoding == TD_ENC_UTF16BE); 272 + size_t i = 0, o = 0; 273 + size_t avail; 274 + 275 + if (!st->bom_seen && len >= 2 && u16_read(src, be) == 0xFEFF && !st->ignore_bom) i = 2; 276 + st->bom_seen = true; 277 + 278 + while (i < len) { 279 + avail = len - i; 280 + 281 + if (avail < 2) goto pend_tail; 282 + uint16_t cu = u16_read(src + i, be); 283 + i += 2; 284 + 285 + if (!U16_IS_HIGH(cu) && !U16_IS_LOW(cu)) { 286 + o = u8_emit(out, o, (utf8proc_int32_t)cu); 287 + continue; 288 + } 289 + 290 + if (U16_IS_LOW(cu)) goto err; 291 + 292 + avail = len - i; 293 + if (avail < 2) goto pend_hi; 294 + 295 + uint16_t lo = u16_read(src + i, be); 296 + if (U16_IS_LOW(lo)) { i += 2; o = u8_emit(out, o, (utf8proc_int32_t)U16_PAIR(cu, lo)); continue; } 297 + 298 + goto err; 299 + 300 + pend_tail: 301 + if (stream) { st->pending[0] = src[i]; st->pending_len = 1; } 302 + else goto err; 303 + break; 304 + 305 + pend_hi: 306 + if (stream) { st->pending_len = (int)(len - (i - 2)); memcpy(st->pending, src + i - 2, (size_t)st->pending_len); } 307 + else { if (st->fatal) return -1; o = u8_fffd(out, o); if (avail == 1) o = u8_fffd(out, o); } 308 + break; 309 + 310 + err: 311 + if (st->fatal) return -1; 312 + o = u8_fffd(out, o); 313 + continue; 101 314 } 102 315 103 - ant_value_t ta_data_val = js_get_slot(args[0], SLOT_BUFFER); 104 - TypedArrayData *ta_data = (TypedArrayData *)js_gettypedarray(ta_data_val); 105 - if (ta_data) { 106 - if (!ta_data->buffer) return js_mkstr(js, "", 0); 107 - uint8_t *data = ta_data->buffer->data + ta_data->byte_offset; 108 - size_t len = ta_data->byte_length; 109 - return js_mkstr(js, (const char *)data, len); 316 + return (utf8proc_ssize_t)o; 317 + } 318 + 319 + #undef U16_IS_HIGH 320 + #undef U16_IS_LOW 321 + #undef U16_PAIR 322 + 323 + ant_value_t td_decode(ant_t *js, td_state_t *st, const uint8_t *input, size_t input_len, bool stream_mode) { 324 + size_t total = (size_t)st->pending_len + input_len; 325 + if (total == 0) { 326 + if (!stream_mode) st->bom_seen = false; 327 + return js_mkstr(js, "", 0); 110 328 } 111 - 112 - ant_value_t ab_data_val = js_get_slot(args[0], SLOT_BUFFER); 113 - if (vtype(ab_data_val) == T_NUM) { 114 - ArrayBufferData *ab_data = (ArrayBufferData *)(uintptr_t)js_getnum(ab_data_val); 115 - if (!ab_data || !ab_data->data) return js_mkstr(js, "", 0); 116 - return js_mkstr(js, (const char *)ab_data->data, ab_data->length); 329 + 330 + uint8_t *work = NULL; 331 + const uint8_t *src; 332 + if (st->pending_len > 0) { 333 + work = malloc(total); 334 + if (!work) return js_mkerr(js, "out of memory"); 335 + memcpy(work, st->pending, (size_t)st->pending_len); 336 + if (input && input_len > 0) memcpy(work + st->pending_len, input, input_len); 337 + src = work; 338 + } else src = input; 339 + st->pending_len = 0; 340 + 341 + char *out = malloc(total * 3 + 1); 342 + if (!out) { free(work); return js_mkerr(js, "out of memory"); } 343 + 344 + utf8proc_ssize_t n; 345 + if (st->encoding == TD_ENC_UTF16LE || st->encoding == TD_ENC_UTF16BE) { 346 + n = utf16_decode(st, src, total, out, stream_mode); 347 + } else { 348 + utf8_dec_t dec = { .ignore_bom = st->ignore_bom, .bom_seen = st->bom_seen }; 349 + n = utf8_whatwg_decode(&dec, src, total, out, st->fatal, stream_mode); 350 + st->pending_len = dec.pend_pos; 351 + memcpy(st->pending, dec.pend_buf, (size_t)dec.pend_pos); 352 + st->bom_seen = stream_mode ? dec.bom_seen : false; 117 353 } 354 + 355 + if (n < 0) { 356 + free(work); free(out); 357 + return js_mkerr_typed(js, JS_ERR_TYPE, "The encoded data was not valid."); 358 + } 359 + 360 + if (st->encoding != TD_ENC_UTF8) { 361 + if (!stream_mode) st->bom_seen = false; 362 + } 363 + 364 + ant_value_t result = js_mkstr(js, out, (size_t)n); 365 + free(work); 366 + free(out); 118 367 119 - return js_mkstr(js, "", 0); 368 + return result; 120 369 } 121 370 122 - static ant_value_t js_textdecoder_constructor(ant_t *js, ant_value_t *args, int nargs) { 123 - const char *encoding = "utf-8"; 124 - size_t encoding_len = 5; 371 + static ant_value_t js_textdecoder_decode(ant_t *js, ant_value_t *args, int nargs) { 372 + td_state_t *st = td_get_state(js->this_val); 373 + if (!st) return js_mkerr_typed(js, JS_ERR_TYPE, "Invalid TextDecoder"); 374 + 375 + bool stream_mode = false; 376 + if (nargs > 1 && is_object_type(args[1])) { 377 + ant_value_t sv = js_get(js, args[1], "stream"); 378 + stream_mode = js_truthy(js, sv); 379 + } 380 + 381 + const uint8_t *input = NULL; 382 + size_t input_len = 0; 383 + if (nargs > 0 && is_object_type(args[0])) 384 + get_buffer_source(js, args[0], &input, &input_len); 385 + 386 + return td_decode(js, st, input, input_len, stream_mode); 387 + } 388 + 389 + static ant_value_t js_textdecoder_ctor(ant_t *js, ant_value_t *args, int nargs) { 390 + if (vtype(js->new_target) == T_UNDEF) 391 + return js_mkerr_typed(js, JS_ERR_TYPE, "TextDecoder constructor requires 'new'"); 392 + 393 + td_encoding_t enc = TD_ENC_UTF8; 394 + if (nargs > 0 && vtype(args[0]) == T_STR) { 395 + size_t llen; 396 + const char *raw = js_getstr(js, args[0], &llen); 397 + 398 + if (raw) { 399 + size_t tlen; 400 + const char *trimmed = trim_label(raw, llen, &tlen); 401 + int resolved = resolve_encoding(trimmed, tlen); 402 + 403 + if (resolved < 0) return js_mkerr_typed( 404 + js, JS_ERR_RANGE, "Failed to construct 'TextDecoder': The encoding label provided ('%.*s') is invalid.", 405 + (int)tlen, trimmed 406 + ); 407 + 408 + enc = (td_encoding_t)resolved; 409 + }} 410 + 411 + bool fatal = false; 412 + bool ignore_bom = false; 125 413 126 - if (nargs > 0 && vtype(args[0]) == T_STR) { 127 - encoding = js_getstr(js, args[0], &encoding_len); 128 - if (encoding && ( 129 - strcasecmp(encoding, "utf-8") == 0 || 130 - strcasecmp(encoding, "utf8") == 0) 131 - ) { encoding = "utf-8"; encoding_len = 5; } 414 + if (nargs > 1 && is_object_type(args[1])) { 415 + ant_value_t fv = js_get(js, args[1], "fatal"); 416 + if (vtype(fv) != T_UNDEF) fatal = js_truthy(js, fv); 417 + ant_value_t bv = js_get(js, args[1], "ignoreBOM"); 418 + if (vtype(bv) != T_UNDEF) ignore_bom = js_truthy(js, bv); 132 419 } 133 - 420 + 421 + td_state_t *st = td_state_new(enc, fatal, ignore_bom); 422 + if (!st) return js_mkerr(js, "out of memory"); 423 + 134 424 ant_value_t obj = js_mkobj(js); 135 - js_set(js, obj, "encoding", js_mkstr(js, encoding, encoding_len)); 136 - js_set(js, obj, "fatal", js_false); 137 - js_set(js, obj, "ignoreBOM", js_false); 138 - js_set(js, obj, "decode", js_mkfun(js_textdecoder_decode)); 139 - js_set_sym(js, obj, get_toStringTag_sym(), js_mkstr(js, "TextDecoder", 11)); 425 + ant_value_t proto = js_instance_proto_from_new_target(js, g_textdecoder_proto); 426 + 427 + if (is_object_type(proto)) js_set_proto_init(obj, proto); 428 + js_set_slot(obj, SLOT_DATA, ANT_PTR(st)); 429 + js_set_finalizer(obj, td_finalize); 140 430 141 431 return obj; 142 432 } 143 433 144 434 void init_textcodec_module(void) { 145 435 ant_t *js = rt->js; 146 - ant_value_t glob = js_glob(js); 436 + ant_value_t g = js_glob(js); 437 + 438 + g_textencoder_proto = js_mkobj(js); 439 + js_set_getter_desc(js, g_textencoder_proto, "encoding", 8, js_mkfun(js_textencoder_get_encoding), JS_DESC_C); 440 + js_set(js, g_textencoder_proto, "encode", js_mkfun(js_textencoder_encode)); 441 + js_set(js, g_textencoder_proto, "encodeInto", js_mkfun(js_textencoder_encode_into)); 442 + js_set_sym(js, g_textencoder_proto, get_toStringTag_sym(), js_mkstr(js, "TextEncoder", 11)); 147 443 148 - ant_value_t textencoder_ctor_obj = js_mkobj(js); 149 - js_set_slot(textencoder_ctor_obj, SLOT_CFUNC, js_mkfun(js_textencoder_constructor)); 150 - ant_value_t textencoder_proto = js_mkobj(js); 444 + ant_value_t te_ctor = make_ctor(js, js_textencoder_ctor, g_textencoder_proto, "TextEncoder", 11); 445 + js_set(js, g, "TextEncoder", te_ctor); 446 + js_set_descriptor(js, g, "TextEncoder", 11, JS_DESC_W | JS_DESC_C); 447 + 448 + g_textdecoder_proto = js_mkobj(js); 449 + js_set_getter_desc(js, g_textdecoder_proto, "encoding", 8, js_mkfun(js_textdecoder_get_encoding), JS_DESC_C); 450 + js_set_getter_desc(js, g_textdecoder_proto, "fatal", 5, js_mkfun(js_textdecoder_get_fatal), JS_DESC_C); 451 + js_set_getter_desc(js, g_textdecoder_proto, "ignoreBOM", 9, js_mkfun(js_textdecoder_get_ignore_bom), JS_DESC_C); 452 + js_set(js, g_textdecoder_proto, "decode", js_mkfun(js_textdecoder_decode)); 453 + js_set_sym(js, g_textdecoder_proto, get_toStringTag_sym(), js_mkstr(js, "TextDecoder", 11)); 151 454 152 - js_set(js, textencoder_proto, "encode", js_mkfun(js_textencoder_encode)); 153 - js_set(js, textencoder_proto, "encodeInto", js_mkfun(js_textencoder_encodeInto)); 154 - js_set(js, textencoder_proto, "encoding", js_mkstr(js, "utf-8", 5)); 155 - js_set(js, textencoder_ctor_obj, "prototype", textencoder_proto); 156 - ant_value_t textencoder_constructor = js_obj_to_func(textencoder_ctor_obj); 157 - js_set(js, glob, "TextEncoder", textencoder_constructor); 158 - 159 - ant_value_t textdecoder_ctor_obj = js_mkobj(js); 160 - js_set_slot(textdecoder_ctor_obj, SLOT_CFUNC, js_mkfun(js_textdecoder_constructor)); 161 - ant_value_t textdecoder_proto = js_mkobj(js); 162 - 163 - js_set(js, textdecoder_proto, "decode", js_mkfun(js_textdecoder_decode)); 164 - js_set(js, textdecoder_proto, "encoding", js_mkstr(js, "utf-8", 5)); 165 - js_set(js, textdecoder_proto, "fatal", js_false); 166 - js_set(js, textdecoder_proto, "ignoreBOM", js_false); 167 - js_set(js, textdecoder_ctor_obj, "prototype", textdecoder_proto); 168 - ant_value_t textdecoder_constructor = js_obj_to_func(textdecoder_ctor_obj); 169 - js_set(js, glob, "TextDecoder", textdecoder_constructor); 455 + ant_value_t td_ctor = make_ctor(js, js_textdecoder_ctor, g_textdecoder_proto, "TextDecoder", 11); 456 + js_set(js, g, "TextDecoder", td_ctor); 457 + js_set_descriptor(js, g, "TextDecoder", 11, JS_DESC_W | JS_DESC_C); 170 458 }

+86

src/utf8.c

··· 1 1 #include "utf8.h" 2 2 #include <string.h> 3 + #include <stdbool.h> 3 4 4 5 static uint32_t utf8_decode(const unsigned char *buf, size_t len, int *seq_len) { 5 6 if (len == 0) { *seq_len = 0; return 0; } ··· 176 177 } 177 178 178 179 return 0xFFFFFFFF; 180 + } 181 + 182 + utf8proc_ssize_t utf8_whatwg_decode( 183 + utf8_dec_t *dec, const uint8_t *src, size_t len, 184 + char *out, bool fatal, bool stream 185 + ) { 186 + static const void *tbl[256] = { 187 + [0x00 ... 0x7F] = &&L_ASCII, 188 + [0x80 ... 0xBF] = &&L_LONE, 189 + [0xC0 ... 0xC1] = &&L_BAD, 190 + [0xC2 ... 0xDF] = &&L_2, 191 + [0xE0] = &&L_E0, 192 + [0xE1 ... 0xEC] = &&L_3, 193 + [0xED] = &&L_ED, 194 + [0xEE ... 0xEF] = &&L_3, 195 + [0xF0] = &&L_F0, 196 + [0xF1 ... 0xF3] = &&L_4, 197 + [0xF4] = &&L_F4, 198 + [0xF5 ... 0xFF] = &&L_BAD, 199 + }; 200 + 201 + size_t i = 0, o = 0; 202 + int bc = 0; 203 + 204 + uint8_t lo = 0x80, hi = 0xBF; 205 + utf8proc_int32_t cp = 0; 206 + uint8_t pb[4]; int pp = 0; 207 + 208 + #define FFFD() do { out[o++]=(char)0xEF; out[o++]=(char)0xBF; out[o++]=(char)0xBD; } while(0) 209 + #define NEXT() do { i++; if (i < len) goto *tbl[src[i]]; goto done; } while(0) 210 + 211 + if (!len) goto done; 212 + goto *tbl[src[0]]; 213 + 214 + L_ASCII: 215 + dec->bom_seen = true; 216 + out[o++] = (char)src[i]; 217 + NEXT(); 218 + 219 + L_LONE: 220 + L_BAD: 221 + if (fatal) return -1; 222 + FFFD(); dec->bom_seen = true; 223 + NEXT(); 224 + 225 + L_E0: bc=2; lo=0xA0; hi=0xBF; cp=src[i]&0x0F; pb[0]=src[i]; pp=1; i++; goto cont; 226 + L_ED: bc=2; lo=0x80; hi=0x9F; cp=src[i]&0x0F; pb[0]=src[i]; pp=1; i++; goto cont; 227 + L_3: bc=2; lo=0x80; hi=0xBF; cp=src[i]&0x0F; pb[0]=src[i]; pp=1; i++; goto cont; 228 + L_F0: bc=3; lo=0x90; hi=0xBF; cp=src[i]&0x07; pb[0]=src[i]; pp=1; i++; goto cont; 229 + L_F4: bc=3; lo=0x80; hi=0x8F; cp=src[i]&0x07; pb[0]=src[i]; pp=1; i++; goto cont; 230 + L_4: bc=3; lo=0x80; hi=0xBF; cp=src[i]&0x07; pb[0]=src[i]; pp=1; i++; goto cont; 231 + L_2: bc=1; lo=0x80; hi=0xBF; cp=src[i]&0x1F; pb[0]=src[i]; pp=1; i++; goto cont; 232 + 233 + cont: 234 + while (bc > 0) { 235 + if (i >= len) { 236 + if (stream) { dec->pend_pos = pp; memcpy(dec->pend_buf, pb, pp); } 237 + else { if (fatal) return -1; FFFD(); } 238 + goto done; 239 + } 240 + uint8_t b = src[i]; 241 + if (b < lo || b > hi) { 242 + bc = 0; cp = 0; pp = 0; 243 + if (fatal) return -1; 244 + FFFD(); dec->bom_seen = true; 245 + goto *tbl[b]; 246 + } 247 + lo = 0x80; hi = 0xBF; 248 + cp = (cp << 6) | (b & 0x3F); 249 + pb[pp++] = b; bc--; i++; 250 + } 251 + pp = 0; 252 + if (!dec->bom_seen && cp == 0xFEFF && !dec->ignore_bom) dec->bom_seen = true; 253 + else { 254 + dec->bom_seen = true; 255 + utf8proc_ssize_t n = utf8proc_encode_char(cp, (utf8proc_uint8_t *)(out + o)); 256 + if (n > 0) o += (size_t)n; 257 + } 258 + cp = 0; 259 + if (i < len) goto *tbl[src[i]]; 260 + 261 + done: 262 + #undef FFFD 263 + #undef NEXT 264 + return (utf8proc_ssize_t)o; 179 265 } 180 266 181 267 uint32_t utf16_codepoint_at(const char *str, size_t byte_len, size_t utf16_idx) {

Configure Feed

Configure Feed