MIRROR: javascript for 🐜's, a tiny runtime with big ambitions
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

add UTF-8 encoding and decoding functions

+150 -51
+14
include/utf8.h
··· 1 + #ifndef UTF8_H 2 + #define UTF8_H 3 + 4 + #include <stddef.h> 5 + #include <stdint.h> 6 + 7 + int utf8_sequence_length(unsigned char first_byte); 8 + int utf8_encode(uint32_t codepoint, char *out); 9 + uint32_t utf8_decode(const unsigned char *buf, size_t len, int *seq_len); 10 + 11 + size_t utf8_strlen(const char *str, size_t byte_len); 12 + size_t utf16_strlen(const char *str, size_t byte_len); 13 + 14 + #endif
+1
libant/meson.build
··· 24 24 lib_sources = files( 25 25 '../src/roots.c', 26 26 '../src/utils.c', 27 + '../src/utf8.c', 27 28 '../src/ant.c', 28 29 '../src/errors.c', 29 30 '../src/stack.c',
+1
meson.build
··· 23 23 lib_sources = files( 24 24 'src/roots.c', 25 25 'src/utils.c', 26 + 'src/utf8.c', 26 27 'src/ant.c', 27 28 'src/errors.c', 28 29 'src/stack.c',
+10 -25
src/ant.c
··· 13 13 #include "internal.h" 14 14 #include "stack.h" 15 15 #include "errors.h" 16 + #include "utf8.h" 16 17 17 18 #include <uv.h> 18 19 #include <oxc.h> ··· 3745 3746 return false; 3746 3747 } 3747 3748 3748 - static int encode_utf8(uint32_t cp, char *out) { 3749 - if (cp < 0x80) { 3750 - out[0] = (char)cp; 3751 - return 1; 3752 - } else if (cp < 0x800) { 3753 - out[0] = (char)(0xC0 | (cp >> 6)); 3754 - out[1] = (char)(0x80 | (cp & 0x3F)); 3755 - return 2; 3756 - } else if (cp < 0x10000) { 3757 - out[0] = (char)(0xE0 | (cp >> 12)); 3758 - out[1] = (char)(0x80 | ((cp >> 6) & 0x3F)); 3759 - out[2] = (char)(0x80 | (cp & 0x3F)); 3760 - return 3; 3761 - } else { 3762 - out[0] = (char)(0xF0 | (cp >> 18)); 3763 - out[1] = (char)(0x80 | ((cp >> 12) & 0x3F)); 3764 - out[2] = (char)(0x80 | ((cp >> 6) & 0x3F)); 3765 - out[3] = (char)(0x80 | (cp & 0x3F)); 3766 - return 4; 3767 - } 3768 - } 3769 - 3770 3749 #define CHAR_DIGIT 0x01 3771 3750 #define CHAR_XDIGIT 0x02 3772 3751 #define CHAR_ALPHA 0x04 ··· 3992 3971 uint32_t cp; 3993 3972 int el = parse_unicode_escape(src, (jsoff_t)srclen, (jsoff_t)si, &cp); 3994 3973 if (el > 0) { 3995 - di += encode_utf8(cp, dst + di); 3974 + di += utf8_encode(cp, dst + di); 3996 3975 si += el; 3997 3976 } else dst[di++] = src[si++]; 3998 3977 } ··· 5602 5581 } 5603 5582 if (streq(keystr, keylen, "length", 6)) { 5604 5583 if (vtype(obj) == T_STR) { 5605 - return tov(D(offtolen(loadoff(js, (jsoff_t) vdata(obj))))); 5584 + jsoff_t byte_len; 5585 + jsoff_t str_off = vstr(js, obj, &byte_len); 5586 + const char *str_data = (const char *)&js->mem[str_off]; 5587 + return tov(D(utf16_strlen(str_data, byte_len))); 5606 5588 } 5607 5589 if (vtype(obj) == T_ARR) { 5608 5590 jsoff_t len_off = lkp(js, obj, "length", 6); ··· 5727 5709 uint8_t t = vtype(l); 5728 5710 5729 5711 if (t == T_STR && streq(ptr, plen, "length", 6)) { 5730 - return tov(D(offtolen(loadoff(js, (jsoff_t) vdata(l))))); 5712 + jsoff_t byte_len; 5713 + jsoff_t str_off = vstr(js, l, 
&byte_len); 5714 + const char *str_data = (const char *)&js->mem[str_off]; 5715 + return tov(D(utf16_strlen(str_data, byte_len))); 5731 5716 } 5732 5717 5733 5718 if (t == T_ARR && streq(ptr, plen, "length", 6)) {
+46 -26
src/modules/uri.c
··· 5 5 #include "ant.h" 6 6 #include "errors.h" 7 7 #include "runtime.h" 8 + #include "utf8.h" 8 9 #include "modules/uri.h" 9 10 10 - static int hex_digit(char c) { 11 - if (c >= '0' && c <= '9') return c - '0'; 12 - if (c >= 'A' && c <= 'F') return c - 'A' + 10; 13 - if (c >= 'a' && c <= 'f') return c - 'a' + 10; 14 - return -1; 15 - } 16 - 17 - static int is_uri_unreserved(unsigned char c) { 18 - return (c >= 'A' && c <= 'Z') || 19 - (c >= 'a' && c <= 'z') || 20 - (c >= '0' && c <= '9') || 21 - c == '-' || c == '_' || c == '.' || c == '!' || 22 - c == '~' || c == '*' || c == '\'' || c == '(' || c == ')'; 23 - } 11 + static const unsigned char uri_unreserved[256] = { 12 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 13 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 14 + 0,1,0,0,0,0,0,1,1,1,1,0,0,1,1,0, 15 + 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, 16 + 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 17 + 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1, 18 + 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 19 + 1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0, 20 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 21 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 22 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 23 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 24 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 25 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 26 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 27 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 28 + }; 24 29 25 - static int is_uri_reserved(unsigned char c) { 26 - return c == ';' || c == '/' || c == '?' 
|| c == ':' || 27 - c == '@' || c == '&' || c == '=' || c == '+' || 28 - c == '$' || c == ',' || c == '#'; 29 - } 30 + static const unsigned char uri_reserved[256] = { 31 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 32 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 33 + 0,0,0,1,1,0,1,0,0,0,0,1,1,0,0,1, 34 + 0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1, 35 + 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 36 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 37 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 38 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 39 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 40 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 41 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 42 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 43 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 44 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 45 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 46 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 47 + }; 30 48 31 - static int utf8_sequence_length(unsigned char first_byte) { 32 - if ((first_byte & 0x80) == 0) return 1; 33 - if ((first_byte & 0xE0) == 0xC0) return 2; 34 - if ((first_byte & 0xF0) == 0xE0) return 3; 35 - if ((first_byte & 0xF8) == 0xF0) return 4; 36 - return -1; 37 - } 49 + #define is_uri_unreserved(c) (uri_unreserved[(unsigned char)(c)]) 50 + #define is_uri_reserved(c) (uri_reserved[(unsigned char)(c)]) 38 51 39 52 static int is_valid_continuation(unsigned char c) { 40 53 return (c & 0xC0) == 0x80; 54 + } 55 + 56 + static int hex_digit(char c) { 57 + if (c >= '0' && c <= '9') return c - '0'; 58 + if (c >= 'A' && c <= 'F') return c - 'A' + 10; 59 + if (c >= 'a' && c <= 'f') return c - 'a' + 10; 60 + return -1; 41 61 } 42 62 43 63 static int is_lone_surrogate(const unsigned char *str, int seq_len) {
+78
src/utf8.c
··· 1 + #include "utf8.h" 2 + 3 + int utf8_sequence_length(unsigned char first_byte) { 4 + if ((first_byte & 0x80) == 0) return 1; 5 + if ((first_byte & 0xE0) == 0xC0) return 2; 6 + if ((first_byte & 0xF0) == 0xE0) return 3; 7 + if ((first_byte & 0xF8) == 0xF0) return 4; 8 + return -1; 9 + } 10 + 11 + int utf8_encode(uint32_t cp, char *out) { 12 + if (cp < 0x80) { 13 + out[0] = (char)cp; 14 + return 1; 15 + } else if (cp < 0x800) { 16 + out[0] = (char)(0xC0 | (cp >> 6)); 17 + out[1] = (char)(0x80 | (cp & 0x3F)); 18 + return 2; 19 + } else if (cp < 0x10000) { 20 + out[0] = (char)(0xE0 | (cp >> 12)); 21 + out[1] = (char)(0x80 | ((cp >> 6) & 0x3F)); 22 + out[2] = (char)(0x80 | (cp & 0x3F)); 23 + return 3; 24 + } else { 25 + out[0] = (char)(0xF0 | (cp >> 18)); 26 + out[1] = (char)(0x80 | ((cp >> 12) & 0x3F)); 27 + out[2] = (char)(0x80 | ((cp >> 6) & 0x3F)); 28 + out[3] = (char)(0x80 | (cp & 0x3F)); 29 + return 4; 30 + } 31 + } 32 + 33 + uint32_t utf8_decode(const unsigned char *buf, size_t len, int *seq_len) { 34 + if (len == 0) { *seq_len = 0; return 0; } 35 + 36 + unsigned char first = buf[0]; 37 + int slen = utf8_sequence_length(first); 38 + 39 + if (slen < 0 || (size_t)slen > len) { 40 + *seq_len = 1; 41 + return 0xFFFD; 42 + } 43 + 44 + *seq_len = slen; 45 + 46 + if (slen == 1) return first; 47 + if (slen == 2) return ((first & 0x1F) << 6) | (buf[1] & 0x3F); 48 + if (slen == 3) return ((first & 0x0F) << 12) | ((buf[1] & 0x3F) << 6) | (buf[2] & 0x3F); 49 + return ((first & 0x07) << 18) | ((buf[1] & 0x3F) << 12) | ((buf[2] & 0x3F) << 6) | (buf[3] & 0x3F); 50 + } 51 + 52 + size_t utf8_strlen(const char *str, size_t byte_len) { 53 + size_t count = 0; 54 + const unsigned char *p = (const unsigned char *)str; 55 + const unsigned char *end = p + byte_len; 56 + 57 + while (p < end) { 58 + int seq_len = utf8_sequence_length(*p); 59 + if (seq_len < 0) { p++; count++; continue; } 60 + p += seq_len; 61 + count++; 62 + } 63 + return count; 64 + } 65 + 66 + size_t 
utf16_strlen(const char *str, size_t byte_len) { 67 + size_t count = 0; 68 + const unsigned char *p = (const unsigned char *)str; 69 + const unsigned char *end = p + byte_len; 70 + 71 + while (p < end) { 72 + int seq_len; 73 + uint32_t cp = utf8_decode(p, end - p, &seq_len); 74 + p += seq_len; 75 + count += (cp >= 0x10000) ? 2 : 1; 76 + } 77 + return count; 78 + }