improve repl regex handling · themackabu.com/ant@235e691

+140

include/regex_scan.h

#ifndef REGEX_SCAN_H
#define REGEX_SCAN_H

/*
 * Lightweight, heuristic scanner for JavaScript regular-expression literals.
 *
 * Everything here is `static inline` so the header can be shared by several
 * translation units without a separate object file. All helpers operate on
 * raw bytes and only recognise ASCII.
 */

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/** True for space, tab, line feed and carriage return. */
static inline bool js_regex_is_space(unsigned char c) {
  return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}

/** True for ASCII '0'..'9'. */
static inline bool js_regex_is_digit(unsigned char c) {
  return c >= '0' && c <= '9';
}

/** True for ASCII 'a'..'z' and 'A'..'Z'. */
static inline bool js_regex_is_alpha(unsigned char c) {
  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

/** True for ASCII letters, digits, '_' and '$'. */
static inline bool js_regex_is_ident_char(unsigned char c) {
  return js_regex_is_alpha(c) || js_regex_is_digit(c) || c == '_' || c == '$';
}

/**
 * Compare the `len`-byte span at `word` with the `lit_len`-byte literal `lit`.
 * Neither span needs to be NUL-terminated.
 */
static inline bool js_regex_word_eq(const char *word, size_t len, const char *lit, size_t lit_len) {
  return len == lit_len && memcmp(word, lit, lit_len) == 0;
}

/**
 * Walk backwards from index `i` over whitespace.
 *
 * @return the smallest index j <= i such that code[j..i) is all whitespace
 *         (0 if everything before `i` is whitespace).
 */
static inline size_t js_regex_skip_ws_back(const char *code, size_t i) {
  while (i > 0 && js_regex_is_space((unsigned char)code[i - 1])) i--;
  return i;
}

/**
 * True when the `len`-byte word is one of the keywords after which a '/'
 * is taken to open a regex literal.
 */
static inline bool js_regex_word_allows_start(const char *word, size_t len) {
  return
    js_regex_word_eq(word, len, "return", 6) ||
    js_regex_word_eq(word, len, "throw", 5) ||
    js_regex_word_eq(word, len, "case", 4) ||
    js_regex_word_eq(word, len, "delete", 6) ||
    js_regex_word_eq(word, len, "void", 4) ||
    js_regex_word_eq(word, len, "new", 3) ||
    js_regex_word_eq(word, len, "typeof", 6) ||
    js_regex_word_eq(word, len, "instanceof", 10) ||
    js_regex_word_eq(word, len, "in", 2) ||
    js_regex_word_eq(word, len, "of", 2) ||
    js_regex_word_eq(word, len, "yield", 5) ||
    js_regex_word_eq(word, len, "await", 5);
}

/**
 * True when `prev` is a character after which a '/' is never treated as the
 * start of a regex literal: a digit, a closing bracket, a quote character,
 * a backtick or a dot.
 */
static inline bool js_regex_prev_forbids_start(unsigned char prev) {
  return
    js_regex_is_digit(prev) ||
    prev == ')' || prev == ']' || prev == '}' ||
    prev == '"' || prev == '\'' || prev == '`' || prev == '.';
}

/**
 * True when `prev` is an opening bracket, separator or operator character
 * after which a '/' is treated as the start of a regex literal.
 */
static inline bool js_regex_prev_allows_start(unsigned char prev) {
  switch (prev) {
    case '(':
    case '[':
    case '{':
    case ',':
    case ';':
    case ':':
    case '=':
    case '!':
    case '?':
    case '+':
    case '-':
    case '*':
    case '%':
    case '&':
    case '|':
    case '^':
    case '~':
    case '<':
    case '>': return true;
    default: return false;
  }
}

/**
 * Decide, from the text preceding index `start`, whether a '/' at `start`
 * may open a regex literal.
 *
 * Whitespace (including line breaks) before `start` is skipped. The decision
 * is then based on the last non-whitespace character:
 *   - nothing precedes it      -> allowed
 *   - an identifier/number run -> allowed only if the run is one of the
 *                                 keywords in js_regex_word_allows_start()
 *   - any other character      -> allowed only if it is listed in
 *                                 js_regex_prev_allows_start()
 *
 * @param code  buffer being scanned; only code[0..start) is read
 * @param start index of the candidate '/'
 */
static inline bool js_regex_can_start(const char *code, size_t start) {
  if (start == 0) return true;

  size_t i = js_regex_skip_ws_back(code, start);
  if (i == 0) return true;

  unsigned char prev = (unsigned char)code[i - 1];

  if (js_regex_is_ident_char(prev)) {
    /* Rewind to the beginning of the word and test it against the keyword list. */
    size_t end = i;
    while (i > 0 && js_regex_is_ident_char((unsigned char)code[i - 1])) i--;
    return js_regex_word_allows_start(code + i, end - i);
  }

  if (js_regex_prev_forbids_start(prev)) return false;
  return js_regex_prev_allows_start(prev);
}

/**
 * Try to scan a regex literal that begins at code[start].
 *
 * The scan fails when:
 *   - code[start] is not '/', or nothing follows it;
 *   - the '/' opens a comment ("//" or a block comment);
 *   - the preceding context does not allow a regex (js_regex_can_start());
 *   - a line terminator is reached before the closing '/', including one
 *     that directly follows a backslash;
 *   - the input ends before the closing '/'.
 *
 * Inside a character class ("[...]") a '/' does not terminate the literal.
 * After the closing '/', any run of ASCII letters is consumed as flags.
 *
 * @param code    buffer being scanned
 * @param len     number of valid bytes in `code`
 * @param start   index of the opening '/'
 * @param out_end if non-NULL, receives the index one past the last flag
 *                character on success; left untouched on failure
 * @return true if a complete literal was recognised
 */
static inline bool js_scan_regex_literal(
  const char *code, size_t len,
  size_t start, size_t *out_end
) {
  if (start >= len || code[start] != '/') return false;
  if (start + 1 >= len || code[start + 1] == '/' || code[start + 1] == '*') return false;
  if (!js_regex_can_start(code, start)) return false;

  size_t i = start + 1;
  bool in_class = false;

  for (; i < len; i++) {
    unsigned char ch = (unsigned char)code[i];
    if (ch == '\n' || ch == '\r') return false;

    if (ch == '\\') {
      if (i + 1 < len) {
        /* A backslash cannot escape a line terminator inside a regex
           literal, so do not let the escape swallow it. */
        unsigned char next = (unsigned char)code[i + 1];
        if (next == '\n' || next == '\r') return false;
        i++;
      }
      continue;
    }

    if (in_class) {
      if (ch == ']') in_class = false;
      continue;
    }

    if (ch == '[') {
      in_class = true;
      continue;
    }

    if (ch != '/') continue;

    /* Closing slash found: consume trailing flag letters. */
    i++;
    while (i < len && js_regex_is_alpha((unsigned char)code[i])) i++;

    if (out_end) *out_end = i;
    return true;
  }

  return false;
}

#endif

+2 -101

src/highlight.c

··· 5 5 6 6 #include "tokens.h" 7 7 #include "highlight.h" 8 + #include "regex_scan.h" 8 9 #include "silver/lexer.h" 9 10 10 11 typedef struct { const char *op; int len; hl_token_class cls; } op_entry_t; ··· 111 112 if (len == 5 && memcmp(word, "class", 5) == 0) return HL_CTX_AFTER_CLASS; 112 113 if (len == 7 && memcmp(word, "extends", 7) == 0) return HL_CTX_AFTER_EXTENDS; 113 114 return HL_CTX_NONE; 114 - } 115 - 116 - static bool regex_allowed_after_word(const char *word, size_t len) { 117 - #define W(s) (len == sizeof(s) - 1 && memcmp(word, s, sizeof(s) - 1) == 0) 118 - return 119 - W("return") || W("throw") || W("case") || W("delete") || 120 - W("void") || W("new") || W("typeof") || W("instanceof") || 121 - W("in") || W("of") || W("yield") || W("await"); 122 - #undef W 123 - } 124 - 125 - static bool can_start_regex_literal(const char *input, size_t start) { 126 - if (start == 0) return true; 127 - 128 - size_t i = start; 129 - while (i > 0) { 130 - unsigned char prev = (unsigned char)input[i - 1]; 131 - if (prev == ' ' || prev == '\t') { 132 - i--; 133 - continue; 134 - } 135 - if (prev == '\n' || prev == '\r') return true; 136 - 137 - if (is_ident_continue(prev)) { 138 - size_t end = i; 139 - while (i > 0 && is_ident_continue((unsigned char)input[i - 1])) i--; 140 - return regex_allowed_after_word(input + i, end - i); 141 - } 142 - 143 - if ( 144 - IS_DIGIT(prev) || prev == ')' || prev == ']' || prev == '}' || 145 - prev == '\'' || prev == '"' || prev == '`' || prev == '.' 
146 - ) return false; 147 - 148 - switch (prev) { 149 - case '(': 150 - case '[': 151 - case '{': 152 - case ',': 153 - case ';': 154 - case ':': 155 - case '=': 156 - case '!': 157 - case '?': 158 - case '+': 159 - case '-': 160 - case '*': 161 - case '%': 162 - case '&': 163 - case '|': 164 - case '^': 165 - case '~': 166 - case '<': 167 - case '>': return true; 168 - default: return false; 169 - } 170 - } 171 - 172 - return true; 173 - } 174 - 175 - static bool try_parse_regex_literal(const char *input, size_t input_len, size_t start, size_t *out_end) { 176 - if (input[start] != '/' || start + 1 >= input_len) return false; 177 - if (input[start + 1] == '/' || input[start + 1] == '*') return false; 178 - if (!can_start_regex_literal(input, start)) return false; 179 - 180 - size_t i = start + 1; 181 - bool in_class = false; 182 - 183 - while (i < input_len) { 184 - unsigned char ch = (unsigned char)input[i]; 185 - 186 - if (ch == '\\') { 187 - i += (i + 1 < input_len) ? 2 : 1; 188 - continue; 189 - } 190 - if (ch == '\n' || ch == '\r') return false; 191 - if (!in_class && ch == '[') { 192 - in_class = true; i++; 193 - continue; 194 - } 195 - if (in_class && ch == ']') { 196 - in_class = false; i++; 197 - continue; 198 - } 199 - if (!in_class && ch == '/') { 200 - i++; 201 - while (i < input_len) { 202 - unsigned char f = (unsigned char)input[i]; 203 - if ((f >= 'a' && f <= 'z') || (f >= 'A' && f <= 'Z')) { 204 - i++; 205 - continue; 206 - } break; 207 - } 208 - *out_end = i; 209 - return true; 210 - } i++; 211 - } 212 - 213 - return false; 214 115 } 215 116 216 117 static size_t skip_inline_ws_forward(const char *input, size_t input_len, size_t i) { ··· 441 342 442 343 if (c == '/') { 443 344 size_t regex_end = 0; 444 - if (try_parse_regex_literal(input, input_len, i, &regex_end)) { 345 + if (js_scan_regex_literal(input, input_len, i, &regex_end)) { 445 346 it->ctx = HL_CTX_NONE; 446 347 *out = (hl_span){ i, regex_end - i, HL_REGEX }; 447 348 it->pos = regex_end;

+7

src/repl.c

··· 30 30 31 31 #include <crprintf.h> 32 32 #include "highlight.h" 33 + #include "regex_scan.h" 33 34 34 35 #define MAX_HISTORY 512 35 36 #define MAX_LINE_LENGTH 4096 ··· 755 756 for (i += 2; i + 1 < len && !(code[i] == '*' && code[i + 1] == '/'); i++); 756 757 if (i + 1 >= len) { free(s.templates); return true; } 757 758 i++; continue; 759 + } 760 + if (js_regex_can_start(code, i)) { 761 + size_t regex_end = 0; 762 + if (!js_scan_regex_literal(code, len, i, &regex_end)) { free(s.templates); return true; } 763 + i = regex_end - 1; 764 + continue; 758 765 } 759 766 } 760 767

Configure Feed

Configure Feed