MIRROR: javascript for 🐜's, a tiny runtime with big ambitions
1// TODO: cleanup module, make cleaner
2
3#include <stdlib.h>
4#include <string.h>
5#include <stdio.h>
6
7#include "ant.h"
8#include "utf8.h"
9#include "errors.h"
10#include "runtime.h"
11#include "internal.h"
12#include "utils.h"
13#include "escape.h"
14#include "descriptors.h"
15
16#include "silver/engine.h"
17#include "modules/regex.h"
18#include "modules/symbol.h"
19#include "gc/objects.h"
20
21#include <pcre2.h>
22
23typedef struct {
24 ant_object_t *obj;
25 pcre2_code *code;
26 pcre2_match_data *match_data;
27 bool jit_ready;
28} regex_cache_entry_t;
29
// Bit assigned to each RegExp flag letter inside the 8-bit flags mask.
enum {
  REGEXP_FLAG_HAS_INDICES = 0x01,  // 'd'
  REGEXP_FLAG_GLOBAL      = 0x02,  // 'g'
  REGEXP_FLAG_IGNORE_CASE = 0x04,  // 'i'
  REGEXP_FLAG_MULTILINE   = 0x08,  // 'm'
  REGEXP_FLAG_DOTALL      = 0x10,  // 's'
  REGEXP_FLAG_UNICODE     = 0x20,  // 'u'
  REGEXP_FLAG_UNICODE_SET = 0x40,  // 'v'
  REGEXP_FLAG_STICKY      = 0x80,  // 'y'
};
40
41static regex_cache_entry_t *regex_cache = NULL;
42static ant_value_t regexp_matchall_iter_proto_val = 0;
43
44static size_t regex_cache_count = 0;
45static size_t regex_cache_cap = 0;
46
47static inline uint8_t regexp_parse_flags_mask(const char *fstr, ant_offset_t flen) {
48 uint8_t mask = 0;
49 for (ant_offset_t k = 0; k < flen; k++) {
50 switch (fstr[k]) {
51 case 'd': mask |= REGEXP_FLAG_HAS_INDICES; break;
52 case 'g': mask |= REGEXP_FLAG_GLOBAL; break;
53 case 'i': mask |= REGEXP_FLAG_IGNORE_CASE; break;
54 case 'm': mask |= REGEXP_FLAG_MULTILINE; break;
55 case 's': mask |= REGEXP_FLAG_DOTALL; break;
56 case 'u': mask |= REGEXP_FLAG_UNICODE; break;
57 case 'v': mask |= REGEXP_FLAG_UNICODE_SET; break;
58 case 'y': mask |= REGEXP_FLAG_STICKY; break;
59 default: break;
60 }}
61 return mask;
62}
63
64static inline uint8_t regexp_flags_mask(ant_t *js, ant_value_t regexp) {
65 ant_offset_t flags_off = lkp(js, regexp, "flags", 5);
66 if (flags_off == 0) return 0;
67
68 ant_value_t flags_val = js_propref_load(js, flags_off);
69 if (vtype(flags_val) != T_STR) return 0;
70
71 ant_value_t cached_flags = js_get_slot(regexp, SLOT_REGEXP_FLAGS_STRING);
72 ant_value_t cached = js_get_slot(regexp, SLOT_REGEXP_FLAGS_MASK);
73 if (flags_val == cached_flags && vtype(cached) == T_NUM) return (uint8_t)tod(cached);
74
75 ant_offset_t flen, foff = vstr(js, flags_val, &flen);
76 uint8_t mask = regexp_parse_flags_mask((const char *)(uintptr_t)foff, flen);
77 js_set_slot(regexp, SLOT_REGEXP_FLAGS_MASK, tov((double)mask));
78 js_set_slot(regexp, SLOT_REGEXP_FLAGS_STRING, flags_val);
79
80 return mask;
81}
82
83static ant_value_t regexp_build_named_groups_meta(ant_t *js, pcre2_code *code) {
84 uint32_t namecount = 0;
85 pcre2_pattern_info(code, PCRE2_INFO_NAMECOUNT, &namecount);
86 if (namecount == 0) return js_mkundef();
87
88 uint32_t nameentrysize = 0;
89 PCRE2_SPTR nametable = NULL;
90 pcre2_pattern_info(code, PCRE2_INFO_NAMEENTRYSIZE, &nameentrysize);
91 pcre2_pattern_info(code, PCRE2_INFO_NAMETABLE, (void *)&nametable);
92
93 ant_value_t meta = js_mkarr(js);
94 if (is_err(meta)) return meta;
95
96 PCRE2_SPTR tabptr = nametable;
97 for (uint32_t i = 0; i < namecount; i++) {
98 int n = (tabptr[0] << 8) | tabptr[1];
99 const char *name = (const char *)(tabptr + 2);
100 ant_value_t name_val = js_mkstr(js, name, strlen(name));
101 if (is_err(name_val)) return name_val;
102 js_arr_push(js, meta, name_val);
103 js_arr_push(js, meta, tov((double)n));
104 tabptr += nameentrysize;
105 }
106
107 return meta;
108}
109
110static void update_regexp_statics(ant_t *js, const char *str_ptr, PCRE2_SIZE *ovector, uint32_t ovcount) {
111 ant_value_t regexp_ctor = js_get(js, js_glob(js), "RegExp");
112 if (is_err(regexp_ctor) || vtype(regexp_ctor) == T_UNDEF) return;
113
114 ant_value_t empty = js_mkstr(js, "", 0);
115 for (int i = 1; i <= 9; i++) {
116 char key[3] = {'$', (char)('0' + i), '\0'};
117 ant_value_t val = empty;
118 if ((uint32_t)i < ovcount && ovector[2*i] != PCRE2_UNSET)
119 val = js_mkstr(js, str_ptr + ovector[2*i], ovector[2*i+1] - ovector[2*i]);
120 if (is_err(setprop_cstr(js, regexp_ctor, key, 2, val))) return;
121 }
122
123 ant_value_t match0 = empty;
124 if (ovcount > 0 && ovector[0] != PCRE2_UNSET)
125 match0 = js_mkstr(js, str_ptr + ovector[0], ovector[1] - ovector[0]);
126 if (is_err(setprop_cstr(js, regexp_ctor, "lastMatch", 9, match0))) return;
127 (void)setprop_cstr(js, regexp_ctor, "$&", 2, match0);
128}
129
// True when the escape `\c` is copied to the PCRE2 pattern unchanged by
// js_to_pcre2_pattern(): class shorthands, word boundaries, \n \r \t \f,
// the digits 1-9, and escaped punctuation/metacharacters.
static inline bool is_pcre2_passthrough_escape(char c) {
  static const char passthrough[] = "dDwWsSbBnrtf123456789.*+?()[]{}|^$\\/-";

  // strchr() would also match the terminating NUL, so exclude it explicitly.
  return c != '\0' && strchr(passthrough, c) != NULL;
}
142
// True for the letter of a character-class shorthand escape:
// \w \W \d \D \s \S.
static inline bool is_class_shorthand(char c) {
  switch (c) {
    case 'w': case 'W':
    case 'd': case 'D':
    case 's': case 'S':
      return true;
    default:
      return false;
  }
}
146
// Find the ']' that closes the (possibly nested) character class whose '['
// is at index `open`.
//
// Backslash escapes are skipped as two-character units. Returns the index of
// the matching ']' or src_len when the class is never closed.
static size_t v_close_bracket(const char *src, size_t src_len, size_t open) {
  int nesting = 0;
  size_t pos = open;

  while (pos < src_len) {
    const char ch = src[pos];

    if (ch == '\\' && pos + 1 < src_len) {
      pos += 2;  // skip the backslash and the escaped character
      continue;
    }

    if (ch == '[') {
      nesting++;
    } else if (ch == ']') {
      nesting--;
      if (nesting == 0) return pos;
    }
    pos++;
  }

  return src_len;
}
156
// Translate one operand of a v-flag set operation into a PCRE2 class.
//
// An operand that already starts with '[' is translated as is; anything else
// is wrapped in "[...]" first. Returns the number of bytes written to `out`,
// or 0 when the operand is too long for the 1024-byte wrapping buffer.
static size_t v_translate_part(const char *p, size_t len, char *out, size_t out_size) {
  const bool already_class = (len != 0) && (p[0] == '[');
  if (already_class) {
    return js_to_pcre2_pattern(p, len, out, out_size, false);
  }

  char wrapped[1024];
  if (len >= sizeof(wrapped) - 2) return 0;

  wrapped[0] = '[';
  memcpy(&wrapped[1], p, len);
  wrapped[1 + len] = ']';

  return js_to_pcre2_pattern(wrapped, len + 2, out, out_size, false);
}
164
// Scan src[start, end) — the inside of a character class — for a v-flag set
// operator at nesting depth zero.
//
// Returns 1 for "&&" (intersection), 2 for "--" (subtraction) and stores the
// operator's index in *op_pos; returns 0 (leaving *op_pos untouched) when no
// operator is found. Escapes are skipped, including the braced forms
// \p{..}, \P{..}, \u{..} and \x{..}; nested [...] classes are not searched.
static int v_set_op(const char *src, size_t start, size_t end, size_t *op_pos) {
  int nesting = 0;
  size_t pos = start;

  while (pos < end) {
    const char ch = src[pos];

    if (ch == '\\' && pos + 1 < end) {
      const char kind = src[pos + 1];
      const bool braced = (kind == 'p' || kind == 'P' || kind == 'u' || kind == 'x') &&
                          pos + 2 < end && src[pos + 2] == '{';
      if (!braced) {
        pos += 2;
        continue;
      }
      // Skip to just past the closing '}' (or to `end` if it is missing).
      pos += 3;
      while (pos < end && src[pos] != '}') pos++;
      if (pos < end) pos++;
      continue;
    }

    if (ch == '[') {
      nesting++;
      pos++;
      continue;
    }

    if (ch == ']') {
      if (nesting == 0) break;  // unbalanced ']' ends the scan
      nesting--;
      pos++;
      continue;
    }

    if (nesting == 0 && pos + 1 < end) {
      const char after = src[pos + 1];
      if (ch == '&' && after == '&') { *op_pos = pos; return 1; }
      if (ch == '-' && after == '-') { *op_pos = pos; return 2; }
    }

    pos++;
  }

  return 0;
}
188
// Translate a JavaScript regular-expression source into PCRE2 syntax.
//
// src/src_len: JS pattern text (need not be NUL-terminated).
// dst/dst_size: output buffer; the result is always NUL-terminated.
// v_flag: when true, a top-level character class containing "&&" or "--" is
//         rewritten as a lookahead: [A&&B] -> (?=A)B and [A--B] -> (?!B)A.
//
// Returns the number of bytes written, excluding the terminating NUL. Output
// that does not fit in dst_size - 1 bytes is silently dropped. With
// dst_size == 0 nothing is written and 0 is returned.
size_t js_to_pcre2_pattern(const char *src, size_t src_len, char *dst, size_t dst_size, bool v_flag) {
  // `dst_size - 1` is computed in size_t below; with an empty buffer it would
  // wrap to SIZE_MAX and disable every bounds check.
  if (dst_size == 0) return 0;

  size_t di = 0;
  int charclass_depth = 0;

#define OUT(ch) do { if (di < dst_size - 1) dst[di++] = (ch); } while(0)

  for (size_t si = 0; si < src_len && di < dst_size - 1; si++) {
    if (src[si] == '[') {
      if (v_flag && charclass_depth == 0) {
        size_t close = v_close_bracket(src, src_len, si);
        size_t op_pos = 0;
        int op_type = v_set_op(src, si + 1, close, &op_pos);
        if (op_type && close < src_len) {
          // op_type 1 is "&&": emit (?=A)B. op_type 2 is "--": emit (?!B)A.
          char ao[1024], bo[1024];
          size_t aol = v_translate_part(&src[si + 1], op_pos - si - 1, ao, sizeof(ao));
          size_t bol = v_translate_part(&src[op_pos + 2], close - op_pos - 2, bo, sizeof(bo));
          const char *la = op_type == 1 ? ao : bo, *ra = op_type == 1 ? bo : ao;
          size_t ll = op_type == 1 ? aol : bol, rl = op_type == 1 ? bol : aol;
          OUT('('); OUT('?'); OUT(op_type == 1 ? '=' : '!');
          for (size_t k = 0; k < ll; k++) OUT(la[k]);
          OUT(')');
          for (size_t k = 0; k < rl; k++) OUT(ra[k]);
          si = close;
          continue;
        }
      }
      charclass_depth++;
      OUT('[');
      continue;
    }
    if (src[si] == ']' && charclass_depth > 0) {
      charclass_depth--;
      OUT(']');
      continue;
    }

    // A '-' inside a class that sits next to a class shorthand (\w, \d, ...)
    // is emitted escaped so it is taken literally rather than as a range.
    if (charclass_depth > 0 && src[si] == '-' && si > 0 && src[si - 1] != '[' &&
        si + 1 < src_len && src[si + 1] != ']') {
      bool prev_is_shorthand = (si >= 2 && src[si - 2] == '\\' && is_class_shorthand(src[si - 1]));
      bool next_is_shorthand = (si + 2 < src_len && src[si + 1] == '\\' && is_class_shorthand(src[si + 2]));
      if (prev_is_shorthand || next_is_shorthand) {
        OUT('\\'); OUT('-');
        continue;
      }
      OUT('-');
      continue;
    }

    // Everything that is not an escape (or is a trailing lone backslash) is
    // copied through unchanged.
    if (src[si] != '\\' || si + 1 >= src_len) {
      OUT(src[si]);
      continue;
    }

    char next = src[si + 1];

    // \v -> \x{0b}
    if (next == 'v') {
      OUT('\\'); OUT('x'); OUT('{'); OUT('0'); OUT('b'); OUT('}');
      si++;
      continue;
    }

    // \u{H...} -> \x{H...}
    if (next == 'u' && si + 2 < src_len && src[si + 2] == '{') {
      size_t brace_start = si + 3;
      size_t brace_end = brace_start;
      while (brace_end < src_len && src[brace_end] != '}' && is_xdigit(src[brace_end])) brace_end++;
      if (brace_end < src_len && src[brace_end] == '}' && brace_end > brace_start) {
        OUT('\\'); OUT('x'); OUT('{');
        for (size_t k = brace_start; k < brace_end; k++) OUT(src[k]);
        OUT('}');
        si = brace_end;
        continue;
      }
    }

    // \uHHHH -> \x{HHHH}
    if (next == 'u' && si + 5 < src_len &&
        is_xdigit(src[si+2]) && is_xdigit(src[si+3]) &&
        is_xdigit(src[si+4]) && is_xdigit(src[si+5])) {
      OUT('\\'); OUT('x'); OUT('{');
      OUT(src[si+2]); OUT(src[si+3]); OUT(src[si+4]); OUT(src[si+5]);
      OUT('}');
      si += 5;
      continue;
    }

    // Any other \u is emitted as a literal 'u'.
    if (next == 'u') {
      si++;
      OUT('u');
      continue;
    }

    // \xHH is kept as is.
    if (next == 'x' && si + 3 < src_len &&
        is_xdigit(src[si+2]) && is_xdigit(src[si+3])) {
      OUT('\\'); OUT('x'); OUT(src[si+2]); OUT(src[si+3]);
      si += 3;
      continue;
    }

    // Any other \x is emitted as a literal 'x'.
    if (next == 'x') {
      si++;
      OUT('x');
      continue;
    }

    // \0 not followed by a decimal digit -> \x{0}
    if (next == '0' && (si + 2 >= src_len || src[si+2] < '0' || src[si+2] > '9')) {
      OUT('\\'); OUT('x'); OUT('{'); OUT('0'); OUT('}');
      si++;
      continue;
    }

    // Legacy octal escapes: two or three octal digits (value <= 255), or any
    // escape starting with 0, become \x{hh}. A single digit 1-7 falls through
    // to the pass-through handling below.
    if (next >= '0' && next <= '7') {
      unsigned int octal = next - '0';
      size_t advance = 1;
      if (si + 2 < src_len && src[si+2] >= '0' && src[si+2] <= '7') {
        octal = octal * 8 + (src[si+2] - '0');
        advance = 2;
        if (si + 3 < src_len && src[si+3] >= '0' && src[si+3] <= '7' && octal * 8 + (src[si+3] - '0') <= 255) {
          octal = octal * 8 + (src[si+3] - '0');
          advance = 3;
        }
      }

      if (advance > 1 || next == '0') {
        char hex[8];
        int hlen = snprintf(hex, sizeof(hex), "\\x{%02x}", octal);
        for (int k = 0; k < hlen && di < dst_size - 1; k++) OUT(hex[k]);
        si += advance;
        continue;
      }
    }

    // \cX with an ASCII letter is kept as is.
    if (next == 'c' && si + 2 < src_len &&
        ((src[si+2] >= 'A' && src[si+2] <= 'Z') || (src[si+2] >= 'a' && src[si+2] <= 'z'))) {
      OUT('\\'); OUT('c'); OUT(src[si+2]);
      si += 2;
      continue;
    }

    // Any other \c becomes a literal backslash followed by 'c'.
    if (next == 'c') {
      OUT('\\'); OUT('\\'); OUT('c');
      si++;
      continue;
    }

    // \p{...} / \P{...} Unicode property escapes.
    if ((next == 'p' || next == 'P') && si + 2 < src_len && src[si + 2] == '{') {
      size_t brace_start = si + 3;
      size_t brace_end = brace_start;
      while (brace_end < src_len && src[brace_end] != '}') brace_end++;
      if (brace_end < src_len && src[brace_end] == '}') {
        const char *prop = &src[brace_start];
        size_t prop_len = brace_end - brace_start;
        // Long General_Category names and the short code emitted for them.
        static const struct { const char *name; const char *code; } gc_map[] = {
          {"Letter","L"},{"Cased_Letter","LC"},{"Uppercase_Letter","Lu"},
          {"Lowercase_Letter","Ll"},{"Titlecase_Letter","Lt"},
          {"Modifier_Letter","Lm"},{"Other_Letter","Lo"},
          {"Mark","M"},{"Nonspacing_Mark","Mn"},{"Spacing_Mark","Mc"},
          {"Enclosing_Mark","Me"},
          {"Number","N"},{"Decimal_Number","Nd"},{"Letter_Number","Nl"},
          {"Other_Number","No"},
          {"Punctuation","P"},{"Connector_Punctuation","Pc"},
          {"Dash_Punctuation","Pd"},{"Open_Punctuation","Ps"},
          {"Close_Punctuation","Pe"},{"Initial_Punctuation","Pi"},
          {"Final_Punctuation","Pf"},{"Other_Punctuation","Po"},
          {"Symbol","S"},{"Math_Symbol","Sm"},{"Currency_Symbol","Sc"},
          {"Modifier_Symbol","Sk"},{"Other_Symbol","So"},
          {"Separator","Z"},{"Space_Separator","Zs"},
          {"Line_Separator","Zl"},{"Paragraph_Separator","Zp"},
          {"Other","C"},{"Control","Cc"},{"Format","Cf"},
          {"Surrogate","Cs"},{"Private_Use","Co"},{"Unassigned","Cn"},
        };
        // Script names that are expanded to an explicit code-point range.
        static const struct { const char *script; const char *range; } u17_scripts[] = {
          {"Sidetic",       "\\x{10940}-\\x{1095F}"},
          {"Garay",         "\\x{10D40}-\\x{10D8F}"},
          {"Gurung_Khema",  "\\x{16100}-\\x{1613F}"},
          {"Kirat_Rai",     "\\x{16D40}-\\x{16D7F}"},
          {"Ol_Onal",       "\\x{1E5D0}-\\x{1E5FF}"},
          {"Sunuwar",       "\\x{11BC0}-\\x{11BFF}"},
          {"Tulu_Tigalari", "\\x{11380}-\\x{113FF}"},
        };
        bool has_eq = (memchr(prop, '=', prop_len) != NULL);
        bool has_colon = (memchr(prop, ':', prop_len) != NULL);
        // Properties of strings: only for \p outside a class, expanded to a
        // full sub-pattern.
        if (!has_eq && !has_colon && next == 'p' && charclass_depth == 0) {
          static const struct { const char *name; const char *exp; } sprops[] = {
            {"Emoji_Keycap_Sequence",
             "(?:\\x{23}\\x{fe0f}\\x{20e3}|\\x{2a}\\x{fe0f}\\x{20e3}|[\\x{30}-\\x{39}]\\x{fe0f}\\x{20e3})"},
            {"RGI_Emoji",
             "(?:[\\x{1f1e6}-\\x{1f1ff}]{2}|(?:\\p{Emoji}[\\x{1f3fb}-\\x{1f3ff}]?\\x{200d})+\\p{Emoji}[\\x{1f3fb}-\\x{1f3ff}]?|\\p{Emoji}[\\x{1f3fb}-\\x{1f3ff}]|\\p{Emoji}\\x{fe0f}?)"},
          };
          for (size_t m = 0; m < sizeof(sprops)/sizeof(sprops[0]); m++) {
            if (strlen(sprops[m].name) == prop_len && memcmp(sprops[m].name, prop, prop_len) == 0) {
              for (const char *r = sprops[m].exp; *r && di < dst_size - 1; r++) OUT(*r);
              si = brace_end;
              goto next_char;
            }
          }
        }
        // name=value (or name:value): the value is matched against the
        // script table above and replaced by a bracketed range.
        if (has_eq || has_colon) {
          char sep = has_eq ? '=' : ':';
          const char *val = memchr(prop, sep, prop_len);
          if (val) {
            val++;
            size_t val_len = prop_len - (size_t)(val - prop);
            for (size_t m = 0; m < sizeof(u17_scripts)/sizeof(u17_scripts[0]); m++) {
              if (strlen(u17_scripts[m].script) == val_len &&
                  memcmp(u17_scripts[m].script, val, val_len) == 0) {
                const char *r = u17_scripts[m].range;
                OUT('[');
                if (next == 'P') OUT('^');
                for (; *r; r++) OUT(*r);
                OUT(']');
                si = brace_end;
                goto next_char;
              }
            }
          }
        }
        // ASCII / Any are emitted as plain ranges (bracketed only when not
        // already inside a class).
        if (!has_eq && !has_colon) {
          static const struct { const char *name; const char *range; } rangeprops[] = {
            {"ASCII", "\\x{0}-\\x{7f}"},
            {"Any",   "\\x{0}-\\x{10ffff}"},
          };
          for (size_t m = 0; m < sizeof(rangeprops)/sizeof(rangeprops[0]); m++) {
            if (strlen(rangeprops[m].name) == prop_len && memcmp(rangeprops[m].name, prop, prop_len) == 0) {
              if (charclass_depth > 0) {
                for (const char *r = rangeprops[m].range; *r; r++) OUT(*r);
              } else {
                OUT('['); if (next == 'P') OUT('^');
                for (const char *r = rangeprops[m].range; *r; r++) OUT(*r);
                OUT(']');
              }
              si = brace_end;
              goto next_char;
            }
          }
        }
        const char *replacement = NULL;
        if (!has_eq && !has_colon) {
          for (size_t m = 0; m < sizeof(gc_map)/sizeof(gc_map[0]); m++) {
            if (strlen(gc_map[m].name) == prop_len &&
                memcmp(gc_map[m].name, prop, prop_len) == 0) {
              replacement = gc_map[m].code;
              break;
            }
          }
        }
        // Properties that get extra code points OR-ed onto the PCRE2 property.
        static const struct { const char *prop; const char *extra; } u17_props[] = {
          {"Emoji", "\\x{1FACD}-\\x{1FACE}\\x{1FAE9}\\x{1FAF9}"},
        };
        const char *extra_range = NULL;
        if (!has_eq && !has_colon && !replacement) {
          for (size_t m = 0; m < sizeof(u17_props)/sizeof(u17_props[0]); m++) {
            if (strlen(u17_props[m].prop) == prop_len &&
                memcmp(u17_props[m].prop, prop, prop_len) == 0) {
              extra_range = u17_props[m].extra;
              break;
            }
          }
        }
        if (extra_range && charclass_depth == 0) {
          const char *pfx = (next == 'p') ? "(?:\\p{" : "(?:\\P{";
          for (const char *r = pfx; *r; r++) OUT(*r);
          for (size_t k = brace_start; k < brace_end; k++) OUT(src[k]);
          OUT('}'); OUT('|'); OUT('[');
          if (next == 'P') OUT('^');
          for (const char *r = extra_range; *r; r++) OUT(*r);
          OUT(']'); OUT(')');
        } else {
          OUT('\\'); OUT(next); OUT('{');
          if (replacement) {
            for (const char *r = replacement; *r; r++) OUT(*r);
          } else {
            for (size_t k = brace_start; k < brace_end; k++) OUT(src[k]);
          }
          OUT('}');
        }
        si = brace_end;
        continue;
      }
      // No closing brace: emit the two-character escape unchanged.
      OUT('\\'); OUT(next);
      si++;
      continue;
    }

    if (is_pcre2_passthrough_escape(next)) {
      OUT('\\'); OUT(next);
      si++;
      continue;
    }

    // Unknown escape: drop the backslash and keep the character.
    si++;
    OUT(next);
    next_char:;
  }

#undef OUT
  dst[di] = '\0';
  return di;
}
486
// Store `val` under the `klen`-byte name `key` on `obj`: via js_mkprop_fast()
// when `is_new` is true, otherwise via js_setprop() with a freshly made key
// string.
#define REGEXP_SET_PROP(js, obj, key, klen, val, is_new)          \
  ((is_new) ? js_mkprop_fast((js), (obj), (key), (klen), (val))   \
            : js_setprop((js), (obj), js_mkstr((js), (key), (klen)), (val)))
490
491static void regexp_init_flags(ant_t *js, ant_value_t obj, const char *fstr, ant_offset_t flen, bool is_new) {
492 uint8_t mask = regexp_parse_flags_mask(fstr, flen);
493 bool d = (mask & REGEXP_FLAG_HAS_INDICES) != 0;
494 bool g = (mask & REGEXP_FLAG_GLOBAL) != 0;
495 bool i = (mask & REGEXP_FLAG_IGNORE_CASE) != 0;
496 bool m = (mask & REGEXP_FLAG_MULTILINE) != 0;
497 bool s = (mask & REGEXP_FLAG_DOTALL) != 0;
498 bool u = (mask & REGEXP_FLAG_UNICODE) != 0;
499 bool v = (mask & REGEXP_FLAG_UNICODE_SET) != 0;
500 bool y = (mask & REGEXP_FLAG_STICKY) != 0;
501
502 char sorted[10]; int si = 0;
503 if (d) sorted[si++] = 'd';
504 if (g) sorted[si++] = 'g';
505 if (i) sorted[si++] = 'i';
506 if (m) sorted[si++] = 'm';
507 if (s) sorted[si++] = 's';
508 if (u) sorted[si++] = 'u';
509 if (v) sorted[si++] = 'v';
510 if (y) sorted[si++] = 'y';
511
512 ant_value_t flags_value = js_mkstr(js, sorted, si);
513 REGEXP_SET_PROP(js, obj, "flags", 5, flags_value, is_new);
514 REGEXP_SET_PROP(js, obj, "hasIndices", 10, mkval(T_BOOL, d ? 1 : 0), is_new);
515 REGEXP_SET_PROP(js, obj, "global", 6, mkval(T_BOOL, g ? 1 : 0), is_new);
516 REGEXP_SET_PROP(js, obj, "ignoreCase", 10, mkval(T_BOOL, i ? 1 : 0), is_new);
517 REGEXP_SET_PROP(js, obj, "multiline", 9, mkval(T_BOOL, m ? 1 : 0), is_new);
518 REGEXP_SET_PROP(js, obj, "dotAll", 6, mkval(T_BOOL, s ? 1 : 0), is_new);
519 REGEXP_SET_PROP(js, obj, "unicode", 7, mkval(T_BOOL, u ? 1 : 0), is_new);
520 REGEXP_SET_PROP(js, obj, "unicodeSets", 11, mkval(T_BOOL, v ? 1 : 0), is_new);
521 REGEXP_SET_PROP(js, obj, "sticky", 6, mkval(T_BOOL, y ? 1 : 0), is_new);
522 REGEXP_SET_PROP(js, obj, "lastIndex", 9, tov(0), is_new);
523 js_set_slot(obj, SLOT_REGEXP_FLAGS_MASK, tov((double)mask));
524 js_set_slot(obj, SLOT_REGEXP_FLAGS_STRING, flags_value);
525 js_set_slot(obj, SLOT_REGEXP_NAMED_GROUPS, js_mkundef());
526}
527
528ant_value_t is_regexp_like(ant_t *js, ant_value_t value) {
529 if (!is_object_type(value)) return js_false;
530
531 ant_value_t match_sym = get_match_sym();
532 if (vtype(match_sym) == T_SYMBOL) {
533 ant_value_t match_val = js_get_sym(js, value, match_sym);
534 if (is_err(match_val)) return match_val;
535 if (vtype(match_val) != T_UNDEF) return js_bool(js_truthy(js, match_val));
536 }
537
538 ant_value_t regexp_ctor = js_get(js, js_glob(js), "RegExp");
539 if (is_err(regexp_ctor)) return regexp_ctor;
540
541 ant_value_t regexp_proto = js_get(js, regexp_ctor, "prototype");
542 if (is_err(regexp_proto)) return regexp_proto;
543 if (!is_object_type(regexp_proto)) return js_false;
544
545 return js_bool(proto_chain_contains(js, value, regexp_proto));
546}
547
548static ant_value_t should_regexp_passthrough(ant_t *js, ant_value_t *args, int nargs) {
549 if (vtype(js->new_target) != T_UNDEF) return js_false;
550 if (nargs <= 0) return js_false;
551
552 if (nargs >= 2 && vtype(args[1]) != T_UNDEF) return js_false;
553 if (!is_object_type(args[0])) return js_false;
554
555 ant_value_t is_re = is_regexp_like(js, args[0]);
556 if (is_err(is_re)) return is_re;
557 if (!js_truthy(js, is_re)) return js_false;
558
559 ant_value_t ctor = js_getprop_fallback(js, args[0], "constructor");
560 if (is_err(ctor)) return ctor;
561
562 ant_value_t regexp_ctor = js_get(js, js_glob(js), "RegExp");
563 if (is_err(regexp_ctor)) return regexp_ctor;
564
565 return js_bool(same_ctor_identity(js, ctor, regexp_ctor));
566}
567
568ant_value_t reject_regexp_arg(ant_t *js, ant_value_t value, const char *method_name) {
569 ant_value_t is_re = is_regexp_like(js, value);
570 if (is_err(is_re)) return is_re;
571 if (js_truthy(js, is_re)) {
572 return js_mkerr_typed(js, JS_ERR_TYPE, "First argument to %s must not be a RegExp", method_name);
573 }
574 return js_mkundef();
575}
576
577static ant_value_t regexp_species_construct(ant_t *js, ant_value_t rx, ant_value_t ctor, ant_value_t *ctor_args, int nargs) {
578 ant_value_t seed = js_mkobj(js);
579 if (is_err(seed)) return seed;
580
581 ant_value_t proto = js_get(js, ctor, "prototype");
582 if (is_err(proto)) return proto;
583 if (is_object_type(proto)) js_set_proto_init(seed, proto);
584
585 ant_value_t saved = js->new_target;
586 js->new_target = ctor;
587 ant_value_t result = sv_vm_call(js->vm, js, ctor, seed, ctor_args, nargs, NULL, true);
588 js->new_target = saved;
589
590 if (is_err(result)) return result;
591 if (!is_object_type(result))
592 return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp species constructor returned non-object");
593
594 return result;
595}
596
597static ant_value_t regexp_exec_abstract(ant_t *js, ant_value_t rx, ant_value_t str);
598static ant_value_t builtin_regexp_exec(ant_t *js, ant_value_t *args, int nargs);
599
600static regex_cache_entry_t *regex_cache_lookup(ant_object_t *obj) {
601 for (size_t i = 0; i < regex_cache_count; i++) {
602 if (regex_cache[i].obj == obj) return ®ex_cache[i];
603 }
604 return NULL;
605}
606
607static regex_cache_entry_t *regex_cache_insert(ant_object_t *obj, pcre2_code *code, pcre2_match_data *match_data, bool jit_ready) {
608 if (regex_cache_count >= regex_cache_cap) {
609 size_t new_cap = regex_cache_cap ? regex_cache_cap * 2 : 64;
610 regex_cache_entry_t *new_cache = realloc(regex_cache, new_cap * sizeof(regex_cache_entry_t));
611 if (!new_cache) return NULL;
612 regex_cache = new_cache;
613 regex_cache_cap = new_cap;
614 }
615 regex_cache_entry_t *entry = ®ex_cache[regex_cache_count++];
616 entry->obj = obj;
617 entry->code = code;
618 entry->match_data = match_data;
619 entry->jit_ready = jit_ready;
620 return entry;
621}
622
623typedef struct {
624 pcre2_code *code;
625 pcre2_match_data *match_data;
626 bool jit_ready;
627} compiled_regex_t;
628
629static bool regex_get_or_compile(ant_t *js, ant_value_t regexp_obj, compiled_regex_t *out) {
630 ant_object_t *obj_ptr = js_obj_ptr(regexp_obj);
631 uint8_t flags_mask = regexp_flags_mask(js, regexp_obj);
632
633 regex_cache_entry_t *cached = regex_cache_lookup(obj_ptr);
634 if (cached) {
635 out->code = cached->code;
636 out->match_data = cached->match_data;
637 out->jit_ready = cached->jit_ready;
638 return true;
639 }
640
641 ant_offset_t source_off = lkp(js, regexp_obj, "source", 6);
642 if (source_off == 0) return false;
643 ant_value_t source_val = js_propref_load(js, source_off);
644 if (vtype(source_val) != T_STR) return false;
645
646 ant_offset_t plen, poff = vstr(js, source_val, &plen);
647 const char *pattern_ptr = (char *)(uintptr_t)(poff);
648
649 char pcre2_pattern[4096];
650 size_t pcre2_len = js_to_pcre2_pattern(
651 pattern_ptr, plen, pcre2_pattern, sizeof(pcre2_pattern),
652 (flags_mask & REGEXP_FLAG_UNICODE_SET) != 0
653 );
654
655 uint32_t options = PCRE2_UTF | PCRE2_UCP | PCRE2_MATCH_UNSET_BACKREF | PCRE2_DUPNAMES;
656 if (flags_mask & REGEXP_FLAG_IGNORE_CASE) options |= PCRE2_CASELESS;
657 if (flags_mask & REGEXP_FLAG_MULTILINE) options |= PCRE2_MULTILINE;
658 if (flags_mask & REGEXP_FLAG_DOTALL) options |= PCRE2_DOTALL;
659
660 int errcode;
661 PCRE2_SIZE erroffset;
662 pcre2_code *re = pcre2_compile((PCRE2_SPTR)pcre2_pattern, pcre2_len, options, &errcode, &erroffset, NULL);
663 if (re == NULL) return false;
664
665 pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(re, NULL);
666 bool jit_ready = pcre2_jit_compile(re, PCRE2_JIT_COMPLETE) == 0;
667 regex_cache_insert(obj_ptr, re, match_data, jit_ready);
668 ant_value_t groups_meta = regexp_build_named_groups_meta(js, re);
669
670 if (is_err(groups_meta)) {
671 pcre2_match_data_free(match_data);
672 pcre2_code_free(re);
673 regex_cache_count--;
674 return false;
675 }
676
677 js_set_slot(regexp_obj, SLOT_REGEXP_NAMED_GROUPS, groups_meta);
678 out->code = re;
679 out->match_data = match_data;
680 out->jit_ready = jit_ready;
681
682 return true;
683}
684
685static ant_value_t builtin_RegExp(ant_t *js, ant_value_t *args, int nargs) {
686 bool pattern_is_regexp = false;
687 if (nargs > 0) {
688 ant_value_t is_re = is_regexp_like(js, args[0]);
689 if (is_err(is_re)) return is_re;
690 pattern_is_regexp = js_truthy(js, is_re);
691 }
692
693 if (vtype(js->new_target) == T_UNDEF && nargs > 0 && pattern_is_regexp) {
694 if (nargs < 2 || vtype(args[1]) == T_UNDEF) {
695 ant_value_t ctor = js_getprop_fallback(js, args[0], "constructor");
696 if (is_err(ctor)) return ctor;
697 ant_value_t regexp_ctor = js_get(js, js_glob(js), "RegExp");
698 if (is_err(regexp_ctor)) return regexp_ctor;
699 if (same_ctor_identity(js, ctor, regexp_ctor)) return args[0];
700 }
701 }
702
703 ant_value_t regexp_obj = js->this_val;
704 bool use_this = (vtype(js->new_target) != T_UNDEF && vtype(regexp_obj) == T_OBJ);
705
706 if (!use_this) {
707 regexp_obj = mkobj(js, 0);
708 if (is_err(regexp_obj)) return regexp_obj;
709 }
710
711 ant_value_t regexp_proto = js_get_ctor_proto(js, "RegExp", 6);
712 ant_value_t instance_proto = js_instance_proto_from_new_target(js, regexp_proto);
713
714 if (is_object_type(instance_proto)) js_set_proto_init(regexp_obj, instance_proto);
715 if (vtype(js->new_target) == T_FUNC || vtype(js->new_target) == T_CFUNC) {
716 js_set_slot(regexp_obj, SLOT_CTOR, js->new_target);
717 }
718
719 ant_value_t pattern = js_mkstr(js, "", 0);
720 ant_value_t flags = js_mkstr(js, "", 0);
721 if (nargs > 0) {
722 if (pattern_is_regexp) {
723 ant_value_t src = js_getprop_fallback(js, args[0], "source");
724 if (is_err(src)) return src;
725 pattern = js_tostring_val(js, src);
726 if (is_err(pattern)) return pattern;
727 if (nargs >= 2 && vtype(args[1]) != T_UNDEF) {
728 flags = js_tostring_val(js, args[1]);
729 } else {
730 ant_value_t fl = js_getprop_fallback(js, args[0], "flags");
731 if (is_err(fl)) return fl;
732 flags = js_tostring_val(js, fl);
733 }
734 if (is_err(flags)) return flags;
735 } else if (vtype(args[0]) == T_STR) {
736 pattern = args[0];
737 if (nargs > 1 && vtype(args[1]) == T_STR) flags = args[1];
738 } else if (vtype(args[0]) != T_UNDEF) {
739 ant_value_t s = js_tostring_val(js, args[0]);
740 if (is_err(s)) return s;
741 pattern = s;
742 if (nargs > 1 && vtype(args[1]) == T_STR) flags = args[1];
743 }
744 }
745
746 js_mkprop_fast(js, regexp_obj, "source", 6, pattern);
747 ant_offset_t flags_len, flags_off = vstr(js, flags, &flags_len);
748 regexp_init_flags(js, regexp_obj, (const char *)(uintptr_t)(flags_off), flags_len, true);
749
750 return regexp_obj;
751}
752
753static ant_value_t builtin_regexp_groups_getter(ant_t *js, ant_value_t *args, int nargs) {
754 ant_value_t result_arr = js->this_val;
755 if (!is_object_type(result_arr)) return js_mkundef();
756
757 ant_value_t cached = js_get_slot(result_arr, SLOT_REGEXP_GROUPS_CACHE);
758 if (is_object_type(cached)) return cached;
759
760 ant_value_t meta = js_get_slot(result_arr, SLOT_REGEXP_RESULT_GROUPS);
761 if (!is_object_type(meta)) return js_mkundef();
762
763 ant_value_t groups = js_mkobj(js);
764 if (is_err(groups)) return groups;
765 js_set_proto_init(groups, js_mknull());
766
767 for (ant_offset_t i = 0; ; i += 2) {
768 ant_value_t name = js_arr_get(js, meta, i);
769 if (vtype(name) == T_UNDEF) break;
770 ant_value_t index_val = js_arr_get(js, meta, i + 1);
771 ant_offset_t index = (vtype(index_val) == T_NUM) ? (ant_offset_t)tod(index_val) : 0;
772 char idxstr[16];
773 (void)uint_to_str(idxstr, sizeof(idxstr), (uint64_t)index);
774 ant_value_t value = js_getprop_fallback(js, result_arr, idxstr);
775 ant_offset_t name_len, name_off = vstr(js, name, &name_len);
776 ant_value_t status = setprop_cstr(js, groups, (const char *)(uintptr_t)name_off, (size_t)name_len, value);
777 if (is_err(status)) return status;
778 }
779
780 js_set_slot(result_arr, SLOT_REGEXP_GROUPS_CACHE, groups);
781 return groups;
782}
783
784static ant_value_t regexp_build_indices_pair(ant_t *js, PCRE2_SIZE start, PCRE2_SIZE end) {
785 if (start == PCRE2_UNSET) return js_mkundef();
786
787 ant_value_t pair = js_mkarr(js);
788 if (is_err(pair)) return pair;
789 js_arr_push(js, pair, tov((double)start));
790 js_arr_push(js, pair, tov((double)end));
791
792 return pair;
793}
794
795static ant_value_t regexp_build_indices_groups(
796 ant_t *js,
797 ant_value_t groups_meta,
798 ant_value_t indices_arr
799) {
800 ant_value_t groups = js_mkobj(js);
801 if (is_err(groups)) return groups;
802 js_set_proto_init(groups, js_mknull());
803
804 for (ant_offset_t i = 0; ; i += 2) {
805 ant_value_t name = js_arr_get(js, groups_meta, i);
806 if (vtype(name) == T_UNDEF) break;
807
808 ant_value_t index_val = js_arr_get(js, groups_meta, i + 1);
809 ant_offset_t index = (vtype(index_val) == T_NUM) ? (ant_offset_t)tod(index_val) : 0;
810 char idxstr[16];
811 (void)uint_to_str(idxstr, sizeof(idxstr), (uint64_t)index);
812
813 ant_value_t value = js_getprop_fallback(js, indices_arr, idxstr);
814 ant_offset_t name_len, name_off = vstr(js, name, &name_len);
815 ant_value_t status = setprop_cstr(js, groups, (const char *)(uintptr_t)name_off, (size_t)name_len, value);
816 if (is_err(status)) return status;
817 }
818
819 return groups;
820}
821
822static ant_value_t regexp_build_indices_result(
823 ant_t *js,
824 ant_value_t regexp,
825 PCRE2_SIZE *ovector,
826 uint32_t ovcount
827) {
828 ant_value_t indices_arr = js_mkarr(js);
829 if (is_err(indices_arr)) return indices_arr;
830
831 for (uint32_t i = 0; i < ovcount && i < 32; i++) {
832 ant_value_t pair = regexp_build_indices_pair(js, ovector[2*i], ovector[2*i+1]);
833 if (is_err(pair)) return pair;
834 js_arr_push(js, indices_arr, pair);
835 }
836
837 ant_value_t groups_meta = js_get_slot(regexp, SLOT_REGEXP_NAMED_GROUPS);
838 if (is_object_type(groups_meta)) {
839 ant_value_t groups = regexp_build_indices_groups(js, groups_meta, indices_arr);
840 if (is_err(groups)) return groups;
841 if (is_err(setprop_cstr(js, indices_arr, "groups", 6, groups))) return js_mkerr(js, "oom");
842 } else if (is_err(setprop_cstr(js, indices_arr, "groups", 6, js_mkundef()))) return js_mkerr(js, "oom");
843
844 return indices_arr;
845}
846
847static ant_value_t regexp_exec_internal(ant_t *js, ant_value_t regexp, ant_value_t str_arg, bool truthy_only) {
848 ant_offset_t str_len, str_off = vstr(js, str_arg, &str_len);
849 const char *str_ptr = (char *)(uintptr_t)(str_off);
850 uint8_t flags_mask = regexp_flags_mask(js, regexp);
851
852 bool global_flag = (flags_mask & REGEXP_FLAG_GLOBAL) != 0;
853 bool has_indices = (flags_mask & REGEXP_FLAG_HAS_INDICES) != 0;
854 bool sticky_flag = (flags_mask & REGEXP_FLAG_STICKY) != 0;
855
856 // TODO: reduce nesting
857 PCRE2_SIZE start_offset = 0;
858 if (global_flag || sticky_flag) {
859 ant_offset_t lastindex_off = lkp(js, regexp, "lastIndex", 9);
860 if (lastindex_off != 0) {
861 ant_value_t li_val = js_propref_load(js, lastindex_off);
862 if (vtype(li_val) == T_NUM) {
863 double li = tod(li_val);
864 if (li >= 0 && li <= (double)str_len) start_offset = (PCRE2_SIZE)li;
865 else {
866 if (is_err(setprop_cstr(js, regexp, "lastIndex", 9, tov(0)))) return js_mkerr(js, "oom");
867 return js_mknull();
868 }
869 }
870 }
871 }
872
873 compiled_regex_t compiled;
874 if (!regex_get_or_compile(js, regexp, &compiled)) return js_mknull();
875
876 uint32_t match_options = 0;
877 if (sticky_flag) match_options |= PCRE2_ANCHORED;
878
879 int rc;
880 if (compiled.jit_ready && !sticky_flag) {
881 rc = pcre2_jit_match(compiled.code, (PCRE2_SPTR)str_ptr, str_len, start_offset, match_options, compiled.match_data, NULL);
882 } else rc = pcre2_match(compiled.code, (PCRE2_SPTR)str_ptr, str_len, start_offset, match_options, compiled.match_data, NULL);
883
884 if (rc < 0) {
885 if ((global_flag || sticky_flag) && is_err(setprop_cstr(js, regexp, "lastIndex", 9, tov(0)))) {
886 return js_mkerr(js, "oom");
887 }
888 return js_mknull();
889 }
890
891 PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(compiled.match_data);
892 uint32_t ovcount = pcre2_get_ovector_count(compiled.match_data);
893
894 update_regexp_statics(js, str_ptr, ovector, ovcount);
895
896 if (global_flag || sticky_flag) {
897 ant_value_t next_idx = tov((double)ovector[1]);
898 if (is_err(setprop_cstr(js, regexp, "lastIndex", 9, next_idx))) return js_mkerr(js, "oom");
899 }
900
901 if (truthy_only) return js_true;
902
903 ant_value_t result_arr = js_mkarr(js);
904 if (is_err(result_arr)) return result_arr;
905 for (uint32_t i = 0; i < ovcount && i < 32; i++) {
906 PCRE2_SIZE start = ovector[2*i];
907 PCRE2_SIZE end = ovector[2*i+1];
908 if (start == PCRE2_UNSET) {
909 js_arr_push(js, result_arr, js_mkundef());
910 } else {
911 ant_value_t match_str = js_mkstr(js, str_ptr + start, end - start);
912 js_arr_push(js, result_arr, match_str);
913 }
914 }
915
916 if (is_err(setprop_cstr(js, result_arr, "index", 5, tov((double)ovector[0])))) return js_mkerr(js, "oom");
917 if (is_err(setprop_cstr(js, result_arr, "input", 5, str_arg))) return js_mkerr(js, "oom");
918
919 ant_value_t groups_meta = js_get_slot(regexp, SLOT_REGEXP_NAMED_GROUPS);
920 if (is_object_type(groups_meta)) {
921 js_set_slot(result_arr, SLOT_REGEXP_RESULT_GROUPS, groups_meta);
922 js_set_slot(result_arr, SLOT_REGEXP_GROUPS_CACHE, js_mkundef());
923 js_set_getter_desc(js, js_as_obj(result_arr), "groups", 6, js_mkfun(builtin_regexp_groups_getter), JS_DESC_E | JS_DESC_C);
924 } else if (is_err(setprop_cstr(js, result_arr, "groups", 6, js_mkundef()))) return js_mkerr(js, "oom");
925
926 if (has_indices) {
927 ant_value_t indices = regexp_build_indices_result(js, regexp, ovector, ovcount);
928 if (is_err(indices)) return indices;
929 if (is_err(setprop_cstr(js, result_arr, "indices", 7, indices))) return js_mkerr(js, "oom");
930 }
931
932 return result_arr;
933}
934
935static ant_value_t builtin_regexp_exec(ant_t *js, ant_value_t *args, int nargs) {
936 ant_value_t regexp = js->this_val;
937 if (vtype(regexp) != T_OBJ) return js_mkerr(js, "exec called on non-regexp");
938 if (nargs < 1) return js_mknull();
939
940 ant_value_t str_arg = args[0];
941 if (vtype(str_arg) != T_STR) return js_mknull();
942
943 return regexp_exec_internal(js, regexp, str_arg, false);
944}
945
946static ant_value_t builtin_regexp_toString(ant_t *js, ant_value_t *args, int nargs) {
947 ant_value_t regexp = js->this_val;
948 if (!is_object_type(regexp))
949 return js_mkerr_typed(js, JS_ERR_TYPE, "toString called on non-object");
950
951 ant_value_t source_val = js_getprop_fallback(js, regexp, "source");
952 if (is_err(source_val)) return source_val;
953 ant_value_t source_str = js_tostring_val(js, source_val);
954 if (is_err(source_str)) return source_str;
955
956 ant_value_t flags_val = js_getprop_fallback(js, regexp, "flags");
957 if (is_err(flags_val)) return flags_val;
958 ant_value_t flags_str = js_tostring_val(js, flags_val);
959 if (is_err(flags_str)) return flags_str;
960
961 ant_offset_t src_len, src_off = vstr(js, source_str, &src_len);
962 ant_offset_t fl_len, fl_off = vstr(js, flags_str, &fl_len);
963
964 size_t total = 1 + src_len + 1 + fl_len;
965 char *buf = ant_calloc(total + 1);
966 if (!buf) return js_mkerr(js, "oom");
967 size_t n = 0;
968 buf[n++] = '/';
969 memcpy(buf + n, (const void *)(uintptr_t)src_off, src_len); n += src_len;
970 buf[n++] = '/';
971 memcpy(buf + n, (const void *)(uintptr_t)fl_off, fl_len); n += fl_len;
972
973 ant_value_t result = js_mkstr(js, buf, n);
974 free(buf);
975 return result;
976}
977
978static ant_value_t builtin_regexp_compile(ant_t *js, ant_value_t *args, int nargs) {
979 ant_value_t rx = js->this_val;
980 if (!is_object_type(rx))
981 return js_mkerr_typed(js, JS_ERR_TYPE, "compile called on non-object");
982
983 ant_value_t pattern = js_mkstr(js, "", 0);
984 ant_value_t flags = js_mkstr(js, "", 0);
985
986 if (nargs > 0 && vtype(args[0]) != T_UNDEF) {
987 ant_value_t is_re = is_regexp_like(js, args[0]);
988 if (is_err(is_re)) return is_re;
989 if (js_truthy(js, is_re)) {
990 ant_value_t src = js_getprop_fallback(js, args[0], "source");
991 if (is_err(src)) return src;
992 pattern = js_tostring_val(js, src);
993 if (is_err(pattern)) return pattern;
994 ant_value_t fl = js_getprop_fallback(js, args[0], "flags");
995 if (is_err(fl)) return fl;
996 flags = js_tostring_val(js, fl);
997 if (is_err(flags)) return flags;
998 } else {
999 pattern = js_tostring_val(js, args[0]);
1000 if (is_err(pattern)) return pattern;
1001 }
1002 }
1003 if (nargs > 1 && vtype(args[1]) != T_UNDEF) {
1004 flags = js_tostring_val(js, args[1]);
1005 if (is_err(flags)) return flags;
1006 }
1007
1008 js_setprop(js, rx, js_mkstr(js, "source", 6), pattern);
1009 ant_offset_t flen, foff = vstr(js, flags, &flen);
1010 regexp_init_flags(js, rx, (const char *)(uintptr_t)(foff), flen, false);
1011
1012 ant_object_t *rx_ptr = js_obj_ptr(rx);
1013 for (size_t i = 0; i < regex_cache_count; i++) {
1014 if (regex_cache[i].obj == rx_ptr) {
1015 pcre2_match_data_free(regex_cache[i].match_data);
1016 pcre2_code_free(regex_cache[i].code);
1017 regex_cache[i] = regex_cache[--regex_cache_count];
1018 break;
1019 }
1020 }
1021
1022 return rx;
1023}
1024
// True for the characters RegExp.escape backslash-escapes verbatim:
// ^ $ \ . * + ? ( ) [ ] { } | and the solidus '/'.
static inline bool is_syntax_char(char c) {
  switch (c) {
    case '^': case '$': case '\\': case '.': case '*':
    case '+': case '?': case '(':  case ')': case '[':
    case ']': case '{': case '}':  case '|': case '/':
      return true;
    default:
      return false;
  }
}
1031
// True for the ASCII punctuators that RegExp.escape hex-escapes:
// , - : ; < = > @ ! " # % & ' ` ~
static inline bool is_other_punctuator(char c) {
  static const char punctuators[] = ",-:;<=>@!\"#%&'`~";
  // explicit length keeps the terminating NUL out of the searched set
  return memchr(punctuators, c, sizeof punctuators - 1) != NULL;
}
1038
1039static ant_value_t builtin_regexp_escape(ant_t *js, ant_value_t *args, int nargs) {
1040 if (nargs < 1 || vtype(args[0]) != T_STR)
1041 return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp.escape requires a string argument");
1042
1043 ant_offset_t slen, soff = vstr(js, args[0], &slen);
1044 const char *src = (const char *)(uintptr_t)(soff);
1045
1046 size_t buf_cap = slen * 6 + 1;
1047 char *buf = ant_calloc(buf_cap);
1048 if (!buf) return js_mkerr(js, "oom");
1049 size_t di = 0;
1050 bool first = true;
1051
1052 for (size_t si = 0; si < slen; ) {
1053 unsigned char c = (unsigned char)src[si];
1054
1055 if (c >= 0x80) {
1056 utf8proc_int32_t cp;
1057 int bytes = (int)utf8_next(
1058 (const utf8proc_uint8_t *)&src[si],
1059 (utf8proc_ssize_t)(slen - si), &cp
1060 );
1061 for (int b = 0; b < bytes && si < slen; b++)
1062 buf[di++] = src[si++];
1063 first = false;
1064 continue;
1065 }
1066
1067 if (first && ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) {
1068 di += snprintf(buf + di, buf_cap - di, "\\x%02x", c);
1069 si++; first = false;
1070 continue;
1071 }
1072
1073 if (is_syntax_char(c)) {
1074 buf[di++] = '\\'; buf[di++] = c;
1075 si++; first = false;
1076 continue;
1077 }
1078
1079 if (is_other_punctuator(c) || c == ' ' || c == '\t' || c == '\n' ||
1080 c == '\r' || c == '\v' || c == '\f') {
1081 di += snprintf(buf + di, buf_cap - di, "\\x%02x", c);
1082 si++; first = false;
1083 continue;
1084 }
1085
1086 buf[di++] = c;
1087 si++; first = false;
1088 }
1089
1090 ant_value_t result = js_mkstr(js, buf, di);
1091 free(buf);
1092 return result;
1093}
1094
1095static ant_value_t regexp_exec_with_exec_fn(ant_t *js, ant_value_t rx, ant_value_t str, ant_value_t exec_fn) {
1096 if (vtype(exec_fn) == T_FUNC || vtype(exec_fn) == T_CFUNC) {
1097 ant_value_t call_args[1] = { str };
1098 ant_value_t result = sv_vm_call(js->vm, js, exec_fn, rx, call_args, 1, NULL, false);
1099 if (is_err(result)) return result;
1100 if (!is_object_type(result) && vtype(result) != T_NULL)
1101 return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp exec returned non-object");
1102 return result;
1103 }
1104
1105 ant_value_t call_args[1] = { str };
1106 ant_value_t saved = js->this_val;
1107 js->this_val = rx;
1108 ant_value_t result = builtin_regexp_exec(js, call_args, 1);
1109 js->this_val = saved;
1110
1111 return result;
1112}
1113
1114static ant_value_t regexp_exec_abstract(ant_t *js, ant_value_t rx, ant_value_t str) {
1115 ant_value_t exec_fn = js_get(js, rx, "exec");
1116 if (is_err(exec_fn)) return exec_fn;
1117 return regexp_exec_with_exec_fn(js, rx, str, exec_fn);
1118}
1119
1120bool regexp_exec_truthy_try_fast(
1121 ant_t *js,
1122 ant_value_t call_func,
1123 ant_value_t regexp,
1124 ant_value_t arg,
1125 ant_value_t *out_result
1126) {
1127 if (!out_result || vtype(call_func) != T_CFUNC) return false;
1128 if (!js_cfunc_same_entrypoint(call_func, builtin_regexp_exec)) return false;
1129 if (!is_object_type(regexp) || vtype(arg) != T_STR) return false;
1130
1131 ant_value_t result = regexp_exec_internal(js, regexp, arg, true);
1132 if (is_err(result)) {
1133 *out_result = result;
1134 return true;
1135 }
1136
1137 *out_result = mkval(T_BOOL, vtype(result) != T_NULL ? 1 : 0);
1138 return true;
1139}
1140
1141static ant_value_t builtin_regexp_test(ant_t *js, ant_value_t *args, int nargs) {
1142 ant_value_t regexp = js->this_val;
1143 if (!is_object_type(regexp))
1144 return js_mkerr_typed(js, JS_ERR_TYPE, "test called on non-object");
1145 ant_value_t str_arg = nargs > 0 ? js_tostring_val(js, args[0]) : js_mkstr(js, "undefined", 9);
1146 if (is_err(str_arg)) return str_arg;
1147 ant_value_t exec_fn = js_get(js, regexp, "exec");
1148 if (is_err(exec_fn)) return exec_fn;
1149
1150 ant_value_t result;
1151 if (vtype(exec_fn) == T_CFUNC && js_cfunc_same_entrypoint(exec_fn, builtin_regexp_exec)) {
1152 result = regexp_exec_internal(js, regexp, str_arg, true);
1153 } else result = regexp_exec_with_exec_fn(js, regexp, str_arg, exec_fn);
1154
1155 if (is_err(result)) return result;
1156 return mkval(T_BOOL, vtype(result) != T_NULL ? 1 : 0);
1157}
1158
1159static ant_value_t builtin_regexp_flags_getter(ant_t *js, ant_value_t *args, int nargs) {
1160 ant_value_t rx = js->this_val;
1161 if (!is_object_type(rx))
1162 return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp.prototype.flags called on non-object");
1163
1164 char buf[16]; int n = 0;
1165 ant_value_t v = js_getprop_fallback(js, rx, "hasIndices");
1166
1167 if (is_err(v)) return v;
1168 if (js_truthy(js, v)) buf[n++] = 'd';
1169
1170 v = js_getprop_fallback(js, rx, "global");
1171 if (is_err(v)) return v;
1172 if (js_truthy(js, v)) buf[n++] = 'g';
1173
1174 v = js_getprop_fallback(js, rx, "ignoreCase");
1175 if (is_err(v)) return v;
1176 if (js_truthy(js, v)) buf[n++] = 'i';
1177
1178 v = js_getprop_fallback(js, rx, "multiline");
1179 if (is_err(v)) return v;
1180 if (js_truthy(js, v)) buf[n++] = 'm';
1181
1182 v = js_getprop_fallback(js, rx, "dotAll");
1183 if (is_err(v)) return v;
1184 if (js_truthy(js, v)) buf[n++] = 's';
1185
1186 v = js_getprop_fallback(js, rx, "unicode");
1187 if (is_err(v)) return v;
1188 if (js_truthy(js, v)) buf[n++] = 'u';
1189
1190 v = js_getprop_fallback(js, rx, "unicodeSets");
1191 if (is_err(v)) return v;
1192 if (js_truthy(js, v)) buf[n++] = 'v';
1193
1194 v = js_getprop_fallback(js, rx, "sticky");
1195 if (is_err(v)) return v;
1196 if (js_truthy(js, v)) buf[n++] = 'y';
1197
1198 return js_mkstr(js, buf, n);
1199}
1200
1201static ant_value_t builtin_regexp_symbol_match(ant_t *js, ant_value_t *args, int nargs) {
1202 ant_value_t rx = js->this_val;
1203 if (!is_object_type(rx))
1204 return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp.prototype[@@match] called on non-object");
1205
1206 ant_value_t str = nargs > 0 ? js_tostring_val(js, args[0]) : js_mkstr(js, "undefined", 9);
1207 if (is_err(str)) return str;
1208
1209 ant_value_t global_val = js_getprop_fallback(js, rx, "global");
1210 if (is_err(global_val)) return global_val;
1211
1212 if (!js_truthy(js, global_val))
1213 return regexp_exec_abstract(js, rx, str);
1214
1215 ant_value_t unicode_val = js_getprop_fallback(js, rx, "unicode");
1216 if (is_err(unicode_val)) return unicode_val;
1217
1218 bool full_unicode = js_truthy(js, unicode_val);
1219 js_setprop(js, rx, js_mkstr(js, "lastIndex", 9), tov(0));
1220
1221 ant_value_t A = js_mkarr(js);
1222 if (is_err(A)) return A;
1223 ant_offset_t n = 0;
1224
1225 for (;;) {
1226 ant_value_t result = regexp_exec_abstract(js, rx, str);
1227 if (is_err(result)) return result;
1228 if (vtype(result) == T_NULL) return n == 0 ? js_mknull() : mkval(T_ARR, vdata(A));
1229
1230 ant_value_t match_str = js_tostring_val(js, js_arr_get(js, result, 0));
1231 if (is_err(match_str)) return match_str;
1232 js_arr_push(js, A, match_str);
1233 n++;
1234
1235 ant_offset_t mlen;
1236 vstr(js, match_str, &mlen);
1237 if (mlen == 0) {
1238 ant_value_t li_val = js_getprop_fallback(js, rx, "lastIndex");
1239 if (is_err(li_val)) return li_val;
1240 double li = vtype(li_val) == T_NUM ? tod(li_val) : 0;
1241 ant_offset_t str_len, str_off = vstr(js, str, &str_len);
1242 double advance = 1;
1243 if (full_unicode && li < (double)str_len) {
1244 advance = (double)utf8_char_len_at((const char *)(uintptr_t)(str_off), str_len, (ant_offset_t)li);
1245 } js_setprop(js, rx, js_mkstr(js, "lastIndex", 9), tov(li + advance));
1246 }
1247 }
1248}
1249
1250
1251static ant_value_t regexp_matchall_next(ant_t *js, ant_value_t *args, int nargs) {
1252 ant_value_t iter = js->this_val;
1253 ant_value_t rx = js_get_slot(iter, SLOT_MATCHALL_RX);
1254 ant_value_t str = js_get_slot(iter, SLOT_MATCHALL_STR);
1255 ant_value_t done_val = js_get_slot(iter, SLOT_MATCHALL_DONE);
1256
1257 if (js_truthy(js, done_val))
1258 return js_iter_result(js, false, js_mkundef());
1259
1260 ant_value_t result = regexp_exec_abstract(js, rx, str);
1261 if (is_err(result)) return result;
1262
1263 if (vtype(result) == T_NULL) {
1264 js_set_slot(iter, SLOT_MATCHALL_DONE, js_true);
1265 return js_iter_result(js, false, js_mkundef());
1266 }
1267
1268 ant_value_t global_val = js_getprop_fallback(js, rx, "global");
1269 if (js_truthy(js, global_val)) {
1270 ant_value_t match_str = js_tostring_val(js, js_arr_get(js, result, 0));
1271 if (is_err(match_str)) return match_str;
1272 ant_offset_t mlen;
1273 vstr(js, match_str, &mlen);
1274 if (mlen == 0) {
1275 ant_value_t li_val = js_getprop_fallback(js, rx, "lastIndex");
1276 double li = vtype(li_val) == T_NUM ? tod(li_val) : 0;
1277 js_setprop(js, rx, js_mkstr(js, "lastIndex", 9), tov(li + 1));
1278 }
1279 } else js_set_slot(iter, SLOT_MATCHALL_DONE, js_true);
1280
1281 return js_iter_result(js, true, result);
1282}
1283
1284static ant_value_t builtin_regexp_symbol_matchAll(ant_t *js, ant_value_t *args, int nargs) {
1285 ant_value_t rx = js->this_val;
1286 if (!is_object_type(rx))
1287 return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp.prototype[@@matchAll] called on non-object");
1288
1289 ant_value_t str = nargs > 0 ? js_tostring_val(js, args[0]) : js_mkstr(js, "undefined", 9);
1290 if (is_err(str)) return str;
1291
1292 ant_value_t flags_val = js_getprop_fallback(js, rx, "flags");
1293 if (is_err(flags_val)) return flags_val;
1294 ant_value_t flags_str = js_tostring_val(js, flags_val);
1295 if (is_err(flags_str)) return flags_str;
1296
1297 ant_value_t source_val = js_getprop_fallback(js, rx, "source");
1298 if (is_err(source_val)) return source_val;
1299
1300 ant_value_t ctor_args[2] = { source_val, flags_str };
1301 ant_value_t regexp_ctor = js_get(js, js_glob(js), "RegExp");
1302 ant_value_t new_rx = sv_vm_call(js->vm, js, regexp_ctor, js_mkundef(), ctor_args, 2, NULL, true);
1303 if (is_err(new_rx)) return new_rx;
1304
1305 ant_value_t li_val = js_getprop_fallback(js, rx, "lastIndex");
1306 js_setprop(js, new_rx, js_mkstr(js, "lastIndex", 9), li_val);
1307
1308 ant_value_t iter = js_mkobj(js);
1309 js_set_slot(iter, SLOT_MATCHALL_RX, new_rx);
1310 js_set_slot(iter, SLOT_MATCHALL_STR, str);
1311 js_set_slot(iter, SLOT_MATCHALL_DONE, js_false);
1312
1313 js_set_proto_init(iter, regexp_matchall_iter_proto_val);
1314
1315 return iter;
1316}
1317
1318static ant_value_t builtin_string_matchAll(ant_t *js, ant_value_t *args, int nargs) {
1319 ant_value_t this_unwrapped = unwrap_primitive(js, js->this_val);
1320 ant_value_t str = js_tostring_val(js, this_unwrapped);
1321 if (is_err(str)) return str;
1322 if (nargs < 1) return js_mkerr_typed(js, JS_ERR_TYPE, "matchAll requires at least 1 argument");
1323
1324 if (is_object_type(args[0])) {
1325 ant_value_t is_re = is_regexp_like(js, args[0]);
1326 if (js_truthy(js, is_re)) {
1327 ant_value_t flags_val = js_getprop_fallback(js, args[0], "flags");
1328 if (is_err(flags_val)) return flags_val;
1329
1330 ant_value_t flags_str = js_tostring_val(js, flags_val);
1331 ant_offset_t flen, foff = vstr(js, flags_str, &flen);
1332
1333 const char *fp = (const char *)(uintptr_t)(foff);
1334 bool has_g = false;
1335 for (ant_offset_t i = 0; i < flen; i++) if (fp[i] == 'g') has_g = true;
1336 if (!has_g) return js_mkerr_typed(js, JS_ERR_TYPE, "String.prototype.matchAll called with a non-global RegExp");
1337 }
1338
1339 bool called = false;
1340 ant_value_t call_args[1] = { str };
1341 ant_value_t dispatched = maybe_call_symbol_method(
1342 js, args[0], get_matchAll_sym(), args[0], call_args, 1, &called
1343 );
1344
1345 if (is_err(dispatched)) return dispatched;
1346 if (called) return dispatched;
1347 }
1348
1349 ant_value_t pattern_str = js_tostring_val(js, args[0]);
1350 if (is_err(pattern_str)) return pattern_str;
1351
1352 ant_value_t ctor_args[2] = { pattern_str, js_mkstr(js, "g", 1) };
1353 ant_value_t regexp_ctor = js_get(js, js_glob(js), "RegExp");
1354 ant_value_t rx = sv_vm_call(js->vm, js, regexp_ctor, js_mkundef(), ctor_args, 2, NULL, true);
1355 if (is_err(rx)) return rx;
1356
1357 ant_value_t ma_args[1] = { str };
1358 js->this_val = rx;
1359
1360 return builtin_regexp_symbol_matchAll(js, ma_args, 1);
1361}
1362
// RegExp.prototype[@@replace](string, replaceValue).
// Phase 1 collects exec results (one for a non-global regexp, all of them for a global
// one). Phase 2 walks those results and builds the output in a growable C buffer,
// copying the untouched text between matches and appending either the callback's
// return value or the expanded replacement template.
// Returns the new string, a TypeError for a non-object receiver, an "oom" error when a
// buffer cannot be (re)allocated, or any error raised by exec / conversions / the callback.
static ant_value_t builtin_regexp_symbol_replace(ant_t *js, ant_value_t *args, int nargs) {
  ant_value_t rx = js->this_val;
  if (!is_object_type(rx))
    return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp.prototype[@@replace] called on non-object");

  ant_value_t str = nargs > 0 ? js_tostring_val(js, args[0]) : js_mkstr(js, "undefined", 9);
  if (is_err(str)) return str;
  ant_value_t replace_value = nargs > 1 ? args[1] : js_mkundef();
  bool func_replace = (vtype(replace_value) == T_FUNC || vtype(replace_value) == T_CFUNC);
  // a non-callable replacement is stringified once up front
  ant_value_t replace_str = js_mkundef();
  if (!func_replace) {
    replace_str = js_tostring_val(js, replace_value);
    if (is_err(replace_str)) return replace_str;
  }

  ant_value_t global_val = js_getprop_fallback(js, rx, "global");
  if (is_err(global_val)) return global_val;
  bool global = js_truthy(js, global_val);

  bool full_unicode = false;
  if (global) {
    ant_value_t unicode_val = js_getprop_fallback(js, rx, "unicode");
    if (is_err(unicode_val)) return unicode_val;
    full_unicode = js_truthy(js, unicode_val);
    // global replacement always scans from the start of the string
    js_setprop(js, rx, js_mkstr(js, "lastIndex", 9), tov(0));
  }

  ant_value_t results = js_mkarr(js);
  if (is_err(results)) return results;
  ant_offset_t nresults = 0;

  // Phase 1: gather match results
  for (;;) {
    ant_value_t result = regexp_exec_abstract(js, rx, str);
    if (is_err(result)) return result;
    if (vtype(result) == T_NULL) break;
    js_arr_push(js, results, result);
    nresults++;
    if (!global) break;

    ant_value_t match_str = js_tostring_val(js, js_arr_get(js, result, 0));
    if (is_err(match_str)) return match_str;
    ant_offset_t mlen; vstr(js, match_str, &mlen);
    if (mlen == 0) {
      // empty match: advance lastIndex by hand so the loop makes progress
      ant_value_t li_val = js_getprop_fallback(js, rx, "lastIndex");
      if (is_err(li_val)) return li_val;
      double li = vtype(li_val) == T_NUM ? tod(li_val) : 0;
      ant_offset_t sl, so = vstr(js, str, &sl);
      double advance = 1;
      if (full_unicode && li < (double)sl) {
        // presumably the byte length of the character at offset li - confirm against utf8_char_len_at
        advance = (double)utf8_char_len_at((const char *)(uintptr_t)(so), sl, (ant_offset_t)li);
      }
      js_setprop(js, rx, js_mkstr(js, "lastIndex", 9), tov(li + advance));
    }
  }

  ant_offset_t str_len, str_off = vstr(js, str, &str_len);
  size_t buf_cap = str_len + 256;
  char *buf = ant_calloc(buf_cap);
  if (!buf) return js_mkerr(js, "oom");
  size_t buf_len = 0;
  // offset in str up to which the input has already been emitted or replaced
  ant_offset_t next_src_pos = 0;

// Appends dlen bytes to buf, growing it when needed. On allocation failure it frees
// buf and returns an "oom" error from the enclosing function.
#define SB_APPEND(data, dlen) do { \
  if (buf_len + (dlen) >= buf_cap) { \
    buf_cap = (buf_len + (dlen) + 1) * 2; \
    char *nb = ant_realloc(buf, buf_cap); \
    if (!nb) { free(buf); return js_mkerr(js, "oom"); } \
    buf = nb; \
  } \
  memcpy(buf + buf_len, data, dlen); buf_len += (dlen); \
} while(0)

  // Phase 2: build the output from the collected results
  for (ant_offset_t i = 0; i < nresults; i++) {
    ant_value_t result = js_arr_get(js, results, i);
    ant_value_t matched = js_tostring_val(js, js_arr_get(js, result, 0));
    if (is_err(matched)) { free(buf); return matched; }
    ant_offset_t matched_len; vstr(js, matched, &matched_len);

    // match position: a missing / non-numeric / negative index counts as 0, and the
    // value is clamped to the string length
    ant_value_t pos_val = js_getprop_fallback(js, result, "index");
    ant_offset_t position = 0;
    if (!is_err(pos_val) && vtype(pos_val) == T_NUM) {
      double d = tod(pos_val);
      position = d < 0 ? 0 : (ant_offset_t)d;
    }
    if (position > str_len) position = str_len;

    ant_value_t replacement;
    if (func_replace) {
      // callback arguments: result elements (match + captures), then position, then the input.
      // At most 30 elements are forwarded so the two trailing arguments always fit in call_args[32].
      ant_offset_t ncaptures = js_arr_len(js, result);
      ant_value_t call_args[32];
      int ca = 0;
      for (ant_offset_t c = 0; c < ncaptures && ca < 30; c++)
        call_args[ca++] = js_arr_get(js, result, c);
      call_args[ca++] = tov((double)position);
      call_args[ca++] = str;
      replacement = sv_vm_call(js->vm, js, replace_value, js_mkundef(), call_args, ca, NULL, false);
    } else {
      replacement = replace_str;
    }
    if (is_err(replacement)) { free(buf); return replacement; }
    ant_value_t rep_str = js_tostring_val(js, replacement);
    if (is_err(rep_str)) { free(buf); return rep_str; }

    // results positioned before already-emitted text are skipped
    if (position >= next_src_pos) {
      // string address is re-read here, after the calls above
      str_off = vstr(js, str, &str_len);
      if (position > next_src_pos)
        SB_APPEND((const char *)(uintptr_t)(str_off + next_src_pos), position - next_src_pos);
      ant_offset_t rep_len, rep_off = vstr(js, rep_str, &rep_len);
      if (func_replace) {
        // callback output is inserted verbatim
        SB_APPEND((const char *)(uintptr_t)(rep_off), rep_len);
      } else {
        // template replacement: hand the captures to repl_template for expansion.
        // Up to 16 captures use the stack array, more are heap-allocated.
        ant_offset_t ncap = js_arr_len(js, result);
        int num_caps = ncap > 1 ? (int)(ncap - 1) : 0;
        repl_capture_t caps_buf[16], *caps = num_caps <= 16 ? caps_buf : ant_calloc(sizeof(repl_capture_t) * (size_t)num_caps);
        if (num_caps > 16 && !caps) {
          free(buf);
          return js_mkerr(js, "oom");
        }
        for (int ci = 0; ci < num_caps; ci++) {
          ant_value_t cap = js_arr_get(js, result, (ant_offset_t)(ci + 1));
          if (vtype(cap) == T_STR) { ant_offset_t cl, co = vstr(js, cap, &cl); caps[ci] = (repl_capture_t){ (const char *)(uintptr_t)(co), cl }; }
          // non-string captures are passed as { NULL, 0 }
          else caps[ci] = (repl_capture_t){ NULL, 0 };
        }
        ant_offset_t mlen, moff = vstr(js, matched, &mlen);
        str_off = vstr(js, str, &str_len);
        bool ok = repl_template(
          (const char *)(uintptr_t)(rep_off), rep_len,
          (const char *)(uintptr_t)(moff), mlen,
          (const char *)(uintptr_t)(str_off), str_len, position,
          caps, num_caps, &buf, &buf_len, &buf_cap
        );
        if (caps != caps_buf) free(caps);
        if (!ok) {
          free(buf);
          return js_mkerr(js, "oom");
        }
      }
      next_src_pos = position + matched_len;
    }
  }

  // copy whatever follows the last replaced match
  str_off = vstr(js, str, &str_len);
  if (next_src_pos < str_len)
    SB_APPEND((const char *)(uintptr_t)(str_off + next_src_pos), str_len - next_src_pos);

#undef SB_APPEND

  ant_value_t ret = js_mkstr(js, buf, buf_len);
  free(buf);
  return ret;
}
1514
1515static ant_value_t builtin_regexp_symbol_search(ant_t *js, ant_value_t *args, int nargs) {
1516 ant_value_t rx = js->this_val;
1517 if (!is_object_type(rx))
1518 return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp.prototype[@@search] called on non-object");
1519
1520 ant_value_t str = nargs > 0 ? js_tostring_val(js, args[0]) : js_mkstr(js, "undefined", 9);
1521 if (is_err(str)) return str;
1522
1523 ant_value_t prev_li = js_getprop_fallback(js, rx, "lastIndex");
1524 if (is_err(prev_li)) return prev_li;
1525 js_setprop(js, rx, js_mkstr(js, "lastIndex", 9), tov(0));
1526
1527 ant_value_t result = regexp_exec_abstract(js, rx, str);
1528 if (is_err(result)) return result;
1529
1530 ant_value_t cur_li = js_getprop_fallback(js, rx, "lastIndex");
1531 if (is_err(cur_li)) return cur_li;
1532 js_setprop(js, rx, js_mkstr(js, "lastIndex", 9), prev_li);
1533
1534 if (vtype(result) == T_NULL) return tov(-1);
1535
1536 ant_value_t idx = js_getprop_fallback(js, result, "index");
1537 if (is_err(idx)) return idx;
1538 return vtype(idx) == T_NUM ? idx : tov(-1);
1539}
1540
1541static ant_value_t builtin_regexp_symbol_split(ant_t *js, ant_value_t *args, int nargs) {
1542 ant_value_t rx = js_getthis(js);
1543 if (!is_object_type(rx))
1544 return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp.prototype[@@split] called on non-object");
1545
1546 ant_value_t str = nargs > 0 ? js_tostring_val(js, args[0]) : js_mkstr(js, "", 0);
1547 if (is_err(str)) return str;
1548
1549 ant_value_t ctor = js_get(js, rx, "constructor");
1550 if (is_err(ctor)) return ctor;
1551
1552 ant_value_t C;
1553 if (vtype(ctor) == T_UNDEF) {
1554 C = js_get(js, js_glob(js), "RegExp");
1555 } else if (!is_object_type(ctor)) {
1556 return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp.prototype[@@split]: constructor is not an object");
1557 } else {
1558 ant_value_t species = get_ctor_species_value(js, ctor);
1559 if (is_err(species)) return species;
1560 if (vtype(species) == T_UNDEF || vtype(species) == T_NULL)
1561 C = js_get(js, js_glob(js), "RegExp");
1562 else C = species;
1563 }
1564
1565 if (is_err(C)) return C;
1566 if (vtype(C) != T_FUNC && vtype(C) != T_CFUNC)
1567 return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp species is not a constructor");
1568
1569 ant_value_t flags_val = js_get(js, rx, "flags");
1570 if (is_err(flags_val)) return flags_val;
1571 ant_value_t flags_str = js_tostring_val(js, flags_val);
1572 if (is_err(flags_str)) return flags_str;
1573
1574 ant_offset_t flen, foff = vstr(js, flags_str, &flen);
1575 const char *fptr = (const char *)(uintptr_t)(foff);
1576 bool unicode_matching = false, has_sticky = false;
1577 for (ant_offset_t i = 0; i < flen; i++) {
1578 if (fptr[i] == 'u' || fptr[i] == 'v') unicode_matching = true;
1579 if (fptr[i] == 'y') has_sticky = true;
1580 }
1581
1582 ant_value_t new_flags;
1583 if (has_sticky) new_flags = flags_str; else {
1584 char fbuf[16];
1585 if (flen > 14) flen = 14;
1586 foff = vstr(js, flags_str, &flen);
1587 fptr = (const char *)(uintptr_t)(foff);
1588 memcpy(fbuf, fptr, flen);
1589 fbuf[flen] = 'y';
1590 new_flags = js_mkstr(js, fbuf, flen + 1);
1591 }
1592
1593 ant_value_t ctor_args[2] = { rx, new_flags };
1594 ant_value_t splitter = regexp_species_construct(js, rx, C, ctor_args, 2);
1595 if (is_err(splitter)) return splitter;
1596
1597 ant_value_t A = js_mkarr(js);
1598 if (is_err(A)) return A;
1599 ant_offset_t lengthA = 0;
1600
1601 uint32_t lim = UINT32_MAX;
1602 if (nargs >= 2 && vtype(args[1]) != T_UNDEF) {
1603 double d = tod(args[1]);
1604 if (d >= 0 && d <= UINT32_MAX) lim = (uint32_t)d;
1605 } if (lim == 0) return mkval(T_ARR, vdata(A));
1606
1607 ant_offset_t str_len, str_off = vstr(js, str, &str_len);
1608 ant_offset_t size = str_len;
1609
1610 if (size == 0) {
1611 ant_value_t z = regexp_exec_abstract(js, splitter, str);
1612 if (is_err(z)) return z;
1613 if (vtype(z) == T_NULL) js_arr_push(js, A, str);
1614 return mkval(T_ARR, vdata(A));
1615 }
1616
1617 ant_offset_t p = 0, q = p;
1618 ant_value_t lastIndex_key = js_mkstr(js, "lastIndex", 9);
1619
1620 while (q < size) {
1621 js_setprop(js, splitter, lastIndex_key, tov((double)q));
1622
1623 ant_value_t z = regexp_exec_abstract(js, splitter, str);
1624 if (is_err(z)) return z;
1625
1626 if (vtype(z) == T_NULL) {
1627 if (unicode_matching) {
1628 str_off = vstr(js, str, &str_len);
1629 q += utf8_char_len_at((const char *)(uintptr_t)(str_off), str_len, q);
1630 } else q++;
1631 continue;
1632 }
1633
1634 ant_value_t li_val = js_get(js, splitter, "lastIndex");
1635 if (is_err(li_val)) return li_val;
1636 double e_raw = vtype(li_val) == T_NUM ? tod(li_val) : 0;
1637 ant_offset_t e = (ant_offset_t)(e_raw < 0 ? 0 : (e_raw > (double)size ? (double)size : e_raw));
1638
1639 if (e == p) {
1640 if (unicode_matching) {
1641 str_off = vstr(js, str, &str_len);
1642 q += utf8_char_len_at((const char *)(uintptr_t)(str_off), str_len, q);
1643 } else q++;
1644 continue;
1645 }
1646
1647 str_off = vstr(js, str, NULL);
1648 ant_value_t T_val = js_mkstr(js, (char *)(uintptr_t)(str_off + p), q - p);
1649 js_arr_push(js, A, T_val);
1650 lengthA++;
1651 if (lengthA == lim) return mkval(T_ARR, vdata(A));
1652
1653 ant_offset_t num_caps = js_arr_len(js, z);
1654 for (ant_offset_t i = 1; i < num_caps; i++) {
1655 ant_value_t cap = js_arr_get(js, z, i);
1656 js_arr_push(js, A, cap);
1657 lengthA++;
1658 if (lengthA == lim) return mkval(T_ARR, vdata(A));
1659 }
1660
1661 p = e;
1662 q = p;
1663 }
1664
1665 str_off = vstr(js, str, &str_len);
1666 ant_value_t trailing = js_mkstr(js, (char *)(uintptr_t)(str_off + p), str_len - p);
1667 js_arr_push(js, A, trailing);
1668 return mkval(T_ARR, vdata(A));
1669}
1670
1671ant_value_t do_regex_match_pcre2(ant_t *js, regex_match_args_t args) {
1672 char pcre2_pattern[4096];
1673 size_t pcre2_len = js_to_pcre2_pattern(args.pattern_ptr, args.pattern_len, pcre2_pattern, sizeof(pcre2_pattern), false);
1674
1675 uint32_t options = PCRE2_UTF | PCRE2_UCP | PCRE2_MATCH_UNSET_BACKREF | PCRE2_DUPNAMES;
1676 if (args.ignore_case) options |= PCRE2_CASELESS;
1677 if (args.multiline) options |= PCRE2_MULTILINE;
1678
1679 int errcode;
1680 PCRE2_SIZE erroffset;
1681 pcre2_code *re = pcre2_compile((PCRE2_SPTR)pcre2_pattern, pcre2_len, options, &errcode, &erroffset, NULL);
1682 if (re == NULL) return js_mknull();
1683
1684 pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(re, NULL);
1685 uint32_t capture_count;
1686 pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &capture_count);
1687
1688 ant_value_t result_arr = js_mkarr(js);
1689 if (is_err(result_arr)) {
1690 pcre2_match_data_free(match_data);
1691 pcre2_code_free(re);
1692 return result_arr;
1693 }
1694
1695 PCRE2_SIZE pos = 0;
1696 int match_count = 0;
1697
1698 while (pos <= (PCRE2_SIZE)args.str_len) {
1699 int rc = pcre2_match(re, (PCRE2_SPTR)args.str_ptr, args.str_len, pos, 0, match_data, NULL);
1700 if (rc < 0) break;
1701
1702 PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(match_data);
1703 PCRE2_SIZE match_start = ovector[0];
1704 PCRE2_SIZE match_end = ovector[1];
1705
1706 if (args.global) {
1707 ant_value_t match_str = js_mkstr(js, args.str_ptr + match_start, match_end - match_start);
1708 if (is_err(match_str)) {
1709 pcre2_match_data_free(match_data);
1710 pcre2_code_free(re);
1711 return match_str;
1712 }
1713 js_arr_push(js, result_arr, match_str);
1714 } else {
1715 for (uint32_t i = 0; i <= capture_count; i++) {
1716 PCRE2_SIZE start = ovector[2*i];
1717 PCRE2_SIZE end = ovector[2*i+1];
1718 if (start == PCRE2_UNSET) {
1719 js_arr_push(js, result_arr, js_mkundef());
1720 } else {
1721 ant_value_t match_str = js_mkstr(js, args.str_ptr + start, end - start);
1722 if (is_err(match_str)) {
1723 pcre2_match_data_free(match_data);
1724 pcre2_code_free(re);
1725 return match_str;
1726 }
1727 js_arr_push(js, result_arr, match_str);
1728 }
1729 }
1730 js_setprop(js, result_arr, js_mkstr(js, "index", 5), tov((double)match_start));
1731 }
1732 match_count++;
1733
1734 if (!args.global) break;
1735 if (match_start == match_end) {
1736 pos = match_end + 1;
1737 } else { pos = match_end; }
1738 }
1739
1740 pcre2_match_data_free(match_data);
1741 pcre2_code_free(re);
1742
1743 if (match_count == 0) return js_mknull();
1744 return result_arr;
1745}
1746
// Append `n` bytes from `data` to the growable byte buffer described by
// (*buf, *len, *cap).
//
// When the bytes do not fit, the buffer is grown through ant_realloc to
// (needed + 1) * 2 bytes. No NUL terminator is written; callers track the
// length through *len.
//
// Returns true on success (including the n == 0 no-op). Returns false when the
// allocation fails or when the requested size cannot be represented in a
// size_t; in both cases *buf, *len and *cap are left untouched, so the caller
// still owns the original buffer.
static bool str_buf_append(char **buf, size_t *len, size_t *cap, const char *data, size_t n) {
  if (n == 0) return true;

  const size_t max_size = (size_t)-1;

  // Guard the size arithmetic: a wrapped `*len + n` would skip the growth
  // below and let memcpy run far past the end of the buffer.
  if (n > max_size - *len) return false;
  size_t needed = *len + n;

  if (needed >= *cap) {
    // (needed + 1) * 2 must not wrap either.
    if (needed >= max_size / 2) return false;

    size_t new_cap = (needed + 1) * 2;
    char *grown = (char *)ant_realloc(*buf, new_cap);
    if (!grown) return false;

    *buf = grown;
    *cap = new_cap;
  }

  memcpy(*buf + *len, data, n);
  *len = needed;
  return true;
}
1760
1761static inline ant_value_t emit_str_replacement(
1762 ant_t *js, ant_value_t replacement, bool is_func,
1763 const char *repl_ptr, ant_offset_t repl_len,
1764 const char *str_ptr, ant_value_t str,
1765 ant_offset_t pos, ant_offset_t match_len,
1766 char **buf, size_t *buf_len, size_t *buf_cap
1767) {
1768 if (is_func) {
1769 ant_value_t cb_args[3] = { js_mkstr(js, str_ptr + pos, match_len), tov((double)pos), str };
1770 ant_value_t r = sv_vm_call(js->vm, js, replacement, js_mkundef(), cb_args, 3, NULL, false);
1771
1772 if (vtype(r) == T_ERR) return r;
1773 ant_value_t r_str = js_tostring_val(js, r);
1774
1775 if (is_err(r_str)) return r_str;
1776 ant_offset_t rlen, roff = vstr(js, r_str, &rlen);
1777
1778 if (!str_buf_append(buf, buf_len, buf_cap, (const char *)(uintptr_t)roff, rlen)) return js_mkerr(js, "oom");
1779 } else if (!str_buf_append(buf, buf_len, buf_cap, repl_ptr, repl_len)) return js_mkerr(js, "oom");
1780
1781 return js_mkundef();
1782}
1783
1784static ant_value_t string_replace_impl(ant_t *js, ant_value_t *args, int nargs, bool replace_all) {
1785 ant_value_t this_unwrapped = unwrap_primitive(js, js->this_val);
1786 ant_value_t str = js_tostring_val(js, this_unwrapped);
1787
1788 if (is_err(str)) return str;
1789 if (nargs < 1) return str;
1790
1791 if (is_object_type(args[0])) {
1792 if (replace_all) {
1793 ant_value_t global_val = js_getprop_fallback(js, args[0], "global");
1794 if (!js_truthy(js, global_val)) return js_mkerr_typed(js, JS_ERR_TYPE, "String.prototype.replaceAll called with a non-global RegExp");
1795 }
1796
1797 bool called = false;
1798 ant_value_t replacement_arg = nargs > 1 ? args[1] : js_mkundef();
1799 ant_value_t call_args[2] = { str, replacement_arg };
1800
1801 ant_value_t result = maybe_call_symbol_method(js, args[0], get_replace_sym(), args[0], call_args, 2, &called);
1802 if (is_err(result)) return result;
1803 if (called) return result;
1804 }
1805
1806 if (nargs < 2) return str;
1807 ant_value_t search = args[0];
1808 ant_value_t replacement = args[1];
1809 if (vtype(search) != T_STR) return str;
1810
1811 ant_offset_t str_len, str_off = vstr(js, str, &str_len);
1812 const char *str_ptr = (char *)(uintptr_t)(str_off);
1813 ant_offset_t search_len, search_off = vstr(js, search, &search_len);
1814 const char *search_ptr = (char *)(uintptr_t)(search_off);
1815
1816 bool is_func = (vtype(replacement) == T_FUNC);
1817 ant_offset_t repl_len = 0;
1818 const char *repl_ptr = NULL;
1819
1820 if (!is_func) {
1821 if (vtype(replacement) != T_STR) return str;
1822 ant_offset_t repl_off = vstr(js, replacement, &repl_len);
1823 repl_ptr = (char *)(uintptr_t)(repl_off);
1824 }
1825
1826 if (!replace_all) {
1827 if (search_len > str_len) return str;
1828 ant_offset_t match_pos = 0;
1829 bool found = false;
1830
1831 for (ant_offset_t i = 0; i <= str_len - search_len; i++)
1832 if (memcmp(str_ptr + i, search_ptr, search_len) == 0) {
1833 match_pos = i; found = true; break;
1834 }
1835
1836 if (!found) return str;
1837
1838 size_t cap = str_len + repl_len + 256, len = 0;
1839 char *buf = (char *)ant_calloc(cap);
1840 if (!buf) return js_mkerr(js, "oom");
1841
1842 if (!str_buf_append(&buf, &len, &cap, str_ptr, match_pos)) {
1843 free(buf);
1844 return js_mkerr(js, "oom");
1845 }
1846
1847 ant_value_t err = emit_str_replacement(
1848 js, replacement, is_func, repl_ptr,
1849 repl_len, str_ptr, str, match_pos,
1850 search_len, &buf, &len, &cap
1851 );
1852
1853 if (vtype(err) == T_ERR) {
1854 free(buf);
1855 return err;
1856 }
1857
1858 if (!str_buf_append(
1859 &buf, &len, &cap, str_ptr + match_pos + search_len,
1860 str_len - match_pos - search_len)
1861 ) {
1862 free(buf);
1863 return js_mkerr(js, "oom");
1864 }
1865
1866 ant_value_t ret = js_mkstr(js, buf, len);
1867 free(buf);
1868
1869 return ret;
1870 } else {
1871 size_t cap = str_len + repl_len + 256, len = 0;
1872 char *buf = (char *)ant_calloc(cap);
1873 if (!buf) return js_mkerr(js, "oom");
1874
1875 ant_offset_t pos = 0;
1876 bool replaced = false;
1877
1878 while (pos + (ant_offset_t)search_len <= str_len) {
1879 if (search_len == 0 || memcmp(str_ptr + pos, search_ptr, search_len) == 0) {
1880 replaced = true;
1881 ant_value_t err = emit_str_replacement(js, replacement, is_func, repl_ptr, repl_len, str_ptr, str, pos, search_len, &buf, &len, &cap);
1882 if (vtype(err) == T_ERR) { free(buf); return err; }
1883 if (search_len == 0) {
1884 if (pos < str_len && !str_buf_append(&buf, &len, &cap, str_ptr + pos, 1)) { free(buf); return js_mkerr(js, "oom"); }
1885 pos++;
1886 } else pos += search_len;
1887 } else {
1888 if (!str_buf_append(&buf, &len, &cap, str_ptr + pos, 1)) { free(buf); return js_mkerr(js, "oom"); }
1889 pos++;
1890 }
1891 }
1892
1893 if (!str_buf_append(
1894 &buf, &len, &cap, str_ptr + pos,
1895 str_len - pos)
1896 ) {
1897 free(buf);
1898 return js_mkerr(js, "oom");
1899 }
1900
1901 if (!replaced) {
1902 free(buf);
1903 return str;
1904 }
1905
1906 ant_value_t ret = js_mkstr(js, buf, len);
1907 free(buf);
1908
1909 return ret;
1910 }
1911}
1912
// String.prototype.replace entry point: forwards to string_replace_impl with
// replace_all = false and returns its result unchanged.
static ant_value_t builtin_string_replace(ant_t *js, ant_value_t *args, int nargs) {
  return string_replace_impl(js, args, nargs, false);
}
1916
// String.prototype.replaceAll entry point: forwards to string_replace_impl
// with replace_all = true and returns its result unchanged.
static ant_value_t builtin_string_replaceAll(ant_t *js, ant_value_t *args, int nargs) {
  return string_replace_impl(js, args, nargs, true);
}
1920
1921static ant_value_t builtin_string_search(ant_t *js, ant_value_t *args, int nargs) {
1922 ant_value_t this_unwrapped = unwrap_primitive(js, js->this_val);
1923 ant_value_t str = js_tostring_val(js, this_unwrapped);
1924 if (is_err(str)) return str;
1925 if (nargs < 1) return tov(-1);
1926
1927 if (is_object_type(args[0])) {
1928 bool called = false;
1929 ant_value_t call_args[1] = { str };
1930 ant_value_t dispatched = maybe_call_symbol_method(
1931 js, args[0], get_search_sym(), args[0], call_args, 1, &called
1932 );
1933 if (is_err(dispatched)) return dispatched;
1934 if (called) return dispatched;
1935 }
1936
1937 ant_value_t pattern = args[0];
1938 const char *pattern_ptr = NULL;
1939 ant_offset_t pattern_len = 0;
1940 bool ignore_case = false, multiline = false;
1941
1942 if (vtype(pattern) == T_OBJ) {
1943 ant_offset_t source_off = lkp(js, pattern, "source", 6);
1944 if (source_off == 0) return tov(-1);
1945 ant_value_t source_val = js_propref_load(js, source_off);
1946 if (vtype(source_val) != T_STR) return tov(-1);
1947
1948 ant_offset_t poff;
1949 poff = vstr(js, source_val, &pattern_len);
1950 pattern_ptr = (char *)(uintptr_t)(poff);
1951
1952 ant_offset_t flags_off = lkp(js, pattern, "flags", 5);
1953 if (flags_off != 0) {
1954 ant_value_t flags_val = js_propref_load(js, flags_off);
1955 if (vtype(flags_val) == T_STR) {
1956 ant_offset_t flen, foff = vstr(js, flags_val, &flen);
1957 const char *flags_str = (char *)(uintptr_t)(foff);
1958 for (ant_offset_t i = 0; i < flen; i++) {
1959 if (flags_str[i] == 'i') ignore_case = true;
1960 if (flags_str[i] == 'm') multiline = true;
1961 }
1962 }
1963 }
1964 } else if (vtype(pattern) == T_STR) {
1965 ant_offset_t poff;
1966 poff = vstr(js, pattern, &pattern_len);
1967 pattern_ptr = (char *)(uintptr_t)(poff);
1968 } else {
1969 return tov(-1);
1970 }
1971
1972 ant_offset_t str_len, str_off = vstr(js, str, &str_len);
1973 const char *str_ptr = (char *)(uintptr_t)(str_off);
1974
1975 char pcre2_pattern[4096];
1976 size_t pcre2_len = js_to_pcre2_pattern(pattern_ptr, pattern_len, pcre2_pattern, sizeof(pcre2_pattern), false);
1977
1978 uint32_t options = PCRE2_UTF | PCRE2_UCP | PCRE2_MATCH_UNSET_BACKREF | PCRE2_DUPNAMES;
1979 if (ignore_case) options |= PCRE2_CASELESS;
1980 if (multiline) options |= PCRE2_MULTILINE;
1981
1982 int errcode;
1983 PCRE2_SIZE erroffset;
1984 pcre2_code *re = pcre2_compile((PCRE2_SPTR)pcre2_pattern, pcre2_len, options, &errcode, &erroffset, NULL);
1985 if (re == NULL) return tov(-1);
1986
1987 pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(re, NULL);
1988 int rc = pcre2_match(re, (PCRE2_SPTR)str_ptr, str_len, 0, 0, match_data, NULL);
1989
1990 if (rc < 0) {
1991 pcre2_match_data_free(match_data);
1992 pcre2_code_free(re);
1993 return tov(-1);
1994 }
1995
1996 PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(match_data);
1997 double result = (double)ovector[0];
1998
1999 pcre2_match_data_free(match_data);
2000 pcre2_code_free(re);
2001
2002 return tov(result);
2003}
2004
2005static ant_value_t builtin_string_match(ant_t *js, ant_value_t *args, int nargs) {
2006 ant_value_t this_unwrapped = unwrap_primitive(js, js->this_val);
2007 ant_value_t str = js_tostring_val(js, this_unwrapped);
2008 if (is_err(str)) return str;
2009 if (nargs < 1) return js_mknull();
2010
2011 if (is_object_type(args[0])) {
2012 bool called = false;
2013 ant_value_t call_args[1] = { str };
2014 ant_value_t dispatched = maybe_call_symbol_method(
2015 js, args[0], get_match_sym(), args[0], call_args, 1, &called
2016 );
2017 if (is_err(dispatched)) return dispatched;
2018 if (called) return dispatched;
2019 }
2020
2021 ant_value_t pattern = args[0];
2022 const char *pattern_ptr = NULL;
2023 ant_offset_t pattern_len = 0;
2024 bool global_flag = false;
2025 bool ignore_case = false;
2026 bool multiline = false;
2027
2028 if (vtype(pattern) == T_OBJ) {
2029 ant_offset_t source_off = lkp(js, pattern, "source", 6);
2030 if (source_off == 0) return js_mknull();
2031
2032 ant_value_t source_val = js_propref_load(js, source_off);
2033 if (vtype(source_val) != T_STR) return js_mknull();
2034
2035 ant_offset_t poff;
2036 poff = vstr(js, source_val, &pattern_len);
2037 pattern_ptr = (char *)(uintptr_t)(poff);
2038
2039 ant_offset_t flags_off = lkp(js, pattern, "flags", 5);
2040 if (flags_off != 0) {
2041 ant_value_t flags_val = js_propref_load(js, flags_off);
2042 if (vtype(flags_val) == T_STR) {
2043 ant_offset_t flen, foff = vstr(js, flags_val, &flen);
2044 const char *flags_str = (char *)(uintptr_t)(foff);
2045 for (ant_offset_t i = 0; i < flen; i++) {
2046 if (flags_str[i] == 'g') global_flag = true;
2047 if (flags_str[i] == 'i') ignore_case = true;
2048 if (flags_str[i] == 'm') multiline = true;
2049 }}
2050 }
2051 } else if (vtype(pattern) == T_STR) {
2052 ant_offset_t poff;
2053 poff = vstr(js, pattern, &pattern_len);
2054 pattern_ptr = (char *)(uintptr_t)(poff);
2055 } else return js_mknull();
2056
2057 ant_offset_t str_len, str_off = vstr(js, str, &str_len);
2058 const char *str_ptr = (char *)(uintptr_t)(str_off);
2059
2060 ant_value_t result = do_regex_match_pcre2(js, (regex_match_args_t){
2061 .pattern_ptr = pattern_ptr, .pattern_len = pattern_len,
2062 .str_ptr = str_ptr, .str_len = str_len,
2063 .global = global_flag, .ignore_case = ignore_case, .multiline = multiline,
2064 });
2065
2066 if (!global_flag && vtype(result) == T_ARR) {
2067 js_setprop(js, result, js_mkstr(js, "input", 5), str);
2068 }
2069
2070 return result;
2071}
2072
// Build the RegExp prototype and constructor, publish the constructor on the
// global object as `RegExp`, and install the regex-aware String.prototype
// methods (search, match, matchAll, replace, replaceAll).
//
// Operates on the runtime-wide interpreter `rt->js` and also initialises the
// file-level `regexp_matchall_iter_proto_val`. The statements run in a fixed
// order; keep that order when editing, since each one installs a property.
void init_regex_module(void) {
  ant_t *js = rt->js;
  ant_value_t glob = js->global;
  ant_value_t object_proto = js->sym.object_proto;

  // RegExp.prototype, inheriting from the object prototype.
  ant_value_t regexp_proto = js_mkobj(js);
  js_set_proto_init(regexp_proto, object_proto);

  defmethod(js, regexp_proto, "test", 4, js_mkfun(builtin_regexp_test));
  defmethod(js, regexp_proto, "exec", 4, js_mkfun(builtin_regexp_exec));
  defmethod(js, regexp_proto, "toString", 8, js_mkfun(builtin_regexp_toString));

  // Flag properties on the prototype, all initialised to false.
  js_mkprop_fast(js, regexp_proto, "global", 6, js_false);
  js_mkprop_fast(js, regexp_proto, "ignoreCase", 10, js_false);
  js_mkprop_fast(js, regexp_proto, "multiline", 9, js_false);
  js_mkprop_fast(js, regexp_proto, "dotAll", 6, js_false);
  js_mkprop_fast(js, regexp_proto, "unicode", 7, js_false);
  js_mkprop_fast(js, regexp_proto, "sticky", 6, js_false);
  js_mkprop_fast(js, regexp_proto, "hasIndices", 10, js_false);
  js_mkprop_fast(js, regexp_proto, "unicodeSets", 11, js_false);

  // Symbol-keyed protocol methods used by the String.prototype builtins.
  js_set_sym(js, regexp_proto, get_split_sym(), js_mkfun(builtin_regexp_symbol_split));
  js_set_sym(js, regexp_proto, get_match_sym(), js_mkfun(builtin_regexp_symbol_match));
  js_set_sym(js, regexp_proto, get_matchAll_sym(), js_mkfun(builtin_regexp_symbol_matchAll));

  // Prototype for the matchAll iterator: inherits from the iterator prototype,
  // exposes next(), and maps the iterator symbol to sym_this_cb
  // (presumably returns `this` - confirm in modules/symbol).
  regexp_matchall_iter_proto_val = js_mkobj(js);
  js_set_proto_init(regexp_matchall_iter_proto_val, js->sym.iterator_proto);
  defmethod(js, regexp_matchall_iter_proto_val, "next", 4, js_mkfun(regexp_matchall_next));
  js_set_sym(js, regexp_matchall_iter_proto_val, get_iterator_sym(), js_mkfun(sym_this_cb));
  js_set_sym(js, regexp_proto, get_replace_sym(), js_mkfun(builtin_regexp_symbol_replace));
  js_set_sym(js, regexp_proto, get_search_sym(), js_mkfun(builtin_regexp_symbol_search));
  js_set_sym(js, regexp_proto, get_toStringTag_sym(), js_mkstr(js, "RegExp", 6));
  // `flags` is an accessor backed by builtin_regexp_flags_getter.
  js_set_getter_desc(js, regexp_proto, "flags", 5, js_mkfun(builtin_regexp_flags_getter), JS_DESC_C);
  defmethod(js, regexp_proto, "compile", 7, js_mkfun(builtin_regexp_compile));

  // Constructor object: native entry point in SLOT_CFUNC plus prototype/name.
  ant_value_t regexp_ctor = js_mkobj(js);
  js_set_slot(regexp_ctor, SLOT_CFUNC, js_mkfun(builtin_RegExp));
  js_mkprop_fast(js, regexp_ctor, "prototype", 9, regexp_proto);
  js_mkprop_fast(js, regexp_ctor, "name", 4, js_mkstr(js, "RegExp", 6));
  // Descriptor flags 0 for `name` (presumably clears all attribute bits - confirm in descriptors.h).
  js_set_descriptor(js, regexp_ctor, "name", 4, 0);
  js_define_species_getter(js, regexp_ctor);

  // Link prototype.constructor back to the function form of the constructor.
  ant_value_t regexp_func = js_obj_to_func(regexp_ctor);
  js_setprop(js, regexp_proto, js_mkstr(js, "constructor", 11), regexp_func);
  js_set_descriptor(js, regexp_proto, "constructor", 11, JS_DESC_W | JS_DESC_C);

  js_set(js, regexp_ctor, "escape", js_mkfun(builtin_regexp_escape));

  // Static properties $1..$9, lastMatch and $& start out as the empty string.
  ant_value_t empty = js_mkstr(js, "", 0);
  for (int i = 1; i <= 9; i++) {
    char key[3] = {'$', (char)('0' + i), '\0'};
    js_set(js, regexp_ctor, key, empty);
  }

  js_set(js, regexp_ctor, "lastMatch", empty);
  js_set(js, regexp_ctor, "$&", empty);
  js_set(js, glob, "RegExp", regexp_func);

  // Regex-aware methods are attached to the existing String.prototype.
  ant_value_t string_ctor = js_get(js, glob, "String");
  ant_value_t string_proto = js_get(js, string_ctor, "prototype");

  defmethod(js, string_proto, "search", 6, js_mkfun(builtin_string_search));
  defmethod(js, string_proto, "match", 5, js_mkfun(builtin_string_match));
  defmethod(js, string_proto, "matchAll", 8, js_mkfun(builtin_string_matchAll));
  defmethod(js, string_proto, "replace", 7, js_mkfun(builtin_string_replace));
  defmethod(js, string_proto, "replaceAll", 10, js_mkfun(builtin_string_replaceAll));
}
2140
2141void gc_sweep_regex_cache(void) {
2142 size_t write = 0;
2143 for (size_t i = 0; i < regex_cache_count; i++) {
2144 if (!gc_obj_is_marked(regex_cache[i].obj)) {
2145 pcre2_match_data_free(regex_cache[i].match_data);
2146 pcre2_code_free(regex_cache[i].code);
2147 } else {
2148 if (write != i) regex_cache[write] = regex_cache[i];
2149 write++;
2150 }
2151 }
2152 regex_cache_count = write;
2153}
2154
2155void cleanup_regex_module(void) {
2156 for (size_t i = 0; i < regex_cache_count; i++) {
2157 pcre2_match_data_free(regex_cache[i].match_data);
2158 pcre2_code_free(regex_cache[i].code);
2159 }
2160 free(regex_cache);
2161 regex_cache = NULL;
2162 regex_cache_count = 0;
2163 regex_cache_cap = 0;
2164}