we (web engine): Experimental web browser project to understand the limits of Claude
2
fork

Configure Feed

Select the types of activity you want to include in your feed.

Implement JS built-in RegExp engine

Add a from-scratch regex engine and wire it into the JS runtime:

- regex.rs: Pattern parser (character classes, quantifiers, anchors,
groups, alternation, backreferences, lookahead, escape sequences)
and backtracking matcher with proper continuation-based sequence
matching for greedy/lazy quantifiers.

- RegExp constructor and prototype methods (test, exec, toString)
with support for g/i/m/s/u/y flags, lastIndex tracking, and
named capture groups.

- Compiler emits RegExp(pattern, flags) calls for regex literals.

- String.prototype gains match, matchAll, and search methods; replace and
replaceAll now accept RegExp arguments with $-substitution in the
replacement string ($1, $&, etc.); split now accepts a RegExp separator
and includes captured groups in the resulting array.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

+2443 -4
+723 -1
crates/js/src/builtins.rs
··· 240 240 // Create and register Date constructor. 241 241 init_date_builtins(vm); 242 242 243 + // Create and register RegExp constructor. 244 + init_regexp_builtins(vm); 245 + 243 246 // Create and register JSON object (static methods only). 244 247 init_json_object(vm); 245 248 ··· 1498 1501 ("split", string_proto_split), 1499 1502 ("replace", string_proto_replace), 1500 1503 ("replaceAll", string_proto_replace_all), 1504 + ("match", string_proto_match), 1505 + ("matchAll", string_proto_match_all), 1506 + ("search", string_proto_search), 1501 1507 ("toLowerCase", string_proto_to_lower_case), 1502 1508 ("toUpperCase", string_proto_to_upper_case), 1503 1509 ("at", string_proto_at), ··· 1864 1870 if args.is_empty() || matches!(args.first(), Some(Value::Undefined)) { 1865 1871 return Ok(make_value_array(ctx.gc, &[Value::String(s)])); 1866 1872 } 1867 - let sep = args[0].to_js_string(ctx.gc); 1868 1873 let limit = args 1869 1874 .get(1) 1870 1875 .map(|v| v.to_number() as usize) 1871 1876 .unwrap_or(usize::MAX); 1877 + 1878 + // Check if separator is a RegExp. 
1879 + if let Some(arg0) = args.first() { 1880 + if is_regexp(ctx.gc, arg0) { 1881 + return string_split_regexp(ctx.gc, &s, arg0, limit); 1882 + } 1883 + } 1884 + 1885 + let sep = args[0].to_js_string(ctx.gc); 1872 1886 if sep.is_empty() { 1873 1887 let items: Vec<Value> = str_chars(&s) 1874 1888 .into_iter() ··· 1893 1907 Ok(make_value_array(ctx.gc, &items)) 1894 1908 } 1895 1909 1910 + fn string_split_regexp( 1911 + gc: &mut Gc<HeapObject>, 1912 + s: &str, 1913 + regexp: &Value, 1914 + limit: usize, 1915 + ) -> Result<Value, RuntimeError> { 1916 + use crate::regex::{exec, CompiledRegex}; 1917 + 1918 + let pattern = regexp_get_pattern(gc, regexp).unwrap_or_default(); 1919 + let flags_str = regexp_get_flags(gc, regexp).unwrap_or_default(); 1920 + let compiled = CompiledRegex::new(&pattern, &flags_str).map_err(RuntimeError::syntax_error)?; 1921 + let chars: Vec<char> = s.chars().collect(); 1922 + 1923 + let mut items = Vec::new(); 1924 + let mut last_end = 0usize; 1925 + 1926 + loop { 1927 + if items.len() >= limit { 1928 + break; 1929 + } 1930 + match exec(&compiled, s, last_end) { 1931 + Some(m) => { 1932 + // Avoid infinite loop on zero-length matches. 1933 + if m.start == m.end && m.start == last_end { 1934 + if last_end >= chars.len() { 1935 + break; 1936 + } 1937 + items.push(Value::String(chars[last_end].to_string())); 1938 + last_end += 1; 1939 + continue; 1940 + } 1941 + let piece: String = chars[last_end..m.start].iter().collect(); 1942 + items.push(Value::String(piece)); 1943 + // Add capturing groups. 
1944 + for i in 1..m.captures.len() { 1945 + if items.len() >= limit { 1946 + break; 1947 + } 1948 + match m.captures[i] { 1949 + Some((cs, ce)) => { 1950 + let cap: String = chars[cs..ce].iter().collect(); 1951 + items.push(Value::String(cap)); 1952 + } 1953 + None => items.push(Value::Undefined), 1954 + } 1955 + } 1956 + last_end = m.end; 1957 + } 1958 + None => break, 1959 + } 1960 + } 1961 + if items.len() < limit { 1962 + let rest: String = chars[last_end..].iter().collect(); 1963 + items.push(Value::String(rest)); 1964 + } 1965 + Ok(make_value_array(gc, &items)) 1966 + } 1967 + 1896 1968 fn string_proto_replace(args: &[Value], ctx: &mut NativeContext) -> Result<Value, RuntimeError> { 1897 1969 let s = this_string(ctx); 1970 + 1971 + // Check if search argument is a RegExp. 1972 + if let Some(arg0) = args.first() { 1973 + if is_regexp(ctx.gc, arg0) { 1974 + let replacement = args 1975 + .get(1) 1976 + .map(|v| v.to_js_string(ctx.gc)) 1977 + .unwrap_or_default(); 1978 + return string_replace_regexp(ctx.gc, &s, arg0, &replacement); 1979 + } 1980 + } 1981 + 1898 1982 let search = args 1899 1983 .first() 1900 1984 .map(|v| v.to_js_string(ctx.gc)) ··· 1915 1999 } 1916 2000 } 1917 2001 2002 + fn string_replace_regexp( 2003 + gc: &mut Gc<HeapObject>, 2004 + s: &str, 2005 + regexp: &Value, 2006 + replacement: &str, 2007 + ) -> Result<Value, RuntimeError> { 2008 + use crate::regex::{exec, CompiledRegex}; 2009 + 2010 + let pattern = regexp_get_pattern(gc, regexp).unwrap_or_default(); 2011 + let flags_str = regexp_get_flags(gc, regexp).unwrap_or_default(); 2012 + let compiled = CompiledRegex::new(&pattern, &flags_str).map_err(RuntimeError::syntax_error)?; 2013 + let is_global = compiled.flags.global; 2014 + let chars: Vec<char> = s.chars().collect(); 2015 + let mut result = String::new(); 2016 + let mut last_end = 0usize; 2017 + 2018 + while let Some(m) = exec(&compiled, s, last_end) { 2019 + // Append text before match. 
2020 + let before: String = chars[last_end..m.start].iter().collect(); 2021 + result.push_str(&before); 2022 + // Process replacement with $-substitutions. 2023 + let matched: String = chars[m.start..m.end].iter().collect(); 2024 + result.push_str(&apply_replacement( 2025 + replacement, 2026 + &matched, 2027 + &m.captures, 2028 + &chars, 2029 + )); 2030 + last_end = m.end; 2031 + if !is_global { 2032 + break; 2033 + } 2034 + // Avoid infinite loop on zero-length match. 2035 + if m.start == m.end { 2036 + if last_end < chars.len() { 2037 + result.push(chars[last_end]); 2038 + last_end += 1; 2039 + } else { 2040 + break; 2041 + } 2042 + } 2043 + } 2044 + let rest: String = chars[last_end..].iter().collect(); 2045 + result.push_str(&rest); 2046 + Ok(Value::String(result)) 2047 + } 2048 + 2049 + /// Apply replacement string with $-substitutions ($&, $1, etc.). 2050 + fn apply_replacement( 2051 + replacement: &str, 2052 + matched: &str, 2053 + captures: &[Option<(usize, usize)>], 2054 + chars: &[char], 2055 + ) -> String { 2056 + let rep_chars: Vec<char> = replacement.chars().collect(); 2057 + let mut result = String::new(); 2058 + let mut i = 0; 2059 + while i < rep_chars.len() { 2060 + if rep_chars[i] == '$' && i + 1 < rep_chars.len() { 2061 + match rep_chars[i + 1] { 2062 + '$' => { 2063 + result.push('$'); 2064 + i += 2; 2065 + } 2066 + '&' => { 2067 + result.push_str(matched); 2068 + i += 2; 2069 + } 2070 + '`' => { 2071 + // $` — text before match. 2072 + if let Some(Some((start, _))) = captures.first() { 2073 + let before: String = chars[..*start].iter().collect(); 2074 + result.push_str(&before); 2075 + } 2076 + i += 2; 2077 + } 2078 + '\'' => { 2079 + // $' — text after match. 2080 + if let Some(Some((_, end))) = captures.first() { 2081 + let after: String = chars[*end..].iter().collect(); 2082 + result.push_str(&after); 2083 + } 2084 + i += 2; 2085 + } 2086 + d if d.is_ascii_digit() => { 2087 + // $1, $12 etc. 
2088 + let mut num_str = String::new(); 2089 + let mut j = i + 1; 2090 + while j < rep_chars.len() && rep_chars[j].is_ascii_digit() { 2091 + num_str.push(rep_chars[j]); 2092 + j += 1; 2093 + } 2094 + if let Ok(idx) = num_str.parse::<usize>() { 2095 + if idx > 0 && idx < captures.len() { 2096 + if let Some((s, e)) = captures[idx] { 2097 + let cap: String = chars[s..e].iter().collect(); 2098 + result.push_str(&cap); 2099 + } 2100 + } 2101 + } 2102 + i = j; 2103 + } 2104 + _ => { 2105 + result.push('$'); 2106 + i += 1; 2107 + } 2108 + } 2109 + } else { 2110 + result.push(rep_chars[i]); 2111 + i += 1; 2112 + } 2113 + } 2114 + result 2115 + } 2116 + 1918 2117 fn string_proto_replace_all( 1919 2118 args: &[Value], 1920 2119 ctx: &mut NativeContext, 1921 2120 ) -> Result<Value, RuntimeError> { 1922 2121 let s = this_string(ctx); 2122 + 2123 + // If search is a RegExp, it must have the global flag. 2124 + if let Some(arg0) = args.first() { 2125 + if is_regexp(ctx.gc, arg0) { 2126 + let flags = regexp_get_flags(ctx.gc, arg0).unwrap_or_default(); 2127 + if !flags.contains('g') { 2128 + return Err(RuntimeError::type_error( 2129 + "String.prototype.replaceAll called with a non-global RegExp argument", 2130 + )); 2131 + } 2132 + let replacement = args 2133 + .get(1) 2134 + .map(|v| v.to_js_string(ctx.gc)) 2135 + .unwrap_or_default(); 2136 + return string_replace_regexp(ctx.gc, &s, arg0, &replacement); 2137 + } 2138 + } 2139 + 1923 2140 let search = args 1924 2141 .first() 1925 2142 .map(|v| v.to_js_string(ctx.gc)) ··· 1929 2146 .map(|v| v.to_js_string(ctx.gc)) 1930 2147 .unwrap_or_default(); 1931 2148 Ok(Value::String(s.replace(&search, &replacement))) 2149 + } 2150 + 2151 + fn string_proto_match(args: &[Value], ctx: &mut NativeContext) -> Result<Value, RuntimeError> { 2152 + let s = this_string(ctx); 2153 + if args.is_empty() { 2154 + return Ok(Value::Null); 2155 + } 2156 + 2157 + let arg0 = &args[0]; 2158 + // If arg is not a RegExp, create one. 
2159 + let regexp_val = if is_regexp(ctx.gc, arg0) { 2160 + arg0.clone() 2161 + } else { 2162 + let pattern = arg0.to_js_string(ctx.gc); 2163 + let proto = REGEXP_PROTO.with(|cell| cell.get()); 2164 + make_regexp_obj(ctx.gc, &pattern, "", proto).map_err(RuntimeError::syntax_error)? 2165 + }; 2166 + 2167 + let is_global = regexp_get_flags(ctx.gc, &regexp_val) 2168 + .map(|f| f.contains('g')) 2169 + .unwrap_or(false); 2170 + 2171 + if !is_global { 2172 + // Non-global: return exec result. 2173 + return regexp_exec_internal(ctx.gc, &regexp_val, &s); 2174 + } 2175 + 2176 + // Global: collect all matches. 2177 + regexp_set_last_index(ctx.gc, &regexp_val, 0.0); 2178 + let mut matches = Vec::new(); 2179 + loop { 2180 + let result = regexp_exec_internal(ctx.gc, &regexp_val, &s)?; 2181 + if matches!(result, Value::Null) { 2182 + break; 2183 + } 2184 + // Get the matched string (index 0 of the result array). 2185 + if let Value::Object(r) = &result { 2186 + if let Some(HeapObject::Object(data)) = ctx.gc.get(*r) { 2187 + if let Some(prop) = data.properties.get("0") { 2188 + matches.push(prop.value.clone()); 2189 + // Advance past zero-length matches. 2190 + let match_str = prop.value.to_js_string(ctx.gc); 2191 + if match_str.is_empty() { 2192 + let li = regexp_get_last_index(ctx.gc, &regexp_val); 2193 + regexp_set_last_index(ctx.gc, &regexp_val, li + 1.0); 2194 + } 2195 + } 2196 + } 2197 + } 2198 + } 2199 + if matches.is_empty() { 2200 + Ok(Value::Null) 2201 + } else { 2202 + Ok(make_value_array(ctx.gc, &matches)) 2203 + } 2204 + } 2205 + 2206 + fn string_proto_match_all(args: &[Value], ctx: &mut NativeContext) -> Result<Value, RuntimeError> { 2207 + let s = this_string(ctx); 2208 + if args.is_empty() { 2209 + return Ok(make_value_array(ctx.gc, &[])); 2210 + } 2211 + 2212 + let arg0 = &args[0]; 2213 + // If arg is a RegExp, it must have global flag. 
2214 + let regexp_val = if is_regexp(ctx.gc, arg0) { 2215 + let flags = regexp_get_flags(ctx.gc, arg0).unwrap_or_default(); 2216 + if !flags.contains('g') { 2217 + return Err(RuntimeError::type_error( 2218 + "String.prototype.matchAll called with a non-global RegExp argument", 2219 + )); 2220 + } 2221 + arg0.clone() 2222 + } else { 2223 + let pattern = arg0.to_js_string(ctx.gc); 2224 + let proto = REGEXP_PROTO.with(|cell| cell.get()); 2225 + make_regexp_obj(ctx.gc, &pattern, "g", proto).map_err(RuntimeError::syntax_error)? 2226 + }; 2227 + 2228 + // Collect all match results. 2229 + regexp_set_last_index(ctx.gc, &regexp_val, 0.0); 2230 + let mut results = Vec::new(); 2231 + loop { 2232 + let result = regexp_exec_internal(ctx.gc, &regexp_val, &s)?; 2233 + if matches!(result, Value::Null) { 2234 + break; 2235 + } 2236 + // Advance past zero-length matches. 2237 + if let Value::Object(r) = &result { 2238 + if let Some(HeapObject::Object(data)) = ctx.gc.get(*r) { 2239 + if let Some(prop) = data.properties.get("0") { 2240 + let match_str = prop.value.to_js_string(ctx.gc); 2241 + if match_str.is_empty() { 2242 + let li = regexp_get_last_index(ctx.gc, &regexp_val); 2243 + regexp_set_last_index(ctx.gc, &regexp_val, li + 1.0); 2244 + } 2245 + } 2246 + } 2247 + } 2248 + results.push(result); 2249 + } 2250 + Ok(make_value_array(ctx.gc, &results)) 2251 + } 2252 + 2253 + fn string_proto_search(args: &[Value], ctx: &mut NativeContext) -> Result<Value, RuntimeError> { 2254 + let s = this_string(ctx); 2255 + if args.is_empty() { 2256 + return Ok(Value::Number(0.0)); // /(?:)/ matches at 0. 2257 + } 2258 + 2259 + let arg0 = &args[0]; 2260 + let regexp_val = if is_regexp(ctx.gc, arg0) { 2261 + arg0.clone() 2262 + } else { 2263 + let pattern = arg0.to_js_string(ctx.gc); 2264 + let proto = REGEXP_PROTO.with(|cell| cell.get()); 2265 + make_regexp_obj(ctx.gc, &pattern, "", proto).map_err(RuntimeError::syntax_error)? 
2266 + }; 2267 + 2268 + // search always starts from 0 and ignores global/lastIndex. 2269 + let result = regexp_exec_internal(ctx.gc, &regexp_val, &s)?; 2270 + match result { 2271 + Value::Null => Ok(Value::Number(-1.0)), 2272 + Value::Object(r) => { 2273 + let idx = match ctx.gc.get(r) { 2274 + Some(HeapObject::Object(data)) => data 2275 + .properties 2276 + .get("index") 2277 + .map(|p| p.value.to_number()) 2278 + .unwrap_or(-1.0), 2279 + _ => -1.0, 2280 + }; 2281 + Ok(Value::Number(idx)) 2282 + } 2283 + _ => Ok(Value::Number(-1.0)), 2284 + } 1932 2285 } 1933 2286 1934 2287 fn string_proto_to_lower_case( ··· 3086 3439 return Ok(Value::Null); 3087 3440 } 3088 3441 date_to_iso_string(_args, ctx) 3442 + } 3443 + 3444 + // ── RegExp built-in ────────────────────────────────────────── 3445 + 3446 + thread_local! { 3447 + static REGEXP_PROTO: std::cell::Cell<Option<GcRef>> = const { std::cell::Cell::new(None) }; 3448 + } 3449 + 3450 + fn init_regexp_builtins(vm: &mut Vm) { 3451 + // RegExp.prototype. 3452 + let mut regexp_proto_data = ObjectData::new(); 3453 + if let Some(proto) = vm.object_prototype { 3454 + regexp_proto_data.prototype = Some(proto); 3455 + } 3456 + let regexp_proto = vm.gc.alloc(HeapObject::Object(regexp_proto_data)); 3457 + init_regexp_prototype(&mut vm.gc, regexp_proto); 3458 + 3459 + vm.regexp_prototype = Some(regexp_proto); 3460 + REGEXP_PROTO.with(|cell| cell.set(Some(regexp_proto))); 3461 + 3462 + // RegExp constructor function. 
3463 + let ctor = vm.gc.alloc(HeapObject::Function(Box::new(FunctionData { 3464 + name: "RegExp".to_string(), 3465 + kind: FunctionKind::Native(NativeFunc { 3466 + callback: regexp_constructor, 3467 + }), 3468 + prototype_obj: Some(regexp_proto), 3469 + properties: HashMap::new(), 3470 + upvalues: Vec::new(), 3471 + }))); 3472 + 3473 + vm.set_global("RegExp", Value::Function(ctor)); 3474 + } 3475 + 3476 + fn init_regexp_prototype(gc: &mut Gc<HeapObject>, proto: GcRef) { 3477 + let methods: &[NativeMethod] = &[ 3478 + ("test", regexp_proto_test), 3479 + ("exec", regexp_proto_exec), 3480 + ("toString", regexp_proto_to_string), 3481 + ]; 3482 + for &(name, callback) in methods { 3483 + let f = make_native(gc, name, callback); 3484 + set_builtin_prop(gc, proto, name, Value::Function(f)); 3485 + } 3486 + } 3487 + 3488 + /// Create a RegExp object storing compiled regex state in hidden properties. 3489 + pub fn make_regexp_obj( 3490 + gc: &mut Gc<HeapObject>, 3491 + pattern: &str, 3492 + flags_str: &str, 3493 + proto: Option<GcRef>, 3494 + ) -> Result<Value, String> { 3495 + use crate::regex::CompiledRegex; 3496 + 3497 + let compiled = CompiledRegex::new(pattern, flags_str)?; 3498 + let flags = compiled.flags; 3499 + 3500 + let mut data = ObjectData::new(); 3501 + if let Some(p) = proto { 3502 + data.prototype = Some(p); 3503 + } 3504 + // Store pattern and flags as properties. 
3505 + data.properties.insert( 3506 + "source".to_string(), 3507 + Property::builtin(Value::String(pattern.to_string())), 3508 + ); 3509 + let flags_string = flags.as_flag_string(); 3510 + data.properties.insert( 3511 + "flags".to_string(), 3512 + Property::builtin(Value::String(flags_string)), 3513 + ); 3514 + data.properties.insert( 3515 + "global".to_string(), 3516 + Property::builtin(Value::Boolean(flags.global)), 3517 + ); 3518 + data.properties.insert( 3519 + "ignoreCase".to_string(), 3520 + Property::builtin(Value::Boolean(flags.ignore_case)), 3521 + ); 3522 + data.properties.insert( 3523 + "multiline".to_string(), 3524 + Property::builtin(Value::Boolean(flags.multiline)), 3525 + ); 3526 + data.properties.insert( 3527 + "dotAll".to_string(), 3528 + Property::builtin(Value::Boolean(flags.dot_all)), 3529 + ); 3530 + data.properties.insert( 3531 + "unicode".to_string(), 3532 + Property::builtin(Value::Boolean(flags.unicode)), 3533 + ); 3534 + data.properties.insert( 3535 + "sticky".to_string(), 3536 + Property::builtin(Value::Boolean(flags.sticky)), 3537 + ); 3538 + data.properties 3539 + .insert("lastIndex".to_string(), Property::data(Value::Number(0.0))); 3540 + // Hidden: serialized pattern for re-compilation. 3541 + data.properties.insert( 3542 + "__regexp_pattern__".to_string(), 3543 + Property::builtin(Value::String(pattern.to_string())), 3544 + ); 3545 + data.properties.insert( 3546 + "__regexp_flags__".to_string(), 3547 + Property::builtin(Value::String(flags.as_flag_string())), 3548 + ); 3549 + 3550 + Ok(Value::Object(gc.alloc(HeapObject::Object(data)))) 3551 + } 3552 + 3553 + /// Check if a Value is a RegExp object. 
3554 + pub fn is_regexp(gc: &Gc<HeapObject>, val: &Value) -> bool { 3555 + match val { 3556 + Value::Object(r) => match gc.get(*r) { 3557 + Some(HeapObject::Object(data)) => data.properties.contains_key("__regexp_pattern__"), 3558 + _ => false, 3559 + }, 3560 + _ => false, 3561 + } 3562 + } 3563 + 3564 + /// Extract the pattern from a RegExp object. 3565 + fn regexp_get_pattern(gc: &Gc<HeapObject>, val: &Value) -> Option<String> { 3566 + match val { 3567 + Value::Object(r) => match gc.get(*r) { 3568 + Some(HeapObject::Object(data)) => { 3569 + data.properties 3570 + .get("__regexp_pattern__") 3571 + .and_then(|p| match &p.value { 3572 + Value::String(s) => Some(s.clone()), 3573 + _ => None, 3574 + }) 3575 + } 3576 + _ => None, 3577 + }, 3578 + _ => None, 3579 + } 3580 + } 3581 + 3582 + /// Extract the flags string from a RegExp object. 3583 + fn regexp_get_flags(gc: &Gc<HeapObject>, val: &Value) -> Option<String> { 3584 + match val { 3585 + Value::Object(r) => match gc.get(*r) { 3586 + Some(HeapObject::Object(data)) => { 3587 + data.properties 3588 + .get("__regexp_flags__") 3589 + .and_then(|p| match &p.value { 3590 + Value::String(s) => Some(s.clone()), 3591 + _ => None, 3592 + }) 3593 + } 3594 + _ => None, 3595 + }, 3596 + _ => None, 3597 + } 3598 + } 3599 + 3600 + /// Get lastIndex from a RegExp object. 3601 + fn regexp_get_last_index(gc: &Gc<HeapObject>, val: &Value) -> f64 { 3602 + match val { 3603 + Value::Object(r) => match gc.get(*r) { 3604 + Some(HeapObject::Object(data)) => data 3605 + .properties 3606 + .get("lastIndex") 3607 + .map(|p| p.value.to_number()) 3608 + .unwrap_or(0.0), 3609 + _ => 0.0, 3610 + }, 3611 + _ => 0.0, 3612 + } 3613 + } 3614 + 3615 + /// Set lastIndex on a RegExp object. 
3616 + fn regexp_set_last_index(gc: &mut Gc<HeapObject>, val: &Value, idx: f64) { 3617 + if let Value::Object(r) = val { 3618 + if let Some(HeapObject::Object(data)) = gc.get_mut(*r) { 3619 + if let Some(prop) = data.properties.get_mut("lastIndex") { 3620 + prop.value = Value::Number(idx); 3621 + } 3622 + } 3623 + } 3624 + } 3625 + 3626 + /// Execute the regex on a string and return a match result array or null. 3627 + fn regexp_exec_internal( 3628 + gc: &mut Gc<HeapObject>, 3629 + this: &Value, 3630 + input: &str, 3631 + ) -> Result<Value, RuntimeError> { 3632 + use crate::regex::{exec, CompiledRegex}; 3633 + 3634 + let pattern = regexp_get_pattern(gc, this) 3635 + .ok_or_else(|| RuntimeError::type_error("not a RegExp".to_string()))?; 3636 + let flags_str = regexp_get_flags(gc, this).unwrap_or_default(); 3637 + let compiled = CompiledRegex::new(&pattern, &flags_str).map_err(RuntimeError::syntax_error)?; 3638 + let is_global = compiled.flags.global; 3639 + let is_sticky = compiled.flags.sticky; 3640 + 3641 + let start_index = if is_global || is_sticky { 3642 + let li = regexp_get_last_index(gc, this); 3643 + if li < 0.0 { 3644 + 0 3645 + } else { 3646 + li as usize 3647 + } 3648 + } else { 3649 + 0 3650 + }; 3651 + 3652 + let chars: Vec<char> = input.chars().collect(); 3653 + let result = exec(&compiled, input, start_index); 3654 + 3655 + match result { 3656 + Some(m) => { 3657 + if is_global || is_sticky { 3658 + regexp_set_last_index(gc, this, m.end as f64); 3659 + } 3660 + 3661 + // Build result array: [fullMatch, ...groups] 3662 + let mut items: Vec<Value> = Vec::new(); 3663 + 3664 + // Full match (index 0). 3665 + let full: String = chars[m.start..m.end].iter().collect(); 3666 + items.push(Value::String(full)); 3667 + 3668 + // Capture groups (index 1..n). 
3669 + for i in 1..m.captures.len() { 3670 + match m.captures[i] { 3671 + Some((s, e)) => { 3672 + let cap: String = chars[s..e].iter().collect(); 3673 + items.push(Value::String(cap)); 3674 + } 3675 + None => items.push(Value::Undefined), 3676 + } 3677 + } 3678 + 3679 + // Build named groups object (if any) before creating the array. 3680 + let groups_val = { 3681 + let (node, _) = 3682 + crate::regex::parse_pattern(&pattern).map_err(RuntimeError::syntax_error)?; 3683 + let named = collect_named_groups(&node); 3684 + if named.is_empty() { 3685 + Value::Undefined 3686 + } else { 3687 + let mut groups_data = ObjectData::new(); 3688 + for (name, idx) in &named { 3689 + let cap_idx = *idx as usize; 3690 + let val = if cap_idx < m.captures.len() { 3691 + match m.captures[cap_idx] { 3692 + Some((s, e)) => { 3693 + let cap: String = chars[s..e].iter().collect(); 3694 + Value::String(cap) 3695 + } 3696 + None => Value::Undefined, 3697 + } 3698 + } else { 3699 + Value::Undefined 3700 + }; 3701 + groups_data 3702 + .properties 3703 + .insert(name.clone(), Property::data(val)); 3704 + } 3705 + Value::Object(gc.alloc(HeapObject::Object(groups_data))) 3706 + } 3707 + }; 3708 + 3709 + let arr = make_value_array(gc, &items); 3710 + // Set index, input, and groups properties on the result array. 
3711 + if let Value::Object(r) = arr { 3712 + if let Some(HeapObject::Object(data)) = gc.get_mut(r) { 3713 + data.properties.insert( 3714 + "index".to_string(), 3715 + Property::data(Value::Number(m.start as f64)), 3716 + ); 3717 + data.properties.insert( 3718 + "input".to_string(), 3719 + Property::data(Value::String(input.to_string())), 3720 + ); 3721 + data.properties 3722 + .insert("groups".to_string(), Property::data(groups_val)); 3723 + } 3724 + Ok(Value::Object(r)) 3725 + } else { 3726 + Ok(arr) 3727 + } 3728 + } 3729 + None => { 3730 + if is_global || is_sticky { 3731 + regexp_set_last_index(gc, this, 0.0); 3732 + } 3733 + Ok(Value::Null) 3734 + } 3735 + } 3736 + } 3737 + 3738 + /// Collect named groups from a regex AST node. 3739 + fn collect_named_groups(node: &crate::regex::Node) -> Vec<(String, u32)> { 3740 + use crate::regex::Node; 3741 + let mut result = Vec::new(); 3742 + match node { 3743 + Node::Group { 3744 + index, 3745 + name: Some(name), 3746 + node: inner, 3747 + } => { 3748 + result.push((name.clone(), *index)); 3749 + result.extend(collect_named_groups(inner)); 3750 + } 3751 + Node::Group { node: inner, .. } 3752 + | Node::NonCapturingGroup(inner) 3753 + | Node::Lookahead(inner) 3754 + | Node::NegativeLookahead(inner) => { 3755 + result.extend(collect_named_groups(inner)); 3756 + } 3757 + Node::Quantifier { node: inner, .. 
} => { 3758 + result.extend(collect_named_groups(inner)); 3759 + } 3760 + Node::Sequence(nodes) | Node::Alternation(nodes) => { 3761 + for n in nodes { 3762 + result.extend(collect_named_groups(n)); 3763 + } 3764 + } 3765 + _ => {} 3766 + } 3767 + result 3768 + } 3769 + 3770 + fn regexp_constructor(args: &[Value], ctx: &mut NativeContext) -> Result<Value, RuntimeError> { 3771 + let proto = REGEXP_PROTO.with(|cell| cell.get()); 3772 + let pattern = args 3773 + .first() 3774 + .map(|v| v.to_js_string(ctx.gc)) 3775 + .unwrap_or_default(); 3776 + let flags = args 3777 + .get(1) 3778 + .map(|v| { 3779 + if matches!(v, Value::Undefined) { 3780 + String::new() 3781 + } else { 3782 + v.to_js_string(ctx.gc) 3783 + } 3784 + }) 3785 + .unwrap_or_default(); 3786 + 3787 + make_regexp_obj(ctx.gc, &pattern, &flags, proto).map_err(RuntimeError::syntax_error) 3788 + } 3789 + 3790 + fn regexp_proto_test(args: &[Value], ctx: &mut NativeContext) -> Result<Value, RuntimeError> { 3791 + let input = args 3792 + .first() 3793 + .map(|v| v.to_js_string(ctx.gc)) 3794 + .unwrap_or_default(); 3795 + let result = regexp_exec_internal(ctx.gc, &ctx.this, &input)?; 3796 + Ok(Value::Boolean(!matches!(result, Value::Null))) 3797 + } 3798 + 3799 + fn regexp_proto_exec(args: &[Value], ctx: &mut NativeContext) -> Result<Value, RuntimeError> { 3800 + let input = args 3801 + .first() 3802 + .map(|v| v.to_js_string(ctx.gc)) 3803 + .unwrap_or_default(); 3804 + regexp_exec_internal(ctx.gc, &ctx.this, &input) 3805 + } 3806 + 3807 + fn regexp_proto_to_string(_args: &[Value], ctx: &mut NativeContext) -> Result<Value, RuntimeError> { 3808 + let pattern = regexp_get_pattern(ctx.gc, &ctx.this).unwrap_or_default(); 3809 + let flags = regexp_get_flags(ctx.gc, &ctx.this).unwrap_or_default(); 3810 + Ok(Value::String(format!("/{}/{}", pattern, flags))) 3089 3811 } 3090 3812 3091 3813 // ── JSON object ──────────────────────────────────────────────
+20 -3
crates/js/src/compiler.rs
··· 2403 2403 compile_expr(fc, inner, dst)?; 2404 2404 } 2405 2405 2406 - ExprKind::RegExp { .. } => { 2407 - // RegExp literals are created at runtime by the VM. 2408 - fc.builder.emit_reg(Op::LoadUndefined, dst); 2406 + ExprKind::RegExp { pattern, flags } => { 2407 + // Compile as: RegExp(pattern, flags) — a call to the global constructor. 2408 + let func_reg = fc.alloc_reg(); 2409 + let name_idx = fc.builder.add_name("RegExp"); 2410 + fc.builder.emit_reg_u16(Op::LoadGlobal, func_reg, name_idx); 2411 + 2412 + let args_start = fc.next_reg; 2413 + let pat_reg = fc.alloc_reg(); 2414 + let pat_idx = fc.builder.add_constant(Constant::String(pattern.clone())); 2415 + fc.builder.emit_reg_u16(Op::LoadConst, pat_reg, pat_idx); 2416 + 2417 + let flags_reg = fc.alloc_reg(); 2418 + let flags_idx = fc.builder.add_constant(Constant::String(flags.clone())); 2419 + fc.builder.emit_reg_u16(Op::LoadConst, flags_reg, flags_idx); 2420 + 2421 + fc.builder.emit_call(dst, func_reg, args_start, 2); 2422 + 2423 + fc.next_reg -= 1; // flags_reg 2424 + fc.next_reg -= 1; // pat_reg 2425 + fc.free_reg(func_reg); 2409 2426 } 2410 2427 2411 2428 ExprKind::OptionalChain { base } => {
+1
crates/js/src/lib.rs
··· 7 7 pub mod gc; 8 8 pub mod lexer; 9 9 pub mod parser; 10 + pub mod regex; 10 11 pub mod vm; 11 12 12 13 use std::fmt;
+1370
crates/js/src/regex.rs
··· 1 + //! Pure-Rust regular expression engine for the JavaScript RegExp built-in. 2 + //! 3 + //! Implements a regex parser (pattern → IR) and a backtracking matcher. 4 + //! Supports: character classes, quantifiers (greedy/lazy), anchors, groups 5 + //! (capturing, non-capturing, named), alternation, backreferences, lookahead, 6 + //! and standard escape sequences. 7 + 8 + // ── Regex AST ─────────────────────────────────────────────── 9 + 10 + /// A parsed regex node. 11 + #[derive(Debug, Clone)] 12 + pub enum Node { 13 + /// Match a single literal character. 14 + Literal(char), 15 + /// `.` — match any character (respects dotAll flag). 16 + Dot, 17 + /// `^` — start anchor. 18 + Start, 19 + /// `$` — end anchor. 20 + End, 21 + /// `\b` — word boundary. 22 + WordBoundary, 23 + /// `\B` — non-word boundary. 24 + NonWordBoundary, 25 + /// Character class `[...]` or `[^...]`. 26 + CharClass { 27 + negated: bool, 28 + ranges: Vec<ClassRange>, 29 + }, 30 + /// Alternation `a|b`. 31 + Alternation(Vec<Node>), 32 + /// Sequence of nodes (implicit concatenation). 33 + Sequence(Vec<Node>), 34 + /// Quantifier applied to a sub-node. 35 + Quantifier { 36 + node: Box<Node>, 37 + min: u32, 38 + max: Option<u32>, // None = unbounded 39 + greedy: bool, 40 + }, 41 + /// Capturing group `(...)`. 42 + Group { 43 + index: u32, 44 + name: Option<String>, 45 + node: Box<Node>, 46 + }, 47 + /// Non-capturing group `(?:...)`. 48 + NonCapturingGroup(Box<Node>), 49 + /// Backreference `\1` or `\k<name>`. 50 + Backref(u32), 51 + /// Positive lookahead `(?=...)`. 52 + Lookahead(Box<Node>), 53 + /// Negative lookahead `(?!...)`. 54 + NegativeLookahead(Box<Node>), 55 + } 56 + 57 + /// A range within a character class. 58 + #[derive(Debug, Clone)] 59 + pub enum ClassRange { 60 + /// Single character. 61 + Char(char), 62 + /// Character range `a-z`. 63 + Range(char, char), 64 + /// Predefined class like `\d`, `\w`, `\s`. 
65 + Predefined(PredefinedClass), 66 + } 67 + 68 + /// Predefined character classes. 69 + #[derive(Debug, Clone, Copy)] 70 + pub enum PredefinedClass { 71 + Digit, // \d 72 + NonDigit, // \D 73 + Word, // \w 74 + NonWord, // \W 75 + Space, // \s 76 + NonSpace, // \S 77 + } 78 + 79 + // ── Regex flags ───────────────────────────────────────────── 80 + 81 + /// Parsed regex flags. 82 + #[derive(Debug, Clone, Copy, Default)] 83 + pub struct RegexFlags { 84 + pub global: bool, 85 + pub ignore_case: bool, 86 + pub multiline: bool, 87 + pub dot_all: bool, 88 + pub unicode: bool, 89 + pub sticky: bool, 90 + } 91 + 92 + impl RegexFlags { 93 + pub fn parse(flags_str: &str) -> Result<Self, String> { 94 + let mut f = RegexFlags::default(); 95 + for ch in flags_str.chars() { 96 + match ch { 97 + 'g' => { 98 + if f.global { 99 + return Err(format!("duplicate flag '{}'", ch)); 100 + } 101 + f.global = true; 102 + } 103 + 'i' => { 104 + if f.ignore_case { 105 + return Err(format!("duplicate flag '{}'", ch)); 106 + } 107 + f.ignore_case = true; 108 + } 109 + 'm' => { 110 + if f.multiline { 111 + return Err(format!("duplicate flag '{}'", ch)); 112 + } 113 + f.multiline = true; 114 + } 115 + 's' => { 116 + if f.dot_all { 117 + return Err(format!("duplicate flag '{}'", ch)); 118 + } 119 + f.dot_all = true; 120 + } 121 + 'u' => { 122 + if f.unicode { 123 + return Err(format!("duplicate flag '{}'", ch)); 124 + } 125 + f.unicode = true; 126 + } 127 + 'y' => { 128 + if f.sticky { 129 + return Err(format!("duplicate flag '{}'", ch)); 130 + } 131 + f.sticky = true; 132 + } 133 + _ => return Err(format!("invalid flag '{}'", ch)), 134 + } 135 + } 136 + Ok(f) 137 + } 138 + 139 + pub fn as_flag_string(&self) -> String { 140 + let mut s = String::new(); 141 + if self.global { 142 + s.push('g'); 143 + } 144 + if self.ignore_case { 145 + s.push('i'); 146 + } 147 + if self.multiline { 148 + s.push('m'); 149 + } 150 + if self.dot_all { 151 + s.push('s'); 152 + } 153 + if self.unicode { 154 + 
s.push('u'); 155 + } 156 + if self.sticky { 157 + s.push('y'); 158 + } 159 + s 160 + } 161 + } 162 + 163 + // ── Regex Parser ──────────────────────────────────────────── 164 + 165 + /// Parse a regex pattern string into a `Node` tree. 166 + pub fn parse_pattern(pattern: &str) -> Result<(Node, u32), String> { 167 + let chars: Vec<char> = pattern.chars().collect(); 168 + let mut parser = PatternParser { 169 + chars: &chars, 170 + pos: 0, 171 + group_count: 0, 172 + group_names: Vec::new(), 173 + }; 174 + let node = parser.parse_alternation()?; 175 + if parser.pos < parser.chars.len() { 176 + return Err(format!( 177 + "unexpected '{}' at position {}", 178 + parser.chars[parser.pos], parser.pos 179 + )); 180 + } 181 + Ok((node, parser.group_count)) 182 + } 183 + 184 + struct PatternParser<'a> { 185 + chars: &'a [char], 186 + pos: usize, 187 + group_count: u32, 188 + group_names: Vec<(String, u32)>, 189 + } 190 + 191 + impl<'a> PatternParser<'a> { 192 + fn peek(&self) -> Option<char> { 193 + self.chars.get(self.pos).copied() 194 + } 195 + 196 + fn advance(&mut self) -> Option<char> { 197 + let ch = self.chars.get(self.pos).copied(); 198 + if ch.is_some() { 199 + self.pos += 1; 200 + } 201 + ch 202 + } 203 + 204 + fn expect(&mut self, expected: char) -> Result<(), String> { 205 + match self.advance() { 206 + Some(ch) if ch == expected => Ok(()), 207 + Some(ch) => Err(format!("expected '{}', got '{}'", expected, ch)), 208 + None => Err(format!("expected '{}', got end of pattern", expected)), 209 + } 210 + } 211 + 212 + /// Parse alternation: `a|b|c` 213 + fn parse_alternation(&mut self) -> Result<Node, String> { 214 + let mut branches = vec![self.parse_sequence()?]; 215 + while self.peek() == Some('|') { 216 + self.advance(); // consume '|' 217 + branches.push(self.parse_sequence()?); 218 + } 219 + if branches.len() == 1 { 220 + Ok(branches.pop().unwrap()) 221 + } else { 222 + Ok(Node::Alternation(branches)) 223 + } 224 + } 225 + 226 + /// Parse a sequence of atoms 
(concatenation). 227 + fn parse_sequence(&mut self) -> Result<Node, String> { 228 + let mut nodes = Vec::new(); 229 + while let Some(ch) = self.peek() { 230 + if ch == '|' || ch == ')' { 231 + break; 232 + } 233 + nodes.push(self.parse_quantified()?); 234 + } 235 + if nodes.len() == 1 { 236 + Ok(nodes.pop().unwrap()) 237 + } else { 238 + Ok(Node::Sequence(nodes)) 239 + } 240 + } 241 + 242 + /// Parse an atom with optional quantifier. 243 + fn parse_quantified(&mut self) -> Result<Node, String> { 244 + let node = self.parse_atom()?; 245 + if let Some(ch) = self.peek() { 246 + match ch { 247 + '*' | '+' | '?' => { 248 + self.advance(); 249 + let (min, max) = match ch { 250 + '*' => (0, None), 251 + '+' => (1, None), 252 + '?' => (0, Some(1)), 253 + _ => unreachable!(), 254 + }; 255 + let greedy = if self.peek() == Some('?') { 256 + self.advance(); 257 + false 258 + } else { 259 + true 260 + }; 261 + Ok(Node::Quantifier { 262 + node: Box::new(node), 263 + min, 264 + max, 265 + greedy, 266 + }) 267 + } 268 + '{' => self.parse_brace_quantifier(node), 269 + _ => Ok(node), 270 + } 271 + } else { 272 + Ok(node) 273 + } 274 + } 275 + 276 + /// Parse `{n}`, `{n,}`, `{n,m}` quantifier. 277 + fn parse_brace_quantifier(&mut self, node: Node) -> Result<Node, String> { 278 + let save = self.pos; 279 + self.advance(); // consume '{' 280 + let min = match self.parse_decimal() { 281 + Some(n) => n, 282 + None => { 283 + // Not a valid quantifier — treat '{' as literal. 
284 + self.pos = save; 285 + return Ok(node); 286 + } 287 + }; 288 + let max; 289 + match self.peek() { 290 + Some('}') => { 291 + self.advance(); 292 + max = Some(min); 293 + } 294 + Some(',') => { 295 + self.advance(); 296 + if self.peek() == Some('}') { 297 + self.advance(); 298 + max = None; 299 + } else { 300 + match self.parse_decimal() { 301 + Some(n) => { 302 + max = Some(n); 303 + if self.peek() != Some('}') { 304 + self.pos = save; 305 + return Ok(node); 306 + } 307 + self.advance(); 308 + } 309 + None => { 310 + self.pos = save; 311 + return Ok(node); 312 + } 313 + } 314 + } 315 + } 316 + _ => { 317 + self.pos = save; 318 + return Ok(node); 319 + } 320 + } 321 + let greedy = if self.peek() == Some('?') { 322 + self.advance(); 323 + false 324 + } else { 325 + true 326 + }; 327 + Ok(Node::Quantifier { 328 + node: Box::new(node), 329 + min, 330 + max, 331 + greedy, 332 + }) 333 + } 334 + 335 + fn parse_decimal(&mut self) -> Option<u32> { 336 + let start = self.pos; 337 + while let Some(ch) = self.peek() { 338 + if ch.is_ascii_digit() { 339 + self.advance(); 340 + } else { 341 + break; 342 + } 343 + } 344 + if self.pos == start { 345 + return None; 346 + } 347 + let s: String = self.chars[start..self.pos].iter().collect(); 348 + s.parse().ok() 349 + } 350 + 351 + /// Parse a single atom. 352 + fn parse_atom(&mut self) -> Result<Node, String> { 353 + match self.peek() { 354 + None => Err("unexpected end of pattern".to_string()), 355 + Some('.') => { 356 + self.advance(); 357 + Ok(Node::Dot) 358 + } 359 + Some('^') => { 360 + self.advance(); 361 + Ok(Node::Start) 362 + } 363 + Some('$') => { 364 + self.advance(); 365 + Ok(Node::End) 366 + } 367 + Some('\\') => self.parse_escape(), 368 + Some('[') => self.parse_char_class(), 369 + Some('(') => self.parse_group(), 370 + Some(ch) => { 371 + self.advance(); 372 + Ok(Node::Literal(ch)) 373 + } 374 + } 375 + } 376 + 377 + /// Parse an escape sequence. 
// Remaining `PatternParser` methods (escapes, character classes, groups), then the compiled-regex entry points.
    fn parse_escape(&mut self) -> Result<Node, String> {
        self.advance(); // consume '\'
        let ch = match self.advance() {
            Some(ch) => ch,
            None => return Err("unexpected end of pattern after '\\'".to_string()),
        };
        if let Some(class) = Self::predefined_class(ch) {
            return Ok(Node::CharClass {
                negated: false,
                ranges: vec![ClassRange::Predefined(class)],
            });
        }
        if let Some(literal) = Self::control_escape(ch) {
            return Ok(Node::Literal(literal));
        }
        match ch {
            'b' => Ok(Node::WordBoundary),
            'B' => Ok(Node::NonWordBoundary),
            'x' => self.parse_hex_escape().map(Node::Literal),
            'u' => self.parse_unicode_escape().map(Node::Literal),
            'k' => {
                // Named backreference `\k<name>`; only groups already parsed
                // (to the left of this point) can be resolved.
                self.expect('<')?;
                let name = self.parse_group_name()?;
                self.group_names
                    .iter()
                    .find(|(known, _)| *known == name)
                    .map(|&(_, idx)| Node::Backref(idx))
                    .ok_or_else(|| format!("unknown group name '{}'", name))
            }
            '1'..='9' => {
                // Numeric backreference `\1`, `\12`, ...: take every following digit.
                let mut digits = String::from(ch);
                while let Some(next) = self.peek() {
                    if !next.is_ascii_digit() {
                        break;
                    }
                    digits.push(next);
                    self.advance();
                }
                let idx: u32 = digits.parse().map_err(|_| "invalid backreference")?;
                Ok(Node::Backref(idx))
            }
            // Escaped metacharacters (and any other character) are literals.
            other => Ok(Node::Literal(other)),
        }
    }

    /// Map the letter of `\d \D \w \W \s \S` to its predefined class.
    fn predefined_class(ch: char) -> Option<PredefinedClass> {
        match ch {
            'd' => Some(PredefinedClass::Digit),
            'D' => Some(PredefinedClass::NonDigit),
            'w' => Some(PredefinedClass::Word),
            'W' => Some(PredefinedClass::NonWord),
            's' => Some(PredefinedClass::Space),
            'S' => Some(PredefinedClass::NonSpace),
            _ => None,
        }
    }

    /// Map the letter of `\n \r \t \f \v \0` to the character it denotes.
    fn control_escape(ch: char) -> Option<char> {
        match ch {
            'n' => Some('\n'),
            'r' => Some('\r'),
            't' => Some('\t'),
            'f' => Some('\x0C'),
            'v' => Some('\x0B'),
            '0' => Some('\0'),
            _ => None,
        }
    }

    /// Parse the two hex digits that follow `\x`.
    fn parse_hex_escape(&mut self) -> Result<char, String> {
        let hi = self.advance().ok_or("expected hex digit")?;
        let lo = self.advance().ok_or("expected hex digit")?;
        Ok(char::from(hex2(hi, lo)?))
    }

    /// Parse what follows `\u`: either `{h...}` or exactly four hex digits.
    ///
    /// Code points that are not valid `char`s (e.g. lone surrogates) are
    /// rejected, because the matcher works on `char`s.
    fn parse_unicode_escape(&mut self) -> Result<char, String> {
        if self.peek() == Some('{') {
            self.advance();
            let mut code_str = String::new();
            while self.peek() != Some('}') {
                code_str.push(self.advance().ok_or("expected '}'")?);
            }
            self.advance(); // consume '}'
            // `from_str_radix` alone would accept a leading sign such as `+41`.
            if !code_str.chars().all(|c| c.is_ascii_hexdigit()) {
                return Err("invalid unicode escape".to_string());
            }
            let code = u32::from_str_radix(&code_str, 16).map_err(|_| "invalid unicode escape")?;
            char::from_u32(code).ok_or_else(|| "invalid unicode code point".to_string())
        } else {
            let a = self.advance().ok_or("expected hex digit")?;
            let b = self.advance().ok_or("expected hex digit")?;
            let c = self.advance().ok_or("expected hex digit")?;
            let d = self.advance().ok_or("expected hex digit")?;
            let code = hex4(a, b, c, d)?;
            char::from_u32(code).ok_or_else(|| "invalid unicode escape".to_string())
        }
    }

    /// Read a group name up to and including the closing `>`.
    ///
    /// The opening `<` must already have been consumed.
    fn parse_group_name(&mut self) -> Result<String, String> {
        let mut name = String::new();
        while self.peek() != Some('>') {
            name.push(self.advance().ok_or("expected '>'")?);
        }
        self.advance(); // consume '>'
        Ok(name)
    }

    /// Parse a character class `[...]` or `[^...]`.
    ///
    /// As in JavaScript, an unescaped `]` always closes the class, so `[]`
    /// is the empty class (matches nothing) and `[^]` matches any character.
    fn parse_char_class(&mut self) -> Result<Node, String> {
        self.advance(); // consume '['
        let negated = self.peek() == Some('^');
        if negated {
            self.advance();
        }
        let mut ranges = Vec::new();
        while self.peek() != Some(']') {
            if self.peek().is_none() {
                return Err("unterminated character class".to_string());
            }
            let item = self.parse_class_atom()?;
            // A '-' starts a range only when something other than the closing
            // ']' follows it; otherwise it is an ordinary member.
            let range_follows = self.peek() == Some('-')
                && matches!(self.chars.get(self.pos + 1), Some(&next) if next != ']');
            match item {
                ClassRange::Char(start) if range_follows => {
                    self.advance(); // consume '-'
                    match self.parse_class_atom()? {
                        ClassRange::Char(end) => {
                            if start > end {
                                return Err("character class range out of order".to_string());
                            }
                            ranges.push(ClassRange::Range(start, end));
                        }
                        end_item => {
                            // `a-\d` is not a range: keep 'a', '-', and the class.
                            ranges.push(ClassRange::Char(start));
                            ranges.push(ClassRange::Char('-'));
                            ranges.push(end_item);
                        }
                    }
                }
                other => ranges.push(other),
            }
        }
        self.advance(); // consume ']'
        Ok(Node::CharClass { negated, ranges })
    }

    /// Parse one member of a character class: a plain character or an escape.
    fn parse_class_atom(&mut self) -> Result<ClassRange, String> {
        match self.advance() {
            None => Err("unexpected end of character class".to_string()),
            Some('\\') => {
                let ch = match self.advance() {
                    Some(ch) => ch,
                    None => return Err("unexpected end of class".to_string()),
                };
                if let Some(class) = Self::predefined_class(ch) {
                    return Ok(ClassRange::Predefined(class));
                }
                if let Some(literal) = Self::control_escape(ch) {
                    return Ok(ClassRange::Char(literal));
                }
                match ch {
                    // Inside a class `\b` is backspace, not a word boundary.
                    'b' => Ok(ClassRange::Char('\u{0008}')),
                    'x' => self.parse_hex_escape().map(ClassRange::Char),
                    'u' => self.parse_unicode_escape().map(ClassRange::Char),
                    other => Ok(ClassRange::Char(other)),
                }
            }
            Some(ch) => Ok(ClassRange::Char(ch)),
        }
    }

    /// Parse a group `(...)`: capturing, named, non-capturing, or lookahead.
    ///
    /// Capturing groups are numbered in the order their `(` appears.
    /// Lookbehind (`(?<=...)`, `(?<!...)`) is not implemented and is reported
    /// as an error instead of being misread as a named group.
    fn parse_group(&mut self) -> Result<Node, String> {
        self.advance(); // consume '('
        if self.peek() != Some('?') {
            self.group_count += 1;
            let index = self.group_count;
            let inner = self.parse_group_body()?;
            return Ok(Node::Group {
                index,
                name: None,
                node: Box::new(inner),
            });
        }
        self.advance(); // consume '?'
        match self.advance() {
            Some(':') => Ok(Node::NonCapturingGroup(Box::new(self.parse_group_body()?))),
            Some('=') => Ok(Node::Lookahead(Box::new(self.parse_group_body()?))),
            Some('!') => Ok(Node::NegativeLookahead(Box::new(self.parse_group_body()?))),
            Some('<') => {
                if matches!(self.peek(), Some('=') | Some('!')) {
                    return Err("lookbehind assertions are not supported".to_string());
                }
                let name = self.parse_group_name()?;
                // Number the group before parsing its body so nested groups
                // receive higher indices.
                self.group_count += 1;
                let index = self.group_count;
                self.group_names.push((name.clone(), index));
                let inner = self.parse_group_body()?;
                Ok(Node::Group {
                    index,
                    name: Some(name),
                    node: Box::new(inner),
                })
            }
            _ => Err("invalid group specifier".to_string()),
        }
    }

    /// Parse a group's contents and the closing `)`.
    fn parse_group_body(&mut self) -> Result<Node, String> {
        let inner = self.parse_alternation()?;
        self.expect(')')?;
        Ok(inner)
    }
}

// ── Hex helpers ─────────────────────────────────────────────

/// Value of a single hexadecimal digit (either case).
fn hex_digit(ch: char) -> Result<u32, String> {
    ch.to_digit(16)
        .ok_or_else(|| format!("invalid hex digit '{}'", ch))
}

/// Combine two hex digits into a byte.
fn hex2(hi: char, lo: char) -> Result<u8, String> {
    // Two hex digits are at most 0xFF, so the cast cannot truncate.
    Ok((hex_digit(hi)? * 16 + hex_digit(lo)?) as u8)
}

/// Combine four hex digits into a 16-bit code unit value.
fn hex4(a: char, b: char, c: char, d: char) -> Result<u32, String> {
    Ok(hex_digit(a)? * 4096 + hex_digit(b)? * 256 + hex_digit(c)? * 16 + hex_digit(d)?)
}

// ── Compiled regex ──────────────────────────────────────────

/// A compiled regular expression ready for matching.
#[derive(Debug, Clone)]
pub struct CompiledRegex {
    /// Pattern source text, exactly as given to [`CompiledRegex::new`].
    pub pattern: String,
    /// Parsed flags.
    pub flags: RegexFlags,
    /// Root of the parsed pattern tree.
    pub node: Node,
    /// Number of capturing groups (group 0, the whole match, not included).
    pub group_count: u32,
}

impl CompiledRegex {
    /// Parse `flags_str` and `pattern` into a ready-to-run regex.
    ///
    /// # Errors
    ///
    /// Returns the parser's message when the flags or the pattern are invalid.
    pub fn new(pattern: &str, flags_str: &str) -> Result<Self, String> {
        let flags = RegexFlags::parse(flags_str)?;
        let (node, group_count) = parse_pattern(pattern)?;
        Ok(CompiledRegex {
            pattern: pattern.to_string(),
            flags,
            node,
            group_count,
        })
    }
}

// ── Backtracking matcher ────────────────────────────────────

/// Result of a successful regex match.
#[derive(Debug, Clone)]
pub struct MatchResult {
    /// Overall match start index (in chars).
    pub start: usize,
    /// Overall match end index (in chars).
    pub end: usize,
    /// Capture group contents: index 0 = full match, 1..n = groups.
    pub captures: Vec<Option<(usize, usize)>>,
}

/// Execute the regex on the given string, starting the search at `start_pos` (char index).
///
/// With the sticky flag only `start_pos` itself is tried; otherwise every
/// position from `start_pos` through the end of the input (inclusive, so an
/// empty match at the end is possible) is tried in order. Returns `None` when
/// nothing matches or when `start_pos` lies beyond the end of the input.
pub fn exec(regex: &CompiledRegex, input: &str, start_pos: usize) -> Option<MatchResult> {
    let chars: Vec<char> = input.chars().collect();
    // A start past the end can never match; without this guard a sticky
    // regex would report a match outside the input.
    if start_pos > chars.len() {
        return None;
    }
    if regex.flags.sticky {
        return try_match_at(regex, &chars, start_pos);
    }
    (start_pos..=chars.len()).find_map(|start| try_match_at(regex, &chars, start))
}

/// Attempt a match anchored at exactly `start`, with a fresh capture table
/// and a fresh step budget.
fn try_match_at(regex: &CompiledRegex, chars: &[char], start: usize) -> Option<MatchResult> {
    /// Upper bound on matcher steps per start position.
    const BACKTRACK_LIMIT: u32 = 1_000_000;

    let mut captures = vec![None; regex.group_count as usize + 1];
    let end = {
        let mut ctx = MatchContext {
            chars,
            flags: &regex.flags,
            captures: &mut captures,
            backtrack_limit: BACKTRACK_LIMIT,
            backtrack_count: 0,
        };
        match_node(&mut ctx, &regex.node, start)?
    };
    captures[0] = Some((start, end));
    Some(MatchResult {
        start,
        end,
        captures,
    })
}

/// Mutable state shared by the matcher functions during one match attempt.
struct MatchContext<'a> {
    /// Input as chars; all positions are indices into this slice.
    chars: &'a [char],
    /// Flags of the regex being executed.
    flags: &'a RegexFlags,
    /// Capture table: slot 0 is reserved for the whole match, 1..n for groups.
    captures: &'a mut Vec<Option<(usize, usize)>>,
    /// Maximum number of `match_node` calls before the attempt is abandoned.
    backtrack_limit: u32,
    /// Number of `match_node` calls made so far in this attempt.
    backtrack_count: u32,
}

/// Try to match `node` at position `pos`, returning the end position on success.
759 + fn match_node(ctx: &mut MatchContext, node: &Node, pos: usize) -> Option<usize> { 760 + ctx.backtrack_count += 1; 761 + if ctx.backtrack_count > ctx.backtrack_limit { 762 + return None; 763 + } 764 + 765 + match node { 766 + Node::Literal(ch) => { 767 + if pos < ctx.chars.len() { 768 + let input_ch = ctx.chars[pos]; 769 + if ctx.flags.ignore_case { 770 + if char_eq_ignore_case(input_ch, *ch) { 771 + Some(pos + 1) 772 + } else { 773 + None 774 + } 775 + } else if input_ch == *ch { 776 + Some(pos + 1) 777 + } else { 778 + None 779 + } 780 + } else { 781 + None 782 + } 783 + } 784 + 785 + Node::Dot => { 786 + if pos < ctx.chars.len() { 787 + let ch = ctx.chars[pos]; 788 + if ctx.flags.dot_all || (ch != '\n' && ch != '\r') { 789 + Some(pos + 1) 790 + } else { 791 + None 792 + } 793 + } else { 794 + None 795 + } 796 + } 797 + 798 + Node::Start => { 799 + if pos == 0 800 + || (ctx.flags.multiline && pos > 0 && is_line_terminator(ctx.chars[pos - 1])) 801 + { 802 + Some(pos) 803 + } else { 804 + None 805 + } 806 + } 807 + 808 + Node::End => { 809 + if pos == ctx.chars.len() 810 + || (ctx.flags.multiline 811 + && pos < ctx.chars.len() 812 + && is_line_terminator(ctx.chars[pos])) 813 + { 814 + Some(pos) 815 + } else { 816 + None 817 + } 818 + } 819 + 820 + Node::WordBoundary => { 821 + let before = if pos > 0 { 822 + is_word_char(ctx.chars[pos - 1]) 823 + } else { 824 + false 825 + }; 826 + let after = if pos < ctx.chars.len() { 827 + is_word_char(ctx.chars[pos]) 828 + } else { 829 + false 830 + }; 831 + if before != after { 832 + Some(pos) 833 + } else { 834 + None 835 + } 836 + } 837 + 838 + Node::NonWordBoundary => { 839 + let before = if pos > 0 { 840 + is_word_char(ctx.chars[pos - 1]) 841 + } else { 842 + false 843 + }; 844 + let after = if pos < ctx.chars.len() { 845 + is_word_char(ctx.chars[pos]) 846 + } else { 847 + false 848 + }; 849 + if before == after { 850 + Some(pos) 851 + } else { 852 + None 853 + } 854 + } 855 + 856 + Node::CharClass { negated, ranges } 
=> { 857 + if pos >= ctx.chars.len() { 858 + return None; 859 + } 860 + let ch = ctx.chars[pos]; 861 + let matched = class_matches(ranges, ch, ctx.flags.ignore_case); 862 + if matched != *negated { 863 + Some(pos + 1) 864 + } else { 865 + None 866 + } 867 + } 868 + 869 + Node::Sequence(nodes) => match_sequence(ctx, nodes, 0, pos), 870 + 871 + Node::Alternation(branches) => { 872 + for branch in branches { 873 + let saved = ctx.captures.clone(); 874 + if let Some(end) = match_node(ctx, branch, pos) { 875 + return Some(end); 876 + } 877 + *ctx.captures = saved; 878 + } 879 + None 880 + } 881 + 882 + Node::Quantifier { 883 + node: inner, 884 + min, 885 + max, 886 + greedy, 887 + } => match_quantifier_standalone(ctx, inner, pos, *min, *max, *greedy), 888 + 889 + Node::Group { 890 + index, node: inner, .. 891 + } => { 892 + let idx = *index as usize; 893 + let saved = if idx < ctx.captures.len() { 894 + ctx.captures[idx] 895 + } else { 896 + None 897 + }; 898 + let result = match_node(ctx, inner, pos); 899 + if let Some(end) = result { 900 + if idx < ctx.captures.len() { 901 + ctx.captures[idx] = Some((pos, end)); 902 + } 903 + Some(end) 904 + } else { 905 + if idx < ctx.captures.len() { 906 + ctx.captures[idx] = saved; 907 + } 908 + None 909 + } 910 + } 911 + 912 + Node::NonCapturingGroup(inner) => match_node(ctx, inner, pos), 913 + 914 + Node::Backref(idx) => { 915 + let idx = *idx as usize; 916 + if idx >= ctx.captures.len() { 917 + return Some(pos); // Unmatched backref matches empty. 
918 + } 919 + match ctx.captures[idx] { 920 + Some((start, end)) => { 921 + let cap_len = end - start; 922 + if pos + cap_len > ctx.chars.len() { 923 + return None; 924 + } 925 + for i in 0..cap_len { 926 + let a = ctx.chars[start + i]; 927 + let b = ctx.chars[pos + i]; 928 + if ctx.flags.ignore_case { 929 + if !char_eq_ignore_case(a, b) { 930 + return None; 931 + } 932 + } else if a != b { 933 + return None; 934 + } 935 + } 936 + Some(pos + cap_len) 937 + } 938 + None => Some(pos), // Unmatched group — backref matches empty. 939 + } 940 + } 941 + 942 + Node::Lookahead(inner) => { 943 + let saved = ctx.captures.clone(); 944 + if match_node(ctx, inner, pos).is_some() { 945 + Some(pos) // Lookahead doesn't consume. 946 + } else { 947 + *ctx.captures = saved; 948 + None 949 + } 950 + } 951 + 952 + Node::NegativeLookahead(inner) => { 953 + let saved = ctx.captures.clone(); 954 + if match_node(ctx, inner, pos).is_some() { 955 + *ctx.captures = saved; 956 + None 957 + } else { 958 + *ctx.captures = saved; 959 + Some(pos) 960 + } 961 + } 962 + } 963 + } 964 + 965 + /// Match a sequence of nodes with backtracking support for quantifiers. 966 + fn match_sequence(ctx: &mut MatchContext, nodes: &[Node], idx: usize, pos: usize) -> Option<usize> { 967 + if idx >= nodes.len() { 968 + return Some(pos); 969 + } 970 + 971 + let node = &nodes[idx]; 972 + 973 + // For quantifiers, try each count with the remaining sequence as continuation. 974 + if let Node::Quantifier { 975 + node: inner, 976 + min, 977 + max, 978 + greedy, 979 + } = node 980 + { 981 + return match_quantifier_in_seq(ctx, inner, pos, *min, *max, *greedy, &nodes[idx + 1..]); 982 + } 983 + 984 + // For non-quantifier nodes, match and continue. 
985 + let saved = ctx.captures.clone(); 986 + match match_node(ctx, node, pos) { 987 + Some(next) => match_sequence(ctx, nodes, idx + 1, next).or_else(|| { 988 + *ctx.captures = saved; 989 + None 990 + }), 991 + None => None, 992 + } 993 + } 994 + 995 + /// Match a quantifier within a sequence, trying each count with the continuation. 996 + fn match_quantifier_in_seq( 997 + ctx: &mut MatchContext, 998 + inner: &Node, 999 + pos: usize, 1000 + min: u32, 1001 + max: Option<u32>, 1002 + greedy: bool, 1003 + continuation: &[Node], 1004 + ) -> Option<usize> { 1005 + // Collect all reachable positions (match inner 0 to max times). 1006 + let mut positions = vec![pos]; // positions[0] = 0 matches 1007 + let mut cur = pos; 1008 + 1009 + loop { 1010 + let count = positions.len() - 1; 1011 + if let Some(m) = max { 1012 + if count as u32 >= m { 1013 + break; 1014 + } 1015 + } 1016 + let saved = ctx.captures.clone(); 1017 + match match_node(ctx, inner, cur) { 1018 + Some(next) if next > cur => { 1019 + cur = next; 1020 + positions.push(cur); 1021 + } 1022 + Some(next) => { 1023 + // Zero-width match — record once and stop to prevent infinite loop. 1024 + *ctx.captures = saved; 1025 + if next == cur && (count as u32) < min { 1026 + positions.push(cur); 1027 + } 1028 + break; 1029 + } 1030 + None => { 1031 + *ctx.captures = saved; 1032 + break; 1033 + } 1034 + } 1035 + } 1036 + 1037 + let max_count = positions.len() - 1; 1038 + let min_count = min as usize; 1039 + if max_count < min_count { 1040 + return None; 1041 + } 1042 + 1043 + if greedy { 1044 + // Try from most matches to fewest. 1045 + for &p in positions[min_count..=max_count].iter().rev() { 1046 + let saved = ctx.captures.clone(); 1047 + if let Some(end) = match_sequence(ctx, continuation, 0, p) { 1048 + return Some(end); 1049 + } 1050 + *ctx.captures = saved; 1051 + } 1052 + None 1053 + } else { 1054 + // Lazy: try from fewest to most. 
1055 + for &p in &positions[min_count..=max_count] { 1056 + let saved = ctx.captures.clone(); 1057 + if let Some(end) = match_sequence(ctx, continuation, 0, p) { 1058 + return Some(end); 1059 + } 1060 + *ctx.captures = saved; 1061 + } 1062 + None 1063 + } 1064 + } 1065 + 1066 + /// Match a standalone quantifier (not in a sequence context). 1067 + fn match_quantifier_standalone( 1068 + ctx: &mut MatchContext, 1069 + inner: &Node, 1070 + pos: usize, 1071 + min: u32, 1072 + max: Option<u32>, 1073 + greedy: bool, 1074 + ) -> Option<usize> { 1075 + let mut positions = vec![pos]; 1076 + let mut cur = pos; 1077 + 1078 + loop { 1079 + let count = positions.len() - 1; 1080 + if let Some(m) = max { 1081 + if count as u32 >= m { 1082 + break; 1083 + } 1084 + } 1085 + let saved = ctx.captures.clone(); 1086 + match match_node(ctx, inner, cur) { 1087 + Some(next) if next > cur => { 1088 + cur = next; 1089 + positions.push(cur); 1090 + } 1091 + Some(next) => { 1092 + *ctx.captures = saved; 1093 + if next == cur && (count as u32) < min { 1094 + positions.push(cur); 1095 + } 1096 + break; 1097 + } 1098 + None => { 1099 + *ctx.captures = saved; 1100 + break; 1101 + } 1102 + } 1103 + } 1104 + 1105 + let max_count = positions.len() - 1; 1106 + let min_count = min as usize; 1107 + if max_count < min_count { 1108 + return None; 1109 + } 1110 + 1111 + if greedy { 1112 + Some(positions[max_count]) 1113 + } else { 1114 + Some(positions[min_count]) 1115 + } 1116 + } 1117 + 1118 + // ── Character helpers ─────────────────────────────────────── 1119 + 1120 + fn is_line_terminator(ch: char) -> bool { 1121 + matches!(ch, '\n' | '\r' | '\u{2028}' | '\u{2029}') 1122 + } 1123 + 1124 + fn is_word_char(ch: char) -> bool { 1125 + ch.is_ascii_alphanumeric() || ch == '_' 1126 + } 1127 + 1128 + fn char_eq_ignore_case(a: char, b: char) -> bool { 1129 + a.eq_ignore_ascii_case(&b) 1130 + } 1131 + 1132 + fn class_matches(ranges: &[ClassRange], ch: char, ignore_case: bool) -> bool { 1133 + for range in 
ranges { 1134 + match range { 1135 + ClassRange::Char(c) => { 1136 + if ignore_case { 1137 + if char_eq_ignore_case(ch, *c) { 1138 + return true; 1139 + } 1140 + } else if ch == *c { 1141 + return true; 1142 + } 1143 + } 1144 + ClassRange::Range(start, end) => { 1145 + if ignore_case { 1146 + let ch_lower = ch.to_ascii_lowercase(); 1147 + let start_lower = start.to_ascii_lowercase(); 1148 + let end_lower = end.to_ascii_lowercase(); 1149 + if ch_lower >= start_lower && ch_lower <= end_lower { 1150 + return true; 1151 + } 1152 + } else if ch >= *start && ch <= *end { 1153 + return true; 1154 + } 1155 + } 1156 + ClassRange::Predefined(class) => { 1157 + if predefined_matches(*class, ch) { 1158 + return true; 1159 + } 1160 + } 1161 + } 1162 + } 1163 + false 1164 + } 1165 + 1166 + fn predefined_matches(class: PredefinedClass, ch: char) -> bool { 1167 + match class { 1168 + PredefinedClass::Digit => ch.is_ascii_digit(), 1169 + PredefinedClass::NonDigit => !ch.is_ascii_digit(), 1170 + PredefinedClass::Word => is_word_char(ch), 1171 + PredefinedClass::NonWord => !is_word_char(ch), 1172 + PredefinedClass::Space => matches!( 1173 + ch, 1174 + ' ' | '\t' | '\n' | '\r' | '\x0B' | '\x0C' | '\u{00A0}' | '\u{FEFF}' 1175 + ), 1176 + PredefinedClass::NonSpace => !matches!( 1177 + ch, 1178 + ' ' | '\t' | '\n' | '\r' | '\x0B' | '\x0C' | '\u{00A0}' | '\u{FEFF}' 1179 + ), 1180 + } 1181 + } 1182 + 1183 + // ── Tests ─────────────────────────────────────────────────── 1184 + 1185 + #[cfg(test)] 1186 + mod tests { 1187 + use super::*; 1188 + 1189 + fn match_at(pattern: &str, flags: &str, input: &str, start: usize) -> Option<MatchResult> { 1190 + let regex = CompiledRegex::new(pattern, flags).unwrap(); 1191 + exec(&regex, input, start) 1192 + } 1193 + 1194 + fn assert_match(pattern: &str, input: &str, expected_start: usize, expected_end: usize) { 1195 + let m = match_at(pattern, "", input, 0).expect("expected match"); 1196 + assert_eq!( 1197 + m.start, expected_start, 1198 + "pattern={} 
input={}", 1199 + pattern, input 1200 + ); 1201 + assert_eq!(m.end, expected_end, "pattern={} input={}", pattern, input); 1202 + } 1203 + 1204 + fn assert_no_match(pattern: &str, input: &str) { 1205 + assert!( 1206 + match_at(pattern, "", input, 0).is_none(), 1207 + "expected no match: pattern={} input={}", 1208 + pattern, 1209 + input 1210 + ); 1211 + } 1212 + 1213 + #[test] 1214 + fn test_literal() { 1215 + assert_match("abc", "xabcx", 1, 4); 1216 + assert_no_match("abc", "xyz"); 1217 + } 1218 + 1219 + #[test] 1220 + fn test_dot() { 1221 + assert_match("a.c", "abc", 0, 3); 1222 + assert_match("a.c", "axc", 0, 3); 1223 + assert_no_match("a.c", "a\nc"); 1224 + // dotAll flag 1225 + let m = match_at("a.c", "s", "a\nc", 0).unwrap(); 1226 + assert_eq!(m.start, 0); 1227 + assert_eq!(m.end, 3); 1228 + } 1229 + 1230 + #[test] 1231 + fn test_anchors() { 1232 + assert_match("^abc", "abc", 0, 3); 1233 + assert_no_match("^abc", "xabc"); 1234 + assert_match("abc$", "abc", 0, 3); 1235 + assert_no_match("abc$", "abcx"); 1236 + } 1237 + 1238 + #[test] 1239 + fn test_quantifiers() { 1240 + assert_match("a*", "aaa", 0, 3); 1241 + assert_match("a+", "aaa", 0, 3); 1242 + assert_no_match("a+", "bbb"); 1243 + assert_match("a?b", "ab", 0, 2); 1244 + assert_match("a?b", "b", 0, 1); 1245 + assert_match("a{2}", "aaa", 0, 2); 1246 + assert_match("a{2,3}", "aaaa", 0, 3); 1247 + assert_match("a{2,}", "aaaa", 0, 4); 1248 + } 1249 + 1250 + #[test] 1251 + fn test_char_class() { 1252 + assert_match("[abc]", "b", 0, 1); 1253 + assert_no_match("[abc]", "d"); 1254 + assert_match("[a-z]", "m", 0, 1); 1255 + assert_no_match("[a-z]", "M"); 1256 + assert_match("[^abc]", "d", 0, 1); 1257 + assert_no_match("[^abc]", "a"); 1258 + } 1259 + 1260 + #[test] 1261 + fn test_predefined_classes() { 1262 + assert_match("\\d+", "abc123", 3, 6); 1263 + assert_match("\\w+", "hello world", 0, 5); 1264 + assert_match("\\s+", "a b", 1, 2); 1265 + } 1266 + 1267 + #[test] 1268 + fn test_alternation() { 1269 + 
assert_match("cat|dog", "dog", 0, 3); 1270 + assert_match("cat|dog", "catdog", 0, 3); 1271 + } 1272 + 1273 + #[test] 1274 + fn test_groups() { 1275 + let m = match_at("(a)(b)(c)", "", "abc", 0).unwrap(); 1276 + assert_eq!(m.captures[1], Some((0, 1))); 1277 + assert_eq!(m.captures[2], Some((1, 2))); 1278 + assert_eq!(m.captures[3], Some((2, 3))); 1279 + } 1280 + 1281 + #[test] 1282 + fn test_non_capturing_group() { 1283 + let m = match_at("(?:ab)(c)", "", "abc", 0).unwrap(); 1284 + assert_eq!(m.captures[1], Some((2, 3))); 1285 + } 1286 + 1287 + #[test] 1288 + fn test_named_group() { 1289 + let m = match_at("(?<word>\\w+)", "", "hello", 0).unwrap(); 1290 + assert_eq!(m.captures[1], Some((0, 5))); 1291 + } 1292 + 1293 + #[test] 1294 + fn test_backreference() { 1295 + assert_match("(a)\\1", "aa", 0, 2); 1296 + assert_no_match("(a)\\1", "ab"); 1297 + } 1298 + 1299 + #[test] 1300 + fn test_word_boundary() { 1301 + assert_match("\\bfoo\\b", "a foo b", 2, 5); 1302 + assert_no_match("\\bfoo\\b", "afoo"); 1303 + } 1304 + 1305 + #[test] 1306 + fn test_lookahead() { 1307 + assert_match("a(?=b)", "ab", 0, 1); 1308 + assert_no_match("a(?=b)", "ac"); 1309 + assert_match("a(?!b)", "ac", 0, 1); 1310 + assert_no_match("a(?!b)", "ab"); 1311 + } 1312 + 1313 + #[test] 1314 + fn test_ignore_case() { 1315 + let m = match_at("abc", "i", "ABC", 0).unwrap(); 1316 + assert_eq!(m.start, 0); 1317 + assert_eq!(m.end, 3); 1318 + } 1319 + 1320 + #[test] 1321 + fn test_multiline() { 1322 + let m = match_at("^b", "m", "a\nb", 0).unwrap(); 1323 + assert_eq!(m.start, 2); 1324 + } 1325 + 1326 + #[test] 1327 + fn test_global_multiple() { 1328 + let regex = CompiledRegex::new("a", "g").unwrap(); 1329 + let m1 = exec(&regex, "aba", 0).unwrap(); 1330 + assert_eq!(m1.start, 0); 1331 + let m2 = exec(&regex, "aba", m1.end).unwrap(); 1332 + assert_eq!(m2.start, 2); 1333 + assert!(exec(&regex, "aba", m2.end).is_none()); 1334 + } 1335 + 1336 + #[test] 1337 + fn test_escape_sequences() { 1338 + 
assert_match("\\n", "\n", 0, 1); 1339 + assert_match("\\t", "\t", 0, 1); 1340 + assert_match("\\x41", "A", 0, 1); 1341 + assert_match("\\u0041", "A", 0, 1); 1342 + } 1343 + 1344 + #[test] 1345 + fn test_lazy_quantifiers() { 1346 + let m = match_at("a+?", "", "aaa", 0).unwrap(); 1347 + assert_eq!(m.end, 1); // Lazy: match as few as possible. 1348 + } 1349 + 1350 + #[test] 1351 + fn test_empty_pattern() { 1352 + let m = match_at("", "", "abc", 0).unwrap(); 1353 + assert_eq!(m.start, 0); 1354 + assert_eq!(m.end, 0); 1355 + } 1356 + 1357 + #[test] 1358 + fn test_flags_parse() { 1359 + let f = RegexFlags::parse("gims").unwrap(); 1360 + assert!(f.global); 1361 + assert!(f.ignore_case); 1362 + assert!(f.multiline); 1363 + assert!(f.dot_all); 1364 + assert!(!f.unicode); 1365 + assert!(!f.sticky); 1366 + 1367 + assert!(RegexFlags::parse("gg").is_err()); 1368 + assert!(RegexFlags::parse("x").is_err()); 1369 + } 1370 + }
+329
crates/js/src/vm.rs
··· 697 697 pub boolean_prototype: Option<GcRef>, 698 698 /// Built-in Date.prototype (for Date constructor objects). 699 699 pub date_prototype: Option<GcRef>, 700 + /// Built-in RegExp.prototype (for RegExp constructor objects). 701 + pub regexp_prototype: Option<GcRef>, 700 702 } 701 703 702 704 /// Maximum register file size. ··· 719 721 number_prototype: None, 720 722 boolean_prototype: None, 721 723 date_prototype: None, 724 + regexp_prototype: None, 722 725 }; 723 726 crate::builtins::init_builtins(&mut vm); 724 727 vm ··· 4190 4193 match eval("JSON.stringify({})").unwrap() { 4191 4194 Value::String(s) => assert_eq!(s, "{}"), 4192 4195 v => panic!("expected '{{}}', got {v:?}"), 4196 + } 4197 + } 4198 + 4199 + // ── RegExp tests ──────────────────────────────────────── 4200 + 4201 + #[test] 4202 + fn test_regexp_constructor() { 4203 + match eval("var r = new RegExp('abc', 'g'); r.source").unwrap() { 4204 + Value::String(s) => assert_eq!(s, "abc"), 4205 + v => panic!("expected 'abc', got {v:?}"), 4206 + } 4207 + match eval("var r = new RegExp('abc', 'gi'); r.flags").unwrap() { 4208 + Value::String(s) => assert_eq!(s, "gi"), 4209 + v => panic!("expected 'gi', got {v:?}"), 4210 + } 4211 + match eval("var r = new RegExp('abc'); r.global").unwrap() { 4212 + Value::Boolean(b) => assert!(!b), 4213 + v => panic!("expected false, got {v:?}"), 4214 + } 4215 + match eval("var r = new RegExp('abc', 'g'); r.global").unwrap() { 4216 + Value::Boolean(b) => assert!(b), 4217 + v => panic!("expected true, got {v:?}"), 4218 + } 4219 + } 4220 + 4221 + #[test] 4222 + fn test_regexp_test() { 4223 + match eval("var r = new RegExp('abc'); r.test('xabcx')").unwrap() { 4224 + Value::Boolean(b) => assert!(b), 4225 + v => panic!("expected true, got {v:?}"), 4226 + } 4227 + match eval("var r = new RegExp('abc'); r.test('xyz')").unwrap() { 4228 + Value::Boolean(b) => assert!(!b), 4229 + v => panic!("expected false, got {v:?}"), 4230 + } 4231 + match eval("var r = new RegExp('\\\\d+'); 
r.test('abc123')").unwrap() { 4232 + Value::Boolean(b) => assert!(b), 4233 + v => panic!("expected true, got {v:?}"), 4234 + } 4235 + } 4236 + 4237 + #[test] 4238 + fn test_regexp_exec() { 4239 + match eval("var r = new RegExp('(a)(b)(c)'); var m = r.exec('abc'); m[0]").unwrap() { 4240 + Value::String(s) => assert_eq!(s, "abc"), 4241 + v => panic!("expected 'abc', got {v:?}"), 4242 + } 4243 + match eval("var r = new RegExp('(a)(b)(c)'); var m = r.exec('abc'); m[1]").unwrap() { 4244 + Value::String(s) => assert_eq!(s, "a"), 4245 + v => panic!("expected 'a', got {v:?}"), 4246 + } 4247 + match eval("var r = new RegExp('b+'); var m = r.exec('aabbc'); m[0]").unwrap() { 4248 + Value::String(s) => assert_eq!(s, "bb"), 4249 + v => panic!("expected 'bb', got {v:?}"), 4250 + } 4251 + match eval("var r = new RegExp('xyz'); r.exec('abc')").unwrap() { 4252 + Value::Null => {} 4253 + v => panic!("expected null, got {v:?}"), 4254 + } 4255 + } 4256 + 4257 + #[test] 4258 + fn test_regexp_exec_global() { 4259 + let src = "var r = new RegExp('a', 'g'); r.exec('aba')[0]"; 4260 + match eval(src).unwrap() { 4261 + Value::String(s) => assert_eq!(s, "a"), 4262 + v => panic!("expected 'a', got {v:?}"), 4263 + } 4264 + let src = r#" 4265 + var r = new RegExp('a', 'g'); 4266 + r.exec('aba'); 4267 + var m = r.exec('aba'); 4268 + m[0] + ',' + m.index 4269 + "#; 4270 + match eval(src).unwrap() { 4271 + Value::String(s) => assert_eq!(s, "a,2"), 4272 + v => panic!("expected 'a,2', got {v:?}"), 4273 + } 4274 + } 4275 + 4276 + #[test] 4277 + fn test_regexp_to_string() { 4278 + match eval("var r = new RegExp('abc', 'gi'); r.toString()").unwrap() { 4279 + Value::String(s) => assert_eq!(s, "/abc/gi"), 4280 + v => panic!("expected '/abc/gi', got {v:?}"), 4281 + } 4282 + match eval("/hello\\d+/.toString()").unwrap() { 4283 + Value::String(s) => assert_eq!(s, "/hello\\d+/"), 4284 + v => panic!("expected '/hello\\d+/', got {v:?}"), 4285 + } 4286 + } 4287 + 4288 + #[test] 4289 + fn test_regexp_literal() { 
4290 + match eval("/abc/.test('abc')").unwrap() { 4291 + Value::Boolean(b) => assert!(b), 4292 + v => panic!("expected true, got {v:?}"), 4293 + } 4294 + match eval("/abc/.test('xyz')").unwrap() { 4295 + Value::Boolean(b) => assert!(!b), 4296 + v => panic!("expected false, got {v:?}"), 4297 + } 4298 + match eval("/\\d+/.test('123')").unwrap() { 4299 + Value::Boolean(b) => assert!(b), 4300 + v => panic!("expected true, got {v:?}"), 4301 + } 4302 + match eval("/abc/i.test('ABC')").unwrap() { 4303 + Value::Boolean(b) => assert!(b), 4304 + v => panic!("expected true, got {v:?}"), 4305 + } 4306 + } 4307 + 4308 + #[test] 4309 + fn test_regexp_literal_exec() { 4310 + match eval("var m = /([a-z]+)(\\d+)/.exec('abc123'); m[0]").unwrap() { 4311 + Value::String(s) => assert_eq!(s, "abc123"), 4312 + v => panic!("expected 'abc123', got {v:?}"), 4313 + } 4314 + match eval("var m = /([a-z]+)(\\d+)/.exec('abc123'); m[1]").unwrap() { 4315 + Value::String(s) => assert_eq!(s, "abc"), 4316 + v => panic!("expected 'abc', got {v:?}"), 4317 + } 4318 + match eval("var m = /([a-z]+)(\\d+)/.exec('abc123'); m[2]").unwrap() { 4319 + Value::String(s) => assert_eq!(s, "123"), 4320 + v => panic!("expected '123', got {v:?}"), 4321 + } 4322 + } 4323 + 4324 + #[test] 4325 + fn test_string_match_regexp() { 4326 + match eval("'hello world'.match(/world/)[0]").unwrap() { 4327 + Value::String(s) => assert_eq!(s, "world"), 4328 + v => panic!("expected 'world', got {v:?}"), 4329 + } 4330 + match eval("'aaa'.match(/a/g).length").unwrap() { 4331 + Value::Number(n) => assert_eq!(n, 3.0), 4332 + v => panic!("expected 3, got {v:?}"), 4333 + } 4334 + match eval("'abc'.match(/xyz/)").unwrap() { 4335 + Value::Null => {} 4336 + v => panic!("expected null, got {v:?}"), 4337 + } 4338 + } 4339 + 4340 + #[test] 4341 + fn test_string_search_regexp() { 4342 + match eval("'hello world'.search(/world/)").unwrap() { 4343 + Value::Number(n) => assert_eq!(n, 6.0), 4344 + v => panic!("expected 6, got {v:?}"), 4345 + } 4346 + 
match eval("'abc'.search(/xyz/)").unwrap() { 4347 + Value::Number(n) => assert_eq!(n, -1.0), 4348 + v => panic!("expected -1, got {v:?}"), 4349 + } 4350 + match eval("'abc123'.search(/\\d/)").unwrap() { 4351 + Value::Number(n) => assert_eq!(n, 3.0), 4352 + v => panic!("expected 3, got {v:?}"), 4353 + } 4354 + } 4355 + 4356 + #[test] 4357 + fn test_string_replace_regexp() { 4358 + match eval("'hello world'.replace(/world/, 'rust')").unwrap() { 4359 + Value::String(s) => assert_eq!(s, "hello rust"), 4360 + v => panic!("expected 'hello rust', got {v:?}"), 4361 + } 4362 + match eval("'aaa'.replace(/a/, 'b')").unwrap() { 4363 + Value::String(s) => assert_eq!(s, "baa"), 4364 + v => panic!("expected 'baa', got {v:?}"), 4365 + } 4366 + match eval("'aaa'.replace(/a/g, 'b')").unwrap() { 4367 + Value::String(s) => assert_eq!(s, "bbb"), 4368 + v => panic!("expected 'bbb', got {v:?}"), 4369 + } 4370 + } 4371 + 4372 + #[test] 4373 + fn test_string_replace_capture_groups() { 4374 + let src = r#"'John Smith'.replace(/(\w+) (\w+)/, '$2, $1')"#; 4375 + match eval(src).unwrap() { 4376 + Value::String(s) => assert_eq!(s, "Smith, John"), 4377 + v => panic!("expected 'Smith, John', got {v:?}"), 4378 + } 4379 + match eval("'abc'.replace(/(b)/, '[$1]')").unwrap() { 4380 + Value::String(s) => assert_eq!(s, "a[b]c"), 4381 + v => panic!("expected 'a[b]c', got {v:?}"), 4382 + } 4383 + } 4384 + 4385 + #[test] 4386 + fn test_string_split_regexp() { 4387 + match eval("'a1b2c3'.split(/\\d/).length").unwrap() { 4388 + Value::Number(n) => assert_eq!(n, 4.0), 4389 + v => panic!("expected 4, got {v:?}"), 4390 + } 4391 + match eval("'a1b2c3'.split(/\\d/)[0]").unwrap() { 4392 + Value::String(s) => assert_eq!(s, "a"), 4393 + v => panic!("expected 'a', got {v:?}"), 4394 + } 4395 + } 4396 + 4397 + #[test] 4398 + fn test_regexp_ignore_case() { 4399 + match eval("/abc/i.exec('XAbCx')[0]").unwrap() { 4400 + Value::String(s) => assert_eq!(s, "AbC"), 4401 + v => panic!("expected 'AbC', got {v:?}"), 4402 + } 
4403 + } 4404 + 4405 + #[test] 4406 + fn test_regexp_multiline() { 4407 + match eval("/^b/m.test('a\\nb')").unwrap() { 4408 + Value::Boolean(b) => assert!(b), 4409 + v => panic!("expected true, got {v:?}"), 4410 + } 4411 + match eval("/^b/.test('a\\nb')").unwrap() { 4412 + Value::Boolean(b) => assert!(!b), 4413 + v => panic!("expected false, got {v:?}"), 4414 + } 4415 + } 4416 + 4417 + #[test] 4418 + fn test_regexp_dot_all() { 4419 + match eval("/a.b/s.test('a\\nb')").unwrap() { 4420 + Value::Boolean(b) => assert!(b), 4421 + v => panic!("expected true, got {v:?}"), 4422 + } 4423 + match eval("/a.b/.test('a\\nb')").unwrap() { 4424 + Value::Boolean(b) => assert!(!b), 4425 + v => panic!("expected false, got {v:?}"), 4426 + } 4427 + } 4428 + 4429 + #[test] 4430 + fn test_regexp_word_boundary() { 4431 + match eval("/\\bfoo\\b/.test('a foo b')").unwrap() { 4432 + Value::Boolean(b) => assert!(b), 4433 + v => panic!("expected true, got {v:?}"), 4434 + } 4435 + match eval("/\\bfoo\\b/.test('foobar')").unwrap() { 4436 + Value::Boolean(b) => assert!(!b), 4437 + v => panic!("expected false, got {v:?}"), 4438 + } 4439 + } 4440 + 4441 + #[test] 4442 + fn test_regexp_quantifiers_vm() { 4443 + match eval("/a{3}/.test('aaa')").unwrap() { 4444 + Value::Boolean(b) => assert!(b), 4445 + v => panic!("expected true, got {v:?}"), 4446 + } 4447 + match eval("/a{3}/.test('aa')").unwrap() { 4448 + Value::Boolean(b) => assert!(!b), 4449 + v => panic!("expected false, got {v:?}"), 4450 + } 4451 + match eval("/a+?/.exec('aaa')[0]").unwrap() { 4452 + Value::String(s) => assert_eq!(s, "a"), 4453 + v => panic!("expected 'a', got {v:?}"), 4454 + } 4455 + } 4456 + 4457 + #[test] 4458 + fn test_regexp_alternation_vm() { 4459 + match eval("/cat|dog/.exec('I have a dog')[0]").unwrap() { 4460 + Value::String(s) => assert_eq!(s, "dog"), 4461 + v => panic!("expected 'dog', got {v:?}"), 4462 + } 4463 + } 4464 + 4465 + #[test] 4466 + fn test_regexp_lookahead_vm() { 4467 + match 
eval("/a(?=b)/.test('ab')").unwrap() { 4468 + Value::Boolean(b) => assert!(b), 4469 + v => panic!("expected true, got {v:?}"), 4470 + } 4471 + match eval("/a(?=b)/.test('ac')").unwrap() { 4472 + Value::Boolean(b) => assert!(!b), 4473 + v => panic!("expected false, got {v:?}"), 4474 + } 4475 + match eval("/a(?!b)/.test('ac')").unwrap() { 4476 + Value::Boolean(b) => assert!(b), 4477 + v => panic!("expected true, got {v:?}"), 4478 + } 4479 + } 4480 + 4481 + #[test] 4482 + fn test_regexp_char_class_vm() { 4483 + match eval("/[abc]/.test('b')").unwrap() { 4484 + Value::Boolean(b) => assert!(b), 4485 + v => panic!("expected true, got {v:?}"), 4486 + } 4487 + match eval("/[a-z]+/.exec('Hello')[0]").unwrap() { 4488 + Value::String(s) => assert_eq!(s, "ello"), 4489 + v => panic!("expected 'ello', got {v:?}"), 4490 + } 4491 + } 4492 + 4493 + #[test] 4494 + fn test_regexp_backreference_vm() { 4495 + match eval("/(a)\\1/.test('aa')").unwrap() { 4496 + Value::Boolean(b) => assert!(b), 4497 + v => panic!("expected true, got {v:?}"), 4498 + } 4499 + match eval("/(a)\\1/.test('ab')").unwrap() { 4500 + Value::Boolean(b) => assert!(!b), 4501 + v => panic!("expected false, got {v:?}"), 4502 + } 4503 + } 4504 + 4505 + #[test] 4506 + fn test_regexp_properties() { 4507 + match eval("var r = /abc/gim; r.global").unwrap() { 4508 + Value::Boolean(b) => assert!(b), 4509 + v => panic!("expected true, got {v:?}"), 4510 + } 4511 + match eval("/abc/.lastIndex").unwrap() { 4512 + Value::Number(n) => assert_eq!(n, 0.0), 4513 + v => panic!("expected 0, got {v:?}"), 4514 + } 4515 + } 4516 + 4517 + #[test] 4518 + fn test_string_replace_all_regexp() { 4519 + match eval("'aba'.replaceAll(/a/g, 'x')").unwrap() { 4520 + Value::String(s) => assert_eq!(s, "xbx"), 4521 + v => panic!("expected 'xbx', got {v:?}"), 4193 4522 } 4194 4523 } 4195 4524 }