My working unpac space for OCaml projects in development
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Fix Unicode whitespace handling and test262 metadata parsing

- Add proper Unicode whitespace and line terminator detection using
UTF-8 decoding (U+2028 Line Separator, U+2029 Paragraph Separator,
U+FEFF BOM, U+1680 Ogham Space Mark, U+2000-U+200A spaces, etc.)
- Fix YAML metadata parser to correctly find /*--- ... ---*/ blocks
(it previously matched the first / in the file, which could belong to a // comment)
- Fix metadata parser to properly handle negative: blocks with
nested phase: and type: fields
- Add uutf dependency for Unicode handling
- Test262 expressions: 8525/11093 (76.9%) passing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

+182 -28
+1 -1
lib/quickjs/parser/dune
··· 1 1 (library 2 2 (name quickjs_parser) 3 3 (public_name ocaml-quickjs.parser) 4 - (libraries quickjs_core) 4 + (libraries quickjs_core uutf) 5 5 (flags (:standard -w -32-37-39-69)) ; Suppress various warnings during development 6 6 (preprocess no_preprocessing))
+107
lib/quickjs/parser/lexer.ml
··· 53 53 | '\n' | '\r' -> true 54 54 | _ -> false 55 55 56 + (* Unicode character classification using Uchar *) 57 + let is_unicode_whitespace uchar = 58 + let cp = Uchar.to_int uchar in 59 + match cp with 60 + | 0x0009 (* Tab *) 61 + | 0x000B (* Vertical Tab *) 62 + | 0x000C (* Form Feed *) 63 + | 0x0020 (* Space *) 64 + | 0x00A0 (* No-Break Space *) 65 + | 0xFEFF (* BOM *) 66 + | 0x1680 (* Ogham Space Mark *) 67 + | 0x2000 | 0x2001 | 0x2002 | 0x2003 | 0x2004 (* En Quad .. Three-Per-Em Space *) 68 + | 0x2005 | 0x2006 | 0x2007 | 0x2008 | 0x2009 | 0x200A (* Four-Per-Em Space .. Hair Space *) 69 + | 0x202F (* Narrow No-Break Space *) 70 + | 0x205F (* Medium Mathematical Space *) 71 + | 0x3000 (* Ideographic Space *) 72 + -> true 73 + | _ -> false 74 + 75 + let is_unicode_line_terminator uchar = 76 + let cp = Uchar.to_int uchar in 77 + match cp with 78 + | 0x000A (* Line Feed *) 79 + | 0x000D (* Carriage Return *) 80 + | 0x2028 (* Line Separator *) 81 + | 0x2029 (* Paragraph Separator *) 82 + -> true 83 + | _ -> false 84 + 85 + (* Decode a single UTF-8 character at cursor position *) 86 + let check_unicode_at cursor = 87 + match Source.cursor_peek_n cursor 4 with 88 + | None -> None 89 + | Some s -> 90 + let len = String.length s in 91 + if len = 0 then None 92 + else 93 + let b0 = Char.code s.[0] in 94 + if b0 < 0x80 then 95 + (* ASCII *) 96 + Some (Uchar.of_int b0, 1) 97 + else if b0 < 0xC0 then 98 + (* Invalid UTF-8 start byte *) 99 + None 100 + else if b0 < 0xE0 then 101 + (* 2-byte sequence *) 102 + if len >= 2 then 103 + let b1 = Char.code s.[1] in 104 + let cp = ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F) in 105 + Some (Uchar.of_int cp, 2) 106 + else None 107 + else if b0 < 0xF0 then 108 + (* 3-byte sequence *) 109 + if len >= 3 then 110 + let b1 = Char.code s.[1] in 111 + let b2 = Char.code s.[2] in 112 + let cp = ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F) in 113 + Some (Uchar.of_int cp, 3) 114 + else None 115 + else if b0 < 0xF8 then 
116 + (* 4-byte sequence *) 117 + if len >= 4 then 118 + let b1 = Char.code s.[1] in 119 + let b2 = Char.code s.[2] in 120 + let b3 = Char.code s.[3] in 121 + let cp = ((b0 land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) 122 + lor ((b2 land 0x3F) lsl 6) lor (b3 land 0x3F) in 123 + Some (Uchar.of_int cp, 4) 124 + else None 125 + else 126 + None 127 + 56 128 let is_digit = function 57 129 | '0'..'9' -> true 58 130 | _ -> false ··· 112 184 Source.cursor_advance_n cursor 2; 113 185 skip_block_comment lexer 114 186 | _ -> ()) 187 + | Some c when Char.code c >= 0x80 -> 188 + (* Check for multi-byte Unicode whitespace/line terminators *) 189 + (match check_unicode_at cursor with 190 + | Some (uchar, byte_len) -> 191 + if is_unicode_line_terminator uchar then begin 192 + Source.cursor_advance_n cursor byte_len; 193 + lexer.newline_before <- true; 194 + skip_whitespace_and_comments lexer 195 + end else if is_unicode_whitespace uchar then begin 196 + Source.cursor_advance_n cursor byte_len; 197 + skip_whitespace_and_comments lexer 198 + end 199 + (* else: not whitespace, stop here *) 200 + | None -> ()) 115 201 | _ -> () 116 202 117 203 and skip_line_comment lexer = ··· 120 206 match Source.cursor_peek cursor with 121 207 | None -> () 122 208 | Some c when is_line_terminator c -> () (* Don't consume the newline *) 209 + | Some c when Char.code c >= 0x80 -> 210 + (* Check for Unicode line terminators *) 211 + (match check_unicode_at cursor with 212 + | Some (uchar, _) when is_unicode_line_terminator uchar -> () 213 + | Some (_, byte_len) -> 214 + Source.cursor_advance_n cursor byte_len; 215 + loop () 216 + | None -> 217 + Source.cursor_advance cursor; 218 + loop ()) 123 219 | Some _ -> 124 220 Source.cursor_advance cursor; 125 221 loop () ··· 143 239 lexer.newline_before <- true; 144 240 Source.cursor_advance cursor; 145 241 loop () 242 + | Some c when Char.code c >= 0x80 -> 243 + (* Check for Unicode characters *) 244 + (match check_unicode_at cursor with 245 + | Some (uchar, 
byte_len) -> 246 + if is_unicode_line_terminator uchar then 247 + lexer.newline_before <- true; 248 + Source.cursor_advance_n cursor byte_len; 249 + loop () 250 + | None -> 251 + Source.cursor_advance cursor; 252 + loop ()) 146 253 | Some _ -> 147 254 Source.cursor_advance cursor; 148 255 loop ()
+74 -27
test/runner/test262_runner.ml
··· 61 61 locale = []; 62 62 } in 63 63 (* Find /*--- ... ---*/ block *) 64 - match String.index_opt content '/' with 65 - | None -> default 66 - | Some start -> 67 - if start + 4 < String.length content && 68 - String.sub content start 4 = "/*--" then 69 - let end_marker = "---*/" in 70 - match Str.search_forward (Str.regexp_string end_marker) content (start + 4) with 71 - | exception Not_found -> default 72 - | end_pos -> 73 - let yaml_content = String.sub content (start + 5) (end_pos - start - 5) in 74 - (* Simple YAML-like parsing - just extract key fields *) 64 + let start_marker = "/*---" in 65 + let end_marker = "---*/" in 66 + match Str.search_forward (Str.regexp_string start_marker) content 0 with 67 + | exception Not_found -> default 68 + | start -> 69 + match Str.search_forward (Str.regexp_string end_marker) content (start + 5) with 70 + | exception Not_found -> default 71 + | end_pos -> 72 + let yaml_content = String.sub content (start + 5) (end_pos - start - 5) in 73 + (* Simple YAML-like parsing *) 75 74 let lines = String.split_on_char '\n' yaml_content in 75 + 76 + (* Parse a list of values that follow a key (indented lines starting with -) *) 77 + let rec parse_list acc = function 78 + | [] -> (List.rev acc, []) 79 + | line :: rest -> 80 + let trimmed = String.trim line in 81 + if String.length trimmed > 0 && trimmed.[0] = '-' then 82 + let item = String.trim (String.sub trimmed 1 (String.length trimmed - 1)) in 83 + parse_list (item :: acc) rest 84 + else if String.length line > 0 && line.[0] = ' ' then 85 + (* Indented non-list line, skip *) 86 + parse_list acc rest 87 + else 88 + (List.rev acc, line :: rest) 89 + in 90 + 91 + (* Parse nested block like negative: *) 92 + let rec parse_block acc = function 93 + | [] -> (acc, []) 94 + | line :: rest -> 95 + if String.length line > 0 && line.[0] = ' ' then 96 + let trimmed = String.trim line in 97 + (match String.index_opt trimmed ':' with 98 + | Some colon -> 99 + let key = String.trim (String.sub 
trimmed 0 colon) in 100 + let value = String.trim (String.sub trimmed (colon + 1) (String.length trimmed - colon - 1)) in 101 + parse_block ((key, value) :: acc) rest 102 + | None -> parse_block acc rest) 103 + else 104 + (acc, line :: rest) 105 + in 106 + 76 107 let rec parse_lines acc = function 77 108 | [] -> acc 78 109 | line :: rest -> 79 - let line = String.trim line in 80 - if String.length line > 0 && line.[0] <> '-' then 81 - match String.index_opt line ':' with 110 + let trimmed = String.trim line in 111 + if String.length trimmed = 0 then 112 + parse_lines acc rest 113 + else 114 + match String.index_opt trimmed ':' with 82 115 | None -> parse_lines acc rest 83 116 | Some colon -> 84 - let key = String.trim (String.sub line 0 colon) in 85 - let value = String.trim (String.sub line (colon + 1) (String.length line - colon - 1)) in 86 - let acc = match key with 87 - | "description" -> { acc with description = value } 88 - | "es5id" -> { acc with es5id = Some value } 89 - | "es6id" -> { acc with es6id = Some value } 90 - | "esid" -> { acc with esid = Some value } 91 - | _ -> acc 92 - in 93 - parse_lines acc rest 94 - else 95 - parse_lines acc rest 117 + let key = String.trim (String.sub trimmed 0 colon) in 118 + let value = String.trim (String.sub trimmed (colon + 1) (String.length trimmed - colon - 1)) in 119 + match key with 120 + | "description" -> 121 + parse_lines { acc with description = value } rest 122 + | "es5id" -> 123 + parse_lines { acc with es5id = Some value } rest 124 + | "es6id" -> 125 + parse_lines { acc with es6id = Some value } rest 126 + | "esid" -> 127 + parse_lines { acc with esid = Some value } rest 128 + | "negative" -> 129 + let (fields, rest') = parse_block [] rest in 130 + let phase = List.assoc_opt "phase" fields |> Option.value ~default:"" in 131 + let type_ = List.assoc_opt "type" fields |> Option.value ~default:"" in 132 + parse_lines { acc with negative = Some { phase; type_ } } rest' 133 + | "features" -> 134 + let (items, 
rest') = parse_list [] rest in 135 + parse_lines { acc with features = items } rest' 136 + | "includes" -> 137 + let (items, rest') = parse_list [] rest in 138 + parse_lines { acc with includes = items } rest' 139 + | "flags" -> 140 + let (items, rest') = parse_list [] rest in 141 + parse_lines { acc with flags = items } rest' 142 + | _ -> 143 + parse_lines acc rest 96 144 in 97 145 parse_lines default lines 98 - else default 99 146 100 147 (* Check if test should be skipped *) 101 148 let should_skip (config : config) (filename : string) (metadata : test_metadata) : string option =