Merge pull request #12664 from Octachron/at-least-latin-9-in-utf8

+8

.depend

··· 1013 1013 typing/oprint.cmo : \ 1014 1014 parsing/pprintast.cmi \ 1015 1015 typing/outcometree.cmi \ 1016 + utils/misc.cmi \ 1016 1017 parsing/lexer.cmi \ 1017 1018 utils/format_doc.cmi \ 1018 1019 parsing/asttypes.cmi \ ··· 1020 1021 typing/oprint.cmx : \ 1021 1022 parsing/pprintast.cmx \ 1022 1023 typing/outcometree.cmi \ 1024 + utils/misc.cmx \ 1023 1025 parsing/lexer.cmx \ 1024 1026 utils/format_doc.cmx \ 1025 1027 parsing/asttypes.cmx \ ··· 8190 8192 debugger/debugger_config.cmi 8191 8193 debugger/debugger_config.cmi : 8192 8194 debugger/debugger_lexer.cmo : \ 8195 + utils/misc.cmi \ 8193 8196 debugger/debugger_parser.cmi \ 8194 8197 debugger/debugger_lexer.cmi 8195 8198 debugger/debugger_lexer.cmx : \ 8199 + utils/misc.cmx \ 8196 8200 debugger/debugger_parser.cmx \ 8197 8201 debugger/debugger_lexer.cmi 8198 8202 debugger/debugger_lexer.cmi : \ ··· 9231 9235 ocamldoc/odoc_messages.cmi \ 9232 9236 ocamldoc/odoc_global.cmi \ 9233 9237 ocamldoc/odoc_comments_global.cmi \ 9238 + utils/misc.cmi \ 9234 9239 ocamldoc/odoc_lexer.cmi 9235 9240 ocamldoc/odoc_lexer.cmx : \ 9236 9241 otherlibs/str/str.cmx \ ··· 9238 9243 ocamldoc/odoc_messages.cmx \ 9239 9244 ocamldoc/odoc_global.cmx \ 9240 9245 ocamldoc/odoc_comments_global.cmx \ 9246 + utils/misc.cmx \ 9241 9247 ocamldoc/odoc_lexer.cmi 9242 9248 ocamldoc/odoc_lexer.cmi : \ 9243 9249 ocamldoc/odoc_parser.cmi ··· 9380 9386 typing/ident.cmi 9381 9387 ocamldoc/odoc_ocamlhtml.cmo : \ 9382 9388 otherlibs/str/str.cmi \ 9389 + utils/misc.cmi \ 9383 9390 ocamldoc/odoc_ocamlhtml.cmi 9384 9391 ocamldoc/odoc_ocamlhtml.cmx : \ 9385 9392 otherlibs/str/str.cmx \ 9393 + utils/misc.cmx \ 9386 9394 ocamldoc/odoc_ocamlhtml.cmi 9387 9395 ocamldoc/odoc_ocamlhtml.cmi : 9388 9396 ocamldoc/odoc_parameter.cmo : \

+9 -1

.gitattributes

··· 17 17 * text=auto 18 18 19 19 # It is not possible to wrap lines lines in .gitattributes files 20 - .gitattributes typo.long-line=may 20 + .gitattributes typo.long-line=may typo.utf8 21 21 22 22 # Binary files 23 23 /boot/ocamlc binary ··· 100 100 otherlibs/unix/stat_win32.c typo.long-line 101 101 otherlibs/unix/symlink_win32.c typo.long-line 102 102 103 + # Some Unicode characters here and there 104 + utils/misc.ml typo.utf8 103 105 runtime/sak.c typo.non-ascii 104 106 105 107 stdlib/hashbang typo.white-at-eol typo.missing-lf ··· 108 110 testsuite/tests/lib-bigarray-2/bigarrf.f typo.tab linguist-language=Fortran 109 111 testsuite/tests/lib-unix/win-stat/fakeclock.c typo.missing-header=false 110 112 testsuite/tests/misc-unsafe/almabench.ml typo.long-line 113 + testsuite/tests/parsing/latin9.ml typo.utf8 typo.very-long-line 114 + testsuite/tests/tool-ocamldoc/Latin9.ml typo.utf8 115 + testsuite/tests/parsetree/source.ml typo.utf8 116 + testsuite/tests/typing-unicode/*.ml typo.utf8 111 117 testsuite/tests/tool-toplevel/strings.ml typo.utf8 112 118 testsuite/tests/win-unicode/*.ml typo.utf8 119 + testsuite/tests/unicode/見.ml typo.utf8 120 + testsuite/tests/lexing/reject_bad_encoding.ml typo.prune 113 121 testsuite/tests/asmgen/immediates.cmm typo.very-long-line 114 122 testsuite/tests/generated-parse-errors/errors.* typo.very-long-line 115 123 testsuite/tools/*.S typo.missing-header

+5

Changes

··· 535 535 syntax for types. 536 536 (Chet Murthy, review by Gabriel Scherer) 537 537 538 + - #11736, #12664: Support utf-8 encoded source files and latin-9 compatible 539 + identifiers. 540 + (Xavier Leroy and Florian Angeletti, review by Daniel Bünzli and 541 + Jules Aguillon) 542 + 538 543 ### Type system: 539 544 540 545 - #12313, #11799: Do not re-build as-pattern type when a ground type annotation

+29 -6

debugger/debugger_lexer.mll

··· 18 18 19 19 open Debugger_parser 20 20 21 + 22 + let ident_for_extended raw_name = 23 + match Misc.Utf8_lexeme.normalize raw_name with 24 + | Error _ -> raise Parsing.Parse_error 25 + | Ok name -> 26 + match Misc.Utf8_lexeme.validate_identifier name with 27 + | Misc.Utf8_lexeme.Valid -> name 28 + | Misc.Utf8_lexeme.Invalid_character _ 29 + | Misc.Utf8_lexeme.Invalid_beginning _ -> 30 + raise Parsing.Parse_error 31 + 21 32 exception Int_overflow 22 33 23 34 } 24 35 36 + let lowercase = ['a'-'z' '_'] 37 + let uppercase = ['A'-'Z'] 38 + let identstart = lowercase | uppercase 39 + let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9'] 40 + let utf8 = ['\192'-'\255'] ['\128'-'\191']* 41 + let identstart_ext = identstart | utf8 42 + let identchar_ext = identchar | utf8 43 + let ident_ext = identstart_ext identchar_ext* 44 + 25 45 rule line = (* Read a whole line *) 26 46 parse 27 47 ([ ^ '\n' '\r' ]* as s) ('\n' | '\r' | "\r\n") ··· 53 73 parse 54 74 [' ' '\t'] + 55 75 { lexeme lexbuf } 56 - | ['a'-'z' '\223'-'\246' '\248'-'\255' '_'] 57 - (['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' 58 - '\'' '0'-'9' ]) * 76 + | lowercase identchar* 59 77 { LIDENT(Lexing.lexeme lexbuf) } 60 - | ['A'-'Z' '\192'-'\214' '\216'-'\222' ] 61 - (['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' 62 - '\'' '0'-'9' ]) * 78 + | uppercase identchar* 63 79 { UIDENT(Lexing.lexeme lexbuf) } 80 + | ident_ext as raw_name 81 + { 82 + let name = ident_for_extended raw_name in 83 + if Misc.Utf8_lexeme.is_capitalized name 84 + then UIDENT name 85 + else LIDENT name 86 + } 64 87 | '"' [^ '"']* "\"" 65 88 { let s = Lexing.lexeme lexbuf in 66 89 LIDENT(String.sub s 1 (String.length s - 2)) }

+2 -2

driver/makedepend.ml

··· 427 427 428 428 (* Init Hashtbl with all defined modules *) 429 429 let files = List.map (fun (file, file_kind, deps, pp_deps) -> 430 - let modname = Unit_info.modname_from_source file in 430 + let modname = Unit_info.lax_modname_from_source file in 431 431 let key = (modname, file_kind) in 432 432 let new_deps = ref [] in 433 433 Hashtbl.add h key (file, new_deps); ··· 526 526 ~mli_file:process_mli_map 527 527 in 528 528 Clflags.transparent_modules := old_transp; 529 - let modname = Unit_info.modname_from_source fname in 529 + let modname = Unit_info.lax_modname_from_source fname in 530 530 if String.Map.is_empty m then 531 531 report_err (Failure (fname ^ " : empty map file or parse error")); 532 532 let mm = Depend.make_node m in

+11 -2

lex/lexer.mll

··· 125 125 let extattrident = ident ('.' ident)* 126 126 let blank = [' ' '\009' '\012'] 127 127 128 + let uppercase = ['A'-'Z'] 129 + let ocaml_identstart = lowercase | uppercase 130 + let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9'] 131 + let utf8 = ['\192'-'\255'] ['\128'-'\191']* 132 + let identstart_ext = ocaml_identstart | utf8 133 + let identchar_ext = identchar | utf8 134 + let ocaml_ident = identstart_ext identchar_ext* 135 + 136 + 128 137 rule main = parse 129 138 [' ' '\013' '\009' '\012' ] + 130 139 { main lexbuf } ··· 297 306 | '\010' 298 307 { incr_loc lexbuf 0; 299 308 comment depth lexbuf } 300 - | ident 309 + | ocaml_ident 301 310 { comment depth lexbuf } 302 311 | _ 303 312 { comment depth lexbuf } ··· 333 342 | '\010' 334 343 { incr_loc lexbuf 0; 335 344 action stk lexbuf } 336 - | ident 345 + | ocaml_ident 337 346 { action stk lexbuf } 338 347 | _ 339 348 { action stk lexbuf }

+1 -1

ocamldoc/odoc_analyse.ml

··· 207 207 | Odoc_global.Text_file file -> 208 208 Location.input_name := file; 209 209 try 210 - let mod_name = Unit_info.modname_from_source file in 210 + let mod_name = Unit_info.lax_modname_from_source file in 211 211 let txt = 212 212 try Odoc_text.Texter.text_of_string (Odoc_misc.input_file_as_string file) 213 213 with Odoc_text.Text_syntax (l, c, s) ->

+1 -1

ocamldoc/odoc_ast.ml

··· 1846 1846 let (tree_structure, _) = typedtree in 1847 1847 prepare_file source_file input_file; 1848 1848 (* We create the t_module for this file. *) 1849 - let mod_name = Unit_info.modname_from_source source_file in 1849 + let mod_name = Unit_info.lax_modname_from_source source_file in 1850 1850 let len, info_opt = Sig.preamble !file_name !file 1851 1851 (fun x -> x.Parsetree.pstr_loc) parsetree in 1852 1852 let info_opt = analyze_toplevel_alerts info_opt parsetree in

+1 -1

ocamldoc/odoc_html.ml

··· 29 29 let index_only = ref false 30 30 let colorize_code = ref false 31 31 let html_short_functors = ref false 32 - let charset = ref "iso-8859-1" 32 + let charset = ref "UTF-8" 33 33 let show_navbar = ref true 34 34 35 35

+38 -7

ocamldoc/odoc_lexer.mll

··· 87 87 (** Remove first blank characters of each line of a string, until the first '*' *) 88 88 let remove_stars s = 89 89 Str.global_replace (Str.regexp ("^"^blank^"*\\*")) "" s 90 + 91 + let validate_encoding raw_name = 92 + match Misc.Utf8_lexeme.normalize raw_name with 93 + | Error s -> failwith (Format.asprintf "Invalid encoding %s" s) 94 + | Ok name -> name 95 + 96 + let validate_ident raw_name = 97 + let name = validate_encoding raw_name in 98 + match Misc.Utf8_lexeme.validate_identifier name with 99 + | Misc.Utf8_lexeme.Valid -> name 100 + | Misc.Utf8_lexeme.Invalid_character u -> 101 + failwith (Format.asprintf "Invalid character U+%X" (Uchar.to_int u)) 102 + | Misc.Utf8_lexeme.Invalid_beginning u -> 103 + failwith (Format.asprintf "Invalid first character U+%X" (Uchar.to_int u)) 104 + 105 + let validate_exception_uident raw_name = 106 + let name = validate_ident raw_name in 107 + if Misc.Utf8_lexeme.is_capitalized name then name else 108 + failwith (Format.asprintf "Invalid exception name: %s" name) 90 109 } 91 110 92 111 let blank = [ ' ' '\013' '\009' '\012'] 93 112 let nl_blank = blank | '\010' 94 113 let notblank = [^ ' ' '\010' '\013' '\009' '\012'] 95 - let lowercase = ['a'-'z' '\223'-'\246' '\248'-'\255' '_'] 96 - let uppercase = ['A'-'Z' '\192'-'\214' '\216'-'\222'] 97 - let identchar = 98 - ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9'] 99 114 100 - let modident = uppercase identchar* ('.' uppercase identchar* )* 115 + let lowercase = ['a'-'z' '_'] 116 + let uppercase = ['A'-'Z'] 117 + let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9'] 118 + let utf8 = ['\192'-'\255'] ['\128'-'\191']* 119 + let identchar_ext = identchar | utf8 120 + let identstart_ext = lowercase | uppercase | utf8 121 + let ident_ext = identstart_ext identchar_ext* 101 122 102 123 rule main = parse 103 124 [' ' '\013' '\009' '\012'] + ··· 301 322 } 302 323 303 324 | "@param" nl_blank+ (identchar+ as id) nl_blank+ { T_PARAM id } 325 + | "@param" nl_blank+ (identchar_ext+ as raw_id) nl_blank+ { 326 + let id = validate_ident raw_id in 327 + T_PARAM id 328 + } 304 329 | "@param" { failwith "usage: @param id description"} 305 - | "@before" nl_blank+ (notblank+ as v) nl_blank+ { T_BEFORE v } 330 + | "@before" nl_blank+ (notblank+ as v) nl_blank+ { 331 + let v = validate_encoding v in 332 + T_BEFORE v } 306 333 | "@before" { failwith "usage: @before version description"} 307 - | "@raise" nl_blank+ (modident as id) nl_blank+ { T_RAISES id } 334 + | "@raise" nl_blank+ (ident_ext ('.' ident_ext)* as exn_path) nl_blank+ 335 + { let raw_path = String.split_on_char '.' exn_path in 336 + let path = List.map validate_exception_uident raw_path in 337 + let id = String.concat "." path in 338 + T_RAISES id } 308 339 | "@raise" { failwith "usage: @raise Exception description"} 309 340 | "@"lowercase+ 310 341 {

+30 -15

ocamldoc/odoc_ocamlhtml.mll

··· 204 204 (** To store the position of the beginning of a string and comment *) 205 205 let string_start_pos = ref 0 206 206 let comment_start_pos = ref [] 207 + 208 + (** Normalizing utf-8 *) 209 + let normalize raw_name = 210 + (* we are printing documentation, it is too late to be strict *) 211 + match Misc.Utf8_lexeme.normalize raw_name with 212 + | Error s -> s 213 + | Ok name -> name 214 + 207 215 } 208 216 209 217 let blank = [' ' '\010' '\013' '\009' '\012'] 210 - let lowercase = ['a'-'z' '\223'-'\246' '\248'-'\255' '_'] 211 - let uppercase = ['A'-'Z' '\192'-'\214' '\216'-'\222'] 212 - let identchar = 213 - ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9'] 218 + 219 + let lowercase = ['a'-'z' '_'] 220 + let uppercase = ['A'-'Z'] 221 + let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9'] 222 + let utf8 = ['\192'-'\255'] ['\128'-'\191']* 223 + let identstart_ext = uppercase | lowercase | utf8 224 + let identchar_ext = identchar | utf8 225 + let ident_ext = identstart_ext identchar_ext* 226 + 227 + 214 228 let symbolchar = 215 229 ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] 216 230 let decimal_literal = ['0'-'9']+ ··· 237 251 | "_" 238 252 { print "_" ; token lexbuf } 239 253 | "~" { print "~" ; token lexbuf } 240 - | "~" lowercase identchar * ':' 254 + | "~" (ident_ext as raw_id ) ':' 241 255 { let s = Lexing.lexeme lexbuf in 242 - let name = String.sub s 1 (String.length s - 2) in 256 + let name = normalize raw_id in 243 257 if Hashtbl.mem keyword_table name then 244 258 raise (Error(Keyword_as_label name, Lexing.lexeme_start lexbuf, 245 259 Lexing.lexeme_end lexbuf)); 246 260 print s ; token lexbuf } 247 261 | "?" { print "?" ; token lexbuf } 248 - | "?" lowercase identchar * ':' 249 - { let s = Lexing.lexeme lexbuf in 250 - let name = String.sub s 1 (String.length s - 2) in 262 + | "?" (ident_ext as raw_id) ':' 263 + { 264 + let name = normalize raw_id in 251 265 if Hashtbl.mem keyword_table name then 252 266 raise (Error(Keyword_as_label name, Lexing.lexeme_start lexbuf, 253 267 Lexing.lexeme_end lexbuf)); 254 - print s ; token lexbuf } 255 - | lowercase identchar * 256 - { let s = Lexing.lexeme lexbuf in 257 - try 268 + print "?"; print name ; print ":"; token lexbuf } 269 + | (ident_ext as raw_id) 270 + { let s = normalize raw_id in 271 + if Misc.Utf8_lexeme.is_capitalized s then 272 + (print_class constructor_class (Lexing.lexeme lexbuf); 273 + token lexbuf) 274 + else try 258 275 let cl = Hashtbl.find keyword_table s in 259 276 (print_class cl s ; token lexbuf ) 260 277 with Not_found -> 261 278 (print s ; token lexbuf )} 262 - | uppercase identchar * 263 - { print_class constructor_class (Lexing.lexeme lexbuf) ; token lexbuf } (* No capitalized keywords *) 264 279 | decimal_literal | hex_literal | oct_literal | bin_literal 265 280 { print (Lexing.lexeme lexbuf) ; token lexbuf } 266 281 | float_literal

+1 -1

ocamldoc/odoc_sig.ml

··· 1887 1887 (ast : Parsetree.signature) (signat : Types.signature) = 1888 1888 prepare_file source_file input_file; 1889 1889 (* We create the t_module for this file. *) 1890 - let mod_name = Unit_info.modname_from_source source_file in 1890 + let mod_name = Unit_info.lax_modname_from_source source_file in 1891 1891 let len, info_opt = preamble !file_name !file 1892 1892 (fun x -> x.Parsetree.psig_loc) ast in 1893 1893 let info_opt = analyze_toplevel_alerts info_opt ast in

+5

parsing/lexer.mli

+108 -41

parsing/lexer.mll

··· 29 29 | Unterminated_string_in_comment of Location.t * Location.t 30 30 | Empty_character_literal 31 31 | Keyword_as_label of string 32 + | Capitalized_label of string 32 33 | Invalid_literal of string 33 34 | Invalid_directive of string * string option 35 + | Invalid_encoding of string 36 + | Invalid_char_in_ident of Uchar.t 37 + | Non_lowercase_delimiter of string 38 + | Capitalized_raw_identifier of string 34 39 35 40 exception Error of error * Location.t 36 41 ··· 256 261 illegal_escape lexbuf 257 262 (Printf.sprintf "%X is not a Unicode scalar value" cp) 258 263 264 + let validate_encoding lexbuf raw_name = 265 + match Utf8_lexeme.normalize raw_name with 266 + | Error _ -> error lexbuf (Invalid_encoding raw_name) 267 + | Ok name -> name 268 + 269 + let ident_for_extended lexbuf raw_name = 270 + let name = validate_encoding lexbuf raw_name in 271 + match Utf8_lexeme.validate_identifier name with 272 + | Utf8_lexeme.Valid -> name 273 + | Utf8_lexeme.Invalid_character u -> error lexbuf (Invalid_char_in_ident u) 274 + | Utf8_lexeme.Invalid_beginning _ -> 275 + assert false (* excluded by the regexps *) 276 + 277 + let validate_delim lexbuf raw_name = 278 + let name = validate_encoding lexbuf raw_name in 279 + if Utf8_lexeme.is_lowercase name then name 280 + else error lexbuf (Non_lowercase_delimiter name) 281 + 282 + let validate_ext lexbuf name = 283 + let name = validate_encoding lexbuf name in 284 + match Utf8_lexeme.validate_identifier ~with_dot:true name with 285 + | Utf8_lexeme.Valid -> name 286 + | Utf8_lexeme.Invalid_character u -> error lexbuf (Invalid_char_in_ident u) 287 + | Utf8_lexeme.Invalid_beginning _ -> 288 + assert false (* excluded by the regexps *) 289 + 290 + let lax_delim raw_name = 291 + match Utf8_lexeme.normalize raw_name with 292 + | Error _ -> None 293 + | Ok name -> 294 + if Utf8_lexeme.is_lowercase name then Some name 295 + else None 296 + 259 297 let is_keyword name = Hashtbl.mem keyword_table name 260 298 261 - let check_label_name lexbuf name = 262 - if is_keyword name then error lexbuf (Keyword_as_label name) 299 + let check_label_name ?(raw_escape=false) lexbuf name = 300 + if Utf8_lexeme.is_capitalized name then 301 + error lexbuf (Capitalized_label name); 302 + if not raw_escape && is_keyword name then 303 + error lexbuf (Keyword_as_label name) 263 304 264 305 (* Update the current location with file name and line number. *) 265 306 ··· 278 319 let preprocessor = ref None 279 320 280 321 let escaped_newlines = ref false 281 - 282 - (* Warn about Latin-1 characters used in idents *) 283 - 284 - let warn_latin1 lexbuf = 285 - Location.deprecated 286 - (Location.curr lexbuf) 287 - "ISO-Latin1 characters in identifiers" 288 322 289 323 let handle_docstrings = ref true 290 324 let comment_list = ref [] ··· 336 370 | Keyword_as_label kwd -> 337 371 Location.errorf ~loc 338 372 "%a is a keyword, it cannot be used as label name" Style.inline_code kwd 373 + | Capitalized_label lbl -> 374 + Location.errorf ~loc 375 + "%a cannot be used as label name, \ 376 + it must start with a lowercase letter" Style.inline_code lbl 339 377 | Invalid_literal s -> 340 378 Location.errorf ~loc "Invalid literal %s" s 341 379 | Invalid_directive (dir, explanation) -> ··· 343 381 (fun ppf -> match explanation with 344 382 | None -> () 345 383 | Some expl -> fprintf ppf ": %s" expl) 384 + | Invalid_encoding s -> 385 + Location.errorf ~loc "Invalid encoding of identifier %s." s 386 + | Invalid_char_in_ident u -> 387 + Location.errorf ~loc "Invalid character U+%X in identifier" 388 + (Uchar.to_int u) 389 + | Capitalized_raw_identifier lbl -> 390 + Location.errorf ~loc 391 + "%a cannot be used as a raw identifier, \ 392 + it must start with a lowercase letter" Style.inline_code lbl 393 + | Non_lowercase_delimiter name -> 394 + Location.errorf ~loc 395 + "%a cannot be used as a quoted string delimiter,@ \ 396 + it must contain only lowercase letters." 397 + Style.inline_code name 346 398 347 399 let () = 348 400 Location.register_error_of_exn ··· 359 411 let blank = [' ' '\009' '\012'] 360 412 let lowercase = ['a'-'z' '_'] 361 413 let uppercase = ['A'-'Z'] 414 + let identstart = lowercase | uppercase 362 415 let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9'] 363 - let lowercase_latin1 = ['a'-'z' '\223'-'\246' '\248'-'\255' '_'] 364 - let uppercase_latin1 = ['A'-'Z' '\192'-'\214' '\216'-'\222'] 365 - let identchar_latin1 = 366 - ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9'] 367 - (* This should be kept in sync with the [is_identchar] function in [env.ml] *) 416 + let utf8 = ['\192'-'\255'] ['\128'-'\191']* 417 + let identstart_ext = identstart | utf8 418 + let identchar_ext = identchar | utf8 368 419 369 420 let symbolchar = 370 421 ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] ··· 376 427 ['$' '&' '*' '+' '-' '/' '<' '=' '>' '@' '^' '|'] 377 428 378 429 let ident = (lowercase | uppercase) identchar* 379 - let extattrident = ident ('.' ident)* 430 + let ident_ext = identstart_ext identchar_ext* 431 + let extattrident = ident_ext ('.' ident_ext)* 380 432 381 433 let decimal_literal = 382 434 ['0'-'9'] ['0'-'9' '_']* ··· 419 471 | ".~" 420 472 { error lexbuf 421 473 (Reserved_sequence (".~", Some "is reserved for use in MetaOCaml")) } 422 - | "~" raw_ident_escape (lowercase identchar * as name) ':' 423 - { LABEL name } 424 - | "~" (lowercase identchar * as name) ':' 474 + | "~" (identstart identchar * as name) ':' 425 475 { check_label_name lexbuf name; 426 476 LABEL name } 427 - | "~" (lowercase_latin1 identchar_latin1 * as name) ':' 428 - { warn_latin1 lexbuf; 477 + | "~" (raw_ident_escape? as escape) (ident_ext as raw_name) ':' 478 + { let name = ident_for_extended lexbuf raw_name in 479 + check_label_name ~raw_escape:(escape<>"") lexbuf name; 429 480 LABEL name } 430 481 | "?" 431 482 { QUESTION } 432 - | "?" raw_ident_escape (lowercase identchar * as name) ':' 433 - { OPTLABEL name } 434 483 | "?" (lowercase identchar * as name) ':' 435 484 { check_label_name lexbuf name; 436 485 OPTLABEL name } 437 - | "?" (lowercase_latin1 identchar_latin1 * as name) ':' 438 - { warn_latin1 lexbuf; 439 - OPTLABEL name } 440 - | raw_ident_escape (lowercase identchar * as name) 441 - { LIDENT name } 486 + | "?" (raw_ident_escape? as escape) (ident_ext as raw_name) ':' 487 + { let name = ident_for_extended lexbuf raw_name in 488 + check_label_name ~raw_escape:(escape<>"") lexbuf name; 489 + OPTLABEL name 490 + } 442 491 | lowercase identchar * as name 443 492 { try Hashtbl.find keyword_table name 444 493 with Not_found -> LIDENT name } 445 - | lowercase_latin1 identchar_latin1 * as name 446 - { warn_latin1 lexbuf; LIDENT name } 447 494 | uppercase identchar * as name 448 495 { UIDENT name } (* No capitalized keywords *) 449 - | uppercase_latin1 identchar_latin1 * as name 450 - { warn_latin1 lexbuf; UIDENT name } 496 + | (raw_ident_escape? as escape) (ident_ext as raw_name) 497 + { let name = ident_for_extended lexbuf raw_name in 498 + if Utf8_lexeme.is_capitalized name then begin 499 + if escape="" then UIDENT name 500 + else 501 + (* we don't have capitalized keywords, and thus no needs for 502 + capitalized raw identifiers. *) 503 + error lexbuf (Capitalized_raw_identifier name) 504 + end else 505 + LIDENT name 506 + } (* No non-ascii keywords *) 451 507 | int_literal as lit { INT (lit, None) } 452 508 | (int_literal as lit) (literal_modifier as modif) 453 509 { INT (lit, Some modif) } ··· 460 516 | "\"" 461 517 { let s, loc = wrap_string_lexer string lexbuf in 462 518 STRING (s, loc, None) } 463 - | "{" (lowercase* as delim) "|" 464 - { let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in 465 - STRING (s, loc, Some delim) } 466 - | "{%" (extattrident as id) "|" 519 + | "{" (ident_ext? as raw_name) '|' 520 + { let delim = validate_delim lexbuf raw_name in 521 + let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in 522 + STRING (s, loc, Some delim) 523 + } 524 + | "{%" (extattrident as raw_id) "|" 467 525 { let orig_loc = Location.curr lexbuf in 526 + let id = validate_ext lexbuf raw_id in 468 527 let s, loc = wrap_string_lexer (quoted_string "") lexbuf in 469 528 let idloc = compute_quoted_string_idloc orig_loc 2 id in 470 529 QUOTED_STRING_EXPR (id, idloc, s, loc, Some "") } 471 - | "{%" (extattrident as id) blank+ (lowercase* as delim) "|" 530 + | "{%" (extattrident as raw_id) blank+ (ident_ext as raw_delim) "|" 472 531 { let orig_loc = Location.curr lexbuf in 532 + let id = validate_ext lexbuf raw_id in 533 + let delim = validate_delim lexbuf raw_delim in 473 534 let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in 474 535 let idloc = compute_quoted_string_idloc orig_loc 2 id in 475 536 QUOTED_STRING_EXPR (id, idloc, s, loc, Some delim) } 476 - | "{%%" (extattrident as id) "|" 537 + | "{%%" (extattrident as raw_id) "|" 477 538 { let orig_loc = Location.curr lexbuf in 539 + let id = validate_ext lexbuf raw_id in 478 540 let s, loc = wrap_string_lexer (quoted_string "") lexbuf in 479 541 let idloc = compute_quoted_string_idloc orig_loc 3 id in 480 542 QUOTED_STRING_ITEM (id, idloc, s, loc, Some "") } 481 - | "{%%" (extattrident as id) blank+ (lowercase* as delim) "|" 543 + | "{%%" (extattrident as raw_id) blank+ (ident_ext as raw_delim) "|" 482 544 { let orig_loc = Location.curr lexbuf in 545 + let id = validate_ext lexbuf raw_id in 546 + let delim = validate_delim lexbuf raw_delim in 483 547 let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in 484 548 let idloc = compute_quoted_string_idloc orig_loc 3 id in 485 549 QUOTED_STRING_ITEM (id, idloc, s, loc, Some delim) } ··· 666 730 is_in_string := false; 667 731 store_string_char '\"'; 668 732 comment lexbuf } 669 - | "{" ('%' '%'? extattrident blank*)? (lowercase* as delim) "|" 670 - { 733 + | "{" ('%' '%'? extattrident blank*)? (ident_ext? as raw_delim) "|" 734 + { match lax_delim raw_delim with 735 + | None -> store_lexeme lexbuf; comment lexbuf 736 + | Some delim -> 671 737 string_start_loc := Location.curr lexbuf; 672 738 store_lexeme lexbuf; 673 739 is_in_string := true; ··· 781 847 | eof 782 848 { is_in_string := false; 783 849 error_loc !string_start_loc Unterminated_string } 784 - | "|" (lowercase* as edelim) "}" 850 + | "|" (ident_ext? as raw_edelim) "}" 785 851 { 852 + let edelim = validate_encoding lexbuf raw_edelim in 786 853 if delim = edelim then lexbuf.lex_start_p 787 854 else (store_lexeme lexbuf; quoted_string delim lexbuf) 788 855 }

+34 -18

parsing/unit_info.ml

··· 17 17 type filename = string 18 18 type file_prefix = string 19 19 20 + type error = Invalid_encoding of string 21 + exception Error of error 22 + 20 23 type t = { 21 24 source_file: filename; 22 25 prefix: file_prefix; ··· 32 35 | dot_pos -> String.sub basename 0 dot_pos 33 36 | exception Not_found -> basename 34 37 35 - let modulize s = String.capitalize_ascii s 38 + let strict_modulize s = 39 + match Misc.Utf8_lexeme.capitalize s with 40 + | Ok x -> x 41 + | Error _ -> raise (Error (Invalid_encoding s)) 36 42 37 - (* We re-export the [Misc] definition *) 38 - let normalize = Misc.normalized_unit_filename 43 + let modulize s = match Misc.Utf8_lexeme.capitalize s with Ok x | Error x -> x 39 44 40 - let modname_from_source source_file = 41 - source_file |> Filename.basename |> basename_chop_extensions |> modulize 45 + (* We re-export the [Misc] definition, and ignore encoding errors under the 46 + assumption that we should focus our effort on not *producing* badly encoded 47 + module names *) 48 + let normalize x = match Misc.normalized_unit_filename x with 49 + | Ok x | Error x -> x 42 50 43 - let start_char = function 44 - | 'A' .. 'Z' -> true 45 - | _ -> false 51 + let stem source_file = 52 + source_file |> Filename.basename |> basename_chop_extensions 46 53 47 - let is_identchar_latin1 = function 48 - | 'A'..'Z' | 'a'..'z' | '_' | '\192'..'\214' | '\216'..'\246' 49 - | '\248'..'\255' | '\'' | '0'..'9' -> true 50 - | _ -> false 54 + let strict_modname_from_source source_file = 55 + source_file |> stem |> strict_modulize 56 + 57 + let lax_modname_from_source source_file = 58 + source_file |> stem |> modulize 51 59 52 60 (* Check validity of module name *) 53 - let is_unit_name name = 54 - String.length name > 0 55 - && start_char name.[0] 56 - && String.for_all is_identchar_latin1 name 61 + let is_unit_name name = Misc.Utf8_lexeme.is_valid_identifier name 57 62 58 63 let check_unit_name file = 59 64 if not (is_unit_name (modname file)) then ··· 61 66 (Warnings.Bad_module_name (modname file)) 62 67 63 68 let make ?(check_modname=true) ~source_file prefix = 64 - let modname = modname_from_source prefix in 69 + let modname = strict_modname_from_source prefix in 65 70 let p = { modname; prefix; source_file } in 66 71 if check_modname then check_unit_name p; 67 72 p ··· 79 84 let prefix x = Filename.remove_extension (filename x) 80 85 81 86 let from_filename filename = 82 - let modname = modname_from_source filename in 87 + let modname = lax_modname_from_source filename in 83 88 { modname; filename; source_file = None } 84 89 85 90 end ··· 120 125 let filename = modname f ^ ".cmi" in 121 126 let filename = Load_path.find_normalized filename in 122 127 { Artifact.filename; modname = modname f; source_file = Some f.source_file } 128 + 129 + let report_error = function 130 + | Invalid_encoding name -> 131 + Location.errorf "Invalid encoding of output name: %s." name 132 + 133 + let () = 134 + Location.register_error_of_exn 135 + (function 136 + | Error err -> Some (report_error err) 137 + | _ -> None 138 + )

+10 -3

parsing/unit_info.mli

··· 25 25 type filename = string 26 26 type file_prefix = string 27 27 28 + type error = Invalid_encoding of filename 29 + exception Error of error 30 + 28 31 (** [modulize s] capitalizes the first letter of [s]. *) 29 32 val modulize: string -> modname 30 33 31 34 (** [normalize s] uncapitalizes the first letter of [s]. *) 32 35 val normalize: string -> string 33 36 34 - (** [modname_from_source filename] is [modulize stem] where [stem] is the 37 + (** [lax_modname_from_source filename] is [modulize stem] where [stem] is the 35 38 basename of the filename [filename] stripped from all its extensions. 36 39 For instance, [modname_from_source "/pa.th/x.ml.pp"] is ["X"]. *) 37 - val modname_from_source: filename -> modname 40 + val lax_modname_from_source: filename -> modname 41 + 42 + (** Same as {!lax_modname_from_source} but raises an {!error.Invalid_encoding} 43 + error on filename with invalid utf8 encoding. *) 44 + val strict_modname_from_source: filename -> modname 38 45 39 46 (** {2:module_name_validation Module name validation function}*) 40 47 41 - (** [is_unit_name ~strict name] is true only if [name] can be used as a 48 + (** [is_unit_name name] is true only if [name] can be used as a 42 49 valid module name. *) 43 50 val is_unit_name : modname -> bool 44 51

+5

testsuite/tests/lexing/reject_bad_encoding.compilers.reference

··· 1 + Line 4, characters 8-10: 2 + 4 | let x = �a.x;; 3 + ^^ 4 + Error: Invalid encoding of identifier �a. 5 +

+4

testsuite/tests/lexing/reject_bad_encoding.ml

··· 1 + (* TEST 2 + toplevel; 3 + *) 4 + let x = �a.x;;

+3

testsuite/tests/parsetree/source.ml

··· 7481 7481 7482 7482 (* check pretty-printing of local module open in core_type *) 7483 7483 type t = String.( t ) 7484 + 7485 + (* Utf8 identifier *) 7486 + let là = function ça -> ça

+40

testsuite/tests/parsing/latin9.compilers.reference

··· 1 + val _ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿŠšŽžŒœŸ : 2 + string = "ok" 3 + type t = Æsop | Âcre | Ça | Élégant | Öst | Œuvre 4 + val été : string = "summer" 5 + val ça : string = "that" 6 + val straße : string = "street" 7 + val øre : string = "ear" 8 + val f : t -> int = <fun> 9 + val l : string list = ["summer"; "that"; "street"; "ear"] 10 + val s : string = "ok" 11 + Line 3, characters 0-47: 12 + 3 | module Élégant (* NFD encoded *) = struct end;; 13 + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 14 + Error: Multiple definition of the module name "Élégant". 15 + Names must be unique in a given structure or signature. 16 + val x : string = "x" 17 + Line 1, characters 9-14: 18 + 1 | let ko = {Là|x|Là};; 19 + ^^^^^ 20 + Error: "Là" cannot be used as a quoted string delimiter, 21 + it must contain only lowercase letters. 22 + Line 2, characters 10-20: 23 + 2 | let x = {%âcre.name été|x|été};; 24 + ^^^^^^^^^^ 25 + Error: Uninterpreted extension 'âcre.name'. 26 + Line 1, characters 10-19: 27 + 1 | let x = {%Âcre.sub été|x|été};; 28 + ^^^^^^^^^ 29 + Error: Uninterpreted extension 'Âcre.sub'. 30 + Line 2, characters 10-17: 31 + 2 | let x = {%âcre.m|x|};; 32 + ^^^^^^^ 33 + Error: Uninterpreted extension 'âcre.m'. 34 + Line 2, characters 4-10: 35 + 2 | let%À.ça x = ();; 36 + ^^^^^^ 37 + Error: Uninterpreted extension 'À.ça'. 38 + val x : unit = () 39 + val y : unit = () 40 +

+51

testsuite/tests/parsing/latin9.ml

··· 1 + (* TEST 2 + toplevel; 3 + *) 4 + 5 + (* NFC representation *) 6 + 7 + let _ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿŠšŽžŒœŸ 8 + = "ok" 9 + 10 + type t = Æsop | Âcre | Ça | Élégant | Öst | Œuvre 11 + 12 + let été = "summer" 13 + let ça = "that" 14 + let straße = "street" 15 + let øre = "ear" 16 + 17 + (* NFD representation *) 18 + 19 + let f = function 20 + | Æsop -> 1 | Âcre -> 2 | Ça -> 3 | Élégant -> 4 | Öst -> 5 | Œuvre -> 6 21 + 22 + let l = [été; ça; straße; øre] 23 + 24 + let s = _ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿŠšŽžŒœŸ 25 + 26 + let () = assert (f Élégant (* NFC encoded *) = 4) 27 + 28 + let () = 29 + let called = ref false in 30 + let élégant (* NFC encoded *) () = called := true in 31 + élégant (* NFD encoded *) (); assert (!called) 32 + ;; 33 + (* The following two defs should error with 'Multiple definition…' *) 34 + module Élégant (* NFC encoded *) = struct end 35 + module Élégant (* NFD encoded *) = struct end;; 36 + 37 + (** Quoted strings and extensions *) 38 + 39 + 40 + let x = {où|x|où};; 41 + let ko = {Là|x|Là};; 42 + 43 + let x = {%âcre.name été|x|été};; 44 + let x = {%Âcre.sub été|x|été};; 45 + 46 + let x = {%âcre.m|x|};; 47 + 48 + let%À.ça x = ();; 49 + 50 + let x = (* {été|*)|été}*) ();; 51 + let y = (* This is not a valid quoted string delimiter: {Été|*) ();;

+1 -1

testsuite/tests/tool-ocamldoc/Alert_toplevel.html.reference

··· 2 2 <html> 3 3 <head> 4 4 <link rel="stylesheet" href="style.css" type="text/css"> 5 - <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type"> 5 + <meta content="text/html; charset=UTF-8" http-equiv="Content-Type"> 6 6 <meta name="viewport" content="width=device-width, initial-scale=1"> 7 7 <link rel="Start" href="index.html"> 8 8 <link rel="Up" href="index.html">

+1 -1

testsuite/tests/tool-ocamldoc/Alert_toplevel2.html.reference

··· 2 2 <html> 3 3 <head> 4 4 <link rel="stylesheet" href="style.css" type="text/css"> 5 - <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type"> 5 + <meta content="text/html; charset=UTF-8" http-equiv="Content-Type"> 6 6 <meta name="viewport" content="width=device-width, initial-scale=1"> 7 7 <link rel="Start" href="index.html"> 8 8 <link rel="Up" href="index.html">

+1 -1

testsuite/tests/tool-ocamldoc/Alerts.html.reference

··· 2 2 <html> 3 3 <head> 4 4 <link rel="stylesheet" href="style.css" type="text/css"> 5 - <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type"> 5 + <meta content="text/html; charset=UTF-8" http-equiv="Content-Type"> 6 6 <meta name="viewport" content="width=device-width, initial-scale=1"> 7 7 <link rel="Start" href="index.html"> 8 8 <link rel="Up" href="index.html">

+1 -1

testsuite/tests/tool-ocamldoc/Alerts_impl.html.reference

··· 2 2 <html> 3 3 <head> 4 4 <link rel="stylesheet" href="style.css" type="text/css"> 5 - <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type"> 5 + <meta content="text/html; charset=UTF-8" http-equiv="Content-Type"> 6 6 <meta name="viewport" content="width=device-width, initial-scale=1"> 7 7 <link rel="Start" href="index.html"> 8 8 <link rel="Up" href="index.html">

+1 -1

testsuite/tests/tool-ocamldoc/Documentation_tags.html.reference

··· 2 2 <html> 3 3 <head> 4 4 <link rel="stylesheet" href="style.css" type="text/css"> 5 - <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type"> 5 + <meta content="text/html; charset=UTF-8" http-equiv="Content-Type"> 6 6 <meta name="viewport" content="width=device-width, initial-scale=1"> 7 7 <link rel="Start" href="index.html"> 8 8 <link rel="Up" href="index.html">

+1 -1

testsuite/tests/tool-ocamldoc/Entities.html.reference

··· 2 2 <html> 3 3 <head> 4 4 <link rel="stylesheet" href="style.css" type="text/css"> 5 - <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type"> 5 + <meta content="text/html; charset=UTF-8" http-equiv="Content-Type"> 6 6 <meta name="viewport" content="width=device-width, initial-scale=1"> 7 7 <link rel="Start" href="index.html"> 8 8 <link rel="Up" href="index.html">

+1 -1

testsuite/tests/tool-ocamldoc/Functions.html.reference

··· 2 2 <html> 3 3 <head> 4 4 <link rel="stylesheet" href="style.css" type="text/css"> 5 - <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type"> 5 + <meta content="text/html; charset=UTF-8" http-equiv="Content-Type"> 6 6 <meta name="viewport" content="width=device-width, initial-scale=1"> 7 7 <link rel="Start" href="index.html"> 8 8 <link rel="Up" href="index.html">

+1 -1

testsuite/tests/tool-ocamldoc/Include_module_type_of.html.reference

··· 2 2 <html> 3 3 <head> 4 4 <link rel="stylesheet" href="style.css" type="text/css"> 5 - <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type"> 5 + <meta content="text/html; charset=UTF-8" http-equiv="Content-Type"> 6 6 <meta name="viewport" content="width=device-width, initial-scale=1"> 7 7 <link rel="Start" href="index.html"> 8 8 <link rel="Up" href="index.html">

+1 -1

testsuite/tests/tool-ocamldoc/Inline_records.html.reference

··· 2 2 <html> 3 3 <head> 4 4 <link rel="stylesheet" href="style.css" type="text/css"> 5 - <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type"> 5 + <meta content="text/html; charset=UTF-8" http-equiv="Content-Type"> 6 6 <meta name="viewport" content="width=device-width, initial-scale=1"> 7 7 <link rel="Start" href="index.html"> 8 8 <link rel="Up" href="index.html">

+1 -1

testsuite/tests/tool-ocamldoc/Item_ids.html.reference

··· 2 2 <html> 3 3 <head> 4 4 <link rel="stylesheet" href="style.css" type="text/css"> 5 - <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type"> 5 + <meta content="text/html; charset=UTF-8" http-equiv="Content-Type"> 6 6 <meta name="viewport" content="width=device-width, initial-scale=1"> 7 7 <link rel="Start" href="index.html"> 8 8 <link rel="Up" href="index.html">

+37

testsuite/tests/tool-ocamldoc/Latin9.html.reference

··· 1 + <!DOCTYPE html> 2 + <html> 3 + <head> 4 + <link rel="stylesheet" href="style.css" type="text/css"> 5 + <meta content="text/html; charset=UTF-8" http-equiv="Content-Type"> 6 + <meta name="viewport" content="width=device-width, initial-scale=1"> 7 + <link rel="Start" href="index.html"> 8 + <link rel="Up" href="index.html"> 9 + <link title="Index of exceptions" rel=Appendix href="index_exceptions.html"> 10 + <link title="Index of values" rel=Appendix href="index_values.html"> 11 + <link title="Index of modules" rel=Appendix href="index_modules.html"> 12 + <link title="Latin9" rel="Chapter" href="Latin9.html"><title>Latin9</title> 13 + </head> 14 + <body> 15 + <div class="navbar"> <a class="up" href="index.html" title="Index">Up</a> 16 +  </div> 17 + <h1>Module <a href="type_Latin9.html">Latin9</a></h1> 18 + 19 + <pre><span id="MODULELatin9"><span class="keyword">module</span> Latin9</span>: <code class="code"><span class="keyword">sig</span></code> <a href="Latin9.html">..</a> <code class="code"><span class="keyword">end</span></code></pre><hr width="100%"> 20 + 21 + <pre><span id="MODULEÉté"><span class="keyword">module</span> <a href="Latin9.Été-c.html">Été</a></span>: <code class="code"><span class="keyword">sig</span></code> <a href="Latin9.Été-c.html">..</a> <code class="code"><span class="keyword">end</span></code></pre> 22 + <pre><span id="EXCEPTIONLà"><span class="keyword">exception</span> Là</span></pre> 23 + 24 + <pre><span id="VALf"><span class="keyword">val</span> f</span> : <code class="type">int -> 'a</code></pre><div class="info "> 25 + <div class="info-desc"> 26 + <p>Exceptions and parameters must be in latin-9 subset of unicode. 27 + In the @since version and in the description any character (e.g 字) is accepted:</p> 28 + </div> 29 + <ul class="info-attributes"> 30 + <li><b>Since</b> λ1</li> 31 + <li><b>Raises</b><ul><li><code>Là</code> स्तनति</li> 32 + <li><code>Été.Éclair</code> þunor</li> 33 + </ul></li> 34 + </ul> 35 + </div> 36 + <div class="param_info"><code class="code">éponyme</code> : ?</div> 37 + </body></html>

+18

testsuite/tests/tool-ocamldoc/Latin9.ml

··· 1 + (* TEST 2 + ocamldoc with html; 3 + *) 4 + 5 + module Été = struct 6 + exception Éclair 7 + end 8 + 9 + exception Là 10 + 11 + (** Exceptions and parameters must be in latin-9 subset of unicode. 12 + In the \@since version and in the description any character (e.g 字) is accepted: 13 + @since λ1 14 + @raise Là स्तनति 15 + @raise Été.Éclair þunor 16 + @param éponyme ? 17 + *) 18 + let f éponyme = if Random.int 2 > éponyme then raise Été.Éclair else raise Là

+1 -1

testsuite/tests/tool-ocamldoc/Linebreaks.html.reference

··· 2 2 <html> 3 3 <head> 4 4 <link rel="stylesheet" href="style.css" type="text/css"> 5 - <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type"> 5 + <meta content="text/html; charset=UTF-8" http-equiv="Content-Type"> 6 6 <meta name="viewport" content="width=device-width, initial-scale=1"> 7 7 <link rel="Start" href="index.html"> 8 8 <link rel="Up" href="index.html">

+1 -1

testsuite/tests/tool-ocamldoc/Loop.html.reference

··· 2 2 <html> 3 3 <head> 4 4 <link rel="stylesheet" href="style.css" type="text/css"> 5 - <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type"> 5 + <meta content="text/html; charset=UTF-8" http-equiv="Content-Type"> 6 6 <meta name="viewport" content="width=device-width, initial-scale=1"> 7 7 <link rel="Start" href="index.html"> 8 8 <link rel="Up" href="index.html">

+1 -1

testsuite/tests/tool-ocamldoc/Module_whitespace.html.reference

··· 2 2 <html> 3 3 <head> 4 4 <link rel="stylesheet" href="style.css" type="text/css"> 5 - <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type"> 5 + <meta content="text/html; charset=UTF-8" http-equiv="Content-Type"> 6 6 <meta name="viewport" content="width=device-width, initial-scale=1"> 7 7 <link rel="Start" href="index.html"> 8 8 <link rel="Up" href="index.html">

+1 -1

testsuite/tests/tool-ocamldoc/No_preamble.html.reference

··· 2 2 <html> 3 3 <head> 4 4 <link rel="stylesheet" href="style.css" type="text/css"> 5 - <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type"> 5 + <meta content="text/html; charset=UTF-8" http-equiv="Content-Type"> 6 6 <meta name="viewport" content="width=device-width, initial-scale=1"> 7 7 <link rel="Start" href="index.html"> 8 8 <link rel="Up" href="index.html">

+1 -1

testsuite/tests/tool-ocamldoc/Paragraph.html.reference

··· 2 2 <html> 3 3 <head> 4 4 <link rel="stylesheet" href="style.css" type="text/css"> 5 - <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type"> 5 + <meta content="text/html; charset=UTF-8" http-equiv="Content-Type"> 6 6 <meta name="viewport" content="width=device-width, initial-scale=1"> 7 7 <link rel="Start" href="index.html"> 8 8 <link rel="Up" href="index.html">

+1 -1

testsuite/tests/tool-ocamldoc/Variants.html.reference

··· 2 2 <html> 3 3 <head> 4 4 <link rel="stylesheet" href="style.css" type="text/css"> 5 - <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type"> 5 + <meta content="text/html; charset=UTF-8" http-equiv="Content-Type"> 6 6 <meta name="viewport" content="width=device-width, initial-scale=1"> 7 7 <link rel="Start" href="index.html"> 8 8 <link rel="Up" href="index.html">

+1 -1

testsuite/tests/tool-ocamldoc/type_Linebreaks.reference

··· 1 1 <html><head> 2 2 <link rel="stylesheet" href="style.css" type="text/css"> 3 - <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type"> 3 + <meta content="text/html; charset=UTF-8" http-equiv="Content-Type"> 4 4 <meta name="viewport" content="width=device-width, initial-scale=1"> 5 5 <link rel="Start" href="index.html"> 6 6 <link title="Index of types" rel=Appendix href="index_types.html">

+8

testsuite/tests/typing-unicode/genfiles.ml

··· 1 + let create_file name contents = 2 + Out_channel.with_open_text name (fun oc -> output_string oc contents) 3 + 4 + let _ = 5 + (* File name in NFC *) 6 + create_file "été.ml" "let x = 1\n"; 7 + (* File name in NFD *) 8 + create_file "\u{0063}\u{0327}a.ml" "let x = 2\n"

+18

testsuite/tests/typing-unicode/test.ml

··· 1 + (* TEST 2 + readonly_files = "genfiles.ml"; 3 + setup-ocamlc.byte-build-env; 4 + all_modules = "genfiles.ml"; 5 + program = "./genfiles.byte.exe"; 6 + ocamlc.byte; 7 + run; 8 + all_modules = "été.ml ça.ml test.ml"; 9 + program = "./main.byte.exe"; 10 + ocamlc.byte; 11 + run; 12 + *) 13 + 14 + let _ = 15 + (* Source is NFC *) 16 + assert (Été.x + Ça.x = 3); 17 + (* Source is NFD *) 18 + assert (Été.x + Ça.x = 3)

+1

testsuite/tests/unicode/néant.ml

··· 1 + let x = ()

+6

testsuite/tests/unicode/見.ml

··· 1 + (* TEST 2 + modules="néant.ml"; 3 + flags = "-w -bad-module-name"; 4 + *) 5 + 6 + let nothing = Néant.x

+1 -1

tools/check-typo-since

··· 20 20 21 21 check_typo_since() { 22 22 CHECK_TYPO=$(dirname $0)/check-typo 23 - git diff --name-only $1 \ 23 + git -c core.quotePath=false diff --name-only $1 \ 24 24 | (while IFS= read -r path 25 25 do 26 26 if test -e "$path"; then :; else continue; fi

+2 -2

tools/ci/actions/check-typo.sh

··· 50 50 COMMIT="$1" 51 51 COMMITS_TO_SEARCH="$2" 52 52 export OCAML_CT_HEAD="$COMMIT" 53 - export OCAML_CT_LS_FILES="git diff-tree --no-commit-id --name-only -r \ 54 - $COMMITS_TO_SEARCH --" 53 + export OCAML_CT_LS_FILES="git -c core.quotePath=false diff-tree \ 54 + --no-commit-id --name-only -r $COMMITS_TO_SEARCH --" 55 55 export OCAML_CT_CAT='git cat-file --textconv' 56 56 export OCAML_CT_PREFIX="$COMMIT:" 57 57 GIT_INDEX_FILE=tmp-index git read-tree --reset -i "$COMMIT"

+1 -1

toplevel/topcommon.ml

··· 34 34 let input_name = Location.input_name 35 35 36 36 let parse_mod_use_file name lb = 37 - let modname = Unit_info.modname_from_source name in 37 + let modname = Unit_info.lax_modname_from_source name in 38 38 let items = 39 39 List.concat 40 40 (List.map

+3 -12

typing/env.ml

··· 1682 1682 | Mp_present -> 1683 1683 Lazy_backtrack.create_forced (Aident id) 1684 1684 1685 - let is_identchar c = 1686 - (* This should be kept in sync with the [identchar_latin1] character class 1687 - in [lexer.mll] *) 1688 - match c with 1689 - | 'A'..'Z' | 'a'..'z' | '_' | '\192'..'\214' 1690 - | '\216'..'\246' | '\248'..'\255' | '\'' | '0'..'9' -> 1691 - true 1692 - | _ -> 1693 - false 1694 - 1695 1685 let rec components_of_module_maker 1696 1686 {cm_env; cm_prefixing_subst; 1697 1687 cm_path; cm_addr; cm_mty; cm_shape} : _ result = ··· 1897 1887 (* Note: we could also check here general validity of the 1898 1888 identifier, to protect against bad identifiers forged by -pp or 1899 1889 -ppx preprocessors. *) 1900 - if String.length name > 0 && not (is_identchar name.[0]) then 1890 + if String.length name > 0 && not 1891 + (Utf8_lexeme.starts_like_a_valid_identifier name) then 1901 1892 for i = 1 to String.length name - 1 do 1902 1893 if name.[i] = '#' then 1903 1894 error (Illegal_value_name(loc, name)) ··· 2532 2523 let unit_name_of_filename fn = 2533 2524 match Filename.extension fn with 2534 2525 | ".cmi" -> 2535 - let modname = Unit_info.modname_from_source fn in 2526 + let modname = Unit_info.strict_modname_from_source fn in 2536 2527 if Unit_info.is_unit_name modname then Some modname 2537 2528 else None 2538 2529 | _ -> None

+1 -20

typing/oprint.ml

··· 37 37 38 38 let out_ident = ref print_ident 39 39 40 - (* Check a character matches the [identchar_latin1] class from the lexer *) 41 - let is_ident_char c = 42 - match c with 43 - | 'A'..'Z' | 'a'..'z' | '_' | '\192'..'\214' | '\216'..'\246' 44 - | '\248'..'\255' | '\'' | '0'..'9' -> true 45 - | _ -> false 46 - 47 - let all_ident_chars s = 48 - let rec loop s len i = 49 - if i < len then begin 50 - if is_ident_char s.[i] then loop s len (i+1) 51 - else false 52 - end else begin 53 - true 54 - end 55 - in 56 - let len = String.length s in 57 - loop s len 0 58 - 59 40 let parenthesized_ident name = 60 41 (List.mem name ["or"; "mod"; "land"; "lor"; "lxor"; "lsl"; "lsr"; "asr"]) 61 - || not (all_ident_chars name) 42 + || not (Misc.Utf8_lexeme.is_valid_identifier name) 62 43 63 44 let value_ident ppf name = 64 45 if parenthesized_ident name then

+21 -15

utils/load_path.ml

··· 105 105 order. *) 106 106 let prepend_add dir = 107 107 List.iter (fun base -> 108 - let fn = Filename.concat dir.Dir.path base in 109 - let filename = Misc.normalized_unit_filename base in 110 - if dir.Dir.hidden then begin 111 - STbl.replace !hidden_files base fn; 112 - STbl.replace !hidden_files_uncap filename fn 113 - end else begin 114 - STbl.replace !visible_files base fn; 115 - STbl.replace !visible_files_uncap filename fn 116 - end 108 + Result.iter (fun filename -> 109 + let fn = Filename.concat dir.Dir.path base in 110 + if dir.Dir.hidden then begin 111 + STbl.replace !hidden_files base fn; 112 + STbl.replace !hidden_files_uncap filename fn 113 + end else begin 114 + STbl.replace !visible_files base fn; 115 + STbl.replace !visible_files_uncap filename fn 116 + end) 117 + (Misc.normalized_unit_filename base) 117 118 ) dir.Dir.files 118 119 119 120 let init ~auto_include ~visible ~hidden = ··· 150 151 in 151 152 List.iter 152 153 (fun base -> 153 - let fn = Filename.concat dir.Dir.path base in 154 - update base fn visible_files hidden_files; 155 - let ubase = Misc.normalized_unit_filename base in 156 - update ubase fn visible_files_uncap hidden_files_uncap) 154 + Result.iter (fun ubase -> 155 + let fn = Filename.concat dir.Dir.path base in 156 + update base fn visible_files hidden_files; 157 + update ubase fn visible_files_uncap hidden_files_uncap 158 + ) 159 + (Misc.normalized_unit_filename base) 160 + ) 157 161 dir.files; 158 162 if dir.hidden then 159 163 hidden_dirs := dir :: !hidden_dirs ··· 216 220 217 221 let find_normalized_with_visibility fn = 218 222 assert (not Config.merlin || Local_store.is_bound ()); 223 + match Misc.normalized_unit_filename fn with 224 + | Error _ -> raise Not_found 225 + | Ok fn_uncap -> 219 226 try 220 227 if is_basename fn && not !Sys.interactive then 221 - find_file_in_cache (Misc.normalized_unit_filename fn) 228 + find_file_in_cache fn_uncap 222 229 visible_files_uncap hidden_files_uncap 223 230 else 224 231 try ··· 227 234 | Not_found -> 228 235 (Misc.find_in_path_normalized (get_hidden_path_list ()) fn, Hidden) 229 236 with Not_found -> 230 - let fn_uncap = Misc.normalized_unit_filename fn in 231 237 (!auto_include_callback Dir.find_normalized fn_uncap, Visible) 232 238 233 239 let find_normalized fn = fst (find_normalized_with_visibility fn)

+234 -2

utils/misc.ml

··· 260 260 external compare : 'a -> 'a -> int = "%compare" 261 261 end 262 262 263 + (** {1 Minimal support for Unicode characters in identifiers} *) 264 + 265 + module Utf8_lexeme = struct 266 + 267 + type t = string 268 + 269 + (* Non-ASCII letters that are allowed in identifiers (currently: Latin-9) *) 270 + 271 + type case = Upper of Uchar.t | Lower of Uchar.t 272 + let known_chars : (Uchar.t, case) Hashtbl.t = Hashtbl.create 32 273 + 274 + let _ = 275 + List.iter 276 + (fun (upper, lower) -> 277 + let upper = Uchar.of_int upper and lower = Uchar.of_int lower in 278 + Hashtbl.add known_chars upper (Upper lower); 279 + Hashtbl.add known_chars lower (Lower upper)) 280 + [ 281 + (0xc0, 0xe0); (* À, à *) (0xc1, 0xe1); (* Á, á *) 282 + (0xc2, 0xe2); (* Â, â *) (0xc3, 0xe3); (* Ã, ã *) 283 + (0xc4, 0xe4); (* Ä, ä *) (0xc5, 0xe5); (* Å, å *) 284 + (0xc6, 0xe6); (* Æ, æ *) (0xc7, 0xe7); (* Ç, ç *) 285 + (0xc8, 0xe8); (* È, è *) (0xc9, 0xe9); (* É, é *) 286 + (0xca, 0xea); (* Ê, ê *) (0xcb, 0xeb); (* Ë, ë *) 287 + (0xcc, 0xec); (* Ì, ì *) (0xcd, 0xed); (* Í, í *) 288 + (0xce, 0xee); (* Î, î *) (0xcf, 0xef); (* Ï, ï *) 289 + (0xd0, 0xf0); (* Ð, ð *) (0xd1, 0xf1); (* Ñ, ñ *) 290 + (0xd2, 0xf2); (* Ò, ò *) (0xd3, 0xf3); (* Ó, ó *) 291 + (0xd4, 0xf4); (* Ô, ô *) (0xd5, 0xf5); (* Õ, õ *) 292 + (0xd6, 0xf6); (* Ö, ö *) (0xd8, 0xf8); (* Ø, ø *) 293 + (0xd9, 0xf9); (* Ù, ù *) (0xda, 0xfa); (* Ú, ú *) 294 + (0xdb, 0xfb); (* Û, û *) (0xdc, 0xfc); (* Ü, ü *) 295 + (0xdd, 0xfd); (* Ý, ý *) (0xde, 0xfe); (* Þ, þ *) 296 + (0x160, 0x161); (* Š, š *) (0x17d, 0x17e); (* Ž, ž *) 297 + (0x152, 0x153); (* Œ, œ *) (0x178, 0xff); (* Ÿ, ÿ *) 298 + (0x1e9e, 0xdf); (* ẞ, ß *) 299 + ] 300 + 301 + (* NFD to NFC conversion table for the letters above *) 302 + 303 + let known_pairs : (Uchar.t * Uchar.t, Uchar.t) Hashtbl.t = Hashtbl.create 32 304 + 305 + let _ = 306 + List.iter 307 + (fun (c1, n2, n) -> 308 + Hashtbl.add known_pairs 309 + (Uchar.of_char c1, Uchar.of_int n2) (Uchar.of_int n)) 310 + [ 311 + ('A', 0x300, 0xc0); (* À *) ('A', 0x301, 0xc1); (* Á *) 312 + ('A', 0x302, 0xc2); (* Â *) ('A', 0x303, 0xc3); (* Ã *) 313 + ('A', 0x308, 0xc4); (* Ä *) ('A', 0x30a, 0xc5); (* Å *) 314 + ('C', 0x327, 0xc7); (* Ç *) ('E', 0x300, 0xc8); (* È *) 315 + ('E', 0x301, 0xc9); (* É *) ('E', 0x302, 0xca); (* Ê *) 316 + ('E', 0x308, 0xcb); (* Ë *) ('I', 0x300, 0xcc); (* Ì *) 317 + ('I', 0x301, 0xcd); (* Í *) ('I', 0x302, 0xce); (* Î *) 318 + ('I', 0x308, 0xcf); (* Ï *) ('N', 0x303, 0xd1); (* Ñ *) 319 + ('O', 0x300, 0xd2); (* Ò *) ('O', 0x301, 0xd3); (* Ó *) 320 + ('O', 0x302, 0xd4); (* Ô *) ('O', 0x303, 0xd5); (* Õ *) 321 + ('O', 0x308, 0xd6); (* Ö *) 322 + ('U', 0x300, 0xd9); (* Ù *) ('U', 0x301, 0xda); (* Ú *) 323 + ('U', 0x302, 0xdb); (* Û *) ('U', 0x308, 0xdc); (* Ü *) 324 + ('Y', 0x301, 0xdd); (* Ý *) ('Y', 0x308, 0x178); (* Ÿ *) 325 + ('S', 0x30c, 0x160); (* Š *) ('Z', 0x30c, 0x17d); (* Ž *) 326 + ('a', 0x300, 0xe0); (* à *) ('a', 0x301, 0xe1); (* á *) 327 + ('a', 0x302, 0xe2); (* â *) ('a', 0x303, 0xe3); (* ã *) 328 + ('a', 0x308, 0xe4); (* ä *) ('a', 0x30a, 0xe5); (* å *) 329 + ('c', 0x327, 0xe7); (* ç *) ('e', 0x300, 0xe8); (* è *) 330 + ('e', 0x301, 0xe9); (* é *) ('e', 0x302, 0xea); (* ê *) 331 + ('e', 0x308, 0xeb); (* ë *) ('i', 0x300, 0xec); (* ì *) 332 + ('i', 0x301, 0xed); (* í *) ('i', 0x302, 0xee); (* î *) 333 + ('i', 0x308, 0xef); (* ï *) ('n', 0x303, 0xf1); (* ñ *) 334 + ('o', 0x300, 0xf2); (* ò *) ('o', 0x301, 0xf3); (* ó *) 335 + ('o', 0x302, 0xf4); (* ô *) ('o', 0x303, 0xf5); (* õ *) 336 + ('o', 0x308, 0xf6); (* ö *) 337 + ('u', 0x300, 0xf9); (* ù *) ('u', 0x301, 0xfa); (* ú *) 338 + ('u', 0x302, 0xfb); (* û *) ('u', 0x308, 0xfc); (* ü *) 339 + ('y', 0x301, 0xfd); (* ý *) ('y', 0x308, 0xff); (* ÿ *) 340 + ('s', 0x30c, 0x161); (* š *) ('z', 0x30c, 0x17e); (* ž *) 341 + ] 342 + 343 + let normalize_generic ~keep_ascii transform s = 344 + let rec norm check buf prev i = 345 + if i >= String.length s then begin 346 + Buffer.add_utf_8_uchar buf (transform prev) 347 + end else begin 348 + let d = String.get_utf_8_uchar s i in 349 + let u = Uchar.utf_decode_uchar d in 350 + check d u; 351 + let i' = i + Uchar.utf_decode_length d in 352 + match Hashtbl.find_opt known_pairs (prev, u) with 353 + | Some u' -> 354 + norm check buf u' i' 355 + | None -> 356 + Buffer.add_utf_8_uchar buf (transform prev); 357 + norm check buf u i' 358 + end in 359 + let ascii_limit = 128 in 360 + if s = "" 361 + || keep_ascii && String.for_all (fun x -> Char.code x < ascii_limit) s 362 + then Ok s 363 + else 364 + let buf = Buffer.create (String.length s) in 365 + let valid = ref true in 366 + let check d u = 367 + valid := !valid && Uchar.utf_decode_is_valid d && u <> Uchar.rep 368 + in 369 + let d = String.get_utf_8_uchar s 0 in 370 + let u = Uchar.utf_decode_uchar d in 371 + check d u; 372 + norm check buf u (Uchar.utf_decode_length d); 373 + let contents = Buffer.contents buf in 374 + if !valid then 375 + Ok contents 376 + else 377 + Error contents 378 + 379 + let normalize s = 380 + normalize_generic ~keep_ascii:true (fun u -> u) s 381 + 382 + (* Capitalization *) 383 + 384 + let uchar_is_uppercase u = 385 + let c = Uchar.to_int u in 386 + if c < 0x80 then c >= 65 && c <= 90 else 387 + match Hashtbl.find_opt known_chars u with 388 + | Some(Upper _) -> true 389 + | _ -> false 390 + 391 + let uchar_lowercase u = 392 + let c = Uchar.to_int u in 393 + if c < 0x80 then 394 + if c >= 65 && c <= 90 then Uchar.of_int (c + 32) else u 395 + else 396 + match Hashtbl.find_opt known_chars u with 397 + | Some(Upper u') -> u' 398 + | _ -> u 399 + 400 + let uchar_uppercase u = 401 + let c = Uchar.to_int u in 402 + if c < 0x80 then 403 + if c >= 97 && c <= 122 then Uchar.of_int (c - 32) else u 404 + else 405 + match Hashtbl.find_opt known_chars u with 406 + | Some(Lower u') -> u' 407 + | _ -> u 408 + 409 + let capitalize s = 410 + let first = ref true in 411 + normalize_generic ~keep_ascii:false 412 + (fun u -> if !first then (first := false; uchar_uppercase u) else u) 413 + s 414 + 415 + let uncapitalize s = 416 + let first = ref true in 417 + normalize_generic ~keep_ascii:false 418 + (fun u -> if !first then (first := false; uchar_lowercase u) else u) 419 + s 420 + 421 + let is_capitalized s = 422 + s <> "" && 423 + uchar_is_uppercase (Uchar.utf_decode_uchar (String.get_utf_8_uchar s 0)) 424 + 425 + (* Characters allowed in identifiers after normalization is applied. 426 + Currently: 427 + - ASCII letters, underscore 428 + - Latin-9 letters, represented in NFC 429 + - ASCII digits, single quote (but not as first character) 430 + - dot if [with_dot] = true 431 + *) 432 + let uchar_valid_in_identifier ~with_dot u = 433 + let c = Uchar.to_int u in 434 + if c < 0x80 then 435 + c >= 97 (* a *) && c <= 122 (* z *) 436 + || c >= 65 (* A *) && c <= 90 (* Z *) 437 + || c >= 48 (* 0 *) && c <= 57 (* 9 *) 438 + || c = 95 (* underscore *) 439 + || c = 39 (* single quote *) 440 + || (with_dot && c = 46) (* dot *) 441 + else 442 + Hashtbl.mem known_chars u 443 + 444 + let uchar_not_identifier_start u = 445 + let c = Uchar.to_int u in 446 + c >= 48 (* 0 *) && c <= 57 (* 9 *) 447 + || c = 39 (* single quote *) 448 + 449 + (* Check whether a normalized string is a valid OCaml identifier. *) 450 + 451 + type validation_result = 452 + | Valid 453 + | Invalid_character of Uchar.t (** Character not allowed *) 454 + | Invalid_beginning of Uchar.t (** Character not allowed as first char *) 455 + 456 + let validate_identifier ?(with_dot=false) s = 457 + let rec check i = 458 + if i >= String.length s then Valid else begin 459 + let d = String.get_utf_8_uchar s i in 460 + let u = Uchar.utf_decode_uchar d in 461 + let i' = i + Uchar.utf_decode_length d in 462 + if not (uchar_valid_in_identifier ~with_dot u) then 463 + Invalid_character u 464 + else if i = 0 && uchar_not_identifier_start u then 465 + Invalid_beginning u 466 + else 467 + check i' 468 + end 469 + in check 0 470 + 471 + let is_valid_identifier s = 472 + validate_identifier s = Valid 473 + 474 + let starts_like_a_valid_identifier s = 475 + s <> "" && 476 + (let u = Uchar.utf_decode_uchar (String.get_utf_8_uchar s 0) in 477 + uchar_valid_in_identifier ~with_dot:false u 478 + && not (uchar_not_identifier_start u)) 479 + 480 + let is_lowercase s = 481 + let rec is_lowercase_at len s n = 482 + if n >= len then true 483 + else 484 + let d = String.get_utf_8_uchar s n in 485 + let u = Uchar.utf_decode_uchar d in 486 + (uchar_valid_in_identifier ~with_dot:false u) 487 + && not (uchar_is_uppercase u) 488 + && is_lowercase_at len s (n+Uchar.utf_decode_length d) 489 + in 490 + is_lowercase_at (String.length s) s 0 491 + end 492 + 263 493 (* File functions *) 264 494 265 495 let find_in_path path name = ··· 290 520 if Sys.file_exists fullname then fullname else try_dir rem 291 521 in try_dir path 292 522 293 - let normalized_unit_filename = String.uncapitalize_ascii 523 + let normalized_unit_filename = Utf8_lexeme.uncapitalize 294 524 295 525 let find_in_path_normalized path name = 296 - let uname = normalized_unit_filename name in 526 + match normalized_unit_filename name with 527 + | Error _ -> raise Not_found 528 + | Ok uname -> 297 529 let rec try_dir = function 298 530 [] -> raise Not_found 299 531 | dir::rem ->

+63 -2

utils/misc.mli

··· 217 217 val find_in_path_rel: string list -> string -> string 218 218 (** Search a relative file in a list of directories. *) 219 219 220 - (** Normalize file name [Foo.ml] to [foo.ml] *) 221 - val normalized_unit_filename: string -> string 220 + (** Normalize file name [Foo.ml] to [foo.ml], using NFC and case-folding. 221 + Return [Error] if the input is not a valid utf-8 byte sequence *) 222 + val normalized_unit_filename: string -> (string,string) Result.t 222 223 223 224 val find_in_path_normalized: string list -> string -> string 224 225 (** Same as {!find_in_path_rel} , but search also for normalized unit filename, ··· 760 761 *) 761 762 762 763 val all_kinds : kind list 764 + end 765 + 766 + (** {1 Minimal support for Unicode characters in identifiers} *) 767 + 768 + (** Characters allowed in identifiers are, currently: 769 + - ASCII letters A-Z a-z 770 + - Latin-1 letters (U+00C0 - U+00FF except U+00D7 and U+00F7) 771 + - Character sequences which normalize to the above character under NFC 772 + - digits 0-9, underscore, single quote 773 + *) 774 + 775 + module Utf8_lexeme: sig 776 + type t = string 777 + 778 + val normalize: string -> (t,t) Result.t 779 + (** Normalize the given UTF-8 encoded string. 780 + Invalid UTF-8 sequences results in a error and are replaced 781 + by U+FFFD. 782 + Identifier characters are put in NFC normalized form. 783 + Other Unicode characters are left unchanged. *) 784 + 785 + val capitalize: string -> (t,t) Result.t 786 + (** Like [normalize], but if the string starts with a lowercase identifier 787 + character, it is replaced by the corresponding uppercase character. 788 + Subsequent characters are not changed. *) 789 + 790 + val uncapitalize: string -> (t,t) Result.t 791 + (** Like [normalize], but if the string starts with an uppercase identifier 792 + character, it is replaced by the corresponding lowercase character. 793 + Subsequent characters are not changed. *) 794 + 795 + val is_capitalized: t -> bool 796 + (** Returns [true] if the given normalized string starts with an 797 + uppercase identifier character, [false] otherwise. May return 798 + wrong results if the string is not normalized. *) 799 + 800 + val is_valid_identifier: t -> bool 801 + (** Check whether the given normalized string is a valid OCaml identifier: 802 + - all characters are identifier characters 803 + - it does not start with a digit or a single quote 804 + *) 805 + 806 + val is_lowercase: t -> bool 807 + (** Returns [true] if the given normalized string only contains lowercase 808 + identifier character, [false] otherwise. May return wrong results if the 809 + string is not normalized. *) 810 + 811 + type validation_result = 812 + | Valid 813 + | Invalid_character of Uchar.t (** Character not allowed *) 814 + | Invalid_beginning of Uchar.t (** Character not allowed as first char *) 815 + 816 + val validate_identifier: ?with_dot:bool -> t -> validation_result 817 + (** Like [is_valid_identifier], but returns a more detailed error code. Dots 818 + can be allowed to extend support to path-like identifiers. *) 819 + 820 + val starts_like_a_valid_identifier: t -> bool 821 + (** Checks whether the given normalized string starts with an identifier 822 + character other than a digit or a single quote. Subsequent characters 823 + are not checked. *) 763 824 end 764 825 765 826 (** {1 Miscellaneous type aliases} *)

Configure Feed

Configure Feed