···1717* text=auto
18181919# It is not possible to wrap lines lines in .gitattributes files
2020-.gitattributes typo.long-line=may
2020+.gitattributes typo.long-line=may typo.utf8
21212222# Binary files
2323/boot/ocamlc binary
···100100otherlibs/unix/stat_win32.c typo.long-line
101101otherlibs/unix/symlink_win32.c typo.long-line
102102103103+# Some Unicode characters here and there
104104+utils/misc.ml typo.utf8
103105runtime/sak.c typo.non-ascii
104106105107stdlib/hashbang typo.white-at-eol typo.missing-lf
···108110testsuite/tests/lib-bigarray-2/bigarrf.f typo.tab linguist-language=Fortran
109111testsuite/tests/lib-unix/win-stat/fakeclock.c typo.missing-header=false
110112testsuite/tests/misc-unsafe/almabench.ml typo.long-line
113113+testsuite/tests/parsing/latin9.ml typo.utf8 typo.very-long-line
114114+testsuite/tests/tool-ocamldoc/Latin9.ml typo.utf8
115115+testsuite/tests/parsetree/source.ml typo.utf8
116116+testsuite/tests/typing-unicode/*.ml typo.utf8
111117testsuite/tests/tool-toplevel/strings.ml typo.utf8
112118testsuite/tests/win-unicode/*.ml typo.utf8
119119+testsuite/tests/unicode/見.ml typo.utf8
120120+testsuite/tests/lexing/reject_bad_encoding.ml typo.prune
113121testsuite/tests/asmgen/immediates.cmm typo.very-long-line
114122testsuite/tests/generated-parse-errors/errors.* typo.very-long-line
115123testsuite/tools/*.S typo.missing-header
+5
Changes
···535535 syntax for types.
536536 (Chet Murthy, review by Gabriel Scherer)
537537538538+- #11736, #12664: Support utf-8 encoded source files and latin-9 compatible
539539+ identifiers.
540540+ (Xavier Leroy and Florian Angeletti, review by Daniel Bünzli and
541541+ Jules Aguillon)
542542+538543### Type system:
539544540545- #12313, #11799: Do not re-build as-pattern type when a ground type annotation
+29-6
debugger/debugger_lexer.mll
···18181919open Debugger_parser
20202121+2222+let ident_for_extended raw_name =
2323+ match Misc.Utf8_lexeme.normalize raw_name with
2424+ | Error _ -> raise Parsing.Parse_error
2525+ | Ok name ->
2626+ match Misc.Utf8_lexeme.validate_identifier name with
2727+ | Misc.Utf8_lexeme.Valid -> name
2828+ | Misc.Utf8_lexeme.Invalid_character _
2929+ | Misc.Utf8_lexeme.Invalid_beginning _ ->
3030+ raise Parsing.Parse_error
3131+2132exception Int_overflow
22332334}
24353636+let lowercase = ['a'-'z' '_']
3737+let uppercase = ['A'-'Z']
3838+let identstart = lowercase | uppercase
3939+let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9']
4040+let utf8 = ['\192'-'\255'] ['\128'-'\191']*
4141+let identstart_ext = identstart | utf8
4242+let identchar_ext = identchar | utf8
4343+let ident_ext = identstart_ext identchar_ext*
4444+2545rule line = (* Read a whole line *)
2646 parse
2747 ([ ^ '\n' '\r' ]* as s) ('\n' | '\r' | "\r\n")
···5373 parse
5474 [' ' '\t'] +
5575 { lexeme lexbuf }
5656- | ['a'-'z' '\223'-'\246' '\248'-'\255' '_']
5757- (['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255'
5858- '\'' '0'-'9' ]) *
7676+ | lowercase identchar*
5977 { LIDENT(Lexing.lexeme lexbuf) }
6060- | ['A'-'Z' '\192'-'\214' '\216'-'\222' ]
6161- (['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255'
6262- '\'' '0'-'9' ]) *
7878+ | uppercase identchar*
6379 { UIDENT(Lexing.lexeme lexbuf) }
8080+ | ident_ext as raw_name
8181+ {
8282+ let name = ident_for_extended raw_name in
8383+ if Misc.Utf8_lexeme.is_capitalized name
8484+ then UIDENT name
8585+ else LIDENT name
8686+ }
6487 | '"' [^ '"']* "\""
6588 { let s = Lexing.lexeme lexbuf in
6689 LIDENT(String.sub s 1 (String.length s - 2)) }
+2-2
driver/makedepend.ml
···427427428428(* Init Hashtbl with all defined modules *)
429429 let files = List.map (fun (file, file_kind, deps, pp_deps) ->
430430- let modname = Unit_info.modname_from_source file in
430430+ let modname = Unit_info.lax_modname_from_source file in
431431 let key = (modname, file_kind) in
432432 let new_deps = ref [] in
433433 Hashtbl.add h key (file, new_deps);
···526526 ~mli_file:process_mli_map
527527 in
528528 Clflags.transparent_modules := old_transp;
529529- let modname = Unit_info.modname_from_source fname in
529529+ let modname = Unit_info.lax_modname_from_source fname in
530530 if String.Map.is_empty m then
531531 report_err (Failure (fname ^ " : empty map file or parse error"));
532532 let mm = Depend.make_node m in
···207207 | Odoc_global.Text_file file ->
208208 Location.input_name := file;
209209 try
210210- let mod_name = Unit_info.modname_from_source file in
210210+ let mod_name = Unit_info.lax_modname_from_source file in
211211 let txt =
212212 try Odoc_text.Texter.text_of_string (Odoc_misc.input_file_as_string file)
213213 with Odoc_text.Text_syntax (l, c, s) ->
+1-1
ocamldoc/odoc_ast.ml
···18461846 let (tree_structure, _) = typedtree in
18471847 prepare_file source_file input_file;
18481848 (* We create the t_module for this file. *)
18491849- let mod_name = Unit_info.modname_from_source source_file in
18491849+ let mod_name = Unit_info.lax_modname_from_source source_file in
18501850 let len, info_opt = Sig.preamble !file_name !file
18511851 (fun x -> x.Parsetree.pstr_loc) parsetree in
18521852 let info_opt = analyze_toplevel_alerts info_opt parsetree in
···8787(** Remove first blank characters of each line of a string, until the first '*' *)
8888let remove_stars s =
8989 Str.global_replace (Str.regexp ("^"^blank^"*\\*")) "" s
9090+9191+let validate_encoding raw_name =
9292+ match Misc.Utf8_lexeme.normalize raw_name with
9393+ | Error s -> failwith (Format.asprintf "Invalid encoding %s" s)
9494+ | Ok name -> name
9595+9696+let validate_ident raw_name =
9797+ let name = validate_encoding raw_name in
9898+ match Misc.Utf8_lexeme.validate_identifier name with
9999+ | Misc.Utf8_lexeme.Valid -> name
100100+ | Misc.Utf8_lexeme.Invalid_character u ->
101101+ failwith (Format.asprintf "Invalid character U+%X" (Uchar.to_int u))
102102+ | Misc.Utf8_lexeme.Invalid_beginning u ->
103103+ failwith (Format.asprintf "Invalid first character U+%X" (Uchar.to_int u))
104104+105105+ let validate_exception_uident raw_name =
106106+ let name = validate_ident raw_name in
107107+ if Misc.Utf8_lexeme.is_capitalized name then name else
108108+ failwith (Format.asprintf "Invalid exception name: %s" name)
90109}
9111092111let blank = [ ' ' '\013' '\009' '\012']
93112let nl_blank = blank | '\010'
94113let notblank = [^ ' ' '\010' '\013' '\009' '\012']
9595-let lowercase = ['a'-'z' '\223'-'\246' '\248'-'\255' '_']
9696-let uppercase = ['A'-'Z' '\192'-'\214' '\216'-'\222']
9797-let identchar =
9898- ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9']
99114100100-let modident = uppercase identchar* ('.' uppercase identchar* )*
115115+let lowercase = ['a'-'z' '_']
116116+let uppercase = ['A'-'Z']
117117+let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9']
118118+let utf8 = ['\192'-'\255'] ['\128'-'\191']*
119119+let identchar_ext = identchar | utf8
120120+let identstart_ext = lowercase | uppercase | utf8
121121+let ident_ext = identstart_ext identchar_ext*
101122102123rule main = parse
103124 [' ' '\013' '\009' '\012'] +
···301322 }
302323303324 | "@param" nl_blank+ (identchar+ as id) nl_blank+ { T_PARAM id }
325325+ | "@param" nl_blank+ (identchar_ext+ as raw_id) nl_blank+ {
326326+ let id = validate_ident raw_id in
327327+ T_PARAM id
328328+ }
304329 | "@param" { failwith "usage: @param id description"}
305305- | "@before" nl_blank+ (notblank+ as v) nl_blank+ { T_BEFORE v }
330330+ | "@before" nl_blank+ (notblank+ as v) nl_blank+ {
331331+ let v = validate_encoding v in
332332+ T_BEFORE v }
306333 | "@before" { failwith "usage: @before version description"}
307307- | "@raise" nl_blank+ (modident as id) nl_blank+ { T_RAISES id }
334334+ | "@raise" nl_blank+ (ident_ext ('.' ident_ext)* as exn_path) nl_blank+
335335+ { let raw_path = String.split_on_char '.' exn_path in
336336+ let path = List.map validate_exception_uident raw_path in
337337+ let id = String.concat "." path in
338338+ T_RAISES id }
308339 | "@raise" { failwith "usage: @raise Exception description"}
309340 | "@"lowercase+
310341 {
+30-15
ocamldoc/odoc_ocamlhtml.mll
···204204(** To store the position of the beginning of a string and comment *)
205205let string_start_pos = ref 0
206206let comment_start_pos = ref []
207207+208208+(** Normalizing utf-8 *)
209209+let normalize raw_name =
210210+ (* we are printing documentation, it is too late to be strict *)
211211+ match Misc.Utf8_lexeme.normalize raw_name with
212212+ | Error s -> s
213213+ | Ok name -> name
214214+207215}
208216209217let blank = [' ' '\010' '\013' '\009' '\012']
210210-let lowercase = ['a'-'z' '\223'-'\246' '\248'-'\255' '_']
211211-let uppercase = ['A'-'Z' '\192'-'\214' '\216'-'\222']
212212-let identchar =
213213- ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9']
218218+219219+let lowercase = ['a'-'z' '_']
220220+let uppercase = ['A'-'Z']
221221+let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9']
222222+let utf8 = ['\192'-'\255'] ['\128'-'\191']*
223223+let identstart_ext = uppercase | lowercase | utf8
224224+let identchar_ext = identchar | utf8
225225+let ident_ext = identstart_ext identchar_ext*
226226+227227+214228let symbolchar =
215229 ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~']
216230let decimal_literal = ['0'-'9']+
···237251 | "_"
238252 { print "_" ; token lexbuf }
239253 | "~" { print "~" ; token lexbuf }
240240- | "~" lowercase identchar * ':'
254254+ | "~" (ident_ext as raw_id ) ':'
241255 { let s = Lexing.lexeme lexbuf in
242242- let name = String.sub s 1 (String.length s - 2) in
256256+ let name = normalize raw_id in
243257 if Hashtbl.mem keyword_table name then
244258 raise (Error(Keyword_as_label name, Lexing.lexeme_start lexbuf,
245259 Lexing.lexeme_end lexbuf));
246260 print s ; token lexbuf }
247261 | "?" { print "?" ; token lexbuf }
248248- | "?" lowercase identchar * ':'
249249- { let s = Lexing.lexeme lexbuf in
250250- let name = String.sub s 1 (String.length s - 2) in
262262+ | "?" (ident_ext as raw_id) ':'
263263+ {
264264+ let name = normalize raw_id in
251265 if Hashtbl.mem keyword_table name then
252266 raise (Error(Keyword_as_label name, Lexing.lexeme_start lexbuf,
253267 Lexing.lexeme_end lexbuf));
254254- print s ; token lexbuf }
255255- | lowercase identchar *
256256- { let s = Lexing.lexeme lexbuf in
257257- try
268268+ print "?"; print name ; print ":"; token lexbuf }
269269+ | (ident_ext as raw_id)
270270+ { let s = normalize raw_id in
271271+ if Misc.Utf8_lexeme.is_capitalized s then
272272+ (print_class constructor_class (Lexing.lexeme lexbuf);
273273+ token lexbuf)
274274+ else try
258275 let cl = Hashtbl.find keyword_table s in
259276 (print_class cl s ; token lexbuf )
260277 with Not_found ->
261278 (print s ; token lexbuf )}
262262- | uppercase identchar *
263263- { print_class constructor_class (Lexing.lexeme lexbuf) ; token lexbuf } (* No capitalized keywords *)
264279 | decimal_literal | hex_literal | oct_literal | bin_literal
265280 { print (Lexing.lexeme lexbuf) ; token lexbuf }
266281 | float_literal
+1-1
ocamldoc/odoc_sig.ml
···18871887 (ast : Parsetree.signature) (signat : Types.signature) =
18881888 prepare_file source_file input_file;
18891889 (* We create the t_module for this file. *)
18901890- let mod_name = Unit_info.modname_from_source source_file in
18901890+ let mod_name = Unit_info.lax_modname_from_source source_file in
18911891 let len, info_opt = preamble !file_name !file
18921892 (fun x -> x.Parsetree.psig_loc) ast in
18931893 let info_opt = analyze_toplevel_alerts info_opt ast in
+5
parsing/lexer.mli
···3333 | Unterminated_string_in_comment of Location.t * Location.t
3434 | Empty_character_literal
3535 | Keyword_as_label of string
3636+ | Capitalized_label of string
3637 | Invalid_literal of string
3738 | Invalid_directive of string * string option
3939+ | Invalid_encoding of string
4040+ | Invalid_char_in_ident of Uchar.t
4141+ | Non_lowercase_delimiter of string
4242+ | Capitalized_raw_identifier of string
38433944exception Error of error * Location.t
4045
+108-41
parsing/lexer.mll
···2929 | Unterminated_string_in_comment of Location.t * Location.t
3030 | Empty_character_literal
3131 | Keyword_as_label of string
3232+ | Capitalized_label of string
3233 | Invalid_literal of string
3334 | Invalid_directive of string * string option
3535+ | Invalid_encoding of string
3636+ | Invalid_char_in_ident of Uchar.t
3737+ | Non_lowercase_delimiter of string
3838+ | Capitalized_raw_identifier of string
34393540exception Error of error * Location.t
3641···256261 illegal_escape lexbuf
257262 (Printf.sprintf "%X is not a Unicode scalar value" cp)
258263264264+let validate_encoding lexbuf raw_name =
265265+ match Utf8_lexeme.normalize raw_name with
266266+ | Error _ -> error lexbuf (Invalid_encoding raw_name)
267267+ | Ok name -> name
268268+269269+let ident_for_extended lexbuf raw_name =
270270+ let name = validate_encoding lexbuf raw_name in
271271+ match Utf8_lexeme.validate_identifier name with
272272+ | Utf8_lexeme.Valid -> name
273273+ | Utf8_lexeme.Invalid_character u -> error lexbuf (Invalid_char_in_ident u)
274274+ | Utf8_lexeme.Invalid_beginning _ ->
275275+ assert false (* excluded by the regexps *)
276276+277277+let validate_delim lexbuf raw_name =
278278+ let name = validate_encoding lexbuf raw_name in
279279+ if Utf8_lexeme.is_lowercase name then name
280280+ else error lexbuf (Non_lowercase_delimiter name)
281281+282282+let validate_ext lexbuf name =
283283+ let name = validate_encoding lexbuf name in
284284+ match Utf8_lexeme.validate_identifier ~with_dot:true name with
285285+ | Utf8_lexeme.Valid -> name
286286+ | Utf8_lexeme.Invalid_character u -> error lexbuf (Invalid_char_in_ident u)
287287+ | Utf8_lexeme.Invalid_beginning _ ->
288288+ assert false (* excluded by the regexps *)
289289+290290+let lax_delim raw_name =
291291+ match Utf8_lexeme.normalize raw_name with
292292+ | Error _ -> None
293293+ | Ok name ->
294294+ if Utf8_lexeme.is_lowercase name then Some name
295295+ else None
296296+259297let is_keyword name = Hashtbl.mem keyword_table name
260298261261-let check_label_name lexbuf name =
262262- if is_keyword name then error lexbuf (Keyword_as_label name)
299299+let check_label_name ?(raw_escape=false) lexbuf name =
300300+ if Utf8_lexeme.is_capitalized name then
301301+ error lexbuf (Capitalized_label name);
302302+ if not raw_escape && is_keyword name then
303303+ error lexbuf (Keyword_as_label name)
263304264305(* Update the current location with file name and line number. *)
265306···278319let preprocessor = ref None
279320280321let escaped_newlines = ref false
281281-282282-(* Warn about Latin-1 characters used in idents *)
283283-284284-let warn_latin1 lexbuf =
285285- Location.deprecated
286286- (Location.curr lexbuf)
287287- "ISO-Latin1 characters in identifiers"
288322289323let handle_docstrings = ref true
290324let comment_list = ref []
···336370 | Keyword_as_label kwd ->
337371 Location.errorf ~loc
338372 "%a is a keyword, it cannot be used as label name" Style.inline_code kwd
373373+ | Capitalized_label lbl ->
374374+ Location.errorf ~loc
375375+ "%a cannot be used as label name, \
376376+ it must start with a lowercase letter" Style.inline_code lbl
339377 | Invalid_literal s ->
340378 Location.errorf ~loc "Invalid literal %s" s
341379 | Invalid_directive (dir, explanation) ->
···343381 (fun ppf -> match explanation with
344382 | None -> ()
345383 | Some expl -> fprintf ppf ": %s" expl)
384384+ | Invalid_encoding s ->
385385+ Location.errorf ~loc "Invalid encoding of identifier %s." s
386386+ | Invalid_char_in_ident u ->
387387+ Location.errorf ~loc "Invalid character U+%X in identifier"
388388+ (Uchar.to_int u)
389389+ | Capitalized_raw_identifier lbl ->
390390+ Location.errorf ~loc
391391+ "%a cannot be used as a raw identifier, \
392392+ it must start with a lowercase letter" Style.inline_code lbl
393393+ | Non_lowercase_delimiter name ->
394394+ Location.errorf ~loc
395395+ "%a cannot be used as a quoted string delimiter,@ \
396396+ it must contain only lowercase letters."
397397+ Style.inline_code name
346398347399let () =
348400 Location.register_error_of_exn
···359411let blank = [' ' '\009' '\012']
360412let lowercase = ['a'-'z' '_']
361413let uppercase = ['A'-'Z']
414414+let identstart = lowercase | uppercase
362415let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9']
363363-let lowercase_latin1 = ['a'-'z' '\223'-'\246' '\248'-'\255' '_']
364364-let uppercase_latin1 = ['A'-'Z' '\192'-'\214' '\216'-'\222']
365365-let identchar_latin1 =
366366- ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9']
367367-(* This should be kept in sync with the [is_identchar] function in [env.ml] *)
416416+let utf8 = ['\192'-'\255'] ['\128'-'\191']*
417417+let identstart_ext = identstart | utf8
418418+let identchar_ext = identchar | utf8
368419369420let symbolchar =
370421 ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~']
···376427 ['$' '&' '*' '+' '-' '/' '<' '=' '>' '@' '^' '|']
377428378429let ident = (lowercase | uppercase) identchar*
379379-let extattrident = ident ('.' ident)*
430430+let ident_ext = identstart_ext identchar_ext*
431431+let extattrident = ident_ext ('.' ident_ext)*
380432381433let decimal_literal =
382434 ['0'-'9'] ['0'-'9' '_']*
···419471 | ".~"
420472 { error lexbuf
421473 (Reserved_sequence (".~", Some "is reserved for use in MetaOCaml")) }
422422- | "~" raw_ident_escape (lowercase identchar * as name) ':'
423423- { LABEL name }
424424- | "~" (lowercase identchar * as name) ':'
474474+ | "~" (identstart identchar * as name) ':'
425475 { check_label_name lexbuf name;
426476 LABEL name }
427427- | "~" (lowercase_latin1 identchar_latin1 * as name) ':'
428428- { warn_latin1 lexbuf;
477477+ | "~" (raw_ident_escape? as escape) (ident_ext as raw_name) ':'
478478+ { let name = ident_for_extended lexbuf raw_name in
479479+ check_label_name ~raw_escape:(escape<>"") lexbuf name;
429480 LABEL name }
430481 | "?"
431482 { QUESTION }
432432- | "?" raw_ident_escape (lowercase identchar * as name) ':'
433433- { OPTLABEL name }
434483 | "?" (lowercase identchar * as name) ':'
435484 { check_label_name lexbuf name;
436485 OPTLABEL name }
437437- | "?" (lowercase_latin1 identchar_latin1 * as name) ':'
438438- { warn_latin1 lexbuf;
439439- OPTLABEL name }
440440- | raw_ident_escape (lowercase identchar * as name)
441441- { LIDENT name }
486486+ | "?" (raw_ident_escape? as escape) (ident_ext as raw_name) ':'
487487+ { let name = ident_for_extended lexbuf raw_name in
488488+ check_label_name ~raw_escape:(escape<>"") lexbuf name;
489489+ OPTLABEL name
490490+ }
442491 | lowercase identchar * as name
443492 { try Hashtbl.find keyword_table name
444493 with Not_found -> LIDENT name }
445445- | lowercase_latin1 identchar_latin1 * as name
446446- { warn_latin1 lexbuf; LIDENT name }
447494 | uppercase identchar * as name
448495 { UIDENT name } (* No capitalized keywords *)
449449- | uppercase_latin1 identchar_latin1 * as name
450450- { warn_latin1 lexbuf; UIDENT name }
496496+ | (raw_ident_escape? as escape) (ident_ext as raw_name)
497497+ { let name = ident_for_extended lexbuf raw_name in
498498+ if Utf8_lexeme.is_capitalized name then begin
499499+ if escape="" then UIDENT name
500500+ else
501501+ (* we don't have capitalized keywords, and thus no needs for
502502+ capitalized raw identifiers. *)
503503+ error lexbuf (Capitalized_raw_identifier name)
504504+ end else
505505+ LIDENT name
506506+ } (* No non-ascii keywords *)
451507 | int_literal as lit { INT (lit, None) }
452508 | (int_literal as lit) (literal_modifier as modif)
453509 { INT (lit, Some modif) }
···460516 | "\""
461517 { let s, loc = wrap_string_lexer string lexbuf in
462518 STRING (s, loc, None) }
463463- | "{" (lowercase* as delim) "|"
464464- { let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in
465465- STRING (s, loc, Some delim) }
466466- | "{%" (extattrident as id) "|"
519519+ | "{" (ident_ext? as raw_name) '|'
520520+ { let delim = validate_delim lexbuf raw_name in
521521+ let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in
522522+ STRING (s, loc, Some delim)
523523+ }
524524+ | "{%" (extattrident as raw_id) "|"
467525 { let orig_loc = Location.curr lexbuf in
526526+ let id = validate_ext lexbuf raw_id in
468527 let s, loc = wrap_string_lexer (quoted_string "") lexbuf in
469528 let idloc = compute_quoted_string_idloc orig_loc 2 id in
470529 QUOTED_STRING_EXPR (id, idloc, s, loc, Some "") }
471471- | "{%" (extattrident as id) blank+ (lowercase* as delim) "|"
530530+ | "{%" (extattrident as raw_id) blank+ (ident_ext as raw_delim) "|"
472531 { let orig_loc = Location.curr lexbuf in
532532+ let id = validate_ext lexbuf raw_id in
533533+ let delim = validate_delim lexbuf raw_delim in
473534 let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in
474535 let idloc = compute_quoted_string_idloc orig_loc 2 id in
475536 QUOTED_STRING_EXPR (id, idloc, s, loc, Some delim) }
476476- | "{%%" (extattrident as id) "|"
537537+ | "{%%" (extattrident as raw_id) "|"
477538 { let orig_loc = Location.curr lexbuf in
539539+ let id = validate_ext lexbuf raw_id in
478540 let s, loc = wrap_string_lexer (quoted_string "") lexbuf in
479541 let idloc = compute_quoted_string_idloc orig_loc 3 id in
480542 QUOTED_STRING_ITEM (id, idloc, s, loc, Some "") }
481481- | "{%%" (extattrident as id) blank+ (lowercase* as delim) "|"
543543+ | "{%%" (extattrident as raw_id) blank+ (ident_ext as raw_delim) "|"
482544 { let orig_loc = Location.curr lexbuf in
545545+ let id = validate_ext lexbuf raw_id in
546546+ let delim = validate_delim lexbuf raw_delim in
483547 let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in
484548 let idloc = compute_quoted_string_idloc orig_loc 3 id in
485549 QUOTED_STRING_ITEM (id, idloc, s, loc, Some delim) }
···666730 is_in_string := false;
667731 store_string_char '\"';
668732 comment lexbuf }
669669- | "{" ('%' '%'? extattrident blank*)? (lowercase* as delim) "|"
670670- {
733733+ | "{" ('%' '%'? extattrident blank*)? (ident_ext? as raw_delim) "|"
734734+ { match lax_delim raw_delim with
735735+ | None -> store_lexeme lexbuf; comment lexbuf
736736+ | Some delim ->
671737 string_start_loc := Location.curr lexbuf;
672738 store_lexeme lexbuf;
673739 is_in_string := true;
···781847 | eof
782848 { is_in_string := false;
783849 error_loc !string_start_loc Unterminated_string }
784784- | "|" (lowercase* as edelim) "}"
850850+ | "|" (ident_ext? as raw_edelim) "}"
785851 {
852852+ let edelim = validate_encoding lexbuf raw_edelim in
786853 if delim = edelim then lexbuf.lex_start_p
787854 else (store_lexeme lexbuf; quoted_string delim lexbuf)
788855 }
+34-18
parsing/unit_info.ml
···1717type filename = string
1818type file_prefix = string
19192020+type error = Invalid_encoding of string
2121+exception Error of error
2222+2023type t = {
2124 source_file: filename;
2225 prefix: file_prefix;
···3235 | dot_pos -> String.sub basename 0 dot_pos
3336 | exception Not_found -> basename
34373535-let modulize s = String.capitalize_ascii s
3838+let strict_modulize s =
3939+ match Misc.Utf8_lexeme.capitalize s with
4040+ | Ok x -> x
4141+ | Error _ -> raise (Error (Invalid_encoding s))
36423737-(* We re-export the [Misc] definition *)
3838-let normalize = Misc.normalized_unit_filename
4343+let modulize s = match Misc.Utf8_lexeme.capitalize s with Ok x | Error x -> x
39444040-let modname_from_source source_file =
4141- source_file |> Filename.basename |> basename_chop_extensions |> modulize
4545+(* We re-export the [Misc] definition, and ignore encoding errors under the
4646+ assumption that we should focus our effort on not *producing* badly encoded
4747+ module names *)
4848+let normalize x = match Misc.normalized_unit_filename x with
4949+ | Ok x | Error x -> x
42504343-let start_char = function
4444- | 'A' .. 'Z' -> true
4545- | _ -> false
5151+let stem source_file =
5252+ source_file |> Filename.basename |> basename_chop_extensions
46534747-let is_identchar_latin1 = function
4848- | 'A'..'Z' | 'a'..'z' | '_' | '\192'..'\214' | '\216'..'\246'
4949- | '\248'..'\255' | '\'' | '0'..'9' -> true
5050- | _ -> false
5454+let strict_modname_from_source source_file =
5555+ source_file |> stem |> strict_modulize
5656+5757+let lax_modname_from_source source_file =
5858+ source_file |> stem |> modulize
51595260(* Check validity of module name *)
5353-let is_unit_name name =
5454- String.length name > 0
5555- && start_char name.[0]
5656- && String.for_all is_identchar_latin1 name
6161+let is_unit_name name = Misc.Utf8_lexeme.is_valid_identifier name
57625863let check_unit_name file =
5964 if not (is_unit_name (modname file)) then
···6166 (Warnings.Bad_module_name (modname file))
62676368let make ?(check_modname=true) ~source_file prefix =
6464- let modname = modname_from_source prefix in
6969+ let modname = strict_modname_from_source prefix in
6570 let p = { modname; prefix; source_file } in
6671 if check_modname then check_unit_name p;
6772 p
···7984 let prefix x = Filename.remove_extension (filename x)
80858186 let from_filename filename =
8282- let modname = modname_from_source filename in
8787+ let modname = lax_modname_from_source filename in
8388 { modname; filename; source_file = None }
84898590end
···120125 let filename = modname f ^ ".cmi" in
121126 let filename = Load_path.find_normalized filename in
122127 { Artifact.filename; modname = modname f; source_file = Some f.source_file }
128128+129129+let report_error = function
130130+ | Invalid_encoding name ->
131131+ Location.errorf "Invalid encoding of output name: %s." name
132132+133133+let () =
134134+ Location.register_error_of_exn
135135+ (function
136136+ | Error err -> Some (report_error err)
137137+ | _ -> None
138138+ )
+10-3
parsing/unit_info.mli
···2525type filename = string
2626type file_prefix = string
27272828+type error = Invalid_encoding of filename
2929+exception Error of error
3030+2831(** [modulize s] capitalizes the first letter of [s]. *)
2932val modulize: string -> modname
30333134(** [normalize s] uncapitalizes the first letter of [s]. *)
3235val normalize: string -> string
33363434-(** [modname_from_source filename] is [modulize stem] where [stem] is the
3737+(** [lax_modname_from_source filename] is [modulize stem] where [stem] is the
3538 basename of the filename [filename] stripped from all its extensions.
3639 For instance, [modname_from_source "/pa.th/x.ml.pp"] is ["X"]. *)
3737-val modname_from_source: filename -> modname
4040+val lax_modname_from_source: filename -> modname
4141+4242+(** Same as {!lax_modname_from_source} but raises an {!error.Invalid_encoding}
4343+ error on filename with invalid utf8 encoding. *)
4444+val strict_modname_from_source: filename -> modname
38453946(** {2:module_name_validation Module name validation function}*)
40474141-(** [is_unit_name ~strict name] is true only if [name] can be used as a
4848+(** [is_unit_name name] is true only if [name] can be used as a
4249 valid module name. *)
4350val is_unit_name : modname -> bool
4451
···11+Line 4, characters 8-10:
22+4 | let x = �a.x;;
33+ ^^
44+Error: Invalid encoding of identifier �a.
55+
+4
testsuite/tests/lexing/reject_bad_encoding.ml
···11+(* TEST
22+toplevel;
33+*)
44+let x = �a.x;;
+3
testsuite/tests/parsetree/source.ml
···7481748174827482(* check pretty-printing of local module open in core_type *)
74837483type t = String.( t )
74847484+74857485+(* Utf8 identifier *)
74867486+let là = function ça -> ça
···11+<!DOCTYPE html>
22+<html>
33+<head>
44+<link rel="stylesheet" href="style.css" type="text/css">
55+<meta content="text/html; charset=UTF-8" http-equiv="Content-Type">
66+<meta name="viewport" content="width=device-width, initial-scale=1">
77+<link rel="Start" href="index.html">
88+<link rel="Up" href="index.html">
99+<link title="Index of exceptions" rel=Appendix href="index_exceptions.html">
1010+<link title="Index of values" rel=Appendix href="index_values.html">
1111+<link title="Index of modules" rel=Appendix href="index_modules.html">
1212+<link title="Latin9" rel="Chapter" href="Latin9.html"><title>Latin9</title>
1313+</head>
1414+<body>
1515+<div class="navbar"> <a class="up" href="index.html" title="Index">Up</a>
1616+ </div>
1717+<h1>Module <a href="type_Latin9.html">Latin9</a></h1>
1818+1919+<pre><span id="MODULELatin9"><span class="keyword">module</span> Latin9</span>: <code class="code"><span class="keyword">sig</span></code> <a href="Latin9.html">..</a> <code class="code"><span class="keyword">end</span></code></pre><hr width="100%">
2020+2121+<pre><span id="MODULEÉté"><span class="keyword">module</span> <a href="Latin9.Été-c.html">Été</a></span>: <code class="code"><span class="keyword">sig</span></code> <a href="Latin9.Été-c.html">..</a> <code class="code"><span class="keyword">end</span></code></pre>
2222+<pre><span id="EXCEPTIONLà"><span class="keyword">exception</span> Là</span></pre>
2323+2424+<pre><span id="VALf"><span class="keyword">val</span> f</span> : <code class="type">int -> 'a</code></pre><div class="info ">
2525+<div class="info-desc">
2626+<p>Exceptions and parameters must be in latin-9 subset of unicode.
2727+ In the @since version and in the description any character (e.g 字) is accepted:</p>
2828+</div>
2929+<ul class="info-attributes">
3030+<li><b>Since</b> λ1</li>
3131+<li><b>Raises</b><ul><li><code>Là</code> स्तनति</li>
3232+<li><code>Été.Éclair</code> þunor</li>
3333+</ul></li>
3434+</ul>
3535+</div>
3636+<div class="param_info"><code class="code">éponyme</code> : ?</div>
3737+</body></html>
+18
testsuite/tests/tool-ocamldoc/Latin9.ml
···11+(* TEST
22+ ocamldoc with html;
33+*)
44+55+module Été = struct
66+ exception Éclair
77+end
88+99+exception Là
1010+1111+(** Exceptions and parameters must be in latin-9 subset of unicode.
1212+ In the \@since version and in the description any character (e.g 字) is accepted:
1313+ @since λ1
1414+ @raise Là स्तनति
1515+ @raise Été.Éclair þunor
1616+ @param éponyme ?
1717+*)
1818+let f éponyme = if Random.int 2 > éponyme then raise Été.Éclair else raise Là
···3434let input_name = Location.input_name
35353636let parse_mod_use_file name lb =
3737- let modname = Unit_info.modname_from_source name in
3737+ let modname = Unit_info.lax_modname_from_source name in
3838 let items =
3939 List.concat
4040 (List.map
+3-12
typing/env.ml
···16821682 | Mp_present ->
16831683 Lazy_backtrack.create_forced (Aident id)
1684168416851685-let is_identchar c =
16861686- (* This should be kept in sync with the [identchar_latin1] character class
16871687- in [lexer.mll] *)
16881688- match c with
16891689- | 'A'..'Z' | 'a'..'z' | '_' | '\192'..'\214'
16901690- | '\216'..'\246' | '\248'..'\255' | '\'' | '0'..'9' ->
16911691- true
16921692- | _ ->
16931693- false
16941694-16951685let rec components_of_module_maker
16961686 {cm_env; cm_prefixing_subst;
16971687 cm_path; cm_addr; cm_mty; cm_shape} : _ result =
···18971887 (* Note: we could also check here general validity of the
18981888 identifier, to protect against bad identifiers forged by -pp or
18991889 -ppx preprocessors. *)
19001900- if String.length name > 0 && not (is_identchar name.[0]) then
18901890+ if String.length name > 0 && not
18911891+ (Utf8_lexeme.starts_like_a_valid_identifier name) then
19011892 for i = 1 to String.length name - 1 do
19021893 if name.[i] = '#' then
19031894 error (Illegal_value_name(loc, name))
···25322523let unit_name_of_filename fn =
25332524 match Filename.extension fn with
25342525 | ".cmi" ->
25352535- let modname = Unit_info.modname_from_source fn in
25262526+ let modname = Unit_info.strict_modname_from_source fn in
25362527 if Unit_info.is_unit_name modname then Some modname
25372528 else None
25382529 | _ -> None
+1-20
typing/oprint.ml
···37373838let out_ident = ref print_ident
39394040-(* Check a character matches the [identchar_latin1] class from the lexer *)
4141-let is_ident_char c =
4242- match c with
4343- | 'A'..'Z' | 'a'..'z' | '_' | '\192'..'\214' | '\216'..'\246'
4444- | '\248'..'\255' | '\'' | '0'..'9' -> true
4545- | _ -> false
4646-4747-let all_ident_chars s =
4848- let rec loop s len i =
4949- if i < len then begin
5050- if is_ident_char s.[i] then loop s len (i+1)
5151- else false
5252- end else begin
5353- true
5454- end
5555- in
5656- let len = String.length s in
5757- loop s len 0
5858-5940let parenthesized_ident name =
6041 (List.mem name ["or"; "mod"; "land"; "lor"; "lxor"; "lsl"; "lsr"; "asr"])
6161- || not (all_ident_chars name)
4242+ || not (Misc.Utf8_lexeme.is_valid_identifier name)
62436344let value_ident ppf name =
6445 if parenthesized_ident name then
+21-15
utils/load_path.ml
···105105 order. *)
106106let prepend_add dir =
107107 List.iter (fun base ->
108108- let fn = Filename.concat dir.Dir.path base in
109109- let filename = Misc.normalized_unit_filename base in
110110- if dir.Dir.hidden then begin
111111- STbl.replace !hidden_files base fn;
112112- STbl.replace !hidden_files_uncap filename fn
113113- end else begin
114114- STbl.replace !visible_files base fn;
115115- STbl.replace !visible_files_uncap filename fn
116116- end
108108+ Result.iter (fun filename ->
109109+ let fn = Filename.concat dir.Dir.path base in
110110+ if dir.Dir.hidden then begin
111111+ STbl.replace !hidden_files base fn;
112112+ STbl.replace !hidden_files_uncap filename fn
113113+ end else begin
114114+ STbl.replace !visible_files base fn;
115115+ STbl.replace !visible_files_uncap filename fn
116116+ end)
117117+ (Misc.normalized_unit_filename base)
117118 ) dir.Dir.files
118119119120let init ~auto_include ~visible ~hidden =
···150151 in
151152 List.iter
152153 (fun base ->
153153- let fn = Filename.concat dir.Dir.path base in
154154- update base fn visible_files hidden_files;
155155- let ubase = Misc.normalized_unit_filename base in
156156- update ubase fn visible_files_uncap hidden_files_uncap)
154154+ Result.iter (fun ubase ->
155155+ let fn = Filename.concat dir.Dir.path base in
156156+ update base fn visible_files hidden_files;
157157+ update ubase fn visible_files_uncap hidden_files_uncap
158158+ )
159159+ (Misc.normalized_unit_filename base)
160160+ )
157161 dir.files;
158162 if dir.hidden then
159163 hidden_dirs := dir :: !hidden_dirs
···216220217221let find_normalized_with_visibility fn =
218222 assert (not Config.merlin || Local_store.is_bound ());
223223+ match Misc.normalized_unit_filename fn with
224224+ | Error _ -> raise Not_found
225225+ | Ok fn_uncap ->
219226 try
220227 if is_basename fn && not !Sys.interactive then
221221- find_file_in_cache (Misc.normalized_unit_filename fn)
228228+ find_file_in_cache fn_uncap
222229 visible_files_uncap hidden_files_uncap
223230 else
224231 try
···227234 | Not_found ->
228235 (Misc.find_in_path_normalized (get_hidden_path_list ()) fn, Hidden)
229236 with Not_found ->
230230- let fn_uncap = Misc.normalized_unit_filename fn in
231237 (!auto_include_callback Dir.find_normalized fn_uncap, Visible)
232238233239let find_normalized fn = fst (find_normalized_with_visibility fn)
+234-2
utils/misc.ml
···260260 external compare : 'a -> 'a -> int = "%compare"
261261end
262262263263+(** {1 Minimal support for Unicode characters in identifiers} *)
264264+265265+module Utf8_lexeme = struct
266266+267267+ type t = string
268268+269269+ (* Non-ASCII letters that are allowed in identifiers (currently: Latin-9) *)
270270+271271+ type case = Upper of Uchar.t | Lower of Uchar.t
272272+ let known_chars : (Uchar.t, case) Hashtbl.t = Hashtbl.create 32
273273+274274+ let _ =
275275+ List.iter
276276+ (fun (upper, lower) ->
277277+ let upper = Uchar.of_int upper and lower = Uchar.of_int lower in
278278+ Hashtbl.add known_chars upper (Upper lower);
279279+ Hashtbl.add known_chars lower (Lower upper))
280280+ [
281281+ (0xc0, 0xe0); (* À, à *) (0xc1, 0xe1); (* Á, á *)
282282+ (0xc2, 0xe2); (* Â, â *) (0xc3, 0xe3); (* Ã, ã *)
283283+ (0xc4, 0xe4); (* Ä, ä *) (0xc5, 0xe5); (* Å, å *)
284284+ (0xc6, 0xe6); (* Æ, æ *) (0xc7, 0xe7); (* Ç, ç *)
285285+ (0xc8, 0xe8); (* È, è *) (0xc9, 0xe9); (* É, é *)
286286+ (0xca, 0xea); (* Ê, ê *) (0xcb, 0xeb); (* Ë, ë *)
287287+ (0xcc, 0xec); (* Ì, ì *) (0xcd, 0xed); (* Í, í *)
288288+ (0xce, 0xee); (* Î, î *) (0xcf, 0xef); (* Ï, ï *)
289289+ (0xd0, 0xf0); (* Ð, ð *) (0xd1, 0xf1); (* Ñ, ñ *)
290290+ (0xd2, 0xf2); (* Ò, ò *) (0xd3, 0xf3); (* Ó, ó *)
291291+ (0xd4, 0xf4); (* Ô, ô *) (0xd5, 0xf5); (* Õ, õ *)
292292+ (0xd6, 0xf6); (* Ö, ö *) (0xd8, 0xf8); (* Ø, ø *)
293293+ (0xd9, 0xf9); (* Ù, ù *) (0xda, 0xfa); (* Ú, ú *)
294294+ (0xdb, 0xfb); (* Û, û *) (0xdc, 0xfc); (* Ü, ü *)
295295+ (0xdd, 0xfd); (* Ý, ý *) (0xde, 0xfe); (* Þ, þ *)
296296+ (0x160, 0x161); (* Š, š *) (0x17d, 0x17e); (* Ž, ž *)
297297+ (0x152, 0x153); (* Œ, œ *) (0x178, 0xff); (* Ÿ, ÿ *)
298298+ (0x1e9e, 0xdf); (* ẞ, ß *)
299299+ ]
300300+301301+ (* NFD to NFC conversion table for the letters above *)
302302+303303+ let known_pairs : (Uchar.t * Uchar.t, Uchar.t) Hashtbl.t = Hashtbl.create 32
304304+305305+ let _ =
306306+ List.iter
307307+ (fun (c1, n2, n) ->
308308+ Hashtbl.add known_pairs
309309+ (Uchar.of_char c1, Uchar.of_int n2) (Uchar.of_int n))
310310+ [
311311+ ('A', 0x300, 0xc0); (* À *) ('A', 0x301, 0xc1); (* Á *)
312312+ ('A', 0x302, 0xc2); (* Â *) ('A', 0x303, 0xc3); (* Ã *)
313313+ ('A', 0x308, 0xc4); (* Ä *) ('A', 0x30a, 0xc5); (* Å *)
314314+ ('C', 0x327, 0xc7); (* Ç *) ('E', 0x300, 0xc8); (* È *)
315315+ ('E', 0x301, 0xc9); (* É *) ('E', 0x302, 0xca); (* Ê *)
316316+ ('E', 0x308, 0xcb); (* Ë *) ('I', 0x300, 0xcc); (* Ì *)
317317+ ('I', 0x301, 0xcd); (* Í *) ('I', 0x302, 0xce); (* Î *)
318318+ ('I', 0x308, 0xcf); (* Ï *) ('N', 0x303, 0xd1); (* Ñ *)
319319+ ('O', 0x300, 0xd2); (* Ò *) ('O', 0x301, 0xd3); (* Ó *)
320320+ ('O', 0x302, 0xd4); (* Ô *) ('O', 0x303, 0xd5); (* Õ *)
321321+ ('O', 0x308, 0xd6); (* Ö *)
322322+ ('U', 0x300, 0xd9); (* Ù *) ('U', 0x301, 0xda); (* Ú *)
323323+ ('U', 0x302, 0xdb); (* Û *) ('U', 0x308, 0xdc); (* Ü *)
324324+ ('Y', 0x301, 0xdd); (* Ý *) ('Y', 0x308, 0x178); (* Ÿ *)
325325+ ('S', 0x30c, 0x160); (* Š *) ('Z', 0x30c, 0x17d); (* Ž *)
326326+ ('a', 0x300, 0xe0); (* à *) ('a', 0x301, 0xe1); (* á *)
327327+ ('a', 0x302, 0xe2); (* â *) ('a', 0x303, 0xe3); (* ã *)
328328+ ('a', 0x308, 0xe4); (* ä *) ('a', 0x30a, 0xe5); (* å *)
329329+ ('c', 0x327, 0xe7); (* ç *) ('e', 0x300, 0xe8); (* è *)
330330+ ('e', 0x301, 0xe9); (* é *) ('e', 0x302, 0xea); (* ê *)
331331+ ('e', 0x308, 0xeb); (* ë *) ('i', 0x300, 0xec); (* ì *)
332332+ ('i', 0x301, 0xed); (* í *) ('i', 0x302, 0xee); (* î *)
333333+ ('i', 0x308, 0xef); (* ï *) ('n', 0x303, 0xf1); (* ñ *)
334334+ ('o', 0x300, 0xf2); (* ò *) ('o', 0x301, 0xf3); (* ó *)
335335+ ('o', 0x302, 0xf4); (* ô *) ('o', 0x303, 0xf5); (* õ *)
336336+ ('o', 0x308, 0xf6); (* ö *)
337337+ ('u', 0x300, 0xf9); (* ù *) ('u', 0x301, 0xfa); (* ú *)
338338+ ('u', 0x302, 0xfb); (* û *) ('u', 0x308, 0xfc); (* ü *)
339339+ ('y', 0x301, 0xfd); (* ý *) ('y', 0x308, 0xff); (* ÿ *)
340340+ ('s', 0x30c, 0x161); (* š *) ('z', 0x30c, 0x17e); (* ž *)
341341+ ]
342342+343343+ let normalize_generic ~keep_ascii transform s =
344344+ let rec norm check buf prev i =
345345+ if i >= String.length s then begin
346346+ Buffer.add_utf_8_uchar buf (transform prev)
347347+ end else begin
348348+ let d = String.get_utf_8_uchar s i in
349349+ let u = Uchar.utf_decode_uchar d in
350350+ check d u;
351351+ let i' = i + Uchar.utf_decode_length d in
352352+ match Hashtbl.find_opt known_pairs (prev, u) with
353353+ | Some u' ->
354354+ norm check buf u' i'
355355+ | None ->
356356+ Buffer.add_utf_8_uchar buf (transform prev);
357357+ norm check buf u i'
358358+ end in
359359+ let ascii_limit = 128 in
360360+ if s = ""
361361+ || keep_ascii && String.for_all (fun x -> Char.code x < ascii_limit) s
362362+ then Ok s
363363+ else
364364+ let buf = Buffer.create (String.length s) in
365365+ let valid = ref true in
366366+ let check d u =
367367+ valid := !valid && Uchar.utf_decode_is_valid d && u <> Uchar.rep
368368+ in
369369+ let d = String.get_utf_8_uchar s 0 in
370370+ let u = Uchar.utf_decode_uchar d in
371371+ check d u;
372372+ norm check buf u (Uchar.utf_decode_length d);
373373+ let contents = Buffer.contents buf in
374374+ if !valid then
375375+ Ok contents
376376+ else
377377+ Error contents
378378+379379+ let normalize s =
380380+ normalize_generic ~keep_ascii:true (fun u -> u) s
381381+382382+ (* Capitalization *)
383383+384384+ let uchar_is_uppercase u =
385385+ let c = Uchar.to_int u in
386386+ if c < 0x80 then c >= 65 && c <= 90 else
387387+ match Hashtbl.find_opt known_chars u with
388388+ | Some(Upper _) -> true
389389+ | _ -> false
390390+391391+ let uchar_lowercase u =
392392+ let c = Uchar.to_int u in
393393+ if c < 0x80 then
394394+ if c >= 65 && c <= 90 then Uchar.of_int (c + 32) else u
395395+ else
396396+ match Hashtbl.find_opt known_chars u with
397397+ | Some(Upper u') -> u'
398398+ | _ -> u
399399+400400+ let uchar_uppercase u =
401401+ let c = Uchar.to_int u in
402402+ if c < 0x80 then
403403+ if c >= 97 && c <= 122 then Uchar.of_int (c - 32) else u
404404+ else
405405+ match Hashtbl.find_opt known_chars u with
406406+ | Some(Lower u') -> u'
407407+ | _ -> u
408408+409409+ let capitalize s =
410410+ let first = ref true in
411411+ normalize_generic ~keep_ascii:false
412412+ (fun u -> if !first then (first := false; uchar_uppercase u) else u)
413413+ s
414414+415415+ let uncapitalize s =
416416+ let first = ref true in
417417+ normalize_generic ~keep_ascii:false
418418+ (fun u -> if !first then (first := false; uchar_lowercase u) else u)
419419+ s
420420+421421+ let is_capitalized s =
422422+ s <> "" &&
423423+ uchar_is_uppercase (Uchar.utf_decode_uchar (String.get_utf_8_uchar s 0))
424424+425425+ (* Characters allowed in identifiers after normalization is applied.
426426+ Currently:
427427+ - ASCII letters, underscore
428428+ - Latin-9 letters, represented in NFC
429429+ - ASCII digits, single quote (but not as first character)
430430+ - dot if [with_dot] = true
431431+ *)
432432+ let uchar_valid_in_identifier ~with_dot u =
433433+ let c = Uchar.to_int u in
434434+ if c < 0x80 then
435435+ c >= 97 (* a *) && c <= 122 (* z *)
436436+ || c >= 65 (* A *) && c <= 90 (* Z *)
437437+ || c >= 48 (* 0 *) && c <= 57 (* 9 *)
438438+ || c = 95 (* underscore *)
439439+ || c = 39 (* single quote *)
440440+ || (with_dot && c = 46) (* dot *)
441441+ else
442442+ Hashtbl.mem known_chars u
443443+444444+ let uchar_not_identifier_start u =
445445+ let c = Uchar.to_int u in
446446+ c >= 48 (* 0 *) && c <= 57 (* 9 *)
447447+ || c = 39 (* single quote *)
448448+449449+ (* Check whether a normalized string is a valid OCaml identifier. *)
450450+451451+ type validation_result =
452452+ | Valid
453453+ | Invalid_character of Uchar.t (** Character not allowed *)
454454+ | Invalid_beginning of Uchar.t (** Character not allowed as first char *)
455455+456456+ let validate_identifier ?(with_dot=false) s =
457457+ let rec check i =
458458+ if i >= String.length s then Valid else begin
459459+ let d = String.get_utf_8_uchar s i in
460460+ let u = Uchar.utf_decode_uchar d in
461461+ let i' = i + Uchar.utf_decode_length d in
462462+ if not (uchar_valid_in_identifier ~with_dot u) then
463463+ Invalid_character u
464464+ else if i = 0 && uchar_not_identifier_start u then
465465+ Invalid_beginning u
466466+ else
467467+ check i'
468468+ end
469469+ in check 0
470470+471471+ let is_valid_identifier s =
472472+ validate_identifier s = Valid
473473+474474+ let starts_like_a_valid_identifier s =
475475+ s <> "" &&
476476+ (let u = Uchar.utf_decode_uchar (String.get_utf_8_uchar s 0) in
477477+ uchar_valid_in_identifier ~with_dot:false u
478478+ && not (uchar_not_identifier_start u))
479479+480480+ let is_lowercase s =
481481+ let rec is_lowercase_at len s n =
482482+ if n >= len then true
483483+ else
484484+ let d = String.get_utf_8_uchar s n in
485485+ let u = Uchar.utf_decode_uchar d in
486486+ (uchar_valid_in_identifier ~with_dot:false u)
487487+ && not (uchar_is_uppercase u)
488488+ && is_lowercase_at len s (n+Uchar.utf_decode_length d)
489489+ in
490490+ is_lowercase_at (String.length s) s 0
491491+end
492492+263493(* File functions *)
264494265495let find_in_path path name =
···290520 if Sys.file_exists fullname then fullname else try_dir rem
291521 in try_dir path
292522293293-let normalized_unit_filename = String.uncapitalize_ascii
523523+let normalized_unit_filename = Utf8_lexeme.uncapitalize
294524295525let find_in_path_normalized path name =
296296- let uname = normalized_unit_filename name in
526526+ match normalized_unit_filename name with
527527+ | Error _ -> raise Not_found
528528+ | Ok uname ->
297529 let rec try_dir = function
298530 [] -> raise Not_found
299531 | dir::rem ->
+63-2
utils/misc.mli
···217217val find_in_path_rel: string list -> string -> string
218218 (** Search a relative file in a list of directories. *)
219219220220- (** Normalize file name [Foo.ml] to [foo.ml] *)
221221-val normalized_unit_filename: string -> string
220220+ (** Normalize file name [Foo.ml] to [foo.ml], using NFC and case-folding.
221221+ Return [Error] if the input is not a valid utf-8 byte sequence *)
222222+val normalized_unit_filename: string -> (string,string) Result.t
222223223224val find_in_path_normalized: string list -> string -> string
224225(** Same as {!find_in_path_rel} , but search also for normalized unit filename,
···760761 *)
761762762763 val all_kinds : kind list
764764+end
765765+766766+(** {1 Minimal support for Unicode characters in identifiers} *)
767767+768768+(** Characters allowed in identifiers are, currently:
769769+ - ASCII letters A-Z a-z
770770+ - Latin-1 letters (U+00C0 - U+00FF except U+00D7 and U+00F7)
771771+ - Character sequences which normalize to the above character under NFC
772772+ - digits 0-9, underscore, single quote
773773+*)
774774+775775+module Utf8_lexeme: sig
776776+ type t = string
777777+778778+ val normalize: string -> (t,t) Result.t
779779+ (** Normalize the given UTF-8 encoded string.
780780+ Invalid UTF-8 sequences results in a error and are replaced
781781+ by U+FFFD.
782782+ Identifier characters are put in NFC normalized form.
783783+ Other Unicode characters are left unchanged. *)
784784+785785+ val capitalize: string -> (t,t) Result.t
786786+ (** Like [normalize], but if the string starts with a lowercase identifier
787787+ character, it is replaced by the corresponding uppercase character.
788788+ Subsequent characters are not changed. *)
789789+790790+ val uncapitalize: string -> (t,t) Result.t
791791+ (** Like [normalize], but if the string starts with an uppercase identifier
792792+ character, it is replaced by the corresponding lowercase character.
793793+ Subsequent characters are not changed. *)
794794+795795+ val is_capitalized: t -> bool
796796+ (** Returns [true] if the given normalized string starts with an
797797+ uppercase identifier character, [false] otherwise. May return
798798+ wrong results if the string is not normalized. *)
799799+800800+ val is_valid_identifier: t -> bool
801801+ (** Check whether the given normalized string is a valid OCaml identifier:
802802+ - all characters are identifier characters
803803+ - it does not start with a digit or a single quote
804804+ *)
805805+806806+ val is_lowercase: t -> bool
807807+ (** Returns [true] if the given normalized string only contains lowercase
808808+ identifier character, [false] otherwise. May return wrong results if the
809809+ string is not normalized. *)
810810+811811+ type validation_result =
812812+ | Valid
813813+ | Invalid_character of Uchar.t (** Character not allowed *)
814814+ | Invalid_beginning of Uchar.t (** Character not allowed as first char *)
815815+816816+ val validate_identifier: ?with_dot:bool -> t -> validation_result
817817+ (** Like [is_valid_identifier], but returns a more detailed error code. Dots
818818+ can be allowed to extend support to path-like identifiers. *)
819819+820820+ val starts_like_a_valid_identifier: t -> bool
821821+ (** Checks whether the given normalized string starts with an identifier
822822+ character other than a digit or a single quote. Subsequent characters
823823+ are not checked. *)
763824end
764825765826(** {1 Miscellaneous type aliases} *)