The unpac monorepo manager self-hosting as a monorepo using unpac
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Reject invalid unicode encoding in module names

We reject invalid utf8 encoding and strings containing the replacement
character U+FFFD in identifiers and output targets.

Similarly, we ignore existing artifacts with encoding errors whenever we are
doing an artifact look-up in load paths.

+103 -32
+1
parsing/lexer.mli
··· 36 36 | Capitalized_label of string 37 37 | Invalid_literal of string 38 38 | Invalid_directive of string * string option 39 + | Invalid_encoding of string 39 40 | Invalid_char_in_ident of Uchar.t 40 41 | Capitalized_raw_identifier of string 41 42
+6 -1
parsing/lexer.mll
··· 32 32 | Capitalized_label of string 33 33 | Invalid_literal of string 34 34 | Invalid_directive of string * string option 35 + | Invalid_encoding of string 35 36 | Invalid_char_in_ident of Uchar.t 36 37 | Capitalized_raw_identifier of string 37 38 ··· 259 260 (Printf.sprintf "%X is not a Unicode scalar value" cp) 260 261 261 262 let ident_for_extended lexbuf raw_name = 262 - let name = UString.normalize raw_name in 263 + match UString.normalize raw_name with 264 + | Error _ -> error lexbuf (Invalid_encoding raw_name) 265 + | Ok name -> 263 266 match UString.validate_identifier name with 264 267 | UString.Valid -> name 265 268 | UString.Invalid_character u -> error lexbuf (Invalid_char_in_ident u) ··· 350 353 (fun ppf -> match explanation with 351 354 | None -> () 352 355 | Some expl -> fprintf ppf ": %s" expl) 356 + | Invalid_encoding s -> 357 + Location.errorf ~loc "Invalid encoding of identifier %s." s 353 358 | Invalid_char_in_ident u -> 354 359 Location.errorf ~loc "Invalid character U+%X in identifier" 355 360 (Uchar.to_int u)
+33 -5
parsing/unit_info.ml
··· 17 17 type filename = string 18 18 type file_prefix = string 19 19 20 + type error = Invalid_encoding of string 21 + exception Error of error 22 + 20 23 type t = { 21 24 source_file: filename; 22 25 prefix: file_prefix; ··· 32 35 | dot_pos -> String.sub basename 0 dot_pos 33 36 | exception Not_found -> basename 34 37 35 - let modulize s = Misc.UString.capitalize s 38 + let strict_modulize s = 39 + match Misc.UString.capitalize s with 40 + | Ok x -> x 41 + | Error _ -> raise (Error (Invalid_encoding s)) 42 + 43 + let modulize s = match Misc.UString.capitalize s with Ok x | Error x -> x 36 44 37 - (* We re-export the [Misc] definition *) 38 - let normalize = Misc.normalized_unit_filename 45 + (* We re-export the [Misc] definition, and ignore encoding errors under the 46 + assumption that we should focus our effort on not *producing* badly encoded 47 + module names *) 48 + let normalize x = match Misc.normalized_unit_filename x with | 49 + Ok x | Error x -> x 50 + 51 + let stem source_file = 52 + source_file |> Filename.basename |> basename_chop_extensions 53 + 54 + let strict_modname_from_source source_file = 55 + source_file |> stem |> strict_modulize 39 56 40 57 let modname_from_source source_file = 41 - source_file |> Filename.basename |> basename_chop_extensions |> modulize 58 + source_file |> stem |> modulize 42 59 43 60 (* Check validity of module name *) 44 61 let is_unit_name name = Misc.UString.is_valid_identifier name ··· 49 66 (Warnings.Bad_module_name (modname file)) 50 67 51 68 let make ?(check_modname=true) ~source_file prefix = 52 - let modname = modname_from_source prefix in 69 + let modname = strict_modname_from_source prefix in 53 70 let p = { modname; prefix; source_file } in 54 71 if check_modname then check_unit_name p; 55 72 p ··· 108 125 let filename = modname f ^ ".cmi" in 109 126 let filename = Load_path.find_normalized filename in 110 127 { Artifact.filename; modname = modname f; source_file = Some f.source_file } 128 + 129 + let report_error = function 130 + | Invalid_encoding name -> 131 + Location.errorf "Invalid encoding of output name: %s." name 132 + 133 + let () = 134 + Location.register_error_of_exn 135 + (function 136 + | Error err -> Some (report_error err) 137 + | _ -> None 138 + )
+7
parsing/unit_info.mli
··· 25 25 type filename = string 26 26 type file_prefix = string 27 27 28 + type error = Invalid_encoding of filename 29 + exception Error of error 30 + 28 31 (** [modulize s] capitalizes the first letter of [s]. *) 29 32 val modulize: string -> modname 30 33 ··· 35 38 basename of the filename [filename] stripped from all its extensions. 36 39 For instance, [modname_from_source "/pa.th/x.ml.pp"] is ["X"]. *) 37 40 val modname_from_source: filename -> modname 41 + 42 + (** Same as {!modname_from_source} but raises an {!error.Invalid_encoding} error 43 + on filename with invalid utf8 encoding. *) 44 + val strict_modname_from_source: filename -> modname 38 45 39 46 (** {2:module_name_validation Module name validation function}*) 40 47
+5
testsuite/tests/lexing/reject_bad_encoding.compilers.reference
··· 1 + Line 4, characters 8-10: 2 + 4 | let x = �a.x;; 3 + ^^ 4 + Error: Invalid encoding of identifier �a. 5 +
+4
testsuite/tests/lexing/reject_bad_encoding.ml
··· 1 + (* TEST 2 + toplevel; 3 + *) 4 + let x = �a.x;;
+1 -1
typing/env.ml
··· 2523 2523 let unit_name_of_filename fn = 2524 2524 match Filename.extension fn with 2525 2525 | ".cmi" -> 2526 - let modname = Unit_info.modname_from_source fn in 2526 + let modname = Unit_info.strict_modname_from_source fn in 2527 2527 if Unit_info.is_unit_name modname then Some modname 2528 2528 else None 2529 2529 | _ -> None
+21 -15
utils/load_path.ml
··· 105 105 order. *) 106 106 let prepend_add dir = 107 107 List.iter (fun base -> 108 - let fn = Filename.concat dir.Dir.path base in 109 - let filename = Misc.normalized_unit_filename base in 110 - if dir.Dir.hidden then begin 111 - STbl.replace !hidden_files base fn; 112 - STbl.replace !hidden_files_uncap filename fn 113 - end else begin 114 - STbl.replace !visible_files base fn; 115 - STbl.replace !visible_files_uncap filename fn 116 - end 108 + Result.iter (fun filename -> 109 + let fn = Filename.concat dir.Dir.path base in 110 + if dir.Dir.hidden then begin 111 + STbl.replace !hidden_files base fn; 112 + STbl.replace !hidden_files_uncap filename fn 113 + end else begin 114 + STbl.replace !visible_files base fn; 115 + STbl.replace !visible_files_uncap filename fn 116 + end) 117 + (Misc.normalized_unit_filename base) 117 118 ) dir.Dir.files 118 119 119 120 let init ~auto_include ~visible ~hidden = ··· 150 151 in 151 152 List.iter 152 153 (fun base -> 153 - let fn = Filename.concat dir.Dir.path base in 154 - update base fn visible_files hidden_files; 155 - let ubase = Misc.normalized_unit_filename base in 156 - update ubase fn visible_files_uncap hidden_files_uncap) 154 + Result.iter (fun ubase -> 155 + let fn = Filename.concat dir.Dir.path base in 156 + update base fn visible_files hidden_files; 157 + update ubase fn visible_files_uncap hidden_files_uncap 158 + ) 159 + (Misc.normalized_unit_filename base) 160 + ) 157 161 dir.files; 158 162 if dir.hidden then 159 163 hidden_dirs := dir :: !hidden_dirs ··· 216 220 217 221 let find_normalized_with_visibility fn = 218 222 assert (not Config.merlin || Local_store.is_bound ()); 223 + match Misc.normalized_unit_filename fn with 224 + | Error _ -> raise Not_found 225 + | Ok fn_uncap -> 219 226 try 220 227 if is_basename fn && not !Sys.interactive then 221 - find_file_in_cache (Misc.normalized_unit_filename fn) 228 + find_file_in_cache fn_uncap 222 229 visible_files_uncap hidden_files_uncap 223 230 else 224 231 try ··· 227 234 | Not_found -> 228 235 (Misc.find_in_path_normalized (get_hidden_path_list ()) fn, Hidden) 229 236 with Not_found -> 230 - let fn_uncap = Misc.normalized_unit_filename fn in 231 237 (!auto_include_callback Dir.find_normalized fn_uncap, Visible) 232 238 233 239 let find_normalized fn = fst (find_normalized_with_visibility fn)
+17 -4
utils/misc.ml
··· 342 342 343 343 let normalize_generic transform s = 344 344 let buf = Buffer.create (String.length s) in 345 + let valid = ref true in 346 + let check d u = 347 + valid := !valid && Uchar.utf_decode_is_valid d && u <> Uchar.rep 348 + in 345 349 let rec norm prev i = 346 350 if i >= String.length s then begin 347 351 Buffer.add_utf_8_uchar buf (transform prev) 348 352 end else begin 349 353 let d = String.get_utf_8_uchar s i in 350 354 let u = Uchar.utf_decode_uchar d in 355 + check d u; 351 356 let i' = i + Uchar.utf_decode_length d in 352 357 match Hashtbl.find_opt known_pairs (prev, u) with 353 358 | Some u' -> ··· 356 361 Buffer.add_utf_8_uchar buf (transform prev); 357 362 norm u i' 358 363 end in 359 - if s = "" then s else begin 364 + if s = "" then Ok s else begin 360 365 let d = String.get_utf_8_uchar s 0 in 361 - norm (Uchar.utf_decode_uchar d) (Uchar.utf_decode_length d); 362 - Buffer.contents buf 366 + let u = Uchar.utf_decode_uchar d in 367 + check d u; 368 + norm u (Uchar.utf_decode_length d); 369 + let contents = Buffer.contents buf in 370 + if !valid then 371 + Ok contents 372 + else 373 + Error contents 363 374 end 364 375 365 376 let normalize s = ··· 493 504 let normalized_unit_filename = UString.uncapitalize 494 505 495 506 let find_in_path_normalized path name = 496 - let uname = normalized_unit_filename name in 507 + match normalized_unit_filename name with 508 + | Error _ -> raise Not_found 509 + | Ok uname -> 497 510 let rec try_dir = function 498 511 [] -> raise Not_found 499 512 | dir::rem ->
+8 -6
utils/misc.mli
··· 217 217 val find_in_path_rel: string list -> string -> string 218 218 (** Search a relative file in a list of directories. *) 219 219 220 - (** Normalize file name [Foo.ml] to [foo.ml] *) 221 - val normalized_unit_filename: string -> string 220 + (** Normalize file name [Foo.ml] to [foo.ml] 221 + Return [Error] if the input is not a valid utf-8 byte sequence *) 222 + val normalized_unit_filename: string -> (string,string) Result.t 222 223 223 224 val find_in_path_normalized: string list -> string -> string 224 225 (** Same as {!find_in_path_rel} , but search also for normalized unit filename, ··· 798 799 module UString : sig 799 800 type t = string 800 801 801 - val normalize: string -> t 802 + val normalize: string -> (t,t) Result.t 802 803 (** Normalize the given UTF-8 encoded string. 803 - Invalid UTF-8 sequences are replaced by U+FFFD. 804 + Invalid UTF-8 sequences results in a error and are replaced 805 + by U+FFFD. 804 806 Identifier characters are put in NFC normalized form. 805 807 Other Unicode characters are left unchanged. *) 806 808 807 - val capitalize: string -> t 809 + val capitalize: string -> (t,t) Result.t 808 810 (** Like [normalize], but if the string starts with a lowercase identifier 809 811 character, it is replaced by the corresponding uppercase character. 810 812 Subsequent characters are not changed. *) 811 813 812 - val uncapitalize: string -> t 814 + val uncapitalize: string -> (t,t) Result.t 813 815 (** Like [normalize], but if the string starts with an uppercase identifier 814 816 character, it is replaced by the corresponding lowercase character. 815 817 Subsequent characters are not changed. *)