Punycode (RFC3492) in OCaml
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 177 lines 6.0 kB view raw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: ISC 4 ---------------------------------------------------------------------------*) 5 6(* IDNA (Internationalized Domain Names in Applications) Implementation *) 7 8let max_domain_length = 253 9 10(* {1 Error Types} *) 11 12type error_reason = 13 | Punycode_error of Punycode.error_reason 14 | Invalid_label of string 15 | Domain_too_long of int 16 | Normalization_failed 17 | Verification_failed 18 19let pp_error_reason fmt = function 20 | Punycode_error e -> 21 Format.fprintf fmt "Punycode error: %a" Punycode.pp_error_reason e 22 | Invalid_label msg -> Format.fprintf fmt "invalid label: %s" msg 23 | Domain_too_long len -> 24 Format.fprintf fmt "domain too long: %d bytes (max %d)" len 25 max_domain_length 26 | Normalization_failed -> Format.fprintf fmt "Unicode normalization failed" 27 | Verification_failed -> 28 Format.fprintf fmt "IDNA verification failed (round-trip mismatch)" 29 30exception Error of error_reason 31 32let () = Printexc.register_printer (function 33 | Error reason -> Some (Format.asprintf "Punycode_idna.Error: %a" pp_error_reason reason) 34 | _ -> None) 35 36let error_reason_to_string reason = Format.asprintf "%a" pp_error_reason reason 37 38(* {1 Error Constructors} *) 39 40let punycode_error e = raise (Error (Punycode_error e)) 41let invalid_label msg = raise (Error (Invalid_label msg)) 42let domain_too_long len = raise (Error (Domain_too_long len)) 43let verification_failed () = raise (Error Verification_failed) 44 45(* {1 Unicode Normalization} *) 46 47let normalize_nfc s = Uunf_string.normalize_utf_8 `NFC s 48 49(* {1 Validation Helpers} *) 50 51let is_ace_label label = Punycode.has_ace_prefix label 52 53(* Check if a label follows STD3 rules (hostname restrictions): 54 - Only LDH (letters, digits, hyphens) 55 - Cannot start or end with hyphen *) 56let is_std3_valid label = 57 let len = String.length label in 58 let is_ldh c = 59 (c >= 'a' && c <= 'z') 60 || (c >= 'A' && c <= 'Z') 61 || (c >= '0' && c <= '9') 62 || c = '-' 63 in 64 len > 0 65 && label.[0] <> '-' 66 && label.[len - 1] <> '-' 67 && String.for_all is_ldh label 68 69(* Check hyphen placement: hyphens not in positions 3 and 4 (except for ACE) *) 70let check_hyphen_rules label = 71 let len = String.length label in 72 if len >= 4 && label.[2] = '-' && label.[3] = '-' then 73 (* Hyphens in positions 3 and 4 - only valid for ACE prefix *) 74 is_ace_label label 75 else true 76 77(* {1 Label Operations} *) 78 79let label_to_ascii_impl ~check_hyphens ~use_std3_rules label = 80 let len = String.length label in 81 if len = 0 then invalid_label "empty label" 82 else if len > Punycode.max_label_length then 83 punycode_error (Punycode.Label_too_long len) 84 else if Punycode.is_ascii_string label then begin 85 (* All ASCII - validate and pass through *) 86 if use_std3_rules && not (is_std3_valid label) then 87 invalid_label "STD3 rules violation" 88 else if check_hyphens && not (check_hyphen_rules label) then 89 invalid_label "invalid hyphen placement" 90 else label 91 end 92 else begin 93 (* Has non-ASCII - normalize and encode *) 94 let normalized = normalize_nfc label in 95 96 (* Encode to Punycode *) 97 let encoded = 98 try Punycode.encode_utf8 normalized 99 with Punycode.Error e -> punycode_error e 100 in 101 let result = Punycode.ace_prefix ^ encoded in 102 let result_len = String.length result in 103 if result_len > Punycode.max_label_length then 104 punycode_error (Punycode.Label_too_long result_len) 105 else if check_hyphens && not (check_hyphen_rules result) then 106 invalid_label "invalid hyphen placement in encoded label" 107 else 108 (* Verification: decode and compare to original normalized form *) 109 let decoded = 110 try Punycode.decode_utf8 encoded 111 with Punycode.Error _ -> verification_failed () 112 in 113 if decoded <> normalized then verification_failed () else result 114 end 115 116let label_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) label = 117 label_to_ascii_impl ~check_hyphens ~use_std3_rules label 118 119let label_to_unicode label = 120 if is_ace_label label then begin 121 let encoded = String.sub label 4 (String.length label - 4) in 122 try Punycode.decode_utf8 encoded 123 with Punycode.Error e -> punycode_error e 124 end 125 else label 126 127(* {1 Domain Operations} *) 128 129(* Split domain into labels *) 130let split_domain domain = String.split_on_char '.' domain 131 132(* Join labels into domain *) 133let join_labels labels = String.concat "." labels 134 135let to_ascii ?(check_hyphens = true) ?(check_bidi = false) 136 ?(check_joiners = false) ?(use_std3_rules = false) ?(transitional = false) 137 domain = 138 (* Note: check_bidi, check_joiners, and transitional are accepted but 139 not fully implemented - they would require additional Unicode data *) 140 let _ = check_bidi in 141 let _ = check_joiners in 142 let _ = transitional in 143 144 let labels = split_domain domain in 145 let encoded_labels = 146 List.map (label_to_ascii_impl ~check_hyphens ~use_std3_rules) labels 147 in 148 let result = join_labels encoded_labels in 149 let len = String.length result in 150 if len > max_domain_length then domain_too_long len else result 151 152let to_unicode domain = 153 let labels = split_domain domain in 154 let decoded_labels = List.map label_to_unicode labels in 155 join_labels decoded_labels 156 157(* {1 Domain Name Library Integration} *) 158 159let domain_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) domain = 160 let s = Domain_name.to_string domain in 161 let ascii = to_ascii ~check_hyphens ~use_std3_rules s in 162 match Domain_name.of_string ascii with 163 | Error (`Msg msg) -> invalid_label msg 164 | Ok d -> d 165 166let domain_to_unicode domain = 167 let s = Domain_name.to_string domain in 168 let unicode = to_unicode s in 169 match Domain_name.of_string unicode with 170 | Error (`Msg msg) -> invalid_label msg 171 | Ok d -> d 172 173(* {1 Validation} *) 174 175let is_idna_valid domain = 176 try ignore (to_ascii domain); true 177 with Error _ -> false