Punycode (RFC3492) in OCaml
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: ISC
4 ---------------------------------------------------------------------------*)
5
6(* IDNA (Internationalized Domain Names in Applications) Implementation *)
7
8let max_domain_length = 253
9
10(* {1 Error Types} *)
11
12type error_reason =
13 | Punycode_error of Punycode.error_reason
14 | Invalid_label of string
15 | Domain_too_long of int
16 | Normalization_failed
17 | Verification_failed
18
19let pp_error_reason fmt = function
20 | Punycode_error e ->
21 Format.fprintf fmt "Punycode error: %a" Punycode.pp_error_reason e
22 | Invalid_label msg -> Format.fprintf fmt "invalid label: %s" msg
23 | Domain_too_long len ->
24 Format.fprintf fmt "domain too long: %d bytes (max %d)" len
25 max_domain_length
26 | Normalization_failed -> Format.fprintf fmt "Unicode normalization failed"
27 | Verification_failed ->
28 Format.fprintf fmt "IDNA verification failed (round-trip mismatch)"
29
30exception Error of error_reason
31
32let () = Printexc.register_printer (function
33 | Error reason -> Some (Format.asprintf "Punycode_idna.Error: %a" pp_error_reason reason)
34 | _ -> None)
35
36let error_reason_to_string reason = Format.asprintf "%a" pp_error_reason reason
37
38(* {1 Error Constructors} *)
39
40let punycode_error e = raise (Error (Punycode_error e))
41let invalid_label msg = raise (Error (Invalid_label msg))
42let domain_too_long len = raise (Error (Domain_too_long len))
43let verification_failed () = raise (Error Verification_failed)
44
45(* {1 Unicode Normalization} *)
46
47let normalize_nfc s = Uunf_string.normalize_utf_8 `NFC s
48
49(* {1 Validation Helpers} *)
50
51let is_ace_label label = Punycode.has_ace_prefix label
52
53(* Check if a label follows STD3 rules (hostname restrictions):
54 - Only LDH (letters, digits, hyphens)
55 - Cannot start or end with hyphen *)
56let is_std3_valid label =
57 let len = String.length label in
58 let is_ldh c =
59 (c >= 'a' && c <= 'z')
60 || (c >= 'A' && c <= 'Z')
61 || (c >= '0' && c <= '9')
62 || c = '-'
63 in
64 len > 0
65 && label.[0] <> '-'
66 && label.[len - 1] <> '-'
67 && String.for_all is_ldh label
68
69(* Check hyphen placement: hyphens not in positions 3 and 4 (except for ACE) *)
70let check_hyphen_rules label =
71 let len = String.length label in
72 if len >= 4 && label.[2] = '-' && label.[3] = '-' then
73 (* Hyphens in positions 3 and 4 - only valid for ACE prefix *)
74 is_ace_label label
75 else true
76
77(* {1 Label Operations} *)
78
79let label_to_ascii_impl ~check_hyphens ~use_std3_rules label =
80 let len = String.length label in
81 if len = 0 then invalid_label "empty label"
82 else if len > Punycode.max_label_length then
83 punycode_error (Punycode.Label_too_long len)
84 else if Punycode.is_ascii_string label then begin
85 (* All ASCII - validate and pass through *)
86 if use_std3_rules && not (is_std3_valid label) then
87 invalid_label "STD3 rules violation"
88 else if check_hyphens && not (check_hyphen_rules label) then
89 invalid_label "invalid hyphen placement"
90 else label
91 end
92 else begin
93 (* Has non-ASCII - normalize and encode *)
94 let normalized = normalize_nfc label in
95
96 (* Encode to Punycode *)
97 let encoded =
98 try Punycode.encode_utf8 normalized
99 with Punycode.Error e -> punycode_error e
100 in
101 let result = Punycode.ace_prefix ^ encoded in
102 let result_len = String.length result in
103 if result_len > Punycode.max_label_length then
104 punycode_error (Punycode.Label_too_long result_len)
105 else if check_hyphens && not (check_hyphen_rules result) then
106 invalid_label "invalid hyphen placement in encoded label"
107 else
108 (* Verification: decode and compare to original normalized form *)
109 let decoded =
110 try Punycode.decode_utf8 encoded
111 with Punycode.Error _ -> verification_failed ()
112 in
113 if decoded <> normalized then verification_failed () else result
114 end
115
116let label_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) label =
117 label_to_ascii_impl ~check_hyphens ~use_std3_rules label
118
119let label_to_unicode label =
120 if is_ace_label label then begin
121 let encoded = String.sub label 4 (String.length label - 4) in
122 try Punycode.decode_utf8 encoded
123 with Punycode.Error e -> punycode_error e
124 end
125 else label
126
127(* {1 Domain Operations} *)
128
129(* Split domain into labels *)
130let split_domain domain = String.split_on_char '.' domain
131
132(* Join labels into domain *)
133let join_labels labels = String.concat "." labels
134
135let to_ascii ?(check_hyphens = true) ?(check_bidi = false)
136 ?(check_joiners = false) ?(use_std3_rules = false) ?(transitional = false)
137 domain =
138 (* Note: check_bidi, check_joiners, and transitional are accepted but
139 not fully implemented - they would require additional Unicode data *)
140 let _ = check_bidi in
141 let _ = check_joiners in
142 let _ = transitional in
143
144 let labels = split_domain domain in
145 let encoded_labels =
146 List.map (label_to_ascii_impl ~check_hyphens ~use_std3_rules) labels
147 in
148 let result = join_labels encoded_labels in
149 let len = String.length result in
150 if len > max_domain_length then domain_too_long len else result
151
152let to_unicode domain =
153 let labels = split_domain domain in
154 let decoded_labels = List.map label_to_unicode labels in
155 join_labels decoded_labels
156
157(* {1 Domain Name Library Integration} *)
158
159let domain_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) domain =
160 let s = Domain_name.to_string domain in
161 let ascii = to_ascii ~check_hyphens ~use_std3_rules s in
162 match Domain_name.of_string ascii with
163 | Error (`Msg msg) -> invalid_label msg
164 | Ok d -> d
165
166let domain_to_unicode domain =
167 let s = Domain_name.to_string domain in
168 let unicode = to_unicode s in
169 match Domain_name.of_string unicode with
170 | Error (`Msg msg) -> invalid_label msg
171 | Ok d -> d
172
173(* {1 Validation} *)
174
175let is_idna_valid domain =
176 try ignore (to_ascii domain); true
177 with Error _ -> false