My aggregated monorepo of OCaml code, automaintained
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Remove ocaml-punycode from monorepo

-13246
-17
ocaml-punycode/.gitignore
··· 1 - # OCaml build artifacts 2 - _build/ 3 - *.install 4 - *.merlin 5 - 6 - # Third-party sources (fetch locally with opam source) 7 - third_party/ 8 - 9 - # Editor and OS files 10 - .DS_Store 11 - *.swp 12 - *~ 13 - .vscode/ 14 - .idea/ 15 - 16 - # Opam local switch 17 - _opam/
-1
ocaml-punycode/.ocamlformat
··· 1 - version=0.28.1
-53
ocaml-punycode/.tangled/workflows/build.yml
··· 1 - when: 2 - - event: ["push", "pull_request"] 3 - branch: ["main"] 4 - 5 - engine: nixery 6 - 7 - dependencies: 8 - nixpkgs: 9 - - shell 10 - - stdenv 11 - - findutils 12 - - binutils 13 - - libunwind 14 - - ncurses 15 - - opam 16 - - git 17 - - gawk 18 - - gnupatch 19 - - gnum4 20 - - gnumake 21 - - gnutar 22 - - gnused 23 - - gnugrep 24 - - diffutils 25 - - gzip 26 - - bzip2 27 - - gcc 28 - - ocaml 29 - - pkg-config 30 - 31 - steps: 32 - - name: opam 33 - command: | 34 - opam init --disable-sandboxing -a -y 35 - - name: repo 36 - command: | 37 - opam repo add aoah https://tangled.org/anil.recoil.org/aoah-opam-repo.git 38 - - name: switch 39 - command: | 40 - opam install . --confirm-level=unsafe-yes --deps-only 41 - - name: build 42 - command: | 43 - opam exec -- dune build -p punycode 44 - - name: switch-test 45 - command: | 46 - opam install . --confirm-level=unsafe-yes --deps-only --with-test 47 - - name: test 48 - command: | 49 - opam exec -- dune runtest --verbose 50 - - name: doc 51 - command: | 52 - opam install -y odoc 53 - opam exec -- dune build @doc
-15
ocaml-punycode/LICENSE.md
··· 1 - ISC License 2 - 3 - Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org> 4 - 5 - Permission to use, copy, modify, and distribute this software for any 6 - purpose with or without fee is hereby granted, provided that the above 7 - copyright notice and this permission notice appear in all copies. 8 - 9 - THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 - WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 - MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 - ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 - WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 - ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 - OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-112
ocaml-punycode/README.md
··· 1 - # puny - RFC 3492 Punycode and IDNA for OCaml 2 - 3 - High-quality implementation of RFC 3492 (Punycode) with IDNA (Internationalized Domain Names in Applications) support for OCaml. Enables encoding and decoding of internationalized domain names with proper Unicode normalization. 4 - 5 - ## Key Features 6 - 7 - - **RFC 3492 Punycode**: Complete implementation of the Bootstring algorithm for encoding Unicode in ASCII-compatible form 8 - - **IDNA Support**: ToASCII and ToUnicode operations per RFC 5891 (IDNA 2008) for internationalized domain names 9 - - **Unicode Normalization**: Automatic NFC normalization using `uunf` for proper IDNA compliance 10 - - **Mixed-Case Annotation**: Optional case preservation through Punycode encoding round-trips 11 - - **Domain Integration**: Native support for the `domain-name` library 12 - - **Comprehensive Error Handling**: Detailed position tracking and RFC-compliant error reporting 13 - 14 - ## Usage 15 - 16 - ### Basic Punycode Encoding/Decoding 17 - 18 - ```ocaml 19 - (* Encode a UTF-8 string to Punycode *) 20 - let encoded = Punycode.encode_utf8 "münchen" 21 - (* = Ok "mnchen-3ya" *) 22 - 23 - (* Decode Punycode back to UTF-8 *) 24 - let decoded = Punycode.decode_utf8 "mnchen-3ya" 25 - (* = Ok "münchen" *) 26 - ``` 27 - 28 - ### Domain Label Operations 29 - 30 - ```ocaml 31 - (* Encode a domain label with ACE prefix *) 32 - let label = Punycode.encode_label "münchen" 33 - (* = Ok "xn--mnchen-3ya" *) 34 - 35 - (* Decode an ACE-prefixed label *) 36 - let original = Punycode.decode_label "xn--mnchen-3ya" 37 - (* = Ok "münchen" *) 38 - ``` 39 - 40 - ### IDNA Domain Name Conversion 41 - 42 - ```ocaml 43 - (* Convert internationalized domain to ASCII for DNS lookup *) 44 - let ascii_domain = Punycode_idna.to_ascii "münchen.example.com" 45 - (* = Ok "xn--mnchen-3ya.example.com" *) 46 - 47 - (* Convert ASCII domain back to Unicode for display *) 48 - let unicode_domain = Punycode_idna.to_unicode "xn--mnchen-3ya.example.com" 
49 - (* = Ok "münchen.example.com" *) 50 - ``` 51 - 52 - ### Working with Unicode Code Points 53 - 54 - ```ocaml 55 - (* Encode an array of Unicode code points *) 56 - let codepoints = [| Uchar.of_int 0x4ED6; Uchar.of_int 0x4EEC |] 57 - let encoded = Punycode.encode codepoints 58 - (* Result is Punycode string *) 59 - 60 - (* Decode to code points *) 61 - let decoded = Punycode.decode "ihqwcrb4cv8a8dqg056pqjye" 62 - (* Result is Uchar.t array *) 63 - ``` 64 - 65 - ### Integration with domain-name Library 66 - 67 - ```ocaml 68 - (* Convert a Domain_name.t to ASCII *) 69 - let domain = Domain_name.of_string_exn "münchen.example.com" in 70 - let ascii = Punycode_idna.domain_to_ascii domain 71 - (* = Ok (Domain_name for "xn--mnchen-3ya.example.com") *) 72 - 73 - (* Convert back to Unicode *) 74 - let unicode = Punycode_idna.domain_to_unicode ascii 75 - (* = Ok (original domain) *) 76 - ``` 77 - 78 - ## Installation 79 - 80 - ``` 81 - opam install puny 82 - ``` 83 - 84 - ## Documentation 85 - 86 - API documentation is available at https://tangled.org/@anil.recoil.org/ocaml-punycode or via: 87 - 88 - ``` 89 - opam install puny 90 - odig doc puny 91 - ``` 92 - 93 - ## Limitations 94 - 95 - The following IDNA 2008 features are not yet implemented: 96 - 97 - - **Bidi rules** (RFC 5893): Bidirectional text validation for right-to-left scripts 98 - - **Contextual joiners** (RFC 5892 Appendix A.1): Zero-width joiner/non-joiner validation 99 - 100 - These checks are disabled by default in the API. Most common use cases (European languages, CJK) work correctly without them. 
101 - 102 - ## References 103 - 104 - - [RFC 3492](https://datatracker.ietf.org/doc/html/rfc3492) - Punycode: A Bootstring encoding of Unicode for IDNA 105 - - [RFC 5891](https://datatracker.ietf.org/doc/html/rfc5891) - Internationalized Domain Names in Applications (IDNA): Protocol 106 - - [RFC 5892](https://datatracker.ietf.org/doc/html/rfc5892) - Unicode Code Points and IDNA 107 - - [RFC 5893](https://datatracker.ietf.org/doc/html/rfc5893) - Right-to-Left Scripts for IDNA 108 - - [RFC 1035](https://datatracker.ietf.org/doc/html/rfc1035) - Domain Names Implementation and Specification 109 - 110 - ## License 111 - 112 - ISC
-5
ocaml-punycode/dune
··· 1 - ; Root dune file 2 - 3 - ; Ignore third_party directory (for fetched dependency sources) 4 - 5 - (data_only_dirs third_party)
-28
ocaml-punycode/dune-project
··· 1 - (lang dune 3.20) 2 - 3 - (name punycode) 4 - 5 - (generate_opam_files true) 6 - 7 - (license ISC) 8 - (authors "Anil Madhavapeddy") 9 - (homepage "https://tangled.org/anil.recoil.org/ocaml-punycode") 10 - (maintainers "Anil Madhavapeddy <anil@recoil.org>") 11 - (bug_reports "https://tangled.org/anil.recoil.org/ocaml-punycode/issues") 12 - (maintenance_intent "(latest)") 13 - 14 - (package 15 - (name punycode) 16 - (synopsis "RFC 3492 Punycode and IDNA implementation for OCaml") 17 - (description 18 - "A high-quality implementation of RFC 3492 (Punycode) with IDNA support. 19 - Provides encoding and decoding of internationalized domain names, 20 - with proper Unicode normalization and mixed-case annotation support.") 21 - (depends 22 - (ocaml (>= 5.4.0)) 23 - (dune (>= 3.0)) 24 - (uutf (>= 1.0.0)) 25 - (uunf (>= 15.0.0)) 26 - (domain-name (>= 0.4.0)) 27 - (odoc :with-doc) 28 - (alcotest :with-test)))
-15
ocaml-punycode/fuzz/dune
··· 1 - ; Crowbar fuzz testing for punycode 2 - ; 3 - ; To run: dune exec ocaml-punycode/fuzz/fuzz_punycode.exe 4 - ; With AFL: afl-fuzz -i fuzz/corpus -o fuzz/findings -- ./_build/default/ocaml-punycode/fuzz/fuzz_punycode.exe @@ 5 - 6 - (executable 7 - (name fuzz_punycode) 8 - (modules fuzz_punycode) 9 - (libraries punycode crowbar)) 10 - 11 - (rule 12 - (alias fuzz) 13 - (deps fuzz_punycode.exe) 14 - (action 15 - (run %{exe:fuzz_punycode.exe})))
-63
ocaml-punycode/fuzz/fuzz_punycode.ml
··· 1 - (*--------------------------------------------------------------------------- 2 - Copyright (c) 2025 Thomas Gazagnaire. All rights reserved. 3 - SPDX-License-Identifier: MIT 4 - ---------------------------------------------------------------------------*) 5 - 6 - (* Crowbar-based fuzz testing for Punycode encoding/decoding *) 7 - 8 - open Crowbar 9 - 10 - (* Test that encode_utf8 never crashes on arbitrary input *) 11 - let test_encode_no_crash input = 12 - ignore (Punycode.encode_utf8 input); 13 - check true 14 - 15 - (* Test that decode_utf8 never crashes on arbitrary input *) 16 - let test_decode_no_crash input = 17 - ignore (Punycode.decode_utf8 input); 18 - check true 19 - 20 - (* Test roundtrip: encode then decode should give back original (case-insensitive) 21 - IDNA/Punycode lowercases ASCII characters during encoding per RFC 5891 *) 22 - let test_roundtrip input = 23 - match Punycode.encode_utf8 input with 24 - | Ok encoded -> ( 25 - match Punycode.decode_utf8 encoded with 26 - | Ok decoded -> 27 - (* Compare lowercase versions since IDNA lowercases ASCII *) 28 - check_eq ~pp:Format.pp_print_string 29 - (String.lowercase_ascii input) 30 - (String.lowercase_ascii decoded) 31 - | Error _ -> 32 - (* Some encoded values might not decode, that's ok for fuzz testing *) 33 - check true) 34 - | Error _ -> 35 - (* Some inputs might not encode, that's ok *) 36 - check true 37 - 38 - (* Test ASCII-only strings (should pass through mostly unchanged) *) 39 - let test_ascii_string input = 40 - if String.length input > 0 then begin 41 - let ascii_only = 42 - String.init 43 - (String.length input mod 64) 44 - (fun i -> 45 - Char.chr (Char.code input.[i mod String.length input] mod 128)) 46 - in 47 - if String.length ascii_only > 0 then 48 - ignore (Punycode.encode_utf8 ascii_only) 49 - end; 50 - check true 51 - 52 - (* Test inputs starting with ACE prefix "xn--" *) 53 - let test_ace_prefix input = 54 - let ace_input = "xn--" ^ input in 55 - ignore 
(Punycode.decode_utf8 ace_input); 56 - check true 57 - 58 - let () = 59 - add_test ~name:"punycode: encode no crash" [ bytes ] test_encode_no_crash; 60 - add_test ~name:"punycode: decode no crash" [ bytes ] test_decode_no_crash; 61 - add_test ~name:"punycode: roundtrip" [ bytes ] test_roundtrip; 62 - add_test ~name:"punycode: ascii string" [ bytes ] test_ascii_string; 63 - add_test ~name:"punycode: ace prefix" [ bytes ] test_ace_prefix
-11
ocaml-punycode/lib/dune
··· 1 - (library 2 - (name punycode) 3 - (public_name punycode) 4 - (modules punycode) 5 - (libraries uutf)) 6 - 7 - (library 8 - (name punycode_idna) 9 - (public_name punycode.idna) 10 - (modules punycode_idna) 11 - (libraries punycode uunf domain-name))
-493
ocaml-punycode/lib/punycode.ml
··· 1 - (*--------------------------------------------------------------------------- 2 - Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 - SPDX-License-Identifier: ISC 4 - ---------------------------------------------------------------------------*) 5 - 6 - (* RFC 3492 Punycode Implementation *) 7 - 8 - (* {1 Bootstring Parameters for Punycode (RFC 3492 Section 5)} *) 9 - 10 - let base = 36 11 - let tmin = 1 12 - let tmax = 26 13 - let skew = 38 14 - let damp = 700 15 - let initial_bias = 72 16 - let initial_n = 0x80 (* 128 *) 17 - let delimiter = '-' 18 - let ace_prefix = "xn--" 19 - let max_label_length = 63 20 - 21 - (* {1 Position Tracking} *) 22 - 23 - type position = { byte_offset : int; char_index : int } 24 - 25 - let position_byte_offset pos = pos.byte_offset 26 - let position_char_index pos = pos.char_index 27 - 28 - let pp_position fmt pos = 29 - Format.fprintf fmt "byte %d, char %d" pos.byte_offset pos.char_index 30 - 31 - (* {1 Error Types} *) 32 - 33 - type error = 34 - | Overflow of position 35 - | Invalid_character of position * Uchar.t 36 - | Invalid_digit of position * char 37 - | Unexpected_end of position 38 - | Invalid_utf8 of position 39 - | Label_too_long of int 40 - | Empty_label 41 - 42 - let pp_error fmt = function 43 - | Overflow pos -> 44 - Format.fprintf fmt "arithmetic overflow at %a" pp_position pos 45 - | Invalid_character (pos, u) -> 46 - Format.fprintf fmt "invalid character U+%04X at %a" (Uchar.to_int u) 47 - pp_position pos 48 - | Invalid_digit (pos, c) -> 49 - Format.fprintf fmt "invalid Punycode digit '%c' (0x%02X) at %a" c 50 - (Char.code c) pp_position pos 51 - | Unexpected_end pos -> 52 - Format.fprintf fmt "unexpected end of input at %a" pp_position pos 53 - | Invalid_utf8 pos -> 54 - Format.fprintf fmt "invalid UTF-8 sequence at %a" pp_position pos 55 - | Label_too_long len -> 56 - Format.fprintf fmt "label too long: %d bytes (max %d)" len 57 - max_label_length 58 - | Empty_label -> 
Format.fprintf fmt "empty label" 59 - 60 - let error_to_string err = Format.asprintf "%a" pp_error err 61 - 62 - (* {1 Error Constructors} *) 63 - 64 - let overflow pos = Error (Overflow pos) 65 - let invalid_character pos u = Error (Invalid_character (pos, u)) 66 - let invalid_digit pos c = Error (Invalid_digit (pos, c)) 67 - let unexpected_end pos = Error (Unexpected_end pos) 68 - let _invalid_utf8 pos = Error (Invalid_utf8 pos) 69 - let label_too_long len = Error (Label_too_long len) 70 - let empty_label = Error Empty_label 71 - 72 - (* {1 Case Flags} *) 73 - 74 - type case_flag = Uppercase | Lowercase 75 - 76 - (* {1 Basic Predicates} *) 77 - 78 - let is_basic u = Uchar.to_int u < 0x80 79 - let is_ascii_string s = String.for_all (fun c -> Char.code c < 0x80) s 80 - 81 - let has_ace_prefix s = 82 - let len = String.length s in 83 - len >= 4 84 - && (s.[0] = 'x' || s.[0] = 'X') 85 - && (s.[1] = 'n' || s.[1] = 'N') 86 - && s.[2] = '-' 87 - && s.[3] = '-' 88 - 89 - (* {1 Digit Encoding/Decoding (RFC 3492 Section 5)} 90 - 91 - Digit values: 92 - - 0-25: a-z (or A-Z) 93 - - 26-35: 0-9 94 - *) 95 - 96 - let encode_digit d case_flag = 97 - if d < 26 then Char.chr (d + if case_flag = Uppercase then 0x41 else 0x61) 98 - else Char.chr (d - 26 + 0x30) 99 - 100 - let decode_digit c = 101 - let code = Char.code c in 102 - if code >= 0x30 && code <= 0x39 then Some (code - 0x30 + 26) 103 - (* '0'-'9' -> 26-35 *) 104 - else if code >= 0x41 && code <= 0x5A then Some (code - 0x41) 105 - (* 'A'-'Z' -> 0-25 *) 106 - else if code >= 0x61 && code <= 0x7A then Some (code - 0x61) 107 - (* 'a'-'z' -> 0-25 *) 108 - else None 109 - 110 - (* Check if a character is "flagged" (uppercase) for case annotation *) 111 - let is_flagged c = 112 - let code = Char.code c in 113 - code >= 0x41 && code <= 0x5A (* 'A'-'Z' *) 114 - 115 - (* {1 Bias Adaptation (RFC 3492 Section 6.1)} *) 116 - 117 - let adapt ~delta ~numpoints ~firsttime = 118 - let delta = if firsttime then delta / damp else delta / 2 
in 119 - let delta = delta + (delta / numpoints) in 120 - let threshold = (base - tmin) * tmax / 2 in 121 - let rec loop delta k = 122 - if delta > threshold then loop (delta / (base - tmin)) (k + base) 123 - else k + ((base - tmin + 1) * delta / (delta + skew)) 124 - in 125 - loop delta 0 126 - 127 - (* {1 Overflow-Safe Arithmetic} 128 - 129 - RFC 3492 Section 6.4: Use detection to avoid overflow. 130 - A + B overflows iff B > maxint - A 131 - A + B*C overflows iff B > (maxint - A) / C 132 - *) 133 - 134 - let max_int_value = max_int 135 - 136 - let safe_mul_add a b c pos = 137 - if c = 0 then Ok a 138 - else if b > (max_int_value - a) / c then overflow pos 139 - else Ok (a + (b * c)) 140 - 141 - (* {1 UTF-8 to Code Points Conversion} *) 142 - 143 - let utf8_to_codepoints s = 144 - let len = String.length s in 145 - let acc = ref [] in 146 - let byte_offset = ref 0 in 147 - let char_index = ref 0 in 148 - let error = ref None in 149 - while !byte_offset < len && !error = None do 150 - let pos = { byte_offset = !byte_offset; char_index = !char_index } in 151 - let dec = String.get_utf_8_uchar s !byte_offset in 152 - if Uchar.utf_decode_is_valid dec then begin 153 - acc := Uchar.utf_decode_uchar dec :: !acc; 154 - byte_offset := !byte_offset + Uchar.utf_decode_length dec; 155 - incr char_index 156 - end 157 - else begin 158 - error := Some (Invalid_utf8 pos) 159 - end 160 - done; 161 - match !error with 162 - | Some e -> Error e 163 - | None -> Ok (Array.of_list (List.rev !acc)) 164 - 165 - (* {1 Code Points to UTF-8 Conversion} *) 166 - 167 - let codepoints_to_utf8 codepoints = 168 - let buf = Buffer.create (Array.length codepoints * 2) in 169 - Array.iter (Buffer.add_utf_8_uchar buf) codepoints; 170 - Buffer.contents buf 171 - 172 - (* {1 Punycode Encoding (RFC 3492 Section 6.3)} *) 173 - 174 - let encode_impl codepoints case_flags = 175 - let input_length = Array.length codepoints in 176 - if input_length = 0 then Ok "" 177 - else begin 178 - let output = 
Buffer.create (input_length * 2) in 179 - 180 - (* Copy basic code points to output *) 181 - let basic_count = ref 0 in 182 - for j = 0 to input_length - 1 do 183 - let cp = codepoints.(j) in 184 - if is_basic cp then begin 185 - let c = Uchar.to_int cp in 186 - let case = 187 - match case_flags with Some flags -> flags.(j) | None -> Lowercase 188 - in 189 - (* Preserve or apply case for ASCII letters *) 190 - let c' = 191 - if c >= 0x41 && c <= 0x5A then (* 'A'-'Z' *) 192 - if case = Lowercase then c + 0x20 else c 193 - else if c >= 0x61 && c <= 0x7A then (* 'a'-'z' *) 194 - if case = Uppercase then c - 0x20 else c 195 - else c 196 - in 197 - Buffer.add_char output (Char.chr c'); 198 - incr basic_count 199 - end 200 - done; 201 - 202 - let b = !basic_count in 203 - let h = ref b in 204 - 205 - (* Add delimiter if there were basic code points *) 206 - if b > 0 then Buffer.add_char output delimiter; 207 - 208 - (* Main encoding loop *) 209 - let n = ref initial_n in 210 - let delta = ref 0 in 211 - let bias = ref initial_bias in 212 - 213 - let result = ref (Ok ()) in 214 - 215 - while !h < input_length && !result = Ok () do 216 - (* Find minimum code point >= n *) 217 - let m = 218 - Array.fold_left 219 - (fun acc cp -> 220 - let cp_val = Uchar.to_int cp in 221 - if cp_val >= !n && cp_val < acc then cp_val else acc) 222 - max_int_value codepoints 223 - in 224 - 225 - (* Increase delta to advance state to <m, 0> *) 226 - let pos = { byte_offset = 0; char_index = !h } in 227 - match safe_mul_add !delta (m - !n) (!h + 1) pos with 228 - | Error e -> result := Error e 229 - | Ok new_delta -> 230 - delta := new_delta; 231 - n := m; 232 - 233 - (* Process each code point *) 234 - let j = ref 0 in 235 - while !j < input_length && !result = Ok () do 236 - let cp = Uchar.to_int codepoints.(!j) in 237 - let pos = { byte_offset = 0; char_index = !j } in 238 - 239 - if cp < !n then begin 240 - incr delta; 241 - if !delta = 0 then (* Overflow *) 242 - result := overflow pos 243 
- end 244 - else if cp = !n then begin 245 - (* Encode delta as variable-length integer *) 246 - let q = ref !delta in 247 - let k = ref base in 248 - let done_encoding = ref false in 249 - 250 - while not !done_encoding do 251 - let t = 252 - if !k <= !bias then tmin 253 - else if !k >= !bias + tmax then tmax 254 - else !k - !bias 255 - in 256 - if !q < t then begin 257 - (* Output final digit *) 258 - let case = 259 - match case_flags with 260 - | Some flags -> flags.(!j) 261 - | None -> Lowercase 262 - in 263 - Buffer.add_char output (encode_digit !q case); 264 - done_encoding := true 265 - end 266 - else begin 267 - (* Output intermediate digit and continue *) 268 - let digit = t + ((!q - t) mod (base - t)) in 269 - Buffer.add_char output (encode_digit digit Lowercase); 270 - q := (!q - t) / (base - t); 271 - k := !k + base 272 - end 273 - done; 274 - 275 - bias := adapt ~delta:!delta ~numpoints:(!h + 1) ~firsttime:(!h = b); 276 - delta := 0; 277 - incr h 278 - end; 279 - incr j 280 - done; 281 - 282 - incr delta; 283 - incr n 284 - done; 285 - 286 - match !result with 287 - | Error e -> Error e 288 - | Ok () -> Ok (Buffer.contents output) 289 - end 290 - 291 - let encode codepoints = encode_impl codepoints None 292 - 293 - let encode_with_case codepoints case_flags = 294 - if Array.length codepoints <> Array.length case_flags then 295 - invalid_arg "encode_with_case: array lengths must match"; 296 - encode_impl codepoints (Some case_flags) 297 - 298 - (* {1 Punycode Decoding (RFC 3492 Section 6.2)} *) 299 - 300 - let decode_impl input = 301 - let input_length = String.length input in 302 - if input_length = 0 then Ok ([||], [||]) 303 - else begin 304 - (* Find last delimiter *) 305 - let b = Option.value ~default:0 (String.rindex_opt input delimiter) in 306 - 307 - (* Copy basic code points and extract case flags *) 308 - let output = ref [] in 309 - let case_output = ref [] in 310 - let error = ref None in 311 - 312 - for j = 0 to b - 1 do 313 - if !error = 
None then begin 314 - let c = input.[j] in 315 - let pos = { byte_offset = j; char_index = j } in 316 - let code = Char.code c in 317 - if code >= 0x80 then 318 - error := Some (Invalid_character (pos, Uchar.of_int code)) 319 - else begin 320 - output := Uchar.of_int code :: !output; 321 - case_output := 322 - (if is_flagged c then Uppercase else Lowercase) :: !case_output 323 - end 324 - end 325 - done; 326 - 327 - match !error with 328 - | Some e -> Error e 329 - | None -> ( 330 - let output = ref (Array.of_list (List.rev !output)) in 331 - let case_output = ref (Array.of_list (List.rev !case_output)) in 332 - 333 - (* Main decoding loop *) 334 - let n = ref initial_n in 335 - let i = ref 0 in 336 - let bias = ref initial_bias in 337 - let in_pos = ref (if b > 0 then b + 1 else 0) in 338 - let result = ref (Ok ()) in 339 - 340 - while !in_pos < input_length && !result = Ok () do 341 - let oldi = !i in 342 - let w = ref 1 in 343 - let k = ref base in 344 - let done_decoding = ref false in 345 - 346 - while (not !done_decoding) && !result = Ok () do 347 - let pos = 348 - { byte_offset = !in_pos; char_index = Array.length !output } 349 - in 350 - 351 - if !in_pos >= input_length then begin 352 - result := unexpected_end pos; 353 - done_decoding := true 354 - end 355 - else begin 356 - let c = input.[!in_pos] in 357 - incr in_pos; 358 - 359 - match decode_digit c with 360 - | None -> 361 - result := invalid_digit pos c; 362 - done_decoding := true 363 - | Some digit -> ( 364 - (* i = i + digit * w, with overflow check *) 365 - match safe_mul_add !i digit !w pos with 366 - | Error e -> 367 - result := Error e; 368 - done_decoding := true 369 - | Ok new_i -> 370 - i := new_i; 371 - 372 - let t = 373 - if !k <= !bias then tmin 374 - else if !k >= !bias + tmax then tmax 375 - else !k - !bias 376 - in 377 - 378 - if digit < t then begin 379 - (* Record case flag from this final digit *) 380 - done_decoding := true 381 - end 382 - else begin 383 - (* w = w * (base - t), 
with overflow check *) 384 - let base_minus_t = base - t in 385 - if !w > max_int_value / base_minus_t then begin 386 - result := overflow pos; 387 - done_decoding := true 388 - end 389 - else begin 390 - w := !w * base_minus_t; 391 - k := !k + base 392 - end 393 - end) 394 - end 395 - done; 396 - 397 - if !result = Ok () then begin 398 - let out_len = Array.length !output in 399 - bias := 400 - adapt ~delta:(!i - oldi) ~numpoints:(out_len + 1) 401 - ~firsttime:(oldi = 0); 402 - 403 - let pos = { byte_offset = !in_pos - 1; char_index = out_len } in 404 - 405 - (* n = n + i / (out_len + 1), with overflow check *) 406 - let increment = !i / (out_len + 1) in 407 - if increment > max_int_value - !n then result := overflow pos 408 - else begin 409 - n := !n + increment; 410 - i := !i mod (out_len + 1); 411 - 412 - (* Validate that n is a valid Unicode scalar value *) 413 - if not (Uchar.is_valid !n) then 414 - result := invalid_character pos Uchar.rep 415 - else begin 416 - (* Insert n at position i *) 417 - let new_output = Array.make (out_len + 1) (Uchar.of_int 0) in 418 - let new_case = Array.make (out_len + 1) Lowercase in 419 - 420 - for j = 0 to !i - 1 do 421 - new_output.(j) <- !output.(j); 422 - new_case.(j) <- !case_output.(j) 423 - done; 424 - new_output.(!i) <- Uchar.of_int !n; 425 - (* Case flag from final digit of this delta *) 426 - new_case.(!i) <- 427 - (if !in_pos > 0 && is_flagged input.[!in_pos - 1] then 428 - Uppercase 429 - else Lowercase); 430 - for j = !i to out_len - 1 do 431 - new_output.(j + 1) <- !output.(j); 432 - new_case.(j + 1) <- !case_output.(j) 433 - done; 434 - 435 - output := new_output; 436 - case_output := new_case; 437 - incr i 438 - end 439 - end 440 - end 441 - done; 442 - 443 - match !result with 444 - | Error e -> Error e 445 - | Ok () -> Ok (!output, !case_output)) 446 - end 447 - 448 - let decode input = Result.map fst (decode_impl input) 449 - let decode_with_case input = decode_impl input 450 - 451 - (* {1 UTF-8 String 
Operations} *) 452 - 453 - let encode_utf8 s = 454 - let open Result.Syntax in 455 - let* codepoints = utf8_to_codepoints s in 456 - encode codepoints 457 - 458 - let decode_utf8 punycode = 459 - let open Result.Syntax in 460 - let+ codepoints = decode punycode in 461 - codepoints_to_utf8 codepoints 462 - 463 - (* {1 Domain Label Operations} *) 464 - 465 - let encode_label label = 466 - if String.length label = 0 then empty_label 467 - else if is_ascii_string label then begin 468 - (* All ASCII - return as-is, but check length *) 469 - let len = String.length label in 470 - if len > max_label_length then label_too_long len else Ok label 471 - end 472 - else 473 - (* Has non-ASCII - encode with Punycode *) 474 - let open Result.Syntax in 475 - let* encoded = encode_utf8 label in 476 - let result = ace_prefix ^ encoded in 477 - let len = String.length result in 478 - if len > max_label_length then label_too_long len else Ok result 479 - 480 - let decode_label label = 481 - if String.length label = 0 then empty_label 482 - else if has_ace_prefix label then begin 483 - (* Remove ACE prefix and decode *) 484 - let punycode = String.sub label 4 (String.length label - 4) in 485 - decode_utf8 punycode 486 - end 487 - else begin 488 - (* No ACE prefix - validate and return *) 489 - if is_ascii_string label then Ok label 490 - else 491 - (* Has non-ASCII but no ACE prefix - return as-is *) 492 - Ok label 493 - end
-267
ocaml-punycode/lib/punycode.mli
··· 1 - (*--------------------------------------------------------------------------- 2 - Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 - SPDX-License-Identifier: ISC 4 - ---------------------------------------------------------------------------*) 5 - 6 - (** RFC 3492 Punycode: A Bootstring encoding of Unicode for IDNA. 7 - 8 - This module implements the Punycode algorithm as specified in 9 - {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492}, providing 10 - encoding and decoding of Unicode strings to/from ASCII-compatible encoding 11 - suitable for use in internationalized domain names. 12 - 13 - Punycode is an instance of Bootstring that uses particular parameter values 14 - appropriate for IDNA. See 15 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 16 - 5} for the specific parameter values. 17 - 18 - {2 References} 19 - - {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - Punycode: A 20 - Bootstring encoding of Unicode for IDNA 21 - - {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} - IDNA Protocol 22 - *) 23 - 24 - (** {1 Position Tracking} *) 25 - 26 - type position 27 - (** Abstract type representing a position in input for error reporting. 28 - Positions track both byte offset and Unicode character index. *) 29 - 30 - val position_byte_offset : position -> int 31 - (** [position_byte_offset pos] returns the byte offset in the input. *) 32 - 33 - val position_char_index : position -> int 34 - (** [position_char_index pos] returns the Unicode character index (0-based). *) 35 - 36 - val pp_position : Format.formatter -> position -> unit 37 - (** [pp_position fmt pos] pretty-prints a position as "byte N, char M". *) 38 - 39 - (** {1 Error Types} *) 40 - 41 - type error = 42 - | Overflow of position 43 - (** Arithmetic overflow during encode/decode. This can occur with very 44 - long strings or extreme Unicode code point values. 
See 45 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.4} RFC 3492 46 - Section 6.4} for overflow handling requirements. *) 47 - | Invalid_character of position * Uchar.t 48 - (** A non-basic code point appeared where only basic code points (ASCII < 49 - 128) are allowed. Per 50 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1} RFC 3492 51 - Section 3.1}, basic code points must be segregated at the beginning 52 - of the encoded string. *) 53 - | Invalid_digit of position * char 54 - (** An invalid Punycode digit was encountered during decoding. Valid 55 - digits are a-z, A-Z (values 0-25) and 0-9 (values 26-35). See 56 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} RFC 3492 57 - Section 5} for digit-value mappings. *) 58 - | Unexpected_end of position 59 - (** The input ended prematurely during decoding of a delta value. See 60 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2} RFC 3492 61 - Section 6.2} decoding procedure. *) 62 - | Invalid_utf8 of position (** Malformed UTF-8 sequence in input string. *) 63 - | Label_too_long of int 64 - (** Encoded label exceeds 63 bytes (DNS limit per 65 - {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}). The int 66 - is the actual length. *) 67 - | Empty_label (** Empty label is not valid for encoding. *) 68 - 69 - val pp_error : Format.formatter -> error -> unit 70 - (** [pp_error fmt e] pretty-prints an error with position information. *) 71 - 72 - val error_to_string : error -> string 73 - (** [error_to_string e] converts an error to a human-readable string. *) 74 - 75 - (** {1 Constants} 76 - 77 - Punycode parameters as specified in 78 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 79 - 5}. *) 80 - 81 - val ace_prefix : string 82 - (** The ACE prefix ["xn--"] used for Punycode-encoded domain labels. 
See 83 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} RFC 3492 Section 84 - 5} which notes that IDNA prepends this prefix. *) 85 - 86 - val max_label_length : int 87 - (** Maximum length of a domain label in bytes (63), per 88 - {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *) 89 - 90 - (** {1 Case Flags for Mixed-Case Annotation} 91 - 92 - {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 93 - Appendix A} describes an optional mechanism for preserving case information 94 - through the encoding/decoding round-trip. This is useful when the original 95 - string's case should be recoverable. 96 - 97 - Note: Mixed-case annotation is not used by the ToASCII and ToUnicode 98 - operations of IDNA. *) 99 - 100 - type case_flag = 101 - | Uppercase 102 - | Lowercase (** Case annotation for a character. *) 103 - 104 - (** {1 Core Punycode Operations} 105 - 106 - These functions implement the Bootstring algorithms from 107 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6}RFC 3492 Section 108 - 6}. They operate on arrays of Unicode code points ([Uchar.t array]). The 109 - encoded output is a plain ASCII string without the ACE prefix. *) 110 - 111 - val encode : Uchar.t array -> (string, error) result 112 - (** [encode codepoints] encodes an array of Unicode code points to a Punycode 113 - ASCII string. 114 - 115 - Implements the encoding procedure from 116 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.3}RFC 3492 117 - Section 6.3}: 118 - 119 - 1. Basic code points (ASCII < 128) are copied literally to the beginning of 120 - the output per 121 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1} Section 3.1 122 - (Basic code point segregation)} 2. A delimiter ('-') is appended if there 123 - are any basic code points 3. 
Non-basic code points are encoded as deltas 124 - using the generalized variable-length integer representation from 125 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.3}Section 3.3} 126 - 127 - Example: 128 - {[ 129 - encode [| Uchar.of_int 0x4ED6; Uchar.of_int 0x4EEC; ... |] 130 - (* = Ok "ihqwcrb4cv8a8dqg056pqjye" *) 131 - ]} *) 132 - 133 - val decode : string -> (Uchar.t array, error) result 134 - (** [decode punycode] decodes a Punycode ASCII string to an array of Unicode 135 - code points. 136 - 137 - Implements the decoding procedure from 138 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492 139 - Section 6.2}. 140 - 141 - The input should be the Punycode portion only, without the ACE prefix. The 142 - decoder is case-insensitive for the encoded portion, as required by 143 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 144 - 5}: "A decoder MUST recognize the letters in both uppercase and lowercase 145 - forms". 146 - 147 - Example: 148 - {[ 149 - decode "ihqwcrb4cv8a8dqg056pqjye" 150 - (* = Ok [| U+4ED6; U+4EEC; U+4E3A; ... |] (Chinese simplified) *) 151 - ]} *) 152 - 153 - (** {1 Mixed-Case Annotation} 154 - 155 - These functions support round-trip case preservation as described in 156 - {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 157 - Appendix A}. *) 158 - 159 - val encode_with_case : 160 - Uchar.t array -> case_flag array -> (string, error) result 161 - (** [encode_with_case codepoints case_flags] encodes with case annotation. 162 - 163 - Per 164 - {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 165 - Appendix A}: 166 - - For basic (ASCII) letters, the output preserves the case flag directly 167 - - For non-ASCII characters, the case of the final digit in each delta 168 - encoding indicates the flag (uppercase = suggested uppercase) 169 - 170 - The [case_flags] array must have the same length as [codepoints]. 
171 - 172 - @raise Invalid_argument if array lengths don't match. *) 173 - 174 - val decode_with_case : string -> (Uchar.t array * case_flag array, error) result 175 - (** [decode_with_case punycode] decodes and extracts case annotations. 176 - 177 - Per 178 - {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 179 - Appendix A}, returns both the decoded code points and an array of case 180 - flags indicating the suggested case for each character based on the 181 - uppercase/lowercase form of the encoding digits. *) 182 - 183 - (** {1 UTF-8 String Operations} 184 - 185 - Convenience functions that work directly with UTF-8 encoded OCaml strings. 186 - These combine UTF-8 decoding/encoding with the core Punycode operations. *) 187 - 188 - val encode_utf8 : string -> (string, error) result 189 - (** [encode_utf8 s] encodes a UTF-8 string to Punycode (no ACE prefix). 190 - 191 - This is equivalent to decoding [s] from UTF-8 to code points, then calling 192 - {!encode}. 193 - 194 - Example: 195 - {[ 196 - encode_utf8 "münchen" 197 - (* = Ok "mnchen-3ya" *) 198 - ]} *) 199 - 200 - val decode_utf8 : string -> (string, error) result 201 - (** [decode_utf8 punycode] decodes Punycode to a UTF-8 string (no ACE prefix). 202 - 203 - This is equivalent to calling {!decode} then encoding the result as UTF-8. 204 - 205 - Example: 206 - {[ 207 - decode_utf8 "mnchen-3ya" 208 - (* = Ok "münchen" *) 209 - ]} *) 210 - 211 - (** {1 Domain Label Operations} 212 - 213 - These functions handle the ACE prefix automatically and enforce DNS label 214 - length limits per 215 - {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *) 216 - 217 - val encode_label : string -> (string, error) result 218 - (** [encode_label label] encodes a domain label for use in DNS. 219 - 220 - If the label contains only ASCII characters, it is returned unchanged. 
221 - Otherwise, it is Punycode-encoded with the ACE prefix ("xn--") prepended, as 222 - specified in 223 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} RFC 3492 Section 224 - 5}. 225 - 226 - Returns {!Error} {!Label_too_long} if the result exceeds 63 bytes. 227 - 228 - Example: 229 - {[ 230 - encode_label "münchen" 231 - (* = Ok "xn--mnchen-3ya" *) 232 - encode_label "example" 233 - (* = Ok "example" *) 234 - ]} *) 235 - 236 - val decode_label : string -> (string, error) result 237 - (** [decode_label label] decodes a domain label. 238 - 239 - If the label starts with the ACE prefix ("xn--", case-insensitive), it is 240 - Punycode-decoded. Otherwise, it is returned unchanged. 241 - 242 - Example: 243 - {[ 244 - decode_label "xn--mnchen-3ya" 245 - (* = Ok "münchen" *) 246 - decode_label "example" 247 - (* = Ok "example" *) 248 - ]} *) 249 - 250 - (** {1 Validation} 251 - 252 - Predicate functions for checking code point and string properties. *) 253 - 254 - val is_basic : Uchar.t -> bool 255 - (** [is_basic u] is [true] if [u] is a basic code point (ASCII, < 128). 256 - 257 - Per 258 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 259 - 5}, basic code points for Punycode are the ASCII code points (0..7F). *) 260 - 261 - val is_ascii_string : string -> bool 262 - (** [is_ascii_string s] is [true] if [s] contains only ASCII characters (all 263 - bytes < 128). *) 264 - 265 - val has_ace_prefix : string -> bool 266 - (** [has_ace_prefix s] is [true] if [s] starts with the ACE prefix "xn--" 267 - (case-insensitive comparison). *)
-183
ocaml-punycode/lib/punycode_idna.ml
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: ISC
  ---------------------------------------------------------------------------*)

(* IDNA (Internationalized Domain Names in Applications) Implementation *)

(* Upper bound, in bytes, on the ASCII form of a whole domain name. *)
let max_domain_length = 253

(* {1 Error Types} *)

type error =
  | Punycode_error of Punycode.error
  | Invalid_label of string
  | Domain_too_long of int
  | Normalization_failed
  | Verification_failed

(* Pretty-print an [error]; Punycode errors are delegated to
   [Punycode.pp_error]. *)
let pp_error fmt = function
  | Punycode_error e ->
      Format.fprintf fmt "Punycode error: %a" Punycode.pp_error e
  | Invalid_label msg -> Format.fprintf fmt "invalid label: %s" msg
  | Domain_too_long len ->
      Format.fprintf fmt "domain too long: %d bytes (max %d)" len
        max_domain_length
  | Normalization_failed -> Format.fprintf fmt "Unicode normalization failed"
  | Verification_failed ->
      Format.fprintf fmt "IDNA verification failed (round-trip mismatch)"

let error_to_string err = Format.asprintf "%a" pp_error err

(* {1 Error Constructors} *)

let punycode_error e = Error (Punycode_error e)
let invalid_label msg = Error (Invalid_label msg)
let domain_too_long len = Error (Domain_too_long len)
let verification_failed = Error Verification_failed

(* {1 Unicode Normalization} *)

let normalize_nfc s = Uunf_string.normalize_utf_8 `NFC s

(* {1 Validation Helpers} *)

let is_ace_label label = Punycode.has_ace_prefix label

(* Check if a label follows STD3 rules (hostname restrictions):
   - Only LDH (letters, digits, hyphens)
   - Cannot start or end with hyphen
   - Cannot be empty *)
let is_std3_valid label =
  let len = String.length label in
  let is_ldh c =
    (c >= 'a' && c <= 'z')
    || (c >= 'A' && c <= 'Z')
    || (c >= '0' && c <= '9')
    || c = '-'
  in
  len > 0
  && label.[0] <> '-'
  && label.[len - 1] <> '-'
  && String.for_all is_ldh label

(* Check hyphen placement: hyphens not in positions 3 and 4 (except for ACE) *)
let check_hyphen_rules label =
  let len = String.length label in
  if len >= 4 && label.[2] = '-' && label.[3] = '-' then
    (* Hyphens in positions 3 and 4 - only valid for ACE prefix *)
    is_ace_label label
  else true

(* {1 Label Operations} *)

(* Convert one label to its ASCII form.

   - An empty label is rejected with [Invalid_label "empty label"].
   - An all-ASCII label is its own encoded form: it is length-checked,
     optionally validated against STD3 / hyphen rules, and returned unchanged.
   - Any other label is NFC-normalised, Punycode-encoded, prefixed with the
     ACE prefix, length-checked, and finally verified by decoding the result
     and comparing it with the normalised input.

   The 63-byte limit is applied to the encoded form only. The UTF-8 input of
   a non-ASCII label may legitimately be longer than 63 bytes while its ACE
   form still fits, so checking the raw input would reject valid labels and
   would report a length that is not the encoded length. *)
let label_to_ascii_impl ~check_hyphens ~use_std3_rules label =
  let len = String.length label in
  if len = 0 then invalid_label "empty label"
  else if Punycode.is_ascii_string label then
    (* All ASCII - validate and pass through *)
    if len > Punycode.max_label_length then
      punycode_error (Punycode.Label_too_long len)
    else if use_std3_rules && not (is_std3_valid label) then
      invalid_label "STD3 rules violation"
    else if check_hyphens && not (check_hyphen_rules label) then
      invalid_label "invalid hyphen placement"
    else Ok label
  else
    (* Has non-ASCII - normalize and encode *)
    let normalized = normalize_nfc label in
    match Punycode.encode_utf8 normalized with
    | Error e -> punycode_error e
    | Ok encoded -> (
        let result = Punycode.ace_prefix ^ encoded in
        let result_len = String.length result in
        if result_len > Punycode.max_label_length then
          punycode_error (Punycode.Label_too_long result_len)
        else if check_hyphens && not (check_hyphen_rules result) then
          invalid_label "invalid hyphen placement in encoded label"
        else
          (* Verification: decode and compare to original normalized form *)
          match Punycode.decode_utf8 encoded with
          | Ok decoded when decoded = normalized -> Ok result
          | Ok _ | Error _ -> verification_failed)

let label_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) label =
  label_to_ascii_impl ~check_hyphens ~use_std3_rules label

(* Decode one label: ACE-prefixed labels are Punycode-decoded, every other
   label is returned unchanged. *)
let label_to_unicode label =
  if is_ace_label label then
    (* [is_ace_label] guarantees the label is at least as long as the prefix,
       so the [String.sub] below cannot raise. *)
    let prefix_len = String.length Punycode.ace_prefix in
    String.sub label prefix_len (String.length label - prefix_len)
    |> Punycode.decode_utf8
    |> Result.map_error (fun e -> Punycode_error e)
  else Ok label

(* {1 Domain Operations} *)

(* Split domain into labels *)
let split_domain domain = String.split_on_char '.' domain

(* Join labels into domain *)
let join_labels labels = String.concat "." labels

(* Map a function returning Result over a list, left to right. Stops at the
   first Error: [f] is not applied to the remaining elements. Tail-recursive. *)
let map_result f lst =
  let rec go acc = function
    | [] -> Ok (List.rev acc)
    | x :: rest -> (
        match f x with
        | Error e -> Error e
        | Ok y -> go (y :: acc) rest)
  in
  go [] lst

(* Convert a dotted domain name to ASCII, label by label, then enforce the
   overall length limit on the joined result. *)
let to_ascii ?(check_hyphens = true) ?(check_bidi = false)
    ?(check_joiners = false) ?(use_std3_rules = false) ?(transitional = false)
    domain =
  (* Note: check_bidi, check_joiners, and transitional are accepted but
     not fully implemented - they would require additional Unicode data *)
  ignore (check_bidi : bool);
  ignore (check_joiners : bool);
  ignore (transitional : bool);
  let open Result.Syntax in
  let* encoded_labels =
    map_result
      (label_to_ascii_impl ~check_hyphens ~use_std3_rules)
      (split_domain domain)
  in
  let result = join_labels encoded_labels in
  let len = String.length result in
  if len > max_domain_length then domain_too_long len else Ok result

(* Convert a dotted domain name to Unicode, label by label. *)
let to_unicode domain =
  split_domain domain |> map_result label_to_unicode |> Result.map join_labels

(* {1 Domain Name Library Integration} *)

(* Parse a string back into a [Domain_name.t], reporting the library's
   message as an [Invalid_label]. *)
let domain_of_string s =
  match Domain_name.of_string s with
  | Ok d -> Ok d
  | Error (`Msg msg) -> invalid_label msg

let domain_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) domain =
  let open Result.Syntax in
  let* ascii =
    to_ascii ~check_hyphens ~use_std3_rules (Domain_name.to_string domain)
  in
  domain_of_string ascii

let domain_to_unicode domain =
  let open Result.Syntax in
  let* unicode = to_unicode (Domain_name.to_string domain) in
  domain_of_string unicode

(* {1 Validation} *)

(* A domain is considered valid when [to_ascii] succeeds with its default
   options. *)
let is_idna_valid domain = Result.is_ok (to_ascii domain)
-215
ocaml-punycode/lib/punycode_idna.mli
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: ISC
  ---------------------------------------------------------------------------*)

(** IDNA (Internationalized Domain Names in Applications) support.

    This module provides ToASCII and ToUnicode operations as specified in
    {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} (IDNA 2008),
    using Punycode ({{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492})
    for encoding.

    IDNA allows domain names to contain non-ASCII Unicode characters by encoding
    them using Punycode with an ACE prefix. This module handles the conversion
    between Unicode domain names and their ASCII-compatible encoding (ACE) form.

    {2 References}
    - {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} -
      Internationalized Domain Names in Applications (IDNA): Protocol
    - {{:https://datatracker.ietf.org/doc/html/rfc5892}RFC 5892} - The Unicode
      Code Points and Internationalized Domain Names for Applications (IDNA)
    - {{:https://datatracker.ietf.org/doc/html/rfc5893}RFC 5893} - Right-to-Left
      Scripts for Internationalized Domain Names for Applications (IDNA)
    - {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - Punycode: A
      Bootstring encoding of Unicode for IDNA *)

(** {1 Error Types} *)

type error =
  | Punycode_error of Punycode.error
      (** Error during Punycode encoding/decoding. See {!Punycode.error} for
          details. *)
  | Invalid_label of string
      (** Label violates IDNA constraints. The string describes the violation.
          See
          {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4} RFC 5891
           Section 4} for label validation requirements. *)
  | Domain_too_long of int
      (** Domain name exceeds 253 bytes, per
          {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. The int is
          the actual length. *)
  | Normalization_failed
      (** Unicode normalization (NFC) failed. Per
          {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1} RFC
           5891 Section 4.2.1}, labels must be in NFC form.

          Reserved: the current implementation never returns this error. *)
  | Verification_failed
      (** ToASCII/ToUnicode verification step failed (round-trip check). Per
          {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2} RFC 5891
           Section 4.2}, the result of encoding must decode back to the original
          input. *)

val pp_error : Format.formatter -> error -> unit
(** [pp_error fmt e] pretty-prints an error. *)

val error_to_string : error -> string
(** [error_to_string e] converts an error to a human-readable string. *)

(** {1 Constants} *)

val max_domain_length : int
(** Maximum length of a domain name in bytes (253), per
    {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)

(** {1 ToASCII Operation}

    Converts an internationalized domain name to its ASCII-compatible encoding
    (ACE) form suitable for DNS lookup.

    See
    {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4} RFC 5891 Section
     4} for the complete ToASCII specification. *)

val to_ascii :
  ?check_hyphens:bool ->
  ?check_bidi:bool ->
  ?check_joiners:bool ->
  ?use_std3_rules:bool ->
  ?transitional:bool ->
  string ->
  (string, error) result
(** [to_ascii domain] converts an internationalized domain name to ASCII.

    Implements the ToASCII operation from
    {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891
     Section 4.1}.

    The domain is split on ['.'] and each label is processed as follows:

    + If the label is all ASCII, it is passed through (with optional STD3
      validation).
    + Otherwise, it is normalized to NFC per
      {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1}Section
       4.2.1} and Punycode-encoded with the ACE prefix.

    Optional parameters (per
    {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4} RFC 5891 Section
     4} processing options):
    - [check_hyphens]: Validate hyphen placement per
      {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.1}Section
       4.2.3.1} (default: true)
    - [check_bidi]: Check bidirectional text rules per
      {{:https://datatracker.ietf.org/doc/html/rfc5893}RFC 5893} (default:
      false, not implemented)
    - [check_joiners]: Check contextual joiner rules per
      {{:https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.1}RFC 5892
       Appendix A.1} (default: false, not implemented)
    - [use_std3_rules]: Apply STD3 hostname rules per
      {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.2}Section
       4.2.3.2} (default: false)
    - [transitional]: Use IDNA 2003 transitional processing (default: false,
      not implemented)

    Errors:
    - [Invalid_label] if any label is empty (this includes the empty label
      produced by a leading, trailing or doubled ['.']) or fails an enabled
      validation rule.
    - [Punycode_error] if encoding a label fails, including
      {!Punycode.Label_too_long} when a label's ASCII form exceeds 63 bytes.
    - [Domain_too_long] if the joined ASCII result exceeds
      {!max_domain_length} bytes.
    - [Verification_failed] if an encoded label does not decode back to its
      normalized form.

    Example:
    {[
      to_ascii "münchen.example.com"
      (* = Ok "xn--mnchen-3ya.example.com" *)
    ]} *)

val label_to_ascii :
  ?check_hyphens:bool ->
  ?use_std3_rules:bool ->
  string ->
  (string, error) result
(** [label_to_ascii label] converts a single label to ASCII.

    This implements the core ToASCII operation for one label, as described in
    {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891
     Section 4.1}.

    [check_hyphens] defaults to [true] and [use_std3_rules] to [false], with
    the same meaning as for {!to_ascii}. An empty [label] is rejected with
    [Invalid_label]. *)

(** {1 ToUnicode Operation}

    Converts an ASCII-compatible encoded domain name back to Unicode.

    See
    {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2} RFC 5891
     Section 4.2} for the complete ToUnicode specification. *)

val to_unicode : string -> (string, error) result
(** [to_unicode domain] converts an ACE domain name to Unicode.

    Implements the ToUnicode operation from
    {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891
     Section 4.2}.

    The domain is split on ['.'] and each label is processed as follows:

    + If it has the ACE prefix ("xn--"), it is Punycode-decoded per
      {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492
       Section 6.2}.
    + Otherwise, it is passed through unchanged.

    Returns [Punycode_error] if decoding any ACE label fails.

    Example:
    {[
      to_unicode "xn--mnchen-3ya.example.com"
      (* = Ok "münchen.example.com" *)
    ]} *)

val label_to_unicode : string -> (string, error) result
(** [label_to_unicode label] converts a single ACE label to Unicode.

    This implements the core ToUnicode operation for one label, as described in
    {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891
     Section 4.2}.

    A label without the ACE prefix is returned unchanged. *)

(** {1 Domain Name Integration}

    Functions that work with the
    {{:https://github.com/hannesm/domain-name}domain-name} library types.

    These provide integration with the [Domain_name] module for applications
    that use that library for domain name handling. *)

val domain_to_ascii :
  ?check_hyphens:bool ->
  ?use_std3_rules:bool ->
  [ `raw ] Domain_name.t ->
  ([ `raw ] Domain_name.t, error) result
(** [domain_to_ascii domain] converts a domain name to ASCII form.

    Applies {!to_ascii} to the string representation and returns the result as a
    [Domain_name.t]. If the converted string is rejected by
    [Domain_name.of_string], its message is returned as [Invalid_label].

    Example:
    {[
      let d = Domain_name.of_string_exn "münchen.example.com" in
      domain_to_ascii d
      (* = Ok (Domain_name.of_string_exn "xn--mnchen-3ya.example.com") *)
    ]} *)

val domain_to_unicode :
  [ `raw ] Domain_name.t -> ([ `raw ] Domain_name.t, error) result
(** [domain_to_unicode domain] converts a domain name to Unicode form.

    Applies {!to_unicode} to the string representation and returns the result as
    a [Domain_name.t]. If the converted string is rejected by
    [Domain_name.of_string], its message is returned as [Invalid_label]. *)

(** {1 Validation} *)

val is_idna_valid : string -> bool
(** [is_idna_valid domain] checks if a domain name is valid for IDNA processing.

    Returns [true] if {!to_ascii}, called with its default options, would
    succeed on the domain. *)

val is_ace_label : string -> bool
(** [is_ace_label label] is [true] if the label has the ACE prefix "xn--"
    (case-insensitive). This indicates the label is Punycode-encoded per
    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
     5}. *)

(** {1 Normalization} *)

val normalize_nfc : string -> string
(** [normalize_nfc s] returns the NFC-normalized form of UTF-8 string [s].

    Per
    {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1} RFC 5891
     Section 4.2.1}, domain labels must be normalized to NFC (Unicode
    Normalization Form C) before encoding.

    See {{:http://www.unicode.org/reports/tr15/}Unicode Standard Annex #15} for
    details on Unicode normalization forms. *)
-36
ocaml-punycode/punycode.opam
··· 1 - # This file is generated by dune, edit dune-project instead 2 - opam-version: "2.0" 3 - synopsis: "RFC 3492 Punycode and IDNA implementation for OCaml" 4 - description: """ 5 - A high-quality implementation of RFC 3492 (Punycode) with IDNA support. 6 - Provides encoding and decoding of internationalized domain names, 7 - with proper Unicode normalization and mixed-case annotation support.""" 8 - maintainer: ["Anil Madhavapeddy <anil@recoil.org>"] 9 - authors: ["Anil Madhavapeddy"] 10 - license: "ISC" 11 - homepage: "https://tangled.org/anil.recoil.org/ocaml-punycode" 12 - bug-reports: "https://tangled.org/anil.recoil.org/ocaml-punycode/issues" 13 - depends: [ 14 - "ocaml" {>= "5.4.0"} 15 - "dune" {>= "3.20" & >= "3.0"} 16 - "uutf" {>= "1.0.0"} 17 - "uunf" {>= "15.0.0"} 18 - "domain-name" {>= "0.4.0"} 19 - "odoc" {with-doc} 20 - "alcotest" {with-test} 21 - ] 22 - build: [ 23 - ["dune" "subst"] {dev} 24 - [ 25 - "dune" 26 - "build" 27 - "-p" 28 - name 29 - "-j" 30 - jobs 31 - "@install" 32 - "@runtest" {with-test} 33 - "@doc" {with-doc} 34 - ] 35 - ] 36 - x-maintenance-intent: ["(latest)"]
-3077
ocaml-punycode/spec/rfc1035.txt
··· 1 - Network Working Group P. Mockapetris 2 - Request for Comments: 1035 ISI 3 - November 1987 4 - Obsoletes: RFCs 882, 883, 973 5 - 6 - DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION 7 - 8 - 9 - 1. STATUS OF THIS MEMO 10 - 11 - This RFC describes the details of the domain system and protocol, and 12 - assumes that the reader is familiar with the concepts discussed in a 13 - companion RFC, "Domain Names - Concepts and Facilities" [RFC-1034]. 14 - 15 - The domain system is a mixture of functions and data types which are an 16 - official protocol and functions and data types which are still 17 - experimental. Since the domain system is intentionally extensible, new 18 - data types and experimental behavior should always be expected in parts 19 - of the system beyond the official protocol. The official protocol parts 20 - include standard queries, responses and the Internet class RR data 21 - formats (e.g., host addresses). Since the previous RFC set, several 22 - definitions have changed, so some previous definitions are obsolete. 23 - 24 - Experimental or obsolete features are clearly marked in these RFCs, and 25 - such information should be used with caution. 26 - 27 - The reader is especially cautioned not to depend on the values which 28 - appear in examples to be current or complete, since their purpose is 29 - primarily pedagogical. Distribution of this memo is unlimited. 30 - 31 - Table of Contents 32 - 33 - 1. STATUS OF THIS MEMO 1 34 - 2. INTRODUCTION 3 35 - 2.1. Overview 3 36 - 2.2. Common configurations 4 37 - 2.3. Conventions 7 38 - 2.3.1. Preferred name syntax 7 39 - 2.3.2. Data Transmission Order 8 40 - 2.3.3. Character Case 9 41 - 2.3.4. Size limits 10 42 - 3. DOMAIN NAME SPACE AND RR DEFINITIONS 10 43 - 3.1. Name space definitions 10 44 - 3.2. RR definitions 11 45 - 3.2.1. Format 11 46 - 3.2.2. TYPE values 12 47 - 3.2.3. QTYPE values 12 48 - 3.2.4. 
CLASS values 13 49 - 50 - 51 - 52 - Mockapetris [Page 1] 53 - 54 - RFC 1035 Domain Implementation and Specification November 1987 55 - 56 - 57 - 3.2.5. QCLASS values 13 58 - 3.3. Standard RRs 13 59 - 3.3.1. CNAME RDATA format 14 60 - 3.3.2. HINFO RDATA format 14 61 - 3.3.3. MB RDATA format (EXPERIMENTAL) 14 62 - 3.3.4. MD RDATA format (Obsolete) 15 63 - 3.3.5. MF RDATA format (Obsolete) 15 64 - 3.3.6. MG RDATA format (EXPERIMENTAL) 16 65 - 3.3.7. MINFO RDATA format (EXPERIMENTAL) 16 66 - 3.3.8. MR RDATA format (EXPERIMENTAL) 17 67 - 3.3.9. MX RDATA format 17 68 - 3.3.10. NULL RDATA format (EXPERIMENTAL) 17 69 - 3.3.11. NS RDATA format 18 70 - 3.3.12. PTR RDATA format 18 71 - 3.3.13. SOA RDATA format 19 72 - 3.3.14. TXT RDATA format 20 73 - 3.4. ARPA Internet specific RRs 20 74 - 3.4.1. A RDATA format 20 75 - 3.4.2. WKS RDATA format 21 76 - 3.5. IN-ADDR.ARPA domain 22 77 - 3.6. Defining new types, classes, and special namespaces 24 78 - 4. MESSAGES 25 79 - 4.1. Format 25 80 - 4.1.1. Header section format 26 81 - 4.1.2. Question section format 28 82 - 4.1.3. Resource record format 29 83 - 4.1.4. Message compression 30 84 - 4.2. Transport 32 85 - 4.2.1. UDP usage 32 86 - 4.2.2. TCP usage 32 87 - 5. MASTER FILES 33 88 - 5.1. Format 33 89 - 5.2. Use of master files to define zones 35 90 - 5.3. Master file example 36 91 - 6. NAME SERVER IMPLEMENTATION 37 92 - 6.1. Architecture 37 93 - 6.1.1. Control 37 94 - 6.1.2. Database 37 95 - 6.1.3. Time 39 96 - 6.2. Standard query processing 39 97 - 6.3. Zone refresh and reload processing 39 98 - 6.4. Inverse queries (Optional) 40 99 - 6.4.1. The contents of inverse queries and responses 40 100 - 6.4.2. Inverse query and response example 41 101 - 6.4.3. Inverse query processing 42 102 - 103 - 104 - 105 - 106 - 107 - 108 - Mockapetris [Page 2] 109 - 110 - RFC 1035 Domain Implementation and Specification November 1987 111 - 112 - 113 - 6.5. Completion queries and responses 42 114 - 7. RESOLVER IMPLEMENTATION 43 115 - 7.1. 
Transforming a user request into a query 43 116 - 7.2. Sending the queries 44 117 - 7.3. Processing responses 46 118 - 7.4. Using the cache 47 119 - 8. MAIL SUPPORT 47 120 - 8.1. Mail exchange binding 48 121 - 8.2. Mailbox binding (Experimental) 48 122 - 9. REFERENCES and BIBLIOGRAPHY 50 123 - Index 54 124 - 125 - 2. INTRODUCTION 126 - 127 - 2.1. Overview 128 - 129 - The goal of domain names is to provide a mechanism for naming resources 130 - in such a way that the names are usable in different hosts, networks, 131 - protocol families, internets, and administrative organizations. 132 - 133 - From the user's point of view, domain names are useful as arguments to a 134 - local agent, called a resolver, which retrieves information associated 135 - with the domain name. Thus a user might ask for the host address or 136 - mail information associated with a particular domain name. To enable 137 - the user to request a particular type of information, an appropriate 138 - query type is passed to the resolver with the domain name. To the user, 139 - the domain tree is a single information space; the resolver is 140 - responsible for hiding the distribution of data among name servers from 141 - the user. 142 - 143 - From the resolver's point of view, the database that makes up the domain 144 - space is distributed among various name servers. Different parts of the 145 - domain space are stored in different name servers, although a particular 146 - data item will be stored redundantly in two or more name servers. The 147 - resolver starts with knowledge of at least one name server. When the 148 - resolver processes a user query it asks a known name server for the 149 - information; in return, the resolver either receives the desired 150 - information or a referral to another name server. Using these 151 - referrals, resolvers learn the identities and contents of other name 152 - servers. 
Resolvers are responsible for dealing with the distribution of 153 - the domain space and dealing with the effects of name server failure by 154 - consulting redundant databases in other servers. 155 - 156 - Name servers manage two kinds of data. The first kind of data held in 157 - sets called zones; each zone is the complete database for a particular 158 - "pruned" subtree of the domain space. This data is called 159 - authoritative. A name server periodically checks to make sure that its 160 - zones are up to date, and if not, obtains a new copy of updated zones 161 - 162 - 163 - 164 - Mockapetris [Page 3] 165 - 166 - RFC 1035 Domain Implementation and Specification November 1987 167 - 168 - 169 - from master files stored locally or in another name server. The second 170 - kind of data is cached data which was acquired by a local resolver. 171 - This data may be incomplete, but improves the performance of the 172 - retrieval process when non-local data is repeatedly accessed. Cached 173 - data is eventually discarded by a timeout mechanism. 174 - 175 - This functional structure isolates the problems of user interface, 176 - failure recovery, and distribution in the resolvers and isolates the 177 - database update and refresh problems in the name servers. 178 - 179 - 2.2. Common configurations 180 - 181 - A host can participate in the domain name system in a number of ways, 182 - depending on whether the host runs programs that retrieve information 183 - from the domain system, name servers that answer queries from other 184 - hosts, or various combinations of both functions. 
The simplest, and 185 - perhaps most typical, configuration is shown below: 186 - 187 - Local Host | Foreign 188 - | 189 - +---------+ +----------+ | +--------+ 190 - | | user queries | |queries | | | 191 - | User |-------------->| |---------|->|Foreign | 192 - | Program | | Resolver | | | Name | 193 - | |<--------------| |<--------|--| Server | 194 - | | user responses| |responses| | | 195 - +---------+ +----------+ | +--------+ 196 - | A | 197 - cache additions | | references | 198 - V | | 199 - +----------+ | 200 - | cache | | 201 - +----------+ | 202 - 203 - User programs interact with the domain name space through resolvers; the 204 - format of user queries and user responses is specific to the host and 205 - its operating system. User queries will typically be operating system 206 - calls, and the resolver and its cache will be part of the host operating 207 - system. Less capable hosts may choose to implement the resolver as a 208 - subroutine to be linked in with every program that needs its services. 209 - Resolvers answer user queries with information they acquire via queries 210 - to foreign name servers and the local cache. 211 - 212 - Note that the resolver may have to make several queries to several 213 - different foreign name servers to answer a particular user query, and 214 - hence the resolution of a user query may involve several network 215 - accesses and an arbitrary amount of time. The queries to foreign name 216 - servers and the corresponding responses have a standard format described 217 - 218 - 219 - 220 - Mockapetris [Page 4] 221 - 222 - RFC 1035 Domain Implementation and Specification November 1987 223 - 224 - 225 - in this memo, and may be datagrams. 226 - 227 - Depending on its capabilities, a name server could be a stand alone 228 - program on a dedicated machine or a process or processes on a large 229 - timeshared host. 
A simple configuration might be: 230 - 231 - Local Host | Foreign 232 - | 233 - +---------+ | 234 - / /| | 235 - +---------+ | +----------+ | +--------+ 236 - | | | | |responses| | | 237 - | | | | Name |---------|->|Foreign | 238 - | Master |-------------->| Server | | |Resolver| 239 - | files | | | |<--------|--| | 240 - | |/ | | queries | +--------+ 241 - +---------+ +----------+ | 242 - 243 - Here a primary name server acquires information about one or more zones 244 - by reading master files from its local file system, and answers queries 245 - about those zones that arrive from foreign resolvers. 246 - 247 - The DNS requires that all zones be redundantly supported by more than 248 - one name server. Designated secondary servers can acquire zones and 249 - check for updates from the primary server using the zone transfer 250 - protocol of the DNS. This configuration is shown below: 251 - 252 - Local Host | Foreign 253 - | 254 - +---------+ | 255 - / /| | 256 - +---------+ | +----------+ | +--------+ 257 - | | | | |responses| | | 258 - | | | | Name |---------|->|Foreign | 259 - | Master |-------------->| Server | | |Resolver| 260 - | files | | | |<--------|--| | 261 - | |/ | | queries | +--------+ 262 - +---------+ +----------+ | 263 - A |maintenance | +--------+ 264 - | +------------|->| | 265 - | queries | |Foreign | 266 - | | | Name | 267 - +------------------|--| Server | 268 - maintenance responses | +--------+ 269 - 270 - In this configuration, the name server periodically establishes a 271 - virtual circuit to a foreign name server to acquire a copy of a zone or 272 - to check that an existing copy has not changed. The messages sent for 273 - 274 - 275 - 276 - Mockapetris [Page 5] 277 - 278 - RFC 1035 Domain Implementation and Specification November 1987 279 - 280 - 281 - these maintenance activities follow the same form as queries and 282 - responses, but the message sequences are somewhat different. 
283 - 284 - The information flow in a host that supports all aspects of the domain 285 - name system is shown below: 286 - 287 - Local Host | Foreign 288 - | 289 - +---------+ +----------+ | +--------+ 290 - | | user queries | |queries | | | 291 - | User |-------------->| |---------|->|Foreign | 292 - | Program | | Resolver | | | Name | 293 - | |<--------------| |<--------|--| Server | 294 - | | user responses| |responses| | | 295 - +---------+ +----------+ | +--------+ 296 - | A | 297 - cache additions | | references | 298 - V | | 299 - +----------+ | 300 - | Shared | | 301 - | database | | 302 - +----------+ | 303 - A | | 304 - +---------+ refreshes | | references | 305 - / /| | V | 306 - +---------+ | +----------+ | +--------+ 307 - | | | | |responses| | | 308 - | | | | Name |---------|->|Foreign | 309 - | Master |-------------->| Server | | |Resolver| 310 - | files | | | |<--------|--| | 311 - | |/ | | queries | +--------+ 312 - +---------+ +----------+ | 313 - A |maintenance | +--------+ 314 - | +------------|->| | 315 - | queries | |Foreign | 316 - | | | Name | 317 - +------------------|--| Server | 318 - maintenance responses | +--------+ 319 - 320 - The shared database holds domain space data for the local name server 321 - and resolver. The contents of the shared database will typically be a 322 - mixture of authoritative data maintained by the periodic refresh 323 - operations of the name server and cached data from previous resolver 324 - requests. The structure of the domain data and the necessity for 325 - synchronization between name servers and resolvers imply the general 326 - characteristics of this database, but the actual format is up to the 327 - local implementor. 328 - 329 - 330 - 331 - 332 - Mockapetris [Page 6] 333 - 334 - RFC 1035 Domain Implementation and Specification November 1987 335 - 336 - 337 - Information flow can also be tailored so that a group of hosts act 338 - together to optimize activities. 
Sometimes this is done to offload less 339 - capable hosts so that they do not have to implement a full resolver. 340 - This can be appropriate for PCs or hosts which want to minimize the 341 - amount of new network code which is required. This scheme can also 342 - allow a group of hosts to share a small number of caches rather than 343 - maintaining a large number of separate caches, on the premise that the 344 - centralized caches will have a higher hit ratio. In either case, 345 - resolvers are replaced with stub resolvers which act as front ends to 346 - resolvers located in a recursive server in one or more name servers 347 - known to perform that service: 348 - 349 - Local Hosts | Foreign 350 - | 351 - +---------+ | 352 - | | responses | 353 - | Stub |<--------------------+ | 354 - | Resolver| | | 355 - | |----------------+ | | 356 - +---------+ recursive | | | 357 - queries | | | 358 - V | | 359 - +---------+ recursive +----------+ | +--------+ 360 - | | queries | |queries | | | 361 - | Stub |-------------->| Recursive|---------|->|Foreign | 362 - | Resolver| | Server | | | Name | 363 - | |<--------------| |<--------|--| Server | 364 - +---------+ responses | |responses| | | 365 - +----------+ | +--------+ 366 - | Central | | 367 - | cache | | 368 - +----------+ | 369 - 370 - In any case, note that domain components are always replicated for 371 - reliability whenever possible. 372 - 373 - 2.3. Conventions 374 - 375 - The domain system has several conventions dealing with low-level, but 376 - fundamental, issues. While the implementor is free to violate these 377 - conventions WITHIN HIS OWN SYSTEM, he must observe these conventions in 378 - ALL behavior observed from other hosts. 379 - 380 - 2.3.1. Preferred name syntax 381 - 382 - The DNS specifications attempt to be as general as possible in the rules 383 - for constructing domain names. The idea is that the name of any 384 - existing object can be expressed as a domain name with minimal changes. 
385 - 386 - 387 - 388 - Mockapetris [Page 7] 389 - 390 - RFC 1035 Domain Implementation and Specification November 1987 391 - 392 - 393 - However, when assigning a domain name for an object, the prudent user 394 - will select a name which satisfies both the rules of the domain system 395 - and any existing rules for the object, whether these rules are published 396 - or implied by existing programs. 397 - 398 - For example, when naming a mail domain, the user should satisfy both the 399 - rules of this memo and those in RFC-822. When creating a new host name, 400 - the old rules for HOSTS.TXT should be followed. This avoids problems 401 - when old software is converted to use domain names. 402 - 403 - The following syntax will result in fewer problems with many 404 - 405 - applications that use domain names (e.g., mail, TELNET). 406 - 407 - <domain> ::= <subdomain> | " " 408 - 409 - <subdomain> ::= <label> | <subdomain> "." <label> 410 - 411 - <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ] 412 - 413 - <ldh-str> ::= <let-dig-hyp> | <let-dig-hyp> <ldh-str> 414 - 415 - <let-dig-hyp> ::= <let-dig> | "-" 416 - 417 - <let-dig> ::= <letter> | <digit> 418 - 419 - <letter> ::= any one of the 52 alphabetic characters A through Z in 420 - upper case and a through z in lower case 421 - 422 - <digit> ::= any one of the ten digits 0 through 9 423 - 424 - Note that while upper and lower case letters are allowed in domain 425 - names, no significance is attached to the case. That is, two names with 426 - the same spelling but different case are to be treated as if identical. 427 - 428 - The labels must follow the rules for ARPANET host names. They must 429 - start with a letter, end with a letter or digit, and have as interior 430 - characters only letters, digits, and hyphen. There are also some 431 - restrictions on the length. Labels must be 63 characters or less. 
432 - 433 - For example, the following strings identify hosts in the Internet: 434 - 435 - A.ISI.EDU XX.LCS.MIT.EDU SRI-NIC.ARPA 436 - 437 - 2.3.2. Data Transmission Order 438 - 439 - The order of transmission of the header and data described in this 440 - document is resolved to the octet level. Whenever a diagram shows a 441 - 442 - 443 - 444 - Mockapetris [Page 8] 445 - 446 - RFC 1035 Domain Implementation and Specification November 1987 447 - 448 - 449 - group of octets, the order of transmission of those octets is the normal 450 - order in which they are read in English. For example, in the following 451 - diagram, the octets are transmitted in the order they are numbered. 452 - 453 - 0 1 454 - 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 455 - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 456 - | 1 | 2 | 457 - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 458 - | 3 | 4 | 459 - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 460 - | 5 | 6 | 461 - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 462 - 463 - Whenever an octet represents a numeric quantity, the left most bit in 464 - the diagram is the high order or most significant bit. That is, the bit 465 - labeled 0 is the most significant bit. For example, the following 466 - diagram represents the value 170 (decimal). 467 - 468 - 0 1 2 3 4 5 6 7 469 - +-+-+-+-+-+-+-+-+ 470 - |1 0 1 0 1 0 1 0| 471 - +-+-+-+-+-+-+-+-+ 472 - 473 - Similarly, whenever a multi-octet field represents a numeric quantity 474 - the left most bit of the whole field is the most significant bit. When 475 - a multi-octet quantity is transmitted the most significant octet is 476 - transmitted first. 477 - 478 - 2.3.3. Character Case 479 - 480 - For all parts of the DNS that are part of the official protocol, all 481 - comparisons between character strings (e.g., labels, domain names, etc.) 482 - are done in a case-insensitive manner. At present, this rule is in 483 - force throughout the domain system without exception. 
However, future 484 - additions beyond current usage may need to use the full binary octet 485 - capabilities in names, so attempts to store domain names in 7-bit ASCII 486 - or use of special bytes to terminate labels, etc., should be avoided. 487 - 488 - When data enters the domain system, its original case should be 489 - preserved whenever possible. In certain circumstances this cannot be 490 - done. For example, if two RRs are stored in a database, one at x.y and 491 - one at X.Y, they are actually stored at the same place in the database, 492 - and hence only one casing would be preserved. The basic rule is that 493 - case can be discarded only when data is used to define structure in a 494 - database, and two names are identical when compared in a case 495 - insensitive manner. 496 - 497 - 498 - 499 - 500 - Mockapetris [Page 9] 501 - 502 - RFC 1035 Domain Implementation and Specification November 1987 503 - 504 - 505 - Loss of case sensitive data must be minimized. Thus while data for x.y 506 - and X.Y may both be stored under a single location x.y or X.Y, data for 507 - a.x and B.X would never be stored under A.x, A.X, b.x, or b.X. In 508 - general, this preserves the case of the first label of a domain name, 509 - but forces standardization of interior node labels. 510 - 511 - Systems administrators who enter data into the domain database should 512 - take care to represent the data they supply to the domain system in a 513 - case-consistent manner if their system is case-sensitive. The data 514 - distribution system in the domain system will ensure that consistent 515 - representations are preserved. 516 - 517 - 2.3.4. Size limits 518 - 519 - Various objects and parameters in the DNS have size limits. They are 520 - listed below. Some could be easily changed, others are more 521 - fundamental. 522 - 523 - labels 63 octets or less 524 - 525 - names 255 octets or less 526 - 527 - TTL positive values of a signed 32 bit number. 
528 - 529 - UDP messages 512 octets or less 530 - 531 - 3. DOMAIN NAME SPACE AND RR DEFINITIONS 532 - 533 - 3.1. Name space definitions 534 - 535 - Domain names in messages are expressed in terms of a sequence of labels. 536 - Each label is represented as a one octet length field followed by that 537 - number of octets. Since every domain name ends with the null label of 538 - the root, a domain name is terminated by a length byte of zero. The 539 - high order two bits of every length octet must be zero, and the 540 - remaining six bits of the length field limit the label to 63 octets or 541 - less. 542 - 543 - To simplify implementations, the total length of a domain name (i.e., 544 - label octets and label length octets) is restricted to 255 octets or 545 - less. 546 - 547 - Although labels can contain any 8 bit values in octets that make up a 548 - label, it is strongly recommended that labels follow the preferred 549 - syntax described elsewhere in this memo, which is compatible with 550 - existing host naming conventions. Name servers and resolvers must 551 - compare labels in a case-insensitive manner (i.e., A=a), assuming ASCII 552 - with zero parity. Non-alphabetic codes must match exactly. 553 - 554 - 555 - 556 - Mockapetris [Page 10] 557 - 558 - RFC 1035 Domain Implementation and Specification November 1987 559 - 560 - 561 - 3.2. RR definitions 562 - 563 - 3.2.1. 
Format 564 - 565 - All RRs have the same top level format shown below: 566 - 567 - 1 1 1 1 1 1 568 - 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 569 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 570 - | | 571 - / / 572 - / NAME / 573 - | | 574 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 575 - | TYPE | 576 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 577 - | CLASS | 578 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 579 - | TTL | 580 - | | 581 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 582 - | RDLENGTH | 583 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--| 584 - / RDATA / 585 - / / 586 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 587 - 588 - 589 - where: 590 - 591 - NAME an owner name, i.e., the name of the node to which this 592 - resource record pertains. 593 - 594 - TYPE two octets containing one of the RR TYPE codes. 595 - 596 - CLASS two octets containing one of the RR CLASS codes. 597 - 598 - TTL a 32 bit signed integer that specifies the time interval 599 - that the resource record may be cached before the source 600 - of the information should again be consulted. Zero 601 - values are interpreted to mean that the RR can only be 602 - used for the transaction in progress, and should not be 603 - cached. For example, SOA records are always distributed 604 - with a zero TTL to prohibit caching. Zero values can 605 - also be used for extremely volatile data. 606 - 607 - RDLENGTH an unsigned 16 bit integer that specifies the length in 608 - octets of the RDATA field. 609 - 610 - 611 - 612 - Mockapetris [Page 11] 613 - 614 - RFC 1035 Domain Implementation and Specification November 1987 615 - 616 - 617 - RDATA a variable length string of octets that describes the 618 - resource. The format of this information varies 619 - according to the TYPE and CLASS of the resource record. 620 - 621 - 3.2.2. TYPE values 622 - 623 - TYPE fields are used in resource records. Note that these types are a 624 - subset of QTYPEs. 
625 - 626 - TYPE value and meaning 627 - 628 - A 1 a host address 629 - 630 - NS 2 an authoritative name server 631 - 632 - MD 3 a mail destination (Obsolete - use MX) 633 - 634 - MF 4 a mail forwarder (Obsolete - use MX) 635 - 636 - CNAME 5 the canonical name for an alias 637 - 638 - SOA 6 marks the start of a zone of authority 639 - 640 - MB 7 a mailbox domain name (EXPERIMENTAL) 641 - 642 - MG 8 a mail group member (EXPERIMENTAL) 643 - 644 - MR 9 a mail rename domain name (EXPERIMENTAL) 645 - 646 - NULL 10 a null RR (EXPERIMENTAL) 647 - 648 - WKS 11 a well known service description 649 - 650 - PTR 12 a domain name pointer 651 - 652 - HINFO 13 host information 653 - 654 - MINFO 14 mailbox or mail list information 655 - 656 - MX 15 mail exchange 657 - 658 - TXT 16 text strings 659 - 660 - 3.2.3. QTYPE values 661 - 662 - QTYPE fields appear in the question part of a query. QTYPES are a 663 - superset of TYPEs, hence all TYPEs are valid QTYPEs. In addition, the 664 - following QTYPEs are defined: 665 - 666 - 667 - 668 - Mockapetris [Page 12] 669 - 670 - RFC 1035 Domain Implementation and Specification November 1987 671 - 672 - 673 - AXFR 252 A request for a transfer of an entire zone 674 - 675 - MAILB 253 A request for mailbox-related records (MB, MG or MR) 676 - 677 - MAILA 254 A request for mail agent RRs (Obsolete - see MX) 678 - 679 - * 255 A request for all records 680 - 681 - 3.2.4. CLASS values 682 - 683 - CLASS fields appear in resource records. The following CLASS mnemonics 684 - and values are defined: 685 - 686 - IN 1 the Internet 687 - 688 - CS 2 the CSNET class (Obsolete - used only for examples in 689 - some obsolete RFCs) 690 - 691 - CH 3 the CHAOS class 692 - 693 - HS 4 Hesiod [Dyer 87] 694 - 695 - 3.2.5. QCLASS values 696 - 697 - QCLASS fields appear in the question section of a query. QCLASS values 698 - are a superset of CLASS values; every CLASS is a valid QCLASS. 
In 699 - addition to CLASS values, the following QCLASSes are defined: 700 - 701 - * 255 any class 702 - 703 - 3.3. Standard RRs 704 - 705 - The following RR definitions are expected to occur, at least 706 - potentially, in all classes. In particular, NS, SOA, CNAME, and PTR 707 - will be used in all classes, and have the same format in all classes. 708 - Because their RDATA format is known, all domain names in the RDATA 709 - section of these RRs may be compressed. 710 - 711 - <domain-name> is a domain name represented as a series of labels, and 712 - terminated by a label with zero length. <character-string> is a single 713 - length octet followed by that number of characters. <character-string> 714 - is treated as binary information, and can be up to 256 characters in 715 - length (including the length octet). 716 - 717 - 718 - 719 - 720 - 721 - 722 - 723 - 724 - Mockapetris [Page 13] 725 - 726 - RFC 1035 Domain Implementation and Specification November 1987 727 - 728 - 729 - 3.3.1. CNAME RDATA format 730 - 731 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 732 - / CNAME / 733 - / / 734 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 735 - 736 - where: 737 - 738 - CNAME A <domain-name> which specifies the canonical or primary 739 - name for the owner. The owner name is an alias. 740 - 741 - CNAME RRs cause no additional section processing, but name servers may 742 - choose to restart the query at the canonical name in certain cases. See 743 - the description of name server logic in [RFC-1034] for details. 744 - 745 - 3.3.2. HINFO RDATA format 746 - 747 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 748 - / CPU / 749 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 750 - / OS / 751 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 752 - 753 - where: 754 - 755 - CPU A <character-string> which specifies the CPU type. 756 - 757 - OS A <character-string> which specifies the operating 758 - system type. 
759 - 760 - Standard values for CPU and OS can be found in [RFC-1010]. 761 - 762 - HINFO records are used to acquire general information about a host. The 763 - main use is for protocols such as FTP that can use special procedures 764 - when talking between machines or operating systems of the same type. 765 - 766 - 3.3.3. MB RDATA format (EXPERIMENTAL) 767 - 768 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 769 - / MADNAME / 770 - / / 771 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 772 - 773 - where: 774 - 775 - MADNAME A <domain-name> which specifies a host which has the 776 - specified mailbox. 777 - 778 - 779 - 780 - Mockapetris [Page 14] 781 - 782 - RFC 1035 Domain Implementation and Specification November 1987 783 - 784 - 785 - MB records cause additional section processing which looks up an A type 786 - RRs corresponding to MADNAME. 787 - 788 - 3.3.4. MD RDATA format (Obsolete) 789 - 790 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 791 - / MADNAME / 792 - / / 793 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 794 - 795 - where: 796 - 797 - MADNAME A <domain-name> which specifies a host which has a mail 798 - agent for the domain which should be able to deliver 799 - mail for the domain. 800 - 801 - MD records cause additional section processing which looks up an A type 802 - record corresponding to MADNAME. 803 - 804 - MD is obsolete. See the definition of MX and [RFC-974] for details of 805 - the new scheme. The recommended policy for dealing with MD RRs found in 806 - a master file is to reject them, or to convert them to MX RRs with a 807 - preference of 0. 808 - 809 - 3.3.5. MF RDATA format (Obsolete) 810 - 811 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 812 - / MADNAME / 813 - / / 814 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 815 - 816 - where: 817 - 818 - MADNAME A <domain-name> which specifies a host which has a mail 819 - agent for the domain which will accept mail for 820 - forwarding to the domain. 
821 - 822 - MF records cause additional section processing which looks up an A type 823 - record corresponding to MADNAME. 824 - 825 - MF is obsolete. See the definition of MX and [RFC-974] for details of 826 - the new scheme. The recommended policy for dealing with MD RRs found in 827 - a master file is to reject them, or to convert them to MX RRs with a 828 - preference of 10. 829 - 830 - 831 - 832 - 833 - 834 - 835 - 836 - Mockapetris [Page 15] 837 - 838 - RFC 1035 Domain Implementation and Specification November 1987 839 - 840 - 841 - 3.3.6. MG RDATA format (EXPERIMENTAL) 842 - 843 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 844 - / MGMNAME / 845 - / / 846 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 847 - 848 - where: 849 - 850 - MGMNAME A <domain-name> which specifies a mailbox which is a 851 - member of the mail group specified by the domain name. 852 - 853 - MG records cause no additional section processing. 854 - 855 - 3.3.7. MINFO RDATA format (EXPERIMENTAL) 856 - 857 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 858 - / RMAILBX / 859 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 860 - / EMAILBX / 861 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 862 - 863 - where: 864 - 865 - RMAILBX A <domain-name> which specifies a mailbox which is 866 - responsible for the mailing list or mailbox. If this 867 - domain name names the root, the owner of the MINFO RR is 868 - responsible for itself. Note that many existing mailing 869 - lists use a mailbox X-request for the RMAILBX field of 870 - mailing list X, e.g., Msgroup-request for Msgroup. This 871 - field provides a more general mechanism. 872 - 873 - 874 - EMAILBX A <domain-name> which specifies a mailbox which is to 875 - receive error messages related to the mailing list or 876 - mailbox specified by the owner of the MINFO RR (similar 877 - to the ERRORS-TO: field which has been proposed). 
If 878 - this domain name names the root, errors should be 879 - returned to the sender of the message. 880 - 881 - MINFO records cause no additional section processing. Although these 882 - records can be associated with a simple mailbox, they are usually used 883 - with a mailing list. 884 - 885 - 886 - 887 - 888 - 889 - 890 - 891 - 892 - Mockapetris [Page 16] 893 - 894 - RFC 1035 Domain Implementation and Specification November 1987 895 - 896 - 897 - 3.3.8. MR RDATA format (EXPERIMENTAL) 898 - 899 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 900 - / NEWNAME / 901 - / / 902 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 903 - 904 - where: 905 - 906 - NEWNAME A <domain-name> which specifies a mailbox which is the 907 - proper rename of the specified mailbox. 908 - 909 - MR records cause no additional section processing. The main use for MR 910 - is as a forwarding entry for a user who has moved to a different 911 - mailbox. 912 - 913 - 3.3.9. MX RDATA format 914 - 915 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 916 - | PREFERENCE | 917 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 918 - / EXCHANGE / 919 - / / 920 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 921 - 922 - where: 923 - 924 - PREFERENCE A 16 bit integer which specifies the preference given to 925 - this RR among others at the same owner. Lower values 926 - are preferred. 927 - 928 - EXCHANGE A <domain-name> which specifies a host willing to act as 929 - a mail exchange for the owner name. 930 - 931 - MX records cause type A additional section processing for the host 932 - specified by EXCHANGE. The use of MX RRs is explained in detail in 933 - [RFC-974]. 934 - 935 - 3.3.10. NULL RDATA format (EXPERIMENTAL) 936 - 937 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 938 - / <anything> / 939 - / / 940 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 941 - 942 - Anything at all may be in the RDATA field so long as it is 65535 octets 943 - or less. 
944 - 945 - 946 - 947 - 948 - Mockapetris [Page 17] 949 - 950 - RFC 1035 Domain Implementation and Specification November 1987 951 - 952 - 953 - NULL records cause no additional section processing. NULL RRs are not 954 - allowed in master files. NULLs are used as placeholders in some 955 - experimental extensions of the DNS. 956 - 957 - 3.3.11. NS RDATA format 958 - 959 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 960 - / NSDNAME / 961 - / / 962 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 963 - 964 - where: 965 - 966 - NSDNAME A <domain-name> which specifies a host which should be 967 - authoritative for the specified class and domain. 968 - 969 - NS records cause both the usual additional section processing to locate 970 - a type A record, and, when used in a referral, a special search of the 971 - zone in which they reside for glue information. 972 - 973 - The NS RR states that the named host should be expected to have a zone 974 - starting at owner name of the specified class. Note that the class may 975 - not indicate the protocol family which should be used to communicate 976 - with the host, although it is typically a strong hint. For example, 977 - hosts which are name servers for either Internet (IN) or Hesiod (HS) 978 - class information are normally queried using IN class protocols. 979 - 980 - 3.3.12. PTR RDATA format 981 - 982 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 983 - / PTRDNAME / 984 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 985 - 986 - where: 987 - 988 - PTRDNAME A <domain-name> which points to some location in the 989 - domain name space. 990 - 991 - PTR records cause no additional section processing. These RRs are used 992 - in special domains to point to some other location in the domain space. 993 - These records are simple data, and don't imply any special processing 994 - similar to that performed by CNAME, which identifies aliases. See the 995 - description of the IN-ADDR.ARPA domain for an example. 
996 - 997 - 998 - 999 - 1000 - 1001 - 1002 - 1003 - 1004 - Mockapetris [Page 18] 1005 - 1006 - RFC 1035 Domain Implementation and Specification November 1987 1007 - 1008 - 1009 - 3.3.13. SOA RDATA format 1010 - 1011 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1012 - / MNAME / 1013 - / / 1014 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1015 - / RNAME / 1016 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1017 - | SERIAL | 1018 - | | 1019 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1020 - | REFRESH | 1021 - | | 1022 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1023 - | RETRY | 1024 - | | 1025 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1026 - | EXPIRE | 1027 - | | 1028 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1029 - | MINIMUM | 1030 - | | 1031 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1032 - 1033 - where: 1034 - 1035 - MNAME The <domain-name> of the name server that was the 1036 - original or primary source of data for this zone. 1037 - 1038 - RNAME A <domain-name> which specifies the mailbox of the 1039 - person responsible for this zone. 1040 - 1041 - SERIAL The unsigned 32 bit version number of the original copy 1042 - of the zone. Zone transfers preserve this value. This 1043 - value wraps and should be compared using sequence space 1044 - arithmetic. 1045 - 1046 - REFRESH A 32 bit time interval before the zone should be 1047 - refreshed. 1048 - 1049 - RETRY A 32 bit time interval that should elapse before a 1050 - failed refresh should be retried. 1051 - 1052 - EXPIRE A 32 bit time value that specifies the upper limit on 1053 - the time interval that can elapse before the zone is no 1054 - longer authoritative. 1055 - 1056 - 1057 - 1058 - 1059 - 1060 - Mockapetris [Page 19] 1061 - 1062 - RFC 1035 Domain Implementation and Specification November 1987 1063 - 1064 - 1065 - MINIMUM The unsigned 32 bit minimum TTL field that should be 1066 - exported with any RR from this zone. 
1067 - 1068 - SOA records cause no additional section processing. 1069 - 1070 - All times are in units of seconds. 1071 - 1072 - Most of these fields are pertinent only for name server maintenance 1073 - operations. However, MINIMUM is used in all query operations that 1074 - retrieve RRs from a zone. Whenever a RR is sent in a response to a 1075 - query, the TTL field is set to the maximum of the TTL field from the RR 1076 - and the MINIMUM field in the appropriate SOA. Thus MINIMUM is a lower 1077 - bound on the TTL field for all RRs in a zone. Note that this use of 1078 - MINIMUM should occur when the RRs are copied into the response and not 1079 - when the zone is loaded from a master file or via a zone transfer. The 1080 - reason for this provision is to allow future dynamic update facilities to 1081 - change the SOA RR with known semantics. 1082 - 1083 - 1084 - 3.3.14. TXT RDATA format 1085 - 1086 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1087 - / TXT-DATA / 1088 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1089 - 1090 - where: 1091 - 1092 - TXT-DATA One or more <character-string>s. 1093 - 1094 - TXT RRs are used to hold descriptive text. The semantics of the text 1095 - depends on the domain where it is found. 1096 - 1097 - 3.4. Internet specific RRs 1098 - 1099 - 3.4.1. A RDATA format 1100 - 1101 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1102 - | ADDRESS | 1103 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1104 - 1105 - where: 1106 - 1107 - ADDRESS A 32 bit Internet address. 1108 - 1109 - Hosts that have multiple Internet addresses will have multiple A 1110 - records. 1111 - 1112 - 1113 - 1114 - 1115 - 1116 - Mockapetris [Page 20] 1117 - 1118 - RFC 1035 Domain Implementation and Specification November 1987 1119 - 1120 - 1121 - A records cause no additional section processing. 
The RDATA section of 1122 - an A line in a master file is an Internet address expressed as four 1123 - decimal numbers separated by dots without any imbedded spaces (e.g., 1124 - "10.2.0.52" or "192.0.5.6"). 1125 - 1126 - 3.4.2. WKS RDATA format 1127 - 1128 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1129 - | ADDRESS | 1130 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1131 - | PROTOCOL | | 1132 - +--+--+--+--+--+--+--+--+ | 1133 - | | 1134 - / <BIT MAP> / 1135 - / / 1136 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1137 - 1138 - where: 1139 - 1140 - ADDRESS A 32 bit Internet address 1141 - 1142 - PROTOCOL An 8 bit IP protocol number 1143 - 1144 - <BIT MAP> A variable length bit map. The bit map must be a 1145 - multiple of 8 bits long. 1146 - 1147 - The WKS record is used to describe the well known services supported by 1148 - a particular protocol on a particular internet address. The PROTOCOL 1149 - field specifies an IP protocol number, and the bit map has one bit per 1150 - port of the specified protocol. The first bit corresponds to port 0, 1151 - the second to port 1, etc. If the bit map does not include a bit for a 1152 - protocol of interest, that bit is assumed zero. The appropriate values 1153 - and mnemonics for ports and protocols are specified in [RFC-1010]. 1154 - 1155 - For example, if PROTOCOL=TCP (6), the 26th bit corresponds to TCP port 1156 - 25 (SMTP). If this bit is set, a SMTP server should be listening on TCP 1157 - port 25; if zero, SMTP service is not supported on the specified 1158 - address. 1159 - 1160 - The purpose of WKS RRs is to provide availability information for 1161 - servers for TCP and UDP. If a server supports both TCP and UDP, or has 1162 - multiple Internet addresses, then multiple WKS RRs are used. 1163 - 1164 - WKS RRs cause no additional section processing. 1165 - 1166 - In master files, both ports and protocols are expressed using mnemonics 1167 - or decimal numbers. 
1168 - 1169 - 1170 - 1171 - 1172 - Mockapetris [Page 21] 1173 - 1174 - RFC 1035 Domain Implementation and Specification November 1987 1175 - 1176 - 1177 - 3.5. IN-ADDR.ARPA domain 1178 - 1179 - The Internet uses a special domain to support gateway location and 1180 - Internet address to host mapping. Other classes may employ a similar 1181 - strategy in other domains. The intent of this domain is to provide a 1182 - guaranteed method to perform host address to host name mapping, and to 1183 - facilitate queries to locate all gateways on a particular network in the 1184 - Internet. 1185 - 1186 - Note that both of these services are similar to functions that could be 1187 - performed by inverse queries; the difference is that this part of the 1188 - domain name space is structured according to address, and hence can 1189 - guarantee that the appropriate data can be located without an exhaustive 1190 - search of the domain space. 1191 - 1192 - The domain begins at IN-ADDR.ARPA and has a substructure which follows 1193 - the Internet addressing structure. 1194 - 1195 - Domain names in the IN-ADDR.ARPA domain are defined to have up to four 1196 - labels in addition to the IN-ADDR.ARPA suffix. Each label represents 1197 - one octet of an Internet address, and is expressed as a character string 1198 - for a decimal value in the range 0-255 (with leading zeros omitted 1199 - except in the case of a zero octet which is represented by a single 1200 - zero). 1201 - 1202 - Host addresses are represented by domain names that have all four labels 1203 - specified. Thus data for Internet address 10.2.0.52 is located at 1204 - domain name 52.0.2.10.IN-ADDR.ARPA. The reversal, though awkward to 1205 - read, allows zones to be delegated which are exactly one network of 1206 - address space. For example, 10.IN-ADDR.ARPA can be a zone containing 1207 - data for the ARPANET, while 26.IN-ADDR.ARPA can be a separate zone for 1208 - MILNET. 
Address nodes are used to hold pointers to primary host names 1209 - in the normal domain space. 1210 - 1211 - Network numbers correspond to some non-terminal nodes at various depths 1212 - in the IN-ADDR.ARPA domain, since Internet network numbers are either 1, 1213 - 2, or 3 octets. Network nodes are used to hold pointers to the primary 1214 - host names of gateways attached to that network. Since a gateway is, by 1215 - definition, on more than one network, it will typically have two or more 1216 - network nodes which point at it. Gateways will also have host level 1217 - pointers at their fully qualified addresses. 1218 - 1219 - Both the gateway pointers at network nodes and the normal host pointers 1220 - at full address nodes use the PTR RR to point back to the primary domain 1221 - names of the corresponding hosts. 1222 - 1223 - For example, the IN-ADDR.ARPA domain will contain information about the 1224 - ISI gateway between net 10 and 26, an MIT gateway from net 10 to MIT's 1225 - 1226 - 1227 - 1228 - Mockapetris [Page 22] 1229 - 1230 - RFC 1035 Domain Implementation and Specification November 1987 1231 - 1232 - 1233 - net 18, and hosts A.ISI.EDU and MULTICS.MIT.EDU. Assuming that ISI 1234 - gateway has addresses 10.2.0.22 and 26.0.0.103, and a name MILNET- 1235 - GW.ISI.EDU, and the MIT gateway has addresses 10.0.0.77 and 18.10.0.4 1236 - and a name GW.LCS.MIT.EDU, the domain database would contain: 1237 - 1238 - 10.IN-ADDR.ARPA. PTR MILNET-GW.ISI.EDU. 1239 - 10.IN-ADDR.ARPA. PTR GW.LCS.MIT.EDU. 1240 - 18.IN-ADDR.ARPA. PTR GW.LCS.MIT.EDU. 1241 - 26.IN-ADDR.ARPA. PTR MILNET-GW.ISI.EDU. 1242 - 22.0.2.10.IN-ADDR.ARPA. PTR MILNET-GW.ISI.EDU. 1243 - 103.0.0.26.IN-ADDR.ARPA. PTR MILNET-GW.ISI.EDU. 1244 - 77.0.0.10.IN-ADDR.ARPA. PTR GW.LCS.MIT.EDU. 1245 - 4.0.10.18.IN-ADDR.ARPA. PTR GW.LCS.MIT.EDU. 1246 - 103.0.3.26.IN-ADDR.ARPA. PTR A.ISI.EDU. 1247 - 6.0.0.10.IN-ADDR.ARPA. PTR MULTICS.MIT.EDU. 
1248 - 1249 - Thus a program which wanted to locate gateways on net 10 would originate 1250 - a query of the form QTYPE=PTR, QCLASS=IN, QNAME=10.IN-ADDR.ARPA. It 1251 - would receive two RRs in response: 1252 - 1253 - 10.IN-ADDR.ARPA. PTR MILNET-GW.ISI.EDU. 1254 - 10.IN-ADDR.ARPA. PTR GW.LCS.MIT.EDU. 1255 - 1256 - The program could then originate QTYPE=A, QCLASS=IN queries for MILNET- 1257 - GW.ISI.EDU. and GW.LCS.MIT.EDU. to discover the Internet addresses of 1258 - these gateways. 1259 - 1260 - A resolver which wanted to find the host name corresponding to Internet 1261 - host address 10.0.0.6 would pursue a query of the form QTYPE=PTR, 1262 - QCLASS=IN, QNAME=6.0.0.10.IN-ADDR.ARPA, and would receive: 1263 - 1264 - 6.0.0.10.IN-ADDR.ARPA. PTR MULTICS.MIT.EDU. 1265 - 1266 - Several cautions apply to the use of these services: 1267 - - Since the IN-ADDR.ARPA special domain and the normal domain 1268 - for a particular host or gateway will be in different zones, 1269 - the possibility exists that the data may be inconsistent. 1270 - 1271 - - Gateways will often have two names in separate domains, only 1272 - one of which can be primary. 1273 - 1274 - - Systems that use the domain database to initialize their 1275 - routing tables must start with enough gateway information to 1276 - guarantee that they can access the appropriate name server. 1277 - 1278 - - The gateway data only reflects the existence of a gateway in a 1279 - manner equivalent to the current HOSTS.TXT file. It doesn't 1280 - replace the dynamic availability information from GGP or EGP. 1281 - 1282 - 1283 - 1284 - Mockapetris [Page 23] 1285 - 1286 - RFC 1035 Domain Implementation and Specification November 1987 1287 - 1288 - 1289 - 3.6. Defining new types, classes, and special namespaces 1290 - 1291 - The previously defined types and classes are the ones in use as of the 1292 - date of this memo. New definitions should be expected. 
This section 1293 - makes some recommendations to designers considering additions to the 1294 - existing facilities. The mailing list NAMEDROPPERS@SRI-NIC.ARPA is the 1295 - forum where general discussion of design issues takes place. 1296 - 1297 - In general, a new type is appropriate when new information is to be 1298 - added to the database about an existing object, or we need new data 1299 - formats for some totally new object. Designers should attempt to define 1300 - types and their RDATA formats that are generally applicable to all 1301 - classes, and which avoid duplication of information. New classes are 1302 - appropriate when the DNS is to be used for a new protocol, etc which 1303 - requires new class-specific data formats, or when a copy of the existing 1304 - name space is desired, but a separate management domain is necessary. 1305 - 1306 - New types and classes need mnemonics for master files; the format of the 1307 - master files requires that the mnemonics for type and class be disjoint. 1308 - 1309 - TYPE and CLASS values must be a proper subset of QTYPEs and QCLASSes 1310 - respectively. 1311 - 1312 - The present system uses multiple RRs to represent multiple values of a 1313 - type rather than storing multiple values in the RDATA section of a 1314 - single RR. This is less efficient for most applications, but does keep 1315 - RRs shorter. The multiple RRs assumption is incorporated in some 1316 - experimental work on dynamic update methods. 1317 - 1318 - The present system attempts to minimize the duplication of data in the 1319 - database in order to insure consistency. Thus, in order to find the 1320 - address of the host for a mail exchange, you map the mail domain name to 1321 - a host name, then the host name to addresses, rather than a direct 1322 - mapping to host address. This approach is preferred because it avoids 1323 - the opportunity for inconsistency. 
1324 - 1325 - In defining a new type of data, multiple RR types should not be used to 1326 - create an ordering between entries or express different formats for 1327 - equivalent bindings, instead this information should be carried in the 1328 - body of the RR and a single type used. This policy avoids problems with 1329 - caching multiple types and defining QTYPEs to match multiple types. 1330 - 1331 - For example, the original form of mail exchange binding used two RR 1332 - types one to represent a "closer" exchange (MD) and one to represent a 1333 - "less close" exchange (MF). The difficulty is that the presence of one 1334 - RR type in a cache doesn't convey any information about the other 1335 - because the query which acquired the cached information might have used 1336 - a QTYPE of MF, MD, or MAILA (which matched both). The redesigned 1337 - 1338 - 1339 - 1340 - Mockapetris [Page 24] 1341 - 1342 - RFC 1035 Domain Implementation and Specification November 1987 1343 - 1344 - 1345 - service used a single type (MX) with a "preference" value in the RDATA 1346 - section which can order different RRs. However, if any MX RRs are found 1347 - in the cache, then all should be there. 1348 - 1349 - 4. MESSAGES 1350 - 1351 - 4.1. Format 1352 - 1353 - All communications inside of the domain protocol are carried in a single 1354 - format called a message. The top level format of message is divided 1355 - into 5 sections (some of which are empty in certain cases) shown below: 1356 - 1357 - +---------------------+ 1358 - | Header | 1359 - +---------------------+ 1360 - | Question | the question for the name server 1361 - +---------------------+ 1362 - | Answer | RRs answering the question 1363 - +---------------------+ 1364 - | Authority | RRs pointing toward an authority 1365 - +---------------------+ 1366 - | Additional | RRs holding additional information 1367 - +---------------------+ 1368 - 1369 - The header section is always present. 
The header includes fields that 1370 - specify which of the remaining sections are present, and also specify 1371 - whether the message is a query or a response, a standard query or some 1372 - other opcode, etc. 1373 - 1374 - The names of the sections after the header are derived from their use in 1375 - standard queries. The question section contains fields that describe a 1376 - question to a name server. These fields are a query type (QTYPE), a 1377 - query class (QCLASS), and a query domain name (QNAME). The last three 1378 - sections have the same format: a possibly empty list of concatenated 1379 - resource records (RRs). The answer section contains RRs that answer the 1380 - question; the authority section contains RRs that point toward an 1381 - authoritative name server; the additional records section contains RRs 1382 - which relate to the query, but are not strictly answers for the 1383 - question. 1384 - 1385 - 1386 - 1387 - 1388 - 1389 - 1390 - 1391 - 1392 - 1393 - 1394 - 1395 - 1396 - Mockapetris [Page 25] 1397 - 1398 - RFC 1035 Domain Implementation and Specification November 1987 1399 - 1400 - 1401 - 4.1.1. Header section format 1402 - 1403 - The header contains the following fields: 1404 - 1405 - 1 1 1 1 1 1 1406 - 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 1407 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1408 - | ID | 1409 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1410 - |QR| Opcode |AA|TC|RD|RA| Z | RCODE | 1411 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1412 - | QDCOUNT | 1413 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1414 - | ANCOUNT | 1415 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1416 - | NSCOUNT | 1417 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1418 - | ARCOUNT | 1419 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1420 - 1421 - where: 1422 - 1423 - ID A 16 bit identifier assigned by the program that 1424 - generates any kind of query. 
This identifier is copied 1425 - to the corresponding reply and can be used by the requester 1426 - to match up replies to outstanding queries. 1427 - 1428 - QR A one bit field that specifies whether this message is a 1429 - query (0), or a response (1). 1430 - 1431 - OPCODE A four bit field that specifies kind of query in this 1432 - message. This value is set by the originator of a query 1433 - and copied into the response. The values are: 1434 - 1435 - 0 a standard query (QUERY) 1436 - 1437 - 1 an inverse query (IQUERY) 1438 - 1439 - 2 a server status request (STATUS) 1440 - 1441 - 3-15 reserved for future use 1442 - 1443 - AA Authoritative Answer - this bit is valid in responses, 1444 - and specifies that the responding name server is an 1445 - authority for the domain name in question section. 1446 - 1447 - Note that the contents of the answer section may have 1448 - multiple owner names because of aliases. The AA bit 1449 - 1450 - 1451 - 1452 - Mockapetris [Page 26] 1453 - 1454 - RFC 1035 Domain Implementation and Specification November 1987 1455 - 1456 - 1457 - corresponds to the name which matches the query name, or 1458 - the first owner name in the answer section. 1459 - 1460 - TC TrunCation - specifies that this message was truncated 1461 - due to length greater than that permitted on the 1462 - transmission channel. 1463 - 1464 - RD Recursion Desired - this bit may be set in a query and 1465 - is copied into the response. If RD is set, it directs 1466 - the name server to pursue the query recursively. 1467 - Recursive query support is optional. 1468 - 1469 - RA Recursion Available - this bit is set or cleared in a 1470 - response, and denotes whether recursive query support is 1471 - available in the name server. 1472 - 1473 - Z Reserved for future use. Must be zero in all queries 1474 - and responses. 1475 - 1476 - RCODE Response code - this 4 bit field is set as part of 1477 - responses. 
The values have the following 1478 - interpretation: 1479 - 1480 - 0 No error condition 1481 - 1482 - 1 Format error - The name server was 1483 - unable to interpret the query. 1484 - 1485 - 2 Server failure - The name server was 1486 - unable to process this query due to a 1487 - problem with the name server. 1488 - 1489 - 3 Name Error - Meaningful only for 1490 - responses from an authoritative name 1491 - server, this code signifies that the 1492 - domain name referenced in the query does 1493 - not exist. 1494 - 1495 - 4 Not Implemented - The name server does 1496 - not support the requested kind of query. 1497 - 1498 - 5 Refused - The name server refuses to 1499 - perform the specified operation for 1500 - policy reasons. For example, a name 1501 - server may not wish to provide the 1502 - information to the particular requester, 1503 - or a name server may not wish to perform 1504 - a particular operation (e.g., zone 1505 - 1506 - 1507 - 1508 - Mockapetris [Page 27] 1509 - 1510 - RFC 1035 Domain Implementation and Specification November 1987 1511 - 1512 - 1513 - transfer) for particular data. 1514 - 1515 - 6-15 Reserved for future use. 1516 - 1517 - QDCOUNT an unsigned 16 bit integer specifying the number of 1518 - entries in the question section. 1519 - 1520 - ANCOUNT an unsigned 16 bit integer specifying the number of 1521 - resource records in the answer section. 1522 - 1523 - NSCOUNT an unsigned 16 bit integer specifying the number of name 1524 - server resource records in the authority records 1525 - section. 1526 - 1527 - ARCOUNT an unsigned 16 bit integer specifying the number of 1528 - resource records in the additional records section. 1529 - 1530 - 4.1.2. Question section format 1531 - 1532 - The question section is used to carry the "question" in most queries, 1533 - i.e., the parameters that define what is being asked. 
The section 1534 - contains QDCOUNT (usually 1) entries, each of the following format: 1535 - 1536 - 1 1 1 1 1 1 1537 - 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 1538 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1539 - | | 1540 - / QNAME / 1541 - / / 1542 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1543 - | QTYPE | 1544 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1545 - | QCLASS | 1546 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1547 - 1548 - where: 1549 - 1550 - QNAME a domain name represented as a sequence of labels, where 1551 - each label consists of a length octet followed by that 1552 - number of octets. The domain name terminates with the 1553 - zero length octet for the null label of the root. Note 1554 - that this field may be an odd number of octets; no 1555 - padding is used. 1556 - 1557 - QTYPE a two octet code which specifies the type of the query. 1558 - The values for this field include all codes valid for a 1559 - TYPE field, together with some more general codes which 1560 - can match more than one type of RR. 1561 - 1562 - 1563 - 1564 - Mockapetris [Page 28] 1565 - 1566 - RFC 1035 Domain Implementation and Specification November 1987 1567 - 1568 - 1569 - QCLASS a two octet code that specifies the class of the query. 1570 - For example, the QCLASS field is IN for the Internet. 1571 - 1572 - 4.1.3. Resource record format 1573 - 1574 - The answer, authority, and additional sections all share the same 1575 - format: a variable number of resource records, where the number of 1576 - records is specified in the corresponding count field in the header. 
1577 - Each resource record has the following format: 1578 - 1 1 1 1 1 1 1579 - 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 1580 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1581 - | | 1582 - / / 1583 - / NAME / 1584 - | | 1585 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1586 - | TYPE | 1587 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1588 - | CLASS | 1589 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1590 - | TTL | 1591 - | | 1592 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1593 - | RDLENGTH | 1594 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--| 1595 - / RDATA / 1596 - / / 1597 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1598 - 1599 - where: 1600 - 1601 - NAME a domain name to which this resource record pertains. 1602 - 1603 - TYPE two octets containing one of the RR type codes. This 1604 - field specifies the meaning of the data in the RDATA 1605 - field. 1606 - 1607 - CLASS two octets which specify the class of the data in the 1608 - RDATA field. 1609 - 1610 - TTL a 32 bit unsigned integer that specifies the time 1611 - interval (in seconds) that the resource record may be 1612 - cached before it should be discarded. Zero values are 1613 - interpreted to mean that the RR can only be used for the 1614 - transaction in progress, and should not be cached. 1615 - 1616 - 1617 - 1618 - 1619 - 1620 - Mockapetris [Page 29] 1621 - 1622 - RFC 1035 Domain Implementation and Specification November 1987 1623 - 1624 - 1625 - RDLENGTH an unsigned 16 bit integer that specifies the length in 1626 - octets of the RDATA field. 1627 - 1628 - RDATA a variable length string of octets that describes the 1629 - resource. The format of this information varies 1630 - according to the TYPE and CLASS of the resource record. 1631 - For example, the if the TYPE is A and the CLASS is IN, 1632 - the RDATA field is a 4 octet ARPA Internet address. 1633 - 1634 - 4.1.4. 
Message compression 1635 - 1636 - In order to reduce the size of messages, the domain system utilizes a 1637 - compression scheme which eliminates the repetition of domain names in a 1638 - message. In this scheme, an entire domain name or a list of labels at 1639 - the end of a domain name is replaced with a pointer to a prior occurrence 1640 - of the same name. 1641 - 1642 - The pointer takes the form of a two octet sequence: 1643 - 1644 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1645 - | 1 1| OFFSET | 1646 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1647 - 1648 - The first two bits are ones. This allows a pointer to be distinguished 1649 - from a label, since the label must begin with two zero bits because 1650 - labels are restricted to 63 octets or less. (The 10 and 01 combinations 1651 - are reserved for future use.) The OFFSET field specifies an offset from 1652 - the start of the message (i.e., the first octet of the ID field in the 1653 - domain header). A zero offset specifies the first byte of the ID field, 1654 - etc. 1655 - 1656 - The compression scheme allows a domain name in a message to be 1657 - represented as either: 1658 - 1659 - - a sequence of labels ending in a zero octet 1660 - 1661 - - a pointer 1662 - 1663 - - a sequence of labels ending with a pointer 1664 - 1665 - Pointers can only be used for occurrences of a domain name where the 1666 - format is not class specific. If this were not the case, a name server 1667 - or resolver would be required to know the format of all RRs it handled. 1668 - As yet, there are no such cases, but they may occur in future RDATA 1669 - formats. 
1670 - 1671 - If a domain name is contained in a part of the message subject to a 1672 - length field (such as the RDATA section of an RR), and compression is 1673 - 1674 - 1675 - 1676 - Mockapetris [Page 30] 1677 - 1678 - RFC 1035 Domain Implementation and Specification November 1987 1679 - 1680 - 1681 - used, the length of the compressed name is used in the length 1682 - calculation, rather than the length of the expanded name. 1683 - 1684 - Programs are free to avoid using pointers in messages they generate, 1685 - although this will reduce datagram capacity, and may cause truncation. 1686 - However all programs are required to understand arriving messages that 1687 - contain pointers. 1688 - 1689 - For example, a datagram might need to use the domain names F.ISI.ARPA, 1690 - FOO.F.ISI.ARPA, ARPA, and the root. Ignoring the other fields of the 1691 - message, these domain names might be represented as: 1692 - 1693 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1694 - 20 | 1 | F | 1695 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1696 - 22 | 3 | I | 1697 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1698 - 24 | S | I | 1699 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1700 - 26 | 4 | A | 1701 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1702 - 28 | R | P | 1703 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1704 - 30 | A | 0 | 1705 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1706 - 1707 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1708 - 40 | 3 | F | 1709 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1710 - 42 | O | O | 1711 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1712 - 44 | 1 1| 20 | 1713 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1714 - 1715 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1716 - 64 | 1 1| 26 | 1717 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1718 - 1719 - +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1720 - 92 | 0 | | 1721 - 
+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 1722 - 1723 - The domain name for F.ISI.ARPA is shown at offset 20. The domain name 1724 - FOO.F.ISI.ARPA is shown at offset 40; this definition uses a pointer to 1725 - concatenate a label for FOO to the previously defined F.ISI.ARPA. The 1726 - domain name ARPA is defined at offset 64 using a pointer to the ARPA 1727 - component of the name F.ISI.ARPA at 20; note that this pointer relies on 1728 - ARPA being the last label in the string at 20. The root domain name is 1729 - 1730 - 1731 - 1732 - Mockapetris [Page 31] 1733 - 1734 - RFC 1035 Domain Implementation and Specification November 1987 1735 - 1736 - 1737 - defined by a single octet of zeros at 92; the root domain name has no 1738 - labels. 1739 - 1740 - 4.2. Transport 1741 - 1742 - The DNS assumes that messages will be transmitted as datagrams or in a 1743 - byte stream carried by a virtual circuit. While virtual circuits can be 1744 - used for any DNS activity, datagrams are preferred for queries due to 1745 - their lower overhead and better performance. Zone refresh activities 1746 - must use virtual circuits because of the need for reliable transfer. 1747 - 1748 - The Internet supports name server access using TCP [RFC-793] on server 1749 - port 53 (decimal) as well as datagram access using UDP [RFC-768] on UDP 1750 - port 53 (decimal). 1751 - 1752 - 4.2.1. UDP usage 1753 - 1754 - Messages sent using UDP use server port 53 (decimal). 1755 - 1756 - Messages carried by UDP are restricted to 512 bytes (not counting the IP 1757 - or UDP headers). Longer messages are truncated and the TC bit is set in 1758 - the header. 1759 - 1760 - UDP is not acceptable for zone transfers, but is the recommended method 1761 - for standard queries in the Internet. Queries sent using UDP may be 1762 - lost, and hence a retransmission strategy is required. 
Queries or their 1763 - responses may be reordered by the network, or by processing in name 1764 - servers, so resolvers should not depend on them being returned in order. 1765 - 1766 - The optimal UDP retransmission policy will vary with performance of the 1767 - Internet and the needs of the client, but the following are recommended: 1768 - 1769 - - The client should try other servers and server addresses 1770 - before repeating a query to a specific address of a server. 1771 - 1772 - - The retransmission interval should be based on prior 1773 - statistics if possible. Too aggressive retransmission can 1774 - easily slow responses for the community at large. Depending 1775 - on how well connected the client is to its expected servers, 1776 - the minimum retransmission interval should be 2-5 seconds. 1777 - 1778 - More suggestions on server selection and retransmission policy can be 1779 - found in the resolver section of this memo. 1780 - 1781 - 4.2.2. TCP usage 1782 - 1783 - Messages sent over TCP connections use server port 53 (decimal). The 1784 - message is prefixed with a two byte length field which gives the message 1785 - 1786 - 1787 - 1788 - Mockapetris [Page 32] 1789 - 1790 - RFC 1035 Domain Implementation and Specification November 1987 1791 - 1792 - 1793 - length, excluding the two byte length field. This length field allows 1794 - the low-level processing to assemble a complete message before beginning 1795 - to parse it. 1796 - 1797 - Several connection management policies are recommended: 1798 - 1799 - - The server should not block other activities waiting for TCP 1800 - data. 1801 - 1802 - - The server should support multiple connections. 1803 - 1804 - - The server should assume that the client will initiate 1805 - connection closing, and should delay closing its end of the 1806 - connection until all outstanding client requests have been 1807 - satisfied. 
1808 - 1809 - - If the server needs to close a dormant connection to reclaim 1810 - resources, it should wait until the connection has been idle 1811 - for a period on the order of two minutes. In particular, the 1812 - server should allow the SOA and AXFR request sequence (which 1813 - begins a refresh operation) to be made on a single connection. 1814 - Since the server would be unable to answer queries anyway, a 1815 - unilateral close or reset may be used instead of a graceful 1816 - close. 1817 - 1818 - 5. MASTER FILES 1819 - 1820 - Master files are text files that contain RRs in text form. Since the 1821 - contents of a zone can be expressed in the form of a list of RRs a 1822 - master file is most often used to define a zone, though it can be used 1823 - to list a cache's contents. Hence, this section first discusses the 1824 - format of RRs in a master file, and then the special considerations when 1825 - a master file is used to create a zone in some name server. 1826 - 1827 - 5.1. Format 1828 - 1829 - The format of these files is a sequence of entries. Entries are 1830 - predominantly line-oriented, though parentheses can be used to continue 1831 - a list of items across a line boundary, and text literals can contain 1832 - CRLF within the text. Any combination of tabs and spaces act as a 1833 - delimiter between the separate items that make up an entry. The end of 1834 - any line in the master file can end with a comment. The comment starts 1835 - with a ";" (semicolon). 
1836 - 1837 - The following entries are defined: 1838 - 1839 - <blank>[<comment>] 1840 - 1841 - 1842 - 1843 - 1844 - Mockapetris [Page 33] 1845 - 1846 - RFC 1035 Domain Implementation and Specification November 1987 1847 - 1848 - 1849 - $ORIGIN <domain-name> [<comment>] 1850 - 1851 - $INCLUDE <file-name> [<domain-name>] [<comment>] 1852 - 1853 - <domain-name><rr> [<comment>] 1854 - 1855 - <blank><rr> [<comment>] 1856 - 1857 - Blank lines, with or without comments, are allowed anywhere in the file. 1858 - 1859 - Two control entries are defined: $ORIGIN and $INCLUDE. $ORIGIN is 1860 - followed by a domain name, and resets the current origin for relative 1861 - domain names to the stated name. $INCLUDE inserts the named file into 1862 - the current file, and may optionally specify a domain name that sets the 1863 - relative domain name origin for the included file. $INCLUDE may also 1864 - have a comment. Note that a $INCLUDE entry never changes the relative 1865 - origin of the parent file, regardless of changes to the relative origin 1866 - made within the included file. 1867 - 1868 - The last two forms represent RRs. If an entry for an RR begins with a 1869 - blank, then the RR is assumed to be owned by the last stated owner. If 1870 - an RR entry begins with a <domain-name>, then the owner name is reset. 1871 - 1872 - <rr> contents take one of the following forms: 1873 - 1874 - [<TTL>] [<class>] <type> <RDATA> 1875 - 1876 - [<class>] [<TTL>] <type> <RDATA> 1877 - 1878 - The RR begins with optional TTL and class fields, followed by a type and 1879 - RDATA field appropriate to the type and class. Class and type use the 1880 - standard mnemonics, TTL is a decimal integer. Omitted class and TTL 1881 - values are default to the last explicitly stated values. Since type and 1882 - class mnemonics are disjoint, the parse is unique. 
(Note that this 1883 - order is different from the order used in examples and the order used in 1884 - the actual RRs; the given order allows easier parsing and defaulting.) 1885 - 1886 - <domain-name>s make up a large share of the data in the master file. 1887 - The labels in the domain name are expressed as character strings and 1888 - separated by dots. Quoting conventions allow arbitrary characters to be 1889 - stored in domain names. Domain names that end in a dot are called 1890 - absolute, and are taken as complete. Domain names which do not end in a 1891 - dot are called relative; the actual domain name is the concatenation of 1892 - the relative part with an origin specified in a $ORIGIN, $INCLUDE, or as 1893 - an argument to the master file loading routine. A relative name is an 1894 - error when no origin is available. 1895 - 1896 - 1897 - 1898 - 1899 - 1900 - Mockapetris [Page 34] 1901 - 1902 - RFC 1035 Domain Implementation and Specification November 1987 1903 - 1904 - 1905 - <character-string> is expressed in one or two ways: as a contiguous set 1906 - of characters without interior spaces, or as a string beginning with a " 1907 - and ending with a ". Inside a " delimited string any character can 1908 - occur, except for a " itself, which must be quoted using \ (back slash). 1909 - 1910 - Because these files are text files several special encodings are 1911 - necessary to allow arbitrary data to be loaded. In particular: 1912 - 1913 - of the root. 1914 - 1915 - @ A free standing @ is used to denote the current origin. 1916 - 1917 - \X where X is any character other than a digit (0-9), is 1918 - used to quote that character so that its special meaning 1919 - does not apply. For example, "\." can be used to place 1920 - a dot character in a label. 1921 - 1922 - \DDD where each D is a digit is the octet corresponding to 1923 - the decimal number described by DDD. 
The resulting 1924 - octet is assumed to be text and is not checked for 1925 - special meaning. 1926 - 1927 - ( ) Parentheses are used to group data that crosses a line 1928 - boundary. In effect, line terminations are not 1929 - recognized within parentheses. 1930 - 1931 - ; Semicolon is used to start a comment; the remainder of 1932 - the line is ignored. 1933 - 1934 - 5.2. Use of master files to define zones 1935 - 1936 - When a master file is used to load a zone, the operation should be 1937 - suppressed if any errors are encountered in the master file. The 1938 - rationale for this is that a single error can have widespread 1939 - consequences. For example, suppose that the RRs defining a delegation 1940 - have syntax errors; then the server will return authoritative name 1941 - errors for all names in the subzone (except in the case where the 1942 - subzone is also present on the server). 1943 - 1944 - Several other validity checks that should be performed in addition to 1945 - insuring that the file is syntactically correct: 1946 - 1947 - 1. All RRs in the file should have the same class. 1948 - 1949 - 2. Exactly one SOA RR should be present at the top of the zone. 1950 - 1951 - 3. If delegations are present and glue information is required, 1952 - it should be present. 1953 - 1954 - 1955 - 1956 - Mockapetris [Page 35] 1957 - 1958 - RFC 1035 Domain Implementation and Specification November 1987 1959 - 1960 - 1961 - 4. Information present outside of the authoritative nodes in the 1962 - zone should be glue information, rather than the result of an 1963 - origin or similar error. 1964 - 1965 - 5.3. Master file example 1966 - 1967 - The following is an example file which might be used to define the 1968 - ISI.EDU zone.and is loaded with an origin of ISI.EDU: 1969 - 1970 - @ IN SOA VENERA Action\.domains ( 1971 - 20 ; SERIAL 1972 - 7200 ; REFRESH 1973 - 600 ; RETRY 1974 - 3600000; EXPIRE 1975 - 60) ; MINIMUM 1976 - 1977 - NS A.ISI.EDU. 
1978 - NS VENERA 1979 - NS VAXA 1980 - MX 10 VENERA 1981 - MX 20 VAXA 1982 - 1983 - A A 26.3.0.103 1984 - 1985 - VENERA A 10.1.0.52 1986 - A 128.9.0.32 1987 - 1988 - VAXA A 10.2.0.27 1989 - A 128.9.0.33 1990 - 1991 - 1992 - $INCLUDE <SUBSYS>ISI-MAILBOXES.TXT 1993 - 1994 - Where the file <SUBSYS>ISI-MAILBOXES.TXT is: 1995 - 1996 - MOE MB A.ISI.EDU. 1997 - LARRY MB A.ISI.EDU. 1998 - CURLEY MB A.ISI.EDU. 1999 - STOOGES MG MOE 2000 - MG LARRY 2001 - MG CURLEY 2002 - 2003 - Note the use of the \ character in the SOA RR to specify the responsible 2004 - person mailbox "Action.domains@E.ISI.EDU". 2005 - 2006 - 2007 - 2008 - 2009 - 2010 - 2011 - 2012 - Mockapetris [Page 36] 2013 - 2014 - RFC 1035 Domain Implementation and Specification November 1987 2015 - 2016 - 2017 - 6. NAME SERVER IMPLEMENTATION 2018 - 2019 - 6.1. Architecture 2020 - 2021 - The optimal structure for the name server will depend on the host 2022 - operating system and whether the name server is integrated with resolver 2023 - operations, either by supporting recursive service, or by sharing its 2024 - database with a resolver. This section discusses implementation 2025 - considerations for a name server which shares a database with a 2026 - resolver, but most of these concerns are present in any name server. 2027 - 2028 - 6.1.1. Control 2029 - 2030 - A name server must employ multiple concurrent activities, whether they 2031 - are implemented as separate tasks in the host's OS or multiplexing 2032 - inside a single name server program. It is simply not acceptable for a 2033 - name server to block the service of UDP requests while it waits for TCP 2034 - data for refreshing or query activities. Similarly, a name server 2035 - should not attempt to provide recursive service without processing such 2036 - requests in parallel, though it may choose to serialize requests from a 2037 - single client, or to regard identical requests from the same client as 2038 - duplicates. 
A name server should not substantially delay requests while 2039 - it reloads a zone from master files or while it incorporates a newly 2040 - refreshed zone into its database. 2041 - 2042 - 6.1.2. Database 2043 - 2044 - While name server implementations are free to use any internal data 2045 - structures they choose, the suggested structure consists of three major 2046 - parts: 2047 - 2048 - - A "catalog" data structure which lists the zones available to 2049 - this server, and a "pointer" to the zone data structure. The 2050 - main purpose of this structure is to find the nearest ancestor 2051 - zone, if any, for arriving standard queries. 2052 - 2053 - - Separate data structures for each of the zones held by the 2054 - name server. 2055 - 2056 - - A data structure for cached data. (or perhaps separate caches 2057 - for different classes) 2058 - 2059 - All of these data structures can be implemented an identical tree 2060 - structure format, with different data chained off the nodes in different 2061 - parts: in the catalog the data is pointers to zones, while in the zone 2062 - and cache data structures, the data will be RRs. In designing the tree 2063 - framework the designer should recognize that query processing will need 2064 - to traverse the tree using case-insensitive label comparisons; and that 2065 - 2066 - 2067 - 2068 - Mockapetris [Page 37] 2069 - 2070 - RFC 1035 Domain Implementation and Specification November 1987 2071 - 2072 - 2073 - in real data, a few nodes have a very high branching factor (100-1000 or 2074 - more), but the vast majority have a very low branching factor (0-1). 2075 - 2076 - One way to solve the case problem is to store the labels for each node 2077 - in two pieces: a standardized-case representation of the label where all 2078 - ASCII characters are in a single case, together with a bit mask that 2079 - denotes which characters are actually of a different case. 
The 2080 - branching factor diversity can be handled using a simple linked list for 2081 - a node until the branching factor exceeds some threshold, and 2082 - transitioning to a hash structure after the threshold is exceeded. In 2083 - any case, hash structures used to store tree sections must insure that 2084 - hash functions and procedures preserve the casing conventions of the 2085 - DNS. 2086 - 2087 - The use of separate structures for the different parts of the database 2088 - is motivated by several factors: 2089 - 2090 - - The catalog structure can be an almost static structure that 2091 - need change only when the system administrator changes the 2092 - zones supported by the server. This structure can also be 2093 - used to store parameters used to control refreshing 2094 - activities. 2095 - 2096 - - The individual data structures for zones allow a zone to be 2097 - replaced simply by changing a pointer in the catalog. Zone 2098 - refresh operations can build a new structure and, when 2099 - complete, splice it into the database via a simple pointer 2100 - replacement. It is very important that when a zone is 2101 - refreshed, queries should not use old and new data 2102 - simultaneously. 2103 - 2104 - - With the proper search procedures, authoritative data in zones 2105 - will always "hide", and hence take precedence over, cached 2106 - data. 2107 - 2108 - - Errors in zone definitions that cause overlapping zones, etc., 2109 - may cause erroneous responses to queries, but problem 2110 - determination is simplified, and the contents of one "bad" 2111 - zone can't corrupt another. 2112 - 2113 - - Since the cache is most frequently updated, it is most 2114 - vulnerable to corruption during system restarts. It can also 2115 - become full of expired RR data. In either case, it can easily 2116 - be discarded without disturbing zone data. 
2117 - 2118 - A major aspect of database design is selecting a structure which allows 2119 - the name server to deal with crashes of the name server's host. State 2120 - information which a name server should save across system crashes 2121 - 2122 - 2123 - 2124 - Mockapetris [Page 38] 2125 - 2126 - RFC 1035 Domain Implementation and Specification November 1987 2127 - 2128 - 2129 - includes the catalog structure (including the state of refreshing for 2130 - each zone) and the zone data itself. 2131 - 2132 - 6.1.3. Time 2133 - 2134 - Both the TTL data for RRs and the timing data for refreshing activities 2135 - depends on 32 bit timers in units of seconds. Inside the database, 2136 - refresh timers and TTLs for cached data conceptually "count down", while 2137 - data in the zone stays with constant TTLs. 2138 - 2139 - A recommended implementation strategy is to store time in two ways: as 2140 - a relative increment and as an absolute time. One way to do this is to 2141 - use positive 32 bit numbers for one type and negative numbers for the 2142 - other. The RRs in zones use relative times; the refresh timers and 2143 - cache data use absolute times. Absolute numbers are taken with respect 2144 - to some known origin and converted to relative values when placed in the 2145 - response to a query. When an absolute TTL is negative after conversion 2146 - to relative, then the data is expired and should be ignored. 2147 - 2148 - 6.2. Standard query processing 2149 - 2150 - The major algorithm for standard query processing is presented in 2151 - [RFC-1034]. 2152 - 2153 - When processing queries with QCLASS=*, or some other QCLASS which 2154 - matches multiple classes, the response should never be authoritative 2155 - unless the server can guarantee that the response covers all classes. 
2156 - 2157 - When composing a response, RRs which are to be inserted in the 2158 - additional section, but duplicate RRs in the answer or authority 2159 - sections, may be omitted from the additional section. 2160 - 2161 - When a response is so long that truncation is required, the truncation 2162 - should start at the end of the response and work forward in the 2163 - datagram. Thus if there is any data for the authority section, the 2164 - answer section is guaranteed to be unique. 2165 - 2166 - The MINIMUM value in the SOA should be used to set a floor on the TTL of 2167 - data distributed from a zone. This floor function should be done when 2168 - the data is copied into a response. This will allow future dynamic 2169 - update protocols to change the SOA MINIMUM field without ambiguous 2170 - semantics. 2171 - 2172 - 6.3. Zone refresh and reload processing 2173 - 2174 - In spite of a server's best efforts, it may be unable to load zone data 2175 - from a master file due to syntax errors, etc., or be unable to refresh a 2176 - zone within its expiration parameter. In this case, the name server 2177 - 2178 - 2179 - 2180 - Mockapetris [Page 39] 2181 - 2182 - RFC 1035 Domain Implementation and Specification November 1987 2183 - 2184 - 2185 - should answer queries as if it were not supposed to possess the zone. 2186 - 2187 - If a master is sending a zone out via AXFR, and a new version is created 2188 - during the transfer, the master should continue to send the old version 2189 - if possible. In any case, it should never send part of one version and 2190 - part of another. If completion is not possible, the master should reset 2191 - the connection on which the zone transfer is taking place. 2192 - 2193 - 6.4. Inverse queries (Optional) 2194 - 2195 - Inverse queries are an optional part of the DNS. Name servers are not 2196 - required to support any form of inverse queries. 
If a name server 2197 - receives an inverse query that it does not support, it returns an error 2198 - response with the "Not Implemented" error set in the header. While 2199 - inverse query support is optional, all name servers must be at least 2200 - able to return the error response. 2201 - 2202 - 6.4.1. The contents of inverse queries and responses Inverse 2203 - queries reverse the mappings performed by standard query operations; 2204 - while a standard query maps a domain name to a resource, an inverse 2205 - query maps a resource to a domain name. For example, a standard query 2206 - might bind a domain name to a host address; the corresponding inverse 2207 - query binds the host address to a domain name. 2208 - 2209 - Inverse queries take the form of a single RR in the answer section of 2210 - the message, with an empty question section. The owner name of the 2211 - query RR and its TTL are not significant. The response carries 2212 - questions in the question section which identify all names possessing 2213 - the query RR WHICH THE NAME SERVER KNOWS. Since no name server knows 2214 - about all of the domain name space, the response can never be assumed to 2215 - be complete. Thus inverse queries are primarily useful for database 2216 - management and debugging activities. Inverse queries are NOT an 2217 - acceptable method of mapping host addresses to host names; use the IN- 2218 - ADDR.ARPA domain instead. 2219 - 2220 - Where possible, name servers should provide case-insensitive comparisons 2221 - for inverse queries. Thus an inverse query asking for an MX RR of 2222 - "Venera.isi.edu" should get the same response as a query for 2223 - "VENERA.ISI.EDU"; an inverse query for HINFO RR "IBM-PC UNIX" should 2224 - produce the same result as an inverse query for "IBM-pc unix". However, 2225 - this cannot be guaranteed because name servers may possess RRs that 2226 - contain character strings but the name server does not know that the 2227 - data is character. 
2228 - 2229 - When a name server processes an inverse query, it either returns: 2230 - 2231 - 1. zero, one, or multiple domain names for the specified 2232 - resource as QNAMEs in the question section 2233 - 2234 - 2235 - 2236 - Mockapetris [Page 40] 2237 - 2238 - RFC 1035 Domain Implementation and Specification November 1987 2239 - 2240 - 2241 - 2. an error code indicating that the name server doesn't support 2242 - inverse mapping of the specified resource type. 2243 - 2244 - When the response to an inverse query contains one or more QNAMEs, the 2245 - owner name and TTL of the RR in the answer section which defines the 2246 - inverse query is modified to exactly match an RR found at the first 2247 - QNAME. 2248 - 2249 - RRs returned in the inverse queries cannot be cached using the same 2250 - mechanism as is used for the replies to standard queries. One reason 2251 - for this is that a name might have multiple RRs of the same type, and 2252 - only one would appear. For example, an inverse query for a single 2253 - address of a multiply homed host might create the impression that only 2254 - one address existed. 2255 - 2256 - 6.4.2. Inverse query and response example The overall structure 2257 - of an inverse query for retrieving the domain name that corresponds to 2258 - Internet address 10.1.0.52 is shown below: 2259 - 2260 - +-----------------------------------------+ 2261 - Header | OPCODE=IQUERY, ID=997 | 2262 - +-----------------------------------------+ 2263 - Question | <empty> | 2264 - +-----------------------------------------+ 2265 - Answer | <anyname> A IN 10.1.0.52 | 2266 - +-----------------------------------------+ 2267 - Authority | <empty> | 2268 - +-----------------------------------------+ 2269 - Additional | <empty> | 2270 - +-----------------------------------------+ 2271 - 2272 - This query asks for a question whose answer is the Internet style 2273 - address 10.1.0.52. 
Since the owner name is not known, any domain name 2274 - can be used as a placeholder (and is ignored). A single octet of zero, 2275 - signifying the root, is usually used because it minimizes the length of 2276 - the message. The TTL of the RR is not significant. The response to 2277 - this query might be: 2278 - 2279 - 2280 - 2281 - 2282 - 2283 - 2284 - 2285 - 2286 - 2287 - 2288 - 2289 - 2290 - 2291 - 2292 - Mockapetris [Page 41] 2293 - 2294 - RFC 1035 Domain Implementation and Specification November 1987 2295 - 2296 - 2297 - +-----------------------------------------+ 2298 - Header | OPCODE=RESPONSE, ID=997 | 2299 - +-----------------------------------------+ 2300 - Question |QTYPE=A, QCLASS=IN, QNAME=VENERA.ISI.EDU | 2301 - +-----------------------------------------+ 2302 - Answer | VENERA.ISI.EDU A IN 10.1.0.52 | 2303 - +-----------------------------------------+ 2304 - Authority | <empty> | 2305 - +-----------------------------------------+ 2306 - Additional | <empty> | 2307 - +-----------------------------------------+ 2308 - 2309 - Note that the QTYPE in a response to an inverse query is the same as the 2310 - TYPE field in the answer section of the inverse query. Responses to 2311 - inverse queries may contain multiple questions when the inverse is not 2312 - unique. If the question section in the response is not empty, then the 2313 - RR in the answer section is modified to correspond to be an exact copy 2314 - of an RR at the first QNAME. 2315 - 2316 - 6.4.3. Inverse query processing 2317 - 2318 - Name servers that support inverse queries can support these operations 2319 - through exhaustive searches of their databases, but this becomes 2320 - impractical as the size of the database increases. An alternative 2321 - approach is to invert the database according to the search key. 2322 - 2323 - For name servers that support multiple zones and a large amount of data, 2324 - the recommended approach is separate inversions for each zone. 
When a 2325 - particular zone is changed during a refresh, only its inversions need to 2326 - be redone. 2327 - 2328 - Support for transfer of this type of inversion may be included in future 2329 - versions of the domain system, but is not supported in this version. 2330 - 2331 - 6.5. Completion queries and responses 2332 - 2333 - The optional completion services described in RFC-882 and RFC-883 have 2334 - been deleted. Redesigned services may become available in the future. 2335 - 2336 - 2337 - 2338 - 2339 - 2340 - 2341 - 2342 - 2343 - 2344 - 2345 - 2346 - 2347 - 2348 - Mockapetris [Page 42] 2349 - 2350 - RFC 1035 Domain Implementation and Specification November 1987 2351 - 2352 - 2353 - 7. RESOLVER IMPLEMENTATION 2354 - 2355 - The top levels of the recommended resolver algorithm are discussed in 2356 - [RFC-1034]. This section discusses implementation details assuming the 2357 - database structure suggested in the name server implementation section 2358 - of this memo. 2359 - 2360 - 7.1. Transforming a user request into a query 2361 - 2362 - The first step a resolver takes is to transform the client's request, 2363 - stated in a format suitable to the local OS, into a search specification 2364 - for RRs at a specific name which match a specific QTYPE and QCLASS. 2365 - Where possible, the QTYPE and QCLASS should correspond to a single type 2366 - and a single class, because this makes the use of cached data much 2367 - simpler. The reason for this is that the presence of data of one type 2368 - in a cache doesn't confirm the existence or non-existence of data of 2369 - other types, hence the only way to be sure is to consult an 2370 - authoritative source. If QCLASS=* is used, then authoritative answers 2371 - won't be available. 2372 - 2373 - Since a resolver must be able to multiplex multiple requests if it is to 2374 - perform its function efficiently, each pending request is usually 2375 - represented in some block of state information. 
This state block will 2376 - typically contain: 2377 - 2378 - - A timestamp indicating the time the request began. 2379 - The timestamp is used to decide whether RRs in the database 2380 - can be used or are out of date. This timestamp uses the 2381 - absolute time format previously discussed for RR storage in 2382 - zones and caches. Note that when an RRs TTL indicates a 2383 - relative time, the RR must be timely, since it is part of a 2384 - zone. When the RR has an absolute time, it is part of a 2385 - cache, and the TTL of the RR is compared against the timestamp 2386 - for the start of the request. 2387 - 2388 - Note that using the timestamp is superior to using a current 2389 - time, since it allows RRs with TTLs of zero to be entered in 2390 - the cache in the usual manner, but still used by the current 2391 - request, even after intervals of many seconds due to system 2392 - load, query retransmission timeouts, etc. 2393 - 2394 - - Some sort of parameters to limit the amount of work which will 2395 - be performed for this request. 2396 - 2397 - The amount of work which a resolver will do in response to a 2398 - client request must be limited to guard against errors in the 2399 - database, such as circular CNAME references, and operational 2400 - problems, such as network partition which prevents the 2401 - 2402 - 2403 - 2404 - Mockapetris [Page 43] 2405 - 2406 - RFC 1035 Domain Implementation and Specification November 1987 2407 - 2408 - 2409 - resolver from accessing the name servers it needs. While 2410 - local limits on the number of times a resolver will retransmit 2411 - a particular query to a particular name server address are 2412 - essential, the resolver should have a global per-request 2413 - counter to limit work on a single request. The counter should 2414 - be set to some initial value and decremented whenever the 2415 - resolver performs any action (retransmission timeout, 2416 - retransmission, etc.) 
If the counter passes zero, the request 2417 - is terminated with a temporary error. 2418 - 2419 - Note that if the resolver structure allows one request to 2420 - start others in parallel, such as when the need to access a 2421 - name server for one request causes a parallel resolve for the 2422 - name server's addresses, the spawned request should be started 2423 - with a lower counter. This prevents circular references in 2424 - the database from starting a chain reaction of resolver 2425 - activity. 2426 - 2427 - - The SLIST data structure discussed in [RFC-1034]. 2428 - 2429 - This structure keeps track of the state of a request if it 2430 - must wait for answers from foreign name servers. 2431 - 2432 - 7.2. Sending the queries 2433 - 2434 - As described in [RFC-1034], the basic task of the resolver is to 2435 - formulate a query which will answer the client's request and direct that 2436 - query to name servers which can provide the information. The resolver 2437 - will usually only have very strong hints about which servers to ask, in 2438 - the form of NS RRs, and may have to revise the query, in response to 2439 - CNAMEs, or revise the set of name servers the resolver is asking, in 2440 - response to delegation responses which point the resolver to name 2441 - servers closer to the desired information. In addition to the 2442 - information requested by the client, the resolver may have to call upon 2443 - its own services to determine the address of name servers it wishes to 2444 - contact. 2445 - 2446 - In any case, the model used in this memo assumes that the resolver is 2447 - multiplexing attention between multiple requests, some from the client, 2448 - and some internally generated. 
Each request is represented by some 2449 - state information, and the desired behavior is that the resolver 2450 - transmit queries to name servers in a way that maximizes the probability 2451 - that the request is answered, minimizes the time that the request takes, 2452 - and avoids excessive transmissions. The key algorithm uses the state 2453 - information of the request to select the next name server address to 2454 - query, and also computes a timeout which will cause the next action 2455 - should a response not arrive. The next action will usually be a 2456 - transmission to some other server, but may be a temporary error to the 2457 - 2458 - 2459 - 2460 - Mockapetris [Page 44] 2461 - 2462 - RFC 1035 Domain Implementation and Specification November 1987 2463 - 2464 - 2465 - client. 2466 - 2467 - The resolver always starts with a list of server names to query (SLIST). 2468 - This list will be all NS RRs which correspond to the nearest ancestor 2469 - zone that the resolver knows about. To avoid startup problems, the 2470 - resolver should have a set of default servers which it will ask should 2471 - it have no current NS RRs which are appropriate. The resolver then adds 2472 - to SLIST all of the known addresses for the name servers, and may start 2473 - parallel requests to acquire the addresses of the servers when the 2474 - resolver has the name, but no addresses, for the name servers. 2475 - 2476 - To complete initialization of SLIST, the resolver attaches whatever 2477 - history information it has to each address in SLIST. This will 2478 - usually consist of some sort of weighted averages for the response time 2479 - of the address, and the batting average of the address (i.e., how often 2480 - the address responded at all to the request). 
Note that this 2481 - information should be kept on a per address basis, rather than on a per 2482 - name server basis, because the response time and batting average of a 2483 - particular server may vary considerably from address to address. Note 2484 - also that this information is actually specific to a resolver address / 2485 - server address pair, so a resolver with multiple addresses may wish to 2486 - keep separate histories for each of its addresses. Part of this step 2487 - must deal with addresses which have no such history; in this case an 2488 - expected round trip time of 5-10 seconds should be the worst case, with 2489 - lower estimates for the same local network, etc. 2490 - 2491 - Note that whenever a delegation is followed, the resolver algorithm 2492 - reinitializes SLIST. 2493 - 2494 - The information establishes a partial ranking of the available name 2495 - server addresses. Each time an address is chosen and the state should 2496 - be altered to prevent its selection again until all other addresses have 2497 - been tried. The timeout for each transmission should be 50-100% greater 2498 - than the average predicted value to allow for variance in response. 2499 - 2500 - Some fine points: 2501 - 2502 - - The resolver may encounter a situation where no addresses are 2503 - available for any of the name servers named in SLIST, and 2504 - where the servers in the list are precisely those which would 2505 - normally be used to look up their own addresses. This 2506 - situation typically occurs when the glue address RRs have a 2507 - smaller TTL than the NS RRs marking delegation, or when the 2508 - resolver caches the result of a NS search. The resolver 2509 - should detect this condition and restart the search at the 2510 - next ancestor zone, or alternatively at the root. 
2511 - 2512 - 2513 - 2514 - 2515 - 2516 - Mockapetris [Page 45] 2517 - 2518 - RFC 1035 Domain Implementation and Specification November 1987 2519 - 2520 - 2521 - - If a resolver gets a server error or other bizarre response 2522 - from a name server, it should remove it from SLIST, and may 2523 - wish to schedule an immediate transmission to the next 2524 - candidate server address. 2525 - 2526 - 7.3. Processing responses 2527 - 2528 - The first step in processing arriving response datagrams is to parse the 2529 - response. This procedure should include: 2530 - 2531 - - Check the header for reasonableness. Discard datagrams which 2532 - are queries when responses are expected. 2533 - 2534 - - Parse the sections of the message, and insure that all RRs are 2535 - correctly formatted. 2536 - 2537 - - As an optional step, check the TTLs of arriving data looking 2538 - for RRs with excessively long TTLs. If a RR has an 2539 - excessively long TTL, say greater than 1 week, either discard 2540 - the whole response, or limit all TTLs in the response to 1 2541 - week. 2542 - 2543 - The next step is to match the response to a current resolver request. 2544 - The recommended strategy is to do a preliminary matching using the ID 2545 - field in the domain header, and then to verify that the question section 2546 - corresponds to the information currently desired. This requires that 2547 - the transmission algorithm devote several bits of the domain ID field to 2548 - a request identifier of some sort. This step has several fine points: 2549 - 2550 - - Some name servers send their responses from different 2551 - addresses than the one used to receive the query. That is, a 2552 - resolver cannot rely that a response will come from the same 2553 - address which it sent the corresponding query to. This name 2554 - server bug is typically encountered in UNIX systems. 
2555 - 2556 - - If the resolver retransmits a particular request to a name 2557 - server it should be able to use a response from any of the 2558 - transmissions. However, if it is using the response to sample 2559 - the round trip time to access the name server, it must be able 2560 - to determine which transmission matches the response (and keep 2561 - transmission times for each outgoing message), or only 2562 - calculate round trip times based on initial transmissions. 2563 - 2564 - - A name server will occasionally not have a current copy of a 2565 - zone which it should have according to some NS RRs. The 2566 - resolver should simply remove the name server from the current 2567 - SLIST, and continue. 2568 - 2569 - 2570 - 2571 - 2572 - Mockapetris [Page 46] 2573 - 2574 - RFC 1035 Domain Implementation and Specification November 1987 2575 - 2576 - 2577 - 7.4. Using the cache 2578 - 2579 - In general, we expect a resolver to cache all data which it receives in 2580 - responses since it may be useful in answering future client requests. 2581 - However, there are several types of data which should not be cached: 2582 - 2583 - - When several RRs of the same type are available for a 2584 - particular owner name, the resolver should either cache them 2585 - all or none at all. When a response is truncated, and a 2586 - resolver doesn't know whether it has a complete set, it should 2587 - not cache a possibly partial set of RRs. 2588 - 2589 - - Cached data should never be used in preference to 2590 - authoritative data, so if caching would cause this to happen 2591 - the data should not be cached. 2592 - 2593 - - The results of an inverse query should not be cached. 2594 - 2595 - - The results of standard queries where the QNAME contains "*" 2596 - labels if the data might be used to construct wildcards. 
The 2597 - reason is that the cache does not necessarily contain existing 2598 - RRs or zone boundary information which is necessary to 2599 - restrict the application of the wildcard RRs. 2600 - 2601 - - RR data in responses of dubious reliability. When a resolver 2602 - receives unsolicited responses or RR data other than that 2603 - requested, it should discard it without caching it. The basic 2604 - implication is that all sanity checks on a packet should be 2605 - performed before any of it is cached. 2606 - 2607 - In a similar vein, when a resolver has a set of RRs for some name in a 2608 - response, and wants to cache the RRs, it should check its cache for 2609 - already existing RRs. Depending on the circumstances, either the data 2610 - in the response or the cache is preferred, but the two should never be 2611 - combined. If the data in the response is from authoritative data in the 2612 - answer section, it is always preferred. 2613 - 2614 - 8. MAIL SUPPORT 2615 - 2616 - The domain system defines a standard for mapping mailboxes into domain 2617 - names, and two methods for using the mailbox information to derive mail 2618 - routing information. The first method is called mail exchange binding 2619 - and the other method is mailbox binding. The mailbox encoding standard 2620 - and mail exchange binding are part of the DNS official protocol, and are 2621 - the recommended method for mail routing in the Internet. Mailbox 2622 - binding is an experimental feature which is still under development and 2623 - subject to change. 2624 - 2625 - 2626 - 2627 - 2628 - Mockapetris [Page 47] 2629 - 2630 - RFC 1035 Domain Implementation and Specification November 1987 2631 - 2632 - 2633 - The mailbox encoding standard assumes a mailbox name of the form 2634 - "<local-part>@<mail-domain>". 
While the syntax allowed in each of these 2635 - sections varies substantially between the various mail internets, the 2636 - preferred syntax for the ARPA Internet is given in [RFC-822]. 2637 - 2638 - The DNS encodes the <local-part> as a single label, and encodes the 2639 - <mail-domain> as a domain name. The single label from the <local-part> 2640 - is prefaced to the domain name from <mail-domain> to form the domain 2641 - name corresponding to the mailbox. Thus the mailbox HOSTMASTER@SRI- 2642 - NIC.ARPA is mapped into the domain name HOSTMASTER.SRI-NIC.ARPA. If the 2643 - <local-part> contains dots or other special characters, its 2644 - representation in a master file will require the use of backslash 2645 - quoting to ensure that the domain name is properly encoded. For 2646 - example, the mailbox Action.domains@ISI.EDU would be represented as 2647 - Action\.domains.ISI.EDU. 2648 - 2649 - 8.1. Mail exchange binding 2650 - 2651 - Mail exchange binding uses the <mail-domain> part of a mailbox 2652 - specification to determine where mail should be sent. The <local-part> 2653 - is not even consulted. [RFC-974] specifies this method in detail, and 2654 - should be consulted before attempting to use mail exchange support. 2655 - 2656 - One of the advantages of this method is that it decouples mail 2657 - destination naming from the hosts used to support mail service, at the 2658 - cost of another layer of indirection in the lookup function. However, 2659 - the addition layer should eliminate the need for complicated "%", "!", 2660 - etc encodings in <local-part>. 2661 - 2662 - The essence of the method is that the <mail-domain> is used as a domain 2663 - name to locate type MX RRs which list hosts willing to accept mail for 2664 - <mail-domain>, together with preference values which rank the hosts 2665 - according to an order specified by the administrators for <mail-domain>. 
2666 - 2667 - In this memo, the <mail-domain> ISI.EDU is used in examples, together 2668 - with the hosts VENERA.ISI.EDU and VAXA.ISI.EDU as mail exchanges for 2669 - ISI.EDU. If a mailer had a message for Mockapetris@ISI.EDU, it would 2670 - route it by looking up MX RRs for ISI.EDU. The MX RRs at ISI.EDU name 2671 - VENERA.ISI.EDU and VAXA.ISI.EDU, and type A queries can find the host 2672 - addresses. 2673 - 2674 - 8.2. Mailbox binding (Experimental) 2675 - 2676 - In mailbox binding, the mailer uses the entire mail destination 2677 - specification to construct a domain name. The encoded domain name for 2678 - the mailbox is used as the QNAME field in a QTYPE=MAILB query. 2679 - 2680 - Several outcomes are possible for this query: 2681 - 2682 - 2683 - 2684 - Mockapetris [Page 48] 2685 - 2686 - RFC 1035 Domain Implementation and Specification November 1987 2687 - 2688 - 2689 - 1. The query can return a name error indicating that the mailbox 2690 - does not exist as a domain name. 2691 - 2692 - In the long term, this would indicate that the specified 2693 - mailbox doesn't exist. However, until the use of mailbox 2694 - binding is universal, this error condition should be 2695 - interpreted to mean that the organization identified by the 2696 - global part does not support mailbox binding. The 2697 - appropriate procedure is to revert to exchange binding at 2698 - this point. 2699 - 2700 - 2. The query can return a Mail Rename (MR) RR. 2701 - 2702 - The MR RR carries a new mailbox specification in its RDATA 2703 - field. The mailer should replace the old mailbox with the 2704 - new one and retry the operation. 2705 - 2706 - 3. The query can return a MB RR. 2707 - 2708 - The MB RR carries a domain name for a host in its RDATA 2709 - field. The mailer should deliver the message to that host 2710 - via whatever protocol is applicable, e.g., SMTP. 2711 - 2712 - 4. The query can return one or more Mail Group (MG) RRs. 
2713 - 2714 - This condition means that the mailbox was actually a mailing 2715 - list or mail group, rather than a single mailbox. Each MG RR 2716 - has a RDATA field that identifies a mailbox that is a member 2717 - of the group. The mailer should deliver a copy of the 2718 - message to each member. 2719 - 2720 - 5. The query can return a MB RR as well as one or more MG RRs. 2721 - 2722 - This condition means that the mailbox was actually a mailing 2723 - list. The mailer can either deliver the message to the host 2724 - specified by the MB RR, which will in turn do the delivery to 2725 - all members, or the mailer can use the MG RRs to do the 2726 - expansion itself. 2727 - 2728 - In any of these cases, the response may include a Mail Information 2729 - (MINFO) RR. This RR is usually associated with a mail group, but is 2730 - legal with a MB. The MINFO RR identifies two mailboxes. One of these 2731 - identifies a responsible person for the original mailbox name. This 2732 - mailbox should be used for requests to be added to a mail group, etc. 2733 - The second mailbox name in the MINFO RR identifies a mailbox that should 2734 - receive error messages for mail failures. This is particularly 2735 - appropriate for mailing lists when errors in member names should be 2736 - reported to a person other than the one who sends a message to the list. 2737 - 2738 - 2739 - 2740 - Mockapetris [Page 49] 2741 - 2742 - RFC 1035 Domain Implementation and Specification November 1987 2743 - 2744 - 2745 - New fields may be added to this RR in the future. 2746 - 2747 - 2748 - 9. REFERENCES and BIBLIOGRAPHY 2749 - 2750 - [Dyer 87] S. Dyer, F. Hsu, "Hesiod", Project Athena 2751 - Technical Plan - Name Service, April 1987, version 1.9. 2752 - 2753 - Describes the fundamentals of the Hesiod name service. 2754 - 2755 - [IEN-116] J. Postel, "Internet Name Server", IEN-116, 2756 - USC/Information Sciences Institute, August 1979. 
2757 - 2758 - A name service obsoleted by the Domain Name System, but 2759 - still in use. 2760 - 2761 - [Quarterman 86] J. Quarterman, and J. Hoskins, "Notable Computer Networks", 2762 - Communications of the ACM, October 1986, volume 29, number 2763 - 10. 2764 - 2765 - [RFC-742] K. Harrenstien, "NAME/FINGER", RFC-742, Network 2766 - Information Center, SRI International, December 1977. 2767 - 2768 - [RFC-768] J. Postel, "User Datagram Protocol", RFC-768, 2769 - USC/Information Sciences Institute, August 1980. 2770 - 2771 - [RFC-793] J. Postel, "Transmission Control Protocol", RFC-793, 2772 - USC/Information Sciences Institute, September 1981. 2773 - 2774 - [RFC-799] D. Mills, "Internet Name Domains", RFC-799, COMSAT, 2775 - September 1981. 2776 - 2777 - Suggests introduction of a hierarchy in place of a flat 2778 - name space for the Internet. 2779 - 2780 - [RFC-805] J. Postel, "Computer Mail Meeting Notes", RFC-805, 2781 - USC/Information Sciences Institute, February 1982. 2782 - 2783 - [RFC-810] E. Feinler, K. Harrenstien, Z. Su, and V. White, "DOD 2784 - Internet Host Table Specification", RFC-810, Network 2785 - Information Center, SRI International, March 1982. 2786 - 2787 - Obsolete. See RFC-952. 2788 - 2789 - [RFC-811] K. Harrenstien, V. White, and E. Feinler, "Hostnames 2790 - Server", RFC-811, Network Information Center, SRI 2791 - International, March 1982. 2792 - 2793 - 2794 - 2795 - 2796 - Mockapetris [Page 50] 2797 - 2798 - RFC 1035 Domain Implementation and Specification November 1987 2799 - 2800 - 2801 - Obsolete. See RFC-953. 2802 - 2803 - [RFC-812] K. Harrenstien, and V. White, "NICNAME/WHOIS", RFC-812, 2804 - Network Information Center, SRI International, March 2805 - 1982. 2806 - 2807 - [RFC-819] Z. Su, and J. Postel, "The Domain Naming Convention for 2808 - Internet User Applications", RFC-819, Network 2809 - Information Center, SRI International, August 1982. 2810 - 2811 - Early thoughts on the design of the domain system. 
2812 - Current implementation is completely different. 2813 - 2814 - [RFC-821] J. Postel, "Simple Mail Transfer Protocol", RFC-821, 2815 - USC/Information Sciences Institute, August 1980. 2816 - 2817 - [RFC-830] Z. Su, "A Distributed System for Internet Name Service", 2818 - RFC-830, Network Information Center, SRI International, 2819 - October 1982. 2820 - 2821 - Early thoughts on the design of the domain system. 2822 - Current implementation is completely different. 2823 - 2824 - [RFC-882] P. Mockapetris, "Domain names - Concepts and 2825 - Facilities," RFC-882, USC/Information Sciences 2826 - Institute, November 1983. 2827 - 2828 - Superseded by this memo. 2829 - 2830 - [RFC-883] P. Mockapetris, "Domain names - Implementation and 2831 - Specification," RFC-883, USC/Information Sciences 2832 - Institute, November 1983. 2833 - 2834 - Superseded by this memo. 2835 - 2836 - [RFC-920] J. Postel and J. Reynolds, "Domain Requirements", 2837 - RFC-920, USC/Information Sciences Institute, 2838 - October 1984. 2839 - 2840 - Explains the naming scheme for top level domains. 2841 - 2842 - [RFC-952] K. Harrenstien, M. Stahl, E. Feinler, "DoD Internet Host 2843 - Table Specification", RFC-952, SRI, October 1985. 2844 - 2845 - Specifies the format of HOSTS.TXT, the host/address 2846 - table replaced by the DNS. 2847 - 2848 - 2849 - 2850 - 2851 - 2852 - Mockapetris [Page 51] 2853 - 2854 - RFC 1035 Domain Implementation and Specification November 1987 2855 - 2856 - 2857 - [RFC-953] K. Harrenstien, M. Stahl, E. Feinler, "HOSTNAME Server", 2858 - RFC-953, SRI, October 1985. 2859 - 2860 - This RFC contains the official specification of the 2861 - hostname server protocol, which is obsoleted by the DNS. 2862 - This TCP based protocol accesses information stored in 2863 - the RFC-952 format, and is used to obtain copies of the 2864 - host table. 2865 - 2866 - [RFC-973] P. 
Mockapetris, "Domain System Changes and 2867 - Observations", RFC-973, USC/Information Sciences 2868 - Institute, January 1986. 2869 - 2870 - Describes changes to RFC-882 and RFC-883 and reasons for 2871 - them. 2872 - 2873 - [RFC-974] C. Partridge, "Mail routing and the domain system", 2874 - RFC-974, CSNET CIC BBN Labs, January 1986. 2875 - 2876 - Describes the transition from HOSTS.TXT based mail 2877 - addressing to the more powerful MX system used with the 2878 - domain system. 2879 - 2880 - [RFC-1001] NetBIOS Working Group, "Protocol standard for a NetBIOS 2881 - service on a TCP/UDP transport: Concepts and Methods", 2882 - RFC-1001, March 1987. 2883 - 2884 - This RFC and RFC-1002 are a preliminary design for 2885 - NETBIOS on top of TCP/IP which proposes to base NetBIOS 2886 - name service on top of the DNS. 2887 - 2888 - [RFC-1002] NetBIOS Working Group, "Protocol standard for a NetBIOS 2889 - service on a TCP/UDP transport: Detailed 2890 - Specifications", RFC-1002, March 1987. 2891 - 2892 - [RFC-1010] J. Reynolds, and J. Postel, "Assigned Numbers", RFC-1010, 2893 - USC/Information Sciences Institute, May 1987. 2894 - 2895 - Contains socket numbers and mnemonics for host names, 2896 - operating systems, etc. 2897 - 2898 - [RFC-1031] W. Lazear, "MILNET Name Domain Transition", RFC-1031, 2899 - November 1987. 2900 - 2901 - Describes a plan for converting the MILNET to the DNS. 2902 - 2903 - [RFC-1032] M. Stahl, "Establishing a Domain - Guidelines for 2904 - Administrators", RFC-1032, November 1987. 2905 - 2906 - 2907 - 2908 - Mockapetris [Page 52] 2909 - 2910 - RFC 1035 Domain Implementation and Specification November 1987 2911 - 2912 - 2913 - Describes the registration policies used by the NIC to 2914 - administer the top level domains and delegate subzones. 2915 - 2916 - [RFC-1033] M. Lottor, "Domain Administrators Operations Guide", 2917 - RFC-1033, November 1987. 2918 - 2919 - A cookbook for domain administrators. 2920 - 2921 - [Solomon 82] M. 
Solomon, L. Landweber, and D. Neuhengen, "The CSNET 2922 - Name Server", Computer Networks, vol 6, nr 3, July 1982. 2923 - 2924 - Describes a name service for CSNET which is independent 2925 - from the DNS and DNS use in the CSNET. 2926 - 2927 - 2928 - 2929 - 2930 - 2931 - 2932 - 2933 - 2934 - 2935 - 2936 - 2937 - 2938 - 2939 - 2940 - 2941 - 2942 - 2943 - 2944 - 2945 - 2946 - 2947 - 2948 - 2949 - 2950 - 2951 - 2952 - 2953 - 2954 - 2955 - 2956 - 2957 - 2958 - 2959 - 2960 - 2961 - 2962 - 2963 - 2964 - Mockapetris [Page 53] 2965 - 2966 - RFC 1035 Domain Implementation and Specification November 1987 2967 - 2968 - 2969 - Index 2970 - 2971 - * 13 2972 - 2973 - ; 33, 35 2974 - 2975 - <character-string> 35 2976 - <domain-name> 34 2977 - 2978 - @ 35 2979 - 2980 - \ 35 2981 - 2982 - A 12 2983 - 2984 - Byte order 8 2985 - 2986 - CH 13 2987 - Character case 9 2988 - CLASS 11 2989 - CNAME 12 2990 - Completion 42 2991 - CS 13 2992 - 2993 - Hesiod 13 2994 - HINFO 12 2995 - HS 13 2996 - 2997 - IN 13 2998 - IN-ADDR.ARPA domain 22 2999 - Inverse queries 40 3000 - 3001 - Mailbox names 47 3002 - MB 12 3003 - MD 12 3004 - MF 12 3005 - MG 12 3006 - MINFO 12 3007 - MINIMUM 20 3008 - MR 12 3009 - MX 12 3010 - 3011 - NS 12 3012 - NULL 12 3013 - 3014 - Port numbers 32 3015 - Primary server 5 3016 - PTR 12, 18 3017 - 3018 - 3019 - 3020 - Mockapetris [Page 54] 3021 - 3022 - RFC 1035 Domain Implementation and Specification November 1987 3023 - 3024 - 3025 - QCLASS 13 3026 - QTYPE 12 3027 - 3028 - RDATA 12 3029 - RDLENGTH 11 3030 - 3031 - Secondary server 5 3032 - SOA 12 3033 - Stub resolvers 7 3034 - 3035 - TCP 32 3036 - TXT 12 3037 - TYPE 11 3038 - 3039 - UDP 32 3040 - 3041 - WKS 12 3042 - 3043 - 3044 - 3045 - 3046 - 3047 - 3048 - 3049 - 3050 - 3051 - 3052 - 3053 - 3054 - 3055 - 3056 - 3057 - 3058 - 3059 - 3060 - 3061 - 3062 - 3063 - 3064 - 3065 - 3066 - 3067 - 3068 - 3069 - 3070 - 3071 - 3072 - 3073 - 3074 - 3075 - 3076 - Mockapetris [Page 55] 3077 -
-1963
ocaml-punycode/spec/rfc3492.txt
··· 1 - 2 - 3 - 4 - 5 - 6 - 7 - Network Working Group A. Costello 8 - Request for Comments: 3492 Univ. of California, Berkeley 9 - Category: Standards Track March 2003 10 - 11 - 12 - Punycode: A Bootstring encoding of Unicode 13 - for Internationalized Domain Names in Applications (IDNA) 14 - 15 - Status of this Memo 16 - 17 - This document specifies an Internet standards track protocol for the 18 - Internet community, and requests discussion and suggestions for 19 - improvements. Please refer to the current edition of the "Internet 20 - Official Protocol Standards" (STD 1) for the standardization state 21 - and status of this protocol. Distribution of this memo is unlimited. 22 - 23 - Copyright Notice 24 - 25 - Copyright (C) The Internet Society (2003). All Rights Reserved. 26 - 27 - Abstract 28 - 29 - Punycode is a simple and efficient transfer encoding syntax designed 30 - for use with Internationalized Domain Names in Applications (IDNA). 31 - It uniquely and reversibly transforms a Unicode string into an ASCII 32 - string. ASCII characters in the Unicode string are represented 33 - literally, and non-ASCII characters are represented by ASCII 34 - characters that are allowed in host name labels (letters, digits, and 35 - hyphens). This document defines a general algorithm called 36 - Bootstring that allows a string of basic code points to uniquely 37 - represent any string of code points drawn from a larger set. 38 - Punycode is an instance of Bootstring that uses particular parameter 39 - values specified by this document, appropriate for IDNA. 40 - 41 - Table of Contents 42 - 43 - 1. Introduction...............................................2 44 - 1.1 Features..............................................2 45 - 1.2 Interaction of protocol parts.........................3 46 - 2. Terminology................................................3 47 - 3. 
Bootstring description.....................................4 48 - 3.1 Basic code point segregation..........................4 49 - 3.2 Insertion unsort coding...............................4 50 - 3.3 Generalized variable-length integers..................5 51 - 3.4 Bias adaptation.......................................7 52 - 4. Bootstring parameters......................................8 53 - 5. Parameter values for Punycode..............................8 54 - 6. Bootstring algorithms......................................9 55 - 56 - 57 - 58 - Costello Standards Track [Page 1] 59 - 60 - RFC 3492 IDNA Punycode March 2003 61 - 62 - 63 - 6.1 Bias adaptation function.............................10 64 - 6.2 Decoding procedure...................................11 65 - 6.3 Encoding procedure...................................12 66 - 6.4 Overflow handling....................................13 67 - 7. Punycode examples.........................................14 68 - 7.1 Sample strings.......................................14 69 - 7.2 Decoding traces......................................17 70 - 7.3 Encoding traces......................................19 71 - 8. Security Considerations...................................20 72 - 9. References................................................21 73 - 9.1 Normative References.................................21 74 - 9.2 Informative References...............................21 75 - A. Mixed-case annotation.....................................22 76 - B. Disclaimer and license....................................22 77 - C. Punycode sample implementation............................23 78 - Author's Address.............................................34 79 - Full Copyright Statement.....................................35 80 - 81 - 1. Introduction 82 - 83 - [IDNA] describes an architecture for supporting internationalized 84 - domain names. 
Labels containing non-ASCII characters can be 85 - represented by ACE labels, which begin with a special ACE prefix and 86 - contain only ASCII characters. The remainder of the label after the 87 - prefix is a Punycode encoding of a Unicode string satisfying certain 88 - constraints. For the details of the prefix and constraints, see 89 - [IDNA] and [NAMEPREP]. 90 - 91 - Punycode is an instance of a more general algorithm called 92 - Bootstring, which allows strings composed from a small set of "basic" 93 - code points to uniquely represent any string of code points drawn 94 - from a larger set. Punycode is Bootstring with particular parameter 95 - values appropriate for IDNA. 96 - 97 - 1.1 Features 98 - 99 - Bootstring has been designed to have the following features: 100 - 101 - * Completeness: Every extended string (sequence of arbitrary code 102 - points) can be represented by a basic string (sequence of basic 103 - code points). Restrictions on what strings are allowed, and on 104 - length, can be imposed by higher layers. 105 - 106 - * Uniqueness: There is at most one basic string that represents a 107 - given extended string. 108 - 109 - * Reversibility: Any extended string mapped to a basic string can 110 - be recovered from that basic string. 111 - 112 - 113 - 114 - Costello Standards Track [Page 2] 115 - 116 - RFC 3492 IDNA Punycode March 2003 117 - 118 - 119 - * Efficient encoding: The ratio of basic string length to extended 120 - string length is small. This is important in the context of 121 - domain names because RFC 1034 [RFC1034] restricts the length of a 122 - domain label to 63 characters. 123 - 124 - * Simplicity: The encoding and decoding algorithms are reasonably 125 - simple to implement. The goals of efficiency and simplicity are 126 - at odds; Bootstring aims at a good balance between them. 
127 - 128 - * Readability: Basic code points appearing in the extended string 129 - are represented as themselves in the basic string (although the 130 - main purpose is to improve efficiency, not readability). 131 - 132 - Punycode can also support an additional feature that is not used by 133 - the ToASCII and ToUnicode operations of [IDNA]. When extended 134 - strings are case-folded prior to encoding, the basic string can use 135 - mixed case to tell how to convert the folded string into a mixed-case 136 - string. See appendix A "Mixed-case annotation". 137 - 138 - 1.2 Interaction of protocol parts 139 - 140 - Punycode is used by the IDNA protocol [IDNA] for converting domain 141 - labels into ASCII; it is not designed for any other purpose. It is 142 - explicitly not designed for processing arbitrary free text. 143 - 144 - 2. Terminology 145 - 146 - The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", 147 - "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this 148 - document are to be interpreted as described in BCP 14, RFC 2119 149 - [RFC2119]. 150 - 151 - A code point is an integral value associated with a character in a 152 - coded character set. 153 - 154 - As in the Unicode Standard [UNICODE], Unicode code points are denoted 155 - by "U+" followed by four to six hexadecimal digits, while a range of 156 - code points is denoted by two hexadecimal numbers separated by "..", 157 - with no prefixes. 158 - 159 - The operators div and mod perform integer division; (x div y) is the 160 - quotient of x divided by y, discarding the remainder, and (x mod y) 161 - is the remainder, so (x div y) * y + (x mod y) == x. Bootstring uses 162 - these operators only with nonnegative operands, so the quotient and 163 - remainder are always nonnegative. 164 - 165 - The break statement jumps out of the innermost loop (as in C). 
166 - 167 - 168 - 169 - 170 - Costello Standards Track [Page 3] 171 - 172 - RFC 3492 IDNA Punycode March 2003 173 - 174 - 175 - An overflow is an attempt to compute a value that exceeds the maximum 176 - value of an integer variable. 177 - 178 - 3. Bootstring description 179 - 180 - Bootstring represents an arbitrary sequence of code points (the 181 - "extended string") as a sequence of basic code points (the "basic 182 - string"). This section describes the representation. Section 6 183 - "Bootstring algorithms" presents the algorithms as pseudocode. 184 - Sections 7.2 "Decoding traces" and 7.3 "Encoding traces" trace the 185 - algorithms for sample inputs. 186 - 187 - The following sections describe the four techniques used in 188 - Bootstring. "Basic code point segregation" is a very simple and 189 - efficient encoding for basic code points occurring in the extended 190 - string: they are simply copied all at once. "Insertion unsort 191 - coding" encodes the non-basic code points as deltas, and processes 192 - the code points in numerical order rather than in order of 193 - appearance, which typically results in smaller deltas. The deltas 194 - are represented as "generalized variable-length integers", which use 195 - basic code points to represent nonnegative integers. The parameters 196 - of this integer representation are dynamically adjusted using "bias 197 - adaptation", to improve efficiency when consecutive deltas have 198 - similar magnitudes. 199 - 200 - 3.1 Basic code point segregation 201 - 202 - All basic code points appearing in the extended string are 203 - represented literally at the beginning of the basic string, in their 204 - original order, followed by a delimiter if (and only if) the number 205 - of basic code points is nonzero. The delimiter is a particular basic 206 - code point, which never appears in the remainder of the basic string. 
207 - The decoder can therefore find the end of the literal portion (if 208 - there is one) by scanning for the last delimiter. 209 - 210 - 3.2 Insertion unsort coding 211 - 212 - The remainder of the basic string (after the last delimiter if there 213 - is one) represents a sequence of nonnegative integral deltas as 214 - generalized variable-length integers, described in section 3.3. The 215 - meaning of the deltas is best understood in terms of the decoder. 216 - 217 - The decoder builds the extended string incrementally. Initially, the 218 - extended string is a copy of the literal portion of the basic string 219 - (excluding the last delimiter). The decoder inserts non-basic code 220 - points, one for each delta, into the extended string, ultimately 221 - arriving at the final decoded string. 222 - 223 - 224 - 225 - 226 - Costello Standards Track [Page 4] 227 - 228 - RFC 3492 IDNA Punycode March 2003 229 - 230 - 231 - At the heart of this process is a state machine with two state 232 - variables: an index i and a counter n. The index i refers to a 233 - position in the extended string; it ranges from 0 (the first 234 - position) to the current length of the extended string (which refers 235 - to a potential position beyond the current end). If the current 236 - state is <n,i>, the next state is <n,i+1> if i is less than the 237 - length of the extended string, or <n+1,0> if i equals the length of 238 - the extended string. In other words, each state change causes i to 239 - increment, wrapping around to zero if necessary, and n counts the 240 - number of wrap-arounds. 241 - 242 - Notice that the state always advances monotonically (there is no way 243 - for the decoder to return to an earlier state). At each state, an 244 - insertion is either performed or not performed. At most one 245 - insertion is performed in a given state. An insertion inserts the 246 - value of n at position i in the extended string. 
The deltas are a 247 - run-length encoding of this sequence of events: they are the lengths 248 - of the runs of non-insertion states preceding the insertion states. 249 - Hence, for each delta, the decoder performs delta state changes, then 250 - an insertion, and then one more state change. (An implementation 251 - need not perform each state change individually, but can instead use 252 - division and remainder calculations to compute the next insertion 253 - state directly.) It is an error if the inserted code point is a 254 - basic code point (because basic code points were supposed to be 255 - segregated as described in section 3.1). 256 - 257 - The encoder's main task is to derive the sequence of deltas that will 258 - cause the decoder to construct the desired string. It can do this by 259 - repeatedly scanning the extended string for the next code point that 260 - the decoder would need to insert, and counting the number of state 261 - changes the decoder would need to perform, mindful of the fact that 262 - the decoder's extended string will include only those code points 263 - that have already been inserted. Section 6.3 "Encoding procedure" 264 - gives a precise algorithm. 265 - 266 - 3.3 Generalized variable-length integers 267 - 268 - In a conventional integer representation the base is the number of 269 - distinct symbols for digits, whose values are 0 through base-1. Let 270 - digit_0 denote the least significant digit, digit_1 the next least 271 - significant, and so on. The value represented is the sum over j of 272 - digit_j * w(j), where w(j) = base^j is the weight (scale factor) for 273 - position j. For example, in the base 8 integer 437, the digits are 274 - 7, 3, and 4, and the weights are 1, 8, and 64, so the value is 7 + 275 - 3*8 + 4*64 = 287. 
This representation has two disadvantages: First, 276 - there are multiple encodings of each value (because there can be 277 - extra zeros in the most significant positions), which is inconvenient 278 - 279 - 280 - 281 - 282 - Costello Standards Track [Page 5] 283 - 284 - RFC 3492 IDNA Punycode March 2003 285 - 286 - 287 - when unique encodings are needed. Second, the integer is not self- 288 - delimiting, so if multiple integers are concatenated the boundaries 289 - between them are lost. 290 - 291 - The generalized variable-length representation solves these two 292 - problems. The digit values are still 0 through base-1, but now the 293 - integer is self-delimiting by means of thresholds t(j), each of which 294 - is in the range 0 through base-1. Exactly one digit, the most 295 - significant, satisfies digit_j < t(j). Therefore, if several 296 - integers are concatenated, it is easy to separate them, starting with 297 - the first if they are little-endian (least significant digit first), 298 - or starting with the last if they are big-endian (most significant 299 - digit first). As before, the value is the sum over j of digit_j * 300 - w(j), but the weights are different: 301 - 302 - w(0) = 1 303 - w(j) = w(j-1) * (base - t(j-1)) for j > 0 304 - 305 - For example, consider the little-endian sequence of base 8 digits 306 - 734251... Suppose the thresholds are 2, 3, 5, 5, 5, 5... This 307 - implies that the weights are 1, 1*(8-2) = 6, 6*(8-3) = 30, 30*(8-5) = 308 - 90, 90*(8-5) = 270, and so on. 7 is not less than 2, and 3 is not 309 - less than 3, but 4 is less than 5, so 4 is the last digit. The value 310 - of 734 is 7*1 + 3*6 + 4*30 = 145. The next integer is 251, with 311 - value 2*1 + 5*6 + 1*30 = 62. Decoding this representation is very 312 - similar to decoding a conventional integer: Start with a current 313 - value of N = 0 and a weight w = 1. Fetch the next digit d and 314 - increase N by d * w. 
If d is less than the current threshold (t) 315 - then stop, otherwise increase w by a factor of (base - t), update t 316 - for the next position, and repeat. 317 - 318 - Encoding this representation is similar to encoding a conventional 319 - integer: If N < t then output one digit for N and stop, otherwise 320 - output the digit for t + ((N - t) mod (base - t)), then replace N 321 - with (N - t) div (base - t), update t for the next position, and 322 - repeat. 323 - 324 - For any particular set of values of t(j), there is exactly one 325 - generalized variable-length representation of each nonnegative 326 - integral value. 327 - 328 - Bootstring uses little-endian ordering so that the deltas can be 329 - separated starting with the first. The t(j) values are defined in 330 - terms of the constants base, tmin, and tmax, and a state variable 331 - called bias: 332 - 333 - t(j) = base * (j + 1) - bias, 334 - clamped to the range tmin through tmax 335 - 336 - 337 - 338 - Costello Standards Track [Page 6] 339 - 340 - RFC 3492 IDNA Punycode March 2003 341 - 342 - 343 - The clamping means that if the formula yields a value less than tmin 344 - or greater than tmax, then t(j) = tmin or tmax, respectively. (In 345 - the pseudocode in section 6 "Bootstring algorithms", the expression 346 - base * (j + 1) is denoted by k for performance reasons.) These t(j) 347 - values cause the representation to favor integers within a particular 348 - range determined by the bias. 349 - 350 - 3.4 Bias adaptation 351 - 352 - After each delta is encoded or decoded, bias is set for the next 353 - delta as follows: 354 - 355 - 1. Delta is scaled in order to avoid overflow in the next step: 356 - 357 - let delta = delta div 2 358 - 359 - But when this is the very first delta, the divisor is not 2, but 360 - instead a constant called damp. This compensates for the fact 361 - that the second delta is usually much smaller than the first. 362 - 363 - 2. 
Delta is increased to compensate for the fact that the next delta 364 - will be inserting into a longer string: 365 - 366 - let delta = delta + (delta div numpoints) 367 - 368 - numpoints is the total number of code points encoded/decoded so 369 - far (including the one corresponding to this delta itself, and 370 - including the basic code points). 371 - 372 - 3. Delta is repeatedly divided until it falls within a threshold, to 373 - predict the minimum number of digits needed to represent the next 374 - delta: 375 - 376 - while delta > ((base - tmin) * tmax) div 2 377 - do let delta = delta div (base - tmin) 378 - 379 - 4. The bias is set: 380 - 381 - let bias = 382 - (base * the number of divisions performed in step 3) + 383 - (((base - tmin + 1) * delta) div (delta + skew)) 384 - 385 - The motivation for this procedure is that the current delta 386 - provides a hint about the likely size of the next delta, and so 387 - t(j) is set to tmax for the more significant digits starting with 388 - the one expected to be last, tmin for the less significant digits 389 - up through the one expected to be third-last, and somewhere 390 - between tmin and tmax for the digit expected to be second-last 391 - 392 - 393 - 394 - Costello Standards Track [Page 7] 395 - 396 - RFC 3492 IDNA Punycode March 2003 397 - 398 - 399 - (balancing the hope of the expected-last digit being unnecessary 400 - against the danger of it being insufficient). 401 - 402 - 4. Bootstring parameters 403 - 404 - Given a set of basic code points, one needs to be designated as the 405 - delimiter. The base cannot be greater than the number of 406 - distinguishable basic code points remaining. The digit-values in the 407 - range 0 through base-1 need to be associated with distinct non- 408 - delimiter basic code points. 
In some cases multiple code points need 409 - to have the same digit-value; for example, uppercase and lowercase 410 - versions of the same letter need to be equivalent if basic strings 411 - are case-insensitive. 412 - 413 - The initial value of n cannot be greater than the minimum non-basic 414 - code point that could appear in extended strings. 415 - 416 - The remaining five parameters (tmin, tmax, skew, damp, and the 417 - initial value of bias) need to satisfy the following constraints: 418 - 419 - 0 <= tmin <= tmax <= base-1 420 - skew >= 1 421 - damp >= 2 422 - initial_bias mod base <= base - tmin 423 - 424 - Provided the constraints are satisfied, these five parameters affect 425 - efficiency but not correctness. They are best chosen empirically. 426 - 427 - If support for mixed-case annotation is desired (see appendix A), 428 - make sure that the code points corresponding to 0 through tmax-1 all 429 - have both uppercase and lowercase forms. 430 - 431 - 5. Parameter values for Punycode 432 - 433 - Punycode uses the following Bootstring parameter values: 434 - 435 - base = 36 436 - tmin = 1 437 - tmax = 26 438 - skew = 38 439 - damp = 700 440 - initial_bias = 72 441 - initial_n = 128 = 0x80 442 - 443 - Although the only restriction Punycode imposes on the input integers 444 - is that they be nonnegative, these parameters are especially designed 445 - to work well with Unicode [UNICODE] code points, which are integers 446 - in the range 0..10FFFF (but not D800..DFFF, which are reserved for 447 - 448 - 449 - 450 - Costello Standards Track [Page 8] 451 - 452 - RFC 3492 IDNA Punycode March 2003 453 - 454 - 455 - use by the UTF-16 encoding of Unicode). 
The basic code points are 456 - the ASCII [ASCII] code points (0..7F), of which U+002D (-) is the 457 - delimiter, and some of the others have digit-values as follows: 458 - 459 - code points digit-values 460 - ------------ ---------------------- 461 - 41..5A (A-Z) = 0 to 25, respectively 462 - 61..7A (a-z) = 0 to 25, respectively 463 - 30..39 (0-9) = 26 to 35, respectively 464 - 465 - Using hyphen-minus as the delimiter implies that the encoded string 466 - can end with a hyphen-minus only if the Unicode string consists 467 - entirely of basic code points, but IDNA forbids such strings from 468 - being encoded. The encoded string can begin with a hyphen-minus, but 469 - IDNA prepends a prefix. Therefore IDNA using Punycode conforms to 470 - the RFC 952 rule that host name labels neither begin nor end with a 471 - hyphen-minus [RFC952]. 472 - 473 - A decoder MUST recognize the letters in both uppercase and lowercase 474 - forms (including mixtures of both forms). An encoder SHOULD output 475 - only uppercase forms or only lowercase forms, unless it uses mixed- 476 - case annotation (see appendix A). 477 - 478 - Presumably most users will not manually write or type encoded strings 479 - (as opposed to cutting and pasting them), but those who do will need 480 - to be alert to the potential visual ambiguity between the following 481 - sets of characters: 482 - 483 - G 6 484 - I l 1 485 - O 0 486 - S 5 487 - U V 488 - Z 2 489 - 490 - Such ambiguities are usually resolved by context, but in a Punycode 491 - encoded string there is no context apparent to humans. 492 - 493 - 6. Bootstring algorithms 494 - 495 - Some parts of the pseudocode can be omitted if the parameters satisfy 496 - certain conditions (for which Punycode qualifies). These parts are 497 - enclosed in {braces}, and notes immediately following the pseudocode 498 - explain the conditions under which they can be omitted. 
499 - 500 - 501 - 502 - 503 - 504 - 505 - 506 - Costello Standards Track [Page 9] 507 - 508 - RFC 3492 IDNA Punycode March 2003 509 - 510 - 511 - Formally, code points are integers, and hence the pseudocode assumes 512 - that arithmetic operations can be performed directly on code points. 513 - In some programming languages, explicit conversion between code 514 - points and integers might be necessary. 515 - 516 - 6.1 Bias adaptation function 517 - 518 - function adapt(delta,numpoints,firsttime): 519 - if firsttime then let delta = delta div damp 520 - else let delta = delta div 2 521 - let delta = delta + (delta div numpoints) 522 - let k = 0 523 - while delta > ((base - tmin) * tmax) div 2 do begin 524 - let delta = delta div (base - tmin) 525 - let k = k + base 526 - end 527 - return k + (((base - tmin + 1) * delta) div (delta + skew)) 528 - 529 - It does not matter whether the modifications to delta and k inside 530 - adapt() affect variables of the same name inside the 531 - encoding/decoding procedures, because after calling adapt() the 532 - caller does not read those variables before overwriting them. 
533 - 534 - 535 - 536 - 537 - 538 - 539 - 540 - 541 - 542 - 543 - 544 - 545 - 546 - 547 - 548 - 549 - 550 - 551 - 552 - 553 - 554 - 555 - 556 - 557 - 558 - 559 - 560 - 561 - 562 - Costello Standards Track [Page 10] 563 - 564 - RFC 3492 IDNA Punycode March 2003 565 - 566 - 567 - 6.2 Decoding procedure 568 - 569 - let n = initial_n 570 - let i = 0 571 - let bias = initial_bias 572 - let output = an empty string indexed from 0 573 - consume all code points before the last delimiter (if there is one) 574 - and copy them to output, fail on any non-basic code point 575 - if more than zero code points were consumed then consume one more 576 - (which will be the last delimiter) 577 - while the input is not exhausted do begin 578 - let oldi = i 579 - let w = 1 580 - for k = base to infinity in steps of base do begin 581 - consume a code point, or fail if there was none to consume 582 - let digit = the code point's digit-value, fail if it has none 583 - let i = i + digit * w, fail on overflow 584 - let t = tmin if k <= bias {+ tmin}, or 585 - tmax if k >= bias + tmax, or k - bias otherwise 586 - if digit < t then break 587 - let w = w * (base - t), fail on overflow 588 - end 589 - let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?) 590 - let n = n + i div (length(output) + 1), fail on overflow 591 - let i = i mod (length(output) + 1) 592 - {if n is a basic code point then fail} 593 - insert n into output at position i 594 - increment i 595 - end 596 - 597 - The full statement enclosed in braces (checking whether n is a basic 598 - code point) can be omitted if initial_n exceeds all basic code points 599 - (which is true for Punycode), because n is never less than initial_n. 600 - 601 - In the assignment of t, where t is clamped to the range tmin through 602 - tmax, "+ tmin" can always be omitted. 
This makes the clamping 603 - calculation incorrect when bias < k < bias + tmin, but that cannot 604 - happen because of the way bias is computed and because of the 605 - constraints on the parameters. 606 - 607 - Because the decoder state can only advance monotonically, and there 608 - is only one representation of any delta, there is therefore only one 609 - encoded string that can represent a given sequence of integers. The 610 - only error conditions are invalid code points, unexpected end-of- 611 - input, overflow, and basic code points encoded using deltas instead 612 - of appearing literally. If the decoder fails on these errors as 613 - shown above, then it cannot produce the same output for two distinct 614 - inputs. Without this property it would have been necessary to re- 615 - 616 - 617 - 618 - Costello Standards Track [Page 11] 619 - 620 - RFC 3492 IDNA Punycode March 2003 621 - 622 - 623 - encode the output and verify that it matches the input in order to 624 - guarantee the uniqueness of the encoding. 
625 - 626 - 6.3 Encoding procedure 627 - 628 - let n = initial_n 629 - let delta = 0 630 - let bias = initial_bias 631 - let h = b = the number of basic code points in the input 632 - copy them to the output in order, followed by a delimiter if b > 0 633 - {if the input contains a non-basic code point < n then fail} 634 - while h < length(input) do begin 635 - let m = the minimum {non-basic} code point >= n in the input 636 - let delta = delta + (m - n) * (h + 1), fail on overflow 637 - let n = m 638 - for each code point c in the input (in order) do begin 639 - if c < n {or c is basic} then increment delta, fail on overflow 640 - if c == n then begin 641 - let q = delta 642 - for k = base to infinity in steps of base do begin 643 - let t = tmin if k <= bias {+ tmin}, or 644 - tmax if k >= bias + tmax, or k - bias otherwise 645 - if q < t then break 646 - output the code point for digit t + ((q - t) mod (base - t)) 647 - let q = (q - t) div (base - t) 648 - end 649 - output the code point for digit q 650 - let bias = adapt(delta, h + 1, test h equals b?) 651 - let delta = 0 652 - increment h 653 - end 654 - end 655 - increment delta and n 656 - end 657 - 658 - The full statement enclosed in braces (checking whether the input 659 - contains a non-basic code point less than n) can be omitted if all 660 - code points less than initial_n are basic code points (which is true 661 - for Punycode if code points are unsigned). 662 - 663 - The brace-enclosed conditions "non-basic" and "or c is basic" can be 664 - omitted if initial_n exceeds all basic code points (which is true for 665 - Punycode), because the code point being tested is never less than 666 - initial_n. 667 - 668 - In the assignment of t, where t is clamped to the range tmin through 669 - tmax, "+ tmin" can always be omitted. 
This makes the clamping 670 - calculation incorrect when bias < k < bias + tmin, but that cannot 671 - 672 - 673 - 674 - Costello Standards Track [Page 12] 675 - 676 - RFC 3492 IDNA Punycode March 2003 677 - 678 - 679 - happen because of the way bias is computed and because of the 680 - constraints on the parameters. 681 - 682 - The checks for overflow are necessary to avoid producing invalid 683 - output when the input contains very large values or is very long. 684 - 685 - The increment of delta at the bottom of the outer loop cannot 686 - overflow because delta < length(input) before the increment, and 687 - length(input) is already assumed to be representable. The increment 688 - of n could overflow, but only if h == length(input), in which case 689 - the procedure is finished anyway. 690 - 691 - 6.4 Overflow handling 692 - 693 - For IDNA, 26-bit unsigned integers are sufficient to handle all valid 694 - IDNA labels without overflow, because any string that needed a 27-bit 695 - delta would have to exceed either the code point limit (0..10FFFF) or 696 - the label length limit (63 characters). However, overflow handling 697 - is necessary because the inputs are not necessarily valid IDNA 698 - labels. 699 - 700 - If the programming language does not provide overflow detection, the 701 - following technique can be used. Suppose A, B, and C are 702 - representable nonnegative integers and C is nonzero. Then A + B 703 - overflows if and only if B > maxint - A, and A + (B * C) overflows if 704 - and only if B > (maxint - A) div C, where maxint is the greatest 705 - integer for which maxint + 1 cannot be represented. Refer to 706 - appendix C "Punycode sample implementation" for demonstrations of 707 - this technique in the C language. 708 - 709 - The decoding and encoding algorithms shown in sections 6.2 and 6.3 710 - handle overflow by detecting it whenever it happens. 
Another 711 - approach is to enforce limits on the inputs that prevent overflow 712 - from happening. For example, if the encoder were to verify that no 713 - input code points exceed M and that the input length does not exceed 714 - L, then no delta could ever exceed (M - initial_n) * (L + 1), and 715 - hence no overflow could occur if integer variables were capable of 716 - representing values that large. This prevention approach would 717 - impose more restrictions on the input than the detection approach 718 - does, but might be considered simpler in some programming languages. 719 - 720 - In theory, the decoder could use an analogous approach, limiting the 721 - number of digits in a variable-length integer (that is, limiting the 722 - number of iterations in the innermost loop). However, the number of 723 - digits that suffice to represent a given delta can sometimes 724 - represent much larger deltas (because of the adaptation), and hence 725 - this approach would probably need integers wider than 32 bits. 726 - 727 - 728 - 729 - 730 - Costello Standards Track [Page 13] 731 - 732 - RFC 3492 IDNA Punycode March 2003 733 - 734 - 735 - Yet another approach for the decoder is to allow overflow to occur, 736 - but to check the final output string by re-encoding it and comparing 737 - to the decoder input. If and only if they do not match (using a 738 - case-insensitive ASCII comparison) overflow has occurred. This 739 - delayed-detection approach would not impose any more restrictions on 740 - the input than the immediate-detection approach does, and might be 741 - considered simpler in some programming languages. 742 - 743 - In fact, if the decoder is used only inside the IDNA ToUnicode 744 - operation [IDNA], then it need not check for overflow at all, because 745 - ToUnicode performs a higher level re-encoding and comparison, and a 746 - mismatch has the same consequence as if the Punycode decoder had 747 - failed. 748 - 749 - 7. 
Punycode examples 750 - 751 - 7.1 Sample strings 752 - 753 - In the Punycode encodings below, the ACE prefix is not shown. 754 - Backslashes show where line breaks have been inserted in strings too 755 - long for one line. 756 - 757 - The first several examples are all translations of the sentence "Why 758 - can't they just speak in <language>?" (courtesy of Michael Kaplan's 759 - "provincial" page [PROVINCIAL]). Word breaks and punctuation have 760 - been removed, as is often done in domain names. 761 - 762 - (A) Arabic (Egyptian): 763 - u+0644 u+064A u+0647 u+0645 u+0627 u+0628 u+062A u+0643 u+0644 764 - u+0645 u+0648 u+0634 u+0639 u+0631 u+0628 u+064A u+061F 765 - Punycode: egbpdaj6bu4bxfgehfvwxn 766 - 767 - (B) Chinese (simplified): 768 - u+4ED6 u+4EEC u+4E3A u+4EC0 u+4E48 u+4E0D u+8BF4 u+4E2D u+6587 769 - Punycode: ihqwcrb4cv8a8dqg056pqjye 770 - 771 - (C) Chinese (traditional): 772 - u+4ED6 u+5011 u+7232 u+4EC0 u+9EBD u+4E0D u+8AAA u+4E2D u+6587 773 - Punycode: ihqwctvzc91f659drss3x8bo0yb 774 - 775 - (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky 776 - U+0050 u+0072 u+006F u+010D u+0070 u+0072 u+006F u+0073 u+0074 777 - u+011B u+006E u+0065 u+006D u+006C u+0075 u+0076 u+00ED u+010D 778 - u+0065 u+0073 u+006B u+0079 779 - Punycode: Proprostnemluvesky-uyb24dma41a 780 - 781 - 782 - 783 - 784 - 785 - 786 - Costello Standards Track [Page 14] 787 - 788 - RFC 3492 IDNA Punycode March 2003 789 - 790 - 791 - (E) Hebrew: 792 - u+05DC u+05DE u+05D4 u+05D4 u+05DD u+05E4 u+05E9 u+05D5 u+05D8 793 - u+05DC u+05D0 u+05DE u+05D3 u+05D1 u+05E8 u+05D9 u+05DD u+05E2 794 - u+05D1 u+05E8 u+05D9 u+05EA 795 - Punycode: 4dbcagdahymbxekheh6e0a7fei0b 796 - 797 - (F) Hindi (Devanagari): 798 - u+092F u+0939 u+0932 u+094B u+0917 u+0939 u+093F u+0928 u+094D 799 - u+0926 u+0940 u+0915 u+094D u+092F u+094B u+0902 u+0928 u+0939 800 - u+0940 u+0902 u+092C u+094B u+0932 u+0938 u+0915 u+0924 u+0947 801 - u+0939 u+0948 u+0902 802 - Punycode: 
i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd 803 - 804 - (G) Japanese (kanji and hiragana): 805 - u+306A u+305C u+307F u+3093 u+306A u+65E5 u+672C u+8A9E u+3092 806 - u+8A71 u+3057 u+3066 u+304F u+308C u+306A u+3044 u+306E u+304B 807 - Punycode: n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa 808 - 809 - (H) Korean (Hangul syllables): 810 - u+C138 u+ACC4 u+C758 u+BAA8 u+B4E0 u+C0AC u+B78C u+B4E4 u+C774 811 - u+D55C u+AD6D u+C5B4 u+B97C u+C774 u+D574 u+D55C u+B2E4 u+BA74 812 - u+C5BC u+B9C8 u+B098 u+C88B u+C744 u+AE4C 813 - Punycode: 989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j\ 814 - psd879ccm6fea98c 815 - 816 - (I) Russian (Cyrillic): 817 - U+043F u+043E u+0447 u+0435 u+043C u+0443 u+0436 u+0435 u+043E 818 - u+043D u+0438 u+043D u+0435 u+0433 u+043E u+0432 u+043E u+0440 819 - u+044F u+0442 u+043F u+043E u+0440 u+0443 u+0441 u+0441 u+043A 820 - u+0438 821 - Punycode: b1abfaaepdrnnbgefbaDotcwatmq2g4l 822 - 823 - (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol 824 - U+0050 u+006F u+0072 u+0071 u+0075 u+00E9 u+006E u+006F u+0070 825 - u+0075 u+0065 u+0064 u+0065 u+006E u+0073 u+0069 u+006D u+0070 826 - u+006C u+0065 u+006D u+0065 u+006E u+0074 u+0065 u+0068 u+0061 827 - u+0062 u+006C u+0061 u+0072 u+0065 u+006E U+0045 u+0073 u+0070 828 - u+0061 u+00F1 u+006F u+006C 829 - Punycode: PorqunopuedensimplementehablarenEspaol-fmd56a 830 - 831 - (K) Vietnamese: 832 - T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\ 833 - <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t 834 - U+0054 u+1EA1 u+0069 u+0073 u+0061 u+006F u+0068 u+1ECD u+006B 835 - u+0068 u+00F4 u+006E u+0067 u+0074 u+0068 u+1EC3 u+0063 u+0068 836 - u+1EC9 u+006E u+00F3 u+0069 u+0074 u+0069 u+1EBF u+006E u+0067 837 - U+0056 u+0069 u+1EC7 u+0074 838 - Punycode: TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g 839 - 840 - 841 - 842 - Costello Standards Track [Page 15] 843 - 844 - RFC 3492 IDNA Punycode March 2003 845 - 846 - 847 - The next several examples are all names of Japanese 
music artists, 848 - song titles, and TV programs, just because the author happens to have 849 - them handy (but Japanese is useful for providing examples of single- 850 - row text, two-row text, ideographic text, and various mixtures 851 - thereof). 852 - 853 - (L) 3<nen>B<gumi><kinpachi><sensei> 854 - u+0033 u+5E74 U+0042 u+7D44 u+91D1 u+516B u+5148 u+751F 855 - Punycode: 3B-ww4c5e180e575a65lsy2b 856 - 857 - (M) <amuro><namie>-with-SUPER-MONKEYS 858 - u+5B89 u+5BA4 u+5948 u+7F8E u+6075 u+002D u+0077 u+0069 u+0074 859 - u+0068 u+002D U+0053 U+0055 U+0050 U+0045 U+0052 u+002D U+004D 860 - U+004F U+004E U+004B U+0045 U+0059 U+0053 861 - Punycode: -with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n 862 - 863 - (N) Hello-Another-Way-<sorezore><no><basho> 864 - U+0048 u+0065 u+006C u+006C u+006F u+002D U+0041 u+006E u+006F 865 - u+0074 u+0068 u+0065 u+0072 u+002D U+0057 u+0061 u+0079 u+002D 866 - u+305D u+308C u+305E u+308C u+306E u+5834 u+6240 867 - Punycode: Hello-Another-Way--fc4qua05auwb3674vfr0b 868 - 869 - (O) <hitotsu><yane><no><shita>2 870 - u+3072 u+3068 u+3064 u+5C4B u+6839 u+306E u+4E0B u+0032 871 - Punycode: 2-u9tlzr9756bt3uc0v 872 - 873 - (P) Maji<de>Koi<suru>5<byou><mae> 874 - U+004D u+0061 u+006A u+0069 u+3067 U+004B u+006F u+0069 u+3059 875 - u+308B u+0035 u+79D2 u+524D 876 - Punycode: MajiKoi5-783gue6qz075azm5e 877 - 878 - (Q) <pafii>de<runba> 879 - u+30D1 u+30D5 u+30A3 u+30FC u+0064 u+0065 u+30EB u+30F3 u+30D0 880 - Punycode: de-jg4avhby1noc0d 881 - 882 - (R) <sono><supiido><de> 883 - u+305D u+306E u+30B9 u+30D4 u+30FC u+30C9 u+3067 884 - Punycode: d9juau41awczczp 885 - 886 - The last example is an ASCII string that breaks the existing rules 887 - for host name labels. (It is not a realistic example for IDNA, 888 - because IDNA never encodes pure ASCII labels.) 
889 - 890 - (S) -> $1.00 <- 891 - u+002D u+003E u+0020 u+0024 u+0031 u+002E u+0030 u+0030 u+0020 892 - u+003C u+002D 893 - Punycode: -> $1.00 <-- 894 - 895 - 896 - 897 - 898 - Costello Standards Track [Page 16] 899 - 900 - RFC 3492 IDNA Punycode March 2003 901 - 902 - 903 - 7.2 Decoding traces 904 - 905 - In the following traces, the evolving state of the decoder is shown 906 - as a sequence of hexadecimal values, representing the code points in 907 - the extended string. An asterisk appears just after the most 908 - recently inserted code point, indicating both n (the value preceding 909 - the asterisk) and i (the position of the value just after the 910 - asterisk). Other numerical values are decimal. 911 - 912 - Decoding trace of example B from section 7.1: 913 - 914 - n is 128, i is 0, bias is 72 915 - input is "ihqwcrb4cv8a8dqg056pqjye" 916 - there is no delimiter, so extended string starts empty 917 - delta "ihq" decodes to 19853 918 - bias becomes 21 919 - 4E0D * 920 - delta "wc" decodes to 64 921 - bias becomes 20 922 - 4E0D 4E2D * 923 - delta "rb" decodes to 37 924 - bias becomes 13 925 - 4E3A * 4E0D 4E2D 926 - delta "4c" decodes to 56 927 - bias becomes 17 928 - 4E3A 4E48 * 4E0D 4E2D 929 - delta "v8a" decodes to 599 930 - bias becomes 32 931 - 4E3A 4EC0 * 4E48 4E0D 4E2D 932 - delta "8d" decodes to 130 933 - bias becomes 23 934 - 4ED6 * 4E3A 4EC0 4E48 4E0D 4E2D 935 - delta "qg" decodes to 154 936 - bias becomes 25 937 - 4ED6 4EEC * 4E3A 4EC0 4E48 4E0D 4E2D 938 - delta "056p" decodes to 46301 939 - bias becomes 84 940 - 4ED6 4EEC 4E3A 4EC0 4E48 4E0D 4E2D 6587 * 941 - delta "qjye" decodes to 88531 942 - bias becomes 90 943 - 4ED6 4EEC 4E3A 4EC0 4E48 4E0D 8BF4 * 4E2D 6587 944 - 945 - 946 - 947 - 948 - 949 - 950 - 951 - 952 - 953 - 954 - Costello Standards Track [Page 17] 955 - 956 - RFC 3492 IDNA Punycode March 2003 957 - 958 - 959 - Decoding trace of example L from section 7.1: 960 - 961 - n is 128, i is 0, bias is 72 962 - input is 
"3B-ww4c5e180e575a65lsy2b" 963 - literal portion is "3B-", so extended string starts as: 964 - 0033 0042 965 - delta "ww4c" decodes to 62042 966 - bias becomes 27 967 - 0033 0042 5148 * 968 - delta "5e" decodes to 139 969 - bias becomes 24 970 - 0033 0042 516B * 5148 971 - delta "180e" decodes to 16683 972 - bias becomes 67 973 - 0033 5E74 * 0042 516B 5148 974 - delta "575a" decodes to 34821 975 - bias becomes 82 976 - 0033 5E74 0042 516B 5148 751F * 977 - delta "65l" decodes to 14592 978 - bias becomes 67 979 - 0033 5E74 0042 7D44 * 516B 5148 751F 980 - delta "sy2b" decodes to 42088 981 - bias becomes 84 982 - 0033 5E74 0042 7D44 91D1 * 516B 5148 751F 983 - 984 - 985 - 986 - 987 - 988 - 989 - 990 - 991 - 992 - 993 - 994 - 995 - 996 - 997 - 998 - 999 - 1000 - 1001 - 1002 - 1003 - 1004 - 1005 - 1006 - 1007 - 1008 - 1009 - 1010 - Costello Standards Track [Page 18] 1011 - 1012 - RFC 3492 IDNA Punycode March 2003 1013 - 1014 - 1015 - 7.3 Encoding traces 1016 - 1017 - In the following traces, code point values are hexadecimal, while 1018 - other numerical values are decimal. 
1019 - 1020 - Encoding trace of example B from section 7.1: 1021 - 1022 - bias is 72 1023 - input is: 1024 - 4ED6 4EEC 4E3A 4EC0 4E48 4E0D 8BF4 4E2D 6587 1025 - there are no basic code points, so no literal portion 1026 - next code point to insert is 4E0D 1027 - needed delta is 19853, encodes as "ihq" 1028 - bias becomes 21 1029 - next code point to insert is 4E2D 1030 - needed delta is 64, encodes as "wc" 1031 - bias becomes 20 1032 - next code point to insert is 4E3A 1033 - needed delta is 37, encodes as "rb" 1034 - bias becomes 13 1035 - next code point to insert is 4E48 1036 - needed delta is 56, encodes as "4c" 1037 - bias becomes 17 1038 - next code point to insert is 4EC0 1039 - needed delta is 599, encodes as "v8a" 1040 - bias becomes 32 1041 - next code point to insert is 4ED6 1042 - needed delta is 130, encodes as "8d" 1043 - bias becomes 23 1044 - next code point to insert is 4EEC 1045 - needed delta is 154, encodes as "qg" 1046 - bias becomes 25 1047 - next code point to insert is 6587 1048 - needed delta is 46301, encodes as "056p" 1049 - bias becomes 84 1050 - next code point to insert is 8BF4 1051 - needed delta is 88531, encodes as "qjye" 1052 - bias becomes 90 1053 - output is "ihqwcrb4cv8a8dqg056pqjye" 1054 - 1055 - 1056 - 1057 - 1058 - 1059 - 1060 - 1061 - 1062 - 1063 - 1064 - 1065 - 1066 - Costello Standards Track [Page 19] 1067 - 1068 - RFC 3492 IDNA Punycode March 2003 1069 - 1070 - 1071 - Encoding trace of example L from section 7.1: 1072 - 1073 - bias is 72 1074 - input is: 1075 - 0033 5E74 0042 7D44 91D1 516B 5148 751F 1076 - basic code points (0033, 0042) are copied to literal portion: "3B-" 1077 - next code point to insert is 5148 1078 - needed delta is 62042, encodes as "ww4c" 1079 - bias becomes 27 1080 - next code point to insert is 516B 1081 - needed delta is 139, encodes as "5e" 1082 - bias becomes 24 1083 - next code point to insert is 5E74 1084 - needed delta is 16683, encodes as "180e" 1085 - bias becomes 67 1086 - next code point 
to insert is 751F 1087 - needed delta is 34821, encodes as "575a" 1088 - bias becomes 82 1089 - next code point to insert is 7D44 1090 - needed delta is 14592, encodes as "65l" 1091 - bias becomes 67 1092 - next code point to insert is 91D1 1093 - needed delta is 42088, encodes as "sy2b" 1094 - bias becomes 84 1095 - output is "3B-ww4c5e180e575a65lsy2b" 1096 - 1097 - 8. Security Considerations 1098 - 1099 - Users expect each domain name in DNS to be controlled by a single 1100 - authority. If a Unicode string intended for use as a domain label 1101 - could map to multiple ACE labels, then an internationalized domain 1102 - name could map to multiple ASCII domain names, each controlled by a 1103 - different authority, some of which could be spoofs that hijack 1104 - service requests intended for another. Therefore Punycode is 1105 - designed so that each Unicode string has a unique encoding. 1106 - 1107 - However, there can still be multiple Unicode representations of the 1108 - "same" text, for various definitions of "same". This problem is 1109 - addressed to some extent by the Unicode standard under the topic of 1110 - canonicalization, and this work is leveraged for domain names by 1111 - Nameprep [NAMEPREP]. 1112 - 1113 - 1114 - 1115 - 1116 - 1117 - 1118 - 1119 - 1120 - 1121 - 1122 - Costello Standards Track [Page 20] 1123 - 1124 - RFC 3492 IDNA Punycode March 2003 1125 - 1126 - 1127 - 9. References 1128 - 1129 - 9.1 Normative References 1130 - 1131 - [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate 1132 - Requirement Levels", BCP 14, RFC 2119, March 1997. 1133 - 1134 - 9.2 Informative References 1135 - 1136 - [RFC952] Harrenstien, K., Stahl, M. and E. Feinler, "DOD Internet 1137 - Host Table Specification", RFC 952, October 1985. 1138 - 1139 - [RFC1034] Mockapetris, P., "Domain Names - Concepts and 1140 - Facilities", STD 13, RFC 1034, November 1987. 1141 - 1142 - [IDNA] Faltstrom, P., Hoffman, P. and A. 
Costello, 1143 - "Internationalizing Domain Names in Applications 1144 - (IDNA)", RFC 3490, March 2003. 1145 - 1146 - [NAMEPREP] Hoffman, P. and M. Blanchet, "Nameprep: A Stringprep 1147 - Profile for Internationalized Domain Names (IDN)", RFC 1148 - 3491, March 2003. 1149 - 1150 - [ASCII] Cerf, V., "ASCII format for Network Interchange", RFC 1151 - 20, October 1969. 1152 - 1153 - [PROVINCIAL] Kaplan, M., "The 'anyone can be provincial!' page", 1154 - http://www.trigeminal.com/samples/provincial.html. 1155 - 1156 - [UNICODE] The Unicode Consortium, "The Unicode Standard", 1157 - http://www.unicode.org/unicode/standard/standard.html. 1158 - 1159 - 1160 - 1161 - 1162 - 1163 - 1164 - 1165 - 1166 - 1167 - 1168 - 1169 - 1170 - 1171 - 1172 - 1173 - 1174 - 1175 - 1176 - 1177 - 1178 - Costello Standards Track [Page 21] 1179 - 1180 - RFC 3492 IDNA Punycode March 2003 1181 - 1182 - 1183 - A. Mixed-case annotation 1184 - 1185 - In order to use Punycode to represent case-insensitive strings, 1186 - higher layers need to case-fold the strings prior to Punycode 1187 - encoding. The encoded string can use mixed case as an annotation 1188 - telling how to convert the folded string into a mixed-case string for 1189 - display purposes. Note, however, that mixed-case annotation is not 1190 - used by the ToASCII and ToUnicode operations specified in [IDNA], and 1191 - therefore implementors of IDNA can disregard this appendix. 1192 - 1193 - Basic code points can use mixed case directly, because the decoder 1194 - copies them verbatim, leaving lowercase code points lowercase, and 1195 - leaving uppercase code points uppercase. Each non-basic code point 1196 - is represented by a delta, which is represented by a sequence of 1197 - basic code points, the last of which provides the annotation. 
If it 1198 - is uppercase, it is a suggestion to map the non-basic code point to 1199 - uppercase (if possible); if it is lowercase, it is a suggestion to 1200 - map the non-basic code point to lowercase (if possible). 1201 - 1202 - These annotations do not alter the code points returned by decoders; 1203 - the annotations are returned separately, for the caller to use or 1204 - ignore. Encoders can accept annotations in addition to code points, 1205 - but the annotations do not alter the output, except to influence the 1206 - uppercase/lowercase form of ASCII letters. 1207 - 1208 - Punycode encoders and decoders need not support these annotations, 1209 - and higher layers need not use them. 1210 - 1211 - B. Disclaimer and license 1212 - 1213 - Regarding this entire document or any portion of it (including the 1214 - pseudocode and C code), the author makes no guarantees and is not 1215 - responsible for any damage resulting from its use. The author grants 1216 - irrevocable permission to anyone to use, modify, and distribute it in 1217 - any way that does not diminish the rights of anyone else to use, 1218 - modify, and distribute it, provided that redistributed derivative 1219 - works do not contain misleading author or version information. 1220 - Derivative works need not be licensed under similar terms. 1221 - 1222 - 1223 - 1224 - 1225 - 1226 - 1227 - 1228 - 1229 - 1230 - 1231 - 1232 - 1233 - 1234 - Costello Standards Track [Page 22] 1235 - 1236 - RFC 3492 IDNA Punycode March 2003 1237 - 1238 - 1239 - C. Punycode sample implementation 1240 - 1241 - /* 1242 - punycode.c from RFC 3492 1243 - http://www.nicemice.net/idn/ 1244 - Adam M. Costello 1245 - http://www.nicemice.net/amc/ 1246 - 1247 - This is ANSI C code (C89) implementing Punycode (RFC 3492). 
1248 - 1249 - */ 1250 - 1251 - 1252 - /************************************************************/ 1253 - /* Public interface (would normally go in its own .h file): */ 1254 - 1255 - #include <limits.h> 1256 - 1257 - enum punycode_status { 1258 - punycode_success, 1259 - punycode_bad_input, /* Input is invalid. */ 1260 - punycode_big_output, /* Output would exceed the space provided. */ 1261 - punycode_overflow /* Input needs wider integers to process. */ 1262 - }; 1263 - 1264 - #if UINT_MAX >= (1 << 26) - 1 1265 - typedef unsigned int punycode_uint; 1266 - #else 1267 - typedef unsigned long punycode_uint; 1268 - #endif 1269 - 1270 - enum punycode_status punycode_encode( 1271 - punycode_uint input_length, 1272 - const punycode_uint input[], 1273 - const unsigned char case_flags[], 1274 - punycode_uint *output_length, 1275 - char output[] ); 1276 - 1277 - /* punycode_encode() converts Unicode to Punycode. The input */ 1278 - /* is represented as an array of Unicode code points (not code */ 1279 - /* units; surrogate pairs are not allowed), and the output */ 1280 - /* will be represented as an array of ASCII code points. The */ 1281 - /* output string is *not* null-terminated; it will contain */ 1282 - /* zeros if and only if the input contains zeros. (Of course */ 1283 - /* the caller can leave room for a terminator and add one if */ 1284 - /* needed.) The input_length is the number of code points in */ 1285 - /* the input. The output_length is an in/out argument: the */ 1286 - /* caller passes in the maximum number of code points that it */ 1287 - 1288 - 1289 - 1290 - Costello Standards Track [Page 23] 1291 - 1292 - RFC 3492 IDNA Punycode March 2003 1293 - 1294 - 1295 - /* can receive, and on successful return it will contain the */ 1296 - /* number of code points actually output. 
The case_flags array */ 1297 - /* holds input_length boolean values, where nonzero suggests that */ 1298 - /* the corresponding Unicode character be forced to uppercase */ 1299 - /* after being decoded (if possible), and zero suggests that */ 1300 - /* it be forced to lowercase (if possible). ASCII code points */ 1301 - /* are encoded literally, except that ASCII letters are forced */ 1302 - /* to uppercase or lowercase according to the corresponding */ 1303 - /* uppercase flags. If case_flags is a null pointer then ASCII */ 1304 - /* letters are left as they are, and other code points are */ 1305 - /* treated as if their uppercase flags were zero. The return */ 1306 - /* value can be any of the punycode_status values defined above */ 1307 - /* except punycode_bad_input; if not punycode_success, then */ 1308 - /* output_length and output might contain garbage. */ 1309 - 1310 - enum punycode_status punycode_decode( 1311 - punycode_uint input_length, 1312 - const char input[], 1313 - punycode_uint *output_length, 1314 - punycode_uint output[], 1315 - unsigned char case_flags[] ); 1316 - 1317 - /* punycode_decode() converts Punycode to Unicode. The input is */ 1318 - /* represented as an array of ASCII code points, and the output */ 1319 - /* will be represented as an array of Unicode code points. The */ 1320 - /* input_length is the number of code points in the input. The */ 1321 - /* output_length is an in/out argument: the caller passes in */ 1322 - /* the maximum number of code points that it can receive, and */ 1323 - /* on successful return it will contain the actual number of */ 1324 - /* code points output. The case_flags array needs room for at */ 1325 - /* least output_length values, or it can be a null pointer if the */ 1326 - /* case information is not needed. 
A nonzero flag suggests that */ 1327 - /* the corresponding Unicode character be forced to uppercase */ 1328 - /* by the caller (if possible), while zero suggests that it be */ 1329 - /* forced to lowercase (if possible). ASCII code points are */ 1330 - /* output already in the proper case, but their flags will be set */ 1331 - /* appropriately so that applying the flags would be harmless. */ 1332 - /* The return value can be any of the punycode_status values */ 1333 - /* defined above; if not punycode_success, then output_length, */ 1334 - /* output, and case_flags might contain garbage. On success, the */ 1335 - /* decoder will never need to write an output_length greater than */ 1336 - /* input_length, because of how the encoding is defined. */ 1337 - 1338 - /**********************************************************/ 1339 - /* Implementation (would normally go in its own .c file): */ 1340 - 1341 - #include <string.h> 1342 - 1343 - 1344 - 1345 - 1346 - Costello Standards Track [Page 24] 1347 - 1348 - RFC 3492 IDNA Punycode March 2003 1349 - 1350 - 1351 - /*** Bootstring parameters for Punycode ***/ 1352 - 1353 - enum { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700, 1354 - initial_bias = 72, initial_n = 0x80, delimiter = 0x2D }; 1355 - 1356 - /* basic(cp) tests whether cp is a basic code point: */ 1357 - #define basic(cp) ((punycode_uint)(cp) < 0x80) 1358 - 1359 - /* delim(cp) tests whether cp is a delimiter: */ 1360 - #define delim(cp) ((cp) == delimiter) 1361 - 1362 - /* decode_digit(cp) returns the numeric value of a basic code */ 1363 - /* point (for use in representing integers) in the range 0 to */ 1364 - /* base-1, or base if cp does not represent a value. */ 1365 - 1366 - static punycode_uint decode_digit(punycode_uint cp) 1367 - { 1368 - return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 : 1369 - cp - 97 < 26 ? 
cp - 97 : base; 1370 - } 1371 - 1372 - /* encode_digit(d,flag) returns the basic code point whose value */ 1373 - /* (when used for representing integers) is d, which needs to be in */ 1374 - /* the range 0 to base-1. The lowercase form is used unless flag is */ 1375 - /* nonzero, in which case the uppercase form is used. The behavior */ 1376 - /* is undefined if flag is nonzero and digit d has no uppercase form. */ 1377 - 1378 - static char encode_digit(punycode_uint d, int flag) 1379 - { 1380 - return d + 22 + 75 * (d < 26) - ((flag != 0) << 5); 1381 - /* 0..25 map to ASCII a..z or A..Z */ 1382 - /* 26..35 map to ASCII 0..9 */ 1383 - } 1384 - 1385 - /* flagged(bcp) tests whether a basic code point is flagged */ 1386 - /* (uppercase). The behavior is undefined if bcp is not a */ 1387 - /* basic code point. */ 1388 - 1389 - #define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26) 1390 - 1391 - /* encode_basic(bcp,flag) forces a basic code point to lowercase */ 1392 - /* if flag is zero, uppercase if flag is nonzero, and returns */ 1393 - /* the resulting code point. The code point is unchanged if it */ 1394 - /* is caseless. The behavior is undefined if bcp is not a basic */ 1395 - /* code point. */ 1396 - 1397 - static char encode_basic(punycode_uint bcp, int flag) 1398 - { 1399 - 1400 - 1401 - 1402 - Costello Standards Track [Page 25] 1403 - 1404 - RFC 3492 IDNA Punycode March 2003 1405 - 1406 - 1407 - bcp -= (bcp - 97 < 26) << 5; 1408 - return bcp + ((!flag && (bcp - 65 < 26)) << 5); 1409 - } 1410 - 1411 - /*** Platform-specific constants ***/ 1412 - 1413 - /* maxint is the maximum value of a punycode_uint variable: */ 1414 - static const punycode_uint maxint = -1; 1415 - /* Because maxint is unsigned, -1 becomes the maximum value. */ 1416 - 1417 - /*** Bias adaptation function ***/ 1418 - 1419 - static punycode_uint adapt( 1420 - punycode_uint delta, punycode_uint numpoints, int firsttime ) 1421 - { 1422 - punycode_uint k; 1423 - 1424 - delta = firsttime ? 
delta / damp : delta >> 1; 1425 - /* delta >> 1 is a faster way of doing delta / 2 */ 1426 - delta += delta / numpoints; 1427 - 1428 - for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base) { 1429 - delta /= base - tmin; 1430 - } 1431 - 1432 - return k + (base - tmin + 1) * delta / (delta + skew); 1433 - } 1434 - 1435 - /*** Main encode function ***/ 1436 - 1437 - enum punycode_status punycode_encode( 1438 - punycode_uint input_length, 1439 - const punycode_uint input[], 1440 - const unsigned char case_flags[], 1441 - punycode_uint *output_length, 1442 - char output[] ) 1443 - { 1444 - punycode_uint n, delta, h, b, out, max_out, bias, j, m, q, k, t; 1445 - 1446 - /* Initialize the state: */ 1447 - 1448 - n = initial_n; 1449 - delta = out = 0; 1450 - max_out = *output_length; 1451 - bias = initial_bias; 1452 - 1453 - /* Handle the basic code points: */ 1454 - 1455 - 1456 - 1457 - 1458 - Costello Standards Track [Page 26] 1459 - 1460 - RFC 3492 IDNA Punycode March 2003 1461 - 1462 - 1463 - for (j = 0; j < input_length; ++j) { 1464 - if (basic(input[j])) { 1465 - if (max_out - out < 2) return punycode_big_output; 1466 - output[out++] = 1467 - case_flags ? encode_basic(input[j], case_flags[j]) : input[j]; 1468 - } 1469 - /* else if (input[j] < n) return punycode_bad_input; */ 1470 - /* (not needed for Punycode with unsigned code points) */ 1471 - } 1472 - 1473 - h = b = out; 1474 - 1475 - /* h is the number of code points that have been handled, b is the */ 1476 - /* number of basic code points, and out is the number of characters */ 1477 - /* that have been output. */ 1478 - 1479 - if (b > 0) output[out++] = delimiter; 1480 - 1481 - /* Main encoding loop: */ 1482 - 1483 - while (h < input_length) { 1484 - /* All non-basic code points < n have been */ 1485 - /* handled already. 
Find the next larger one: */ 1486 - 1487 - for (m = maxint, j = 0; j < input_length; ++j) { 1488 - /* if (basic(input[j])) continue; */ 1489 - /* (not needed for Punycode) */ 1490 - if (input[j] >= n && input[j] < m) m = input[j]; 1491 - } 1492 - 1493 - /* Increase delta enough to advance the decoder's */ 1494 - /* <n,i> state to <m,0>, but guard against overflow: */ 1495 - 1496 - if (m - n > (maxint - delta) / (h + 1)) return punycode_overflow; 1497 - delta += (m - n) * (h + 1); 1498 - n = m; 1499 - 1500 - for (j = 0; j < input_length; ++j) { 1501 - /* Punycode does not need to check whether input[j] is basic: */ 1502 - if (input[j] < n /* || basic(input[j]) */ ) { 1503 - if (++delta == 0) return punycode_overflow; 1504 - } 1505 - 1506 - if (input[j] == n) { 1507 - /* Represent delta as a generalized variable-length integer: */ 1508 - 1509 - for (q = delta, k = base; ; k += base) { 1510 - if (out >= max_out) return punycode_big_output; 1511 - 1512 - 1513 - 1514 - Costello Standards Track [Page 27] 1515 - 1516 - RFC 3492 IDNA Punycode March 2003 1517 - 1518 - 1519 - t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */ 1520 - k >= bias + tmax ? 
tmax : k - bias; 1521 - if (q < t) break; 1522 - output[out++] = encode_digit(t + (q - t) % (base - t), 0); 1523 - q = (q - t) / (base - t); 1524 - } 1525 - 1526 - output[out++] = encode_digit(q, case_flags && case_flags[j]); 1527 - bias = adapt(delta, h + 1, h == b); 1528 - delta = 0; 1529 - ++h; 1530 - } 1531 - } 1532 - 1533 - ++delta, ++n; 1534 - } 1535 - 1536 - *output_length = out; 1537 - return punycode_success; 1538 - } 1539 - 1540 - /*** Main decode function ***/ 1541 - 1542 - enum punycode_status punycode_decode( 1543 - punycode_uint input_length, 1544 - const char input[], 1545 - punycode_uint *output_length, 1546 - punycode_uint output[], 1547 - unsigned char case_flags[] ) 1548 - { 1549 - punycode_uint n, out, i, max_out, bias, 1550 - b, j, in, oldi, w, k, digit, t; 1551 - 1552 - /* Initialize the state: */ 1553 - 1554 - n = initial_n; 1555 - out = i = 0; 1556 - max_out = *output_length; 1557 - bias = initial_bias; 1558 - 1559 - /* Handle the basic code points: Let b be the number of input code */ 1560 - /* points before the last delimiter, or 0 if there is none, then */ 1561 - /* copy the first b code points to the output. */ 1562 - 1563 - for (b = j = 0; j < input_length; ++j) if (delim(input[j])) b = j; 1564 - if (b > max_out) return punycode_big_output; 1565 - 1566 - for (j = 0; j < b; ++j) { 1567 - 1568 - 1569 - 1570 - Costello Standards Track [Page 28] 1571 - 1572 - RFC 3492 IDNA Punycode March 2003 1573 - 1574 - 1575 - if (case_flags) case_flags[out] = flagged(input[j]); 1576 - if (!basic(input[j])) return punycode_bad_input; 1577 - output[out++] = input[j]; 1578 - } 1579 - 1580 - /* Main decoding loop: Start just after the last delimiter if any */ 1581 - /* basic code points were copied; start at the beginning otherwise. */ 1582 - 1583 - for (in = b > 0 ? b + 1 : 0; in < input_length; ++out) { 1584 - 1585 - /* in is the index of the next character to be consumed, and */ 1586 - /* out is the number of code points in the output array. 
*/ 1587 - 1588 - /* Decode a generalized variable-length integer into delta, */ 1589 - /* which gets added to i. The overflow checking is easier */ 1590 - /* if we increase i as we go, then subtract off its starting */ 1591 - /* value at the end to obtain delta. */ 1592 - 1593 - for (oldi = i, w = 1, k = base; ; k += base) { 1594 - if (in >= input_length) return punycode_bad_input; 1595 - digit = decode_digit(input[in++]); 1596 - if (digit >= base) return punycode_bad_input; 1597 - if (digit > (maxint - i) / w) return punycode_overflow; 1598 - i += digit * w; 1599 - t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */ 1600 - k >= bias + tmax ? tmax : k - bias; 1601 - if (digit < t) break; 1602 - if (w > maxint / (base - t)) return punycode_overflow; 1603 - w *= (base - t); 1604 - } 1605 - 1606 - bias = adapt(i - oldi, out + 1, oldi == 0); 1607 - 1608 - /* i was supposed to wrap around from out+1 to 0, */ 1609 - /* incrementing n each time, so we'll fix that now: */ 1610 - 1611 - if (i / (out + 1) > maxint - n) return punycode_overflow; 1612 - n += i / (out + 1); 1613 - i %= (out + 1); 1614 - 1615 - /* Insert n at position i of the output: */ 1616 - 1617 - /* not needed for Punycode: */ 1618 - /* if (decode_digit(n) <= base) return punycode_bad_input; */ 1619 - if (out >= max_out) return punycode_big_output; 1620 - 1621 - if (case_flags) { 1622 - memmove(case_flags + i + 1, case_flags + i, out - i); 1623 - 1624 - 1625 - 1626 - Costello Standards Track [Page 29] 1627 - 1628 - RFC 3492 IDNA Punycode March 2003 1629 - 1630 - 1631 - /* Case of last character determines uppercase flag: */ 1632 - case_flags[i] = flagged(input[in - 1]); 1633 - } 1634 - 1635 - memmove(output + i + 1, output + i, (out - i) * sizeof *output); 1636 - output[i++] = n; 1637 - } 1638 - 1639 - *output_length = out; 1640 - return punycode_success; 1641 - } 1642 - 1643 - /******************************************************************/ 1644 - /* Wrapper for testing (would normally go in 
a separate .c file): */ 1645 - 1646 - #include <assert.h> 1647 - #include <stdio.h> 1648 - #include <stdlib.h> 1649 - #include <string.h> 1650 - 1651 - /* For testing, we'll just set some compile-time limits rather than */ 1652 - /* use malloc(), and set a compile-time option rather than using a */ 1653 - /* command-line option. */ 1654 - 1655 - enum { 1656 - unicode_max_length = 256, 1657 - ace_max_length = 256 1658 - }; 1659 - 1660 - static void usage(char **argv) 1661 - { 1662 - fprintf(stderr, 1663 - "\n" 1664 - "%s -e reads code points and writes a Punycode string.\n" 1665 - "%s -d reads a Punycode string and writes code points.\n" 1666 - "\n" 1667 - "Input and output are plain text in the native character set.\n" 1668 - "Code points are in the form u+hex separated by whitespace.\n" 1669 - "Although the specification allows Punycode strings to contain\n" 1670 - "any characters from the ASCII repertoire, this test code\n" 1671 - "supports only the printable characters, and needs the Punycode\n" 1672 - "string to be followed by a newline.\n" 1673 - "The case of the u in u+hex is the force-to-uppercase flag.\n" 1674 - , argv[0], argv[0]); 1675 - exit(EXIT_FAILURE); 1676 - } 1677 - 1678 - static void fail(const char *msg) 1679 - 1680 - 1681 - 1682 - Costello Standards Track [Page 30] 1683 - 1684 - RFC 3492 IDNA Punycode March 2003 1685 - 1686 - 1687 - { 1688 - fputs(msg,stderr); 1689 - exit(EXIT_FAILURE); 1690 - } 1691 - 1692 - static const char too_big[] = 1693 - "input or output is too large, recompile with larger limits\n"; 1694 - static const char invalid_input[] = "invalid input\n"; 1695 - static const char overflow[] = "arithmetic overflow\n"; 1696 - static const char io_error[] = "I/O error\n"; 1697 - 1698 - /* The following string is used to convert printable */ 1699 - /* characters between ASCII and the native charset: */ 1700 - 1701 - static const char print_ascii[] = 1702 - "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" 1703 - "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" 
1704 - " !\"#$%&'()*+,-./" 1705 - "0123456789:;<=>?" 1706 - "@ABCDEFGHIJKLMNO" 1707 - "PQRSTUVWXYZ[\\]^_" 1708 - "`abcdefghijklmno" 1709 - "pqrstuvwxyz{|}~\n"; 1710 - 1711 - int main(int argc, char **argv) 1712 - { 1713 - enum punycode_status status; 1714 - int r; 1715 - unsigned int input_length, output_length, j; 1716 - unsigned char case_flags[unicode_max_length]; 1717 - 1718 - if (argc != 2) usage(argv); 1719 - if (argv[1][0] != '-') usage(argv); 1720 - if (argv[1][2] != 0) usage(argv); 1721 - 1722 - if (argv[1][1] == 'e') { 1723 - punycode_uint input[unicode_max_length]; 1724 - unsigned long codept; 1725 - char output[ace_max_length+1], uplus[3]; 1726 - int c; 1727 - 1728 - /* Read the input code points: */ 1729 - 1730 - input_length = 0; 1731 - 1732 - for (;;) { 1733 - r = scanf("%2s%lx", uplus, &codept); 1734 - if (ferror(stdin)) fail(io_error); 1735 - 1736 - 1737 - 1738 - Costello Standards Track [Page 31] 1739 - 1740 - RFC 3492 IDNA Punycode March 2003 1741 - 1742 - 1743 - if (r == EOF || r == 0) break; 1744 - 1745 - if (r != 2 || uplus[1] != '+' || codept > (punycode_uint)-1) { 1746 - fail(invalid_input); 1747 - } 1748 - 1749 - if (input_length == unicode_max_length) fail(too_big); 1750 - 1751 - if (uplus[0] == 'u') case_flags[input_length] = 0; 1752 - else if (uplus[0] == 'U') case_flags[input_length] = 1; 1753 - else fail(invalid_input); 1754 - 1755 - input[input_length++] = codept; 1756 - } 1757 - 1758 - /* Encode: */ 1759 - 1760 - output_length = ace_max_length; 1761 - status = punycode_encode(input_length, input, case_flags, 1762 - &output_length, output); 1763 - if (status == punycode_bad_input) fail(invalid_input); 1764 - if (status == punycode_big_output) fail(too_big); 1765 - if (status == punycode_overflow) fail(overflow); 1766 - assert(status == punycode_success); 1767 - 1768 - /* Convert to native charset and output: */ 1769 - 1770 - for (j = 0; j < output_length; ++j) { 1771 - c = output[j]; 1772 - assert(c >= 0 && c <= 127); 1773 - if 
(print_ascii[c] == 0) fail(invalid_input); 1774 - output[j] = print_ascii[c]; 1775 - } 1776 - 1777 - output[j] = 0; 1778 - r = puts(output); 1779 - if (r == EOF) fail(io_error); 1780 - return EXIT_SUCCESS; 1781 - } 1782 - 1783 - if (argv[1][1] == 'd') { 1784 - char input[ace_max_length+2], *p, *pp; 1785 - punycode_uint output[unicode_max_length]; 1786 - 1787 - /* Read the Punycode input string and convert to ASCII: */ 1788 - 1789 - fgets(input, ace_max_length+2, stdin); 1790 - if (ferror(stdin)) fail(io_error); 1791 - 1792 - 1793 - 1794 - Costello Standards Track [Page 32] 1795 - 1796 - RFC 3492 IDNA Punycode March 2003 1797 - 1798 - 1799 - if (feof(stdin)) fail(invalid_input); 1800 - input_length = strlen(input) - 1; 1801 - if (input[input_length] != '\n') fail(too_big); 1802 - input[input_length] = 0; 1803 - 1804 - for (p = input; *p != 0; ++p) { 1805 - pp = strchr(print_ascii, *p); 1806 - if (pp == 0) fail(invalid_input); 1807 - *p = pp - print_ascii; 1808 - } 1809 - 1810 - /* Decode: */ 1811 - 1812 - output_length = unicode_max_length; 1813 - status = punycode_decode(input_length, input, &output_length, 1814 - output, case_flags); 1815 - if (status == punycode_bad_input) fail(invalid_input); 1816 - if (status == punycode_big_output) fail(too_big); 1817 - if (status == punycode_overflow) fail(overflow); 1818 - assert(status == punycode_success); 1819 - 1820 - /* Output the result: */ 1821 - 1822 - for (j = 0; j < output_length; ++j) { 1823 - r = printf("%s+%04lX\n", 1824 - case_flags[j] ? 
"U" : "u", 1825 - (unsigned long) output[j] ); 1826 - if (r < 0) fail(io_error); 1827 - } 1828 - 1829 - return EXIT_SUCCESS; 1830 - } 1831 - 1832 - usage(argv); 1833 - return EXIT_SUCCESS; /* not reached, but quiets compiler warning */ 1834 - } 1835 - 1836 - 1837 - 1838 - 1839 - 1840 - 1841 - 1842 - 1843 - 1844 - 1845 - 1846 - 1847 - 1848 - 1849 - 1850 - Costello Standards Track [Page 33] 1851 - 1852 - RFC 3492 IDNA Punycode March 2003 1853 - 1854 - 1855 - Author's Address 1856 - 1857 - Adam M. Costello 1858 - University of California, Berkeley 1859 - http://www.nicemice.net/amc/ 1860 - 1861 - 1862 - 1863 - 1864 - 1865 - 1866 - 1867 - 1868 - 1869 - 1870 - 1871 - 1872 - 1873 - 1874 - 1875 - 1876 - 1877 - 1878 - 1879 - 1880 - 1881 - 1882 - 1883 - 1884 - 1885 - 1886 - 1887 - 1888 - 1889 - 1890 - 1891 - 1892 - 1893 - 1894 - 1895 - 1896 - 1897 - 1898 - 1899 - 1900 - 1901 - 1902 - 1903 - 1904 - 1905 - 1906 - Costello Standards Track [Page 34] 1907 - 1908 - RFC 3492 IDNA Punycode March 2003 1909 - 1910 - 1911 - Full Copyright Statement 1912 - 1913 - Copyright (C) The Internet Society (2003). All Rights Reserved. 1914 - 1915 - This document and translations of it may be copied and furnished to 1916 - others, and derivative works that comment on or otherwise explain it 1917 - or assist in its implementation may be prepared, copied, published 1918 - and distributed, in whole or in part, without restriction of any 1919 - kind, provided that the above copyright notice and this paragraph are 1920 - included on all such copies and derivative works. 
However, this 1921 - document itself may not be modified in any way, such as by removing 1922 - the copyright notice or references to the Internet Society or other 1923 - Internet organizations, except as needed for the purpose of 1924 - developing Internet standards in which case the procedures for 1925 - copyrights defined in the Internet Standards process must be 1926 - followed, or as required to translate it into languages other than 1927 - English. 1928 - 1929 - The limited permissions granted above are perpetual and will not be 1930 - revoked by the Internet Society or its successors or assigns. 1931 - 1932 - This document and the information contained herein is provided on an 1933 - "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING 1934 - TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING 1935 - BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION 1936 - HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF 1937 - MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. 1938 - 1939 - Acknowledgement 1940 - 1941 - Funding for the RFC Editor function is currently provided by the 1942 - Internet Society. 1943 - 1944 - 1945 - 1946 - 1947 - 1948 - 1949 - 1950 - 1951 - 1952 - 1953 - 1954 - 1955 - 1956 - 1957 - 1958 - 1959 - 1960 - 1961 - 1962 - Costello Standards Track [Page 35] 1963 -
-955
ocaml-punycode/spec/rfc5891.txt
··· 1 - 2 - 3 - 4 - 5 - 6 - 7 - Internet Engineering Task Force (IETF) J. Klensin 8 - Request for Comments: 5891 August 2010 9 - Obsoletes: 3490, 3491 10 - Updates: 3492 11 - Category: Standards Track 12 - ISSN: 2070-1721 13 - 14 - 15 - Internationalized Domain Names in Applications (IDNA): Protocol 16 - 17 - Abstract 18 - 19 - This document is the revised protocol definition for 20 - Internationalized Domain Names (IDNs). The rationale for changes, 21 - the relationship to the older specification, and important 22 - terminology are provided in other documents. This document specifies 23 - the protocol mechanism, called Internationalized Domain Names in 24 - Applications (IDNA), for registering and looking up IDNs in a way 25 - that does not require changes to the DNS itself. IDNA is only meant 26 - for processing domain names, not free text. 27 - 28 - Status of This Memo 29 - 30 - This is an Internet Standards Track document. 31 - 32 - This document is a product of the Internet Engineering Task Force 33 - (IETF). It represents the consensus of the IETF community. It has 34 - received public review and has been approved for publication by the 35 - Internet Engineering Steering Group (IESG). Further information on 36 - Internet Standards is available in Section 2 of RFC 5741. 37 - 38 - Information about the current status of this document, any errata, 39 - and how to provide feedback on it may be obtained at 40 - http://www.rfc-editor.org/info/rfc5891. 41 - 42 - 43 - 44 - 45 - 46 - 47 - 48 - 49 - 50 - 51 - 52 - 53 - 54 - 55 - 56 - 57 - 58 - Klensin Standards Track [Page 1] 59 - 60 - RFC 5891 IDNA2008 Protocol August 2010 61 - 62 - 63 - Copyright Notice 64 - 65 - Copyright (c) 2010 IETF Trust and the persons identified as the 66 - document authors. All rights reserved. 
67 - 68 - This document is subject to BCP 78 and the IETF Trust's Legal 69 - Provisions Relating to IETF Documents 70 - (http://trustee.ietf.org/license-info) in effect on the date of 71 - publication of this document. Please review these documents 72 - carefully, as they describe your rights and restrictions with respect 73 - to this document. Code Components extracted from this document must 74 - include Simplified BSD License text as described in Section 4.e of 75 - the Trust Legal Provisions and are provided without warranty as 76 - described in the Simplified BSD License. 77 - 78 - This document may contain material from IETF Documents or IETF 79 - Contributions published or made publicly available before November 80 - 10, 2008. The person(s) controlling the copyright in some of this 81 - material may not have granted the IETF Trust the right to allow 82 - modifications of such material outside the IETF Standards Process. 83 - Without obtaining an adequate license from the person(s) controlling 84 - the copyright in such materials, this document may not be modified 85 - outside the IETF Standards Process, and derivative works of it may 86 - not be created outside the IETF Standards Process, except to format 87 - it for publication as an RFC or to translate it into languages other 88 - than English. 89 - 90 - 91 - 92 - 93 - 94 - 95 - 96 - 97 - 98 - 99 - 100 - 101 - 102 - 103 - 104 - 105 - 106 - 107 - 108 - 109 - 110 - 111 - 112 - 113 - 114 - Klensin Standards Track [Page 2] 115 - 116 - RFC 5891 IDNA2008 Protocol August 2010 117 - 118 - 119 - Table of Contents 120 - 121 - 1. Introduction . . . . . . . . . . . . . . . . . . . . . . . . . 4 122 - 2. Terminology . . . . . . . . . . . . . . . . . . . . . . . . . 4 123 - 3. Requirements and Applicability . . . . . . . . . . . . . . . . 5 124 - 3.1. Requirements . . . . . . . . . . . . . . . . . . . . . . . 5 125 - 3.2. Applicability . . . . . . . . . . . . . . . . . . . . . . 5 126 - 3.2.1. DNS Resource Records . . . 
. . . . . . . . . . . . . . 6 127 - 3.2.2. Non-Domain-Name Data Types Stored in the DNS . . . . . 6 128 - 4. Registration Protocol . . . . . . . . . . . . . . . . . . . . 6 129 - 4.1. Input to IDNA Registration . . . . . . . . . . . . . . . . 7 130 - 4.2. Permitted Character and Label Validation . . . . . . . . . 7 131 - 4.2.1. Input Format . . . . . . . . . . . . . . . . . . . . . 7 132 - 4.2.2. Rejection of Characters That Are Not Permitted . . . . 8 133 - 4.2.3. Label Validation . . . . . . . . . . . . . . . . . . . 8 134 - 4.2.4. Registration Validation Requirements . . . . . . . . . 9 135 - 4.3. Registry Restrictions . . . . . . . . . . . . . . . . . . 9 136 - 4.4. Punycode Conversion . . . . . . . . . . . . . . . . . . . 9 137 - 4.5. Insertion in the Zone . . . . . . . . . . . . . . . . . . 10 138 - 5. Domain Name Lookup Protocol . . . . . . . . . . . . . . . . . 10 139 - 5.1. Label String Input . . . . . . . . . . . . . . . . . . . . 10 140 - 5.2. Conversion to Unicode . . . . . . . . . . . . . . . . . . 10 141 - 5.3. A-label Input . . . . . . . . . . . . . . . . . . . . . . 10 142 - 5.4. Validation and Character List Testing . . . . . . . . . . 11 143 - 5.5. Punycode Conversion . . . . . . . . . . . . . . . . . . . 13 144 - 5.6. DNS Name Resolution . . . . . . . . . . . . . . . . . . . 13 145 - 6. Security Considerations . . . . . . . . . . . . . . . . . . . 13 146 - 7. IANA Considerations . . . . . . . . . . . . . . . . . . . . . 13 147 - 8. Contributors . . . . . . . . . . . . . . . . . . . . . . . . . 13 148 - 9. Acknowledgments . . . . . . . . . . . . . . . . . . . . . . . 14 149 - 10. References . . . . . . . . . . . . . . . . . . . . . . . . . . 14 150 - 10.1. Normative References . . . . . . . . . . . . . . . . . . . 14 151 - 10.2. Informative References . . . . . . . . . . . . . . . . . . 15 152 - Appendix A. Summary of Major Changes from IDNA2003 . . . . . . . 
17 153 - 154 - 155 - 156 - 157 - 158 - 159 - 160 - 161 - 162 - 163 - 164 - 165 - 166 - 167 - 168 - 169 - 170 - Klensin Standards Track [Page 3] 171 - 172 - RFC 5891 IDNA2008 Protocol August 2010 173 - 174 - 175 - 1. Introduction 176 - 177 - This document supplies the protocol definition for Internationalized 178 - Domain Names in Applications (IDNA), with the version specified here 179 - known as IDNA2008. Essential definitions and terminology for 180 - understanding this document and a road map of the collection of 181 - documents that make up IDNA2008 appear in a separate Definitions 182 - document [RFC5890]. Appendix A discusses the relationship between 183 - this specification and the earlier version of IDNA (referred to here 184 - as "IDNA2003"). The rationale for these changes, along with 185 - considerable explanatory material and advice to zone administrators 186 - who support IDNs, is provided in another document, known informally 187 - in this series as the "Rationale document" [RFC5894]. 188 - 189 - IDNA works by allowing applications to use certain ASCII [ASCII] 190 - string labels (beginning with a special prefix) to represent 191 - non-ASCII name labels. Lower-layer protocols need not be aware of 192 - this; therefore, IDNA does not change any infrastructure. In 193 - particular, IDNA does not depend on any changes to DNS servers, 194 - resolvers, or DNS protocol elements, because the ASCII name service 195 - provided by the existing DNS can be used for IDNA. 196 - 197 - IDNA applies only to a specific subset of DNS labels. The base DNS 198 - standards [RFC1034] [RFC1035] and their various updates specify how 199 - to combine labels into fully-qualified domain names and parse labels 200 - out of those names. 201 - 202 - This document describes two separate protocols, one for IDN 203 - registration (Section 4) and one for IDN lookup (Section 5). These 204 - two protocols share some terminology, reference data, and operations. 205 - 206 - 2. 
Terminology 207 - 208 - As mentioned above, terminology used as part of the definition of 209 - IDNA appears in the Definitions document [RFC5890]. It is worth 210 - noting that some of this terminology overlaps with, and is consistent 211 - with, that used in Unicode or other character set standards and the 212 - DNS. Readers of this document are assumed to be familiar with the 213 - associated Definitions document and with the DNS-specific terminology 214 - in RFC 1034 [RFC1034]. 215 - 216 - The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", 217 - "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this 218 - document are to be interpreted as described in BCP 14, RFC 2119 219 - [RFC2119]. 220 - 221 - 222 - 223 - 224 - 225 - 226 - Klensin Standards Track [Page 4] 227 - 228 - RFC 5891 IDNA2008 Protocol August 2010 229 - 230 - 231 - 3. Requirements and Applicability 232 - 233 - 3.1. Requirements 234 - 235 - IDNA makes the following requirements: 236 - 237 - 1. Whenever a domain name is put into a domain name slot that is not 238 - IDNA-aware (see Section 2.3.2.6 of the Definitions document 239 - [RFC5890]), it MUST contain only ASCII characters (i.e., its 240 - labels must be either A-labels or NR-LDH labels), unless the DNS 241 - application is not subject to historical recommendations for 242 - "hostname"-style names (see RFC 1034 [RFC1034] and 243 - Section 3.2.1). 244 - 245 - 2. Labels MUST be compared using equivalent forms: either both 246 - A-label forms or both U-label forms. Because A-labels and 247 - U-labels can be transformed into each other without loss of 248 - information, these comparisons are equivalent (however, in 249 - practice, comparison of U-labels requires first verifying that 250 - they actually are U-labels and not just Unicode strings). A pair 251 - of A-labels MUST be compared as case-insensitive ASCII (as with 252 - all comparisons of ASCII DNS labels). 
U-labels MUST be compared 253 - as-is, without case folding or other intermediate steps. While 254 - it is not necessary to validate labels in order to compare them, 255 - successful comparison does not imply validity. In many cases, 256 - not limited to comparison, validation may be important for other 257 - reasons and SHOULD be performed. 258 - 259 - 3. Labels being registered MUST conform to the requirements of 260 - Section 4. Labels being looked up and the lookup process MUST 261 - conform to the requirements of Section 5. 262 - 263 - 3.2. Applicability 264 - 265 - IDNA applies to all domain names in all domain name slots in 266 - protocols except where it is explicitly excluded. It does not apply 267 - to domain name slots that do not use the LDH syntax rules as 268 - described in the Definitions document [RFC5890]. 269 - 270 - Because it uses the DNS, IDNA applies to many protocols that were 271 - specified before it was designed. IDNs occupying domain name slots 272 - in those older protocols MUST be in A-label form until and unless 273 - those protocols and their implementations are explicitly upgraded to 274 - be aware of IDNs and to accept the U-label form. IDNs actually 275 - appearing in DNS queries or responses MUST be A-labels. 276 - 277 - 278 - 279 - 280 - 281 - 282 - Klensin Standards Track [Page 5] 283 - 284 - RFC 5891 IDNA2008 Protocol August 2010 285 - 286 - 287 - IDNA-aware protocols and implementations MAY accept U-labels, 288 - A-labels, or both as those particular protocols specify. IDNA is not 289 - defined for extended label types (see RFC 2671 [RFC2671], Section 3). 290 - 291 - 3.2.1. DNS Resource Records 292 - 293 - IDNA applies only to domain names in the NAME and RDATA fields of DNS 294 - resource records whose CLASS is IN. See the DNS specification 295 - [RFC1035] for precise definitions of these terms. 
296 - 297 - The application of IDNA to DNS resource records depends entirely on 298 - the CLASS of the record, and not on the TYPE except as noted below. 299 - This will remain true, even as new TYPEs are defined, unless a new 300 - TYPE defines TYPE-specific rules. Special naming conventions for SRV 301 - records (and "underscore labels" more generally) are incompatible 302 - with IDNA coding as discussed in the Definitions document [RFC5890], 303 - especially Section 2.3.2.3. Of course, underscore labels may be part 304 - of a domain that uses IDN labels at higher levels in the tree. 305 - 306 - 3.2.2. Non-Domain-Name Data Types Stored in the DNS 307 - 308 - Although IDNA enables the representation of non-ASCII characters in 309 - domain names, that does not imply that IDNA enables the 310 - representation of non-ASCII characters in other data types that are 311 - stored in domain names, specifically in the RDATA field for types 312 - that have structured RDATA format. For example, an email address 313 - local part is stored in a domain name in the RNAME field as part of 314 - the RDATA of an SOA record (e.g., hostmaster@example.com would be 315 - represented as hostmaster.example.com). IDNA does not update the 316 - existing email standards, which allow only ASCII characters in local 317 - parts. Even though work is in progress to define 318 - internationalization for email addresses [RFC4952], changes to the 319 - email address part of the SOA RDATA would require action in, or 320 - updates to, other standards, specifically those that specify the 321 - format of the SOA RR. 322 - 323 - 4. Registration Protocol 324 - 325 - This section defines the model for registering an IDN. The model is 326 - implementation independent; any sequence of steps that produces 327 - exactly the same result for all labels is considered a valid 328 - implementation. 
329 - 330 - Note that, while the registration (this section) and lookup protocols 331 - (Section 5) are very similar in most respects, they are not 332 - identical, and implementers should carefully follow the steps 333 - described in this specification. 334 - 335 - 336 - 337 - 338 - Klensin Standards Track [Page 6] 339 - 340 - RFC 5891 IDNA2008 Protocol August 2010 341 - 342 - 343 - 4.1. Input to IDNA Registration 344 - 345 - Registration processes, especially processing by entities (often 346 - called "registrars") who deal with registrants before the request 347 - actually reaches the zone manager ("registry") are outside the scope 348 - of this definition and may differ significantly depending on local 349 - needs. By the time a string enters the IDNA registration process as 350 - described in this specification, it MUST be in Unicode and in 351 - Normalization Form C (NFC [Unicode-UAX15]). Entities responsible for 352 - zone files ("registries") MUST accept only the exact string for which 353 - registration is requested, free of any mappings or local adjustments. 354 - They MAY accept that input in any of three forms: 355 - 356 - 1. As a pair of A-label and U-label. 357 - 358 - 2. As an A-label only. 359 - 360 - 3. As a U-label only. 361 - 362 - The first two of these forms are RECOMMENDED because the use of 363 - A-labels avoids any possibility of ambiguity. The first is normally 364 - preferred over the second because it permits further verification of 365 - user intent (see Section 4.2.1). 366 - 367 - 4.2. Permitted Character and Label Validation 368 - 369 - 4.2.1. Input Format 370 - 371 - If both the U-label and A-label forms are available, the registry 372 - MUST ensure that the A-label form is in lowercase, perform a 373 - conversion to a U-label, perform the steps and tests described below 374 - on that U-label, and then verify that the A-label produced by the 375 - step in Section 4.4 matches the one provided as input. 
In addition, 376 - the U-label that was provided as input and the one obtained by 377 - conversion of the A-label MUST match exactly. If, for some reason, 378 - these tests fail, the registration MUST be rejected. 379 - 380 - If only an A-label was provided and the conversion to a U-label is 381 - not performed, the registry MUST still verify that the A-label is 382 - superficially valid, i.e., that it does not violate any of the rules 383 - of Punycode encoding [RFC3492] such as the prohibition on trailing 384 - hyphen-minus, the requirement that all characters be ASCII, and so 385 - on. Strings that appear to be A-labels (e.g., they start with 386 - "xn--") and strings that are supplied to the registry in a context 387 - reserved for A-labels (such as a field in a form to be filled out), 388 - but that are not valid A-labels as described in this paragraph, MUST 389 - NOT be placed in DNS zones that support IDNA. 390 - 391 - 392 - 393 - 394 - Klensin Standards Track [Page 7] 395 - 396 - RFC 5891 IDNA2008 Protocol August 2010 397 - 398 - 399 - If only an A-label is provided, the conversion to a U-label is not 400 - performed, but the superficial tests described in the previous 401 - paragraph are performed, registration procedures MAY, and usually 402 - will, bypass the tests and actions in the balance of Section 4.2 and 403 - in Sections 4.3 and 4.4. 404 - 405 - 4.2.2. Rejection of Characters That Are Not Permitted 406 - 407 - The candidate Unicode string MUST NOT contain characters that appear 408 - in the "DISALLOWED" and "UNASSIGNED" lists specified in the Tables 409 - document [RFC5892]. 410 - 411 - 4.2.3. Label Validation 412 - 413 - The proposed label (in the form of a Unicode string, i.e., a string 414 - that at least superficially appears to be a U-label) is then examined 415 - using tests that require examination of more than one character. 416 - Character order is considered to be the on-the-wire order. 
That 417 - order may not be the same as the display order. 418 - 419 - 4.2.3.1. Hyphen Restrictions 420 - 421 - The Unicode string MUST NOT contain "--" (two consecutive hyphens) in 422 - the third and fourth character positions and MUST NOT start or end 423 - with a "-" (hyphen). 424 - 425 - 4.2.3.2. Leading Combining Marks 426 - 427 - The Unicode string MUST NOT begin with a combining mark or combining 428 - character (see The Unicode Standard, Section 2.11 [Unicode] for an 429 - exact definition). 430 - 431 - 4.2.3.3. Contextual Rules 432 - 433 - The Unicode string MUST NOT contain any characters whose validity is 434 - context-dependent, unless the validity is positively confirmed by a 435 - contextual rule. To check this, each code point identified as 436 - CONTEXTJ or CONTEXTO in the Tables document [RFC5892] MUST have a 437 - non-null rule. If such a code point is missing a rule, the label is 438 - invalid. If the rule exists but the result of applying the rule is 439 - negative or inconclusive, the proposed label is invalid. 440 - 441 - 4.2.3.4. Labels Containing Characters Written Right to Left 442 - 443 - If the proposed label contains any characters from scripts that are 444 - written from right to left, it MUST meet the Bidi criteria [RFC5893]. 445 - 446 - 447 - 448 - 449 - 450 - Klensin Standards Track [Page 8] 451 - 452 - RFC 5891 IDNA2008 Protocol August 2010 453 - 454 - 455 - 4.2.4. Registration Validation Requirements 456 - 457 - Strings that contain at least one non-ASCII character, have been 458 - produced by the steps above, whose contents pass all of the tests in 459 - Section 4.2.3, and are 63 or fewer characters long in 460 - ASCII-compatible encoding (ACE) form (see Section 4.4), are U-labels. 
461 - 462 - To summarize, tests are made in Section 4.2 for invalid characters, 463 - invalid combinations of characters, for labels that are invalid even 464 - if the characters they contain are valid individually, and for labels 465 - that do not conform to the restrictions for strings containing 466 - right-to-left characters. 467 - 468 - 4.3. Registry Restrictions 469 - 470 - In addition to the rules and tests above, there are many reasons why 471 - a registry could reject a label. Registries at all levels of the 472 - DNS, not just the top level, are expected to establish policies about 473 - label registrations. Policies are likely to be informed by the local 474 - languages and the scripts that are used to write them and may depend 475 - on many factors including what characters are in the label (for 476 - example, a label may be rejected based on other labels already 477 - registered). See the Rationale document [RFC5894], Section 3.2, for 478 - further discussion and recommendations about registry policies. 479 - 480 - The string produced by the steps in Section 4.2 is checked and 481 - processed as appropriate to local registry restrictions. Application 482 - of those registry restrictions may result in the rejection of some 483 - labels or the application of special restrictions to others. 484 - 485 - 4.4. Punycode Conversion 486 - 487 - The resulting U-label is converted to an A-label (defined in Section 488 - 2.3.2.1 of the Definitions document [RFC5890]). The A-label is the 489 - encoding of the U-label according to the Punycode algorithm [RFC3492] 490 - with the ACE prefix "xn--" added at the beginning of the string. The 491 - resulting string must, of course, conform to the length limits 492 - imposed by the DNS. This document does not update or alter the 493 - Punycode algorithm specified in RFC 3492 in any way. 
RFC 3492 does 494 - make a non-normative reference to the information about the value and 495 - construction of the ACE prefix that appears in RFC 3490 or Nameprep 496 - [RFC3491]. For consistency and reader convenience, IDNA2008 497 - effectively updates that reference to point to this document. That 498 - change does not alter the prefix itself. The prefix, "xn--", is the 499 - same in both sets of documents. 500 - 501 - 502 - 503 - 504 - 505 - 506 - Klensin Standards Track [Page 9] 507 - 508 - RFC 5891 IDNA2008 Protocol August 2010 509 - 510 - 511 - With the exception of the maximum string length test on Punycode 512 - output, the failure conditions identified in the Punycode encoding 513 - procedure cannot occur if the input is a U-label as determined by the 514 - steps in Sections 4.1 through 4.3 above. 515 - 516 - 4.5. Insertion in the Zone 517 - 518 - The label is registered in the DNS by inserting the A-label into a 519 - zone. 520 - 521 - 5. Domain Name Lookup Protocol 522 - 523 - Lookup is different from registration and different tests are applied 524 - on the client. Although some validity checks are necessary to avoid 525 - serious problems with the protocol, the lookup-side tests are more 526 - permissive and rely on the assumption that names that are present in 527 - the DNS are valid. That assumption is, however, a weak one because 528 - the presence of wildcards in the DNS might cause a string that is not 529 - actually registered in the DNS to be successfully looked up. 530 - 531 - 5.1. Label String Input 532 - 533 - The user supplies a string in the local character set, for example, 534 - by typing it, clicking on it, or copying and pasting it from a 535 - resource identifier, e.g., a Uniform Resource Identifier (URI) 536 - [RFC3986] or an Internationalized Resource Identifier (IRI) 537 - [RFC3987], from which the domain name is extracted. 
Alternately, 538 - some process not directly involving the user may read the string from 539 - a file or obtain it in some other way. Processing in this step and 540 - the one specified in Section 5.2 are local matters, to be 541 - accomplished prior to actual invocation of IDNA. 542 - 543 - 5.2. Conversion to Unicode 544 - 545 - The string is converted from the local character set into Unicode, if 546 - it is not already in Unicode. Depending on local needs, this 547 - conversion may involve mapping some characters into other characters 548 - as well as coding conversions. Those issues are discussed in the 549 - mapping-related sections (Sections 4.2, 4.4, 6, and 7.3) of the 550 - Rationale document [RFC5894] and in the separate Mapping document 551 - [IDNA2008-Mapping]. The result MUST be a Unicode string in NFC form. 552 - 553 - 5.3. A-label Input 554 - 555 - If the input to this procedure appears to be an A-label (i.e., it 556 - starts in "xn--", interpreted case-insensitively), the lookup 557 - application MAY attempt to convert it to a U-label, first ensuring 558 - that the A-label is entirely in lowercase (converting it to lowercase 559 - 560 - 561 - 562 - Klensin Standards Track [Page 10] 563 - 564 - RFC 5891 IDNA2008 Protocol August 2010 565 - 566 - 567 - if necessary), and apply the tests of Section 5.4 and the conversion 568 - of Section 5.5 to that form. If the label is converted to Unicode 569 - (i.e., to U-label form) using the Punycode decoding algorithm, then 570 - the processing specified in those two sections MUST be performed, and 571 - the label MUST be rejected if the resulting label is not identical to 572 - the original. See Section 8.1 of the Rationale document [RFC5894] 573 - for additional discussion on this topic. 
574 - 575 - Conversion from the A-label and testing that the result is a U-label 576 - SHOULD be performed if the domain name will later be presented to the 577 - user in native character form (this requires that the lookup 578 - application be IDNA-aware). If those steps are not performed, the 579 - lookup process SHOULD at least test to determine that the string is 580 - actually an A-label, examining it for the invalid formats specified 581 - in the Punycode decoding specification. Applications that are not 582 - IDNA-aware will obviously omit that testing; others MAY treat the 583 - string as opaque to avoid the additional processing at the expense of 584 - providing less protection and information to users. 585 - 586 - 5.4. Validation and Character List Testing 587 - 588 - As with the registration procedure described in Section 4, the 589 - Unicode string is checked to verify that all characters that appear 590 - in it are valid as input to IDNA lookup processing. As discussed 591 - above and in the Rationale document [RFC5894], the lookup check is 592 - more liberal than the registration one. Labels that have not been 593 - fully evaluated for conformance to the applicable rules are referred 594 - to as "putative" labels as discussed in Section 2.3.2.1 of the 595 - Definitions document [RFC5890]. Putative U-labels with any of the 596 - following characteristics MUST be rejected prior to DNS lookup: 597 - 598 - o Labels that are not in NFC [Unicode-UAX15]. 599 - 600 - o Labels containing "--" (two consecutive hyphens) in the third and 601 - fourth character positions. 602 - 603 - o Labels whose first character is a combining mark (see The Unicode 604 - Standard, Section 2.11 [Unicode]). 605 - 606 - o Labels containing prohibited code points, i.e., those that are 607 - assigned to the "DISALLOWED" category of the Tables document 608 - [RFC5892]. 
609 - 610 - o Labels containing code points that are identified in the Tables 611 - document as "CONTEXTJ", i.e., requiring exceptional contextual 612 - rule processing on lookup, but that do not conform to those rules. 613 - Note that this implies that a rule must be defined, not null: a 614 - 615 - 616 - 617 - 618 - Klensin Standards Track [Page 11] 619 - 620 - RFC 5891 IDNA2008 Protocol August 2010 621 - 622 - 623 - character that requires a contextual rule but for which the rule 624 - is null is treated in this step as having failed to conform to the 625 - rule. 626 - 627 - o Labels containing code points that are identified in the Tables 628 - document as "CONTEXTO", but for which no such rule appears in the 629 - table of rules. Applications resolving DNS names or carrying out 630 - equivalent operations are not required to test contextual rules 631 - for "CONTEXTO" characters, only to verify that a rule is defined 632 - (although they MAY make such tests to provide better protection or 633 - give better information to the user). 634 - 635 - o Labels containing code points that are unassigned in the version 636 - of Unicode being used by the application, i.e., in the UNASSIGNED 637 - category of the Tables document. 638 - 639 - This requirement means that the application must use a list of 640 - unassigned characters that is matched to the version of Unicode 641 - that is being used for the other requirements in this section. It 642 - is not required that the application know which version of Unicode 643 - is being used; that information might be part of the operating 644 - environment in which the application is running. 645 - 646 - In addition, the application SHOULD apply the following test. 647 - 648 - o Verification that the string is compliant with the requirements 649 - for right-to-left characters specified in the Bidi document 650 - [RFC5893]. 
651 - 652 - This test may be omitted in special circumstances, such as when the 653 - lookup application knows that the conditions are enforced elsewhere, 654 - because an attempt to look up and resolve such strings will almost 655 - certainly lead to a DNS lookup failure except when wildcards are 656 - present in the zone. However, applying the test is likely to give 657 - much better information about the reason for a lookup failure -- 658 - information that may be usefully passed to the user when that is 659 - feasible -- than DNS resolution failure information alone. 660 - 661 - For all other strings, the lookup application MUST rely on the 662 - presence or absence of labels in the DNS to determine the validity of 663 - those labels and the validity of the characters they contain. If 664 - they are registered, they are presumed to be valid; if they are not, 665 - their possible validity is not relevant. While a lookup application 666 - may reasonably issue warnings about strings it believes may be 667 - problematic, applications that decline to process a string that 668 - conforms to the rules above (i.e., does not look it up in the DNS) 669 - are not in conformance with this protocol. 670 - 671 - 672 - 673 - 674 - Klensin Standards Track [Page 12] 675 - 676 - RFC 5891 IDNA2008 Protocol August 2010 677 - 678 - 679 - 5.5. Punycode Conversion 680 - 681 - The string that has now been validated for lookup is converted to ACE 682 - form by applying the Punycode algorithm to the string and then adding 683 - the ACE prefix ("xn--"). 684 - 685 - 5.6. DNS Name Resolution 686 - 687 - The A-label resulting from the conversion in Section 5.5 or supplied 688 - directly (see Section 5.3) is combined with other labels as needed to 689 - form a fully-qualified domain name that is then looked up in the DNS, 690 - using normal DNS resolver procedures. The lookup can obviously 691 - either succeed (returning information) or fail. 692 - 693 - 6. 
Security Considerations 694 - 695 - Security Considerations for this version of IDNA are described in the 696 - Definitions document [RFC5890], except for the special issues 697 - associated with right-to-left scripts and characters. The latter are 698 - discussed in the Bidi document [RFC5893]. 699 - 700 - In order to avoid intentional or accidental attacks from labels that 701 - might be confused with others, special problems in rendering, and so 702 - on, the IDNA model requires that registries exercise care and 703 - thoughtfulness about what labels they choose to permit. That issue 704 - is discussed in Section 4.3 of this document which, in turn, points 705 - to a somewhat more extensive discussion in the Rationale document 706 - [RFC5894]. 707 - 708 - 7. IANA Considerations 709 - 710 - IANA actions for this version of IDNA are specified in the Tables 711 - document [RFC5892] and discussed informally in the Rationale document 712 - [RFC5894]. The components of IDNA described in this document do not 713 - require any IANA actions. 714 - 715 - 8. Contributors 716 - 717 - While the listed editor held the pen, the original versions of this 718 - document represent the joint work and conclusions of an ad hoc design 719 - team consisting of the editor and, in alphabetic order, Harald 720 - Alvestrand, Tina Dam, Patrik Faltstrom, and Cary Karp. This document 721 - draws significantly on the original version of IDNA [RFC3490] both 722 - conceptually and for specific text. This second-generation version 723 - would not have been possible without the work that went into that 724 - first version and especially the contributions of its authors Patrik 725 - Faltstrom, Paul Hoffman, and Adam Costello. 
While Faltstrom was 726 - 727 - 728 - 729 - 730 - Klensin Standards Track [Page 13] 731 - 732 - RFC 5891 IDNA2008 Protocol August 2010 733 - 734 - 735 - actively involved in the creation of this version, Hoffman and 736 - Costello were not and should not be held responsible for any errors 737 - or omissions. 738 - 739 - 9. Acknowledgments 740 - 741 - This revision to IDNA would have been impossible without the 742 - accumulated experience since RFC 3490 was published and resulting 743 - comments and complaints of many people in the IETF, ICANN, and other 744 - communities (too many people to list here). Nor would it have been 745 - possible without RFC 3490 itself and the efforts of the Working Group 746 - that defined it. Those people whose contributions are acknowledged 747 - in RFC 3490, RFC 4690 [RFC4690], and the Rationale document [RFC5894] 748 - were particularly important. 749 - 750 - Specific textual changes were incorporated into this document after 751 - suggestions from the other contributors, Stephane Bortzmeyer, Vint 752 - Cerf, Lisa Dusseault, Paul Hoffman, Kent Karlsson, James Mitchell, 753 - Erik van der Poel, Marcos Sanz, Andrew Sullivan, Wil Tan, Ken 754 - Whistler, Chris Wright, and other WG participants and reviewers 755 - including Martin Duerst, James Mitchell, Subramanian Moonesamy, Peter 756 - Saint-Andre, Margaret Wasserman, and Dan Winship who caught specific 757 - errors and recommended corrections. Special thanks are due to Paul 758 - Hoffman for permission to extract material to form the basis for 759 - Appendix A from a draft document that he prepared. 760 - 761 - 10. References 762 - 763 - 10.1. Normative References 764 - 765 - [RFC1034] Mockapetris, P., "Domain names - concepts and 766 - facilities", STD 13, RFC 1034, November 1987. 767 - 768 - [RFC1035] Mockapetris, P., "Domain names - implementation and 769 - specification", STD 13, RFC 1035, November 1987. 
770 - 771 - [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate 772 - Requirement Levels", BCP 14, RFC 2119, March 1997. 773 - 774 - [RFC3492] Costello, A., "Punycode: A Bootstring encoding of 775 - Unicode for Internationalized Domain Names in 776 - Applications (IDNA)", RFC 3492, March 2003. 777 - 778 - [RFC5890] Klensin, J., "Internationalized Domain Names for 779 - Applications (IDNA): Definitions and Document 780 - Framework", RFC 5890, August 2010. 781 - 782 - 783 - 784 - 785 - 786 - Klensin Standards Track [Page 14] 787 - 788 - RFC 5891 IDNA2008 Protocol August 2010 789 - 790 - 791 - [RFC5892] Faltstrom, P., Ed., "The Unicode Code Points and 792 - Internationalized Domain Names for Applications (IDNA)", 793 - RFC 5892, August 2010. 794 - 795 - [RFC5893] Alvestrand, H., Ed. and C. Karp, "Right-to-Left Scripts 796 - for Internationalized Domain Names for Applications 797 - (IDNA)", RFC 5893, August 2010. 798 - 799 - [Unicode-UAX15] 800 - The Unicode Consortium, "Unicode Standard Annex #15: 801 - Unicode Normalization Forms", September 2009, 802 - <http://www.unicode.org/reports/tr15/>. 803 - 804 - 10.2. Informative References 805 - 806 - [ASCII] American National Standards Institute (formerly United 807 - States of America Standards Institute), "USA Code for 808 - Information Interchange", ANSI X3.4-1968, 1968. ANSI 809 - X3.4-1968 has been replaced by newer versions with 810 - slight modifications, but the 1968 version remains 811 - definitive for the Internet. 812 - 813 - [IDNA2008-Mapping] 814 - Resnick, P. and P. Hoffman, "Mapping Characters in 815 - Internationalized Domain Names for Applications (IDNA)", 816 - Work in Progress, April 2010. 817 - 818 - [RFC2671] Vixie, P., "Extension Mechanisms for DNS (EDNS0)", 819 - RFC 2671, August 1999. 820 - 821 - [RFC3490] Faltstrom, P., Hoffman, P., and A. Costello, 822 - "Internationalizing Domain Names in Applications 823 - (IDNA)", RFC 3490, March 2003. 824 - 825 - [RFC3491] Hoffman, P. and M. 
Blanchet, "Nameprep: A Stringprep 826 - Profile for Internationalized Domain Names (IDN)", 827 - RFC 3491, March 2003. 828 - 829 - [RFC3986] Berners-Lee, T., Fielding, R., and L. Masinter, "Uniform 830 - Resource Identifier (URI): Generic Syntax", STD 66, 831 - RFC 3986, January 2005. 832 - 833 - [RFC3987] Duerst, M. and M. Suignard, "Internationalized Resource 834 - Identifiers (IRIs)", RFC 3987, January 2005. 835 - 836 - [RFC4690] Klensin, J., Faltstrom, P., Karp, C., and IAB, "Review 837 - and Recommendations for Internationalized Domain Names 838 - (IDNs)", RFC 4690, September 2006. 839 - 840 - 841 - 842 - Klensin Standards Track [Page 15] 843 - 844 - RFC 5891 IDNA2008 Protocol August 2010 845 - 846 - 847 - [RFC4952] Klensin, J. and Y. Ko, "Overview and Framework for 848 - Internationalized Email", RFC 4952, July 2007. 849 - 850 - [RFC5894] Klensin, J., "Internationalized Domain Names for 851 - Applications (IDNA): Background, Explanation, and 852 - Rationale", RFC 5894, August 2010. 853 - 854 - [Unicode] The Unicode Consortium, "The Unicode Standard, Version 855 - 5.0", 2007. Boston, MA, USA: Addison-Wesley. ISBN 856 - 0-321-48091-0. This printed reference has now been 857 - updated online to reflect additional code points. For 858 - code points, the reference at the time this document was 859 - published is to Unicode 5.2. 860 - 861 - 862 - 863 - 864 - 865 - 866 - 867 - 868 - 869 - 870 - 871 - 872 - 873 - 874 - 875 - 876 - 877 - 878 - 879 - 880 - 881 - 882 - 883 - 884 - 885 - 886 - 887 - 888 - 889 - 890 - 891 - 892 - 893 - 894 - 895 - 896 - 897 - 898 - Klensin Standards Track [Page 16] 899 - 900 - RFC 5891 IDNA2008 Protocol August 2010 901 - 902 - 903 - Appendix A. Summary of Major Changes from IDNA2003 904 - 905 - 1. Update base character set from Unicode 3.2 to Unicode version 906 - agnostic. 907 - 908 - 2. Separate the definitions for the "registration" and "lookup" 909 - activities. 910 - 911 - 3. 
Disallow symbol and punctuation characters except where special 912 - exceptions are necessary. 913 - 914 - 4. Remove the mapping and normalization steps from the protocol and 915 - have them, instead, done by the applications themselves, 916 - possibly in a local fashion, before invoking the protocol. 917 - 918 - 5. Change the way that the protocol specifies which characters are 919 - allowed in labels from "humans decide what the table of code 920 - points contains" to "decision about code points are based on 921 - Unicode properties plus a small exclusion list created by 922 - humans". 923 - 924 - 6. Introduce the new concept of characters that can be used only in 925 - specific contexts. 926 - 927 - 7. Allow typical words and names in languages such as Dhivehi and 928 - Yiddish to be expressed. 929 - 930 - 8. Make bidirectional domain names (delimited strings of labels, 931 - not just labels standing on their own) display in a less 932 - surprising fashion, whether they appear in obvious domain name 933 - contexts or as part of running text in paragraphs. 934 - 935 - 9. Remove the dot separator from the mandatory part of the 936 - protocol. 937 - 938 - 10. Make some currently valid labels that are not actually IDNA 939 - labels invalid. 940 - 941 - Author's Address 942 - 943 - John C Klensin 944 - 1770 Massachusetts Ave, Ste 322 945 - Cambridge, MA 02140 946 - USA 947 - 948 - Phone: +1 617 245 1457 949 - EMail: john+ietf@jck.com 950 - 951 - 952 - 953 - 954 - Klensin Standards Track [Page 17] 955 -
-3923
ocaml-punycode/spec/rfc5892.txt
··· 1 - 2 - 3 - 4 - 5 - 6 - 7 - Internet Engineering Task Force (IETF) P. Faltstrom, Ed. 8 - Request for Comments: 5892 Cisco 9 - Category: Standards Track August 2010 10 - ISSN: 2070-1721 11 - 12 - 13 - The Unicode Code Points and 14 - Internationalized Domain Names for Applications (IDNA) 15 - 16 - Abstract 17 - 18 - This document specifies rules for deciding whether a code point, 19 - considered in isolation or in context, is a candidate for inclusion 20 - in an Internationalized Domain Name (IDN). 21 - 22 - It is part of the specification of Internationalizing Domain Names in 23 - Applications 2008 (IDNA2008). 24 - 25 - Status of This Memo 26 - 27 - This is an Internet Standards Track document. 28 - 29 - This document is a product of the Internet Engineering Task Force 30 - (IETF). It represents the consensus of the IETF community. It has 31 - received public review and has been approved for publication by the 32 - Internet Engineering Steering Group (IESG). Further information on 33 - Internet Standards is available in Section 2 of RFC 5741. 34 - 35 - Information about the current status of this document, any errata, 36 - and how to provide feedback on it may be obtained at 37 - http://www.rfc-editor.org/info/rfc5892. 38 - 39 - Copyright Notice 40 - 41 - Copyright (c) 2010 IETF Trust and the persons identified as the 42 - document authors. All rights reserved. 43 - 44 - This document is subject to BCP 78 and the IETF Trust's Legal 45 - Provisions Relating to IETF Documents 46 - (http://trustee.ietf.org/license-info) in effect on the date of 47 - publication of this document. Please review these documents 48 - carefully, as they describe your rights and restrictions with respect 49 - to this document. Code Components extracted from this document must 50 - include Simplified BSD License text as described in Section 4.e of 51 - the Trust Legal Provisions and are provided without warranty as 52 - described in the Simplified BSD License. 
53 - 54 - 55 - 56 - 57 - 58 - Faltstrom Standards Track [Page 1] 59 - 60 - RFC 5892 IDNA Code Points August 2010 61 - 62 - 63 - Table of Contents 64 - 65 - 1. Introduction . . . . . . . . . . . . . . . . . . . . . . . . . 3 66 - 2. Category Definitions Used to Calculate Derived Property 67 - Value . . . . . . . . . . . . . . . . . . . . . . . . . . . . 5 68 - 2.1. LetterDigits (A) . . . . . . . . . . . . . . . . . . . . . 5 69 - 2.2. Unstable (B) . . . . . . . . . . . . . . . . . . . . . . . 6 70 - 2.3. IgnorableProperties (C) . . . . . . . . . . . . . . . . . 6 71 - 2.4. IgnorableBlocks (D) . . . . . . . . . . . . . . . . . . . 7 72 - 2.5. LDH (E) . . . . . . . . . . . . . . . . . . . . . . . . . 7 73 - 2.6. Exceptions (F) . . . . . . . . . . . . . . . . . . . . . . 7 74 - 2.7. BackwardCompatible (G) . . . . . . . . . . . . . . . . . . 9 75 - 2.8. JoinControl (H) . . . . . . . . . . . . . . . . . . . . . 9 76 - 2.9. OldHangulJamo (I) . . . . . . . . . . . . . . . . . . . . 9 77 - 2.10. Unassigned (J) . . . . . . . . . . . . . . . . . . . . . . 9 78 - 3. Calculation of the Derived Property . . . . . . . . . . . . . 10 79 - 4. Code Points . . . . . . . . . . . . . . . . . . . . . . . . . 10 80 - 5. IANA Considerations . . . . . . . . . . . . . . . . . . . . . 11 81 - 5.1. IDNA-Derived Property Value Registry . . . . . . . . . . . 11 82 - 5.2. IDNA Context Registry . . . . . . . . . . . . . . . . . . 11 83 - 5.2.1. Template for Context Registry . . . . . . . . . . . . 11 84 - 6. Security Considerations . . . . . . . . . . . . . . . . . . . 12 85 - 7. Acknowledgements . . . . . . . . . . . . . . . . . . . . . . . 12 86 - Appendix A. Contextual Rules Registry . . . . . . . . . . . . . 13 87 - Appendix A.1. ZERO WIDTH NON-JOINER . . . . . . . . . . . . . . . 15 88 - Appendix A.2. ZERO WIDTH JOINER . . . . . . . . . . . . . . . . . 16 89 - Appendix A.3. MIDDLE DOT . . . . . . . . . . . . . . . . . . . . . 16 90 - Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) . . . . . 
. . . . 17 91 - Appendix A.5. HEBREW PUNCTUATION GERESH . . . . . . . . . . . . . 17 92 - Appendix A.6. HEBREW PUNCTUATION GERSHAYIM . . . . . . . . . . . . 18 93 - Appendix A.7. KATAKANA MIDDLE DOT . . . . . . . . . . . . . . . . 18 94 - Appendix A.8. ARABIC-INDIC DIGITS . . . . . . . . . . . . . . . . 19 95 - Appendix A.9. EXTENDED ARABIC-INDIC DIGITS . . . . . . . . . . . . 19 96 - Appendix B. Code Points 0x0000 - 0x10FFFF . . . . . . . . . . . 20 97 - Appendix B.1. Code Points in Unicode Character Database (UCD) 98 - Format . . . . . . . . . . . . . . . . . . . . . . . 20 99 - 8. References . . . . . . . . . . . . . . . . . . . . . . . . . . 69 100 - 8.1. Normative References . . . . . . . . . . . . . . . . . . . 69 101 - 8.2. Informative References . . . . . . . . . . . . . . . . . . 69 102 - 103 - 104 - 105 - 106 - 107 - 108 - 109 - 110 - 111 - 112 - 113 - 114 - Faltstrom Standards Track [Page 2] 115 - 116 - RFC 5892 IDNA Code Points August 2010 117 - 118 - 119 - 1. Introduction 120 - 121 - RFC 4690 [RFC4690] suggests an inclusion-based approach for selecting 122 - the code points from The Unicode Standard [Unicode52] that should be 123 - included in the list of code points that may be used in 124 - Internationalized Domain Names. 125 - 126 - Specifically, RFC 4690 [RFC4690] says the following: 127 - 128 - The IAB has concluded that there is a consensus within the broader 129 - community that lists of code points should be specified by the use 130 - of an inclusion-based mechanism (i.e., identifying the characters 131 - that are permitted), rather than by excluding a small number of 132 - characters from the total Unicode set as Stringprep [RFC3454] and 133 - Nameprep [RFC3491] do today. That conclusion should be reviewed 134 - by the IETF community and action taken as appropriate. 135 - 136 - This document reviews and classifies the collections of code points 137 - in the Unicode character set by examining various properties of the 138 - code points. 
It then defines an algorithm for determining a derived 139 - property value. It specifies a procedure, and not a table, of code 140 - points so that the algorithm can be used to determine code point sets 141 - independent of the version of Unicode that is in use. 142 - 143 - This document is not intended to specify precisely how these property 144 - values are to be applied in IDN labels. That information appears in 145 - the Protocol document [RFC5891], but it is important to understand 146 - that the assignment of a value of this property to a particular 147 - character is not sufficient to determine whether it can be used in a 148 - given label. In particular, some combinations of allowed code points 149 - are not advisable for use in IDNs due to rules specific to a script 150 - or class of characters. The requirement for such rules is linked to 151 - the operations in the Protocol document and especially to the 152 - characters designated as requiring contextual rules. 153 - 154 - The value of the property is to be interpreted as follows. 155 - 156 - o PROTOCOL VALID: Those that are allowed to be used in IDNs. Code 157 - points with this property value are permitted for general use in 158 - IDNs. However, that a label consists only of code points that 159 - have this property value does not imply that the label can be used 160 - in DNS. See the Protocol document for algorithms to make 161 - decisions about labels in domain names. The abbreviated term 162 - PVALID is used to refer to this value in the rest of this 163 - document. 164 - 165 - 166 - 167 - 168 - 169 - 170 - Faltstrom Standards Track [Page 3] 171 - 172 - RFC 5892 IDNA Code Points August 2010 173 - 174 - 175 - o CONTEXTUAL RULE REQUIRED: Some characteristics of the character, 176 - such as it being invisible in certain contexts or problematic in 177 - others, require that it not be used in labels unless specific 178 - other characters or properties are present. 
The abbreviated term 179 - CONTEXT is used to refer to this value in the rest of this 180 - document. There are two subdivisions of CONTEXTUAL RULE REQUIRED, 181 - one for Join_controls (called CONTEXTJ) and for other characters 182 - (called CONTEXTO). These are discussed in more detail below and 183 - in the Protocol document. 184 - 185 - o DISALLOWED: Those that should clearly not be included in IDNs. 186 - Code points with this property value are not permitted in IDNs. 187 - 188 - o UNASSIGNED: Those code points that are not designated (i.e., are 189 - unassigned) in the Unicode Standard. 190 - 191 - The mechanisms described here allow determination of the value of the 192 - property for future versions of Unicode (including characters added 193 - after Unicode 5.2). Changes in Unicode properties that do not affect 194 - the outcome of this process do not affect IDN. For example, a 195 - character can have its Unicode General_Category value (see 196 - [Unicode52]) change from So to Sm or from Lo to Ll, without affecting 197 - the algorithm results. Moreover, even if such changes were the 198 - result, the BackwardCompatible list (Section 2.7) can be adjusted to 199 - ensure the stability of the results. 200 - 201 - Some code points need to be allowed in exceptional circumstances but 202 - should be excluded in all other cases; these rules are also described 203 - in other documents. The most notable of these are the Join Control 204 - characters, U+200D ZERO WIDTH JOINER and U+200C ZERO WIDTH 205 - NON-JOINER. Both of them have the derived property value CONTEXTJ. 206 - A character with the derived property value CONTEXTJ or CONTEXTO 207 - (CONTEXTUAL RULE REQUIRED) is not to be used unless an appropriate 208 - rule has been established and the context of the character is 209 - consistent with that rule. It is invalid to either register a string 210 - containing these characters or even to look one up unless such a 211 - contextual rule is found and satisfied. 
Please see Appendix A, "The 212 - Contextual Rules Registry", for more information. 213 - 214 - This document is part of a series that, together, constitute a 215 - proposal for updating the IDNA standards to resolve issues uncovered 216 - in recent years, cover a broader range of scripts, and provide for 217 - migration to newer versions of Unicode. See the Rationale document 218 - [RFC5894] for a broader discussion. 219 - 220 - The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", 221 - "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this 222 - document are to be interpreted as described in RFC 2119 [RFC2119]. 223 - 224 - 225 - 226 - Faltstrom Standards Track [Page 4] 227 - 228 - RFC 5892 IDNA Code Points August 2010 229 - 230 - 231 - 2. Category Definitions Used to Calculate Derived Property Value 232 - 233 - The derived property obtains its value based on a two-step procedure. 234 - First, characters are placed in one or more character categories 235 - based on either core properties defined by the Unicode Standard or by 236 - treating the code point as an exception and addressing the code point 237 - by its code point value. These categories are not mutually 238 - exclusive. 239 - 240 - In the second step, set operations are used with these categories to 241 - determine the values for an IDN-specific property. Those operations 242 - are specified in Section 3. 243 - 244 - Unicode property names and property value names may have short 245 - abbreviations, such as gc for the General_Category property, and Ll 246 - for the Lowercase_Letter property value of the gc property. 247 - 248 - In the following specification of categories, the operation that 249 - returns the value of a particular Unicode character property for a 250 - code point is designated by using the formal name of that property 251 - (from PropertyAliases.txt) followed by '(cp)'. 
For example, the 252 - value of the General_Category property for a code point is indicated 253 - by General_Category(cp). 254 - 255 - 2.1. LetterDigits (A) 256 - 257 - A: General_Category(cp) is in {Ll, Lu, Lo, Nd, Lm, Mn, Mc} 258 - 259 - These rules identify characters commonly used in mnemonics and often 260 - informally described as "language characters". In general, only code 261 - points assigned to this category are suitable for use in IDN. 262 - 263 - For more information, see Section 4.5 of The Unicode Standard 264 - [Unicode]. 265 - 266 - The categories used in this rule are: 267 - 268 - o Ll - Lowercase_Letter 269 - 270 - o Lu - Uppercase_Letter 271 - 272 - o Lo - Other_Letter 273 - 274 - o Nd - Decimal_Number 275 - 276 - o Lm - Modifier_Letter 277 - 278 - 279 - 280 - 281 - 282 - Faltstrom Standards Track [Page 5] 283 - 284 - RFC 5892 IDNA Code Points August 2010 285 - 286 - 287 - o Mn - Nonspacing_Mark 288 - 289 - o Mc - Spacing_Mark 290 - 291 - 2.2. Unstable (B) 292 - 293 - B: toNFKC(toCaseFold(toNFKC(cp))) != cp 294 - 295 - This category is used to group the characters that are not stable 296 - under Normalization Form K (NFKC) and case folding. In general, 297 - these code points are not suitable for use for IDN. 298 - 299 - The toCaseFold() operation is defined in Section 3.13 of The Unicode 300 - Standard [Unicode]. 301 - 302 - The toNFKC() operation returns the code point in normalization form 303 - KC. For more information, see Section 5 of Unicode Standard Annex 304 - #15 [TR15]. 305 - 306 - It should be noted that NFKC is used, although Normalization Form C 307 - (NFC) is used in the "IDNA Protocol" document [RFC5891]. 308 - 309 - 2.3. IgnorableProperties (C) 310 - 311 - C: Default_Ignorable_Code_Point(cp) = True or 312 - White_Space(cp) = True or 313 - Noncharacter_Code_Point(cp) = True 314 - 315 - This category is used to group code points that are not recommended 316 - for use in identifiers. 
In general, these code points are not 317 - suitable for use in an IDN. 318 - 319 - The definition for Default_Ignorable_Code_Point can be found in 320 - DerivedCoreProperties.txt [DerivedCoreProperties] and is at the time 321 - of Unicode 5.2: 322 - 323 - Other_Default_Ignorable_Code_Point + Cf (Format characters) 324 - + Variation_Selector - White_Space - FFF9..FFFB (Annotation 325 - Characters) - 0600..0603, 06DD, 070F (exceptional Cf characters 326 - that should be visible) 327 - 328 - 329 - 330 - 331 - 332 - 333 - 334 - 335 - 336 - 337 - 338 - Faltstrom Standards Track [Page 6] 339 - 340 - RFC 5892 IDNA Code Points August 2010 341 - 342 - 343 - 2.4. IgnorableBlocks (D) 344 - 345 - D: Block(cp) is in {Combining Diacritical Marks for Symbols, 346 - Musical Symbols, Ancient Greek Musical Notation} 347 - 348 - This category is used to identify code points that are not useful in 349 - mnemonics or that are otherwise impractical for IDN use. In general, 350 - these code points are not suitable for use for IDN. 351 - 352 - The definition of blocks can be found in Blocks.txt [BlockNames]. 353 - 354 - 2.5. LDH (E) 355 - 356 - E: cp is in {002D, 0030..0039, 0061..007A} 357 - 358 - This category is used in the second step to preserve the traditional 359 - "hostname" (LDH -- as described in the Definitions document 360 - [RFC5890]) characters ('-', 0-9, and a-z). In general, these code 361 - points are suitable for use for IDN. Note that there are other rules 362 - regarding the code point U+002D HYPHEN-MINUS that are specified in 363 - the IDNA Protocol Specification [RFC5891]. 364 - 365 - 2.6. 
Exceptions (F) 366 - 367 - F: cp is in {00B7, 00DF, 0375, 03C2, 05F3, 05F4, 0640, 0660, 368 - 0661, 0662, 0663, 0664, 0665, 0666, 0667, 0668, 369 - 0669, 06F0, 06F1, 06F2, 06F3, 06F4, 06F5, 06F6, 370 - 06F7, 06F8, 06F9, 06FD, 06FE, 07FA, 0F0B, 3007, 371 - 302E, 302F, 3031, 3032, 3033, 3034, 3035, 303B, 372 - 30FB} 373 - 374 - This category explicitly lists code points for which the category 375 - cannot be assigned using only the core property values that exist in 376 - the Unicode standard. The values are according to the table below: 377 - 378 - PVALID -- Would otherwise have been DISALLOWED 379 - 380 - 00DF; PVALID # LATIN SMALL LETTER SHARP S 381 - 03C2; PVALID # GREEK SMALL LETTER FINAL SIGMA 382 - 06FD; PVALID # ARABIC SIGN SINDHI AMPERSAND 383 - 06FE; PVALID # ARABIC SIGN SINDHI POSTPOSITION MEN 384 - 0F0B; PVALID # TIBETAN MARK INTERSYLLABIC TSHEG 385 - 3007; PVALID # IDEOGRAPHIC NUMBER ZERO 386 - 387 - 388 - 389 - 390 - 391 - 392 - 393 - 394 - Faltstrom Standards Track [Page 7] 395 - 396 - RFC 5892 IDNA Code Points August 2010 397 - 398 - 399 - CONTEXTO -- Would otherwise have been DISALLOWED 400 - 401 - 00B7; CONTEXTO # MIDDLE DOT 402 - 0375; CONTEXTO # GREEK LOWER NUMERAL SIGN (KERAIA) 403 - 05F3; CONTEXTO # HEBREW PUNCTUATION GERESH 404 - 05F4; CONTEXTO # HEBREW PUNCTUATION GERSHAYIM 405 - 30FB; CONTEXTO # KATAKANA MIDDLE DOT 406 - 407 - CONTEXTO -- Would otherwise have been PVALID 408 - 409 - 0660; CONTEXTO # ARABIC-INDIC DIGIT ZERO 410 - 0661; CONTEXTO # ARABIC-INDIC DIGIT ONE 411 - 0662; CONTEXTO # ARABIC-INDIC DIGIT TWO 412 - 0663; CONTEXTO # ARABIC-INDIC DIGIT THREE 413 - 0664; CONTEXTO # ARABIC-INDIC DIGIT FOUR 414 - 0665; CONTEXTO # ARABIC-INDIC DIGIT FIVE 415 - 0666; CONTEXTO # ARABIC-INDIC DIGIT SIX 416 - 0667; CONTEXTO # ARABIC-INDIC DIGIT SEVEN 417 - 0668; CONTEXTO # ARABIC-INDIC DIGIT EIGHT 418 - 0669; CONTEXTO # ARABIC-INDIC DIGIT NINE 419 - 06F0; CONTEXTO # EXTENDED ARABIC-INDIC DIGIT ZERO 420 - 06F1; CONTEXTO # EXTENDED ARABIC-INDIC DIGIT 
ONE 421 - 06F2; CONTEXTO # EXTENDED ARABIC-INDIC DIGIT TWO 422 - 06F3; CONTEXTO # EXTENDED ARABIC-INDIC DIGIT THREE 423 - 06F4; CONTEXTO # EXTENDED ARABIC-INDIC DIGIT FOUR 424 - 06F5; CONTEXTO # EXTENDED ARABIC-INDIC DIGIT FIVE 425 - 06F6; CONTEXTO # EXTENDED ARABIC-INDIC DIGIT SIX 426 - 06F7; CONTEXTO # EXTENDED ARABIC-INDIC DIGIT SEVEN 427 - 06F8; CONTEXTO # EXTENDED ARABIC-INDIC DIGIT EIGHT 428 - 06F9; CONTEXTO # EXTENDED ARABIC-INDIC DIGIT NINE 429 - 430 - DISALLOWED -- Would otherwise have been PVALID 431 - 432 - 0640; DISALLOWED # ARABIC TATWEEL 433 - 07FA; DISALLOWED # NKO LAJANYALAN 434 - 302E; DISALLOWED # HANGUL SINGLE DOT TONE MARK 435 - 302F; DISALLOWED # HANGUL DOUBLE DOT TONE MARK 436 - 3031; DISALLOWED # VERTICAL KANA REPEAT MARK 437 - 3032; DISALLOWED # VERTICAL KANA REPEAT WITH VOICED SOUND MARK 438 - 3033; DISALLOWED # VERTICAL KANA REPEAT MARK UPPER HALF 439 - 3034; DISALLOWED # VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HA 440 - 3035; DISALLOWED # VERTICAL KANA REPEAT MARK LOWER HALF 441 - 303B; DISALLOWED # VERTICAL IDEOGRAPHIC ITERATION MARK 442 - 443 - 444 - 445 - 446 - 447 - 448 - 449 - 450 - Faltstrom Standards Track [Page 8] 451 - 452 - RFC 5892 IDNA Code Points August 2010 453 - 454 - 455 - 2.7. BackwardCompatible (G) 456 - 457 - G: cp is in {} 458 - 459 - This category includes the code points that property values in 460 - versions of Unicode after 5.2 have changed in such a way that the 461 - derived property value would no longer be PVALID or DISALLOWED. If 462 - changes are made to future versions of Unicode so that code points 463 - might change the property value from PVALID or DISALLOWED, then this 464 - table can be updated and keep special exception values so that the 465 - property values for code points stay stable. 466 - 467 - 2.8. 
JoinControl (H) 468 - 469 - H: Join_Control(cp) = True 470 - 471 - This category consists of Join Control characters (i.e., they are not 472 - in LetterDigits (Section 2.1) but are still required in IDN labels 473 - under some circumstances). 474 - 475 - 2.9. OldHangulJamo (I) 476 - 477 - I: Hangul_Syllable_Type(cp) is in {L, V, T} 478 - 479 - This category consists of all conjoining Hangul Jamo (Leading Jamo, 480 - Vowel Jamo, and Trailing Jamo). 481 - 482 - Elimination of conjoining Hangul Jamo from the set of PVALID 483 - characters results in restricting the set of Korean PVALID characters 484 - just to preformed, modern Hangul syllable characters. Old Hangul 485 - syllables, which must be spelled with sequences of conjoining Hangul 486 - Jamo, are not PVALID for IDNs. 487 - 488 - 2.10. Unassigned (J) 489 - 490 - J: General_Category(cp) is in {Cn} and 491 - Noncharacter_Code_Point(cp) = False 492 - 493 - This category consists of code points in the Unicode character set 494 - that are not (yet) assigned. It should be noted that Unicode 495 - distinguishes between "unassigned code points" and "unassigned 496 - characters". The unassigned code points are all but (Cn - 497 - Noncharacters), while the unassigned *characters* are all but (Cn + 498 - Cs). 499 - 500 - 501 - 502 - 503 - 504 - 505 - 506 - Faltstrom Standards Track [Page 9] 507 - 508 - RFC 5892 IDNA Code Points August 2010 509 - 510 - 511 - 3. Calculation of the Derived Property 512 - 513 - As described above (Section 1) and in more detail in the IDNA 514 - Protocol document [RFC5891], possible values of the IDN property are: 515 - 516 - o PVALID 517 - 518 - o CONTEXTJ 519 - 520 - o CONTEXTO 521 - 522 - o DISALLOWED 523 - 524 - o UNASSIGNED 525 - 526 - The algorithm to calculate the value of the derived property is as 527 - follows. 
If the name of a rule (such as Exception) is used, that 528 - implies the set of code points that the rule defines, while the same 529 - name as a function call (such as Exception(cp)) implies the value cp 530 - has in the Exceptions table. 531 - 532 - If .cp. .in. Exceptions Then Exceptions(cp); 533 - Else If .cp. .in. BackwardCompatible Then BackwardCompatible(cp); 534 - Else If .cp. .in. Unassigned Then UNASSIGNED; 535 - Else If .cp. .in. LDH Then PVALID; 536 - Else If .cp. .in. JoinControl Then CONTEXTJ; 537 - Else If .cp. .in. Unstable Then DISALLOWED; 538 - Else If .cp. .in. IgnorableProperties Then DISALLOWED; 539 - Else If .cp. .in. IgnorableBlocks Then DISALLOWED; 540 - Else If .cp. .in. OldHangulJamo Then DISALLOWED; 541 - Else If .cp. .in. LetterDigits Then PVALID; 542 - Else DISALLOWED; 543 - 544 - 4. Code Points 545 - 546 - The categories and rules defined in Sections 2 and 3 apply to all 547 - Unicode code points. The table in Appendix B shows, for illustrative 548 - purposes, the consequences of the categories and classification 549 - rules, and the resulting property values. 550 - 551 - The list of code points that can be found in Appendix B is 552 - non-normative. Sections 2 and 3 are normative. 553 - 554 - 555 - 556 - 557 - 558 - 559 - 560 - 561 - 562 - Faltstrom Standards Track [Page 10] 563 - 564 - RFC 5892 IDNA Code Points August 2010 565 - 566 - 567 - 5. IANA Considerations 568 - 569 - 5.1. IDNA-Derived Property Value Registry 570 - 571 - IANA has created a registry with the derived properties for the 572 - versions of Unicode released after (and including) version 5.2. The 573 - derived property value is to be calculated in cooperation with a 574 - designated expert [RFC5226] according to the specifications in 575 - Sections 2 and 3 and not by copying the non-normative table found in 576 - Appendix B. 
577 - 578 - If non-backward-compatible changes or other problems arise during the 579 - creation or designated expert review of the table of derived property 580 - values, they should be flagged for the IESG. Changes to the rules 581 - (as specified in Sections 2 and 3), including BackwardCompatible 582 - (Section 2.7) (a set that at release of this document is empty) 583 - require IETF Review, as described in RFC 5226 [RFC5226]. 584 - 585 - 5.2. IDNA Context Registry 586 - 587 - For characters that are defined in the IDNA derived property value 588 - registry (Section 5.1) as CONTEXTO or CONTEXTJ and that therefore 589 - require a contextual rule, IANA has created and now maintains a list 590 - of approved contextual rules. Additions or changes to these rules 591 - require IETF Review, as described in [RFC5226]. 592 - 593 - Appendix A contains further discussion and a table from which that 594 - registry can be initialized. 595 - 596 - 5.2.1. Template for Context Registry 597 - 598 - The following information is to be given when a new rule is created. 599 - 600 - Name: Unique name of the rule 601 - 602 - Code point: Rule that should be applied when this code point 603 - exists in the label 604 - 605 - Overview: Description in plain English on what the rule verifies 606 - 607 - Lookup: Should the rule be applied at time of lookup? 608 - 609 - Rule Set: The set of rules, with a reference to the defining 610 - document. 611 - 612 - 613 - 614 - 615 - 616 - 617 - 618 - Faltstrom Standards Track [Page 11] 619 - 620 - RFC 5892 IDNA Code Points August 2010 621 - 622 - 623 - 6. Security Considerations 624 - 625 - Security Considerations for this version of IDNA, except for the 626 - special issues associated with right-to-left scripts and characters, 627 - are described in the Definitions document [RFC5890]. Specific issues 628 - for labels containing characters associated with scripts written 629 - right to left appear in the Bidi document [RFC5893]. 630 - 631 - 7.
Acknowledgements 632 - 633 - This document would not have been possible to produce without input 634 - from many people. The main contributors are (in alphabetical order) 635 - Harald Alvestrand, Vint Cerf, Tina Dam, Mark Davis, Gihan Dias, 636 - Mouhammet Diop, Michael Everson, Asmus Freytag, Debbie Garside, Paul 637 - Hoffman, Kent Karlsson, Cary Karp, Jaeyoun Kim, John Klensin, Olaf 638 - Kolkman, Gervase Markham, Ram Mohan, Lisa Moore, Yngve Pettersen, 639 - Erik van der Poel, Hualin Qian, Rick Reed, Pete Resnick, Lakmal 640 - Silva, Michel Suignard, Andrew Sullivan, Wil Tan, Kenneth Whistler, 641 - Chris Wright, and Yoshiro Yoneya. 642 - 643 - 644 - 645 - 646 - 647 - 648 - 649 - 650 - 651 - 652 - 653 - 654 - 655 - 656 - 657 - 658 - 659 - 660 - 661 - 662 - 663 - 664 - 665 - 666 - 667 - 668 - 669 - 670 - 671 - 672 - 673 - 674 - Faltstrom Standards Track [Page 12] 675 - 676 - RFC 5892 IDNA Code Points August 2010 677 - 678 - 679 - Appendix A. Contextual Rules Registry 680 - 681 - As discussed in Section 5.2 and in the IANA Considerations section of 682 - the Rationale document [RFC5894], IANA maintains a registry of rules that define the 683 - contexts in which particular PROTOCOL-VALID characters, characters 684 - associated with a requirement for Contextual Information, are 685 - permitted. These rules are expressed as tests on the label in which 686 - the characters appear (all, or any part of, the label may be tested). 687 - 688 - The grammatical rules are expressed in pseudo-code. The conventions 689 - used for that pseudo-code are explained here. 690 - 691 - Each rule is constructed as a Boolean expression that evaluates to 692 - either True or False. A simple "True;" or "False;" rule sets the 693 - default result value for the rule set. Subsequent conditional rules 694 - that evaluate to True or False may re-set the result value.
695 - 696 - A special value "Undefined" is used to deal with any error 697 - conditions, such as an attempt to test a character before the start 698 - of a label or after the end of a label. If any term of a rule 699 - evaluates to Undefined, further evaluation of the rule immediately 700 - terminates, as the result value of the rule will itself be Undefined. 701 - 702 - cp represents the code point to be tested. 703 - 704 - FirstChar is a special term that denotes the first code point in a 705 - label. 706 - 707 - LastChar is a special term that denotes the last code point in a 708 - label. 709 - 710 - .eq. represents the equality relation. 711 - 712 - A .eq. B evaluates to True if A equals B. 713 - 714 - .is. represents checking the position in a label. 715 - 716 - A .is. B evaluates to True if A and B have same position in 717 - the same label. 718 - 719 - .ne. represents the non-equality relation. 720 - 721 - A .ne. B evaluates to True if A is not equal to B. 722 - 723 - .in. represents the set inclusion relation. 724 - 725 - A .in. B evaluates to True if A is a member of the set B. 726 - 727 - 728 - 729 - 730 - Faltstrom Standards Track [Page 13] 731 - 732 - RFC 5892 IDNA Code Points August 2010 733 - 734 - 735 - A functional notation, Function_Name(cp), is used to express either 736 - string positions within a label, Boolean character property tests of 737 - a code point, or a regular expression match. When such function 738 - names refer to Boolean character property tests, the function names 739 - use the exact Unicode character property name for the property in 740 - question, and "cp" is evaluated as the Unicode value of the code 741 - point to be tested, rather than as its position in the label. When 742 - such function names refer to string positions within a label, "cp" is 743 - evaluated as its position in the label. 
744 - 745 - RegExpMatch(X) takes as its parameter X a schematic regular 746 - expression consisting of a mix of Unicode character property values 747 - and literal Unicode code points. 748 - 749 - Script(cp) returns the value of the Unicode Script property, as 750 - defined in Scripts.txt in the Unicode Character Database. 751 - 752 - Canonical_Combining_Class(cp) returns the value of the Unicode 753 - Canonical_Combining_Class property, as defined in UnicodeData.txt in 754 - the Unicode Character Database. 755 - 756 - Before(cp) returns the code point of the character immediately 757 - preceding cp in logical order in the string representing the label. 758 - Before(FirstChar) evaluates to Undefined. 759 - 760 - After(cp) returns the code point of the character immediately 761 - following cp in logical order in the string representing the label. 762 - After(LastChar) evaluates to Undefined. 763 - 764 - Note that "Before" and "After" do not refer to the visual display 765 - order of the character in a label, which may be reversed or otherwise 766 - modified by the bidirectional algorithm for labels including 767 - characters from scripts written right to left. Instead, "Before" and 768 - "After" refer to the network order of the character in the label. 769 - 770 - The clauses "Then True" and "Then False" imply exit from the 771 - pseudo-code routine with the corresponding result. 772 - 773 - Repeated evaluation for all characters in a label makes use of the 774 - special construct: 775 - 776 - For All Characters: 777 - 778 - Expression; 779 - 780 - End For; 781 - 782 - 783 - 784 - 785 - 786 - Faltstrom Standards Track [Page 14] 787 - 788 - RFC 5892 IDNA Code Points August 2010 789 - 790 - 791 - This construct requires repeated evaluation of "Expression" for each 792 - code point in the label, starting from FirstChar and proceeding to 793 - LastChar. 
794 - 795 - The different fields in the rules are to be interpreted as follows: 796 - 797 - Code point: 798 - The code point, or code points, to which this rule is to be 799 - applied. Normally, this implies that if any of the code points in 800 - a label is as defined, then the rules should be applied. If 801 - evaluated to True, the code point is OK as used; if evaluated to 802 - False, it is not OK. 803 - 804 - Overview: 805 - A description of the goal with the rule, in plain English. 806 - 807 - Lookup: 808 - True if application of this rule is recommended at lookup time; 809 - False otherwise. 810 - 811 - Rule Set: 812 - The rule set itself, as described above. 813 - 814 - Appendix A.1. ZERO WIDTH NON-JOINER 815 - 816 - Code point: 817 - U+200C 818 - 819 - Overview: 820 - This may occur in a formally cursive script (such as Arabic) in a 821 - context where it breaks a cursive connection as required for 822 - orthographic rules, as in the Persian language, for example. It 823 - also may occur in Indic scripts in a consonant-conjunct context 824 - (immediately following a virama), to control required display of 825 - such conjuncts. 826 - 827 - Lookup: 828 - True 829 - 830 - Rule Set: 831 - 832 - False; 833 - 834 - If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; 835 - 836 - If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C 837 - 838 - (Joining_Type:T)*(Joining_Type:{R,D})) Then True; 839 - 840 - 841 - 842 - Faltstrom Standards Track [Page 15] 843 - 844 - RFC 5892 IDNA Code Points August 2010 845 - 846 - 847 - Appendix A.2. ZERO WIDTH JOINER 848 - 849 - Code point: 850 - U+200D 851 - 852 - Overview: 853 - This may occur in Indic scripts in a consonant-conjunct context 854 - (immediately following a virama), to control required display of 855 - such conjuncts. 856 - 857 - Lookup: 858 - True 859 - 860 - Rule Set: 861 - 862 - False; 863 - 864 - If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; 865 - 866 - Appendix A.3. 
MIDDLE DOT 867 - 868 - Code point: 869 - U+00B7 870 - 871 - Overview: 872 - Between 'l' (U+006C) characters only, used to permit the Catalan 873 - character ela geminada to be expressed. 874 - 875 - Lookup: 876 - False 877 - 878 - Rule Set: 879 - 880 - False; 881 - 882 - If Before(cp) .eq. U+006C And 883 - 884 - After(cp) .eq. U+006C Then True; 885 - 886 - 887 - 888 - 889 - 890 - 891 - 892 - 893 - 894 - 895 - 896 - 897 - 898 - Faltstrom Standards Track [Page 16] 899 - 900 - RFC 5892 IDNA Code Points August 2010 901 - 902 - 903 - Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) 904 - 905 - Code point: 906 - U+0375 907 - 908 - Overview: 909 - The script of the following character MUST be Greek. 910 - 911 - Lookup: 912 - False 913 - 914 - Rule Set: 915 - 916 - False; 917 - 918 - If Script(After(cp)) .eq. Greek Then True; 919 - 920 - Appendix A.5. HEBREW PUNCTUATION GERESH 921 - 922 - Code point: 923 - U+05F3 924 - 925 - Overview: 926 - The script of the preceding character MUST be Hebrew. 927 - 928 - Lookup: 929 - False 930 - 931 - Rule Set: 932 - 933 - False; 934 - 935 - If Script(Before(cp)) .eq. Hebrew Then True; 936 - 937 - 938 - 939 - 940 - 941 - 942 - 943 - 944 - 945 - 946 - 947 - 948 - 949 - 950 - 951 - 952 - 953 - 954 - Faltstrom Standards Track [Page 17] 955 - 956 - RFC 5892 IDNA Code Points August 2010 957 - 958 - 959 - Appendix A.6. HEBREW PUNCTUATION GERSHAYIM 960 - 961 - Code point: 962 - U+05F4 963 - 964 - Overview: 965 - The script of the preceding character MUST be Hebrew. 966 - 967 - Lookup: 968 - False 969 - 970 - Rule Set: 971 - 972 - False; 973 - 974 - If Script(Before(cp)) .eq. Hebrew Then True; 975 - 976 - Appendix A.7. KATAKANA MIDDLE DOT 977 - 978 - Code point: 979 - U+30FB 980 - 981 - Overview: 982 - Note that the Script of Katakana Middle Dot is not any of 983 - "Hiragana", "Katakana", or "Han". The effect of this rule is to 984 - require at least one character in the label to be in one of those 985 - scripts. 
986 - 987 - Lookup: 988 - False 989 - 990 - Rule Set: 991 - 992 - False; 993 - 994 - For All Characters: 995 - 996 - If Script(cp) .in. {Hiragana, Katakana, Han} Then True; 997 - 998 - End For; 999 - 1000 - 1001 - 1002 - 1003 - 1004 - 1005 - 1006 - 1007 - 1008 - 1009 - 1010 - Faltstrom Standards Track [Page 18] 1011 - 1012 - RFC 5892 IDNA Code Points August 2010 1013 - 1014 - 1015 - Appendix A.8. ARABIC-INDIC DIGITS 1016 - 1017 - Code point: 1018 - 0660..0669 1019 - 1020 - Overview: 1021 - Can not be mixed with Extended Arabic-Indic Digits. 1022 - 1023 - Lookup: 1024 - False 1025 - 1026 - Rule Set: 1027 - 1028 - True; 1029 - 1030 - For All Characters: 1031 - 1032 - If cp .in. 06F0..06F9 Then False; 1033 - 1034 - End For; 1035 - 1036 - Appendix A.9. EXTENDED ARABIC-INDIC DIGITS 1037 - 1038 - Code point: 1039 - 06F0..06F9 1040 - 1041 - Overview: 1042 - Can not be mixed with Arabic-Indic Digits. 1043 - 1044 - Lookup: 1045 - False 1046 - 1047 - Rule Set: 1048 - 1049 - True; 1050 - 1051 - For All Characters: 1052 - 1053 - If cp .in. 0660..0669 Then False; 1054 - 1055 - End For; 1056 - 1057 - 1058 - 1059 - 1060 - 1061 - 1062 - 1063 - 1064 - 1065 - 1066 - Faltstrom Standards Track [Page 19] 1067 - 1068 - RFC 5892 IDNA Code Points August 2010 1069 - 1070 - 1071 - Appendix B. Code Points 0x0000 - 0x10FFFF 1072 - 1073 - If one applies the rules (Section 3) to the code points 0x0000 to 1074 - 0x10FFFF to Unicode 5.2, the result is as follows. 1075 - 1076 - This list is non-normative, and only included for illustrative 1077 - purposes. Specifically, what is displayed in the third column is not 1078 - the formal name of the code point (as defined in Section 4.8 of The 1079 - Unicode Standard [Unicode52]). The differences exist, for example, 1080 - for the code points that have the code point value as part of the 1081 - name (for example, CJK UNIFIED IDEOGRAPH-4E00) and the naming of 1082 - Hangul syllables. For many code points, what you see is the official 1083 - name. 
1084 - 1085 - Appendix B.1. Code Points in Unicode Character Database (UCD) Format 1086 - 1087 - 0000..002C ; DISALLOWED # <control>..COMMA 1088 - 002D ; PVALID # HYPHEN-MINUS 1089 - 002E..002F ; DISALLOWED # FULL STOP..SOLIDUS 1090 - 0030..0039 ; PVALID # DIGIT ZERO..DIGIT NINE 1091 - 003A..0060 ; DISALLOWED # COLON..GRAVE ACCENT 1092 - 0061..007A ; PVALID # LATIN SMALL LETTER A..LATIN SMALL LETTER Z 1093 - 007B..00B6 ; DISALLOWED # LEFT CURLY BRACKET..PILCROW SIGN 1094 - 00B7 ; CONTEXTO # MIDDLE DOT 1095 - 00B8..00DE ; DISALLOWED # CEDILLA..LATIN CAPITAL LETTER THORN 1096 - 00DF..00F6 ; PVALID # LATIN SMALL LETTER SHARP S..LATIN SMALL LETT 1097 - 00F7 ; DISALLOWED # DIVISION SIGN 1098 - 00F8..00FF ; PVALID # LATIN SMALL LETTER O WITH STROKE..LATIN SMAL 1099 - 0100 ; DISALLOWED # LATIN CAPITAL LETTER A WITH MACRON 1100 - 0101 ; PVALID # LATIN SMALL LETTER A WITH MACRON 1101 - 0102 ; DISALLOWED # LATIN CAPITAL LETTER A WITH BREVE 1102 - 0103 ; PVALID # LATIN SMALL LETTER A WITH BREVE 1103 - 0104 ; DISALLOWED # LATIN CAPITAL LETTER A WITH OGONEK 1104 - 0105 ; PVALID # LATIN SMALL LETTER A WITH OGONEK 1105 - 0106 ; DISALLOWED # LATIN CAPITAL LETTER C WITH ACUTE 1106 - 0107 ; PVALID # LATIN SMALL LETTER C WITH ACUTE 1107 - 0108 ; DISALLOWED # LATIN CAPITAL LETTER C WITH CIRCUMFLEX 1108 - 0109 ; PVALID # LATIN SMALL LETTER C WITH CIRCUMFLEX 1109 - 010A ; DISALLOWED # LATIN CAPITAL LETTER C WITH DOT ABOVE 1110 - 010B ; PVALID # LATIN SMALL LETTER C WITH DOT ABOVE 1111 - 010C ; DISALLOWED # LATIN CAPITAL LETTER C WITH CARON 1112 - 010D ; PVALID # LATIN SMALL LETTER C WITH CARON 1113 - 010E ; DISALLOWED # LATIN CAPITAL LETTER D WITH CARON 1114 - 010F ; PVALID # LATIN SMALL LETTER D WITH CARON 1115 - 0110 ; DISALLOWED # LATIN CAPITAL LETTER D WITH STROKE 1116 - 0111 ; PVALID # LATIN SMALL LETTER D WITH STROKE 1117 - 0112 ; DISALLOWED # LATIN CAPITAL LETTER E WITH MACRON 1118 - 0113 ; PVALID # LATIN SMALL LETTER E WITH MACRON 1119 - 1120 - 1121 - 1122 - Faltstrom Standards 
Track [Page 20] 1123 - 1124 - RFC 5892 IDNA Code Points August 2010 1125 - 1126 - 1127 - 0114 ; DISALLOWED # LATIN CAPITAL LETTER E WITH BREVE 1128 - 0115 ; PVALID # LATIN SMALL LETTER E WITH BREVE 1129 - 0116 ; DISALLOWED # LATIN CAPITAL LETTER E WITH DOT ABOVE 1130 - 0117 ; PVALID # LATIN SMALL LETTER E WITH DOT ABOVE 1131 - 0118 ; DISALLOWED # LATIN CAPITAL LETTER E WITH OGONEK 1132 - 0119 ; PVALID # LATIN SMALL LETTER E WITH OGONEK 1133 - 011A ; DISALLOWED # LATIN CAPITAL LETTER E WITH CARON 1134 - 011B ; PVALID # LATIN SMALL LETTER E WITH CARON 1135 - 011C ; DISALLOWED # LATIN CAPITAL LETTER G WITH CIRCUMFLEX 1136 - 011D ; PVALID # LATIN SMALL LETTER G WITH CIRCUMFLEX 1137 - 011E ; DISALLOWED # LATIN CAPITAL LETTER G WITH BREVE 1138 - 011F ; PVALID # LATIN SMALL LETTER G WITH BREVE 1139 - 0120 ; DISALLOWED # LATIN CAPITAL LETTER G WITH DOT ABOVE 1140 - 0121 ; PVALID # LATIN SMALL LETTER G WITH DOT ABOVE 1141 - 0122 ; DISALLOWED # LATIN CAPITAL LETTER G WITH CEDILLA 1142 - 0123 ; PVALID # LATIN SMALL LETTER G WITH CEDILLA 1143 - 0124 ; DISALLOWED # LATIN CAPITAL LETTER H WITH CIRCUMFLEX 1144 - 0125 ; PVALID # LATIN SMALL LETTER H WITH CIRCUMFLEX 1145 - 0126 ; DISALLOWED # LATIN CAPITAL LETTER H WITH STROKE 1146 - 0127 ; PVALID # LATIN SMALL LETTER H WITH STROKE 1147 - 0128 ; DISALLOWED # LATIN CAPITAL LETTER I WITH TILDE 1148 - 0129 ; PVALID # LATIN SMALL LETTER I WITH TILDE 1149 - 012A ; DISALLOWED # LATIN CAPITAL LETTER I WITH MACRON 1150 - 012B ; PVALID # LATIN SMALL LETTER I WITH MACRON 1151 - 012C ; DISALLOWED # LATIN CAPITAL LETTER I WITH BREVE 1152 - 012D ; PVALID # LATIN SMALL LETTER I WITH BREVE 1153 - 012E ; DISALLOWED # LATIN CAPITAL LETTER I WITH OGONEK 1154 - 012F ; PVALID # LATIN SMALL LETTER I WITH OGONEK 1155 - 0130 ; DISALLOWED # LATIN CAPITAL LETTER I WITH DOT ABOVE 1156 - 0131 ; PVALID # LATIN SMALL LETTER DOTLESS I 1157 - 0132..0134 ; DISALLOWED # LATIN CAPITAL LIGATURE IJ..LATIN CAPITAL LET 1158 - 0135 ; PVALID # LATIN SMALL LETTER J WITH 
CIRCUMFLEX 1159 - 0136 ; DISALLOWED # LATIN CAPITAL LETTER K WITH CEDILLA 1160 - 0137..0138 ; PVALID # LATIN SMALL LETTER K WITH CEDILLA..LATIN SMA 1161 - 0139 ; DISALLOWED # LATIN CAPITAL LETTER L WITH ACUTE 1162 - 013A ; PVALID # LATIN SMALL LETTER L WITH ACUTE 1163 - 013B ; DISALLOWED # LATIN CAPITAL LETTER L WITH CEDILLA 1164 - 013C ; PVALID # LATIN SMALL LETTER L WITH CEDILLA 1165 - 013D ; DISALLOWED # LATIN CAPITAL LETTER L WITH CARON 1166 - 013E ; PVALID # LATIN SMALL LETTER L WITH CARON 1167 - 013F..0141 ; DISALLOWED # LATIN CAPITAL LETTER L WITH MIDDLE DOT..LATI 1168 - 0142 ; PVALID # LATIN SMALL LETTER L WITH STROKE 1169 - 0143 ; DISALLOWED # LATIN CAPITAL LETTER N WITH ACUTE 1170 - 0144 ; PVALID # LATIN SMALL LETTER N WITH ACUTE 1171 - 0145 ; DISALLOWED # LATIN CAPITAL LETTER N WITH CEDILLA 1172 - 0146 ; PVALID # LATIN SMALL LETTER N WITH CEDILLA 1173 - 0147 ; DISALLOWED # LATIN CAPITAL LETTER N WITH CARON 1174 - 0148 ; PVALID # LATIN SMALL LETTER N WITH CARON 1175 - 1176 - 1177 - 1178 - Faltstrom Standards Track [Page 21] 1179 - 1180 - RFC 5892 IDNA Code Points August 2010 1181 - 1182 - 1183 - 0149..014A ; DISALLOWED # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE. 
1184 - 014B ; PVALID # LATIN SMALL LETTER ENG 1185 - 014C ; DISALLOWED # LATIN CAPITAL LETTER O WITH MACRON 1186 - 014D ; PVALID # LATIN SMALL LETTER O WITH MACRON 1187 - 014E ; DISALLOWED # LATIN CAPITAL LETTER O WITH BREVE 1188 - 014F ; PVALID # LATIN SMALL LETTER O WITH BREVE 1189 - 0150 ; DISALLOWED # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE 1190 - 0151 ; PVALID # LATIN SMALL LETTER O WITH DOUBLE ACUTE 1191 - 0152 ; DISALLOWED # LATIN CAPITAL LIGATURE OE 1192 - 0153 ; PVALID # LATIN SMALL LIGATURE OE 1193 - 0154 ; DISALLOWED # LATIN CAPITAL LETTER R WITH ACUTE 1194 - 0155 ; PVALID # LATIN SMALL LETTER R WITH ACUTE 1195 - 0156 ; DISALLOWED # LATIN CAPITAL LETTER R WITH CEDILLA 1196 - 0157 ; PVALID # LATIN SMALL LETTER R WITH CEDILLA 1197 - 0158 ; DISALLOWED # LATIN CAPITAL LETTER R WITH CARON 1198 - 0159 ; PVALID # LATIN SMALL LETTER R WITH CARON 1199 - 015A ; DISALLOWED # LATIN CAPITAL LETTER S WITH ACUTE 1200 - 015B ; PVALID # LATIN SMALL LETTER S WITH ACUTE 1201 - 015C ; DISALLOWED # LATIN CAPITAL LETTER S WITH CIRCUMFLEX 1202 - 015D ; PVALID # LATIN SMALL LETTER S WITH CIRCUMFLEX 1203 - 015E ; DISALLOWED # LATIN CAPITAL LETTER S WITH CEDILLA 1204 - 015F ; PVALID # LATIN SMALL LETTER S WITH CEDILLA 1205 - 0160 ; DISALLOWED # LATIN CAPITAL LETTER S WITH CARON 1206 - 0161 ; PVALID # LATIN SMALL LETTER S WITH CARON 1207 - 0162 ; DISALLOWED # LATIN CAPITAL LETTER T WITH CEDILLA 1208 - 0163 ; PVALID # LATIN SMALL LETTER T WITH CEDILLA 1209 - 0164 ; DISALLOWED # LATIN CAPITAL LETTER T WITH CARON 1210 - 0165 ; PVALID # LATIN SMALL LETTER T WITH CARON 1211 - 0166 ; DISALLOWED # LATIN CAPITAL LETTER T WITH STROKE 1212 - 0167 ; PVALID # LATIN SMALL LETTER T WITH STROKE 1213 - 0168 ; DISALLOWED # LATIN CAPITAL LETTER U WITH TILDE 1214 - 0169 ; PVALID # LATIN SMALL LETTER U WITH TILDE 1215 - 016A ; DISALLOWED # LATIN CAPITAL LETTER U WITH MACRON 1216 - 016B ; PVALID # LATIN SMALL LETTER U WITH MACRON 1217 - 016C ; DISALLOWED # LATIN CAPITAL LETTER U WITH BREVE 1218 - 
016D ; PVALID # LATIN SMALL LETTER U WITH BREVE 1219 - 016E ; DISALLOWED # LATIN CAPITAL LETTER U WITH RING ABOVE 1220 - 016F ; PVALID # LATIN SMALL LETTER U WITH RING ABOVE 1221 - 0170 ; DISALLOWED # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE 1222 - 0171 ; PVALID # LATIN SMALL LETTER U WITH DOUBLE ACUTE 1223 - 0172 ; DISALLOWED # LATIN CAPITAL LETTER U WITH OGONEK 1224 - 0173 ; PVALID # LATIN SMALL LETTER U WITH OGONEK 1225 - 0174 ; DISALLOWED # LATIN CAPITAL LETTER W WITH CIRCUMFLEX 1226 - 0175 ; PVALID # LATIN SMALL LETTER W WITH CIRCUMFLEX 1227 - 0176 ; DISALLOWED # LATIN CAPITAL LETTER Y WITH CIRCUMFLEX 1228 - 0177 ; PVALID # LATIN SMALL LETTER Y WITH CIRCUMFLEX 1229 - 0178..0179 ; DISALLOWED # LATIN CAPITAL LETTER Y WITH DIAERESIS..LATIN 1230 - 017A ; PVALID # LATIN SMALL LETTER Z WITH ACUTE 1231 - 1232 - 1233 - 1234 - Faltstrom Standards Track [Page 22] 1235 - 1236 - RFC 5892 IDNA Code Points August 2010 1237 - 1238 - 1239 - 017B ; DISALLOWED # LATIN CAPITAL LETTER Z WITH DOT ABOVE 1240 - 017C ; PVALID # LATIN SMALL LETTER Z WITH DOT ABOVE 1241 - 017D ; DISALLOWED # LATIN CAPITAL LETTER Z WITH CARON 1242 - 017E ; PVALID # LATIN SMALL LETTER Z WITH CARON 1243 - 017F ; DISALLOWED # LATIN SMALL LETTER LONG S 1244 - 0180 ; PVALID # LATIN SMALL LETTER B WITH STROKE 1245 - 0181..0182 ; DISALLOWED # LATIN CAPITAL LETTER B WITH HOOK..LATIN CAPI 1246 - 0183 ; PVALID # LATIN SMALL LETTER B WITH TOPBAR 1247 - 0184 ; DISALLOWED # LATIN CAPITAL LETTER TONE SIX 1248 - 0185 ; PVALID # LATIN SMALL LETTER TONE SIX 1249 - 0186..0187 ; DISALLOWED # LATIN CAPITAL LETTER OPEN O..LATIN CAPITAL L 1250 - 0188 ; PVALID # LATIN SMALL LETTER C WITH HOOK 1251 - 0189..018B ; DISALLOWED # LATIN CAPITAL LETTER AFRICAN D..LATIN CAPITA 1252 - 018C..018D ; PVALID # LATIN SMALL LETTER D WITH TOPBAR..LATIN SMAL 1253 - 018E..0191 ; DISALLOWED # LATIN CAPITAL LETTER REVERSED E..LATIN CAPIT 1254 - 0192 ; PVALID # LATIN SMALL LETTER F WITH HOOK 1255 - 0193..0194 ; DISALLOWED # LATIN CAPITAL LETTER G 
WITH HOOK..LATIN CAPI 1256 - 0195 ; PVALID # LATIN SMALL LETTER HV 1257 - 0196..0198 ; DISALLOWED # LATIN CAPITAL LETTER IOTA..LATIN CAPITAL LET 1258 - 0199..019B ; PVALID # LATIN SMALL LETTER K WITH HOOK..LATIN SMALL 1259 - 019C..019D ; DISALLOWED # LATIN CAPITAL LETTER TURNED M..LATIN CAPITAL 1260 - 019E ; PVALID # LATIN SMALL LETTER N WITH LONG RIGHT LEG 1261 - 019F..01A0 ; DISALLOWED # LATIN CAPITAL LETTER O WITH MIDDLE TILDE..LA 1262 - 01A1 ; PVALID # LATIN SMALL LETTER O WITH HORN 1263 - 01A2 ; DISALLOWED # LATIN CAPITAL LETTER OI 1264 - 01A3 ; PVALID # LATIN SMALL LETTER OI 1265 - 01A4 ; DISALLOWED # LATIN CAPITAL LETTER P WITH HOOK 1266 - 01A5 ; PVALID # LATIN SMALL LETTER P WITH HOOK 1267 - 01A6..01A7 ; DISALLOWED # LATIN LETTER YR..LATIN CAPITAL LETTER TONE T 1268 - 01A8 ; PVALID # LATIN SMALL LETTER TONE TWO 1269 - 01A9 ; DISALLOWED # LATIN CAPITAL LETTER ESH 1270 - 01AA..01AB ; PVALID # LATIN LETTER REVERSED ESH LOOP..LATIN SMALL 1271 - 01AC ; DISALLOWED # LATIN CAPITAL LETTER T WITH HOOK 1272 - 01AD ; PVALID # LATIN SMALL LETTER T WITH HOOK 1273 - 01AE..01AF ; DISALLOWED # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK.. 
1274 - 01B0 ; PVALID # LATIN SMALL LETTER U WITH HORN 1275 - 01B1..01B3 ; DISALLOWED # LATIN CAPITAL LETTER UPSILON..LATIN CAPITAL 1276 - 01B4 ; PVALID # LATIN SMALL LETTER Y WITH HOOK 1277 - 01B5 ; DISALLOWED # LATIN CAPITAL LETTER Z WITH STROKE 1278 - 01B6 ; PVALID # LATIN SMALL LETTER Z WITH STROKE 1279 - 01B7..01B8 ; DISALLOWED # LATIN CAPITAL LETTER EZH..LATIN CAPITAL LETT 1280 - 01B9..01BB ; PVALID # LATIN SMALL LETTER EZH REVERSED..LATIN LETTE 1281 - 01BC ; DISALLOWED # LATIN CAPITAL LETTER TONE FIVE 1282 - 01BD..01C3 ; PVALID # LATIN SMALL LETTER TONE FIVE..LATIN LETTER R 1283 - 01C4..01CD ; DISALLOWED # LATIN CAPITAL LETTER DZ WITH CARON..LATIN CA 1284 - 01CE ; PVALID # LATIN SMALL LETTER A WITH CARON 1285 - 01CF ; DISALLOWED # LATIN CAPITAL LETTER I WITH CARON 1286 - 01D0 ; PVALID # LATIN SMALL LETTER I WITH CARON 1287 - 1288 - 1289 - 1290 - Faltstrom Standards Track [Page 23] 1291 - 1292 - RFC 5892 IDNA Code Points August 2010 1293 - 1294 - 1295 - 01D1 ; DISALLOWED # LATIN CAPITAL LETTER O WITH CARON 1296 - 01D2 ; PVALID # LATIN SMALL LETTER O WITH CARON 1297 - 01D3 ; DISALLOWED # LATIN CAPITAL LETTER U WITH CARON 1298 - 01D4 ; PVALID # LATIN SMALL LETTER U WITH CARON 1299 - 01D5 ; DISALLOWED # LATIN CAPITAL LETTER U WITH DIAERESIS AND MA 1300 - 01D6 ; PVALID # LATIN SMALL LETTER U WITH DIAERESIS AND MACR 1301 - 01D7 ; DISALLOWED # LATIN CAPITAL LETTER U WITH DIAERESIS AND AC 1302 - 01D8 ; PVALID # LATIN SMALL LETTER U WITH DIAERESIS AND ACUT 1303 - 01D9 ; DISALLOWED # LATIN CAPITAL LETTER U WITH DIAERESIS AND CA 1304 - 01DA ; PVALID # LATIN SMALL LETTER U WITH DIAERESIS AND CARO 1305 - 01DB ; DISALLOWED # LATIN CAPITAL LETTER U WITH DIAERESIS AND GR 1306 - 01DC..01DD ; PVALID # LATIN SMALL LETTER U WITH DIAERESIS AND GRAV 1307 - 01DE ; DISALLOWED # LATIN CAPITAL LETTER A WITH DIAERESIS AND MA 1308 - 01DF ; PVALID # LATIN SMALL LETTER A WITH DIAERESIS AND MACR 1309 - 01E0 ; DISALLOWED # LATIN CAPITAL LETTER A WITH DOT ABOVE AND MA 1310 - 01E1 ; PVALID # 
LATIN SMALL LETTER A WITH DOT ABOVE AND MACR 1311 - 01E2 ; DISALLOWED # LATIN CAPITAL LETTER AE WITH MACRON 1312 - 01E3 ; PVALID # LATIN SMALL LETTER AE WITH MACRON 1313 - 01E4 ; DISALLOWED # LATIN CAPITAL LETTER G WITH STROKE 1314 - 01E5 ; PVALID # LATIN SMALL LETTER G WITH STROKE 1315 - 01E6 ; DISALLOWED # LATIN CAPITAL LETTER G WITH CARON 1316 - 01E7 ; PVALID # LATIN SMALL LETTER G WITH CARON 1317 - 01E8 ; DISALLOWED # LATIN CAPITAL LETTER K WITH CARON 1318 - 01E9 ; PVALID # LATIN SMALL LETTER K WITH CARON 1319 - 01EA ; DISALLOWED # LATIN CAPITAL LETTER O WITH OGONEK 1320 - 01EB ; PVALID # LATIN SMALL LETTER O WITH OGONEK 1321 - 01EC ; DISALLOWED # LATIN CAPITAL LETTER O WITH OGONEK AND MACRO 1322 - 01ED ; PVALID # LATIN SMALL LETTER O WITH OGONEK AND MACRON 1323 - 01EE ; DISALLOWED # LATIN CAPITAL LETTER EZH WITH CARON 1324 - 01EF..01F0 ; PVALID # LATIN SMALL LETTER EZH WITH CARON..LATIN SMA 1325 - 01F1..01F4 ; DISALLOWED # LATIN CAPITAL LETTER DZ..LATIN CAPITAL LETTE 1326 - 01F5 ; PVALID # LATIN SMALL LETTER G WITH ACUTE 1327 - 01F6..01F8 ; DISALLOWED # LATIN CAPITAL LETTER HWAIR..LATIN CAPITAL LE 1328 - 01F9 ; PVALID # LATIN SMALL LETTER N WITH GRAVE 1329 - 01FA ; DISALLOWED # LATIN CAPITAL LETTER A WITH RING ABOVE AND A 1330 - 01FB ; PVALID # LATIN SMALL LETTER A WITH RING ABOVE AND ACU 1331 - 01FC ; DISALLOWED # LATIN CAPITAL LETTER AE WITH ACUTE 1332 - 01FD ; PVALID # LATIN SMALL LETTER AE WITH ACUTE 1333 - 01FE ; DISALLOWED # LATIN CAPITAL LETTER O WITH STROKE AND ACUTE 1334 - 01FF ; PVALID # LATIN SMALL LETTER O WITH STROKE AND ACUTE 1335 - 0200 ; DISALLOWED # LATIN CAPITAL LETTER A WITH DOUBLE GRAVE 1336 - 0201 ; PVALID # LATIN SMALL LETTER A WITH DOUBLE GRAVE 1337 - 0202 ; DISALLOWED # LATIN CAPITAL LETTER A WITH INVERTED BREVE 1338 - 0203 ; PVALID # LATIN SMALL LETTER A WITH INVERTED BREVE 1339 - 0204 ; DISALLOWED # LATIN CAPITAL LETTER E WITH DOUBLE GRAVE 1340 - 0205 ; PVALID # LATIN SMALL LETTER E WITH DOUBLE GRAVE 1341 - 0206 ; DISALLOWED # LATIN 
CAPITAL LETTER E WITH INVERTED BREVE 1342 - 0207 ; PVALID # LATIN SMALL LETTER E WITH INVERTED BREVE 1343 - 1344 - 1345 - 1346 - Faltstrom Standards Track [Page 24] 1347 - 1348 - RFC 5892 IDNA Code Points August 2010 1349 - 1350 - 1351 - 0208 ; DISALLOWED # LATIN CAPITAL LETTER I WITH DOUBLE GRAVE 1352 - 0209 ; PVALID # LATIN SMALL LETTER I WITH DOUBLE GRAVE 1353 - 020A ; DISALLOWED # LATIN CAPITAL LETTER I WITH INVERTED BREVE 1354 - 020B ; PVALID # LATIN SMALL LETTER I WITH INVERTED BREVE 1355 - 020C ; DISALLOWED # LATIN CAPITAL LETTER O WITH DOUBLE GRAVE 1356 - 020D ; PVALID # LATIN SMALL LETTER O WITH DOUBLE GRAVE 1357 - 020E ; DISALLOWED # LATIN CAPITAL LETTER O WITH INVERTED BREVE 1358 - 020F ; PVALID # LATIN SMALL LETTER O WITH INVERTED BREVE 1359 - 0210 ; DISALLOWED # LATIN CAPITAL LETTER R WITH DOUBLE GRAVE 1360 - 0211 ; PVALID # LATIN SMALL LETTER R WITH DOUBLE GRAVE 1361 - 0212 ; DISALLOWED # LATIN CAPITAL LETTER R WITH INVERTED BREVE 1362 - 0213 ; PVALID # LATIN SMALL LETTER R WITH INVERTED BREVE 1363 - 0214 ; DISALLOWED # LATIN CAPITAL LETTER U WITH DOUBLE GRAVE 1364 - 0215 ; PVALID # LATIN SMALL LETTER U WITH DOUBLE GRAVE 1365 - 0216 ; DISALLOWED # LATIN CAPITAL LETTER U WITH INVERTED BREVE 1366 - 0217 ; PVALID # LATIN SMALL LETTER U WITH INVERTED BREVE 1367 - 0218 ; DISALLOWED # LATIN CAPITAL LETTER S WITH COMMA BELOW 1368 - 0219 ; PVALID # LATIN SMALL LETTER S WITH COMMA BELOW 1369 - 021A ; DISALLOWED # LATIN CAPITAL LETTER T WITH COMMA BELOW 1370 - 021B ; PVALID # LATIN SMALL LETTER T WITH COMMA BELOW 1371 - 021C ; DISALLOWED # LATIN CAPITAL LETTER YOGH 1372 - 021D ; PVALID # LATIN SMALL LETTER YOGH 1373 - 021E ; DISALLOWED # LATIN CAPITAL LETTER H WITH CARON 1374 - 021F ; PVALID # LATIN SMALL LETTER H WITH CARON 1375 - 0220 ; DISALLOWED # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG 1376 - 0221 ; PVALID # LATIN SMALL LETTER D WITH CURL 1377 - 0222 ; DISALLOWED # LATIN CAPITAL LETTER OU 1378 - 0223 ; PVALID # LATIN SMALL LETTER OU 1379 - 0224 ; 
DISALLOWED # LATIN CAPITAL LETTER Z WITH HOOK 1380 - 0225 ; PVALID # LATIN SMALL LETTER Z WITH HOOK 1381 - 0226 ; DISALLOWED # LATIN CAPITAL LETTER A WITH DOT ABOVE 1382 - 0227 ; PVALID # LATIN SMALL LETTER A WITH DOT ABOVE 1383 - 0228 ; DISALLOWED # LATIN CAPITAL LETTER E WITH CEDILLA 1384 - 0229 ; PVALID # LATIN SMALL LETTER E WITH CEDILLA 1385 - 022A ; DISALLOWED # LATIN CAPITAL LETTER O WITH DIAERESIS AND MA 1386 - 022B ; PVALID # LATIN SMALL LETTER O WITH DIAERESIS AND MACR 1387 - 022C ; DISALLOWED # LATIN CAPITAL LETTER O WITH TILDE AND MACRON 1388 - 022D ; PVALID # LATIN SMALL LETTER O WITH TILDE AND MACRON 1389 - 022E ; DISALLOWED # LATIN CAPITAL LETTER O WITH DOT ABOVE 1390 - 022F ; PVALID # LATIN SMALL LETTER O WITH DOT ABOVE 1391 - 0230 ; DISALLOWED # LATIN CAPITAL LETTER O WITH DOT ABOVE AND MA 1392 - 0231 ; PVALID # LATIN SMALL LETTER O WITH DOT ABOVE AND MACR 1393 - 0232 ; DISALLOWED # LATIN CAPITAL LETTER Y WITH MACRON 1394 - 0233..0239 ; PVALID # LATIN SMALL LETTER Y WITH MACRON..LATIN SMAL 1395 - 023A..023B ; DISALLOWED # LATIN CAPITAL LETTER A WITH STROKE..LATIN CA 1396 - 023C ; PVALID # LATIN SMALL LETTER C WITH STROKE 1397 - 023D..023E ; DISALLOWED # LATIN CAPITAL LETTER L WITH BAR..LATIN CAPIT 1398 - 023F..0240 ; PVALID # LATIN SMALL LETTER S WITH SWASH TAIL..LATIN 1399 - 1400 - 1401 - 1402 - Faltstrom Standards Track [Page 25] 1403 - 1404 - RFC 5892 IDNA Code Points August 2010 1405 - 1406 - 1407 - 0241 ; DISALLOWED # LATIN CAPITAL LETTER GLOTTAL STOP 1408 - 0242 ; PVALID # LATIN SMALL LETTER GLOTTAL STOP 1409 - 0243..0246 ; DISALLOWED # LATIN CAPITAL LETTER B WITH STROKE..LATIN CA 1410 - 0247 ; PVALID # LATIN SMALL LETTER E WITH STROKE 1411 - 0248 ; DISALLOWED # LATIN CAPITAL LETTER J WITH STROKE 1412 - 0249 ; PVALID # LATIN SMALL LETTER J WITH STROKE 1413 - 024A ; DISALLOWED # LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL 1414 - 024B ; PVALID # LATIN SMALL LETTER Q WITH HOOK TAIL 1415 - 024C ; DISALLOWED # LATIN CAPITAL LETTER R WITH STROKE 
1416 - 024D ; PVALID # LATIN SMALL LETTER R WITH STROKE 1417 - 024E ; DISALLOWED # LATIN CAPITAL LETTER Y WITH STROKE 1418 - 024F..02AF ; PVALID # LATIN SMALL LETTER Y WITH STROKE..LATIN SMAL 1419 - 02B0..02B8 ; DISALLOWED # MODIFIER LETTER SMALL H..MODIFIER LETTER SMA 1420 - 02B9..02C1 ; PVALID # MODIFIER LETTER PRIME..MODIFIER LETTER REVER 1421 - 02C2..02C5 ; DISALLOWED # MODIFIER LETTER LEFT ARROWHEAD..MODIFIER LET 1422 - 02C6..02D1 ; PVALID # MODIFIER LETTER CIRCUMFLEX ACCENT..MODIFIER 1423 - 02D2..02EB ; DISALLOWED # MODIFIER LETTER CENTRED RIGHT HALF RING..MOD 1424 - 02EC ; PVALID # MODIFIER LETTER VOICING 1425 - 02ED ; DISALLOWED # MODIFIER LETTER UNASPIRATED 1426 - 02EE ; PVALID # MODIFIER LETTER DOUBLE APOSTROPHE 1427 - 02EF..02FF ; DISALLOWED # MODIFIER LETTER LOW DOWN ARROWHEAD..MODIFIER 1428 - 0300..033F ; PVALID # COMBINING GRAVE ACCENT..COMBINING DOUBLE OVE 1429 - 0340..0341 ; DISALLOWED # COMBINING GRAVE TONE MARK..COMBINING ACUTE T 1430 - 0342 ; PVALID # COMBINING GREEK PERISPOMENI 1431 - 0343..0345 ; DISALLOWED # COMBINING GREEK KORONIS..COMBINING GREEK YPO 1432 - 0346..034E ; PVALID # COMBINING BRIDGE ABOVE..COMBINING UPWARDS AR 1433 - 034F ; DISALLOWED # COMBINING GRAPHEME JOINER 1434 - 0350..036F ; PVALID # COMBINING RIGHT ARROWHEAD ABOVE..COMBINING L 1435 - 0370 ; DISALLOWED # GREEK CAPITAL LETTER HETA 1436 - 0371 ; PVALID # GREEK SMALL LETTER HETA 1437 - 0372 ; DISALLOWED # GREEK CAPITAL LETTER ARCHAIC SAMPI 1438 - 0373 ; PVALID # GREEK SMALL LETTER ARCHAIC SAMPI 1439 - 0374 ; DISALLOWED # GREEK NUMERAL SIGN 1440 - 0375 ; CONTEXTO # GREEK LOWER NUMERAL SIGN 1441 - 0376 ; DISALLOWED # GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA 1442 - 0377 ; PVALID # GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 1443 - 0378..0379 ; UNASSIGNED # <reserved>..<reserved> 1444 - 037A ; DISALLOWED # GREEK YPOGEGRAMMENI 1445 - 037B..037D ; PVALID # GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GR 1446 - 037E ; DISALLOWED # GREEK QUESTION MARK 1447 - 037F..0383 ; UNASSIGNED # 
<reserved>..<reserved> 1448 - 0384..038A ; DISALLOWED # GREEK TONOS..GREEK CAPITAL LETTER IOTA WITH 1449 - 038B ; UNASSIGNED # <reserved> 1450 - 038C ; DISALLOWED # GREEK CAPITAL LETTER OMICRON WITH TONOS 1451 - 038D ; UNASSIGNED # <reserved> 1452 - 038E..038F ; DISALLOWED # GREEK CAPITAL LETTER UPSILON WITH TONOS..GRE 1453 - 0390 ; PVALID # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND T 1454 - 0391..03A1 ; DISALLOWED # GREEK CAPITAL LETTER ALPHA..GREEK CAPITAL LE 1455 - 1456 - 1457 - 1458 - Faltstrom Standards Track [Page 26] 1459 - 1460 - RFC 5892 IDNA Code Points August 2010 1461 - 1462 - 1463 - 03A2 ; UNASSIGNED # <reserved> 1464 - 03A3..03AB ; DISALLOWED # GREEK CAPITAL LETTER SIGMA..GREEK CAPITAL LE 1465 - 03AC..03CE ; PVALID # GREEK SMALL LETTER ALPHA WITH TONOS..GREEK S 1466 - 03CF..03D6 ; DISALLOWED # GREEK CAPITAL KAI SYMBOL..GREEK PI SYMBOL 1467 - 03D7 ; PVALID # GREEK KAI SYMBOL 1468 - 03D8 ; DISALLOWED # GREEK LETTER ARCHAIC KOPPA 1469 - 03D9 ; PVALID # GREEK SMALL LETTER ARCHAIC KOPPA 1470 - 03DA ; DISALLOWED # GREEK LETTER STIGMA 1471 - 03DB ; PVALID # GREEK SMALL LETTER STIGMA 1472 - 03DC ; DISALLOWED # GREEK LETTER DIGAMMA 1473 - 03DD ; PVALID # GREEK SMALL LETTER DIGAMMA 1474 - 03DE ; DISALLOWED # GREEK LETTER KOPPA 1475 - 03DF ; PVALID # GREEK SMALL LETTER KOPPA 1476 - 03E0 ; DISALLOWED # GREEK LETTER SAMPI 1477 - 03E1 ; PVALID # GREEK SMALL LETTER SAMPI 1478 - 03E2 ; DISALLOWED # COPTIC CAPITAL LETTER SHEI 1479 - 03E3 ; PVALID # COPTIC SMALL LETTER SHEI 1480 - 03E4 ; DISALLOWED # COPTIC CAPITAL LETTER FEI 1481 - 03E5 ; PVALID # COPTIC SMALL LETTER FEI 1482 - 03E6 ; DISALLOWED # COPTIC CAPITAL LETTER KHEI 1483 - 03E7 ; PVALID # COPTIC SMALL LETTER KHEI 1484 - 03E8 ; DISALLOWED # COPTIC CAPITAL LETTER HORI 1485 - 03E9 ; PVALID # COPTIC SMALL LETTER HORI 1486 - 03EA ; DISALLOWED # COPTIC CAPITAL LETTER GANGIA 1487 - 03EB ; PVALID # COPTIC SMALL LETTER GANGIA 1488 - 03EC ; DISALLOWED # COPTIC CAPITAL LETTER SHIMA 1489 - 03ED ; PVALID # COPTIC SMALL 
LETTER SHIMA 1490 - 03EE ; DISALLOWED # COPTIC CAPITAL LETTER DEI 1491 - 03EF ; PVALID # COPTIC SMALL LETTER DEI 1492 - 03F0..03F2 ; DISALLOWED # GREEK KAPPA SYMBOL..GREEK LUNATE SIGMA SYMBO 1493 - 03F3 ; PVALID # GREEK LETTER YOT 1494 - 03F4..03F7 ; DISALLOWED # GREEK CAPITAL THETA SYMBOL..GREEK CAPITAL LE 1495 - 03F8 ; PVALID # GREEK SMALL LETTER SHO 1496 - 03F9..03FA ; DISALLOWED # GREEK CAPITAL LUNATE SIGMA SYMBOL..GREEK CAP 1497 - 03FB..03FC ; PVALID # GREEK SMALL LETTER SAN..GREEK RHO WITH STROK 1498 - 03FD..042F ; DISALLOWED # GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL.. 1499 - 0430..045F ; PVALID # CYRILLIC SMALL LETTER A..CYRILLIC SMALL LETT 1500 - 0460 ; DISALLOWED # CYRILLIC CAPITAL LETTER OMEGA 1501 - 0461 ; PVALID # CYRILLIC SMALL LETTER OMEGA 1502 - 0462 ; DISALLOWED # CYRILLIC CAPITAL LETTER YAT 1503 - 0463 ; PVALID # CYRILLIC SMALL LETTER YAT 1504 - 0464 ; DISALLOWED # CYRILLIC CAPITAL LETTER IOTIFIED E 1505 - 0465 ; PVALID # CYRILLIC SMALL LETTER IOTIFIED E 1506 - 0466 ; DISALLOWED # CYRILLIC CAPITAL LETTER LITTLE YUS 1507 - 0467 ; PVALID # CYRILLIC SMALL LETTER LITTLE YUS 1508 - 0468 ; DISALLOWED # CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS 1509 - 0469 ; PVALID # CYRILLIC SMALL LETTER IOTIFIED LITTLE YUS 1510 - 046A ; DISALLOWED # CYRILLIC CAPITAL LETTER BIG YUS 1511 - 1512 - 1513 - 1514 - Faltstrom Standards Track [Page 27] 1515 - 1516 - RFC 5892 IDNA Code Points August 2010 1517 - 1518 - 1519 - 046B ; PVALID # CYRILLIC SMALL LETTER BIG YUS 1520 - 046C ; DISALLOWED # CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS 1521 - 046D ; PVALID # CYRILLIC SMALL LETTER IOTIFIED BIG YUS 1522 - 046E ; DISALLOWED # CYRILLIC CAPITAL LETTER KSI 1523 - 046F ; PVALID # CYRILLIC SMALL LETTER KSI 1524 - 0470 ; DISALLOWED # CYRILLIC CAPITAL LETTER PSI 1525 - 0471 ; PVALID # CYRILLIC SMALL LETTER PSI 1526 - 0472 ; DISALLOWED # CYRILLIC CAPITAL LETTER FITA 1527 - 0473 ; PVALID # CYRILLIC SMALL LETTER FITA 1528 - 0474 ; DISALLOWED # CYRILLIC CAPITAL LETTER IZHITSA 1529 - 0475 
; PVALID # CYRILLIC SMALL LETTER IZHITSA 1530 - 0476 ; DISALLOWED # CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE 1531 - 0477 ; PVALID # CYRILLIC SMALL LETTER IZHITSA WITH DOUBLE GR 1532 - 0478 ; DISALLOWED # CYRILLIC CAPITAL LETTER UK 1533 - 0479 ; PVALID # CYRILLIC SMALL LETTER UK 1534 - 047A ; DISALLOWED # CYRILLIC CAPITAL LETTER ROUND OMEGA 1535 - 047B ; PVALID # CYRILLIC SMALL LETTER ROUND OMEGA 1536 - 047C ; DISALLOWED # CYRILLIC CAPITAL LETTER OMEGA WITH TITLO 1537 - 047D ; PVALID # CYRILLIC SMALL LETTER OMEGA WITH TITLO 1538 - 047E ; DISALLOWED # CYRILLIC CAPITAL LETTER OT 1539 - 047F ; PVALID # CYRILLIC SMALL LETTER OT 1540 - 0480 ; DISALLOWED # CYRILLIC CAPITAL LETTER KOPPA 1541 - 0481 ; PVALID # CYRILLIC SMALL LETTER KOPPA 1542 - 0482 ; DISALLOWED # CYRILLIC THOUSANDS SIGN 1543 - 0483..0487 ; PVALID # COMBINING CYRILLIC TITLO..COMBINING CYRILLIC 1544 - 0488..048A ; DISALLOWED # COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..C 1545 - 048B ; PVALID # CYRILLIC SMALL LETTER SHORT I WITH TAIL 1546 - 048C ; DISALLOWED # CYRILLIC CAPITAL LETTER SEMISOFT SIGN 1547 - 048D ; PVALID # CYRILLIC SMALL LETTER SEMISOFT SIGN 1548 - 048E ; DISALLOWED # CYRILLIC CAPITAL LETTER ER WITH TICK 1549 - 048F ; PVALID # CYRILLIC SMALL LETTER ER WITH TICK 1550 - 0490 ; DISALLOWED # CYRILLIC CAPITAL LETTER GHE WITH UPTURN 1551 - 0491 ; PVALID # CYRILLIC SMALL LETTER GHE WITH UPTURN 1552 - 0492 ; DISALLOWED # CYRILLIC CAPITAL LETTER GHE WITH STROKE 1553 - 0493 ; PVALID # CYRILLIC SMALL LETTER GHE WITH STROKE 1554 - 0494 ; DISALLOWED # CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK 1555 - 0495 ; PVALID # CYRILLIC SMALL LETTER GHE WITH MIDDLE HOOK 1556 - 0496 ; DISALLOWED # CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER 1557 - 0497 ; PVALID # CYRILLIC SMALL LETTER ZHE WITH DESCENDER 1558 - 0498 ; DISALLOWED # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER 1559 - 0499 ; PVALID # CYRILLIC SMALL LETTER ZE WITH DESCENDER 1560 - 049A ; DISALLOWED # CYRILLIC CAPITAL LETTER KA WITH DESCENDER 1561 - 049B ; 
PVALID # CYRILLIC SMALL LETTER KA WITH DESCENDER 1562 - 049C ; DISALLOWED # CYRILLIC CAPITAL LETTER KA WITH VERTICAL STR 1563 - 049D ; PVALID # CYRILLIC SMALL LETTER KA WITH VERTICAL STROK 1564 - 049E ; DISALLOWED # CYRILLIC CAPITAL LETTER KA WITH STROKE 1565 - 049F ; PVALID # CYRILLIC SMALL LETTER KA WITH STROKE 1566 - 04A0 ; DISALLOWED # CYRILLIC CAPITAL LETTER BASHKIR KA 1567 - 1568 - 1569 - 1570 - Faltstrom Standards Track [Page 28] 1571 - 1572 - RFC 5892 IDNA Code Points August 2010 1573 - 1574 - 1575 - 04A1 ; PVALID # CYRILLIC SMALL LETTER BASHKIR KA 1576 - 04A2 ; DISALLOWED # CYRILLIC CAPITAL LETTER EN WITH DESCENDER 1577 - 04A3 ; PVALID # CYRILLIC SMALL LETTER EN WITH DESCENDER 1578 - 04A4 ; DISALLOWED # CYRILLIC CAPITAL LIGATURE EN GHE 1579 - 04A5 ; PVALID # CYRILLIC SMALL LIGATURE EN GHE 1580 - 04A6 ; DISALLOWED # CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK 1581 - 04A7 ; PVALID # CYRILLIC SMALL LETTER PE WITH MIDDLE HOOK 1582 - 04A8 ; DISALLOWED # CYRILLIC CAPITAL LETTER ABKHASIAN HA 1583 - 04A9 ; PVALID # CYRILLIC SMALL LETTER ABKHASIAN HA 1584 - 04AA ; DISALLOWED # CYRILLIC CAPITAL LETTER ES WITH DESCENDER 1585 - 04AB ; PVALID # CYRILLIC SMALL LETTER ES WITH DESCENDER 1586 - 04AC ; DISALLOWED # CYRILLIC CAPITAL LETTER TE WITH DESCENDER 1587 - 04AD ; PVALID # CYRILLIC SMALL LETTER TE WITH DESCENDER 1588 - 04AE ; DISALLOWED # CYRILLIC CAPITAL LETTER STRAIGHT U 1589 - 04AF ; PVALID # CYRILLIC SMALL LETTER STRAIGHT U 1590 - 04B0 ; DISALLOWED # CYRILLIC CAPITAL LETTER STRAIGHT U WITH STRO 1591 - 04B1 ; PVALID # CYRILLIC SMALL LETTER STRAIGHT U WITH STROKE 1592 - 04B2 ; DISALLOWED # CYRILLIC CAPITAL LETTER HA WITH DESCENDER 1593 - 04B3 ; PVALID # CYRILLIC SMALL LETTER HA WITH DESCENDER 1594 - 04B4 ; DISALLOWED # CYRILLIC CAPITAL LIGATURE TE TSE 1595 - 04B5 ; PVALID # CYRILLIC SMALL LIGATURE TE TSE 1596 - 04B6 ; DISALLOWED # CYRILLIC CAPITAL LETTER CHE WITH DESCENDER 1597 - 04B7 ; PVALID # CYRILLIC SMALL LETTER CHE WITH DESCENDER 1598 - 04B8 ; DISALLOWED # 
CYRILLIC CAPITAL LETTER CHE WITH VERTICAL ST 1599 - 04B9 ; PVALID # CYRILLIC SMALL LETTER CHE WITH VERTICAL STRO 1600 - 04BA ; DISALLOWED # CYRILLIC CAPITAL LETTER SHHA 1601 - 04BB ; PVALID # CYRILLIC SMALL LETTER SHHA 1602 - 04BC ; DISALLOWED # CYRILLIC CAPITAL LETTER ABKHASIAN CHE 1603 - 04BD ; PVALID # CYRILLIC SMALL LETTER ABKHASIAN CHE 1604 - 04BE ; DISALLOWED # CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH D 1605 - 04BF ; PVALID # CYRILLIC SMALL LETTER ABKHASIAN CHE WITH DES 1606 - 04C0..04C1 ; DISALLOWED # CYRILLIC LETTER PALOCHKA..CYRILLIC CAPITAL L 1607 - 04C2 ; PVALID # CYRILLIC SMALL LETTER ZHE WITH BREVE 1608 - 04C3 ; DISALLOWED # CYRILLIC CAPITAL LETTER KA WITH HOOK 1609 - 04C4 ; PVALID # CYRILLIC SMALL LETTER KA WITH HOOK 1610 - 04C5 ; DISALLOWED # CYRILLIC CAPITAL LETTER EL WITH TAIL 1611 - 04C6 ; PVALID # CYRILLIC SMALL LETTER EL WITH TAIL 1612 - 04C7 ; DISALLOWED # CYRILLIC CAPITAL LETTER EN WITH HOOK 1613 - 04C8 ; PVALID # CYRILLIC SMALL LETTER EN WITH HOOK 1614 - 04C9 ; DISALLOWED # CYRILLIC CAPITAL LETTER EN WITH TAIL 1615 - 04CA ; PVALID # CYRILLIC SMALL LETTER EN WITH TAIL 1616 - 04CB ; DISALLOWED # CYRILLIC CAPITAL LETTER KHAKASSIAN CHE 1617 - 04CC ; PVALID # CYRILLIC SMALL LETTER KHAKASSIAN CHE 1618 - 04CD ; DISALLOWED # CYRILLIC CAPITAL LETTER EM WITH TAIL 1619 - 04CE..04CF ; PVALID # CYRILLIC SMALL LETTER EM WITH TAIL..CYRILLIC 1620 - 04D0 ; DISALLOWED # CYRILLIC CAPITAL LETTER A WITH BREVE 1621 - 04D1 ; PVALID # CYRILLIC SMALL LETTER A WITH BREVE 1622 - 04D2 ; DISALLOWED # CYRILLIC CAPITAL LETTER A WITH DIAERESIS 1623 - 1624 - 1625 - 1626 - Faltstrom Standards Track [Page 29] 1627 - 1628 - RFC 5892 IDNA Code Points August 2010 1629 - 1630 - 1631 - 04D3 ; PVALID # CYRILLIC SMALL LETTER A WITH DIAERESIS 1632 - 04D4 ; DISALLOWED # CYRILLIC CAPITAL LIGATURE A IE 1633 - 04D5 ; PVALID # CYRILLIC SMALL LIGATURE A IE 1634 - 04D6 ; DISALLOWED # CYRILLIC CAPITAL LETTER IE WITH BREVE 1635 - 04D7 ; PVALID # CYRILLIC SMALL LETTER IE WITH BREVE 1636 - 
04D8 ; DISALLOWED # CYRILLIC CAPITAL LETTER SCHWA 1637 - 04D9 ; PVALID # CYRILLIC SMALL LETTER SCHWA 1638 - 04DA ; DISALLOWED # CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS 1639 - 04DB ; PVALID # CYRILLIC SMALL LETTER SCHWA WITH DIAERESIS 1640 - 04DC ; DISALLOWED # CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS 1641 - 04DD ; PVALID # CYRILLIC SMALL LETTER ZHE WITH DIAERESIS 1642 - 04DE ; DISALLOWED # CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS 1643 - 04DF ; PVALID # CYRILLIC SMALL LETTER ZE WITH DIAERESIS 1644 - 04E0 ; DISALLOWED # CYRILLIC CAPITAL LETTER ABKHASIAN DZE 1645 - 04E1 ; PVALID # CYRILLIC SMALL LETTER ABKHASIAN DZE 1646 - 04E2 ; DISALLOWED # CYRILLIC CAPITAL LETTER I WITH MACRON 1647 - 04E3 ; PVALID # CYRILLIC SMALL LETTER I WITH MACRON 1648 - 04E4 ; DISALLOWED # CYRILLIC CAPITAL LETTER I WITH DIAERESIS 1649 - 04E5 ; PVALID # CYRILLIC SMALL LETTER I WITH DIAERESIS 1650 - 04E6 ; DISALLOWED # CYRILLIC CAPITAL LETTER O WITH DIAERESIS 1651 - 04E7 ; PVALID # CYRILLIC SMALL LETTER O WITH DIAERESIS 1652 - 04E8 ; DISALLOWED # CYRILLIC CAPITAL LETTER BARRED O 1653 - 04E9 ; PVALID # CYRILLIC SMALL LETTER BARRED O 1654 - 04EA ; DISALLOWED # CYRILLIC CAPITAL LETTER BARRED O WITH DIAERE 1655 - 04EB ; PVALID # CYRILLIC SMALL LETTER BARRED O WITH DIAERESI 1656 - 04EC ; DISALLOWED # CYRILLIC CAPITAL LETTER E WITH DIAERESIS 1657 - 04ED ; PVALID # CYRILLIC SMALL LETTER E WITH DIAERESIS 1658 - 04EE ; DISALLOWED # CYRILLIC CAPITAL LETTER U WITH MACRON 1659 - 04EF ; PVALID # CYRILLIC SMALL LETTER U WITH MACRON 1660 - 04F0 ; DISALLOWED # CYRILLIC CAPITAL LETTER U WITH DIAERESIS 1661 - 04F1 ; PVALID # CYRILLIC SMALL LETTER U WITH DIAERESIS 1662 - 04F2 ; DISALLOWED # CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE 1663 - 04F3 ; PVALID # CYRILLIC SMALL LETTER U WITH DOUBLE ACUTE 1664 - 04F4 ; DISALLOWED # CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS 1665 - 04F5 ; PVALID # CYRILLIC SMALL LETTER CHE WITH DIAERESIS 1666 - 04F6 ; DISALLOWED # CYRILLIC CAPITAL LETTER GHE WITH DESCENDER 1667 - 
04F7 ; PVALID # CYRILLIC SMALL LETTER GHE WITH DESCENDER 1668 - 04F8 ; DISALLOWED # CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS 1669 - 04F9 ; PVALID # CYRILLIC SMALL LETTER YERU WITH DIAERESIS 1670 - 04FA ; DISALLOWED # CYRILLIC CAPITAL LETTER GHE WITH STROKE AND 1671 - 04FB ; PVALID # CYRILLIC SMALL LETTER GHE WITH STROKE AND HO 1672 - 04FC ; DISALLOWED # CYRILLIC CAPITAL LETTER HA WITH HOOK 1673 - 04FD ; PVALID # CYRILLIC SMALL LETTER HA WITH HOOK 1674 - 04FE ; DISALLOWED # CYRILLIC CAPITAL LETTER HA WITH STROKE 1675 - 04FF ; PVALID # CYRILLIC SMALL LETTER HA WITH STROKE 1676 - 0500 ; DISALLOWED # CYRILLIC CAPITAL LETTER KOMI DE 1677 - 0501 ; PVALID # CYRILLIC SMALL LETTER KOMI DE 1678 - 0502 ; DISALLOWED # CYRILLIC CAPITAL LETTER KOMI DJE 1679 - 1680 - 1681 - 1682 - Faltstrom Standards Track [Page 30] 1683 - 1684 - RFC 5892 IDNA Code Points August 2010 1685 - 1686 - 1687 - 0503 ; PVALID # CYRILLIC SMALL LETTER KOMI DJE 1688 - 0504 ; DISALLOWED # CYRILLIC CAPITAL LETTER KOMI ZJE 1689 - 0505 ; PVALID # CYRILLIC SMALL LETTER KOMI ZJE 1690 - 0506 ; DISALLOWED # CYRILLIC CAPITAL LETTER KOMI DZJE 1691 - 0507 ; PVALID # CYRILLIC SMALL LETTER KOMI DZJE 1692 - 0508 ; DISALLOWED # CYRILLIC CAPITAL LETTER KOMI LJE 1693 - 0509 ; PVALID # CYRILLIC SMALL LETTER KOMI LJE 1694 - 050A ; DISALLOWED # CYRILLIC CAPITAL LETTER KOMI NJE 1695 - 050B ; PVALID # CYRILLIC SMALL LETTER KOMI NJE 1696 - 050C ; DISALLOWED # CYRILLIC CAPITAL LETTER KOMI SJE 1697 - 050D ; PVALID # CYRILLIC SMALL LETTER KOMI SJE 1698 - 050E ; DISALLOWED # CYRILLIC CAPITAL LETTER KOMI TJE 1699 - 050F ; PVALID # CYRILLIC SMALL LETTER KOMI TJE 1700 - 0510 ; DISALLOWED # CYRILLIC CAPITAL LETTER REVERSED ZE 1701 - 0511 ; PVALID # CYRILLIC SMALL LETTER REVERSED ZE 1702 - 0512 ; DISALLOWED # CYRILLIC CAPITAL LETTER EL WITH HOOK 1703 - 0513 ; PVALID # CYRILLIC SMALL LETTER EL WITH HOOK 1704 - 0514 ; DISALLOWED # CYRILLIC CAPITAL LETTER LHA 1705 - 0515 ; PVALID # CYRILLIC SMALL LETTER LHA 1706 - 0516 ; DISALLOWED # 
CYRILLIC CAPITAL LETTER RHA 1707 - 0517 ; PVALID # CYRILLIC SMALL LETTER RHA 1708 - 0518 ; DISALLOWED # CYRILLIC CAPITAL LETTER YAE 1709 - 0519 ; PVALID # CYRILLIC SMALL LETTER YAE 1710 - 051A ; DISALLOWED # CYRILLIC CAPITAL LETTER QA 1711 - 051B ; PVALID # CYRILLIC SMALL LETTER QA 1712 - 051C ; DISALLOWED # CYRILLIC CAPITAL LETTER WE 1713 - 051D ; PVALID # CYRILLIC SMALL LETTER WE 1714 - 051E ; DISALLOWED # CYRILLIC CAPITAL LETTER ALEUT KA 1715 - 051F ; PVALID # CYRILLIC SMALL LETTER ALEUT KA 1716 - 0520 ; DISALLOWED # CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK 1717 - 0521 ; PVALID # CYRILLIC SMALL LETTER EL WITH MIDDLE HOOK 1718 - 0522 ; DISALLOWED # CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK 1719 - 0523 ; PVALID # CYRILLIC SMALL LETTER EN WITH MIDDLE HOOK 1720 - 0524 ; DISALLOWED # CYRILLIC CAPITAL LETTER PE WITH DESCENDER 1721 - 0525 ; PVALID # CYRILLIC SMALL LETTER PE WITH DESCENDER 1722 - 0526..0530 ; UNASSIGNED # <reserved>..<reserved> 1723 - 0531..0556 ; DISALLOWED # ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITA 1724 - 0557..0558 ; UNASSIGNED # <reserved>..<reserved> 1725 - 0559 ; PVALID # ARMENIAN MODIFIER LETTER LEFT HALF RING 1726 - 055A..055F ; DISALLOWED # ARMENIAN APOSTROPHE..ARMENIAN ABBREVIATION M 1727 - 0560 ; UNASSIGNED # <reserved> 1728 - 0561..0586 ; PVALID # ARMENIAN SMALL LETTER AYB..ARMENIAN SMALL LE 1729 - 0587 ; DISALLOWED # ARMENIAN SMALL LIGATURE ECH YIWN 1730 - 0588 ; UNASSIGNED # <reserved> 1731 - 0589..058A ; DISALLOWED # ARMENIAN FULL STOP..ARMENIAN HYPHEN 1732 - 058B..0590 ; UNASSIGNED # <reserved>..<reserved> 1733 - 0591..05BD ; PVALID # HEBREW ACCENT ETNAHTA..HEBREW POINT METEG 1734 - 05BE ; DISALLOWED # HEBREW PUNCTUATION MAQAF 1735 - 1736 - 1737 - 1738 - Faltstrom Standards Track [Page 31] 1739 - 1740 - RFC 5892 IDNA Code Points August 2010 1741 - 1742 - 1743 - 05BF ; PVALID # HEBREW POINT RAFE 1744 - 05C0 ; DISALLOWED # HEBREW PUNCTUATION PASEQ 1745 - 05C1..05C2 ; PVALID # HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT 1746 - 05C3 
; DISALLOWED # HEBREW PUNCTUATION SOF PASUQ 1747 - 05C4..05C5 ; PVALID # HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT 1748 - 05C6 ; DISALLOWED # HEBREW PUNCTUATION NUN HAFUKHA 1749 - 05C7 ; PVALID # HEBREW POINT QAMATS QATAN 1750 - 05C8..05CF ; UNASSIGNED # <reserved>..<reserved> 1751 - 05D0..05EA ; PVALID # HEBREW LETTER ALEF..HEBREW LETTER TAV 1752 - 05EB..05EF ; UNASSIGNED # <reserved>..<reserved> 1753 - 05F0..05F2 ; PVALID # HEBREW LIGATURE YIDDISH DOUBLE VAV..HEBREW L 1754 - 05F3..05F4 ; CONTEXTO # HEBREW PUNCTUATION GERESH..HEBREW PUNCTUATIO 1755 - 05F5..05FF ; UNASSIGNED # <reserved>..<reserved> 1756 - 0600..0603 ; DISALLOWED # ARABIC NUMBER SIGN..ARABIC SIGN SAFHA 1757 - 0604..0605 ; UNASSIGNED # <reserved>..<reserved> 1758 - 0606..060F ; DISALLOWED # ARABIC-INDIC CUBE ROOT..ARABIC SIGN MISRA 1759 - 0610..061A ; PVALID # ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..AR 1760 - 061B ; DISALLOWED # ARABIC SEMICOLON 1761 - 061C..061D ; UNASSIGNED # <reserved>..<reserved> 1762 - 061E..061F ; DISALLOWED # ARABIC TRIPLE DOT PUNCTUATION MARK..ARABIC Q 1763 - 0620 ; UNASSIGNED # <reserved> 1764 - 0621..063F ; PVALID # ARABIC LETTER HAMZA..ARABIC LETTER FARSI YEH 1765 - 0640 ; DISALLOWED # ARABIC TATWEEL 1766 - 0641..065E ; PVALID # ARABIC LETTER FEH..ARABIC FATHA WITH TWO DOT 1767 - 065F ; UNASSIGNED # <reserved> 1768 - 0660..0669 ; CONTEXTO # ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT 1769 - 066A..066D ; DISALLOWED # ARABIC PERCENT SIGN..ARABIC FIVE POINTED STA 1770 - 066E..0674 ; PVALID # ARABIC LETTER DOTLESS BEH..ARABIC LETTER HIG 1771 - 0675..0678 ; DISALLOWED # ARABIC LETTER HIGH HAMZA ALEF..ARABIC LETTER 1772 - 0679..06D3 ; PVALID # ARABIC LETTER TTEH..ARABIC LETTER YEH BARREE 1773 - 06D4 ; DISALLOWED # ARABIC FULL STOP 1774 - 06D5..06DC ; PVALID # ARABIC LETTER AE..ARABIC SMALL HIGH SEEN 1775 - 06DD..06DE ; DISALLOWED # ARABIC END OF AYAH..ARABIC START OF RUB EL H 1776 - 06DF..06E8 ; PVALID # ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL 1777 - 06E9 ; DISALLOWED 
# ARABIC PLACE OF SAJDAH 1778 - 06EA..06EF ; PVALID # ARABIC EMPTY CENTRE LOW STOP..ARABIC LETTER 1779 - 06F0..06F9 ; CONTEXTO # EXTENDED ARABIC-INDIC DIGIT ZERO..EXTENDED A 1780 - 06FA..06FF ; PVALID # ARABIC LETTER SHEEN WITH DOT BELOW..ARABIC L 1781 - 0700..070D ; DISALLOWED # SYRIAC END OF PARAGRAPH..SYRIAC HARKLEAN AST 1782 - 070E ; UNASSIGNED # <reserved> 1783 - 070F ; DISALLOWED # SYRIAC ABBREVIATION MARK 1784 - 0710..074A ; PVALID # SYRIAC LETTER ALAPH..SYRIAC BARREKH 1785 - 074B..074C ; UNASSIGNED # <reserved>..<reserved> 1786 - 074D..07B1 ; PVALID # SYRIAC LETTER SOGDIAN ZHAIN..THAANA LETTER N 1787 - 07B2..07BF ; UNASSIGNED # <reserved>..<reserved> 1788 - 07C0..07F5 ; PVALID # NKO DIGIT ZERO..NKO LOW TONE APOSTROPHE 1789 - 07F6..07FA ; DISALLOWED # NKO SYMBOL OO DENNEN..NKO LAJANYALAN 1790 - 07FB..07FF ; UNASSIGNED # <reserved>..<reserved> 1791 - 1792 - 1793 - 1794 - Faltstrom Standards Track [Page 32] 1795 - 1796 - RFC 5892 IDNA Code Points August 2010 1797 - 1798 - 1799 - 0800..082D ; PVALID # SAMARITAN LETTER ALAF..SAMARITAN MARK NEQUDA 1800 - 082E..082F ; UNASSIGNED # <reserved>..<reserved> 1801 - 0830..083E ; DISALLOWED # SAMARITAN PUNCTUATION NEQUDAA..SAMARITAN PUN 1802 - 083F..08FF ; UNASSIGNED # <reserved>..<reserved> 1803 - 0900..0939 ; PVALID # DEVANAGARI SIGN INVERTED CANDRABINDU..DEVANA 1804 - 093A..093B ; UNASSIGNED # <reserved>..<reserved> 1805 - 093C..094E ; PVALID # DEVANAGARI SIGN NUKTA..DEVANAGARI VOWEL SIGN 1806 - 094F ; UNASSIGNED # <reserved> 1807 - 0950..0955 ; PVALID # DEVANAGARI OM..DEVANAGARI VOWEL SIGN CANDRA 1808 - 0956..0957 ; UNASSIGNED # <reserved>..<reserved> 1809 - 0958..095F ; DISALLOWED # DEVANAGARI LETTER QA..DEVANAGARI LETTER YYA 1810 - 0960..0963 ; PVALID # DEVANAGARI LETTER VOCALIC RR..DEVANAGARI VOW 1811 - 0964..0965 ; DISALLOWED # DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA 1812 - 0966..096F ; PVALID # DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE 1813 - 0970 ; DISALLOWED # DEVANAGARI ABBREVIATION SIGN 1814 - 
0971..0972 ; PVALID # DEVANAGARI SIGN HIGH SPACING DOT..DEVANAGARI 1815 - 0973..0978 ; UNASSIGNED # <reserved>..<reserved> 1816 - 0979..097F ; PVALID # DEVANAGARI LETTER ZHA..DEVANAGARI LETTER BBA 1817 - 0980 ; UNASSIGNED # <reserved> 1818 - 0981..0983 ; PVALID # BENGALI SIGN CANDRABINDU..BENGALI SIGN VISAR 1819 - 0984 ; UNASSIGNED # <reserved> 1820 - 0985..098C ; PVALID # BENGALI LETTER A..BENGALI LETTER VOCALIC L 1821 - 098D..098E ; UNASSIGNED # <reserved>..<reserved> 1822 - 098F..0990 ; PVALID # BENGALI LETTER E..BENGALI LETTER AI 1823 - 0991..0992 ; UNASSIGNED # <reserved>..<reserved> 1824 - 0993..09A8 ; PVALID # BENGALI LETTER O..BENGALI LETTER NA 1825 - 09A9 ; UNASSIGNED # <reserved> 1826 - 09AA..09B0 ; PVALID # BENGALI LETTER PA..BENGALI LETTER RA 1827 - 09B1 ; UNASSIGNED # <reserved> 1828 - 09B2 ; PVALID # BENGALI LETTER LA 1829 - 09B3..09B5 ; UNASSIGNED # <reserved>..<reserved> 1830 - 09B6..09B9 ; PVALID # BENGALI LETTER SHA..BENGALI LETTER HA 1831 - 09BA..09BB ; UNASSIGNED # <reserved>..<reserved> 1832 - 09BC..09C4 ; PVALID # BENGALI SIGN NUKTA..BENGALI VOWEL SIGN VOCAL 1833 - 09C5..09C6 ; UNASSIGNED # <reserved>..<reserved> 1834 - 09C7..09C8 ; PVALID # BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI 1835 - 09C9..09CA ; UNASSIGNED # <reserved>..<reserved> 1836 - 09CB..09CE ; PVALID # BENGALI VOWEL SIGN O..BENGALI LETTER KHANDA 1837 - 09CF..09D6 ; UNASSIGNED # <reserved>..<reserved> 1838 - 09D7 ; PVALID # BENGALI AU LENGTH MARK 1839 - 09D8..09DB ; UNASSIGNED # <reserved>..<reserved> 1840 - 09DC..09DD ; DISALLOWED # BENGALI LETTER RRA..BENGALI LETTER RHA 1841 - 09DE ; UNASSIGNED # <reserved> 1842 - 09DF ; DISALLOWED # BENGALI LETTER YYA 1843 - 09E0..09E3 ; PVALID # BENGALI LETTER VOCALIC RR..BENGALI VOWEL SIG 1844 - 09E4..09E5 ; UNASSIGNED # <reserved>..<reserved> 1845 - 09E6..09F1 ; PVALID # BENGALI DIGIT ZERO..BENGALI LETTER RA WITH L 1846 - 09F2..09FB ; DISALLOWED # BENGALI RUPEE MARK..BENGALI GANDA MARK 1847 - 1848 - 1849 - 1850 - Faltstrom Standards Track 
[Page 33] 1851 - 1852 - RFC 5892 IDNA Code Points August 2010 1853 - 1854 - 1855 - 09FC..0A00 ; UNASSIGNED # <reserved>..<reserved> 1856 - 0A01..0A03 ; PVALID # GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN VISA 1857 - 0A04 ; UNASSIGNED # <reserved> 1858 - 0A05..0A0A ; PVALID # GURMUKHI LETTER A..GURMUKHI LETTER UU 1859 - 0A0B..0A0E ; UNASSIGNED # <reserved>..<reserved> 1860 - 0A0F..0A10 ; PVALID # GURMUKHI LETTER EE..GURMUKHI LETTER AI 1861 - 0A11..0A12 ; UNASSIGNED # <reserved>..<reserved> 1862 - 0A13..0A28 ; PVALID # GURMUKHI LETTER OO..GURMUKHI LETTER NA 1863 - 0A29 ; UNASSIGNED # <reserved> 1864 - 0A2A..0A30 ; PVALID # GURMUKHI LETTER PA..GURMUKHI LETTER RA 1865 - 0A31 ; UNASSIGNED # <reserved> 1866 - 0A32 ; PVALID # GURMUKHI LETTER LA 1867 - 0A33 ; DISALLOWED # GURMUKHI LETTER LLA 1868 - 0A34 ; UNASSIGNED # <reserved> 1869 - 0A35 ; PVALID # GURMUKHI LETTER VA 1870 - 0A36 ; DISALLOWED # GURMUKHI LETTER SHA 1871 - 0A37 ; UNASSIGNED # <reserved> 1872 - 0A38..0A39 ; PVALID # GURMUKHI LETTER SA..GURMUKHI LETTER HA 1873 - 0A3A..0A3B ; UNASSIGNED # <reserved>..<reserved> 1874 - 0A3C ; PVALID # GURMUKHI SIGN NUKTA 1875 - 0A3D ; UNASSIGNED # <reserved> 1876 - 0A3E..0A42 ; PVALID # GURMUKHI VOWEL SIGN AA..GURMUKHI VOWEL SIGN 1877 - 0A43..0A46 ; UNASSIGNED # <reserved>..<reserved> 1878 - 0A47..0A48 ; PVALID # GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN 1879 - 0A49..0A4A ; UNASSIGNED # <reserved>..<reserved> 1880 - 0A4B..0A4D ; PVALID # GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA 1881 - 0A4E..0A50 ; UNASSIGNED # <reserved>..<reserved> 1882 - 0A51 ; PVALID # GURMUKHI SIGN UDAAT 1883 - 0A52..0A58 ; UNASSIGNED # <reserved>..<reserved> 1884 - 0A59..0A5B ; DISALLOWED # GURMUKHI LETTER KHHA..GURMUKHI LETTER ZA 1885 - 0A5C ; PVALID # GURMUKHI LETTER RRA 1886 - 0A5D ; UNASSIGNED # <reserved> 1887 - 0A5E ; DISALLOWED # GURMUKHI LETTER FA 1888 - 0A5F..0A65 ; UNASSIGNED # <reserved>..<reserved> 1889 - 0A66..0A75 ; PVALID # GURMUKHI DIGIT ZERO..GURMUKHI SIGN YAKASH 1890 - 0A76..0A80 ; 
UNASSIGNED # <reserved>..<reserved> 1891 - 0A81..0A83 ; PVALID # GUJARATI SIGN CANDRABINDU..GUJARATI SIGN VIS 1892 - 0A84 ; UNASSIGNED # <reserved> 1893 - 0A85..0A8D ; PVALID # GUJARATI LETTER A..GUJARATI VOWEL CANDRA E 1894 - 0A8E ; UNASSIGNED # <reserved> 1895 - 0A8F..0A91 ; PVALID # GUJARATI LETTER E..GUJARATI VOWEL CANDRA O 1896 - 0A92 ; UNASSIGNED # <reserved> 1897 - 0A93..0AA8 ; PVALID # GUJARATI LETTER O..GUJARATI LETTER NA 1898 - 0AA9 ; UNASSIGNED # <reserved> 1899 - 0AAA..0AB0 ; PVALID # GUJARATI LETTER PA..GUJARATI LETTER RA 1900 - 0AB1 ; UNASSIGNED # <reserved> 1901 - 0AB2..0AB3 ; PVALID # GUJARATI LETTER LA..GUJARATI LETTER LLA 1902 - 0AB4 ; UNASSIGNED # <reserved> 1903 - 1904 - 1905 - 1906 - Faltstrom Standards Track [Page 34] 1907 - 1908 - RFC 5892 IDNA Code Points August 2010 1909 - 1910 - 1911 - 0AB5..0AB9 ; PVALID # GUJARATI LETTER VA..GUJARATI LETTER HA 1912 - 0ABA..0ABB ; UNASSIGNED # <reserved>..<reserved> 1913 - 0ABC..0AC5 ; PVALID # GUJARATI SIGN NUKTA..GUJARATI VOWEL SIGN CAN 1914 - 0AC6 ; UNASSIGNED # <reserved> 1915 - 0AC7..0AC9 ; PVALID # GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN C 1916 - 0ACA ; UNASSIGNED # <reserved> 1917 - 0ACB..0ACD ; PVALID # GUJARATI VOWEL SIGN O..GUJARATI SIGN VIRAMA 1918 - 0ACE..0ACF ; UNASSIGNED # <reserved>..<reserved> 1919 - 0AD0 ; PVALID # GUJARATI OM 1920 - 0AD1..0ADF ; UNASSIGNED # <reserved>..<reserved> 1921 - 0AE0..0AE3 ; PVALID # GUJARATI LETTER VOCALIC RR..GUJARATI VOWEL S 1922 - 0AE4..0AE5 ; UNASSIGNED # <reserved>..<reserved> 1923 - 0AE6..0AEF ; PVALID # GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE 1924 - 0AF0 ; UNASSIGNED # <reserved> 1925 - 0AF1 ; DISALLOWED # GUJARATI RUPEE SIGN 1926 - 0AF2..0B00 ; UNASSIGNED # <reserved>..<reserved> 1927 - 0B01..0B03 ; PVALID # ORIYA SIGN CANDRABINDU..ORIYA SIGN VISARGA 1928 - 0B04 ; UNASSIGNED # <reserved> 1929 - 0B05..0B0C ; PVALID # ORIYA LETTER A..ORIYA LETTER VOCALIC L 1930 - 0B0D..0B0E ; UNASSIGNED # <reserved>..<reserved> 1931 - 0B0F..0B10 ; PVALID # ORIYA 
LETTER E..ORIYA LETTER AI 1932 - 0B11..0B12 ; UNASSIGNED # <reserved>..<reserved> 1933 - 0B13..0B28 ; PVALID # ORIYA LETTER O..ORIYA LETTER NA 1934 - 0B29 ; UNASSIGNED # <reserved> 1935 - 0B2A..0B30 ; PVALID # ORIYA LETTER PA..ORIYA LETTER RA 1936 - 0B31 ; UNASSIGNED # <reserved> 1937 - 0B32..0B33 ; PVALID # ORIYA LETTER LA..ORIYA LETTER LLA 1938 - 0B34 ; UNASSIGNED # <reserved> 1939 - 0B35..0B39 ; PVALID # ORIYA LETTER VA..ORIYA LETTER HA 1940 - 0B3A..0B3B ; UNASSIGNED # <reserved>..<reserved> 1941 - 0B3C..0B44 ; PVALID # ORIYA SIGN NUKTA..ORIYA VOWEL SIGN VOCALIC R 1942 - 0B45..0B46 ; UNASSIGNED # <reserved>..<reserved> 1943 - 0B47..0B48 ; PVALID # ORIYA VOWEL SIGN E..ORIYA VOWEL SIGN AI 1944 - 0B49..0B4A ; UNASSIGNED # <reserved>..<reserved> 1945 - 0B4B..0B4D ; PVALID # ORIYA VOWEL SIGN O..ORIYA SIGN VIRAMA 1946 - 0B4E..0B55 ; UNASSIGNED # <reserved>..<reserved> 1947 - 0B56..0B57 ; PVALID # ORIYA AI LENGTH MARK..ORIYA AU LENGTH MARK 1948 - 0B58..0B5B ; UNASSIGNED # <reserved>..<reserved> 1949 - 0B5C..0B5D ; DISALLOWED # ORIYA LETTER RRA..ORIYA LETTER RHA 1950 - 0B5E ; UNASSIGNED # <reserved> 1951 - 0B5F..0B63 ; PVALID # ORIYA LETTER YYA..ORIYA VOWEL SIGN VOCALIC L 1952 - 0B64..0B65 ; UNASSIGNED # <reserved>..<reserved> 1953 - 0B66..0B6F ; PVALID # ORIYA DIGIT ZERO..ORIYA DIGIT NINE 1954 - 0B70 ; DISALLOWED # ORIYA ISSHAR 1955 - 0B71 ; PVALID # ORIYA LETTER WA 1956 - 0B72..0B81 ; UNASSIGNED # <reserved>..<reserved> 1957 - 0B82..0B83 ; PVALID # TAMIL SIGN ANUSVARA..TAMIL SIGN VISARGA 1958 - 0B84 ; UNASSIGNED # <reserved> 1959 - 1960 - 1961 - 1962 - Faltstrom Standards Track [Page 35] 1963 - 1964 - RFC 5892 IDNA Code Points August 2010 1965 - 1966 - 1967 - 0B85..0B8A ; PVALID # TAMIL LETTER A..TAMIL LETTER UU 1968 - 0B8B..0B8D ; UNASSIGNED # <reserved>..<reserved> 1969 - 0B8E..0B90 ; PVALID # TAMIL LETTER E..TAMIL LETTER AI 1970 - 0B91 ; UNASSIGNED # <reserved> 1971 - 0B92..0B95 ; PVALID # TAMIL LETTER O..TAMIL LETTER KA 1972 - 0B96..0B98 ; UNASSIGNED # 
<reserved>..<reserved> 1973 - 0B99..0B9A ; PVALID # TAMIL LETTER NGA..TAMIL LETTER CA 1974 - 0B9B ; UNASSIGNED # <reserved> 1975 - 0B9C ; PVALID # TAMIL LETTER JA 1976 - 0B9D ; UNASSIGNED # <reserved> 1977 - 0B9E..0B9F ; PVALID # TAMIL LETTER NYA..TAMIL LETTER TTA 1978 - 0BA0..0BA2 ; UNASSIGNED # <reserved>..<reserved> 1979 - 0BA3..0BA4 ; PVALID # TAMIL LETTER NNA..TAMIL LETTER TA 1980 - 0BA5..0BA7 ; UNASSIGNED # <reserved>..<reserved> 1981 - 0BA8..0BAA ; PVALID # TAMIL LETTER NA..TAMIL LETTER PA 1982 - 0BAB..0BAD ; UNASSIGNED # <reserved>..<reserved> 1983 - 0BAE..0BB9 ; PVALID # TAMIL LETTER MA..TAMIL LETTER HA 1984 - 0BBA..0BBD ; UNASSIGNED # <reserved>..<reserved> 1985 - 0BBE..0BC2 ; PVALID # TAMIL VOWEL SIGN AA..TAMIL VOWEL SIGN UU 1986 - 0BC3..0BC5 ; UNASSIGNED # <reserved>..<reserved> 1987 - 0BC6..0BC8 ; PVALID # TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI 1988 - 0BC9 ; UNASSIGNED # <reserved> 1989 - 0BCA..0BCD ; PVALID # TAMIL VOWEL SIGN O..TAMIL SIGN VIRAMA 1990 - 0BCE..0BCF ; UNASSIGNED # <reserved>..<reserved> 1991 - 0BD0 ; PVALID # TAMIL OM 1992 - 0BD1..0BD6 ; UNASSIGNED # <reserved>..<reserved> 1993 - 0BD7 ; PVALID # TAMIL AU LENGTH MARK 1994 - 0BD8..0BE5 ; UNASSIGNED # <reserved>..<reserved> 1995 - 0BE6..0BEF ; PVALID # TAMIL DIGIT ZERO..TAMIL DIGIT NINE 1996 - 0BF0..0BFA ; DISALLOWED # TAMIL NUMBER TEN..TAMIL NUMBER SIGN 1997 - 0BFB..0C00 ; UNASSIGNED # <reserved>..<reserved> 1998 - 0C01..0C03 ; PVALID # TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA 1999 - 0C04 ; UNASSIGNED # <reserved> 2000 - 0C05..0C0C ; PVALID # TELUGU LETTER A..TELUGU LETTER VOCALIC L 2001 - 0C0D ; UNASSIGNED # <reserved> 2002 - 0C0E..0C10 ; PVALID # TELUGU LETTER E..TELUGU LETTER AI 2003 - 0C11 ; UNASSIGNED # <reserved> 2004 - 0C12..0C28 ; PVALID # TELUGU LETTER O..TELUGU LETTER NA 2005 - 0C29 ; UNASSIGNED # <reserved> 2006 - 0C2A..0C33 ; PVALID # TELUGU LETTER PA..TELUGU LETTER LLA 2007 - 0C34 ; UNASSIGNED # <reserved> 2008 - 0C35..0C39 ; PVALID # TELUGU LETTER VA..TELUGU LETTER HA 
2009 - 0C3A..0C3C ; UNASSIGNED # <reserved>..<reserved> 2010 - 0C3D..0C44 ; PVALID # TELUGU SIGN AVAGRAHA..TELUGU VOWEL SIGN VOCA 2011 - 0C45 ; UNASSIGNED # <reserved> 2012 - 0C46..0C48 ; PVALID # TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI 2013 - 0C49 ; UNASSIGNED # <reserved> 2014 - 0C4A..0C4D ; PVALID # TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA 2015 - 2016 - 2017 - 2018 - Faltstrom Standards Track [Page 36] 2019 - 2020 - RFC 5892 IDNA Code Points August 2010 2021 - 2022 - 2023 - 0C4E..0C54 ; UNASSIGNED # <reserved>..<reserved> 2024 - 0C55..0C56 ; PVALID # TELUGU LENGTH MARK..TELUGU AI LENGTH MARK 2025 - 0C57 ; UNASSIGNED # <reserved> 2026 - 0C58..0C59 ; PVALID # TELUGU LETTER TSA..TELUGU LETTER DZA 2027 - 0C5A..0C5F ; UNASSIGNED # <reserved>..<reserved> 2028 - 0C60..0C63 ; PVALID # TELUGU LETTER VOCALIC RR..TELUGU VOWEL SIGN 2029 - 0C64..0C65 ; UNASSIGNED # <reserved>..<reserved> 2030 - 0C66..0C6F ; PVALID # TELUGU DIGIT ZERO..TELUGU DIGIT NINE 2031 - 0C70..0C77 ; UNASSIGNED # <reserved>..<reserved> 2032 - 0C78..0C7F ; DISALLOWED # TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF 2033 - 0C80..0C81 ; UNASSIGNED # <reserved>..<reserved> 2034 - 0C82..0C83 ; PVALID # KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA 2035 - 0C84 ; UNASSIGNED # <reserved> 2036 - 0C85..0C8C ; PVALID # KANNADA LETTER A..KANNADA LETTER VOCALIC L 2037 - 0C8D ; UNASSIGNED # <reserved> 2038 - 0C8E..0C90 ; PVALID # KANNADA LETTER E..KANNADA LETTER AI 2039 - 0C91 ; UNASSIGNED # <reserved> 2040 - 0C92..0CA8 ; PVALID # KANNADA LETTER O..KANNADA LETTER NA 2041 - 0CA9 ; UNASSIGNED # <reserved> 2042 - 0CAA..0CB3 ; PVALID # KANNADA LETTER PA..KANNADA LETTER LLA 2043 - 0CB4 ; UNASSIGNED # <reserved> 2044 - 0CB5..0CB9 ; PVALID # KANNADA LETTER VA..KANNADA LETTER HA 2045 - 0CBA..0CBB ; UNASSIGNED # <reserved>..<reserved> 2046 - 0CBC..0CC4 ; PVALID # KANNADA SIGN NUKTA..KANNADA VOWEL SIGN VOCAL 2047 - 0CC5 ; UNASSIGNED # <reserved> 2048 - 0CC6..0CC8 ; PVALID # KANNADA VOWEL SIGN E..KANNADA VOWEL SIGN AI 2049 - 0CC9 
; UNASSIGNED # <reserved> 2050 - 0CCA..0CCD ; PVALID # KANNADA VOWEL SIGN O..KANNADA SIGN VIRAMA 2051 - 0CCE..0CD4 ; UNASSIGNED # <reserved>..<reserved> 2052 - 0CD5..0CD6 ; PVALID # KANNADA LENGTH MARK..KANNADA AI LENGTH MARK 2053 - 0CD7..0CDD ; UNASSIGNED # <reserved>..<reserved> 2054 - 0CDE ; PVALID # KANNADA LETTER FA 2055 - 0CDF ; UNASSIGNED # <reserved> 2056 - 0CE0..0CE3 ; PVALID # KANNADA LETTER VOCALIC RR..KANNADA VOWEL SIG 2057 - 0CE4..0CE5 ; UNASSIGNED # <reserved>..<reserved> 2058 - 0CE6..0CEF ; PVALID # KANNADA DIGIT ZERO..KANNADA DIGIT NINE 2059 - 0CF0 ; UNASSIGNED # <reserved> 2060 - 0CF1..0CF2 ; DISALLOWED # KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADH 2061 - 0CF3..0D01 ; UNASSIGNED # <reserved>..<reserved> 2062 - 0D02..0D03 ; PVALID # MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISA 2063 - 0D04 ; UNASSIGNED # <reserved> 2064 - 0D05..0D0C ; PVALID # MALAYALAM LETTER A..MALAYALAM LETTER VOCALIC 2065 - 0D0D ; UNASSIGNED # <reserved> 2066 - 0D0E..0D10 ; PVALID # MALAYALAM LETTER E..MALAYALAM LETTER AI 2067 - 0D11 ; UNASSIGNED # <reserved> 2068 - 0D12..0D28 ; PVALID # MALAYALAM LETTER O..MALAYALAM LETTER NA 2069 - 0D29 ; UNASSIGNED # <reserved> 2070 - 0D2A..0D39 ; PVALID # MALAYALAM LETTER PA..MALAYALAM LETTER HA 2071 - 2072 - 2073 - 2074 - Faltstrom Standards Track [Page 37] 2075 - 2076 - RFC 5892 IDNA Code Points August 2010 2077 - 2078 - 2079 - 0D3A..0D3C ; UNASSIGNED # <reserved>..<reserved> 2080 - 0D3D..0D44 ; PVALID # MALAYALAM SIGN AVAGRAHA..MALAYALAM VOWEL SIG 2081 - 0D45 ; UNASSIGNED # <reserved> 2082 - 0D46..0D48 ; PVALID # MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN 2083 - 0D49 ; UNASSIGNED # <reserved> 2084 - 0D4A..0D4D ; PVALID # MALAYALAM VOWEL SIGN O..MALAYALAM SIGN VIRAM 2085 - 0D4E..0D56 ; UNASSIGNED # <reserved>..<reserved> 2086 - 0D57 ; PVALID # MALAYALAM AU LENGTH MARK 2087 - 0D58..0D5F ; UNASSIGNED # <reserved>..<reserved> 2088 - 0D60..0D63 ; PVALID # MALAYALAM LETTER VOCALIC RR..MALAYALAM VOWEL 2089 - 0D64..0D65 ; UNASSIGNED # 
<reserved>..<reserved> 2090 - 0D66..0D6F ; PVALID # MALAYALAM DIGIT ZERO..MALAYALAM DIGIT NINE 2091 - 0D70..0D75 ; DISALLOWED # MALAYALAM NUMBER TEN..MALAYALAM FRACTION THR 2092 - 0D76..0D78 ; UNASSIGNED # <reserved>..<reserved> 2093 - 0D79 ; DISALLOWED # MALAYALAM DATE MARK 2094 - 0D7A..0D7F ; PVALID # MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER 2095 - 0D80..0D81 ; UNASSIGNED # <reserved>..<reserved> 2096 - 0D82..0D83 ; PVALID # SINHALA SIGN ANUSVARAYA..SINHALA SIGN VISARG 2097 - 0D84 ; UNASSIGNED # <reserved> 2098 - 0D85..0D96 ; PVALID # SINHALA LETTER AYANNA..SINHALA LETTER AUYANN 2099 - 0D97..0D99 ; UNASSIGNED # <reserved>..<reserved> 2100 - 0D9A..0DB1 ; PVALID # SINHALA LETTER ALPAPRAANA KAYANNA..SINHALA L 2101 - 0DB2 ; UNASSIGNED # <reserved> 2102 - 0DB3..0DBB ; PVALID # SINHALA LETTER SANYAKA DAYANNA..SINHALA LETT 2103 - 0DBC ; UNASSIGNED # <reserved> 2104 - 0DBD ; PVALID # SINHALA LETTER DANTAJA LAYANNA 2105 - 0DBE..0DBF ; UNASSIGNED # <reserved>..<reserved> 2106 - 0DC0..0DC6 ; PVALID # SINHALA LETTER VAYANNA..SINHALA LETTER FAYAN 2107 - 0DC7..0DC9 ; UNASSIGNED # <reserved>..<reserved> 2108 - 0DCA ; PVALID # SINHALA SIGN AL-LAKUNA 2109 - 0DCB..0DCE ; UNASSIGNED # <reserved>..<reserved> 2110 - 0DCF..0DD4 ; PVALID # SINHALA VOWEL SIGN AELA-PILLA..SINHALA VOWEL 2111 - 0DD5 ; UNASSIGNED # <reserved> 2112 - 0DD6 ; PVALID # SINHALA VOWEL SIGN DIGA PAA-PILLA 2113 - 0DD7 ; UNASSIGNED # <reserved> 2114 - 0DD8..0DDF ; PVALID # SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOW 2115 - 0DE0..0DF1 ; UNASSIGNED # <reserved>..<reserved> 2116 - 0DF2..0DF3 ; PVALID # SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHAL 2117 - 0DF4 ; DISALLOWED # SINHALA PUNCTUATION KUNDDALIYA 2118 - 0DF5..0E00 ; UNASSIGNED # <reserved>..<reserved> 2119 - 0E01..0E32 ; PVALID # THAI CHARACTER KO KAI..THAI CHARACTER SARA A 2120 - 0E33 ; DISALLOWED # THAI CHARACTER SARA AM 2121 - 0E34..0E3A ; PVALID # THAI CHARACTER SARA I..THAI CHARACTER PHINTH 2122 - 0E3B..0E3E ; UNASSIGNED # <reserved>..<reserved> 2123 - 
0E3F ; DISALLOWED # THAI CURRENCY SYMBOL BAHT 2124 - 0E40..0E4E ; PVALID # THAI CHARACTER SARA E..THAI CHARACTER YAMAKK 2125 - 0E4F ; DISALLOWED # THAI CHARACTER FONGMAN 2126 - 0E50..0E59 ; PVALID # THAI DIGIT ZERO..THAI DIGIT NINE 2127 - 2128 - 2129 - 2130 - Faltstrom Standards Track [Page 38] 2131 - 2132 - RFC 5892 IDNA Code Points August 2010 2133 - 2134 - 2135 - 0E5A..0E5B ; DISALLOWED # THAI CHARACTER ANGKHANKHU..THAI CHARACTER KH 2136 - 0E5C..0E80 ; UNASSIGNED # <reserved>..<reserved> 2137 - 0E81..0E82 ; PVALID # LAO LETTER KO..LAO LETTER KHO SUNG 2138 - 0E83 ; UNASSIGNED # <reserved> 2139 - 0E84 ; PVALID # LAO LETTER KHO TAM 2140 - 0E85..0E86 ; UNASSIGNED # <reserved>..<reserved> 2141 - 0E87..0E88 ; PVALID # LAO LETTER NGO..LAO LETTER CO 2142 - 0E89 ; UNASSIGNED # <reserved> 2143 - 0E8A ; PVALID # LAO LETTER SO TAM 2144 - 0E8B..0E8C ; UNASSIGNED # <reserved>..<reserved> 2145 - 0E8D ; PVALID # LAO LETTER NYO 2146 - 0E8E..0E93 ; UNASSIGNED # <reserved>..<reserved> 2147 - 0E94..0E97 ; PVALID # LAO LETTER DO..LAO LETTER THO TAM 2148 - 0E98 ; UNASSIGNED # <reserved> 2149 - 0E99..0E9F ; PVALID # LAO LETTER NO..LAO LETTER FO SUNG 2150 - 0EA0 ; UNASSIGNED # <reserved> 2151 - 0EA1..0EA3 ; PVALID # LAO LETTER MO..LAO LETTER LO LING 2152 - 0EA4 ; UNASSIGNED # <reserved> 2153 - 0EA5 ; PVALID # LAO LETTER LO LOOT 2154 - 0EA6 ; UNASSIGNED # <reserved> 2155 - 0EA7 ; PVALID # LAO LETTER WO 2156 - 0EA8..0EA9 ; UNASSIGNED # <reserved>..<reserved> 2157 - 0EAA..0EAB ; PVALID # LAO LETTER SO SUNG..LAO LETTER HO SUNG 2158 - 0EAC ; UNASSIGNED # <reserved> 2159 - 0EAD..0EB2 ; PVALID # LAO LETTER O..LAO VOWEL SIGN AA 2160 - 0EB3 ; DISALLOWED # LAO VOWEL SIGN AM 2161 - 0EB4..0EB9 ; PVALID # LAO VOWEL SIGN I..LAO VOWEL SIGN UU 2162 - 0EBA ; UNASSIGNED # <reserved> 2163 - 0EBB..0EBD ; PVALID # LAO VOWEL SIGN MAI KON..LAO SEMIVOWEL SIGN N 2164 - 0EBE..0EBF ; UNASSIGNED # <reserved>..<reserved> 2165 - 0EC0..0EC4 ; PVALID # LAO VOWEL SIGN E..LAO VOWEL SIGN AI 2166 - 0EC5 ; UNASSIGNED # 
<reserved> 2167 - 0EC6 ; PVALID # LAO KO LA 2168 - 0EC7 ; UNASSIGNED # <reserved> 2169 - 0EC8..0ECD ; PVALID # LAO TONE MAI EK..LAO NIGGAHITA 2170 - 0ECE..0ECF ; UNASSIGNED # <reserved>..<reserved> 2171 - 0ED0..0ED9 ; PVALID # LAO DIGIT ZERO..LAO DIGIT NINE 2172 - 0EDA..0EDB ; UNASSIGNED # <reserved>..<reserved> 2173 - 0EDC..0EDD ; DISALLOWED # LAO HO NO..LAO HO MO 2174 - 0EDE..0EFF ; UNASSIGNED # <reserved>..<reserved> 2175 - 0F00 ; PVALID # TIBETAN SYLLABLE OM 2176 - 0F01..0F0A ; DISALLOWED # TIBETAN MARK GTER YIG MGO TRUNCATED A..TIBET 2177 - 0F0B ; PVALID # TIBETAN MARK INTERSYLLABIC TSHEG 2178 - 0F0C..0F17 ; DISALLOWED # TIBETAN MARK DELIMITER TSHEG BSTAR..TIBETAN 2179 - 0F18..0F19 ; PVALID # TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN 2180 - 0F1A..0F1F ; DISALLOWED # TIBETAN SIGN RDEL DKAR GCIG..TIBETAN SIGN RD 2181 - 0F20..0F29 ; PVALID # TIBETAN DIGIT ZERO..TIBETAN DIGIT NINE 2182 - 0F2A..0F34 ; DISALLOWED # TIBETAN DIGIT HALF ONE..TIBETAN MARK BSDUS R 2183 - 2184 - 2185 - 2186 - Faltstrom Standards Track [Page 39] 2187 - 2188 - RFC 5892 IDNA Code Points August 2010 2189 - 2190 - 2191 - 0F35 ; PVALID # TIBETAN MARK NGAS BZUNG NYI ZLA 2192 - 0F36 ; DISALLOWED # TIBETAN MARK CARET -DZUD RTAGS BZHI MIG CAN 2193 - 0F37 ; PVALID # TIBETAN MARK NGAS BZUNG SGOR RTAGS 2194 - 0F38 ; DISALLOWED # TIBETAN MARK CHE MGO 2195 - 0F39 ; PVALID # TIBETAN MARK TSA -PHRU 2196 - 0F3A..0F3D ; DISALLOWED # TIBETAN MARK GUG RTAGS GYON..TIBETAN MARK AN 2197 - 0F3E..0F42 ; PVALID # TIBETAN SIGN YAR TSHES..TIBETAN LETTER GA 2198 - 0F43 ; DISALLOWED # TIBETAN LETTER GHA 2199 - 0F44..0F47 ; PVALID # TIBETAN LETTER NGA..TIBETAN LETTER JA 2200 - 0F48 ; UNASSIGNED # <reserved> 2201 - 0F49..0F4C ; PVALID # TIBETAN LETTER NYA..TIBETAN LETTER DDA 2202 - 0F4D ; DISALLOWED # TIBETAN LETTER DDHA 2203 - 0F4E..0F51 ; PVALID # TIBETAN LETTER NNA..TIBETAN LETTER DA 2204 - 0F52 ; DISALLOWED # TIBETAN LETTER DHA 2205 - 0F53..0F56 ; PVALID # TIBETAN LETTER NA..TIBETAN LETTER BA 2206 - 0F57 ; 
DISALLOWED # TIBETAN LETTER BHA 2207 - 0F58..0F5B ; PVALID # TIBETAN LETTER MA..TIBETAN LETTER DZA 2208 - 0F5C ; DISALLOWED # TIBETAN LETTER DZHA 2209 - 0F5D..0F68 ; PVALID # TIBETAN LETTER WA..TIBETAN LETTER A 2210 - 0F69 ; DISALLOWED # TIBETAN LETTER KSSA 2211 - 0F6A..0F6C ; PVALID # TIBETAN LETTER FIXED-FORM RA..TIBETAN LETTER 2212 - 0F6D..0F70 ; UNASSIGNED # <reserved>..<reserved> 2213 - 0F71..0F72 ; PVALID # TIBETAN VOWEL SIGN AA..TIBETAN VOWEL SIGN I 2214 - 0F73 ; DISALLOWED # TIBETAN VOWEL SIGN II 2215 - 0F74 ; PVALID # TIBETAN VOWEL SIGN U 2216 - 0F75..0F79 ; DISALLOWED # TIBETAN VOWEL SIGN UU..TIBETAN VOWEL SIGN VO 2217 - 0F7A..0F80 ; PVALID # TIBETAN VOWEL SIGN E..TIBETAN VOWEL SIGN REV 2218 - 0F81 ; DISALLOWED # TIBETAN VOWEL SIGN REVERSED II 2219 - 0F82..0F84 ; PVALID # TIBETAN SIGN NYI ZLA NAA DA..TIBETAN MARK HA 2220 - 0F85 ; DISALLOWED # TIBETAN MARK PALUTA 2221 - 0F86..0F8B ; PVALID # TIBETAN SIGN LCI RTAGS..TIBETAN SIGN GRU MED 2222 - 0F8C..0F8F ; UNASSIGNED # <reserved>..<reserved> 2223 - 0F90..0F92 ; PVALID # TIBETAN SUBJOINED LETTER KA..TIBETAN SUBJOIN 2224 - 0F93 ; DISALLOWED # TIBETAN SUBJOINED LETTER GHA 2225 - 0F94..0F97 ; PVALID # TIBETAN SUBJOINED LETTER NGA..TIBETAN SUBJOI 2226 - 0F98 ; UNASSIGNED # <reserved> 2227 - 0F99..0F9C ; PVALID # TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOI 2228 - 0F9D ; DISALLOWED # TIBETAN SUBJOINED LETTER DDHA 2229 - 0F9E..0FA1 ; PVALID # TIBETAN SUBJOINED LETTER NNA..TIBETAN SUBJOI 2230 - 0FA2 ; DISALLOWED # TIBETAN SUBJOINED LETTER DHA 2231 - 0FA3..0FA6 ; PVALID # TIBETAN SUBJOINED LETTER NA..TIBETAN SUBJOIN 2232 - 0FA7 ; DISALLOWED # TIBETAN SUBJOINED LETTER BHA 2233 - 0FA8..0FAB ; PVALID # TIBETAN SUBJOINED LETTER MA..TIBETAN SUBJOIN 2234 - 0FAC ; DISALLOWED # TIBETAN SUBJOINED LETTER DZHA 2235 - 0FAD..0FB8 ; PVALID # TIBETAN SUBJOINED LETTER WA..TIBETAN SUBJOIN 2236 - 0FB9 ; DISALLOWED # TIBETAN SUBJOINED LETTER KSSA 2237 - 0FBA..0FBC ; PVALID # TIBETAN SUBJOINED LETTER FIXED-FORM WA..TIBE 2238 - 0FBD ; 
UNASSIGNED # <reserved> 2239 - 2240 - 2241 - 2242 - Faltstrom Standards Track [Page 40] 2243 - 2244 - RFC 5892 IDNA Code Points August 2010 2245 - 2246 - 2247 - 0FBE..0FC5 ; DISALLOWED # TIBETAN KU RU KHA..TIBETAN SYMBOL RDO RJE 2248 - 0FC6 ; PVALID # TIBETAN SYMBOL PADMA GDAN 2249 - 0FC7..0FCC ; DISALLOWED # TIBETAN SYMBOL RDO RJE RGYA GRAM..TIBETAN SY 2250 - 0FCD ; UNASSIGNED # <reserved> 2251 - 0FCE..0FD8 ; DISALLOWED # TIBETAN SIGN RDEL NAG RDEL DKAR..LEFT-FACING 2252 - 0FD9..0FFF ; UNASSIGNED # <reserved>..<reserved> 2253 - 1000..1049 ; PVALID # MYANMAR LETTER KA..MYANMAR DIGIT NINE 2254 - 104A..104F ; DISALLOWED # MYANMAR SIGN LITTLE SECTION..MYANMAR SYMBOL 2255 - 1050..109D ; PVALID # MYANMAR LETTER SHA..MYANMAR VOWEL SIGN AITON 2256 - 109E..10C5 ; DISALLOWED # MYANMAR SYMBOL SHAN ONE..GEORGIAN CAPITAL LE 2257 - 10C6..10CF ; UNASSIGNED # <reserved>..<reserved> 2258 - 10D0..10FA ; PVALID # GEORGIAN LETTER AN..GEORGIAN LETTER AIN 2259 - 10FB..10FC ; DISALLOWED # GEORGIAN PARAGRAPH SEPARATOR..MODIFIER LETTE 2260 - 10FD..10FF ; UNASSIGNED # <reserved>..<reserved> 2261 - 1100..11FF ; DISALLOWED # HANGUL CHOSEONG KIYEOK..HANGUL JONGSEONG SSA 2262 - 1200..1248 ; PVALID # ETHIOPIC SYLLABLE HA..ETHIOPIC SYLLABLE QWA 2263 - 1249 ; UNASSIGNED # <reserved> 2264 - 124A..124D ; PVALID # ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE 2265 - 124E..124F ; UNASSIGNED # <reserved>..<reserved> 2266 - 1250..1256 ; PVALID # ETHIOPIC SYLLABLE QHA..ETHIOPIC SYLLABLE QHO 2267 - 1257 ; UNASSIGNED # <reserved> 2268 - 1258 ; PVALID # ETHIOPIC SYLLABLE QHWA 2269 - 1259 ; UNASSIGNED # <reserved> 2270 - 125A..125D ; PVALID # ETHIOPIC SYLLABLE QHWI..ETHIOPIC SYLLABLE QH 2271 - 125E..125F ; UNASSIGNED # <reserved>..<reserved> 2272 - 1260..1288 ; PVALID # ETHIOPIC SYLLABLE BA..ETHIOPIC SYLLABLE XWA 2273 - 1289 ; UNASSIGNED # <reserved> 2274 - 128A..128D ; PVALID # ETHIOPIC SYLLABLE XWI..ETHIOPIC SYLLABLE XWE 2275 - 128E..128F ; UNASSIGNED # <reserved>..<reserved> 2276 - 1290..12B0 ; PVALID # 
ETHIOPIC SYLLABLE NA..ETHIOPIC SYLLABLE KWA 2277 - 12B1 ; UNASSIGNED # <reserved> 2278 - 12B2..12B5 ; PVALID # ETHIOPIC SYLLABLE KWI..ETHIOPIC SYLLABLE KWE 2279 - 12B6..12B7 ; UNASSIGNED # <reserved>..<reserved> 2280 - 12B8..12BE ; PVALID # ETHIOPIC SYLLABLE KXA..ETHIOPIC SYLLABLE KXO 2281 - 12BF ; UNASSIGNED # <reserved> 2282 - 12C0 ; PVALID # ETHIOPIC SYLLABLE KXWA 2283 - 12C1 ; UNASSIGNED # <reserved> 2284 - 12C2..12C5 ; PVALID # ETHIOPIC SYLLABLE KXWI..ETHIOPIC SYLLABLE KX 2285 - 12C6..12C7 ; UNASSIGNED # <reserved>..<reserved> 2286 - 12C8..12D6 ; PVALID # ETHIOPIC SYLLABLE WA..ETHIOPIC SYLLABLE PHAR 2287 - 12D7 ; UNASSIGNED # <reserved> 2288 - 12D8..1310 ; PVALID # ETHIOPIC SYLLABLE ZA..ETHIOPIC SYLLABLE GWA 2289 - 1311 ; UNASSIGNED # <reserved> 2290 - 1312..1315 ; PVALID # ETHIOPIC SYLLABLE GWI..ETHIOPIC SYLLABLE GWE 2291 - 1316..1317 ; UNASSIGNED # <reserved>..<reserved> 2292 - 1318..135A ; PVALID # ETHIOPIC SYLLABLE GGA..ETHIOPIC SYLLABLE FYA 2293 - 135B..135E ; UNASSIGNED # <reserved>..<reserved> 2294 - 135F ; PVALID # ETHIOPIC COMBINING GEMINATION MARK 2295 - 2296 - 2297 - 2298 - Faltstrom Standards Track [Page 41] 2299 - 2300 - RFC 5892 IDNA Code Points August 2010 2301 - 2302 - 2303 - 1360..137C ; DISALLOWED # ETHIOPIC SECTION MARK..ETHIOPIC NUMBER TEN T 2304 - 137D..137F ; UNASSIGNED # <reserved>..<reserved> 2305 - 1380..138F ; PVALID # ETHIOPIC SYLLABLE SEBATBEIT MWA..ETHIOPIC SY 2306 - 1390..1399 ; DISALLOWED # ETHIOPIC TONAL MARK YIZET..ETHIOPIC TONAL MA 2307 - 139A..139F ; UNASSIGNED # <reserved>..<reserved> 2308 - 13A0..13F4 ; PVALID # CHEROKEE LETTER A..CHEROKEE LETTER YV 2309 - 13F5..13FF ; UNASSIGNED # <reserved>..<reserved> 2310 - 1400 ; DISALLOWED # CANADIAN SYLLABICS HYPHEN 2311 - 1401..166C ; PVALID # CANADIAN SYLLABICS E..CANADIAN SYLLABICS CAR 2312 - 166D..166E ; DISALLOWED # CANADIAN SYLLABICS CHI SIGN..CANADIAN SYLLAB 2313 - 166F..167F ; PVALID # CANADIAN SYLLABICS QAI..CANADIAN SYLLABICS B 2314 - 1680 ; DISALLOWED # OGHAM SPACE MARK 
2315 - 1681..169A ; PVALID # OGHAM LETTER BEITH..OGHAM LETTER PEITH 2316 - 169B..169C ; DISALLOWED # OGHAM FEATHER MARK..OGHAM REVERSED FEATHER M 2317 - 169D..169F ; UNASSIGNED # <reserved>..<reserved> 2318 - 16A0..16EA ; PVALID # RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X 2319 - 16EB..16F0 ; DISALLOWED # RUNIC SINGLE PUNCTUATION..RUNIC BELGTHOR SYM 2320 - 16F1..16FF ; UNASSIGNED # <reserved>..<reserved> 2321 - 1700..170C ; PVALID # TAGALOG LETTER A..TAGALOG LETTER YA 2322 - 170D ; UNASSIGNED # <reserved> 2323 - 170E..1714 ; PVALID # TAGALOG LETTER LA..TAGALOG SIGN VIRAMA 2324 - 1715..171F ; UNASSIGNED # <reserved>..<reserved> 2325 - 1720..1734 ; PVALID # HANUNOO LETTER A..HANUNOO SIGN PAMUDPOD 2326 - 1735..1736 ; DISALLOWED # PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DO 2327 - 1737..173F ; UNASSIGNED # <reserved>..<reserved> 2328 - 1740..1753 ; PVALID # BUHID LETTER A..BUHID VOWEL SIGN U 2329 - 1754..175F ; UNASSIGNED # <reserved>..<reserved> 2330 - 1760..176C ; PVALID # TAGBANWA LETTER A..TAGBANWA LETTER YA 2331 - 176D ; UNASSIGNED # <reserved> 2332 - 176E..1770 ; PVALID # TAGBANWA LETTER LA..TAGBANWA LETTER SA 2333 - 1771 ; UNASSIGNED # <reserved> 2334 - 1772..1773 ; PVALID # TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U 2335 - 1774..177F ; UNASSIGNED # <reserved>..<reserved> 2336 - 1780..17B3 ; PVALID # KHMER LETTER KA..KHMER INDEPENDENT VOWEL QAU 2337 - 17B4..17B5 ; DISALLOWED # KHMER VOWEL INHERENT AQ..KHMER VOWEL INHEREN 2338 - 17B6..17D3 ; PVALID # KHMER VOWEL SIGN AA..KHMER SIGN BATHAMASAT 2339 - 17D4..17D6 ; DISALLOWED # KHMER SIGN KHAN..KHMER SIGN CAMNUC PII KUUH 2340 - 17D7 ; PVALID # KHMER SIGN LEK TOO 2341 - 17D8..17DB ; DISALLOWED # KHMER SIGN BEYYAL..KHMER CURRENCY SYMBOL RIE 2342 - 17DC..17DD ; PVALID # KHMER SIGN AVAKRAHASANYA..KHMER SIGN ATTHACA 2343 - 17DE..17DF ; UNASSIGNED # <reserved>..<reserved> 2344 - 17E0..17E9 ; PVALID # KHMER DIGIT ZERO..KHMER DIGIT NINE 2345 - 17EA..17EF ; UNASSIGNED # <reserved>..<reserved> 2346 - 17F0..17F9 ; 
DISALLOWED # KHMER SYMBOL LEK ATTAK SON..KHMER SYMBOL LEK 2347 - 17FA..17FF ; UNASSIGNED # <reserved>..<reserved> 2348 - 1800..180E ; DISALLOWED # MONGOLIAN BIRGA..MONGOLIAN VOWEL SEPARATOR 2349 - 180F ; UNASSIGNED # <reserved> 2350 - 1810..1819 ; PVALID # MONGOLIAN DIGIT ZERO..MONGOLIAN DIGIT NINE 2351 - 2352 - 2353 - 2354 - Faltstrom Standards Track [Page 42] 2355 - 2356 - RFC 5892 IDNA Code Points August 2010 2357 - 2358 - 2359 - 181A..181F ; UNASSIGNED # <reserved>..<reserved> 2360 - 1820..1877 ; PVALID # MONGOLIAN LETTER A..MONGOLIAN LETTER MANCHU 2361 - 1878..187F ; UNASSIGNED # <reserved>..<reserved> 2362 - 1880..18AA ; PVALID # MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONG 2363 - 18AB..18AF ; UNASSIGNED # <reserved>..<reserved> 2364 - 18B0..18F5 ; PVALID # CANADIAN SYLLABICS OY..CANADIAN SYLLABICS CA 2365 - 18F6..18FF ; UNASSIGNED # <reserved>..<reserved> 2366 - 1900..191C ; PVALID # LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER HA 2367 - 191D..191F ; UNASSIGNED # <reserved>..<reserved> 2368 - 1920..192B ; PVALID # LIMBU VOWEL SIGN A..LIMBU SUBJOINED LETTER W 2369 - 192C..192F ; UNASSIGNED # <reserved>..<reserved> 2370 - 1930..193B ; PVALID # LIMBU SMALL LETTER KA..LIMBU SIGN SA-I 2371 - 193C..193F ; UNASSIGNED # <reserved>..<reserved> 2372 - 1940 ; DISALLOWED # LIMBU SIGN LOO 2373 - 1941..1943 ; UNASSIGNED # <reserved>..<reserved> 2374 - 1944..1945 ; DISALLOWED # LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK 2375 - 1946..196D ; PVALID # LIMBU DIGIT ZERO..TAI LE LETTER AI 2376 - 196E..196F ; UNASSIGNED # <reserved>..<reserved> 2377 - 1970..1974 ; PVALID # TAI LE LETTER TONE-2..TAI LE LETTER TONE-6 2378 - 1975..197F ; UNASSIGNED # <reserved>..<reserved> 2379 - 1980..19AB ; PVALID # NEW TAI LUE LETTER HIGH QA..NEW TAI LUE LETT 2380 - 19AC..19AF ; UNASSIGNED # <reserved>..<reserved> 2381 - 19B0..19C9 ; PVALID # NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW 2382 - 19CA..19CF ; UNASSIGNED # <reserved>..<reserved> 2383 - 19D0..19DA ; PVALID # NEW TAI LUE DIGIT ZERO..NEW 
TAI LUE THAM DIG 2384 - 19DB..19DD ; UNASSIGNED # <reserved>..<reserved> 2385 - 19DE..19FF ; DISALLOWED # NEW TAI LUE SIGN LAE..KHMER SYMBOL DAP-PRAM 2386 - 1A00..1A1B ; PVALID # BUGINESE LETTER KA..BUGINESE VOWEL SIGN AE 2387 - 1A1C..1A1D ; UNASSIGNED # <reserved>..<reserved> 2388 - 1A1E..1A1F ; DISALLOWED # BUGINESE PALLAWA..BUGINESE END OF SECTION 2389 - 1A20..1A5E ; PVALID # TAI THAM LETTER HIGH KA..TAI THAM CONSONANT 2390 - 1A5F ; UNASSIGNED # <reserved> 2391 - 1A60..1A7C ; PVALID # TAI THAM SIGN SAKOT..TAI THAM SIGN KHUEN-LUE 2392 - 1A7D..1A7E ; UNASSIGNED # <reserved>..<reserved> 2393 - 1A7F..1A89 ; PVALID # TAI THAM COMBINING CRYPTOGRAMMIC DOT..TAI TH 2394 - 1A8A..1A8F ; UNASSIGNED # <reserved>..<reserved> 2395 - 1A90..1A99 ; PVALID # TAI THAM THAM DIGIT ZERO..TAI THAM THAM DIGI 2396 - 1A9A..1A9F ; UNASSIGNED # <reserved>..<reserved> 2397 - 1AA0..1AA6 ; DISALLOWED # TAI THAM SIGN WIANG..TAI THAM SIGN REVERSED 2398 - 1AA7 ; PVALID # TAI THAM SIGN MAI YAMOK 2399 - 1AA8..1AAD ; DISALLOWED # TAI THAM SIGN KAAN..TAI THAM SIGN CAANG 2400 - 1AAE..1AFF ; UNASSIGNED # <reserved>..<reserved> 2401 - 1B00..1B4B ; PVALID # BALINESE SIGN ULU RICEM..BALINESE LETTER ASY 2402 - 1B4C..1B4F ; UNASSIGNED # <reserved>..<reserved> 2403 - 1B50..1B59 ; PVALID # BALINESE DIGIT ZERO..BALINESE DIGIT NINE 2404 - 1B5A..1B6A ; DISALLOWED # BALINESE PANTI..BALINESE MUSICAL SYMBOL DANG 2405 - 1B6B..1B73 ; PVALID # BALINESE MUSICAL SYMBOL COMBINING TEGEH..BAL 2406 - 1B74..1B7C ; DISALLOWED # BALINESE MUSICAL SYMBOL RIGHT-HAND OPEN DUG. 
2407 - 2408 - 2409 - 2410 - Faltstrom Standards Track [Page 43] 2411 - 2412 - RFC 5892 IDNA Code Points August 2010 2413 - 2414 - 2415 - 1B7D..1B7F ; UNASSIGNED # <reserved>..<reserved> 2416 - 1B80..1BAA ; PVALID # SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PAMA 2417 - 1BAB..1BAD ; UNASSIGNED # <reserved>..<reserved> 2418 - 1BAE..1BB9 ; PVALID # SUNDANESE LETTER KHA..SUNDANESE DIGIT NINE 2419 - 1BBA..1BFF ; UNASSIGNED # <reserved>..<reserved> 2420 - 1C00..1C37 ; PVALID # LEPCHA LETTER KA..LEPCHA SIGN NUKTA 2421 - 1C38..1C3A ; UNASSIGNED # <reserved>..<reserved> 2422 - 1C3B..1C3F ; DISALLOWED # LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATIO 2423 - 1C40..1C49 ; PVALID # LEPCHA DIGIT ZERO..LEPCHA DIGIT NINE 2424 - 1C4A..1C4C ; UNASSIGNED # <reserved>..<reserved> 2425 - 1C4D..1C7D ; PVALID # LEPCHA LETTER TTA..OL CHIKI AHAD 2426 - 1C7E..1C7F ; DISALLOWED # OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTU 2427 - 1C80..1CCF ; UNASSIGNED # <reserved>..<reserved> 2428 - 1CD0..1CD2 ; PVALID # VEDIC TONE KARSHANA..VEDIC TONE PRENKHA 2429 - 1CD3 ; DISALLOWED # VEDIC SIGN NIHSHVASA 2430 - 1CD4..1CF2 ; PVALID # VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC 2431 - 1CF3..1CFF ; UNASSIGNED # <reserved>..<reserved> 2432 - 1D00..1D2B ; PVALID # LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTE 2433 - 1D2C..1D2E ; DISALLOWED # MODIFIER LETTER CAPITAL A..MODIFIER LETTER C 2434 - 1D2F ; PVALID # MODIFIER LETTER CAPITAL BARRED B 2435 - 1D30..1D3A ; DISALLOWED # MODIFIER LETTER CAPITAL D..MODIFIER LETTER C 2436 - 1D3B ; PVALID # MODIFIER LETTER CAPITAL REVERSED N 2437 - 1D3C..1D4D ; DISALLOWED # MODIFIER LETTER CAPITAL O..MODIFIER LETTER S 2438 - 1D4E ; PVALID # MODIFIER LETTER SMALL TURNED I 2439 - 1D4F..1D6A ; DISALLOWED # MODIFIER LETTER SMALL K..GREEK SUBSCRIPT SMA 2440 - 1D6B..1D77 ; PVALID # LATIN SMALL LETTER UE..LATIN SMALL LETTER TU 2441 - 1D78 ; DISALLOWED # MODIFIER LETTER CYRILLIC EN 2442 - 1D79..1D9A ; PVALID # LATIN SMALL LETTER INSULAR G..LATIN SMALL LE 2443 - 1D9B..1DBF ; DISALLOWED # 
MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER 2444 - 1DC0..1DE6 ; PVALID # COMBINING DOTTED GRAVE ACCENT..COMBINING LAT 2445 - 1DE7..1DFC ; UNASSIGNED # <reserved>..<reserved> 2446 - 1DFD..1DFF ; PVALID # COMBINING ALMOST EQUAL TO BELOW..COMBINING R 2447 - 1E00 ; DISALLOWED # LATIN CAPITAL LETTER A WITH RING BELOW 2448 - 1E01 ; PVALID # LATIN SMALL LETTER A WITH RING BELOW 2449 - 1E02 ; DISALLOWED # LATIN CAPITAL LETTER B WITH DOT ABOVE 2450 - 1E03 ; PVALID # LATIN SMALL LETTER B WITH DOT ABOVE 2451 - 1E04 ; DISALLOWED # LATIN CAPITAL LETTER B WITH DOT BELOW 2452 - 1E05 ; PVALID # LATIN SMALL LETTER B WITH DOT BELOW 2453 - 1E06 ; DISALLOWED # LATIN CAPITAL LETTER B WITH LINE BELOW 2454 - 1E07 ; PVALID # LATIN SMALL LETTER B WITH LINE BELOW 2455 - 1E08 ; DISALLOWED # LATIN CAPITAL LETTER C WITH CEDILLA AND ACUT 2456 - 1E09 ; PVALID # LATIN SMALL LETTER C WITH CEDILLA AND ACUTE 2457 - 1E0A ; DISALLOWED # LATIN CAPITAL LETTER D WITH DOT ABOVE 2458 - 1E0B ; PVALID # LATIN SMALL LETTER D WITH DOT ABOVE 2459 - 1E0C ; DISALLOWED # LATIN CAPITAL LETTER D WITH DOT BELOW 2460 - 1E0D ; PVALID # LATIN SMALL LETTER D WITH DOT BELOW 2461 - 1E0E ; DISALLOWED # LATIN CAPITAL LETTER D WITH LINE BELOW 2462 - 1E0F ; PVALID # LATIN SMALL LETTER D WITH LINE BELOW 2463 - 2464 - 2465 - 2466 - Faltstrom Standards Track [Page 44] 2467 - 2468 - RFC 5892 IDNA Code Points August 2010 2469 - 2470 - 2471 - 1E10 ; DISALLOWED # LATIN CAPITAL LETTER D WITH CEDILLA 2472 - 1E11 ; PVALID # LATIN SMALL LETTER D WITH CEDILLA 2473 - 1E12 ; DISALLOWED # LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW 2474 - 1E13 ; PVALID # LATIN SMALL LETTER D WITH CIRCUMFLEX BELOW 2475 - 1E14 ; DISALLOWED # LATIN CAPITAL LETTER E WITH MACRON AND GRAVE 2476 - 1E15 ; PVALID # LATIN SMALL LETTER E WITH MACRON AND GRAVE 2477 - 1E16 ; DISALLOWED # LATIN CAPITAL LETTER E WITH MACRON AND ACUTE 2478 - 1E17 ; PVALID # LATIN SMALL LETTER E WITH MACRON AND ACUTE 2479 - 1E18 ; DISALLOWED # LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW 
2480 - 1E19 ; PVALID # LATIN SMALL LETTER E WITH CIRCUMFLEX BELOW 2481 - 1E1A ; DISALLOWED # LATIN CAPITAL LETTER E WITH TILDE BELOW 2482 - 1E1B ; PVALID # LATIN SMALL LETTER E WITH TILDE BELOW 2483 - 1E1C ; DISALLOWED # LATIN CAPITAL LETTER E WITH CEDILLA AND BREV 2484 - 1E1D ; PVALID # LATIN SMALL LETTER E WITH CEDILLA AND BREVE 2485 - 1E1E ; DISALLOWED # LATIN CAPITAL LETTER F WITH DOT ABOVE 2486 - 1E1F ; PVALID # LATIN SMALL LETTER F WITH DOT ABOVE 2487 - 1E20 ; DISALLOWED # LATIN CAPITAL LETTER G WITH MACRON 2488 - 1E21 ; PVALID # LATIN SMALL LETTER G WITH MACRON 2489 - 1E22 ; DISALLOWED # LATIN CAPITAL LETTER H WITH DOT ABOVE 2490 - 1E23 ; PVALID # LATIN SMALL LETTER H WITH DOT ABOVE 2491 - 1E24 ; DISALLOWED # LATIN CAPITAL LETTER H WITH DOT BELOW 2492 - 1E25 ; PVALID # LATIN SMALL LETTER H WITH DOT BELOW 2493 - 1E26 ; DISALLOWED # LATIN CAPITAL LETTER H WITH DIAERESIS 2494 - 1E27 ; PVALID # LATIN SMALL LETTER H WITH DIAERESIS 2495 - 1E28 ; DISALLOWED # LATIN CAPITAL LETTER H WITH CEDILLA 2496 - 1E29 ; PVALID # LATIN SMALL LETTER H WITH CEDILLA 2497 - 1E2A ; DISALLOWED # LATIN CAPITAL LETTER H WITH BREVE BELOW 2498 - 1E2B ; PVALID # LATIN SMALL LETTER H WITH BREVE BELOW 2499 - 1E2C ; DISALLOWED # LATIN CAPITAL LETTER I WITH TILDE BELOW 2500 - 1E2D ; PVALID # LATIN SMALL LETTER I WITH TILDE BELOW 2501 - 1E2E ; DISALLOWED # LATIN CAPITAL LETTER I WITH DIAERESIS AND AC 2502 - 1E2F ; PVALID # LATIN SMALL LETTER I WITH DIAERESIS AND ACUT 2503 - 1E30 ; DISALLOWED # LATIN CAPITAL LETTER K WITH ACUTE 2504 - 1E31 ; PVALID # LATIN SMALL LETTER K WITH ACUTE 2505 - 1E32 ; DISALLOWED # LATIN CAPITAL LETTER K WITH DOT BELOW 2506 - 1E33 ; PVALID # LATIN SMALL LETTER K WITH DOT BELOW 2507 - 1E34 ; DISALLOWED # LATIN CAPITAL LETTER K WITH LINE BELOW 2508 - 1E35 ; PVALID # LATIN SMALL LETTER K WITH LINE BELOW 2509 - 1E36 ; DISALLOWED # LATIN CAPITAL LETTER L WITH DOT BELOW 2510 - 1E37 ; PVALID # LATIN SMALL LETTER L WITH DOT BELOW 2511 - 1E38 ; DISALLOWED # LATIN CAPITAL 
LETTER L WITH DOT BELOW AND MA 2512 - 1E39 ; PVALID # LATIN SMALL LETTER L WITH DOT BELOW AND MACR 2513 - 1E3A ; DISALLOWED # LATIN CAPITAL LETTER L WITH LINE BELOW 2514 - 1E3B ; PVALID # LATIN SMALL LETTER L WITH LINE BELOW 2515 - 1E3C ; DISALLOWED # LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW 2516 - 1E3D ; PVALID # LATIN SMALL LETTER L WITH CIRCUMFLEX BELOW 2517 - 1E3E ; DISALLOWED # LATIN CAPITAL LETTER M WITH ACUTE 2518 - 1E3F ; PVALID # LATIN SMALL LETTER M WITH ACUTE 2519 - 2520 - 2521 - 2522 - Faltstrom Standards Track [Page 45] 2523 - 2524 - RFC 5892 IDNA Code Points August 2010 2525 - 2526 - 2527 - 1E40 ; DISALLOWED # LATIN CAPITAL LETTER M WITH DOT ABOVE 2528 - 1E41 ; PVALID # LATIN SMALL LETTER M WITH DOT ABOVE 2529 - 1E42 ; DISALLOWED # LATIN CAPITAL LETTER M WITH DOT BELOW 2530 - 1E43 ; PVALID # LATIN SMALL LETTER M WITH DOT BELOW 2531 - 1E44 ; DISALLOWED # LATIN CAPITAL LETTER N WITH DOT ABOVE 2532 - 1E45 ; PVALID # LATIN SMALL LETTER N WITH DOT ABOVE 2533 - 1E46 ; DISALLOWED # LATIN CAPITAL LETTER N WITH DOT BELOW 2534 - 1E47 ; PVALID # LATIN SMALL LETTER N WITH DOT BELOW 2535 - 1E48 ; DISALLOWED # LATIN CAPITAL LETTER N WITH LINE BELOW 2536 - 1E49 ; PVALID # LATIN SMALL LETTER N WITH LINE BELOW 2537 - 1E4A ; DISALLOWED # LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW 2538 - 1E4B ; PVALID # LATIN SMALL LETTER N WITH CIRCUMFLEX BELOW 2539 - 1E4C ; DISALLOWED # LATIN CAPITAL LETTER O WITH TILDE AND ACUTE 2540 - 1E4D ; PVALID # LATIN SMALL LETTER O WITH TILDE AND ACUTE 2541 - 1E4E ; DISALLOWED # LATIN CAPITAL LETTER O WITH TILDE AND DIAERE 2542 - 1E4F ; PVALID # LATIN SMALL LETTER O WITH TILDE AND DIAERESI 2543 - 1E50 ; DISALLOWED # LATIN CAPITAL LETTER O WITH MACRON AND GRAVE 2544 - 1E51 ; PVALID # LATIN SMALL LETTER O WITH MACRON AND GRAVE 2545 - 1E52 ; DISALLOWED # LATIN CAPITAL LETTER O WITH MACRON AND ACUTE 2546 - 1E53 ; PVALID # LATIN SMALL LETTER O WITH MACRON AND ACUTE 2547 - 1E54 ; DISALLOWED # LATIN CAPITAL LETTER P WITH ACUTE 2548 - 1E55 ; 
PVALID # LATIN SMALL LETTER P WITH ACUTE 2549 - 1E56 ; DISALLOWED # LATIN CAPITAL LETTER P WITH DOT ABOVE 2550 - 1E57 ; PVALID # LATIN SMALL LETTER P WITH DOT ABOVE 2551 - 1E58 ; DISALLOWED # LATIN CAPITAL LETTER R WITH DOT ABOVE 2552 - 1E59 ; PVALID # LATIN SMALL LETTER R WITH DOT ABOVE 2553 - 1E5A ; DISALLOWED # LATIN CAPITAL LETTER R WITH DOT BELOW 2554 - 1E5B ; PVALID # LATIN SMALL LETTER R WITH DOT BELOW 2555 - 1E5C ; DISALLOWED # LATIN CAPITAL LETTER R WITH DOT BELOW AND MA 2556 - 1E5D ; PVALID # LATIN SMALL LETTER R WITH DOT BELOW AND MACR 2557 - 1E5E ; DISALLOWED # LATIN CAPITAL LETTER R WITH LINE BELOW 2558 - 1E5F ; PVALID # LATIN SMALL LETTER R WITH LINE BELOW 2559 - 1E60 ; DISALLOWED # LATIN CAPITAL LETTER S WITH DOT ABOVE 2560 - 1E61 ; PVALID # LATIN SMALL LETTER S WITH DOT ABOVE 2561 - 1E62 ; DISALLOWED # LATIN CAPITAL LETTER S WITH DOT BELOW 2562 - 1E63 ; PVALID # LATIN SMALL LETTER S WITH DOT BELOW 2563 - 1E64 ; DISALLOWED # LATIN CAPITAL LETTER S WITH ACUTE AND DOT AB 2564 - 1E65 ; PVALID # LATIN SMALL LETTER S WITH ACUTE AND DOT ABOV 2565 - 1E66 ; DISALLOWED # LATIN CAPITAL LETTER S WITH CARON AND DOT AB 2566 - 1E67 ; PVALID # LATIN SMALL LETTER S WITH CARON AND DOT ABOV 2567 - 1E68 ; DISALLOWED # LATIN CAPITAL LETTER S WITH DOT BELOW AND DO 2568 - 1E69 ; PVALID # LATIN SMALL LETTER S WITH DOT BELOW AND DOT 2569 - 1E6A ; DISALLOWED # LATIN CAPITAL LETTER T WITH DOT ABOVE 2570 - 1E6B ; PVALID # LATIN SMALL LETTER T WITH DOT ABOVE 2571 - 1E6C ; DISALLOWED # LATIN CAPITAL LETTER T WITH DOT BELOW 2572 - 1E6D ; PVALID # LATIN SMALL LETTER T WITH DOT BELOW 2573 - 1E6E ; DISALLOWED # LATIN CAPITAL LETTER T WITH LINE BELOW 2574 - 1E6F ; PVALID # LATIN SMALL LETTER T WITH LINE BELOW 2575 - 2576 - 2577 - 2578 - Faltstrom Standards Track [Page 46] 2579 - 2580 - RFC 5892 IDNA Code Points August 2010 2581 - 2582 - 2583 - 1E70 ; DISALLOWED # LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW 2584 - 1E71 ; PVALID # LATIN SMALL LETTER T WITH CIRCUMFLEX BELOW 2585 - 1E72 
; DISALLOWED # LATIN CAPITAL LETTER U WITH DIAERESIS BELOW 2586 - 1E73 ; PVALID # LATIN SMALL LETTER U WITH DIAERESIS BELOW 2587 - 1E74 ; DISALLOWED # LATIN CAPITAL LETTER U WITH TILDE BELOW 2588 - 1E75 ; PVALID # LATIN SMALL LETTER U WITH TILDE BELOW 2589 - 1E76 ; DISALLOWED # LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW 2590 - 1E77 ; PVALID # LATIN SMALL LETTER U WITH CIRCUMFLEX BELOW 2591 - 1E78 ; DISALLOWED # LATIN CAPITAL LETTER U WITH TILDE AND ACUTE 2592 - 1E79 ; PVALID # LATIN SMALL LETTER U WITH TILDE AND ACUTE 2593 - 1E7A ; DISALLOWED # LATIN CAPITAL LETTER U WITH MACRON AND DIAER 2594 - 1E7B ; PVALID # LATIN SMALL LETTER U WITH MACRON AND DIAERES 2595 - 1E7C ; DISALLOWED # LATIN CAPITAL LETTER V WITH TILDE 2596 - 1E7D ; PVALID # LATIN SMALL LETTER V WITH TILDE 2597 - 1E7E ; DISALLOWED # LATIN CAPITAL LETTER V WITH DOT BELOW 2598 - 1E7F ; PVALID # LATIN SMALL LETTER V WITH DOT BELOW 2599 - 1E80 ; DISALLOWED # LATIN CAPITAL LETTER W WITH GRAVE 2600 - 1E81 ; PVALID # LATIN SMALL LETTER W WITH GRAVE 2601 - 1E82 ; DISALLOWED # LATIN CAPITAL LETTER W WITH ACUTE 2602 - 1E83 ; PVALID # LATIN SMALL LETTER W WITH ACUTE 2603 - 1E84 ; DISALLOWED # LATIN CAPITAL LETTER W WITH DIAERESIS 2604 - 1E85 ; PVALID # LATIN SMALL LETTER W WITH DIAERESIS 2605 - 1E86 ; DISALLOWED # LATIN CAPITAL LETTER W WITH DOT ABOVE 2606 - 1E87 ; PVALID # LATIN SMALL LETTER W WITH DOT ABOVE 2607 - 1E88 ; DISALLOWED # LATIN CAPITAL LETTER W WITH DOT BELOW 2608 - 1E89 ; PVALID # LATIN SMALL LETTER W WITH DOT BELOW 2609 - 1E8A ; DISALLOWED # LATIN CAPITAL LETTER X WITH DOT ABOVE 2610 - 1E8B ; PVALID # LATIN SMALL LETTER X WITH DOT ABOVE 2611 - 1E8C ; DISALLOWED # LATIN CAPITAL LETTER X WITH DIAERESIS 2612 - 1E8D ; PVALID # LATIN SMALL LETTER X WITH DIAERESIS 2613 - 1E8E ; DISALLOWED # LATIN CAPITAL LETTER Y WITH DOT ABOVE 2614 - 1E8F ; PVALID # LATIN SMALL LETTER Y WITH DOT ABOVE 2615 - 1E90 ; DISALLOWED # LATIN CAPITAL LETTER Z WITH CIRCUMFLEX 2616 - 1E91 ; PVALID # LATIN SMALL LETTER Z WITH 
CIRCUMFLEX 2617 - 1E92 ; DISALLOWED # LATIN CAPITAL LETTER Z WITH DOT BELOW 2618 - 1E93 ; PVALID # LATIN SMALL LETTER Z WITH DOT BELOW 2619 - 1E94 ; DISALLOWED # LATIN CAPITAL LETTER Z WITH LINE BELOW 2620 - 1E95..1E99 ; PVALID # LATIN SMALL LETTER Z WITH LINE BELOW..LATIN 2621 - 1E9A..1E9B ; DISALLOWED # LATIN SMALL LETTER A WITH RIGHT HALF RING..L 2622 - 1E9C..1E9D ; PVALID # LATIN SMALL LETTER LONG S WITH DIAGONAL STRO 2623 - 1E9E ; DISALLOWED # LATIN CAPITAL LETTER SHARP S 2624 - 1E9F ; PVALID # LATIN SMALL LETTER DELTA 2625 - 1EA0 ; DISALLOWED # LATIN CAPITAL LETTER A WITH DOT BELOW 2626 - 1EA1 ; PVALID # LATIN SMALL LETTER A WITH DOT BELOW 2627 - 1EA2 ; DISALLOWED # LATIN CAPITAL LETTER A WITH HOOK ABOVE 2628 - 1EA3 ; PVALID # LATIN SMALL LETTER A WITH HOOK ABOVE 2629 - 1EA4 ; DISALLOWED # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND A 2630 - 1EA5 ; PVALID # LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACU 2631 - 2632 - 2633 - 2634 - Faltstrom Standards Track [Page 47] 2635 - 2636 - RFC 5892 IDNA Code Points August 2010 2637 - 2638 - 2639 - 1EA6 ; DISALLOWED # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND G 2640 - 1EA7 ; PVALID # LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRA 2641 - 1EA8 ; DISALLOWED # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND H 2642 - 1EA9 ; PVALID # LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOO 2643 - 1EAA ; DISALLOWED # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND T 2644 - 1EAB ; PVALID # LATIN SMALL LETTER A WITH CIRCUMFLEX AND TIL 2645 - 1EAC ; DISALLOWED # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND D 2646 - 1EAD ; PVALID # LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT 2647 - 1EAE ; DISALLOWED # LATIN CAPITAL LETTER A WITH BREVE AND ACUTE 2648 - 1EAF ; PVALID # LATIN SMALL LETTER A WITH BREVE AND ACUTE 2649 - 1EB0 ; DISALLOWED # LATIN CAPITAL LETTER A WITH BREVE AND GRAVE 2650 - 1EB1 ; PVALID # LATIN SMALL LETTER A WITH BREVE AND GRAVE 2651 - 1EB2 ; DISALLOWED # LATIN CAPITAL LETTER A WITH BREVE AND HOOK A 2652 - 1EB3 ; PVALID # LATIN SMALL LETTER A 
WITH BREVE AND HOOK ABO 2653 - 1EB4 ; DISALLOWED # LATIN CAPITAL LETTER A WITH BREVE AND TILDE 2654 - 1EB5 ; PVALID # LATIN SMALL LETTER A WITH BREVE AND TILDE 2655 - 1EB6 ; DISALLOWED # LATIN CAPITAL LETTER A WITH BREVE AND DOT BE 2656 - 1EB7 ; PVALID # LATIN SMALL LETTER A WITH BREVE AND DOT BELO 2657 - 1EB8 ; DISALLOWED # LATIN CAPITAL LETTER E WITH DOT BELOW 2658 - 1EB9 ; PVALID # LATIN SMALL LETTER E WITH DOT BELOW 2659 - 1EBA ; DISALLOWED # LATIN CAPITAL LETTER E WITH HOOK ABOVE 2660 - 1EBB ; PVALID # LATIN SMALL LETTER E WITH HOOK ABOVE 2661 - 1EBC ; DISALLOWED # LATIN CAPITAL LETTER E WITH TILDE 2662 - 1EBD ; PVALID # LATIN SMALL LETTER E WITH TILDE 2663 - 1EBE ; DISALLOWED # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND A 2664 - 1EBF ; PVALID # LATIN SMALL LETTER E WITH CIRCUMFLEX AND ACU 2665 - 1EC0 ; DISALLOWED # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND G 2666 - 1EC1 ; PVALID # LATIN SMALL LETTER E WITH CIRCUMFLEX AND GRA 2667 - 1EC2 ; DISALLOWED # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND H 2668 - 1EC3 ; PVALID # LATIN SMALL LETTER E WITH CIRCUMFLEX AND HOO 2669 - 1EC4 ; DISALLOWED # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND T 2670 - 1EC5 ; PVALID # LATIN SMALL LETTER E WITH CIRCUMFLEX AND TIL 2671 - 1EC6 ; DISALLOWED # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND D 2672 - 1EC7 ; PVALID # LATIN SMALL LETTER E WITH CIRCUMFLEX AND DOT 2673 - 1EC8 ; DISALLOWED # LATIN CAPITAL LETTER I WITH HOOK ABOVE 2674 - 1EC9 ; PVALID # LATIN SMALL LETTER I WITH HOOK ABOVE 2675 - 1ECA ; DISALLOWED # LATIN CAPITAL LETTER I WITH DOT BELOW 2676 - 1ECB ; PVALID # LATIN SMALL LETTER I WITH DOT BELOW 2677 - 1ECC ; DISALLOWED # LATIN CAPITAL LETTER O WITH DOT BELOW 2678 - 1ECD ; PVALID # LATIN SMALL LETTER O WITH DOT BELOW 2679 - 1ECE ; DISALLOWED # LATIN CAPITAL LETTER O WITH HOOK ABOVE 2680 - 1ECF ; PVALID # LATIN SMALL LETTER O WITH HOOK ABOVE 2681 - 1ED0 ; DISALLOWED # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND A 2682 - 1ED1 ; PVALID # LATIN SMALL LETTER O WITH CIRCUMFLEX 
AND ACU 2683 - 1ED2 ; DISALLOWED # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND G 2684 - 1ED3 ; PVALID # LATIN SMALL LETTER O WITH CIRCUMFLEX AND GRA 2685 - 1ED4 ; DISALLOWED # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND H 2686 - 1ED5 ; PVALID # LATIN SMALL LETTER O WITH CIRCUMFLEX AND HOO 2687 - 2688 - 2689 - 2690 - Faltstrom Standards Track [Page 48] 2691 - 2692 - RFC 5892 IDNA Code Points August 2010 2693 - 2694 - 2695 - 1ED6 ; DISALLOWED # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND T 2696 - 1ED7 ; PVALID # LATIN SMALL LETTER O WITH CIRCUMFLEX AND TIL 2697 - 1ED8 ; DISALLOWED # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND D 2698 - 1ED9 ; PVALID # LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT 2699 - 1EDA ; DISALLOWED # LATIN CAPITAL LETTER O WITH HORN AND ACUTE 2700 - 1EDB ; PVALID # LATIN SMALL LETTER O WITH HORN AND ACUTE 2701 - 1EDC ; DISALLOWED # LATIN CAPITAL LETTER O WITH HORN AND GRAVE 2702 - 1EDD ; PVALID # LATIN SMALL LETTER O WITH HORN AND GRAVE 2703 - 1EDE ; DISALLOWED # LATIN CAPITAL LETTER O WITH HORN AND HOOK AB 2704 - 1EDF ; PVALID # LATIN SMALL LETTER O WITH HORN AND HOOK ABOV 2705 - 1EE0 ; DISALLOWED # LATIN CAPITAL LETTER O WITH HORN AND TILDE 2706 - 1EE1 ; PVALID # LATIN SMALL LETTER O WITH HORN AND TILDE 2707 - 1EE2 ; DISALLOWED # LATIN CAPITAL LETTER O WITH HORN AND DOT BEL 2708 - 1EE3 ; PVALID # LATIN SMALL LETTER O WITH HORN AND DOT BELOW 2709 - 1EE4 ; DISALLOWED # LATIN CAPITAL LETTER U WITH DOT BELOW 2710 - 1EE5 ; PVALID # LATIN SMALL LETTER U WITH DOT BELOW 2711 - 1EE6 ; DISALLOWED # LATIN CAPITAL LETTER U WITH HOOK ABOVE 2712 - 1EE7 ; PVALID # LATIN SMALL LETTER U WITH HOOK ABOVE 2713 - 1EE8 ; DISALLOWED # LATIN CAPITAL LETTER U WITH HORN AND ACUTE 2714 - 1EE9 ; PVALID # LATIN SMALL LETTER U WITH HORN AND ACUTE 2715 - 1EEA ; DISALLOWED # LATIN CAPITAL LETTER U WITH HORN AND GRAVE 2716 - 1EEB ; PVALID # LATIN SMALL LETTER U WITH HORN AND GRAVE 2717 - 1EEC ; DISALLOWED # LATIN CAPITAL LETTER U WITH HORN AND HOOK AB 2718 - 1EED ; PVALID # LATIN 
SMALL LETTER U WITH HORN AND HOOK ABOV 2719 - 1EEE ; DISALLOWED # LATIN CAPITAL LETTER U WITH HORN AND TILDE 2720 - 1EEF ; PVALID # LATIN SMALL LETTER U WITH HORN AND TILDE 2721 - 1EF0 ; DISALLOWED # LATIN CAPITAL LETTER U WITH HORN AND DOT BEL 2722 - 1EF1 ; PVALID # LATIN SMALL LETTER U WITH HORN AND DOT BELOW 2723 - 1EF2 ; DISALLOWED # LATIN CAPITAL LETTER Y WITH GRAVE 2724 - 1EF3 ; PVALID # LATIN SMALL LETTER Y WITH GRAVE 2725 - 1EF4 ; DISALLOWED # LATIN CAPITAL LETTER Y WITH DOT BELOW 2726 - 1EF5 ; PVALID # LATIN SMALL LETTER Y WITH DOT BELOW 2727 - 1EF6 ; DISALLOWED # LATIN CAPITAL LETTER Y WITH HOOK ABOVE 2728 - 1EF7 ; PVALID # LATIN SMALL LETTER Y WITH HOOK ABOVE 2729 - 1EF8 ; DISALLOWED # LATIN CAPITAL LETTER Y WITH TILDE 2730 - 1EF9 ; PVALID # LATIN SMALL LETTER Y WITH TILDE 2731 - 1EFA ; DISALLOWED # LATIN CAPITAL LETTER MIDDLE-WELSH LL 2732 - 1EFB ; PVALID # LATIN SMALL LETTER MIDDLE-WELSH LL 2733 - 1EFC ; DISALLOWED # LATIN CAPITAL LETTER MIDDLE-WELSH V 2734 - 1EFD ; PVALID # LATIN SMALL LETTER MIDDLE-WELSH V 2735 - 1EFE ; DISALLOWED # LATIN CAPITAL LETTER Y WITH LOOP 2736 - 1EFF..1F07 ; PVALID # LATIN SMALL LETTER Y WITH LOOP..GREEK SMALL 2737 - 1F08..1F0F ; DISALLOWED # GREEK CAPITAL LETTER ALPHA WITH PSILI..GREEK 2738 - 1F10..1F15 ; PVALID # GREEK SMALL LETTER EPSILON WITH PSILI..GREEK 2739 - 1F16..1F17 ; UNASSIGNED # <reserved>..<reserved> 2740 - 1F18..1F1D ; DISALLOWED # GREEK CAPITAL LETTER EPSILON WITH PSILI..GRE 2741 - 1F1E..1F1F ; UNASSIGNED # <reserved>..<reserved> 2742 - 1F20..1F27 ; PVALID # GREEK SMALL LETTER ETA WITH PSILI..GREEK SMA 2743 - 2744 - 2745 - 2746 - Faltstrom Standards Track [Page 49] 2747 - 2748 - RFC 5892 IDNA Code Points August 2010 2749 - 2750 - 2751 - 1F28..1F2F ; DISALLOWED # GREEK CAPITAL LETTER ETA WITH PSILI..GREEK C 2752 - 1F30..1F37 ; PVALID # GREEK SMALL LETTER IOTA WITH PSILI..GREEK SM 2753 - 1F38..1F3F ; DISALLOWED # GREEK CAPITAL LETTER IOTA WITH PSILI..GREEK 2754 - 1F40..1F45 ; PVALID # GREEK SMALL LETTER 
OMICRON WITH PSILI..GREEK 2755 - 1F46..1F47 ; UNASSIGNED # <reserved>..<reserved> 2756 - 1F48..1F4D ; DISALLOWED # GREEK CAPITAL LETTER OMICRON WITH PSILI..GRE 2757 - 1F4E..1F4F ; UNASSIGNED # <reserved>..<reserved> 2758 - 1F50..1F57 ; PVALID # GREEK SMALL LETTER UPSILON WITH PSILI..GREEK 2759 - 1F58 ; UNASSIGNED # <reserved> 2760 - 1F59 ; DISALLOWED # GREEK CAPITAL LETTER UPSILON WITH DASIA 2761 - 1F5A ; UNASSIGNED # <reserved> 2762 - 1F5B ; DISALLOWED # GREEK CAPITAL LETTER UPSILON WITH DASIA AND 2763 - 1F5C ; UNASSIGNED # <reserved> 2764 - 1F5D ; DISALLOWED # GREEK CAPITAL LETTER UPSILON WITH DASIA AND 2765 - 1F5E ; UNASSIGNED # <reserved> 2766 - 1F5F ; DISALLOWED # GREEK CAPITAL LETTER UPSILON WITH DASIA AND 2767 - 1F60..1F67 ; PVALID # GREEK SMALL LETTER OMEGA WITH PSILI..GREEK S 2768 - 1F68..1F6F ; DISALLOWED # GREEK CAPITAL LETTER OMEGA WITH PSILI..GREEK 2769 - 1F70 ; PVALID # GREEK SMALL LETTER ALPHA WITH VARIA 2770 - 1F71 ; DISALLOWED # GREEK SMALL LETTER ALPHA WITH OXIA 2771 - 1F72 ; PVALID # GREEK SMALL LETTER EPSILON WITH VARIA 2772 - 1F73 ; DISALLOWED # GREEK SMALL LETTER EPSILON WITH OXIA 2773 - 1F74 ; PVALID # GREEK SMALL LETTER ETA WITH VARIA 2774 - 1F75 ; DISALLOWED # GREEK SMALL LETTER ETA WITH OXIA 2775 - 1F76 ; PVALID # GREEK SMALL LETTER IOTA WITH VARIA 2776 - 1F77 ; DISALLOWED # GREEK SMALL LETTER IOTA WITH OXIA 2777 - 1F78 ; PVALID # GREEK SMALL LETTER OMICRON WITH VARIA 2778 - 1F79 ; DISALLOWED # GREEK SMALL LETTER OMICRON WITH OXIA 2779 - 1F7A ; PVALID # GREEK SMALL LETTER UPSILON WITH VARIA 2780 - 1F7B ; DISALLOWED # GREEK SMALL LETTER UPSILON WITH OXIA 2781 - 1F7C ; PVALID # GREEK SMALL LETTER OMEGA WITH VARIA 2782 - 1F7D ; DISALLOWED # GREEK SMALL LETTER OMEGA WITH OXIA 2783 - 1F7E..1F7F ; UNASSIGNED # <reserved>..<reserved> 2784 - 1F80..1FAF ; DISALLOWED # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOG 2785 - 1FB0..1FB1 ; PVALID # GREEK SMALL LETTER ALPHA WITH VRACHY..GREEK 2786 - 1FB2..1FB4 ; DISALLOWED # GREEK SMALL LETTER ALPHA WITH 
VARIA AND YPOG 2787 - 1FB5 ; UNASSIGNED # <reserved> 2788 - 1FB6 ; PVALID # GREEK SMALL LETTER ALPHA WITH PERISPOMENI 2789 - 1FB7..1FC4 ; DISALLOWED # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AN 2790 - 1FC5 ; UNASSIGNED # <reserved> 2791 - 1FC6 ; PVALID # GREEK SMALL LETTER ETA WITH PERISPOMENI 2792 - 1FC7..1FCF ; DISALLOWED # GREEK SMALL LETTER ETA WITH PERISPOMENI AND 2793 - 1FD0..1FD2 ; PVALID # GREEK SMALL LETTER IOTA WITH VRACHY..GREEK S 2794 - 1FD3 ; DISALLOWED # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND O 2795 - 1FD4..1FD5 ; UNASSIGNED # <reserved>..<reserved> 2796 - 1FD6..1FD7 ; PVALID # GREEK SMALL LETTER IOTA WITH PERISPOMENI..GR 2797 - 1FD8..1FDB ; DISALLOWED # GREEK CAPITAL LETTER IOTA WITH VRACHY..GREEK 2798 - 1FDC ; UNASSIGNED # <reserved> 2799 - 2800 - 2801 - 2802 - Faltstrom Standards Track [Page 50] 2803 - 2804 - RFC 5892 IDNA Code Points August 2010 2805 - 2806 - 2807 - 1FDD..1FDF ; DISALLOWED # GREEK DASIA AND VARIA..GREEK DASIA AND PERIS 2808 - 1FE0..1FE2 ; PVALID # GREEK SMALL LETTER UPSILON WITH VRACHY..GREE 2809 - 1FE3 ; DISALLOWED # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AN 2810 - 1FE4..1FE7 ; PVALID # GREEK SMALL LETTER RHO WITH PSILI..GREEK SMA 2811 - 1FE8..1FEF ; DISALLOWED # GREEK CAPITAL LETTER UPSILON WITH VRACHY..GR 2812 - 1FF0..1FF1 ; UNASSIGNED # <reserved>..<reserved> 2813 - 1FF2..1FF4 ; DISALLOWED # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOG 2814 - 1FF5 ; UNASSIGNED # <reserved> 2815 - 1FF6 ; PVALID # GREEK SMALL LETTER OMEGA WITH PERISPOMENI 2816 - 1FF7..1FFE ; DISALLOWED # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AN 2817 - 1FFF ; UNASSIGNED # <reserved> 2818 - 2000..200B ; DISALLOWED # EN QUAD..ZERO WIDTH SPACE 2819 - 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER 2820 - 200E..2064 ; DISALLOWED # LEFT-TO-RIGHT MARK..INVISIBLE PLUS 2821 - 2065..2069 ; UNASSIGNED # <reserved>..<reserved> 2822 - 206A..2071 ; DISALLOWED # INHIBIT SYMMETRIC SWAPPING..SUPERSCRIPT LATI 2823 - 2072..2073 ; UNASSIGNED # 
<reserved>..<reserved> 2824 - 2074..208E ; DISALLOWED # SUPERSCRIPT FOUR..SUBSCRIPT RIGHT PARENTHESI 2825 - 208F ; UNASSIGNED # <reserved> 2826 - 2090..2094 ; DISALLOWED # LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCR 2827 - 2095..209F ; UNASSIGNED # <reserved>..<reserved> 2828 - 20A0..20B8 ; DISALLOWED # EURO-CURRENCY SIGN..TENGE SIGN 2829 - 20B9..20CF ; UNASSIGNED # <reserved>..<reserved> 2830 - 20D0..20F0 ; DISALLOWED # COMBINING LEFT HARPOON ABOVE..COMBINING ASTE 2831 - 20F1..20FF ; UNASSIGNED # <reserved>..<reserved> 2832 - 2100..214D ; DISALLOWED # ACCOUNT OF..AKTIESELSKAB 2833 - 214E ; PVALID # TURNED SMALL F 2834 - 214F..2183 ; DISALLOWED # SYMBOL FOR SAMARITAN SOURCE..ROMAN NUMERAL R 2835 - 2184 ; PVALID # LATIN SMALL LETTER REVERSED C 2836 - 2185..2189 ; DISALLOWED # ROMAN NUMERAL SIX LATE FORM..VULGAR FRACTION 2837 - 218A..218F ; UNASSIGNED # <reserved>..<reserved> 2838 - 2190..23E8 ; DISALLOWED # LEFTWARDS ARROW..DECIMAL EXPONENT SYMBOL 2839 - 23E9..23FF ; UNASSIGNED # <reserved>..<reserved> 2840 - 2400..2426 ; DISALLOWED # SYMBOL FOR NULL..SYMBOL FOR SUBSTITUTE FORM 2841 - 2427..243F ; UNASSIGNED # <reserved>..<reserved> 2842 - 2440..244A ; DISALLOWED # OCR HOOK..OCR DOUBLE BACKSLASH 2843 - 244B..245F ; UNASSIGNED # <reserved>..<reserved> 2844 - 2460..26CD ; DISALLOWED # CIRCLED DIGIT ONE..DISABLED CAR 2845 - 26CE ; UNASSIGNED # <reserved> 2846 - 26CF..26E1 ; DISALLOWED # PICK..RESTRICTED LEFT ENTRY-2 2847 - 26E2 ; UNASSIGNED # <reserved> 2848 - 26E3 ; DISALLOWED # HEAVY CIRCLE WITH STROKE AND TWO DOTS ABOVE 2849 - 26E4..26E7 ; UNASSIGNED # <reserved>..<reserved> 2850 - 26E8..26FF ; DISALLOWED # BLACK CROSS ON SHIELD..WHITE FLAG WITH HORIZ 2851 - 2700 ; UNASSIGNED # <reserved> 2852 - 2701..2704 ; DISALLOWED # UPPER BLADE SCISSORS..WHITE SCISSORS 2853 - 2705 ; UNASSIGNED # <reserved> 2854 - 2706..2709 ; DISALLOWED # TELEPHONE LOCATION SIGN..ENVELOPE 2855 - 2856 - 2857 - 2858 - Faltstrom Standards Track [Page 51] 2859 - 2860 - RFC 5892 IDNA Code Points 
August 2010 2861 - 2862 - 2863 - 270A..270B ; UNASSIGNED # <reserved>..<reserved> 2864 - 270C..2727 ; DISALLOWED # VICTORY HAND..WHITE FOUR POINTED STAR 2865 - 2728 ; UNASSIGNED # <reserved> 2866 - 2729..274B ; DISALLOWED # STRESS OUTLINED WHITE STAR..HEAVY EIGHT TEAR 2867 - 274C ; UNASSIGNED # <reserved> 2868 - 274D ; DISALLOWED # SHADOWED WHITE CIRCLE 2869 - 274E ; UNASSIGNED # <reserved> 2870 - 274F..2752 ; DISALLOWED # LOWER RIGHT DROP-SHADOWED WHITE SQUARE..UPPE 2871 - 2753..2755 ; UNASSIGNED # <reserved>..<reserved> 2872 - 2756..275E ; DISALLOWED # BLACK DIAMOND MINUS WHITE X..HEAVY DOUBLE CO 2873 - 275F..2760 ; UNASSIGNED # <reserved>..<reserved> 2874 - 2761..2794 ; DISALLOWED # CURVED STEM PARAGRAPH SIGN ORNAMENT..HEAVY W 2875 - 2795..2797 ; UNASSIGNED # <reserved>..<reserved> 2876 - 2798..27AF ; DISALLOWED # HEAVY SOUTH EAST ARROW..NOTCHED LOWER RIGHT- 2877 - 27B0 ; UNASSIGNED # <reserved> 2878 - 27B1..27BE ; DISALLOWED # NOTCHED UPPER RIGHT-SHADOWED WHITE RIGHTWARD 2879 - 27BF ; UNASSIGNED # <reserved> 2880 - 27C0..27CA ; DISALLOWED # THREE DIMENSIONAL ANGLE..VERTICAL BAR WITH H 2881 - 27CB ; UNASSIGNED # <reserved> 2882 - 27CC ; DISALLOWED # LONG DIVISION 2883 - 27CD..27CF ; UNASSIGNED # <reserved>..<reserved> 2884 - 27D0..2B4C ; DISALLOWED # WHITE DIAMOND WITH CENTRED DOT..RIGHTWARDS A 2885 - 2B4D..2B4F ; UNASSIGNED # <reserved>..<reserved> 2886 - 2B50..2B59 ; DISALLOWED # WHITE MEDIUM STAR..HEAVY CIRCLED SALTIRE 2887 - 2B5A..2BFF ; UNASSIGNED # <reserved>..<reserved> 2888 - 2C00..2C2E ; DISALLOWED # GLAGOLITIC CAPITAL LETTER AZU..GLAGOLITIC CA 2889 - 2C2F ; UNASSIGNED # <reserved> 2890 - 2C30..2C5E ; PVALID # GLAGOLITIC SMALL LETTER AZU..GLAGOLITIC SMAL 2891 - 2C5F ; UNASSIGNED # <reserved> 2892 - 2C60 ; DISALLOWED # LATIN CAPITAL LETTER L WITH DOUBLE BAR 2893 - 2C61 ; PVALID # LATIN SMALL LETTER L WITH DOUBLE BAR 2894 - 2C62..2C64 ; DISALLOWED # LATIN CAPITAL LETTER L WITH MIDDLE TILDE..LA 2895 - 2C65..2C66 ; PVALID # LATIN SMALL LETTER A WITH 
STROKE..LATIN SMAL 2896 - 2C67 ; DISALLOWED # LATIN CAPITAL LETTER H WITH DESCENDER 2897 - 2C68 ; PVALID # LATIN SMALL LETTER H WITH DESCENDER 2898 - 2C69 ; DISALLOWED # LATIN CAPITAL LETTER K WITH DESCENDER 2899 - 2C6A ; PVALID # LATIN SMALL LETTER K WITH DESCENDER 2900 - 2C6B ; DISALLOWED # LATIN CAPITAL LETTER Z WITH DESCENDER 2901 - 2C6C ; PVALID # LATIN SMALL LETTER Z WITH DESCENDER 2902 - 2C6D..2C70 ; DISALLOWED # LATIN CAPITAL LETTER ALPHA..LATIN CAPITAL LE 2903 - 2C71 ; PVALID # LATIN SMALL LETTER V WITH RIGHT HOOK 2904 - 2C72 ; DISALLOWED # LATIN CAPITAL LETTER W WITH HOOK 2905 - 2C73..2C74 ; PVALID # LATIN SMALL LETTER W WITH HOOK..LATIN SMALL 2906 - 2C75 ; DISALLOWED # LATIN CAPITAL LETTER HALF H 2907 - 2C76..2C7B ; PVALID # LATIN SMALL LETTER HALF H..LATIN LETTER SMAL 2908 - 2C7C..2C80 ; DISALLOWED # LATIN SUBSCRIPT SMALL LETTER J..COPTIC CAPIT 2909 - 2C81 ; PVALID # COPTIC SMALL LETTER ALFA 2910 - 2C82 ; DISALLOWED # COPTIC CAPITAL LETTER VIDA 2911 - 2912 - 2913 - 2914 - Faltstrom Standards Track [Page 52] 2915 - 2916 - RFC 5892 IDNA Code Points August 2010 2917 - 2918 - 2919 - 2C83 ; PVALID # COPTIC SMALL LETTER VIDA 2920 - 2C84 ; DISALLOWED # COPTIC CAPITAL LETTER GAMMA 2921 - 2C85 ; PVALID # COPTIC SMALL LETTER GAMMA 2922 - 2C86 ; DISALLOWED # COPTIC CAPITAL LETTER DALDA 2923 - 2C87 ; PVALID # COPTIC SMALL LETTER DALDA 2924 - 2C88 ; DISALLOWED # COPTIC CAPITAL LETTER EIE 2925 - 2C89 ; PVALID # COPTIC SMALL LETTER EIE 2926 - 2C8A ; DISALLOWED # COPTIC CAPITAL LETTER SOU 2927 - 2C8B ; PVALID # COPTIC SMALL LETTER SOU 2928 - 2C8C ; DISALLOWED # COPTIC CAPITAL LETTER ZATA 2929 - 2C8D ; PVALID # COPTIC SMALL LETTER ZATA 2930 - 2C8E ; DISALLOWED # COPTIC CAPITAL LETTER HATE 2931 - 2C8F ; PVALID # COPTIC SMALL LETTER HATE 2932 - 2C90 ; DISALLOWED # COPTIC CAPITAL LETTER THETHE 2933 - 2C91 ; PVALID # COPTIC SMALL LETTER THETHE 2934 - 2C92 ; DISALLOWED # COPTIC CAPITAL LETTER IAUDA 2935 - 2C93 ; PVALID # COPTIC SMALL LETTER IAUDA 2936 - 2C94 ; DISALLOWED # 
COPTIC CAPITAL LETTER KAPA 2937 - 2C95 ; PVALID # COPTIC SMALL LETTER KAPA 2938 - 2C96 ; DISALLOWED # COPTIC CAPITAL LETTER LAULA 2939 - 2C97 ; PVALID # COPTIC SMALL LETTER LAULA 2940 - 2C98 ; DISALLOWED # COPTIC CAPITAL LETTER MI 2941 - 2C99 ; PVALID # COPTIC SMALL LETTER MI 2942 - 2C9A ; DISALLOWED # COPTIC CAPITAL LETTER NI 2943 - 2C9B ; PVALID # COPTIC SMALL LETTER NI 2944 - 2C9C ; DISALLOWED # COPTIC CAPITAL LETTER KSI 2945 - 2C9D ; PVALID # COPTIC SMALL LETTER KSI 2946 - 2C9E ; DISALLOWED # COPTIC CAPITAL LETTER O 2947 - 2C9F ; PVALID # COPTIC SMALL LETTER O 2948 - 2CA0 ; DISALLOWED # COPTIC CAPITAL LETTER PI 2949 - 2CA1 ; PVALID # COPTIC SMALL LETTER PI 2950 - 2CA2 ; DISALLOWED # COPTIC CAPITAL LETTER RO 2951 - 2CA3 ; PVALID # COPTIC SMALL LETTER RO 2952 - 2CA4 ; DISALLOWED # COPTIC CAPITAL LETTER SIMA 2953 - 2CA5 ; PVALID # COPTIC SMALL LETTER SIMA 2954 - 2CA6 ; DISALLOWED # COPTIC CAPITAL LETTER TAU 2955 - 2CA7 ; PVALID # COPTIC SMALL LETTER TAU 2956 - 2CA8 ; DISALLOWED # COPTIC CAPITAL LETTER UA 2957 - 2CA9 ; PVALID # COPTIC SMALL LETTER UA 2958 - 2CAA ; DISALLOWED # COPTIC CAPITAL LETTER FI 2959 - 2CAB ; PVALID # COPTIC SMALL LETTER FI 2960 - 2CAC ; DISALLOWED # COPTIC CAPITAL LETTER KHI 2961 - 2CAD ; PVALID # COPTIC SMALL LETTER KHI 2962 - 2CAE ; DISALLOWED # COPTIC CAPITAL LETTER PSI 2963 - 2CAF ; PVALID # COPTIC SMALL LETTER PSI 2964 - 2CB0 ; DISALLOWED # COPTIC CAPITAL LETTER OOU 2965 - 2CB1 ; PVALID # COPTIC SMALL LETTER OOU 2966 - 2CB2 ; DISALLOWED # COPTIC CAPITAL LETTER DIALECT-P ALEF 2967 - 2968 - 2969 - 2970 - Faltstrom Standards Track [Page 53] 2971 - 2972 - RFC 5892 IDNA Code Points August 2010 2973 - 2974 - 2975 - 2CB3 ; PVALID # COPTIC SMALL LETTER DIALECT-P ALEF 2976 - 2CB4 ; DISALLOWED # COPTIC CAPITAL LETTER OLD COPTIC AIN 2977 - 2CB5 ; PVALID # COPTIC SMALL LETTER OLD COPTIC AIN 2978 - 2CB6 ; DISALLOWED # COPTIC CAPITAL LETTER CRYPTOGRAMMIC EIE 2979 - 2CB7 ; PVALID # COPTIC SMALL LETTER CRYPTOGRAMMIC EIE 2980 - 2CB8 ; DISALLOWED # 
COPTIC CAPITAL LETTER DIALECT-P KAPA 2981 - 2CB9 ; PVALID # COPTIC SMALL LETTER DIALECT-P KAPA 2982 - 2CBA ; DISALLOWED # COPTIC CAPITAL LETTER DIALECT-P NI 2983 - 2CBB ; PVALID # COPTIC SMALL LETTER DIALECT-P NI 2984 - 2CBC ; DISALLOWED # COPTIC CAPITAL LETTER CRYPTOGRAMMIC NI 2985 - 2CBD ; PVALID # COPTIC SMALL LETTER CRYPTOGRAMMIC NI 2986 - 2CBE ; DISALLOWED # COPTIC CAPITAL LETTER OLD COPTIC OOU 2987 - 2CBF ; PVALID # COPTIC SMALL LETTER OLD COPTIC OOU 2988 - 2CC0 ; DISALLOWED # COPTIC CAPITAL LETTER SAMPI 2989 - 2CC1 ; PVALID # COPTIC SMALL LETTER SAMPI 2990 - 2CC2 ; DISALLOWED # COPTIC CAPITAL LETTER CROSSED SHEI 2991 - 2CC3 ; PVALID # COPTIC SMALL LETTER CROSSED SHEI 2992 - 2CC4 ; DISALLOWED # COPTIC CAPITAL LETTER OLD COPTIC SHEI 2993 - 2CC5 ; PVALID # COPTIC SMALL LETTER OLD COPTIC SHEI 2994 - 2CC6 ; DISALLOWED # COPTIC CAPITAL LETTER OLD COPTIC ESH 2995 - 2CC7 ; PVALID # COPTIC SMALL LETTER OLD COPTIC ESH 2996 - 2CC8 ; DISALLOWED # COPTIC CAPITAL LETTER AKHMIMIC KHEI 2997 - 2CC9 ; PVALID # COPTIC SMALL LETTER AKHMIMIC KHEI 2998 - 2CCA ; DISALLOWED # COPTIC CAPITAL LETTER DIALECT-P HORI 2999 - 2CCB ; PVALID # COPTIC SMALL LETTER DIALECT-P HORI 3000 - 2CCC ; DISALLOWED # COPTIC CAPITAL LETTER OLD COPTIC HORI 3001 - 2CCD ; PVALID # COPTIC SMALL LETTER OLD COPTIC HORI 3002 - 2CCE ; DISALLOWED # COPTIC CAPITAL LETTER OLD COPTIC HA 3003 - 2CCF ; PVALID # COPTIC SMALL LETTER OLD COPTIC HA 3004 - 2CD0 ; DISALLOWED # COPTIC CAPITAL LETTER L-SHAPED HA 3005 - 2CD1 ; PVALID # COPTIC SMALL LETTER L-SHAPED HA 3006 - 2CD2 ; DISALLOWED # COPTIC CAPITAL LETTER OLD COPTIC HEI 3007 - 2CD3 ; PVALID # COPTIC SMALL LETTER OLD COPTIC HEI 3008 - 2CD4 ; DISALLOWED # COPTIC CAPITAL LETTER OLD COPTIC HAT 3009 - 2CD5 ; PVALID # COPTIC SMALL LETTER OLD COPTIC HAT 3010 - 2CD6 ; DISALLOWED # COPTIC CAPITAL LETTER OLD COPTIC GANGIA 3011 - 2CD7 ; PVALID # COPTIC SMALL LETTER OLD COPTIC GANGIA 3012 - 2CD8 ; DISALLOWED # COPTIC CAPITAL LETTER OLD COPTIC DJA 3013 - 2CD9 ; PVALID # COPTIC 
SMALL LETTER OLD COPTIC DJA 3014 - 2CDA ; DISALLOWED # COPTIC CAPITAL LETTER OLD COPTIC SHIMA 3015 - 2CDB ; PVALID # COPTIC SMALL LETTER OLD COPTIC SHIMA 3016 - 2CDC ; DISALLOWED # COPTIC CAPITAL LETTER OLD NUBIAN SHIMA 3017 - 2CDD ; PVALID # COPTIC SMALL LETTER OLD NUBIAN SHIMA 3018 - 2CDE ; DISALLOWED # COPTIC CAPITAL LETTER OLD NUBIAN NGI 3019 - 2CDF ; PVALID # COPTIC SMALL LETTER OLD NUBIAN NGI 3020 - 2CE0 ; DISALLOWED # COPTIC CAPITAL LETTER OLD NUBIAN NYI 3021 - 2CE1 ; PVALID # COPTIC SMALL LETTER OLD NUBIAN NYI 3022 - 2CE2 ; DISALLOWED # COPTIC CAPITAL LETTER OLD NUBIAN WAU 3023 - 3024 - 3025 - 3026 - Faltstrom Standards Track [Page 54] 3027 - 3028 - RFC 5892 IDNA Code Points August 2010 3029 - 3030 - 3031 - 2CE3..2CE4 ; PVALID # COPTIC SMALL LETTER OLD NUBIAN WAU..COPTIC S 3032 - 2CE5..2CEB ; DISALLOWED # COPTIC SYMBOL MI RO..COPTIC CAPITAL LETTER C 3033 - 2CEC ; PVALID # COPTIC SMALL LETTER CRYPTOGRAMMIC SHEI 3034 - 2CED ; DISALLOWED # COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA 3035 - 2CEE..2CF1 ; PVALID # COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA..CO 3036 - 2CF2..2CF8 ; UNASSIGNED # <reserved>..<reserved> 3037 - 2CF9..2CFF ; DISALLOWED # COPTIC OLD NUBIAN FULL STOP..COPTIC MORPHOLO 3038 - 2D00..2D25 ; PVALID # GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LET 3039 - 2D26..2D2F ; UNASSIGNED # <reserved>..<reserved> 3040 - 2D30..2D65 ; PVALID # TIFINAGH LETTER YA..TIFINAGH LETTER YAZZ 3041 - 2D66..2D6E ; UNASSIGNED # <reserved>..<reserved> 3042 - 2D6F ; DISALLOWED # TIFINAGH MODIFIER LETTER LABIALIZATION MARK 3043 - 2D70..2D7F ; UNASSIGNED # <reserved>..<reserved> 3044 - 2D80..2D96 ; PVALID # ETHIOPIC SYLLABLE LOA..ETHIOPIC SYLLABLE GGW 3045 - 2D97..2D9F ; UNASSIGNED # <reserved>..<reserved> 3046 - 2DA0..2DA6 ; PVALID # ETHIOPIC SYLLABLE SSA..ETHIOPIC SYLLABLE SSO 3047 - 2DA7 ; UNASSIGNED # <reserved> 3048 - 2DA8..2DAE ; PVALID # ETHIOPIC SYLLABLE CCA..ETHIOPIC SYLLABLE CCO 3049 - 2DAF ; UNASSIGNED # <reserved> 3050 - 2DB0..2DB6 ; PVALID # ETHIOPIC SYLLABLE 
ZZA..ETHIOPIC SYLLABLE ZZO 3051 - 2DB7 ; UNASSIGNED # <reserved> 3052 - 2DB8..2DBE ; PVALID # ETHIOPIC SYLLABLE CCHA..ETHIOPIC SYLLABLE CC 3053 - 2DBF ; UNASSIGNED # <reserved> 3054 - 2DC0..2DC6 ; PVALID # ETHIOPIC SYLLABLE QYA..ETHIOPIC SYLLABLE QYO 3055 - 2DC7 ; UNASSIGNED # <reserved> 3056 - 2DC8..2DCE ; PVALID # ETHIOPIC SYLLABLE KYA..ETHIOPIC SYLLABLE KYO 3057 - 2DCF ; UNASSIGNED # <reserved> 3058 - 2DD0..2DD6 ; PVALID # ETHIOPIC SYLLABLE XYA..ETHIOPIC SYLLABLE XYO 3059 - 2DD7 ; UNASSIGNED # <reserved> 3060 - 2DD8..2DDE ; PVALID # ETHIOPIC SYLLABLE GYA..ETHIOPIC SYLLABLE GYO 3061 - 2DDF ; UNASSIGNED # <reserved> 3062 - 2DE0..2DFF ; PVALID # COMBINING CYRILLIC LETTER BE..COMBINING CYRI 3063 - 2E00..2E2E ; DISALLOWED # RIGHT ANGLE SUBSTITUTION MARKER..REVERSED QU 3064 - 2E2F ; PVALID # VERTICAL TILDE 3065 - 2E30..2E31 ; DISALLOWED # RING POINT..WORD SEPARATOR MIDDLE DOT 3066 - 2E32..2E7F ; UNASSIGNED # <reserved>..<reserved> 3067 - 2E80..2E99 ; DISALLOWED # CJK RADICAL REPEAT..CJK RADICAL RAP 3068 - 2E9A ; UNASSIGNED # <reserved> 3069 - 2E9B..2EF3 ; DISALLOWED # CJK RADICAL CHOKE..CJK RADICAL C-SIMPLIFIED 3070 - 2EF4..2EFF ; UNASSIGNED # <reserved>..<reserved> 3071 - 2F00..2FD5 ; DISALLOWED # KANGXI RADICAL ONE..KANGXI RADICAL FLUTE 3072 - 2FD6..2FEF ; UNASSIGNED # <reserved>..<reserved> 3073 - 2FF0..2FFB ; DISALLOWED # IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RI 3074 - 2FFC..2FFF ; UNASSIGNED # <reserved>..<reserved> 3075 - 3000..3004 ; DISALLOWED # IDEOGRAPHIC SPACE..JAPANESE INDUSTRIAL STAND 3076 - 3005..3007 ; PVALID # IDEOGRAPHIC ITERATION MARK..IDEOGRAPHIC NUMB 3077 - 3008..3029 ; DISALLOWED # LEFT ANGLE BRACKET..HANGZHOU NUMERAL NINE 3078 - 302A..302D ; PVALID # IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENT 3079 - 3080 - 3081 - 3082 - Faltstrom Standards Track [Page 55] 3083 - 3084 - RFC 5892 IDNA Code Points August 2010 3085 - 3086 - 3087 - 302E..303B ; DISALLOWED # HANGUL SINGLE DOT TONE MARK..VERTICAL IDEOGR 3088 - 303C ; PVALID # MASU MARK 3089 - 
303D..303F ; DISALLOWED # PART ALTERNATION MARK..IDEOGRAPHIC HALF FILL 3090 - 3040 ; UNASSIGNED # <reserved> 3091 - 3041..3096 ; PVALID # HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMA 3092 - 3097..3098 ; UNASSIGNED # <reserved>..<reserved> 3093 - 3099..309A ; PVALID # COMBINING KATAKANA-HIRAGANA VOICED SOUND MAR 3094 - 309B..309C ; DISALLOWED # KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKAN 3095 - 309D..309E ; PVALID # HIRAGANA ITERATION MARK..HIRAGANA VOICED ITE 3096 - 309F..30A0 ; DISALLOWED # HIRAGANA DIGRAPH YORI..KATAKANA-HIRAGANA DOU 3097 - 30A1..30FA ; PVALID # KATAKANA LETTER SMALL A..KATAKANA LETTER VO 3098 - 30FB ; CONTEXTO # KATAKANA MIDDLE DOT 3099 - 30FC..30FE ; PVALID # KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATA 3100 - 30FF ; DISALLOWED # KATAKANA DIGRAPH KOTO 3101 - 3100..3104 ; UNASSIGNED # <reserved>..<reserved> 3102 - 3105..312D ; PVALID # BOPOMOFO LETTER B..BOPOMOFO LETTER IH 3103 - 312E..3130 ; UNASSIGNED # <reserved>..<reserved> 3104 - 3131..318E ; DISALLOWED # HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE 3105 - 318F ; UNASSIGNED # <reserved> 3106 - 3190..319F ; DISALLOWED # IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRA 3107 - 31A0..31B7 ; PVALID # BOPOMOFO LETTER BU..BOPOMOFO FINAL LETTER H 3108 - 31B8..31BF ; UNASSIGNED # <reserved>..<reserved> 3109 - 31C0..31E3 ; DISALLOWED # CJK STROKE T..CJK STROKE Q 3110 - 31E4..31EF ; UNASSIGNED # <reserved>..<reserved> 3111 - 31F0..31FF ; PVALID # KATAKANA LETTER SMALL KU..KATAKANA LETTER SM 3112 - 3200..321E ; DISALLOWED # PARENTHESIZED HANGUL KIYEOK..PARENTHESIZED K 3113 - 321F ; UNASSIGNED # <reserved> 3114 - 3220..32FE ; DISALLOWED # PARENTHESIZED IDEOGRAPH ONE..CIRCLED KATAKAN 3115 - 32FF ; UNASSIGNED # <reserved> 3116 - 3300..33FF ; DISALLOWED # SQUARE APAATO..SQUARE GAL 3117 - 3400..4DB5 ; PVALID # <CJK Ideograph Extension A>..<CJK Ideograph 3118 - 4DB6..4DBF ; UNASSIGNED # <reserved>..<reserved> 3119 - 4DC0..4DFF ; DISALLOWED # HEXAGRAM FOR THE CREATIVE HEAVEN..HEXAGRAM F 3120 - 4E00..9FCB ; PVALID 
# <CJK Ideograph>..<CJK Ideograph> 3121 - 9FCC..9FFF ; UNASSIGNED # <reserved>..<reserved> 3122 - A000..A48C ; PVALID # YI SYLLABLE IT..YI SYLLABLE YYR 3123 - A48D..A48F ; UNASSIGNED # <reserved>..<reserved> 3124 - A490..A4C6 ; DISALLOWED # YI RADICAL QOT..YI RADICAL KE 3125 - A4C7..A4CF ; UNASSIGNED # <reserved>..<reserved> 3126 - A4D0..A4FD ; PVALID # LISU LETTER BA..LISU LETTER TONE MYA JEU 3127 - A4FE..A4FF ; DISALLOWED # LISU PUNCTUATION COMMA..LISU PUNCTUATION FUL 3128 - A500..A60C ; PVALID # VAI SYLLABLE EE..VAI SYLLABLE LENGTHENER 3129 - A60D..A60F ; DISALLOWED # VAI COMMA..VAI QUESTION MARK 3130 - A610..A62B ; PVALID # VAI SYLLABLE NDOLE FA..VAI SYLLABLE NDOLE DO 3131 - A62C..A63F ; UNASSIGNED # <reserved>..<reserved> 3132 - A640 ; DISALLOWED # CYRILLIC CAPITAL LETTER ZEMLYA 3133 - A641 ; PVALID # CYRILLIC SMALL LETTER ZEMLYA 3134 - A642 ; DISALLOWED # CYRILLIC CAPITAL LETTER DZELO 3135 - 3136 - 3137 - 3138 - Faltstrom Standards Track [Page 56] 3139 - 3140 - RFC 5892 IDNA Code Points August 2010 3141 - 3142 - 3143 - A643 ; PVALID # CYRILLIC SMALL LETTER DZELO 3144 - A644 ; DISALLOWED # CYRILLIC CAPITAL LETTER REVERSED DZE 3145 - A645 ; PVALID # CYRILLIC SMALL LETTER REVERSED DZE 3146 - A646 ; DISALLOWED # CYRILLIC CAPITAL LETTER IOTA 3147 - A647 ; PVALID # CYRILLIC SMALL LETTER IOTA 3148 - A648 ; DISALLOWED # CYRILLIC CAPITAL LETTER DJERV 3149 - A649 ; PVALID # CYRILLIC SMALL LETTER DJERV 3150 - A64A ; DISALLOWED # CYRILLIC CAPITAL LETTER MONOGRAPH UK 3151 - A64B ; PVALID # CYRILLIC SMALL LETTER MONOGRAPH UK 3152 - A64C ; DISALLOWED # CYRILLIC CAPITAL LETTER BROAD OMEGA 3153 - A64D ; PVALID # CYRILLIC SMALL LETTER BROAD OMEGA 3154 - A64E ; DISALLOWED # CYRILLIC CAPITAL LETTER NEUTRAL YER 3155 - A64F ; PVALID # CYRILLIC SMALL LETTER NEUTRAL YER 3156 - A650 ; DISALLOWED # CYRILLIC CAPITAL LETTER YERU WITH BACK YER 3157 - A651 ; PVALID # CYRILLIC SMALL LETTER YERU WITH BACK YER 3158 - A652 ; DISALLOWED # CYRILLIC CAPITAL LETTER IOTIFIED YAT 3159 - A653 ; 
PVALID # CYRILLIC SMALL LETTER IOTIFIED YAT 3160 - A654 ; DISALLOWED # CYRILLIC CAPITAL LETTER REVERSED YU 3161 - A655 ; PVALID # CYRILLIC SMALL LETTER REVERSED YU 3162 - A656 ; DISALLOWED # CYRILLIC CAPITAL LETTER IOTIFIED A 3163 - A657 ; PVALID # CYRILLIC SMALL LETTER IOTIFIED A 3164 - A658 ; DISALLOWED # CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS 3165 - A659 ; PVALID # CYRILLIC SMALL LETTER CLOSED LITTLE YUS 3166 - A65A ; DISALLOWED # CYRILLIC CAPITAL LETTER BLENDED YUS 3167 - A65B ; PVALID # CYRILLIC SMALL LETTER BLENDED YUS 3168 - A65C ; DISALLOWED # CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITT 3169 - A65D ; PVALID # CYRILLIC SMALL LETTER IOTIFIED CLOSED LITTLE 3170 - A65E ; DISALLOWED # CYRILLIC CAPITAL LETTER YN 3171 - A65F ; PVALID # CYRILLIC SMALL LETTER YN 3172 - A660..A661 ; UNASSIGNED # <reserved>..<reserved> 3173 - A662 ; DISALLOWED # CYRILLIC CAPITAL LETTER SOFT DE 3174 - A663 ; PVALID # CYRILLIC SMALL LETTER SOFT DE 3175 - A664 ; DISALLOWED # CYRILLIC CAPITAL LETTER SOFT EL 3176 - A665 ; PVALID # CYRILLIC SMALL LETTER SOFT EL 3177 - A666 ; DISALLOWED # CYRILLIC CAPITAL LETTER SOFT EM 3178 - A667 ; PVALID # CYRILLIC SMALL LETTER SOFT EM 3179 - A668 ; DISALLOWED # CYRILLIC CAPITAL LETTER MONOCULAR O 3180 - A669 ; PVALID # CYRILLIC SMALL LETTER MONOCULAR O 3181 - A66A ; DISALLOWED # CYRILLIC CAPITAL LETTER BINOCULAR O 3182 - A66B ; PVALID # CYRILLIC SMALL LETTER BINOCULAR O 3183 - A66C ; DISALLOWED # CYRILLIC CAPITAL LETTER DOUBLE MONOCULAR O 3184 - A66D..A66F ; PVALID # CYRILLIC SMALL LETTER DOUBLE MONOCULAR O..CO 3185 - A670..A673 ; DISALLOWED # COMBINING CYRILLIC TEN MILLIONS SIGN..SLAVON 3186 - A674..A67B ; UNASSIGNED # <reserved>..<reserved> 3187 - A67C..A67D ; PVALID # COMBINING CYRILLIC KAVYKA..COMBINING CYRILLI 3188 - A67E ; DISALLOWED # CYRILLIC KAVYKA 3189 - A67F ; PVALID # CYRILLIC PAYEROK 3190 - A680 ; DISALLOWED # CYRILLIC CAPITAL LETTER DWE 3191 - 3192 - 3193 - 3194 - Faltstrom Standards Track [Page 57] 3195 - 3196 - RFC 5892 IDNA Code 
Points August 2010 3197 - 3198 - 3199 - A681 ; PVALID # CYRILLIC SMALL LETTER DWE 3200 - A682 ; DISALLOWED # CYRILLIC CAPITAL LETTER DZWE 3201 - A683 ; PVALID # CYRILLIC SMALL LETTER DZWE 3202 - A684 ; DISALLOWED # CYRILLIC CAPITAL LETTER ZHWE 3203 - A685 ; PVALID # CYRILLIC SMALL LETTER ZHWE 3204 - A686 ; DISALLOWED # CYRILLIC CAPITAL LETTER CCHE 3205 - A687 ; PVALID # CYRILLIC SMALL LETTER CCHE 3206 - A688 ; DISALLOWED # CYRILLIC CAPITAL LETTER DZZE 3207 - A689 ; PVALID # CYRILLIC SMALL LETTER DZZE 3208 - A68A ; DISALLOWED # CYRILLIC CAPITAL LETTER TE WITH MIDDLE HOOK 3209 - A68B ; PVALID # CYRILLIC SMALL LETTER TE WITH MIDDLE HOOK 3210 - A68C ; DISALLOWED # CYRILLIC CAPITAL LETTER TWE 3211 - A68D ; PVALID # CYRILLIC SMALL LETTER TWE 3212 - A68E ; DISALLOWED # CYRILLIC CAPITAL LETTER TSWE 3213 - A68F ; PVALID # CYRILLIC SMALL LETTER TSWE 3214 - A690 ; DISALLOWED # CYRILLIC CAPITAL LETTER TSSE 3215 - A691 ; PVALID # CYRILLIC SMALL LETTER TSSE 3216 - A692 ; DISALLOWED # CYRILLIC CAPITAL LETTER TCHE 3217 - A693 ; PVALID # CYRILLIC SMALL LETTER TCHE 3218 - A694 ; DISALLOWED # CYRILLIC CAPITAL LETTER HWE 3219 - A695 ; PVALID # CYRILLIC SMALL LETTER HWE 3220 - A696 ; DISALLOWED # CYRILLIC CAPITAL LETTER SHWE 3221 - A697 ; PVALID # CYRILLIC SMALL LETTER SHWE 3222 - A698..A69F ; UNASSIGNED # <reserved>..<reserved> 3223 - A6A0..A6E5 ; PVALID # BAMUM LETTER A..BAMUM LETTER KI 3224 - A6E6..A6EF ; DISALLOWED # BAMUM LETTER MO..BAMUM LETTER KOGHOM 3225 - A6F0..A6F1 ; PVALID # BAMUM COMBINING MARK KOQNDON..BAMUM COMBININ 3226 - A6F2..A6F7 ; DISALLOWED # BAMUM NJAEMLI..BAMUM QUESTION MARK 3227 - A6F8..A6FF ; UNASSIGNED # <reserved>..<reserved> 3228 - A700..A716 ; DISALLOWED # MODIFIER LETTER CHINESE TONE YIN PING..MODIF 3229 - A717..A71F ; PVALID # MODIFIER LETTER DOT VERTICAL BAR..MODIFIER L 3230 - A720..A722 ; DISALLOWED # MODIFIER LETTER STRESS AND HIGH TONE..LATIN 3231 - A723 ; PVALID # LATIN SMALL LETTER EGYPTOLOGICAL ALEF 3232 - A724 ; DISALLOWED # LATIN CAPITAL LETTER 
EGYPTOLOGICAL AIN 3233 - A725 ; PVALID # LATIN SMALL LETTER EGYPTOLOGICAL AIN 3234 - A726 ; DISALLOWED # LATIN CAPITAL LETTER HENG 3235 - A727 ; PVALID # LATIN SMALL LETTER HENG 3236 - A728 ; DISALLOWED # LATIN CAPITAL LETTER TZ 3237 - A729 ; PVALID # LATIN SMALL LETTER TZ 3238 - A72A ; DISALLOWED # LATIN CAPITAL LETTER TRESILLO 3239 - A72B ; PVALID # LATIN SMALL LETTER TRESILLO 3240 - A72C ; DISALLOWED # LATIN CAPITAL LETTER CUATRILLO 3241 - A72D ; PVALID # LATIN SMALL LETTER CUATRILLO 3242 - A72E ; DISALLOWED # LATIN CAPITAL LETTER CUATRILLO WITH COMMA 3243 - A72F..A731 ; PVALID # LATIN SMALL LETTER CUATRILLO WITH COMMA..LAT 3244 - A732 ; DISALLOWED # LATIN CAPITAL LETTER AA 3245 - A733 ; PVALID # LATIN SMALL LETTER AA 3246 - A734 ; DISALLOWED # LATIN CAPITAL LETTER AO 3247 - 3248 - 3249 - 3250 - Faltstrom Standards Track [Page 58] 3251 - 3252 - RFC 5892 IDNA Code Points August 2010 3253 - 3254 - 3255 - A735 ; PVALID # LATIN SMALL LETTER AO 3256 - A736 ; DISALLOWED # LATIN CAPITAL LETTER AU 3257 - A737 ; PVALID # LATIN SMALL LETTER AU 3258 - A738 ; DISALLOWED # LATIN CAPITAL LETTER AV 3259 - A739 ; PVALID # LATIN SMALL LETTER AV 3260 - A73A ; DISALLOWED # LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR 3261 - A73B ; PVALID # LATIN SMALL LETTER AV WITH HORIZONTAL BAR 3262 - A73C ; DISALLOWED # LATIN CAPITAL LETTER AY 3263 - A73D ; PVALID # LATIN SMALL LETTER AY 3264 - A73E ; DISALLOWED # LATIN CAPITAL LETTER REVERSED C WITH DOT 3265 - A73F ; PVALID # LATIN SMALL LETTER REVERSED C WITH DOT 3266 - A740 ; DISALLOWED # LATIN CAPITAL LETTER K WITH STROKE 3267 - A741 ; PVALID # LATIN SMALL LETTER K WITH STROKE 3268 - A742 ; DISALLOWED # LATIN CAPITAL LETTER K WITH DIAGONAL STROKE 3269 - A743 ; PVALID # LATIN SMALL LETTER K WITH DIAGONAL STROKE 3270 - A744 ; DISALLOWED # LATIN CAPITAL LETTER K WITH STROKE AND DIAGO 3271 - A745 ; PVALID # LATIN SMALL LETTER K WITH STROKE AND DIAGONA 3272 - A746 ; DISALLOWED # LATIN CAPITAL LETTER BROKEN L 3273 - A747 ; PVALID # LATIN SMALL 
LETTER BROKEN L 3274 - A748 ; DISALLOWED # LATIN CAPITAL LETTER L WITH HIGH STROKE 3275 - A749 ; PVALID # LATIN SMALL LETTER L WITH HIGH STROKE 3276 - A74A ; DISALLOWED # LATIN CAPITAL LETTER O WITH LONG STROKE OVER 3277 - A74B ; PVALID # LATIN SMALL LETTER O WITH LONG STROKE OVERLA 3278 - A74C ; DISALLOWED # LATIN CAPITAL LETTER O WITH LOOP 3279 - A74D ; PVALID # LATIN SMALL LETTER O WITH LOOP 3280 - A74E ; DISALLOWED # LATIN CAPITAL LETTER OO 3281 - A74F ; PVALID # LATIN SMALL LETTER OO 3282 - A750 ; DISALLOWED # LATIN CAPITAL LETTER P WITH STROKE THROUGH D 3283 - A751 ; PVALID # LATIN SMALL LETTER P WITH STROKE THROUGH DES 3284 - A752 ; DISALLOWED # LATIN CAPITAL LETTER P WITH FLOURISH 3285 - A753 ; PVALID # LATIN SMALL LETTER P WITH FLOURISH 3286 - A754 ; DISALLOWED # LATIN CAPITAL LETTER P WITH SQUIRREL TAIL 3287 - A755 ; PVALID # LATIN SMALL LETTER P WITH SQUIRREL TAIL 3288 - A756 ; DISALLOWED # LATIN CAPITAL LETTER Q WITH STROKE THROUGH D 3289 - A757 ; PVALID # LATIN SMALL LETTER Q WITH STROKE THROUGH DES 3290 - A758 ; DISALLOWED # LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE 3291 - A759 ; PVALID # LATIN SMALL LETTER Q WITH DIAGONAL STROKE 3292 - A75A ; DISALLOWED # LATIN CAPITAL LETTER R ROTUNDA 3293 - A75B ; PVALID # LATIN SMALL LETTER R ROTUNDA 3294 - A75C ; DISALLOWED # LATIN CAPITAL LETTER RUM ROTUNDA 3295 - A75D ; PVALID # LATIN SMALL LETTER RUM ROTUNDA 3296 - A75E ; DISALLOWED # LATIN CAPITAL LETTER V WITH DIAGONAL STROKE 3297 - A75F ; PVALID # LATIN SMALL LETTER V WITH DIAGONAL STROKE 3298 - A760 ; DISALLOWED # LATIN CAPITAL LETTER VY 3299 - A761 ; PVALID # LATIN SMALL LETTER VY 3300 - A762 ; DISALLOWED # LATIN CAPITAL LETTER VISIGOTHIC Z 3301 - A763 ; PVALID # LATIN SMALL LETTER VISIGOTHIC Z 3302 - A764 ; DISALLOWED # LATIN CAPITAL LETTER THORN WITH STROKE 3303 - 3304 - 3305 - 3306 - Faltstrom Standards Track [Page 59] 3307 - 3308 - RFC 5892 IDNA Code Points August 2010 3309 - 3310 - 3311 - A765 ; PVALID # LATIN SMALL LETTER THORN WITH STROKE 3312 - 
A766 ; DISALLOWED # LATIN CAPITAL LETTER THORN WITH STROKE THROU 3313 - A767 ; PVALID # LATIN SMALL LETTER THORN WITH STROKE THROUGH 3314 - A768 ; DISALLOWED # LATIN CAPITAL LETTER VEND 3315 - A769 ; PVALID # LATIN SMALL LETTER VEND 3316 - A76A ; DISALLOWED # LATIN CAPITAL LETTER ET 3317 - A76B ; PVALID # LATIN SMALL LETTER ET 3318 - A76C ; DISALLOWED # LATIN CAPITAL LETTER IS 3319 - A76D ; PVALID # LATIN SMALL LETTER IS 3320 - A76E ; DISALLOWED # LATIN CAPITAL LETTER CON 3321 - A76F ; PVALID # LATIN SMALL LETTER CON 3322 - A770 ; DISALLOWED # MODIFIER LETTER US 3323 - A771..A778 ; PVALID # LATIN SMALL LETTER DUM..LATIN SMALL LETTER U 3324 - A779 ; DISALLOWED # LATIN CAPITAL LETTER INSULAR D 3325 - A77A ; PVALID # LATIN SMALL LETTER INSULAR D 3326 - A77B ; DISALLOWED # LATIN CAPITAL LETTER INSULAR F 3327 - A77C ; PVALID # LATIN SMALL LETTER INSULAR F 3328 - A77D..A77E ; DISALLOWED # LATIN CAPITAL LETTER INSULAR G..LATIN CAPITA 3329 - A77F ; PVALID # LATIN SMALL LETTER TURNED INSULAR G 3330 - A780 ; DISALLOWED # LATIN CAPITAL LETTER TURNED L 3331 - A781 ; PVALID # LATIN SMALL LETTER TURNED L 3332 - A782 ; DISALLOWED # LATIN CAPITAL LETTER INSULAR R 3333 - A783 ; PVALID # LATIN SMALL LETTER INSULAR R 3334 - A784 ; DISALLOWED # LATIN CAPITAL LETTER INSULAR S 3335 - A785 ; PVALID # LATIN SMALL LETTER INSULAR S 3336 - A786 ; DISALLOWED # LATIN CAPITAL LETTER INSULAR T 3337 - A787..A788 ; PVALID # LATIN SMALL LETTER INSULAR T..MODIFIER LETTE 3338 - A789..A78B ; DISALLOWED # MODIFIER LETTER COLON..LATIN CAPITAL LETTER 3339 - A78C ; PVALID # LATIN SMALL LETTER SALTILLO 3340 - A78D..A7FA ; UNASSIGNED # <reserved>..<reserved> 3341 - A7FB..A827 ; PVALID # LATIN EPIGRAPHIC LETTER REVERSED F..SYLOTI N 3342 - A828..A82B ; DISALLOWED # SYLOTI NAGRI POETRY MARK-1..SYLOTI NAGRI POE 3343 - A82C..A82F ; UNASSIGNED # <reserved>..<reserved> 3344 - A830..A839 ; DISALLOWED # NORTH INDIC FRACTION ONE QUARTER..NORTH INDI 3345 - A83A..A83F ; UNASSIGNED # <reserved>..<reserved> 3346 - 
A840..A873 ; PVALID # PHAGS-PA LETTER KA..PHAGS-PA LETTER CANDRABI 3347 - A874..A877 ; DISALLOWED # PHAGS-PA SINGLE HEAD MARK..PHAGS-PA MARK DOU 3348 - A878..A87F ; UNASSIGNED # <reserved>..<reserved> 3349 - A880..A8C4 ; PVALID # SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VI 3350 - A8C5..A8CD ; UNASSIGNED # <reserved>..<reserved> 3351 - A8CE..A8CF ; DISALLOWED # SAURASHTRA DANDA..SAURASHTRA DOUBLE DANDA 3352 - A8D0..A8D9 ; PVALID # SAURASHTRA DIGIT ZERO..SAURASHTRA DIGIT NINE 3353 - A8DA..A8DF ; UNASSIGNED # <reserved>..<reserved> 3354 - A8E0..A8F7 ; PVALID # COMBINING DEVANAGARI DIGIT ZERO..DEVANAGARI 3355 - A8F8..A8FA ; DISALLOWED # DEVANAGARI SIGN PUSHPIKA..DEVANAGARI CARET 3356 - A8FB ; PVALID # DEVANAGARI HEADSTROKE 3357 - A8FC..A8FF ; UNASSIGNED # <reserved>..<reserved> 3358 - A900..A92D ; PVALID # KAYAH LI DIGIT ZERO..KAYAH LI TONE CALYA PLO 3359 - 3360 - 3361 - 3362 - Faltstrom Standards Track [Page 60] 3363 - 3364 - RFC 5892 IDNA Code Points August 2010 3365 - 3366 - 3367 - A92E..A92F ; DISALLOWED # KAYAH LI SIGN CWI..KAYAH LI SIGN SHYA 3368 - A930..A953 ; PVALID # REJANG LETTER KA..REJANG VIRAMA 3369 - A954..A95E ; UNASSIGNED # <reserved>..<reserved> 3370 - A95F..A97C ; DISALLOWED # REJANG SECTION MARK..HANGUL CHOSEONG SSANGYE 3371 - A97D..A97F ; UNASSIGNED # <reserved>..<reserved> 3372 - A980..A9C0 ; PVALID # JAVANESE SIGN PANYANGGA..JAVANESE PANGKON 3373 - A9C1..A9CD ; DISALLOWED # JAVANESE LEFT RERENGGAN..JAVANESE TURNED PAD 3374 - A9CE ; UNASSIGNED # <reserved> 3375 - A9CF..A9D9 ; PVALID # JAVANESE PANGRANGKEP..JAVANESE DIGIT NINE 3376 - A9DA..A9DD ; UNASSIGNED # <reserved>..<reserved> 3377 - A9DE..A9DF ; DISALLOWED # JAVANESE PADA TIRTA TUMETES..JAVANESE PADA I 3378 - A9E0..A9FF ; UNASSIGNED # <reserved>..<reserved> 3379 - AA00..AA36 ; PVALID # CHAM LETTER A..CHAM CONSONANT SIGN WA 3380 - AA37..AA3F ; UNASSIGNED # <reserved>..<reserved> 3381 - AA40..AA4D ; PVALID # CHAM LETTER FINAL K..CHAM CONSONANT SIGN FIN 3382 - AA4E..AA4F ; UNASSIGNED # 
<reserved>..<reserved> 3383 - AA50..AA59 ; PVALID # CHAM DIGIT ZERO..CHAM DIGIT NINE 3384 - AA5A..AA5B ; UNASSIGNED # <reserved>..<reserved> 3385 - AA5C..AA5F ; DISALLOWED # CHAM PUNCTUATION SPIRAL..CHAM PUNCTUATION TR 3386 - AA60..AA76 ; PVALID # MYANMAR LETTER KHAMTI GA..MYANMAR LOGOGRAM K 3387 - AA77..AA79 ; DISALLOWED # MYANMAR SYMBOL AITON EXCLAMATION..MYANMAR SY 3388 - AA7A..AA7B ; PVALID # MYANMAR LETTER AITON RA..MYANMAR SIGN PAO KA 3389 - AA7C..AA7F ; UNASSIGNED # <reserved>..<reserved> 3390 - AA80..AAC2 ; PVALID # TAI VIET LETTER LOW KO..TAI VIET TONE MAI SO 3391 - AAC3..AADA ; UNASSIGNED # <reserved>..<reserved> 3392 - AADB..AADD ; PVALID # TAI VIET SYMBOL KON..TAI VIET SYMBOL SAM 3393 - AADE..AADF ; DISALLOWED # TAI VIET SYMBOL HO HOI..TAI VIET SYMBOL KOI 3394 - AAE0..ABBF ; UNASSIGNED # <reserved>..<reserved> 3395 - ABC0..ABEA ; PVALID # MEETEI MAYEK LETTER KOK..MEETEI MAYEK VOWEL 3396 - ABEB ; DISALLOWED # MEETEI MAYEK CHEIKHEI 3397 - ABEC..ABED ; PVALID # MEETEI MAYEK LUM IYEK..MEETEI MAYEK APUN IYE 3398 - ABEE..ABEF ; UNASSIGNED # <reserved>..<reserved> 3399 - ABF0..ABF9 ; PVALID # MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DIGIT 3400 - ABFA..ABFF ; UNASSIGNED # <reserved>..<reserved> 3401 - AC00..D7A3 ; PVALID # <Hangul Syllable>..<Hangul Syllable> 3402 - D7A4..D7AF ; UNASSIGNED # <reserved>..<reserved> 3403 - D7B0..D7C6 ; DISALLOWED # HANGUL JUNGSEONG O-YEO..HANGUL JUNGSEONG ARA 3404 - D7C7..D7CA ; UNASSIGNED # <reserved>..<reserved> 3405 - D7CB..D7FB ; DISALLOWED # HANGUL JONGSEONG NIEUN-RIEUL..HANGUL JONGSEO 3406 - D7FC..D7FF ; UNASSIGNED # <reserved>..<reserved> 3407 - D800..FA0D ; DISALLOWED # <Non Private Use High Surrogate>..CJK COMPAT 3408 - FA0E..FA0F ; PVALID # CJK COMPATIBILITY IDEOGRAPH-FA0E..CJK COMPAT 3409 - FA10 ; DISALLOWED # CJK COMPATIBILITY IDEOGRAPH-FA10 3410 - FA11 ; PVALID # CJK COMPATIBILITY IDEOGRAPH-FA11 3411 - FA12 ; DISALLOWED # CJK COMPATIBILITY IDEOGRAPH-FA12 3412 - FA13..FA14 ; PVALID # CJK COMPATIBILITY IDEOGRAPH-FA13..CJK 
COMPAT 3413 - FA15..FA1E ; DISALLOWED # CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPAT 3414 - FA1F ; PVALID # CJK COMPATIBILITY IDEOGRAPH-FA1F 3415 - 3416 - 3417 - 3418 - Faltstrom Standards Track [Page 61] 3419 - 3420 - RFC 5892 IDNA Code Points August 2010 3421 - 3422 - 3423 - FA20 ; DISALLOWED # CJK COMPATIBILITY IDEOGRAPH-FA20 3424 - FA21 ; PVALID # CJK COMPATIBILITY IDEOGRAPH-FA21 3425 - FA22 ; DISALLOWED # CJK COMPATIBILITY IDEOGRAPH-FA22 3426 - FA23..FA24 ; PVALID # CJK COMPATIBILITY IDEOGRAPH-FA23..CJK COMPAT 3427 - FA25..FA26 ; DISALLOWED # CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPAT 3428 - FA27..FA29 ; PVALID # CJK COMPATIBILITY IDEOGRAPH-FA27..CJK COMPAT 3429 - FA2A..FA2D ; DISALLOWED # CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPAT 3430 - FA2E..FA2F ; UNASSIGNED # <reserved>..<reserved> 3431 - FA30..FA6D ; DISALLOWED # CJK COMPATIBILITY IDEOGRAPH-FA30..CJK COMPAT 3432 - FA6E..FA6F ; UNASSIGNED # <reserved>..<reserved> 3433 - FA70..FAD9 ; DISALLOWED # CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPAT 3434 - FADA..FAFF ; UNASSIGNED # <reserved>..<reserved> 3435 - FB00..FB06 ; DISALLOWED # LATIN SMALL LIGATURE FF..LATIN SMALL LIGATUR 3436 - FB07..FB12 ; UNASSIGNED # <reserved>..<reserved> 3437 - FB13..FB17 ; DISALLOWED # ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SM 3438 - FB18..FB1C ; UNASSIGNED # <reserved>..<reserved> 3439 - FB1D ; DISALLOWED # HEBREW LETTER YOD WITH HIRIQ 3440 - FB1E ; PVALID # HEBREW POINT JUDEO-SPANISH VARIKA 3441 - FB1F..FB36 ; DISALLOWED # HEBREW LIGATURE YIDDISH YOD YOD PATAH..HEBRE 3442 - FB37 ; UNASSIGNED # <reserved> 3443 - FB38..FB3C ; DISALLOWED # HEBREW LETTER TET WITH DAGESH..HEBREW LETTER 3444 - FB3D ; UNASSIGNED # <reserved> 3445 - FB3E ; DISALLOWED # HEBREW LETTER MEM WITH DAGESH 3446 - FB3F ; UNASSIGNED # <reserved> 3447 - FB40..FB41 ; DISALLOWED # HEBREW LETTER NUN WITH DAGESH..HEBREW LETTER 3448 - FB42 ; UNASSIGNED # <reserved> 3449 - FB43..FB44 ; DISALLOWED # HEBREW LETTER FINAL PE WITH DAGESH..HEBREW L 3450 - FB45 ; 
UNASSIGNED # <reserved> 3451 - FB46..FBB1 ; DISALLOWED # HEBREW LETTER TSADI WITH DAGESH..ARABIC LETT 3452 - FBB2..FBD2 ; UNASSIGNED # <reserved>..<reserved> 3453 - FBD3..FD3F ; DISALLOWED # ARABIC LETTER NG ISOLATED FORM..ORNATE RIGHT 3454 - FD40..FD4F ; UNASSIGNED # <reserved>..<reserved> 3455 - FD50..FD8F ; DISALLOWED # ARABIC LIGATURE TEH WITH JEEM WITH MEEM INIT 3456 - FD90..FD91 ; UNASSIGNED # <reserved>..<reserved> 3457 - FD92..FDC7 ; DISALLOWED # ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INI 3458 - FDC8..FDCF ; UNASSIGNED # <reserved>..<reserved> 3459 - FDD0..FDFD ; DISALLOWED # <noncharacter>..ARABIC LIGATURE BISMILLAH AR 3460 - FDFE..FDFF ; UNASSIGNED # <reserved>..<reserved> 3461 - FE00..FE19 ; DISALLOWED # VARIATION SELECTOR-1..PRESENTATION FORM FOR 3462 - FE1A..FE1F ; UNASSIGNED # <reserved>..<reserved> 3463 - FE20..FE26 ; PVALID # COMBINING LIGATURE LEFT HALF..COMBINING CONJ 3464 - FE27..FE2F ; UNASSIGNED # <reserved>..<reserved> 3465 - FE30..FE52 ; DISALLOWED # PRESENTATION FORM FOR VERTICAL TWO DOT LEADE 3466 - FE53 ; UNASSIGNED # <reserved> 3467 - FE54..FE66 ; DISALLOWED # SMALL SEMICOLON..SMALL EQUALS SIGN 3468 - FE67 ; UNASSIGNED # <reserved> 3469 - FE68..FE6B ; DISALLOWED # SMALL REVERSE SOLIDUS..SMALL COMMERCIAL AT 3470 - FE6C..FE6F ; UNASSIGNED # <reserved>..<reserved> 3471 - 3472 - 3473 - 3474 - Faltstrom Standards Track [Page 62] 3475 - 3476 - RFC 5892 IDNA Code Points August 2010 3477 - 3478 - 3479 - FE70..FE72 ; DISALLOWED # ARABIC FATHATAN ISOLATED FORM..ARABIC DAMMAT 3480 - FE73 ; PVALID # ARABIC TAIL FRAGMENT 3481 - FE74 ; DISALLOWED # ARABIC KASRATAN ISOLATED FORM 3482 - FE75 ; UNASSIGNED # <reserved> 3483 - FE76..FEFC ; DISALLOWED # ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE 3484 - FEFD..FEFE ; UNASSIGNED # <reserved>..<reserved> 3485 - FEFF ; DISALLOWED # ZERO WIDTH NO-BREAK SPACE 3486 - FF00 ; UNASSIGNED # <reserved> 3487 - FF01..FFBE ; DISALLOWED # FULLWIDTH EXCLAMATION MARK..HALFWIDTH HANGUL 3488 - FFBF..FFC1 ; UNASSIGNED # 
<reserved>..<reserved> 3489 - FFC2..FFC7 ; DISALLOWED # HALFWIDTH HANGUL LETTER A..HALFWIDTH HANGUL 3490 - FFC8..FFC9 ; UNASSIGNED # <reserved>..<reserved> 3491 - FFCA..FFCF ; DISALLOWED # HALFWIDTH HANGUL LETTER YEO..HALFWIDTH HANGU 3492 - FFD0..FFD1 ; UNASSIGNED # <reserved>..<reserved> 3493 - FFD2..FFD7 ; DISALLOWED # HALFWIDTH HANGUL LETTER YO..HALFWIDTH HANGUL 3494 - FFD8..FFD9 ; UNASSIGNED # <reserved>..<reserved> 3495 - FFDA..FFDC ; DISALLOWED # HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 3496 - FFDD..FFDF ; UNASSIGNED # <reserved>..<reserved> 3497 - FFE0..FFE6 ; DISALLOWED # FULLWIDTH CENT SIGN..FULLWIDTH WON SIGN 3498 - FFE7 ; UNASSIGNED # <reserved> 3499 - FFE8..FFEE ; DISALLOWED # HALFWIDTH FORMS LIGHT VERTICAL..HALFWIDTH WH 3500 - FFEF..FFF8 ; UNASSIGNED # <reserved>..<reserved> 3501 - FFF9..FFFF ; DISALLOWED # INTERLINEAR ANNOTATION ANCHOR..<noncharacter 3502 - 10000..1000B; PVALID # LINEAR B SYLLABLE B008 A..LINEAR B SYLLABLE 3503 - 1000C ; UNASSIGNED # <reserved> 3504 - 1000D..10026; PVALID # LINEAR B SYLLABLE B036 JO..LINEAR B SYLLABLE 3505 - 10027 ; UNASSIGNED # <reserved> 3506 - 10028..1003A; PVALID # LINEAR B SYLLABLE B060 RA..LINEAR B SYLLABLE 3507 - 1003B ; UNASSIGNED # <reserved> 3508 - 1003C..1003D; PVALID # LINEAR B SYLLABLE B017 ZA..LINEAR B SYLLABLE 3509 - 1003E ; UNASSIGNED # <reserved> 3510 - 1003F..1004D; PVALID # LINEAR B SYLLABLE B020 ZO..LINEAR B SYLLABLE 3511 - 1004E..1004F; UNASSIGNED # <reserved>..<reserved> 3512 - 10050..1005D; PVALID # LINEAR B SYMBOL B018..LINEAR B SYMBOL B089 3513 - 1005E..1007F; UNASSIGNED # <reserved>..<reserved> 3514 - 10080..100FA; PVALID # LINEAR B IDEOGRAM B100 MAN..LINEAR B IDEOGRA 3515 - 100FB..100FF; UNASSIGNED # <reserved>..<reserved> 3516 - 10100..10102; DISALLOWED # AEGEAN WORD SEPARATOR LINE..AEGEAN CHECK MAR 3517 - 10103..10106; UNASSIGNED # <reserved>..<reserved> 3518 - 10107..10133; DISALLOWED # AEGEAN NUMBER ONE..AEGEAN NUMBER NINETY THOU 3519 - 10134..10136; UNASSIGNED # 
<reserved>..<reserved> 3520 - 10137..1018A; DISALLOWED # AEGEAN WEIGHT BASE UNIT..GREEK ZERO SIGN 3521 - 1018B..1018F; UNASSIGNED # <reserved>..<reserved> 3522 - 10190..1019B; DISALLOWED # ROMAN SEXTANS SIGN..ROMAN CENTURIAL SIGN 3523 - 1019C..101CF; UNASSIGNED # <reserved>..<reserved> 3524 - 101D0..101FC; DISALLOWED # PHAISTOS DISC SIGN PEDESTRIAN..PHAISTOS DISC 3525 - 101FD ; PVALID # PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE 3526 - 101FE..1027F; UNASSIGNED # <reserved>..<reserved> 3527 - 3528 - 3529 - 3530 - Faltstrom Standards Track [Page 63] 3531 - 3532 - RFC 5892 IDNA Code Points August 2010 3533 - 3534 - 3535 - 10280..1029C; PVALID # LYCIAN LETTER A..LYCIAN LETTER X 3536 - 1029D..1029F; UNASSIGNED # <reserved>..<reserved> 3537 - 102A0..102D0; PVALID # CARIAN LETTER A..CARIAN LETTER UUU3 3538 - 102D1..102FF; UNASSIGNED # <reserved>..<reserved> 3539 - 10300..1031E; PVALID # OLD ITALIC LETTER A..OLD ITALIC LETTER UU 3540 - 1031F ; UNASSIGNED # <reserved> 3541 - 10320..10323; DISALLOWED # OLD ITALIC NUMERAL ONE..OLD ITALIC NUMERAL F 3542 - 10324..1032F; UNASSIGNED # <reserved>..<reserved> 3543 - 10330..10340; PVALID # GOTHIC LETTER AHSA..GOTHIC LETTER PAIRTHRA 3544 - 10341 ; DISALLOWED # GOTHIC LETTER NINETY 3545 - 10342..10349; PVALID # GOTHIC LETTER RAIDA..GOTHIC LETTER OTHAL 3546 - 1034A ; DISALLOWED # GOTHIC LETTER NINE HUNDRED 3547 - 1034B..1037F; UNASSIGNED # <reserved>..<reserved> 3548 - 10380..1039D; PVALID # UGARITIC LETTER ALPA..UGARITIC LETTER SSU 3549 - 1039E ; UNASSIGNED # <reserved> 3550 - 1039F ; DISALLOWED # UGARITIC WORD DIVIDER 3551 - 103A0..103C3; PVALID # OLD PERSIAN SIGN A..OLD PERSIAN SIGN HA 3552 - 103C4..103C7; UNASSIGNED # <reserved>..<reserved> 3553 - 103C8..103CF; PVALID # OLD PERSIAN SIGN AURAMAZDAA..OLD PERSIAN SIG 3554 - 103D0..103D5; DISALLOWED # OLD PERSIAN WORD DIVIDER..OLD PERSIAN NUMBER 3555 - 103D6..103FF; UNASSIGNED # <reserved>..<reserved> 3556 - 10400..10427; DISALLOWED # DESERET CAPITAL LETTER LONG I..DESERET CAPIT 3557 
- 10428..1049D; PVALID # DESERET SMALL LETTER LONG I..OSMANYA LETTER 3558 - 1049E..1049F; UNASSIGNED # <reserved>..<reserved> 3559 - 104A0..104A9; PVALID # OSMANYA DIGIT ZERO..OSMANYA DIGIT NINE 3560 - 104AA..107FF; UNASSIGNED # <reserved>..<reserved> 3561 - 10800..10805; PVALID # CYPRIOT SYLLABLE A..CYPRIOT SYLLABLE JA 3562 - 10806..10807; UNASSIGNED # <reserved>..<reserved> 3563 - 10808 ; PVALID # CYPRIOT SYLLABLE JO 3564 - 10809 ; UNASSIGNED # <reserved> 3565 - 1080A..10835; PVALID # CYPRIOT SYLLABLE KA..CYPRIOT SYLLABLE WO 3566 - 10836 ; UNASSIGNED # <reserved> 3567 - 10837..10838; PVALID # CYPRIOT SYLLABLE XA..CYPRIOT SYLLABLE XE 3568 - 10839..1083B; UNASSIGNED # <reserved>..<reserved> 3569 - 1083C ; PVALID # CYPRIOT SYLLABLE ZA 3570 - 1083D..1083E; UNASSIGNED # <reserved>..<reserved> 3571 - 1083F..10855; PVALID # CYPRIOT SYLLABLE ZO..IMPERIAL ARAMAIC LETTER 3572 - 10856 ; UNASSIGNED # <reserved> 3573 - 10857..1085F; DISALLOWED # IMPERIAL ARAMAIC SECTION SIGN..IMPERIAL ARAM 3574 - 10860..108FF; UNASSIGNED # <reserved>..<reserved> 3575 - 10900..10915; PVALID # PHOENICIAN LETTER ALF..PHOENICIAN LETTER TAU 3576 - 10916..1091B; DISALLOWED # PHOENICIAN NUMBER ONE..PHOENICIAN NUMBER THR 3577 - 1091C..1091E; UNASSIGNED # <reserved>..<reserved> 3578 - 1091F ; DISALLOWED # PHOENICIAN WORD SEPARATOR 3579 - 10920..10939; PVALID # LYDIAN LETTER A..LYDIAN LETTER C 3580 - 1093A..1093E; UNASSIGNED # <reserved>..<reserved> 3581 - 1093F ; DISALLOWED # LYDIAN TRIANGULAR MARK 3582 - 10940..109FF; UNASSIGNED # <reserved>..<reserved> 3583 - 3584 - 3585 - 3586 - Faltstrom Standards Track [Page 64] 3587 - 3588 - RFC 5892 IDNA Code Points August 2010 3589 - 3590 - 3591 - 10A00..10A03; PVALID # KHAROSHTHI LETTER A..KHAROSHTHI VOWEL SIGN V 3592 - 10A04 ; UNASSIGNED # <reserved> 3593 - 10A05..10A06; PVALID # KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SI 3594 - 10A07..10A0B; UNASSIGNED # <reserved>..<reserved> 3595 - 10A0C..10A13; PVALID # KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI LET 3596 
- 10A14 ; UNASSIGNED # <reserved> 3597 - 10A15..10A17; PVALID # KHAROSHTHI LETTER CA..KHAROSHTHI LETTER JA 3598 - 10A18 ; UNASSIGNED # <reserved> 3599 - 10A19..10A33; PVALID # KHAROSHTHI LETTER NYA..KHAROSHTHI LETTER TTT 3600 - 10A34..10A37; UNASSIGNED # <reserved>..<reserved> 3601 - 10A38..10A3A; PVALID # KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN D 3602 - 10A3B..10A3E; UNASSIGNED # <reserved>..<reserved> 3603 - 10A3F ; PVALID # KHAROSHTHI VIRAMA 3604 - 10A40..10A47; DISALLOWED # KHAROSHTHI DIGIT ONE..KHAROSHTHI NUMBER ONE 3605 - 10A48..10A4F; UNASSIGNED # <reserved>..<reserved> 3606 - 10A50..10A58; DISALLOWED # KHAROSHTHI PUNCTUATION DOT..KHAROSHTHI PUNCT 3607 - 10A59..10A5F; UNASSIGNED # <reserved>..<reserved> 3608 - 10A60..10A7C; PVALID # OLD SOUTH ARABIAN LETTER HE..OLD SOUTH ARABI 3609 - 10A7D..10A7F; DISALLOWED # OLD SOUTH ARABIAN NUMBER ONE..OLD SOUTH ARAB 3610 - 10A80..10AFF; UNASSIGNED # <reserved>..<reserved> 3611 - 10B00..10B35; PVALID # AVESTAN LETTER A..AVESTAN LETTER HE 3612 - 10B36..10B38; UNASSIGNED # <reserved>..<reserved> 3613 - 10B39..10B3F; DISALLOWED # AVESTAN ABBREVIATION MARK..LARGE ONE RING OV 3614 - 10B40..10B55; PVALID # INSCRIPTIONAL PARTHIAN LETTER ALEPH..INSCRIP 3615 - 10B56..10B57; UNASSIGNED # <reserved>..<reserved> 3616 - 10B58..10B5F; DISALLOWED # INSCRIPTIONAL PARTHIAN NUMBER ONE..INSCRIPTI 3617 - 10B60..10B72; PVALID # INSCRIPTIONAL PAHLAVI LETTER ALEPH..INSCRIPT 3618 - 10B73..10B77; UNASSIGNED # <reserved>..<reserved> 3619 - 10B78..10B7F; DISALLOWED # INSCRIPTIONAL PAHLAVI NUMBER ONE..INSCRIPTIO 3620 - 10B80..10BFF; UNASSIGNED # <reserved>..<reserved> 3621 - 10C00..10C48; PVALID # OLD TURKIC LETTER ORKHON A..OLD TURKIC LETTE 3622 - 10C49..10E5F; UNASSIGNED # <reserved>..<reserved> 3623 - 10E60..10E7E; DISALLOWED # RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS 3624 - 10E7F..1107F; UNASSIGNED # <reserved>..<reserved> 3625 - 11080..110BA; PVALID # KAITHI SIGN CANDRABINDU..KAITHI SIGN NUKTA 3626 - 110BB..110C1; DISALLOWED # KAITHI 
ABBREVIATION SIGN..KAITHI DOUBLE DAND 3627 - 110C2..11FFF; UNASSIGNED # <reserved>..<reserved> 3628 - 12000..1236E; PVALID # CUNEIFORM SIGN A..CUNEIFORM SIGN ZUM 3629 - 1236F..123FF; UNASSIGNED # <reserved>..<reserved> 3630 - 12400..12462; DISALLOWED # CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NU 3631 - 12463..1246F; UNASSIGNED # <reserved>..<reserved> 3632 - 12470..12473; DISALLOWED # CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD 3633 - 12474..12FFF; UNASSIGNED # <reserved>..<reserved> 3634 - 13000..1342E; PVALID # EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYP 3635 - 1342F..1CFFF; UNASSIGNED # <reserved>..<reserved> 3636 - 1D000..1D0F5; DISALLOWED # BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MU 3637 - 1D0F6..1D0FF; UNASSIGNED # <reserved>..<reserved> 3638 - 1D100..1D126; DISALLOWED # MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBO 3639 - 3640 - 3641 - 3642 - Faltstrom Standards Track [Page 65] 3643 - 3644 - RFC 5892 IDNA Code Points August 2010 3645 - 3646 - 3647 - 1D127..1D128; UNASSIGNED # <reserved>..<reserved> 3648 - 1D129..1D1DD; DISALLOWED # MUSICAL SYMBOL MULTIPLE MEASURE REST..MUSICA 3649 - 1D1DE..1D1FF; UNASSIGNED # <reserved>..<reserved> 3650 - 1D200..1D245; DISALLOWED # GREEK VOCAL NOTATION SYMBOL-1..GREEK MUSICAL 3651 - 1D246..1D2FF; UNASSIGNED # <reserved>..<reserved> 3652 - 1D300..1D356; DISALLOWED # MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING 3653 - 1D357..1D35F; UNASSIGNED # <reserved>..<reserved> 3654 - 1D360..1D371; DISALLOWED # COUNTING ROD UNIT DIGIT ONE..COUNTING ROD TE 3655 - 1D372..1D3FF; UNASSIGNED # <reserved>..<reserved> 3656 - 1D400..1D454; DISALLOWED # MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL IT 3657 - 1D455 ; UNASSIGNED # <reserved> 3658 - 1D456..1D49C; DISALLOWED # MATHEMATICAL ITALIC SMALL I..MATHEMATICAL SC 3659 - 1D49D ; UNASSIGNED # <reserved> 3660 - 1D49E..1D49F; DISALLOWED # MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL 3661 - 1D4A0..1D4A1; UNASSIGNED # <reserved>..<reserved> 3662 - 1D4A2 ; DISALLOWED # MATHEMATICAL SCRIPT CAPITAL G 
3663 - 1D4A3..1D4A4; UNASSIGNED # <reserved>..<reserved> 3664 - 1D4A5..1D4A6; DISALLOWED # MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL 3665 - 1D4A7..1D4A8; UNASSIGNED # <reserved>..<reserved> 3666 - 1D4A9..1D4AC; DISALLOWED # MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL 3667 - 1D4AD ; UNASSIGNED # <reserved> 3668 - 1D4AE..1D4B9; DISALLOWED # MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL 3669 - 1D4BA ; UNASSIGNED # <reserved> 3670 - 1D4BB ; DISALLOWED # MATHEMATICAL SCRIPT SMALL F 3671 - 1D4BC ; UNASSIGNED # <reserved> 3672 - 1D4BD..1D4C3; DISALLOWED # MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SC 3673 - 1D4C4 ; UNASSIGNED # <reserved> 3674 - 1D4C5..1D505; DISALLOWED # MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL FR 3675 - 1D506 ; UNASSIGNED # <reserved> 3676 - 1D507..1D50A; DISALLOWED # MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL 3677 - 1D50B..1D50C; UNASSIGNED # <reserved>..<reserved> 3678 - 1D50D..1D514; DISALLOWED # MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL 3679 - 1D515 ; UNASSIGNED # <reserved> 3680 - 1D516..1D51C; DISALLOWED # MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL 3681 - 1D51D ; UNASSIGNED # <reserved> 3682 - 1D51E..1D539; DISALLOWED # MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL D 3683 - 1D53A ; UNASSIGNED # <reserved> 3684 - 1D53B..1D53E; DISALLOWED # MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEM 3685 - 1D53F ; UNASSIGNED # <reserved> 3686 - 1D540..1D544; DISALLOWED # MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEM 3687 - 1D545 ; UNASSIGNED # <reserved> 3688 - 1D546 ; DISALLOWED # MATHEMATICAL DOUBLE-STRUCK CAPITAL O 3689 - 1D547..1D549; UNASSIGNED # <reserved>..<reserved> 3690 - 1D54A..1D550; DISALLOWED # MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEM 3691 - 1D551 ; UNASSIGNED # <reserved> 3692 - 1D552..1D6A5; DISALLOWED # MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMAT 3693 - 1D6A6..1D6A7; UNASSIGNED # <reserved>..<reserved> 3694 - 1D6A8..1D7CB; DISALLOWED # MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICA 3695 - 3696 - 3697 - 3698 - Faltstrom Standards Track [Page 
66] 3699 - 3700 - RFC 5892 IDNA Code Points August 2010 3701 - 3702 - 3703 - 1D7CC..1D7CD; UNASSIGNED # <reserved>..<reserved> 3704 - 1D7CE..1D7FF; DISALLOWED # MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL M 3705 - 1D800..1EFFF; UNASSIGNED # <reserved>..<reserved> 3706 - 1F000..1F02B; DISALLOWED # MAHJONG TILE EAST WIND..MAHJONG TILE BACK 3707 - 1F02C..1F02F; UNASSIGNED # <reserved>..<reserved> 3708 - 1F030..1F093; DISALLOWED # DOMINO TILE HORIZONTAL BACK..DOMINO TILE VER 3709 - 1F094..1F0FF; UNASSIGNED # <reserved>..<reserved> 3710 - 1F100..1F10A; DISALLOWED # DIGIT ZERO FULL STOP..DIGIT NINE COMMA 3711 - 1F10B..1F10F; UNASSIGNED # <reserved>..<reserved> 3712 - 1F110..1F12E; DISALLOWED # PARENTHESIZED LATIN CAPITAL LETTER A..CIRCLE 3713 - 1F12F..1F130; UNASSIGNED # <reserved>..<reserved> 3714 - 1F131 ; DISALLOWED # SQUARED LATIN CAPITAL LETTER B 3715 - 1F132..1F13C; UNASSIGNED # <reserved>..<reserved> 3716 - 1F13D ; DISALLOWED # SQUARED LATIN CAPITAL LETTER N 3717 - 1F13E ; UNASSIGNED # <reserved> 3718 - 1F13F ; DISALLOWED # SQUARED LATIN CAPITAL LETTER P 3719 - 1F140..1F141; UNASSIGNED # <reserved>..<reserved> 3720 - 1F142 ; DISALLOWED # SQUARED LATIN CAPITAL LETTER S 3721 - 1F143..1F145; UNASSIGNED # <reserved>..<reserved> 3722 - 1F146 ; DISALLOWED # SQUARED LATIN CAPITAL LETTER W 3723 - 1F147..1F149; UNASSIGNED # <reserved>..<reserved> 3724 - 1F14A..1F14E; DISALLOWED # SQUARED HV..SQUARED PPV 3725 - 1F14F..1F156; UNASSIGNED # <reserved>..<reserved> 3726 - 1F157 ; DISALLOWED # NEGATIVE CIRCLED LATIN CAPITAL LETTER H 3727 - 1F158..1F15E; UNASSIGNED # <reserved>..<reserved> 3728 - 1F15F ; DISALLOWED # NEGATIVE CIRCLED LATIN CAPITAL LETTER P 3729 - 1F160..1F178; UNASSIGNED # <reserved>..<reserved> 3730 - 1F179 ; DISALLOWED # NEGATIVE SQUARED LATIN CAPITAL LETTER J 3731 - 1F17A ; UNASSIGNED # <reserved> 3732 - 1F17B..1F17C; DISALLOWED # NEGATIVE SQUARED LATIN CAPITAL LETTER L..NEG 3733 - 1F17D..1F17E; UNASSIGNED # <reserved>..<reserved> 3734 - 1F17F ; DISALLOWED # 
NEGATIVE SQUARED LATIN CAPITAL LETTER P 3735 - 1F180..1F189; UNASSIGNED # <reserved>..<reserved> 3736 - 1F18A..1F18D; DISALLOWED # CROSSED NEGATIVE SQUARED LATIN CAPITAL LETTE 3737 - 1F18E..1F18F; UNASSIGNED # <reserved>..<reserved> 3738 - 1F190 ; DISALLOWED # SQUARE DJ 3739 - 1F191..1F1FF; UNASSIGNED # <reserved>..<reserved> 3740 - 1F200 ; DISALLOWED # SQUARE HIRAGANA HOKA 3741 - 1F201..1F20F; UNASSIGNED # <reserved>..<reserved> 3742 - 1F210..1F231; DISALLOWED # SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED 3743 - 1F232..1F23F; UNASSIGNED # <reserved>..<reserved> 3744 - 1F240..1F248; DISALLOWED # TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRA 3745 - 1F249..1FFFD; UNASSIGNED # <reserved>..<reserved> 3746 - 1FFFE..1FFFF; DISALLOWED # <noncharacter>..<noncharacter> 3747 - 20000..2A6D6; PVALID # <CJK Ideograph Extension B>..<CJK Ideograph 3748 - 2A6D7..2A6FF; UNASSIGNED # <reserved>..<reserved> 3749 - 2A700..2B734; PVALID # <CJK Ideograph Extension C>..<CJK Ideograph 3750 - 2B735..2F7FF; UNASSIGNED # <reserved>..<reserved> 3751 - 3752 - 3753 - 3754 - Faltstrom Standards Track [Page 67] 3755 - 3756 - RFC 5892 IDNA Code Points August 2010 3757 - 3758 - 3759 - 2F800..2FA1D; DISALLOWED # CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPA 3760 - 2FA1E..2FFFD; UNASSIGNED # <reserved>..<reserved> 3761 - 2FFFE..2FFFF; DISALLOWED # <noncharacter>..<noncharacter> 3762 - 30000..3FFFD; UNASSIGNED # <reserved>..<reserved> 3763 - 3FFFE..3FFFF; DISALLOWED # <noncharacter>..<noncharacter> 3764 - 40000..4FFFD; UNASSIGNED # <reserved>..<reserved> 3765 - 4FFFE..4FFFF; DISALLOWED # <noncharacter>..<noncharacter> 3766 - 50000..5FFFD; UNASSIGNED # <reserved>..<reserved> 3767 - 5FFFE..5FFFF; DISALLOWED # <noncharacter>..<noncharacter> 3768 - 60000..6FFFD; UNASSIGNED # <reserved>..<reserved> 3769 - 6FFFE..6FFFF; DISALLOWED # <noncharacter>..<noncharacter> 3770 - 70000..7FFFD; UNASSIGNED # <reserved>..<reserved> 3771 - 7FFFE..7FFFF; DISALLOWED # <noncharacter>..<noncharacter> 3772 - 80000..8FFFD; 
UNASSIGNED # <reserved>..<reserved> 3773 - 8FFFE..8FFFF; DISALLOWED # <noncharacter>..<noncharacter> 3774 - 90000..9FFFD; UNASSIGNED # <reserved>..<reserved> 3775 - 9FFFE..9FFFF; DISALLOWED # <noncharacter>..<noncharacter> 3776 - A0000..AFFFD; UNASSIGNED # <reserved>..<reserved> 3777 - AFFFE..AFFFF; DISALLOWED # <noncharacter>..<noncharacter> 3778 - B0000..BFFFD; UNASSIGNED # <reserved>..<reserved> 3779 - BFFFE..BFFFF; DISALLOWED # <noncharacter>..<noncharacter> 3780 - C0000..CFFFD; UNASSIGNED # <reserved>..<reserved> 3781 - CFFFE..CFFFF; DISALLOWED # <noncharacter>..<noncharacter> 3782 - D0000..DFFFD; UNASSIGNED # <reserved>..<reserved> 3783 - DFFFE..DFFFF; DISALLOWED # <noncharacter>..<noncharacter> 3784 - E0000 ; UNASSIGNED # <reserved> 3785 - E0001 ; DISALLOWED # LANGUAGE TAG 3786 - E0002..E001F; UNASSIGNED # <reserved>..<reserved> 3787 - E0020..E007F; DISALLOWED # TAG SPACE..CANCEL TAG 3788 - E0080..E00FF; UNASSIGNED # <reserved>..<reserved> 3789 - E0100..E01EF; DISALLOWED # VARIATION SELECTOR-17..VARIATION SELECTOR-25 3790 - E01F0..EFFFD; UNASSIGNED # <reserved>..<reserved> 3791 - EFFFE..10FFFF; DISALLOWED # <noncharacter>..<noncharacter> 3792 - 3793 - 3794 - 3795 - 3796 - 3797 - 3798 - 3799 - 3800 - 3801 - 3802 - 3803 - 3804 - 3805 - 3806 - 3807 - 3808 - 3809 - 3810 - Faltstrom Standards Track [Page 68] 3811 - 3812 - RFC 5892 IDNA Code Points August 2010 3813 - 3814 - 3815 - 8. References 3816 - 3817 - 8.1. Normative References 3818 - 3819 - [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate 3820 - Requirement Levels", BCP 14, RFC 2119, March 1997. 3821 - 3822 - [TR15] Davis, M. and M. Duerst, "Unicode Standard Annex #15, 3823 - Unicode Normalization Forms, an integral part of the 3824 - Unicode Standard", 3825 - <http://unicode.org/unicode/reports/tr15/>. 3826 - 3827 - [Unicode] The Unicode Consortium, "The Unicode Standard, Version 3828 - 5.0", 2007. Boston, MA, USA: Addison-Wesley. ISBN 3829 - 0-321-48091-0. 
This printed reference has now been 3830 - updated online to reflect additional code points. For 3831 - code points, the reference at the time this document was 3832 - published is to Unicode 5.2. 3833 - 3834 - [Unicode52] The Unicode Consortium. The Unicode Standard, Version 3835 - 5.2.0, defined by: "The Unicode Standard, Version 3836 - 5.2.0", (Mountain View, CA: The Unicode Consortium, 3837 - 2009. ISBN 978-1-936213-00-9). 3838 - <http://www.unicode.org/versions/Unicode5.2.0/>. 3839 - 3840 - 8.2. Informative References 3841 - 3842 - [BlockNames] "Blocks-5.2.0.txt", Unicode Character Database, 3843 - May 2009, 3844 - <http://unicode.org/Public/5.2.0/ucd/Blocks.txt>. 3845 - 3846 - [DerivedCoreProperties] 3847 - "DerivedCoreProperties-5.2.0.txt", Unicode Character 3848 - Database, August 2009, <http://unicode.org/Public/5.2.0/ 3849 - ucd/DerivedCoreProperties.txt>. 3850 - 3851 - [RFC3454] Hoffman, P. and M. Blanchet, "Preparation of 3852 - Internationalized Strings ("stringprep")", RFC 3454, 3853 - December 2002. 3854 - 3855 - [RFC3491] Hoffman, P. and M. Blanchet, "Nameprep: A Stringprep 3856 - Profile for Internationalized Domain Names (IDN)", 3857 - RFC 3491, March 2003. 3858 - 3859 - [RFC4690] Klensin, J., Faltstrom, P., Karp, C., and IAB, "Review 3860 - and Recommendations for Internationalized Domain Names 3861 - (IDNs)", RFC 4690, September 2006. 3862 - 3863 - 3864 - 3865 - 3866 - Faltstrom Standards Track [Page 69] 3867 - 3868 - RFC 5892 IDNA Code Points August 2010 3869 - 3870 - 3871 - [RFC5226] Narten, T. and H. Alvestrand, "Guidelines for Writing an 3872 - IANA Considerations Section in RFCs", BCP 26, RFC 5226, 3873 - May 2008. 3874 - 3875 - [RFC5890] Klensin, J., "Internationalized Domain Names for 3876 - Applications (IDNA): Definitions and Document 3877 - Framework", RFC 5890, August 2010. 3878 - 3879 - [RFC5891] Klensin, J., "Internationalized Domain Names in 3880 - Applications (IDNA): Protocol", RFC 5891, August 2010. 
3881 - 3882 - [RFC5893] Alvestrand, H., Ed. and C. Karp, "Right-to-Left Scripts 3883 - for Internationalized Domain Names for Applications 3884 - (IDNA)", RFC 5893, August 2010. 3885 - 3886 - [RFC5894] Klensin, J., "Internationalized Domain Names for 3887 - Applications (IDNA): Background, Explanation, and 3888 - Rationale", RFC 5894, August 2010. 3889 - 3890 - Author's Address 3891 - 3892 - Patrik Faltstrom (editor) 3893 - Cisco 3894 - 3895 - EMail: paf@cisco.com 3896 - 3897 - 3898 - 3899 - 3900 - 3901 - 3902 - 3903 - 3904 - 3905 - 3906 - 3907 - 3908 - 3909 - 3910 - 3911 - 3912 - 3913 - 3914 - 3915 - 3916 - 3917 - 3918 - 3919 - 3920 - 3921 - 3922 - Faltstrom Standards Track [Page 70] 3923 -
-955
ocaml-punycode/spec/rfc5893.txt
··· 1 - 2 - 3 - 4 - 5 - 6 - 7 - Internet Engineering Task Force (IETF) H. Alvestrand, Ed. 8 - Request for Comments: 5893 Google 9 - Category: Standards Track C. Karp 10 - ISSN: 2070-1721 Swedish Museum of Natural History 11 - August 2010 12 - 13 - 14 - Right-to-Left Scripts for 15 - Internationalized Domain Names for Applications (IDNA) 16 - 17 - Abstract 18 - 19 - The use of right-to-left scripts in Internationalized Domain Names 20 - (IDNs) has presented several challenges. This memo provides a new 21 - Bidi rule for Internationalized Domain Names for Applications (IDNA) 22 - labels, based on the encountered problems with some scripts and some 23 - shortcomings in the 2003 IDNA Bidi criterion. 24 - 25 - Status of This Memo 26 - 27 - This is an Internet Standards Track document. 28 - 29 - This document is a product of the Internet Engineering Task Force 30 - (IETF). It represents the consensus of the IETF community. It has 31 - received public review and has been approved for publication by the 32 - Internet Engineering Steering Group (IESG). Further information on 33 - Internet Standards is available in Section 2 of RFC 5741. 34 - 35 - Information about the current status of this document, any errata, 36 - and how to provide feedback on it may be obtained at 37 - http://www.rfc-editor.org/info/rfc5893. 38 - 39 - Copyright Notice 40 - 41 - Copyright (c) 2010 IETF Trust and the persons identified as the 42 - document authors. All rights reserved. 43 - 44 - This document is subject to BCP 78 and the IETF Trust's Legal 45 - Provisions Relating to IETF Documents 46 - (http://trustee.ietf.org/license-info) in effect on the date of 47 - publication of this document. Please review these documents 48 - carefully, as they describe your rights and restrictions with respect 49 - to this document. 
Code Components extracted from this document must 50 - include Simplified BSD License text as described in Section 4.e of 51 - the Trust Legal Provisions and are provided without warranty as 52 - described in the Simplified BSD License. 53 - 54 - 55 - 56 - 57 - 58 - Alvestrand & Karp Standards Track [Page 1] 59 - 60 - RFC 5893 IDNA Right to Left August 2010 61 - 62 - 63 - Table of Contents 64 - 65 - 1. Introduction . . . . . . . . . . . . . . . . . . . . . . . . . 2 66 - 1.1. Purpose and Applicability . . . . . . . . . . . . . . . . 2 67 - 1.2. Background and History . . . . . . . . . . . . . . . . . . 3 68 - 1.3. Structure of the Rest of This Document . . . . . . . . . . 3 69 - 1.4. Terminology . . . . . . . . . . . . . . . . . . . . . . . 4 70 - 2. The Bidi Rule . . . . . . . . . . . . . . . . . . . . . . . . 6 71 - 3. The Requirement Set for the Bidi Rule . . . . . . . . . . . . 6 72 - 4. Examples of Issues Found with RFC 3454 . . . . . . . . . . . . 9 73 - 4.1. Dhivehi . . . . . . . . . . . . . . . . . . . . . . . . . 9 74 - 4.2. Yiddish . . . . . . . . . . . . . . . . . . . . . . . . . 10 75 - 4.3. Strings with Numbers . . . . . . . . . . . . . . . . . . . 12 76 - 5. Troublesome Situations and Guidelines . . . . . . . . . . . . 12 77 - 6. Other Issues in Need of Resolution . . . . . . . . . . . . . . 13 78 - 7. Compatibility Considerations . . . . . . . . . . . . . . . . . 14 79 - 7.1. Backwards Compatibility Considerations . . . . . . . . . . 14 80 - 7.2. Forward Compatibility Considerations . . . . . . . . . . . 15 81 - 8. Security Considerations . . . . . . . . . . . . . . . . . . . 15 82 - 9. Acknowledgements . . . . . . . . . . . . . . . . . . . . . . . 16 83 - 10. References . . . . . . . . . . . . . . . . . . . . . . . . . . 16 84 - 10.1. Normative References . . . . . . . . . . . . . . . . . . . 16 85 - 10.2. Informative References . . . . . . . . . . . . . . . . . . 17 86 - 87 - 1. Introduction 88 - 89 - 1.1. 
Purpose and Applicability 90 - 91 - The purpose of this document is to establish a rule that can be 92 - applied to Internationalized Domain Name (IDN) labels in Unicode form 93 - (U-labels) containing characters from scripts that are written from 94 - right to left. It is part of the revised IDNA protocol [RFC5891]. 95 - 96 - When labels satisfy the rule, and when certain other conditions are 97 - satisfied, there is only a minimal chance of these labels being 98 - displayed in a confusing way by the Unicode bidirectional display 99 - algorithm. 100 - 101 - The other normative documents in the IDNA2008 document set establish 102 - criteria for valid labels, including listing the permitted 103 - characters. This document establishes additional validity criteria 104 - for labels in scripts normally written from right to left. 105 - 106 - This specification is not intended to place any requirements on 107 - domain names that do not contain characters from such scripts. 108 - 109 - 110 - 111 - 112 - 113 - 114 - Alvestrand & Karp Standards Track [Page 2] 115 - 116 - RFC 5893 IDNA Right to Left August 2010 117 - 118 - 119 - 1.2. Background and History 120 - 121 - The "Stringprep" specification [RFC3454], part of IDNA2003, made the 122 - following statement in its Section 6 on the Bidi algorithm: 123 - 124 - 3) If a string contains any RandALCat character, a RandALCat 125 - character MUST be the first character of the string, and a 126 - RandALCat character MUST be the last character of the string. 127 - 128 - (A RandALCat character is a character with unambiguously 129 - right-to-left directionality.) 130 - 131 - The reasoning behind this prohibition was to ensure that every 132 - component of a displayed domain name has an unambiguously preferred 133 - direction. 
However, this made certain words in languages written 134 - with right-to-left scripts invalid as IDN labels, and in at least one 135 - case (Dhivehi) meant that all the words of an entire language were 136 - forbidden as IDN labels. 137 - 138 - This is illustrated below with examples taken from the Dhivehi and 139 - Yiddish languages, as written with the Thaana and Hebrew scripts, 140 - respectively. 141 - 142 - RFC 3454 did not explicitly state the requirement to be fulfilled. 143 - Therefore, it is impossible to determine whether a simple relaxation 144 - of the rule would continue to fulfill the requirement. 145 - 146 - While this document specifies rules quite different from RFC 3454, 147 - most reasonable labels that were allowed under RFC 3454 will also be 148 - allowed under this specification (the most important example of 149 - non-permitted labels being labels that mix Arabic and European digits 150 - (AN and EN) inside an RTL label, and labels that use AN in an LTR 151 - label -- see Section 1.4 for terminology), so the operational impact 152 - of using the new rule in the updated IDNA specification is limited. 153 - 154 - 1.3. Structure of the Rest of This Document 155 - 156 - Section 2 defines a rule, the "Bidi rule", which can be used on a 157 - domain name label to check how safe it is to use in a domain name of 158 - possibly mixed directionality. The primary initial use of this rule 159 - is as part of the IDNA2008 protocol [RFC5891]. 160 - 161 - Section 3 sets out the requirements for defining the Bidi rule. 162 - 163 - Section 4 gives detailed examples that serve as justification for the 164 - new rule. 165 - 166 - 167 - 168 - 169 - 170 - Alvestrand & Karp Standards Track [Page 3] 171 - 172 - RFC 5893 IDNA Right to Left August 2010 173 - 174 - 175 - Section 5 to Section 8 describe various situations that can occur 176 - when dealing with domain names with characters of different 177 - directionality. 
178 - 179 - Only Section 1.4 and Section 2 are normative. 180 - 181 - 1.4. Terminology 182 - 183 - The terminology used to describe IDNA concepts is defined in the 184 - Definitions document [RFC5890]. 185 - 186 - The terminology used for the Bidi properties of Unicode characters is 187 - taken from the Unicode Standard [Unicode52]. 188 - 189 - The Unicode Standard specifies a Bidi property for each character. 190 - That property controls the character's behavior in the Unicode 191 - bidirectional algorithm [Unicode-UAX9]. For reference, here are the 192 - values that the Unicode Bidi property can have: 193 - 194 - o L - Left to right - most letters in LTR scripts 195 - 196 - o R - Right to left - most letters in non-Arabic RTL scripts 197 - 198 - o AL - Arabic letters - most letters in the Arabic script 199 - 200 - o EN - European Number (0-9, and Extended Arabic-Indic numbers) 201 - 202 - o ES - European Number Separator (+ and -) 203 - 204 - o ET - European Number Terminator (currency symbols, the hash sign, 205 - the percent sign and so on) 206 - 207 - o AN - Arabic Number; this encompasses the Arabic-Indic numbers, but 208 - not the Extended Arabic-Indic numbers 209 - 210 - o CS - Common Number Separator (. , / : et al) 211 - 212 - o NSM - Nonspacing Mark - most combining accents 213 - 214 - o BN - Boundary Neutral - control characters (ZWNJ, ZWJ, and others) 215 - 216 - o B - Paragraph Separator 217 - 218 - o S - Segment Separator 219 - 220 - o WS - Whitespace, including the SPACE character 221 - 222 - o ON - Other Neutrals, including @, &, parentheses, MIDDLE DOT 223 - 224 - 225 - 226 - Alvestrand & Karp Standards Track [Page 4] 227 - 228 - RFC 5893 IDNA Right to Left August 2010 229 - 230 - 231 - o LRE, LRO, RLE, RLO, PDF - these are "directional control 232 - characters" and are not used in IDNA labels. 
233 - 234 - In this memo, we use "network order" to describe the sequence of 235 - characters as transmitted on the wire or stored in a file; the terms 236 - "first", "next", "previous", "beginning", "end", "before", and 237 - "after" are used to refer to the relationship of characters and 238 - labels in network order. 239 - 240 - We use "display order" to talk about the sequence of characters as 241 - imaged on a display medium; the terms "left" and "right" are used to 242 - refer to the relationship of characters and labels in display order. 243 - 244 - Most of the time, the examples use the abbreviations for the Unicode 245 - Bidi classes to denote the directionality of the characters; the 246 - example string CS L consists of one character of class CS and one 247 - character of class L. In some examples, the convention that 248 - uppercase characters are of class R or AL, and lowercase characters 249 - are of class L is used -- thus, the example string ABC.abc would 250 - consist of three right-to-left characters and three left-to-right 251 - characters. 252 - 253 - The directionality of such examples is determined by context -- for 254 - instance, in the sentence "ABC.abc is displayed as CBA.abc", the 255 - first example string is in network order, the second example string 256 - is in display order. 257 - 258 - The term "paragraph" is used in the sense of the Unicode Bidi 259 - specification [Unicode-UAX9]. It means "a block of text that has an 260 - overall direction, either left to right or right to left", 261 - approximately; see the "Unicode Bidirectional Algorithm" 262 - [Unicode-UAX9] for details. 263 - 264 - "RTL" and "LTR" are abbreviations for "right to left" and "left to 265 - right", respectively. 266 - 267 - An RTL label is a label that contains at least one character of type 268 - R, AL, or AN. 269 - 270 - An LTR label is any label that is not an RTL label. 
271 - 272 - A "Bidi domain name" is a domain name that contains at least one RTL 273 - label. (Note: This definition includes domain names containing only 274 - dots and right-to-left characters. Providing a separate category of 275 - "RTL domain names" would not make this specification simpler, so it 276 - has not been done.) 277 - 278 - 279 - 280 - 281 - 282 - Alvestrand & Karp Standards Track [Page 5] 283 - 284 - RFC 5893 IDNA Right to Left August 2010 285 - 286 - 287 - 2. The Bidi Rule 288 - 289 - The following rule, consisting of six conditions, applies to labels 290 - in Bidi domain names. The requirements that this rule satisfies are 291 - described in Section 3. All of the conditions must be satisfied for 292 - the rule to be satisfied. 293 - 294 - 1. The first character must be a character with Bidi property L, R, 295 - or AL. If it has the R or AL property, it is an RTL label; if it 296 - has the L property, it is an LTR label. 297 - 298 - 2. In an RTL label, only characters with the Bidi properties R, AL, 299 - AN, EN, ES, CS, ET, ON, BN, or NSM are allowed. 300 - 301 - 3. In an RTL label, the end of the label must be a character with 302 - Bidi property R, AL, EN, or AN, followed by zero or more 303 - characters with Bidi property NSM. 304 - 305 - 4. In an RTL label, if an EN is present, no AN may be present, and 306 - vice versa. 307 - 308 - 5. In an LTR label, only characters with the Bidi properties L, EN, 309 - ES, CS, ET, ON, BN, or NSM are allowed. 310 - 311 - 6. In an LTR label, the end of the label must be a character with 312 - Bidi property L or EN, followed by zero or more characters with 313 - Bidi property NSM. 314 - 315 - The following guarantees can be made based on the above: 316 - 317 - o In a domain name consisting of only labels that satisfy the rule, 318 - the requirements of Section 3 are satisfied. Note that even LTR 319 - labels and pure ASCII labels have to be tested. 
320 - 321 - o In a domain name consisting of only LDH labels (as defined in the 322 - Definitions document [RFC5890]) and labels that satisfy the rule, 323 - the requirements of Section 3 are satisfied as long as a label 324 - that starts with an ASCII digit does not come after a 325 - right-to-left label. 326 - 327 - No guarantee is given for other combinations. 328 - 329 - 3. The Requirement Set for the Bidi Rule 330 - 331 - This document, unlike RFC 3454 [RFC3454], provides an explicit 332 - justification for the Bidi rule, and states a set of requirements for 333 - which it is possible to test whether or not the modified rule 334 - fulfills the requirement. 335 - 336 - 337 - 338 - Alvestrand & Karp Standards Track [Page 6] 339 - 340 - RFC 5893 IDNA Right to Left August 2010 341 - 342 - 343 - All the text in this document assumes that text containing the labels 344 - under consideration will be displayed using the Unicode bidirectional 345 - algorithm [Unicode-UAX9]. 346 - 347 - The requirements proposed are these: 348 - 349 - o Label Uniqueness: No two labels, when presented in display order 350 - in the same paragraph, should have the same sequence of characters 351 - without also having the same sequence of characters in network 352 - order, both when the paragraph has LTR direction and when the 353 - paragraph has RTL direction. (This is the criterion that is 354 - explicit in RFC 3454). (Note that a label displayed in an RTL 355 - paragraph may display the same as a different label displayed in 356 - an LTR paragraph and still satisfy this criterion.) 357 - 358 - o Character Grouping: When displaying a string of labels, using the 359 - Unicode Bidi algorithm to reorder the characters for display, the 360 - characters of each label should remain grouped between the 361 - characters delimiting the labels, both when the string is embedded 362 - in a paragraph with LTR direction and when it is embedded in a 363 - paragraph with RTL direction. 
364 - 365 - Several stronger statements were considered and rejected, because 366 - they seem to be impossible to fulfill within the constraints of the 367 - Unicode bidirectional algorithm. These include: 368 - 369 - o The appearance of a label should be unaffected by its embedding 370 - context. This proved impossible even for ASCII labels; the label 371 - "123-A" will have a different display order in an RTL context than 372 - in an LTR context. (This particular example is, however, 373 - disallowed anyway.) 374 - 375 - o The sequence of labels should be consistent with network order. 376 - This proved impossible -- a domain name consisting of the labels 377 - (in network order) L1.R2.R3.L4 will be displayed as L1.R3.R2.L4 in 378 - an LTR context. (In an RTL context, it will be displayed as 379 - L4.R3.R2.L1). 380 - 381 - o No two domain names should be displayed the same, even under 382 - differing directionality. This was shown to be unsound, since the 383 - domain name (in network order) ABC.abc will have display order 384 - CBA.abc in an LTR context and abc.CBA in an RTL context, while the 385 - domain name (network) abc.ABC will have display order abc.CBA in 386 - an LTR context and CBA.abc in an RTL context. 387 - 388 - 389 - 390 - 391 - 392 - 393 - 394 - Alvestrand & Karp Standards Track [Page 7] 395 - 396 - RFC 5893 IDNA Right to Left August 2010 397 - 398 - 399 - One possible requirement was thought to be problematic, but turned 400 - out to be satisfied by a string that obeys the proposed rules: 401 - 402 - o The Character Grouping requirement should be satisfied when 403 - directional controls (LRE, RLE, RLO, LRO, PDF) are used in the 404 - same paragraph (outside of the labels). 
Because these controls 405 - affect presentation order in non-obvious ways, by affecting the 406 - "sor" and "eor" properties of the Unicode Bidi algorithm, the 407 - conditions above require extra testing in order to figure out 408 - whether or not they influence the display of the domain name. 409 - Testing found that for the strings allowed under the rule 410 - presented in this document, directional controls do not influence 411 - the display of the domain name. 412 - 413 - This is still not stated as a requirement, since it did not seem as 414 - important as the stated requirements, but it is useful to know that 415 - Bidi domain names where the labels satisfy the rule have this 416 - property. 417 - 418 - In the following descriptions, first-level bullets are used to 419 - indicate rules or normative statements; second-level bullets are 420 - commentary. 421 - 422 - The Character Grouping requirement can be more formally stated as: 423 - 424 - o Let "Delimiterchars" be a set of characters with the Unicode Bidi 425 - properties CS, WS, ON. (These are commonly used to delimit labels 426 - -- both the FULL STOP and the space are included. They are not 427 - allowed in domain labels.) 428 - 429 - * ET, though it commonly occurs next to domain names in practice, 430 - is problematic: the context R CS L EN ET (for instance A.a1%) 431 - makes the label L EN not satisfy the character grouping 432 - requirement. 433 - 434 - * ES commonly occurs in labels as HYPHEN-MINUS, but could also be 435 - used as a delimiter (for instance, the plus sign). It is left 436 - out here. 437 - 438 - o Let "unproblematic label" be a label that either satisfies the 439 - requirements or does not contain any character with the Bidi 440 - properties R, AL, or AN and does not begin with a character with 441 - the Bidi property EN. (Informally, "it does not start with a 442 - number".) 
443 - 444 - 445 - 446 - 447 - 448 - 449 - 450 - Alvestrand & Karp Standards Track [Page 8] 451 - 452 - RFC 5893 IDNA Right to Left August 2010 453 - 454 - 455 - A label X satisfies the Character Grouping requirement when, for any 456 - Delimiter Character D1 and D2, and for any label S1 and S2 that is an 457 - unproblematic label or an empty string, the following holds true: 458 - 459 - If the string formed by concatenating S1, D1, X, D2, and S2 is 460 - reordered according to the Bidi algorithm, then all the characters of 461 - X in the reordered string are between D1 and D2, and no other 462 - characters are between D1 and D2, both if the overall paragraph 463 - direction is LTR and if the overall paragraph direction is RTL. 464 - 465 - Note that the definition is self-referential, since S1 and S2 are 466 - constrained to be "legal" by this definition. This makes testing 467 - changes to proposed rules a little complex, but does not create 468 - problems for testing whether or not a given proposed rule satisfies 469 - the criterion. 470 - 471 - The "zero-length" case represents the case where a domain name is 472 - next to something that isn't a domain name, separated by a delimiter 473 - character. 474 - 475 - Note about the position of BN: The Unicode bidirectional algorithm 476 - specifies that a BN has an effect on the adjoining characters in 477 - network order, not in display order, and are therefore treated as if 478 - removed during Bidi processing ([Unicode-UAX9], Section 3.3.2, rule 479 - X9 and Section 5.3). Therefore, the question of "what position does 480 - a BN have after reordering" is not meaningful. It has been ignored 481 - while developing the rules here. 
482 - 483 - The Label Uniqueness requirement can be formally stated as: 484 - 485 - If two non-identical labels X and Y, embedded as for the test above, 486 - displayed in paragraphs with the same directionality, are reordered 487 - by the Bidi algorithm into the same sequence of code points, the 488 - labels X and Y cannot both be legal. 489 - 490 - 4. Examples of Issues Found with RFC 3454 491 - 492 - 4.1. Dhivehi 493 - 494 - Dhivehi, the official language of the Maldives, is written with the 495 - Thaana script. This script displays some of the characteristics of 496 - the Arabic script, including its directional properties, and the 497 - indication of vowels by the diacritical marking of consonantal base 498 - characters. This marking is obligatory, and both two consecutive 499 - vowels and syllable-final consonants are indicated with unvoiced 500 - combining marks. Every Dhivehi word therefore ends with a combining 501 - mark. 502 - 503 - 504 - 505 - 506 - Alvestrand & Karp Standards Track [Page 9] 507 - 508 - RFC 5893 IDNA Right to Left August 2010 509 - 510 - 511 - The word for "computer", which is romanized as "konpeetaru", is 512 - written with the following sequence of Unicode code points: 513 - 514 - U+0786 THAANA LETTER KAAFU (AL) 515 - 516 - U+07AE THAANA OBOFILI (NSM) 517 - 518 - U+0782 THAANA LETTER NOONU (AL) 519 - 520 - U+07B0 THAANA SUKUN (NSM) 521 - 522 - U+0795 THAANA LETTER PAVIYANI (AL) 523 - 524 - U+07A9 THAANA LETTER EEBEEFILI (AL) 525 - 526 - U+0793 THAANA LETTER TAVIYANI (AL) 527 - 528 - U+07A6 THAANA ABAFILI (NSM) 529 - 530 - U+0783 THAANA LETTER RAA (AL) 531 - 532 - U+07AA THAANA UBUFILI (NSM) 533 - 534 - The directionality class of U+07AA in the Unicode database 535 - [Unicode52] is NSM (Nonspacing Mark), which is not R or AL; a 536 - conformant implementation of the IDNA2003 algorithm will say that 537 - "this is not in RandALCat" and refuse to encode the string. 538 - 539 - 4.2. 
Yiddish 540 - 541 - Yiddish is one of several languages written with the Hebrew script 542 - (others include Hebrew and Ladino). This is basically a consonantal 543 - alphabet (also termed an "abjad"), but Yiddish is written using an 544 - extended form that is fully vocalic. The vowels are indicated in 545 - several ways, one of which is by repurposing letters that are 546 - consonants in Hebrew. Other letters are used both as vowels and 547 - consonants, with combining marks, called "points", used to 548 - differentiate between them. Finally, some base characters can 549 - indicate several different vowels, which are also disambiguated by 550 - combining marks. Pointed characters can appear in word-final 551 - position and may therefore also be needed at the end of labels. This 552 - is not an invariable attribute of a Yiddish string and there is thus 553 - greater latitude here than there is with Dhivehi. 554 - 555 - The organization now known as the "YIVO Institute for Jewish 556 - Research" developed orthographic rules for modern Standard Yiddish 557 - during the 1930s on the basis of work conducted in several venues 558 - since earlier in that century. These are given in, "The Standardized 559 - 560 - 561 - 562 - Alvestrand & Karp Standards Track [Page 10] 563 - 564 - RFC 5893 IDNA Right to Left August 2010 565 - 566 - 567 - Yiddish Orthography: Rules of Yiddish Spelling" [SYO], and are taken 568 - as normatively descriptive of modern Standard Yiddish in any context 569 - where that notion is deemed relevant. They have been applied 570 - exclusively in all formal Yiddish dictionaries published since their 571 - establishment, and are similarly dominant in academic and 572 - bibliographic regards. 573 - 574 - It therefore appears appropriate for this repertoire also to be 575 - supported fully by IDNA. This presents no difficulty with characters 576 - in initial and medial positions, but pointed characters are regularly 577 - used in final position as well. 
All of the characters in the SYO 578 - repertoire appear in both marked and unmarked form with one 579 - exception: the HEBREW LETTER PE (U+05E4). The SYO only permits this 580 - with a HEBREW POINT DAGESH (U+05BC), providing the Yiddish equivalent 581 - to the Latin letter "p", or a HEBREW POINT RAFE (U+05BF), equivalent 582 - to the Latin letter "f". There is, however, a separate unpointed 583 - allograph, the HEBREW LETTER FINAL PE (U+05E3), for the latter 584 - character when it appears in final position. The constraint on the 585 - use of the SYO repertoire resulting from the proscription of 586 - combining marks at the end of RTL strings thus reduces to nothing 587 - more, or less, than the equivalent of saying that a string of Latin 588 - characters cannot end with the letter "p". It must also be noted 589 - that the HEBREW LETTER PE with the HEBREW POINT DAGESH is 590 - characteristic of almost all traditional Yiddish orthographies that 591 - predate (or remain in use in parallel to) the SYO, being the first 592 - pointed character to appear in any of them. 593 - 594 - A more general instantiation of the basic problem can be seen in the 595 - representation of the YIVO acronym. This acronym is written with the 596 - Hebrew letters YOD YOD HIRIQ VAV VAV ALEF QAMATS, where HIRIQ and 597 - QAMATS are combining points. The Unicode code points are: 598 - 599 - U+05D9 HEBREW LETTER YOD (R) 600 - 601 - U+05B4 HEBREW POINT HIRIQ (NSM) 602 - 603 - U+05D5 HEBREW LETTER VAV (R) 604 - 605 - U+05D0 HEBREW LETTER ALEF (R) 606 - 607 - U+05B8 HEBREW POINT QAMATS (NSM) 608 - 609 - The directionality class of U+05B8 HEBREW POINT QAMATS in the Unicode 610 - database is NSM, which again causes the IDNA2003 algorithm to reject 611 - the string. 
612 - 613 - 614 - 615 - 616 - 617 - 618 - Alvestrand & Karp Standards Track [Page 11] 619 - 620 - RFC 5893 IDNA Right to Left August 2010 621 - 622 - 623 - It may also be noted that all of the combined characters mentioned 624 - above exist in precomposed form at separate positions in the Unicode 625 - chart. However, by invoking Stringprep, the IDNA2003 algorithm also 626 - rejects those code points, for reasons not discussed here. 627 - 628 - 4.3. Strings with Numbers 629 - 630 - By requiring that the first or last character of a string be a member 631 - of category R or AL, the Stringprep specification [RFC3454] 632 - prohibited a string containing right-to-left characters from ending 633 - with a number. 634 - 635 - Consider the strings ALEF 5 (HEBREW LETTER ALEF + DIGIT FIVE) and 5 636 - ALEF. Displayed in an LTR context, the first one will be displayed 637 - from left to right as 5 ALEF (with the 5 being considered right to 638 - left because of the leading ALEF), while 5 ALEF will be displayed in 639 - exactly the same order (5 taking the direction from context). 640 - Clearly, only one of those should be permitted as a registered label, 641 - but barring them both seems unnecessary. 642 - 643 - 5. Troublesome Situations and Guidelines 644 - 645 - There are situations in which labels that satisfy the rule above will 646 - be displayed in a surprising fashion. The most important of these is 647 - the case where a label ending in a character with Bidi property AL, 648 - AN, or R occurs before a label beginning with a character of Bidi 649 - property EN. In that case, the number will appear to move into the 650 - label containing the right-to-left character, violating the Character 651 - Grouping requirement. 652 - 653 - If the label that occurs after the right-to-left label itself 654 - satisfies the Bidi criterion, the requirements will be satisfied in 655 - all cases (this is the reason why the criterion talks about strings 656 - containing L in some cases). 
However, the IDNABIS WG concluded that 657 - this could not be required for several reasons: 658 - 659 - o There is a large current deployment of ASCII domain names starting 660 - with digits. These cannot possibly be invalidated. 661 - 662 - o Domain names are often constructed piecemeal, for instance, by 663 - combining a string with the content of a search list. This may 664 - occur after IDNA processing, and thus in part of the code that is 665 - not IDNA-aware, making detection of the undesirable combination 666 - impossible. 667 - 668 - 669 - 670 - 671 - 672 - 673 - 674 - Alvestrand & Karp Standards Track [Page 12] 675 - 676 - RFC 5893 IDNA Right to Left August 2010 677 - 678 - 679 - o Even if a label is registered under a "safe" label, there may be a 680 - DNAME [RFC2672] with an "unsafe" label that points to the "safe" 681 - label, thus creating seemingly valid names that would not satisfy 682 - the criterion. 683 - 684 - o Wildcards create the odd situation where a label is "valid" (can 685 - be looked up successfully) without the zone owner knowing that 686 - this label exists. So an owner of a zone whose name starts with a 687 - digit and contains a wildcard has no way of controlling whether or 688 - not names with RTL labels in them are looked up in his zone. 689 - 690 - Rather than trying to suggest rules that disallow all such 691 - undesirable situations, this document merely warns about the 692 - possibility, and leaves it to application developers to take whatever 693 - measures they deem appropriate to avoid problematic situations. 694 - 695 - 6. Other Issues in Need of Resolution 696 - 697 - This document concerns itself only with the rules that are needed 698 - when dealing with domain names with characters that have differing 699 - Bidi properties, and considers characters only in terms of their Bidi 700 - properties. All other issues with scripts that are written from 701 - right to left must be considered in other contexts. 
702 - 703 - One such issue is the need to keep numbers separate. Several scripts 704 - are used with multiple sets of numbers -- most commonly they use 705 - Latin numbers and a script-specific set of numbers, but in the case 706 - of Arabic, there are two sets of "Arabic-Indic" digits involved. 707 - 708 - The algorithm in this document disallows occurrences of AN-class 709 - characters ("Arabic-Indic digits", U+0660 to U+0669) together with 710 - EN-class characters (which includes "European" digits, U+0030 to 711 - U+0039 and "extended Arabic-Indic digits", U+06F0 to U+06F9), but 712 - does not help in preventing the mixing of, for instance, Bengali 713 - digits (U+09E6 to U+09EF) and Gujarati digits (U+0AE6 to U+0AEF), 714 - both of which have Bidi class L. A registry or script community that 715 - wishes to create rules restricting the mixing of digits in a label 716 - will be able to specify these restrictions at the registry level. 717 - Some rules are also specified at the protocol level. 718 - 719 - Another set of issues concerns the proper display of IDNs with a 720 - mixture of LTR and RTL labels, or only RTL labels. 721 - 722 - It is unrealistic to expect that applications will display domain 723 - names using embedded formatting codes between their labels (for one 724 - thing, no reliable algorithms for identifying domain names in running 725 - text exist); thus, the display order will be determined by the Bidi 726 - algorithm. Thus, a sequence (in network order) of R1.R2.ltr will be 727 - 728 - 729 - 730 - Alvestrand & Karp Standards Track [Page 13] 731 - 732 - RFC 5893 IDNA Right to Left August 2010 733 - 734 - 735 - displayed in the order 2R.1R.ltr in an LTR context, which might 736 - surprise someone expecting to see labels displayed in hierarchical 737 - order. People used to working with text that mixes LTR and RTL 738 - strings might not be so surprised by this. Again, this memo does not 739 - attempt to suggest a solution to this problem. 
740 - 741 - 7. Compatibility Considerations 742 - 743 - 7.1. Backwards Compatibility Considerations 744 - 745 - As with any change to an existing standard, it is important to 746 - consider what happens with existing implementations when the change 747 - is introduced. Some troublesome cases include: 748 - 749 - o An old program used to input the newly allowed label. If the old 750 - program checks the input against RFC 3454, some labels will not be 751 - allowed, and domain names containing those labels will remain 752 - inaccessible. 753 - 754 - o An old program is asked to display the newly allowed label, and 755 - checks it against RFC 3454 before displaying. The program will 756 - perform some kind of fallback, most likely displaying the label in 757 - A-label form. 758 - 759 - o An old program tries to display the newly allowed label. If the 760 - old program has code for displaying the last character of a label 761 - that is different from the code used to display the characters in 762 - the middle of the label, the display may be inconsistent and cause 763 - confusion. 764 - 765 - One particular example of the last case is if a program chooses to 766 - examine the last character (in network order) of a string in order to 767 - determine its directionality, rather than its first. If it finds an 768 - NSM character and tries to display the string as if it was a 769 - left-to-right string, the resulting display may be interesting, but 770 - not useful. 771 - 772 - The editors believe that these cases will have a less harmful impact 773 - in practice than continuing to deny the use of words from the 774 - languages for which these strings are necessary as IDN labels. 775 - 776 - This specification does not forbid using leading European digits in 777 - ASCII-only labels, since this would conflict with a large installed 778 - base of such labels, and would increase the scope of the 779 - specification from RTL labels to all labels. 
The harm resulting from 780 - this limitation of scope is described in Section 5. Registries and 781 - private zone managers can check for this particular condition before 782 - they allow registration of any RTL label. Generally, it is best to 783 - 784 - 785 - 786 - Alvestrand & Karp Standards Track [Page 14] 787 - 788 - RFC 5893 IDNA Right to Left August 2010 789 - 790 - 791 - disallow registration of any right-to-left strings in a zone where 792 - the label at the level above begins with a digit. 793 - 794 - 7.2. Forward Compatibility Considerations 795 - 796 - This text is intentionally specified strictly in terms of the Unicode 797 - Bidi properties. The determination that the condition is sufficient 798 - to fulfill the criteria depends on the Unicode Bidi algorithm; it is 799 - unlikely that drastic changes will be made to this algorithm. 800 - 801 - However, the determination of validity for any string depends on the 802 - Unicode Bidi property values, which are not declared immutable by the 803 - Unicode Consortium. Furthermore, the behavior of the algorithm for 804 - any given character is likely to be linguistically and culturally 805 - sensitive, so while it should occur rarely, it is possible that later 806 - versions of the Unicode Standard may change the Bidi properties 807 - assigned to certain Unicode characters. 808 - 809 - This memo does not propose a solution for this problem. 810 - 811 - 8. Security Considerations 812 - 813 - The display behavior of mixed-direction text can be extremely 814 - surprising to users who are not used to it; for instance, cut and 815 - paste of a piece of text can cause the text to display differently at 816 - the destination, if the destination is in another directionality 817 - context, and adding a character in one place of a text can cause 818 - characters some distance from the point of insertion to change their 819 - display position. 
This is, however, not a phenomenon unique to the 820 - display of domain names. 821 - 822 - The new IDNA protocol, and particularly these new Bidi rules, will 823 - allow some strings to be used in IDNA contexts that are not allowed 824 - today. It is possible that differences in the interpretation of 825 - labels between implementations of IDNA2003 and IDNA2008 could pose a 826 - security risk, but it is difficult to envision any specific 827 - instantiation of this. 828 - 829 - Any rational attempt to compute, for instance, a hash over an 830 - identifier processed by IDNA would use network order for its 831 - computation, and thus be unaffected by the new rules proposed here. 832 - 833 - While it is not believed to pose a problem, if display routines had 834 - been written with specific knowledge of the RFC 3454 IDNA 835 - prohibitions, it is possible that the potential problems noted under 836 - "Backwards Compatibility Considerations" could cause new kinds of 837 - confusion. 838 - 839 - 840 - 841 - 842 - Alvestrand & Karp Standards Track [Page 15] 843 - 844 - RFC 5893 IDNA Right to Left August 2010 845 - 846 - 847 - 9. Acknowledgements 848 - 849 - While the listed editors held the pen, this document represents the 850 - joint work and conclusions of an ad hoc design team. In addition to 851 - the editors, this consisted of, in alphabetic order, Tina Dam, Patrik 852 - Faltstrom, and John Klensin. Many further specific contributions and 853 - helpful comments were received from the people listed below, and 854 - others who have contributed to the development and use of the IDNA 855 - protocols. 856 - 857 - The particular formulation of the Bidi rule in Section 2 was 858 - suggested by Matitiahu Allouche. 
859 - 860 - The team wishes, in particular, to thank Roozbeh Pournader for 861 - calling its attention to the issue with the Thaana script, Paul 862 - Hoffman for pointing out the need to be explicit about backwards 863 - compatibility considerations, Ken Whistler for suggesting the basis 864 - of the formalized "Character Grouping" requirement, Mark Davis for 865 - commentary, Erik van der Poel for careful review, comments, and 866 - verification of the rulesets, Marcos Sanz, Andrew Sullivan, and Pete 867 - Resnick for reviews, and Vint Cerf for chairing the working group and 868 - contributing massively to getting the documents finished. 869 - 870 - 10. References 871 - 872 - 10.1. Normative References 873 - 874 - [RFC5890] Klensin, J., "Internationalized Domain Names for 875 - Applications (IDNA): Definitions and Document 876 - Framework", RFC 5890, August 2010. 877 - 878 - [Unicode-UAX9] The Unicode Consortium, "Unicode Standard Annex #9: 879 - Unicode Bidirectional Algorithm", September 2009, 880 - <http://www.unicode.org/reports/tr9/>. 881 - 882 - [Unicode52] The Unicode Consortium. The Unicode Standard, Version 883 - 5.2.0, defined by: "The Unicode Standard, Version 884 - 5.2.0", (Mountain View, CA: The Unicode Consortium, 885 - 2009. ISBN 978-1-936213-00-9). 886 - <http://www.unicode.org/versions/Unicode5.2.0/>. 887 - 888 - 889 - 890 - 891 - 892 - 893 - 894 - 895 - 896 - 897 - 898 - Alvestrand & Karp Standards Track [Page 16] 899 - 900 - RFC 5893 IDNA Right to Left August 2010 901 - 902 - 903 - 10.2. Informative References 904 - 905 - [RFC2672] Crawford, M., "Non-Terminal DNS Name Redirection", 906 - RFC 2672, August 1999. 907 - 908 - [RFC3454] Hoffman, P. and M. Blanchet, "Preparation of 909 - Internationalized Strings ("stringprep")", RFC 3454, 910 - December 2002. 911 - 912 - [RFC5891] Klensin, J., "Internationalized Domain Names in 913 - Applications (IDNA): Protocol", RFC 5891, August 2010. 
914 - 915 - [SYO] "The Standardized Yiddish Orthography: Rules of 916 - Yiddish Spelling, 6th ed., New York, ISBN 917 - 0-914512-25-0", 1999. 918 - 919 - Authors' Addresses 920 - 921 - Harald Tveit Alvestrand (editor) 922 - Google 923 - Beddingen 10 924 - Trondheim, 7014 925 - Norway 926 - 927 - EMail: harald@alvestrand.no 928 - 929 - 930 - Cary Karp 931 - Swedish Museum of Natural History 932 - Frescativ. 40 933 - Stockholm, 10405 934 - Sweden 935 - 936 - Phone: +46 8 5195 4055 937 - Fax: 938 - EMail: ck@nic.museum 939 - 940 - 941 - 942 - 943 - 944 - 945 - 946 - 947 - 948 - 949 - 950 - 951 - 952 - 953 - 954 - Alvestrand & Karp Standards Track [Page 17] 955 -
-4
ocaml-punycode/test/dune
··· 1 - (test 2 - (name test_punycode) 3 - (libraries punycode punycode.idna alcotest) 4 - (modules test_punycode))
-855
ocaml-punycode/test/test_punycode.ml
··· 1 - (*--------------------------------------------------------------------------- 2 - Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 - SPDX-License-Identifier: ISC 4 - ---------------------------------------------------------------------------*) 5 - 6 - (* Comprehensive tests for Punycode (RFC 3492) implementation *) 7 - 8 - open Alcotest 9 - module Punycode = Punycode 10 - module Punycode_idna = Punycode_idna 11 - 12 - (* Helper to convert hex code points to Uchar array *) 13 - let codepoints_of_hex_list hex_list = 14 - Array.of_list (List.map Uchar.of_int hex_list) 15 - 16 - (* Helper to convert string to code points *) 17 - let codepoints_of_string s = 18 - let acc = ref [] in 19 - let i = ref 0 in 20 - while !i < String.length s do 21 - let dec = String.get_utf_8_uchar s !i in 22 - acc := Uchar.utf_decode_uchar dec :: !acc; 23 - i := !i + Uchar.utf_decode_length dec 24 - done; 25 - Array.of_list (List.rev !acc) 26 - 27 - (* Test result helper *) 28 - let check_encode_ok expected input = 29 - match Punycode.encode input with 30 - | Ok result -> check string "encode" expected result 31 - | Error e -> fail (Format.asprintf "encode failed: %a" Punycode.pp_error e) 32 - 33 - let check_decode_ok expected input = 34 - match Punycode.decode input with 35 - | Ok result -> 36 - let expected_arr = codepoints_of_hex_list expected in 37 - check int "length" (Array.length expected_arr) (Array.length result); 38 - Array.iteri 39 - (fun i u -> 40 - check int 41 - (Printf.sprintf "char %d" i) 42 - (Uchar.to_int expected_arr.(i)) 43 - (Uchar.to_int u)) 44 - result 45 - | Error e -> fail (Format.asprintf "decode failed: %a" Punycode.pp_error e) 46 - 47 - let check_utf8_roundtrip s = 48 - match Punycode.encode_utf8 s with 49 - | Error e -> 50 - fail (Format.asprintf "encode_utf8 failed: %a" Punycode.pp_error e) 51 - | Ok encoded -> ( 52 - match Punycode.decode_utf8 encoded with 53 - | Error e -> 54 - fail (Format.asprintf "decode_utf8 failed: 
%a" Punycode.pp_error e) 55 - | Ok decoded -> check string "roundtrip" s decoded) 56 - 57 - (* RFC 3492 Section 7.1 Test Vectors *) 58 - 59 - (* (A) Arabic (Egyptian) *) 60 - let arabic_codepoints = 61 - [ 62 - 0x0644; 63 - 0x064A; 64 - 0x0647; 65 - 0x0645; 66 - 0x0627; 67 - 0x0628; 68 - 0x062A; 69 - 0x0643; 70 - 0x0644; 71 - 0x0645; 72 - 0x0648; 73 - 0x0634; 74 - 0x0639; 75 - 0x0631; 76 - 0x0628; 77 - 0x064A; 78 - 0x061F; 79 - ] 80 - 81 - let arabic_punycode = "egbpdaj6bu4bxfgehfvwxn" 82 - 83 - (* (B) Chinese (simplified) *) 84 - let chinese_simplified_codepoints = 85 - [ 0x4ED6; 0x4EEC; 0x4E3A; 0x4EC0; 0x4E48; 0x4E0D; 0x8BF4; 0x4E2D; 0x6587 ] 86 - 87 - let chinese_simplified_punycode = "ihqwcrb4cv8a8dqg056pqjye" 88 - 89 - (* (C) Chinese (traditional) *) 90 - let chinese_traditional_codepoints = 91 - [ 0x4ED6; 0x5011; 0x7232; 0x4EC0; 0x9EBD; 0x4E0D; 0x8AAA; 0x4E2D; 0x6587 ] 92 - 93 - let chinese_traditional_punycode = "ihqwctvzc91f659drss3x8bo0yb" 94 - 95 - (* (D) Czech *) 96 - let czech_codepoints = 97 - [ 98 - 0x0050; 99 - 0x0072; 100 - 0x006F; 101 - 0x010D; 102 - 0x0070; 103 - 0x0072; 104 - 0x006F; 105 - 0x0073; 106 - 0x0074; 107 - 0x011B; 108 - 0x006E; 109 - 0x0065; 110 - 0x006D; 111 - 0x006C; 112 - 0x0075; 113 - 0x0076; 114 - 0x00ED; 115 - 0x010D; 116 - 0x0065; 117 - 0x0073; 118 - 0x006B; 119 - 0x0079; 120 - ] 121 - 122 - let czech_punycode = "Proprostnemluvesky-uyb24dma41a" 123 - 124 - (* (E) Hebrew *) 125 - let hebrew_codepoints = 126 - [ 127 - 0x05DC; 128 - 0x05DE; 129 - 0x05D4; 130 - 0x05D4; 131 - 0x05DD; 132 - 0x05E4; 133 - 0x05E9; 134 - 0x05D5; 135 - 0x05D8; 136 - 0x05DC; 137 - 0x05D0; 138 - 0x05DE; 139 - 0x05D3; 140 - 0x05D1; 141 - 0x05E8; 142 - 0x05D9; 143 - 0x05DD; 144 - 0x05E2; 145 - 0x05D1; 146 - 0x05E8; 147 - 0x05D9; 148 - 0x05EA; 149 - ] 150 - 151 - let hebrew_punycode = "4dbcagdahymbxekheh6e0a7fei0b" 152 - 153 - (* (F) Hindi (Devanagari) *) 154 - let hindi_codepoints = 155 - [ 156 - 0x092F; 157 - 0x0939; 158 - 0x0932; 159 - 0x094B; 160 - 0x0917; 
161 - 0x0939; 162 - 0x093F; 163 - 0x0928; 164 - 0x094D; 165 - 0x0926; 166 - 0x0940; 167 - 0x0915; 168 - 0x094D; 169 - 0x092F; 170 - 0x094B; 171 - 0x0902; 172 - 0x0928; 173 - 0x0939; 174 - 0x0940; 175 - 0x0902; 176 - 0x092C; 177 - 0x094B; 178 - 0x0932; 179 - 0x0938; 180 - 0x0915; 181 - 0x0924; 182 - 0x0947; 183 - 0x0939; 184 - 0x0948; 185 - 0x0902; 186 - ] 187 - 188 - let hindi_punycode = "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd" 189 - 190 - (* (G) Japanese (kanji and hiragana) *) 191 - let japanese_codepoints = 192 - [ 193 - 0x306A; 194 - 0x305C; 195 - 0x307F; 196 - 0x3093; 197 - 0x306A; 198 - 0x65E5; 199 - 0x672C; 200 - 0x8A9E; 201 - 0x3092; 202 - 0x8A71; 203 - 0x3057; 204 - 0x3066; 205 - 0x304F; 206 - 0x308C; 207 - 0x306A; 208 - 0x3044; 209 - 0x306E; 210 - 0x304B; 211 - ] 212 - 213 - let japanese_punycode = "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa" 214 - 215 - (* (H) Korean (Hangul syllables) *) 216 - let korean_codepoints = 217 - [ 218 - 0xC138; 219 - 0xACC4; 220 - 0xC758; 221 - 0xBAA8; 222 - 0xB4E0; 223 - 0xC0AC; 224 - 0xB78C; 225 - 0xB4E4; 226 - 0xC774; 227 - 0xD55C; 228 - 0xAD6D; 229 - 0xC5B4; 230 - 0xB97C; 231 - 0xC774; 232 - 0xD574; 233 - 0xD55C; 234 - 0xB2E4; 235 - 0xBA74; 236 - 0xC5BC; 237 - 0xB9C8; 238 - 0xB098; 239 - 0xC88B; 240 - 0xC744; 241 - 0xAE4C; 242 - ] 243 - 244 - let korean_punycode = 245 - "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c" 246 - 247 - (* (I) Russian (Cyrillic) *) 248 - let russian_codepoints = 249 - [ 250 - 0x043F; 251 - 0x043E; 252 - 0x0447; 253 - 0x0435; 254 - 0x043C; 255 - 0x0443; 256 - 0x0436; 257 - 0x0435; 258 - 0x043E; 259 - 0x043D; 260 - 0x0438; 261 - 0x043D; 262 - 0x0435; 263 - 0x0433; 264 - 0x043E; 265 - 0x0432; 266 - 0x043E; 267 - 0x0440; 268 - 0x044F; 269 - 0x0442; 270 - 0x043F; 271 - 0x043E; 272 - 0x0440; 273 - 0x0443; 274 - 0x0441; 275 - 0x0441; 276 - 0x043A; 277 - 0x0438; 278 - ] 279 - 280 - let russian_punycode = "b1abfaaepdrnnbgefbadotcwatmq2g4l" 281 - 282 - (* (J) Spanish *) 283 - let 
spanish_codepoints = 284 - [ 285 - 0x0050; 286 - 0x006F; 287 - 0x0072; 288 - 0x0071; 289 - 0x0075; 290 - 0x00E9; 291 - 0x006E; 292 - 0x006F; 293 - 0x0070; 294 - 0x0075; 295 - 0x0065; 296 - 0x0064; 297 - 0x0065; 298 - 0x006E; 299 - 0x0073; 300 - 0x0069; 301 - 0x006D; 302 - 0x0070; 303 - 0x006C; 304 - 0x0065; 305 - 0x006D; 306 - 0x0065; 307 - 0x006E; 308 - 0x0074; 309 - 0x0065; 310 - 0x0068; 311 - 0x0061; 312 - 0x0062; 313 - 0x006C; 314 - 0x0061; 315 - 0x0072; 316 - 0x0065; 317 - 0x006E; 318 - 0x0045; 319 - 0x0073; 320 - 0x0070; 321 - 0x0061; 322 - 0x00F1; 323 - 0x006F; 324 - 0x006C; 325 - ] 326 - 327 - let spanish_punycode = "PorqunopuedensimplementehablarenEspaol-fmd56a" 328 - 329 - (* (K) Vietnamese *) 330 - let vietnamese_codepoints = 331 - [ 332 - 0x0054; 333 - 0x1EA1; 334 - 0x0069; 335 - 0x0073; 336 - 0x0061; 337 - 0x006F; 338 - 0x0068; 339 - 0x1ECD; 340 - 0x006B; 341 - 0x0068; 342 - 0x00F4; 343 - 0x006E; 344 - 0x0067; 345 - 0x0074; 346 - 0x0068; 347 - 0x1EC3; 348 - 0x0063; 349 - 0x0068; 350 - 0x1EC9; 351 - 0x006E; 352 - 0x00F3; 353 - 0x0069; 354 - 0x0074; 355 - 0x0069; 356 - 0x1EBF; 357 - 0x006E; 358 - 0x0067; 359 - 0x0056; 360 - 0x0069; 361 - 0x1EC7; 362 - 0x0074; 363 - ] 364 - 365 - let vietnamese_punycode = "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g" 366 - 367 - (* (L) 3年B組金八先生 - Japanese with ASCII *) 368 - let example_l_codepoints = 369 - [ 0x0033; 0x5E74; 0x0042; 0x7D44; 0x91D1; 0x516B; 0x5148; 0x751F ] 370 - 371 - let example_l_punycode = "3B-ww4c5e180e575a65lsy2b" 372 - 373 - (* (M) 安室奈美恵-with-SUPER-MONKEYS *) 374 - let example_m_codepoints = 375 - [ 376 - 0x5B89; 377 - 0x5BA4; 378 - 0x5948; 379 - 0x7F8E; 380 - 0x6075; 381 - 0x002D; 382 - 0x0077; 383 - 0x0069; 384 - 0x0074; 385 - 0x0068; 386 - 0x002D; 387 - 0x0053; 388 - 0x0055; 389 - 0x0050; 390 - 0x0045; 391 - 0x0052; 392 - 0x002D; 393 - 0x004D; 394 - 0x004F; 395 - 0x004E; 396 - 0x004B; 397 - 0x0045; 398 - 0x0059; 399 - 0x0053; 400 - ] 401 - 402 - let example_m_punycode = 
"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n" 403 - 404 - (* (N) Hello-Another-Way-それぞれの場所 *) 405 - let example_n_codepoints = 406 - [ 407 - 0x0048; 408 - 0x0065; 409 - 0x006C; 410 - 0x006C; 411 - 0x006F; 412 - 0x002D; 413 - 0x0041; 414 - 0x006E; 415 - 0x006F; 416 - 0x0074; 417 - 0x0068; 418 - 0x0065; 419 - 0x0072; 420 - 0x002D; 421 - 0x0057; 422 - 0x0061; 423 - 0x0079; 424 - 0x002D; 425 - 0x305D; 426 - 0x308C; 427 - 0x305E; 428 - 0x308C; 429 - 0x306E; 430 - 0x5834; 431 - 0x6240; 432 - ] 433 - 434 - let example_n_punycode = "Hello-Another-Way--fc4qua05auwb3674vfr0b" 435 - 436 - (* (O) ひとつ屋根の下2 *) 437 - let example_o_codepoints = 438 - [ 0x3072; 0x3068; 0x3064; 0x5C4B; 0x6839; 0x306E; 0x4E0B; 0x0032 ] 439 - 440 - let example_o_punycode = "2-u9tlzr9756bt3uc0v" 441 - 442 - (* (P) MajiでKoiする5秒前 *) 443 - let example_p_codepoints = 444 - [ 445 - 0x004D; 446 - 0x0061; 447 - 0x006A; 448 - 0x0069; 449 - 0x3067; 450 - 0x004B; 451 - 0x006F; 452 - 0x0069; 453 - 0x3059; 454 - 0x308B; 455 - 0x0035; 456 - 0x79D2; 457 - 0x524D; 458 - ] 459 - 460 - let example_p_punycode = "MajiKoi5-783gue6qz075azm5e" 461 - 462 - (* (Q) パフィーdeルンバ *) 463 - let example_q_codepoints = 464 - [ 0x30D1; 0x30D5; 0x30A3; 0x30FC; 0x0064; 0x0065; 0x30EB; 0x30F3; 0x30D0 ] 465 - 466 - let example_q_punycode = "de-jg4avhby1noc0d" 467 - 468 - (* (R) そのスピードで *) 469 - let example_r_codepoints = 470 - [ 0x305D; 0x306E; 0x30B9; 0x30D4; 0x30FC; 0x30C9; 0x3067 ] 471 - 472 - let example_r_punycode = "d9juau41awczczp" 473 - 474 - (* (S) -> $1.00 <- (pure ASCII) *) 475 - let example_s_codepoints = 476 - [ 477 - 0x002D; 478 - 0x003E; 479 - 0x0020; 480 - 0x0024; 481 - 0x0031; 482 - 0x002E; 483 - 0x0030; 484 - 0x0030; 485 - 0x0020; 486 - 0x003C; 487 - 0x002D; 488 - ] 489 - 490 - let example_s_punycode = "-> $1.00 <--" 491 - 492 - (* Test functions *) 493 - 494 - let test_decode_arabic () = check_decode_ok arabic_codepoints arabic_punycode 495 - 496 - let test_decode_chinese_simplified () = 497 - check_decode_ok 
chinese_simplified_codepoints chinese_simplified_punycode 498 - 499 - let test_decode_chinese_traditional () = 500 - check_decode_ok chinese_traditional_codepoints chinese_traditional_punycode 501 - 502 - let test_decode_hebrew () = check_decode_ok hebrew_codepoints hebrew_punycode 503 - let test_decode_hindi () = check_decode_ok hindi_codepoints hindi_punycode 504 - 505 - let test_decode_japanese () = 506 - check_decode_ok japanese_codepoints japanese_punycode 507 - 508 - let test_decode_korean () = check_decode_ok korean_codepoints korean_punycode 509 - 510 - let test_decode_example_l () = 511 - check_decode_ok example_l_codepoints example_l_punycode 512 - 513 - let test_decode_example_m () = 514 - check_decode_ok example_m_codepoints example_m_punycode 515 - 516 - let test_decode_example_n () = 517 - check_decode_ok example_n_codepoints example_n_punycode 518 - 519 - let test_decode_example_o () = 520 - check_decode_ok example_o_codepoints example_o_punycode 521 - 522 - let test_decode_example_q () = 523 - check_decode_ok example_q_codepoints example_q_punycode 524 - 525 - let test_decode_example_r () = 526 - check_decode_ok example_r_codepoints example_r_punycode 527 - 528 - let test_decode_czech () = check_decode_ok czech_codepoints czech_punycode 529 - 530 - let test_decode_russian () = 531 - check_decode_ok russian_codepoints (String.lowercase_ascii russian_punycode) 532 - 533 - let test_decode_spanish () = check_decode_ok spanish_codepoints spanish_punycode 534 - 535 - let test_decode_vietnamese () = 536 - check_decode_ok vietnamese_codepoints vietnamese_punycode 537 - 538 - let test_decode_example_p () = 539 - check_decode_ok example_p_codepoints example_p_punycode 540 - 541 - let test_decode_example_s () = 542 - check_decode_ok example_s_codepoints example_s_punycode 543 - 544 - let test_encode_arabic () = 545 - check_encode_ok arabic_punycode (codepoints_of_hex_list arabic_codepoints) 546 - 547 - let test_encode_chinese_simplified () = 548 - 
check_encode_ok chinese_simplified_punycode 549 - (codepoints_of_hex_list chinese_simplified_codepoints) 550 - 551 - let test_encode_chinese_traditional () = 552 - check_encode_ok chinese_traditional_punycode 553 - (codepoints_of_hex_list chinese_traditional_codepoints) 554 - 555 - let test_encode_hebrew () = 556 - check_encode_ok hebrew_punycode (codepoints_of_hex_list hebrew_codepoints) 557 - 558 - let test_encode_hindi () = 559 - check_encode_ok hindi_punycode (codepoints_of_hex_list hindi_codepoints) 560 - 561 - let test_encode_japanese () = 562 - check_encode_ok japanese_punycode (codepoints_of_hex_list japanese_codepoints) 563 - 564 - let test_encode_korean () = 565 - check_encode_ok korean_punycode (codepoints_of_hex_list korean_codepoints) 566 - 567 - let test_encode_example_l () = 568 - check_encode_ok 569 - (String.lowercase_ascii example_l_punycode) 570 - (codepoints_of_hex_list example_l_codepoints) 571 - 572 - let test_encode_example_m () = 573 - check_encode_ok 574 - (String.lowercase_ascii example_m_punycode) 575 - (codepoints_of_hex_list example_m_codepoints) 576 - 577 - let test_encode_example_n () = 578 - check_encode_ok 579 - (String.lowercase_ascii example_n_punycode) 580 - (codepoints_of_hex_list example_n_codepoints) 581 - 582 - let test_encode_example_o () = 583 - check_encode_ok 584 - (String.lowercase_ascii example_o_punycode) 585 - (codepoints_of_hex_list example_o_codepoints) 586 - 587 - let test_encode_example_q () = 588 - check_encode_ok example_q_punycode 589 - (codepoints_of_hex_list example_q_codepoints) 590 - 591 - let test_encode_example_r () = 592 - check_encode_ok example_r_punycode 593 - (codepoints_of_hex_list example_r_codepoints) 594 - 595 - (* UTF-8 roundtrip tests *) 596 - let test_utf8_roundtrip_german () = check_utf8_roundtrip "münchen" 597 - let test_utf8_roundtrip_chinese () = check_utf8_roundtrip "中文" 598 - let test_utf8_roundtrip_japanese () = check_utf8_roundtrip "日本語" 599 - let test_utf8_roundtrip_arabic () = 
check_utf8_roundtrip "العربية" 600 - let test_utf8_roundtrip_russian () = check_utf8_roundtrip "русский" 601 - let test_utf8_roundtrip_greek () = check_utf8_roundtrip "ελληνικά" 602 - let test_utf8_roundtrip_korean () = check_utf8_roundtrip "한국어" 603 - let test_utf8_roundtrip_emoji () = check_utf8_roundtrip "hello👋world" 604 - 605 - (* Label encoding tests *) 606 - let test_label_encode_ascii () = 607 - match Punycode.encode_label "example" with 608 - | Ok result -> check string "ascii passthrough" "example" result 609 - | Error e -> 610 - fail (Format.asprintf "encode_label failed: %a" Punycode.pp_error e) 611 - 612 - let test_label_encode_german () = 613 - match Punycode.encode_label "münchen" with 614 - | Ok result -> check string "german label" "xn--mnchen-3ya" result 615 - | Error e -> 616 - fail (Format.asprintf "encode_label failed: %a" Punycode.pp_error e) 617 - 618 - let test_label_decode_german () = 619 - match Punycode.decode_label "xn--mnchen-3ya" with 620 - | Ok result -> check string "german decode" "münchen" result 621 - | Error e -> 622 - fail (Format.asprintf "decode_label failed: %a" Punycode.pp_error e) 623 - 624 - (* IDNA tests *) 625 - let test_idna_to_ascii_simple () = 626 - match Punycode_idna.to_ascii "münchen.example.com" with 627 - | Ok result -> 628 - check string "idna to_ascii" "xn--mnchen-3ya.example.com" result 629 - | Error e -> 630 - fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e) 631 - 632 - let test_idna_to_unicode_simple () = 633 - match Punycode_idna.to_unicode "xn--mnchen-3ya.example.com" with 634 - | Ok result -> check string "idna to_unicode" "münchen.example.com" result 635 - | Error e -> 636 - fail (Format.asprintf "to_unicode failed: %a" Punycode_idna.pp_error e) 637 - 638 - let test_idna_roundtrip () = 639 - let original = "münchen.example.com" in 640 - match Punycode_idna.to_ascii original with 641 - | Error e -> 642 - fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e) 643 - | Ok 
ascii -> ( 644 - match Punycode_idna.to_unicode ascii with 645 - | Error e -> 646 - fail 647 - (Format.asprintf "to_unicode failed: %a" Punycode_idna.pp_error e) 648 - | Ok unicode -> check string "idna roundtrip" original unicode) 649 - 650 - let test_idna_all_ascii () = 651 - match Punycode_idna.to_ascii "www.example.com" with 652 - | Ok result -> check string "all ascii passthrough" "www.example.com" result 653 - | Error e -> 654 - fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e) 655 - 656 - let test_idna_mixed_labels () = 657 - match Punycode_idna.to_ascii "日本語.example.com" with 658 - | Ok result -> 659 - (* Check that result starts with xn-- and ends with .example.com *) 660 - check bool "has ace prefix" true (Punycode.has_ace_prefix result); 661 - check bool "ends with example.com" true 662 - (String.length result > 12 663 - && String.sub result (String.length result - 12) 12 = ".example.com") 664 - | Error e -> 665 - fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e) 666 - 667 - (* Case annotation tests *) 668 - let test_case_annotation_decode () = 669 - (* RFC example: uppercase letters indicate case flags *) 670 - match Punycode.decode_with_case "MajiKoi5-783gue6qz075azm5e" with 671 - | Ok (codepoints, case_flags) -> 672 - check int "codepoints length" 673 - (List.length example_p_codepoints) 674 - (Array.length codepoints); 675 - check int "case_flags length" (Array.length codepoints) 676 - (Array.length case_flags); 677 - (* M should be uppercase *) 678 - check bool "M uppercase" true (case_flags.(0) = Punycode.Uppercase); 679 - (* a should be lowercase *) 680 - check bool "a lowercase" true (case_flags.(1) = Punycode.Lowercase) 681 - | Error e -> 682 - fail (Format.asprintf "decode_with_case failed: %a" Punycode.pp_error e) 683 - 684 - let test_case_annotation_encode () = 685 - let codepoints = codepoints_of_hex_list [ 0x0061; 0x0062; 0x0063 ] in 686 - (* "abc" *) 687 - let case_flags = 688 - [| 
Punycode.Uppercase; Punycode.Lowercase; Punycode.Uppercase |] 689 - in 690 - match Punycode.encode_with_case codepoints case_flags with 691 - | Ok result -> 692 - (* Should encode as "AbC-" (basic code points with case annotation) *) 693 - check string "case encoded" "AbC-" result 694 - | Error e -> 695 - fail (Format.asprintf "encode_with_case failed: %a" Punycode.pp_error e) 696 - 697 - (* Edge case tests *) 698 - let test_empty_input () = 699 - match Punycode.encode [||] with 700 - | Ok result -> check string "empty encode" "" result 701 - | Error _ -> fail "empty encode should succeed" 702 - 703 - let test_empty_decode () = 704 - match Punycode.decode "" with 705 - | Ok result -> check int "empty decode length" 0 (Array.length result) 706 - | Error _ -> fail "empty decode should succeed" 707 - 708 - let test_pure_ascii () = 709 - let input = codepoints_of_string "hello" in 710 - match Punycode.encode input with 711 - | Ok result -> check string "pure ascii" "hello-" result 712 - | Error e -> fail (Format.asprintf "encode failed: %a" Punycode.pp_error e) 713 - 714 - let test_invalid_digit () = 715 - match Punycode.decode "hello!" 
with 716 - | Ok _ -> fail "should fail on invalid digit" 717 - | Error (Punycode.Invalid_digit _) -> () 718 - | Error e -> fail (Format.asprintf "wrong error type: %a" Punycode.pp_error e) 719 - 720 - let test_label_too_long () = 721 - let long_label = String.make 100 'a' in 722 - match Punycode.encode_label long_label with 723 - | Ok _ -> fail "should fail on long label" 724 - | Error (Punycode.Label_too_long _) -> () 725 - | Error e -> fail (Format.asprintf "wrong error type: %a" Punycode.pp_error e) 726 - 727 - let test_empty_label () = 728 - match Punycode.encode_label "" with 729 - | Ok _ -> fail "should fail on empty label" 730 - | Error Punycode.Empty_label -> () 731 - | Error e -> fail (Format.asprintf "wrong error type: %a" Punycode.pp_error e) 732 - 733 - (* Validation tests *) 734 - let test_is_basic () = 735 - check bool "space is basic" true (Punycode.is_basic (Uchar.of_int 0x20)); 736 - check bool "A is basic" true (Punycode.is_basic (Uchar.of_int 0x41)); 737 - check bool "DEL is basic" true (Punycode.is_basic (Uchar.of_int 0x7F)); 738 - check bool "0x80 not basic" false (Punycode.is_basic (Uchar.of_int 0x80)); 739 - check bool "ü not basic" false (Punycode.is_basic (Uchar.of_int 0xFC)) 740 - 741 - let test_is_ascii_string () = 742 - check bool "ascii string" true (Punycode.is_ascii_string "hello"); 743 - check bool "non-ascii string" false (Punycode.is_ascii_string "héllo"); 744 - check bool "empty string" true (Punycode.is_ascii_string "") 745 - 746 - let test_has_ace_prefix () = 747 - check bool "has xn--" true (Punycode.has_ace_prefix "xn--mnchen-3ya"); 748 - check bool "has XN--" true (Punycode.has_ace_prefix "XN--mnchen-3ya"); 749 - check bool "no prefix" false (Punycode.has_ace_prefix "example"); 750 - check bool "too short" false (Punycode.has_ace_prefix "xn-") 751 - 752 - (* Test suites *) 753 - let decode_tests = 754 - [ 755 - ("Arabic", `Quick, test_decode_arabic); 756 - ("Chinese simplified", `Quick, test_decode_chinese_simplified); 757 - 
("Chinese traditional", `Quick, test_decode_chinese_traditional); 758 - ("Czech", `Quick, test_decode_czech); 759 - ("Hebrew", `Quick, test_decode_hebrew); 760 - ("Hindi", `Quick, test_decode_hindi); 761 - ("Japanese", `Quick, test_decode_japanese); 762 - ("Korean", `Quick, test_decode_korean); 763 - ("Russian", `Quick, test_decode_russian); 764 - ("Spanish", `Quick, test_decode_spanish); 765 - ("Vietnamese", `Quick, test_decode_vietnamese); 766 - ("Example L (mixed)", `Quick, test_decode_example_l); 767 - ("Example M (mixed)", `Quick, test_decode_example_m); 768 - ("Example N (mixed)", `Quick, test_decode_example_n); 769 - ("Example O (mixed)", `Quick, test_decode_example_o); 770 - ("Example P (mixed)", `Quick, test_decode_example_p); 771 - ("Example Q (mixed)", `Quick, test_decode_example_q); 772 - ("Example R", `Quick, test_decode_example_r); 773 - ("Example S (ASCII)", `Quick, test_decode_example_s); 774 - ] 775 - 776 - let encode_tests = 777 - [ 778 - ("Arabic", `Quick, test_encode_arabic); 779 - ("Chinese simplified", `Quick, test_encode_chinese_simplified); 780 - ("Chinese traditional", `Quick, test_encode_chinese_traditional); 781 - ("Hebrew", `Quick, test_encode_hebrew); 782 - ("Hindi", `Quick, test_encode_hindi); 783 - ("Japanese", `Quick, test_encode_japanese); 784 - ("Korean", `Quick, test_encode_korean); 785 - ("Example L (mixed)", `Quick, test_encode_example_l); 786 - ("Example M (mixed)", `Quick, test_encode_example_m); 787 - ("Example N (mixed)", `Quick, test_encode_example_n); 788 - ("Example O (mixed)", `Quick, test_encode_example_o); 789 - ("Example Q (mixed)", `Quick, test_encode_example_q); 790 - ("Example R", `Quick, test_encode_example_r); 791 - ] 792 - 793 - let utf8_tests = 794 - [ 795 - ("German roundtrip", `Quick, test_utf8_roundtrip_german); 796 - ("Chinese roundtrip", `Quick, test_utf8_roundtrip_chinese); 797 - ("Japanese roundtrip", `Quick, test_utf8_roundtrip_japanese); 798 - ("Arabic roundtrip", `Quick, test_utf8_roundtrip_arabic); 
799 - ("Russian roundtrip", `Quick, test_utf8_roundtrip_russian); 800 - ("Greek roundtrip", `Quick, test_utf8_roundtrip_greek); 801 - ("Korean roundtrip", `Quick, test_utf8_roundtrip_korean); 802 - ("Emoji roundtrip", `Quick, test_utf8_roundtrip_emoji); 803 - ] 804 - 805 - let label_tests = 806 - [ 807 - ("ASCII passthrough", `Quick, test_label_encode_ascii); 808 - ("German encode", `Quick, test_label_encode_german); 809 - ("German decode", `Quick, test_label_decode_german); 810 - ] 811 - 812 - let idna_tests = 813 - [ 814 - ("to_ascii simple", `Quick, test_idna_to_ascii_simple); 815 - ("to_unicode simple", `Quick, test_idna_to_unicode_simple); 816 - ("roundtrip", `Quick, test_idna_roundtrip); 817 - ("all ASCII", `Quick, test_idna_all_ascii); 818 - ("mixed labels", `Quick, test_idna_mixed_labels); 819 - ] 820 - 821 - let case_tests = 822 - [ 823 - ("decode with case", `Quick, test_case_annotation_decode); 824 - ("encode with case", `Quick, test_case_annotation_encode); 825 - ] 826 - 827 - let edge_case_tests = 828 - [ 829 - ("empty encode", `Quick, test_empty_input); 830 - ("empty decode", `Quick, test_empty_decode); 831 - ("pure ASCII", `Quick, test_pure_ascii); 832 - ("invalid digit", `Quick, test_invalid_digit); 833 - ("label too long", `Quick, test_label_too_long); 834 - ("empty label", `Quick, test_empty_label); 835 - ] 836 - 837 - let validation_tests = 838 - [ 839 - ("is_basic", `Quick, test_is_basic); 840 - ("is_ascii_string", `Quick, test_is_ascii_string); 841 - ("has_ace_prefix", `Quick, test_has_ace_prefix); 842 - ] 843 - 844 - let () = 845 - run "Punycode" 846 - [ 847 - ("decode RFC vectors", decode_tests); 848 - ("encode RFC vectors", encode_tests); 849 - ("UTF-8 roundtrip", utf8_tests); 850 - ("label operations", label_tests); 851 - ("IDNA operations", idna_tests); 852 - ("case annotation", case_tests); 853 - ("edge cases", edge_case_tests); 854 - ("validation", validation_tests); 855 - ]