we (web engine): Experimental web browser project to understand the limits of Claude
2
fork

Configure Feed

Select the types of activity you want to include in your feed.

Implement WHATWG Encoding: legacy single-byte encodings

Add all 28 WHATWG legacy single-byte encodings to the encoding crate (ISO-8859-8 and ISO-8859-8-I share one lookup table):
- Windows-1252 (with ISO-8859-1 and US-ASCII aliases per WHATWG spec)
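- ISO-8859-2 through ISO-8859-8, ISO-8859-8-I, ISO-8859-10, and ISO-8859-13 through ISO-8859-16 (the ISO-8859-9 and ISO-8859-11 labels map to Windows-1254 and Windows-874 respectively)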
- Windows-874, Windows-1250 through Windows-1258
- KOI8-R, KOI8-U
- macintosh, x-mac-cyrillic
- IBM866

Each encoding uses a 128-entry u16 lookup table for bytes 0x80-0xFF.
All WHATWG label aliases are registered for case-insensitive lookup.
32 new tests covering per-encoding correctness and error handling.

No external dependencies, no unsafe.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

+1221 -15
+363 -15
crates/encoding/src/lib.rs
··· 1 - //! WHATWG Encoding Standard — UTF-8 and UTF-16 codecs, pure Rust. 1 + //! WHATWG Encoding Standard — UTF-8, UTF-16, and legacy single-byte codecs, pure Rust. 2 2 3 3 pub mod error; 4 + mod single_byte; 4 5 mod utf16; 5 6 mod utf8; 6 7 ··· 17 18 Utf8, 18 19 Utf16Be, 19 20 Utf16Le, 21 + // Single-byte encodings 22 + Ibm866, 23 + Iso8859_2, 24 + Iso8859_3, 25 + Iso8859_4, 26 + Iso8859_5, 27 + Iso8859_6, 28 + Iso8859_7, 29 + Iso8859_8, 30 + Iso8859_8I, 31 + Iso8859_10, 32 + Iso8859_13, 33 + Iso8859_14, 34 + Iso8859_15, 35 + Iso8859_16, 36 + Koi8R, 37 + Koi8U, 38 + Macintosh, 39 + Windows874, 40 + Windows1250, 41 + Windows1251, 42 + Windows1252, 43 + Windows1253, 44 + Windows1254, 45 + Windows1255, 46 + Windows1256, 47 + Windows1257, 48 + Windows1258, 49 + XMacCyrillic, 20 50 } 21 51 22 52 impl Encoding { ··· 26 56 Self::Utf8 => "UTF-8", 27 57 Self::Utf16Be => "UTF-16BE", 28 58 Self::Utf16Le => "UTF-16LE", 59 + Self::Ibm866 => "IBM866", 60 + Self::Iso8859_2 => "ISO-8859-2", 61 + Self::Iso8859_3 => "ISO-8859-3", 62 + Self::Iso8859_4 => "ISO-8859-4", 63 + Self::Iso8859_5 => "ISO-8859-5", 64 + Self::Iso8859_6 => "ISO-8859-6", 65 + Self::Iso8859_7 => "ISO-8859-7", 66 + Self::Iso8859_8 => "ISO-8859-8", 67 + Self::Iso8859_8I => "ISO-8859-8-I", 68 + Self::Iso8859_10 => "ISO-8859-10", 69 + Self::Iso8859_13 => "ISO-8859-13", 70 + Self::Iso8859_14 => "ISO-8859-14", 71 + Self::Iso8859_15 => "ISO-8859-15", 72 + Self::Iso8859_16 => "ISO-8859-16", 73 + Self::Koi8R => "KOI8-R", 74 + Self::Koi8U => "KOI8-U", 75 + Self::Macintosh => "macintosh", 76 + Self::Windows874 => "windows-874", 77 + Self::Windows1250 => "windows-1250", 78 + Self::Windows1251 => "windows-1251", 79 + Self::Windows1252 => "windows-1252", 80 + Self::Windows1253 => "windows-1253", 81 + Self::Windows1254 => "windows-1254", 82 + Self::Windows1255 => "windows-1255", 83 + Self::Windows1256 => "windows-1256", 84 + Self::Windows1257 => "windows-1257", 85 + Self::Windows1258 => "windows-1258", 86 + 
Self::XMacCyrillic => "x-mac-cyrillic", 29 87 } 30 88 } 31 89 } ··· 55 113 ("unicodefeff", Encoding::Utf16Le), 56 114 ("utf-16", Encoding::Utf16Le), 57 115 ("utf-16le", Encoding::Utf16Le), 116 + // IBM866 labels 117 + ("866", Encoding::Ibm866), 118 + ("cp866", Encoding::Ibm866), 119 + ("csibm866", Encoding::Ibm866), 120 + ("ibm866", Encoding::Ibm866), 121 + // ISO-8859-2 labels 122 + ("csisolatin2", Encoding::Iso8859_2), 123 + ("iso-8859-2", Encoding::Iso8859_2), 124 + ("iso-ir-101", Encoding::Iso8859_2), 125 + ("iso8859-2", Encoding::Iso8859_2), 126 + ("iso88592", Encoding::Iso8859_2), 127 + ("iso_8859-2", Encoding::Iso8859_2), 128 + ("iso_8859-2:1987", Encoding::Iso8859_2), 129 + ("l2", Encoding::Iso8859_2), 130 + ("latin2", Encoding::Iso8859_2), 131 + // ISO-8859-3 labels 132 + ("csisolatin3", Encoding::Iso8859_3), 133 + ("iso-8859-3", Encoding::Iso8859_3), 134 + ("iso-ir-109", Encoding::Iso8859_3), 135 + ("iso8859-3", Encoding::Iso8859_3), 136 + ("iso88593", Encoding::Iso8859_3), 137 + ("iso_8859-3", Encoding::Iso8859_3), 138 + ("iso_8859-3:1988", Encoding::Iso8859_3), 139 + ("l3", Encoding::Iso8859_3), 140 + ("latin3", Encoding::Iso8859_3), 141 + // ISO-8859-4 labels 142 + ("csisolatin4", Encoding::Iso8859_4), 143 + ("iso-8859-4", Encoding::Iso8859_4), 144 + ("iso-ir-110", Encoding::Iso8859_4), 145 + ("iso8859-4", Encoding::Iso8859_4), 146 + ("iso88594", Encoding::Iso8859_4), 147 + ("iso_8859-4", Encoding::Iso8859_4), 148 + ("iso_8859-4:1988", Encoding::Iso8859_4), 149 + ("l4", Encoding::Iso8859_4), 150 + ("latin4", Encoding::Iso8859_4), 151 + // ISO-8859-5 labels 152 + ("csisolatincyrillic", Encoding::Iso8859_5), 153 + ("cyrillic", Encoding::Iso8859_5), 154 + ("iso-8859-5", Encoding::Iso8859_5), 155 + ("iso-ir-144", Encoding::Iso8859_5), 156 + ("iso8859-5", Encoding::Iso8859_5), 157 + ("iso88595", Encoding::Iso8859_5), 158 + ("iso_8859-5", Encoding::Iso8859_5), 159 + ("iso_8859-5:1988", Encoding::Iso8859_5), 160 + // ISO-8859-6 labels 161 + ("arabic", 
Encoding::Iso8859_6), 162 + ("asmo-708", Encoding::Iso8859_6), 163 + ("csiso88596e", Encoding::Iso8859_6), 164 + ("csiso88596i", Encoding::Iso8859_6), 165 + ("csisolatinarabic", Encoding::Iso8859_6), 166 + ("ecma-114", Encoding::Iso8859_6), 167 + ("iso-8859-6", Encoding::Iso8859_6), 168 + ("iso-8859-6-e", Encoding::Iso8859_6), 169 + ("iso-8859-6-i", Encoding::Iso8859_6), 170 + ("iso-ir-127", Encoding::Iso8859_6), 171 + ("iso8859-6", Encoding::Iso8859_6), 172 + ("iso88596", Encoding::Iso8859_6), 173 + ("iso_8859-6", Encoding::Iso8859_6), 174 + ("iso_8859-6:1987", Encoding::Iso8859_6), 175 + // ISO-8859-7 labels 176 + ("csisolatingreek", Encoding::Iso8859_7), 177 + ("ecma-118", Encoding::Iso8859_7), 178 + ("elot_928", Encoding::Iso8859_7), 179 + ("greek", Encoding::Iso8859_7), 180 + ("greek8", Encoding::Iso8859_7), 181 + ("iso-8859-7", Encoding::Iso8859_7), 182 + ("iso-ir-126", Encoding::Iso8859_7), 183 + ("iso8859-7", Encoding::Iso8859_7), 184 + ("iso88597", Encoding::Iso8859_7), 185 + ("iso_8859-7", Encoding::Iso8859_7), 186 + ("iso_8859-7:1987", Encoding::Iso8859_7), 187 + ("sun_eu_greek", Encoding::Iso8859_7), 188 + // ISO-8859-8 labels 189 + ("csiso88598e", Encoding::Iso8859_8), 190 + ("csisolatinhebrew", Encoding::Iso8859_8), 191 + ("hebrew", Encoding::Iso8859_8), 192 + ("iso-8859-8", Encoding::Iso8859_8), 193 + ("iso-8859-8-e", Encoding::Iso8859_8), 194 + ("iso-ir-138", Encoding::Iso8859_8), 195 + ("iso8859-8", Encoding::Iso8859_8), 196 + ("iso88598", Encoding::Iso8859_8), 197 + ("iso_8859-8", Encoding::Iso8859_8), 198 + ("iso_8859-8:1988", Encoding::Iso8859_8), 199 + ("visual", Encoding::Iso8859_8), 200 + // ISO-8859-8-I labels 201 + ("csiso88598i", Encoding::Iso8859_8I), 202 + ("iso-8859-8-i", Encoding::Iso8859_8I), 203 + ("logical", Encoding::Iso8859_8I), 204 + // ISO-8859-10 labels 205 + ("csisolatin6", Encoding::Iso8859_10), 206 + ("iso-8859-10", Encoding::Iso8859_10), 207 + ("iso-ir-157", Encoding::Iso8859_10), 208 + ("iso8859-10", Encoding::Iso8859_10), 
209 + ("iso885910", Encoding::Iso8859_10), 210 + ("l6", Encoding::Iso8859_10), 211 + ("latin6", Encoding::Iso8859_10), 212 + // ISO-8859-13 labels 213 + ("iso-8859-13", Encoding::Iso8859_13), 214 + ("iso8859-13", Encoding::Iso8859_13), 215 + ("iso885913", Encoding::Iso8859_13), 216 + // ISO-8859-14 labels 217 + ("iso-8859-14", Encoding::Iso8859_14), 218 + ("iso8859-14", Encoding::Iso8859_14), 219 + ("iso885914", Encoding::Iso8859_14), 220 + // ISO-8859-15 labels 221 + ("csisolatin9", Encoding::Iso8859_15), 222 + ("iso-8859-15", Encoding::Iso8859_15), 223 + ("iso8859-15", Encoding::Iso8859_15), 224 + ("iso885915", Encoding::Iso8859_15), 225 + ("iso_8859-15", Encoding::Iso8859_15), 226 + ("l9", Encoding::Iso8859_15), 227 + // ISO-8859-16 labels 228 + ("iso-8859-16", Encoding::Iso8859_16), 229 + // KOI8-R labels 230 + ("cskoi8r", Encoding::Koi8R), 231 + ("koi", Encoding::Koi8R), 232 + ("koi8", Encoding::Koi8R), 233 + ("koi8-r", Encoding::Koi8R), 234 + ("koi8_r", Encoding::Koi8R), 235 + // KOI8-U labels 236 + ("koi8-ru", Encoding::Koi8U), 237 + ("koi8-u", Encoding::Koi8U), 238 + // macintosh labels 239 + ("csmacintosh", Encoding::Macintosh), 240 + ("mac", Encoding::Macintosh), 241 + ("macintosh", Encoding::Macintosh), 242 + ("x-mac-roman", Encoding::Macintosh), 243 + // windows-874 labels 244 + ("dos-874", Encoding::Windows874), 245 + ("iso-8859-11", Encoding::Windows874), 246 + ("iso8859-11", Encoding::Windows874), 247 + ("iso885911", Encoding::Windows874), 248 + ("tis-620", Encoding::Windows874), 249 + ("windows-874", Encoding::Windows874), 250 + // windows-1250 labels 251 + ("cp1250", Encoding::Windows1250), 252 + ("windows-1250", Encoding::Windows1250), 253 + ("x-cp1250", Encoding::Windows1250), 254 + // windows-1251 labels 255 + ("cp1251", Encoding::Windows1251), 256 + ("windows-1251", Encoding::Windows1251), 257 + ("x-cp1251", Encoding::Windows1251), 258 + // windows-1252 labels (also serves as ISO-8859-1 and US-ASCII per WHATWG) 259 + ("ansi_x3.4-1968", 
Encoding::Windows1252), 260 + ("ascii", Encoding::Windows1252), 261 + ("cp1252", Encoding::Windows1252), 262 + ("cp819", Encoding::Windows1252), 263 + ("csisolatin1", Encoding::Windows1252), 264 + ("ibm819", Encoding::Windows1252), 265 + ("iso-8859-1", Encoding::Windows1252), 266 + ("iso-ir-100", Encoding::Windows1252), 267 + ("iso8859-1", Encoding::Windows1252), 268 + ("iso88591", Encoding::Windows1252), 269 + ("iso_8859-1", Encoding::Windows1252), 270 + ("iso_8859-1:1987", Encoding::Windows1252), 271 + ("l1", Encoding::Windows1252), 272 + ("latin1", Encoding::Windows1252), 273 + ("us-ascii", Encoding::Windows1252), 274 + ("windows-1252", Encoding::Windows1252), 275 + ("x-cp1252", Encoding::Windows1252), 276 + // windows-1253 labels 277 + ("cp1253", Encoding::Windows1253), 278 + ("windows-1253", Encoding::Windows1253), 279 + ("x-cp1253", Encoding::Windows1253), 280 + // windows-1254 labels 281 + ("cp1254", Encoding::Windows1254), 282 + ("csisolatin5", Encoding::Windows1254), 283 + ("iso-8859-9", Encoding::Windows1254), 284 + ("iso-ir-148", Encoding::Windows1254), 285 + ("iso8859-9", Encoding::Windows1254), 286 + ("iso88599", Encoding::Windows1254), 287 + ("iso_8859-9", Encoding::Windows1254), 288 + ("iso_8859-9:1989", Encoding::Windows1254), 289 + ("l5", Encoding::Windows1254), 290 + ("latin5", Encoding::Windows1254), 291 + ("windows-1254", Encoding::Windows1254), 292 + ("x-cp1254", Encoding::Windows1254), 293 + // windows-1255 labels 294 + ("cp1255", Encoding::Windows1255), 295 + ("windows-1255", Encoding::Windows1255), 296 + ("x-cp1255", Encoding::Windows1255), 297 + // windows-1256 labels 298 + ("cp1256", Encoding::Windows1256), 299 + ("windows-1256", Encoding::Windows1256), 300 + ("x-cp1256", Encoding::Windows1256), 301 + // windows-1257 labels 302 + ("cp1257", Encoding::Windows1257), 303 + ("windows-1257", Encoding::Windows1257), 304 + ("x-cp1257", Encoding::Windows1257), 305 + // windows-1258 labels 306 + ("cp1258", Encoding::Windows1258), 307 + 
("windows-1258", Encoding::Windows1258), 308 + ("x-cp1258", Encoding::Windows1258), 309 + // x-mac-cyrillic labels 310 + ("x-mac-cyrillic", Encoding::XMacCyrillic), 311 + ("x-mac-ukrainian", Encoding::XMacCyrillic), 58 312 ]; 59 313 60 314 /// Look up an encoding by its WHATWG label. ··· 102 356 Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Replacement).unwrap(), 103 357 Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Replacement).unwrap(), 104 358 Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Replacement).unwrap(), 359 + enc => { 360 + let table = single_byte::table_for(&enc).unwrap(); 361 + single_byte::decode_single_byte(bytes, table, enc.name(), ErrorMode::Replacement) 362 + .unwrap() 363 + } 105 364 } 106 365 } 107 366 ··· 113 372 Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Fatal), 114 373 Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Fatal), 115 374 Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Fatal), 375 + enc => { 376 + let table = single_byte::table_for(&enc).unwrap(); 377 + single_byte::decode_single_byte(bytes, table, enc.name(), ErrorMode::Fatal) 378 + } 116 379 } 117 380 } 118 381 119 382 /// Encode a string to bytes using the given encoding. 120 383 /// 121 - /// Only UTF-8 encoding is supported for encode. Per WHATWG spec, UTF-16 384 + /// Only UTF-8 encoding is supported for encode. Per WHATWG spec, all other 122 385 /// encodings are decode-only. 
123 386 pub fn encode(text: &str, encoding: Encoding) -> Result<Vec<u8>> { 124 387 match encoding { 125 388 Encoding::Utf8 => Ok(utf8::encode_utf8(text)), 126 - Encoding::Utf16Be => Err(EncodingError::EncodeNotSupported { 127 - encoding: "UTF-16BE", 128 - }), 129 - Encoding::Utf16Le => Err(EncodingError::EncodeNotSupported { 130 - encoding: "UTF-16LE", 389 + other => Err(EncodingError::EncodeNotSupported { 390 + encoding: other.name(), 131 391 }), 132 392 } 133 393 } ··· 177 437 assert_eq!(Encoding::Utf8.name(), "UTF-8"); 178 438 assert_eq!(Encoding::Utf16Be.name(), "UTF-16BE"); 179 439 assert_eq!(Encoding::Utf16Le.name(), "UTF-16LE"); 440 + assert_eq!(Encoding::Windows1252.name(), "windows-1252"); 441 + assert_eq!(Encoding::Iso8859_2.name(), "ISO-8859-2"); 442 + assert_eq!(Encoding::Koi8R.name(), "KOI8-R"); 443 + assert_eq!(Encoding::Macintosh.name(), "macintosh"); 180 444 } 181 445 182 446 // -- Label lookup -- ··· 204 468 } 205 469 206 470 #[test] 471 + fn lookup_windows_1252_labels() { 472 + // windows-1252 is THE most important single-byte encoding 473 + assert_eq!(lookup("windows-1252"), Some(Encoding::Windows1252)); 474 + assert_eq!(lookup("cp1252"), Some(Encoding::Windows1252)); 475 + assert_eq!(lookup("x-cp1252"), Some(Encoding::Windows1252)); 476 + // ISO-8859-1 maps to windows-1252 per WHATWG 477 + assert_eq!(lookup("iso-8859-1"), Some(Encoding::Windows1252)); 478 + assert_eq!(lookup("latin1"), Some(Encoding::Windows1252)); 479 + assert_eq!(lookup("l1"), Some(Encoding::Windows1252)); 480 + // US-ASCII maps to windows-1252 per WHATWG 481 + assert_eq!(lookup("us-ascii"), Some(Encoding::Windows1252)); 482 + assert_eq!(lookup("ascii"), Some(Encoding::Windows1252)); 483 + } 484 + 485 + #[test] 486 + fn lookup_legacy_labels() { 487 + assert_eq!(lookup("iso-8859-2"), Some(Encoding::Iso8859_2)); 488 + assert_eq!(lookup("latin2"), Some(Encoding::Iso8859_2)); 489 + assert_eq!(lookup("iso-8859-5"), Some(Encoding::Iso8859_5)); 490 + assert_eq!(lookup("cyrillic"), 
Some(Encoding::Iso8859_5)); 491 + assert_eq!(lookup("iso-8859-7"), Some(Encoding::Iso8859_7)); 492 + assert_eq!(lookup("greek"), Some(Encoding::Iso8859_7)); 493 + assert_eq!(lookup("iso-8859-15"), Some(Encoding::Iso8859_15)); 494 + assert_eq!(lookup("koi8-r"), Some(Encoding::Koi8R)); 495 + assert_eq!(lookup("koi8-u"), Some(Encoding::Koi8U)); 496 + assert_eq!(lookup("macintosh"), Some(Encoding::Macintosh)); 497 + assert_eq!(lookup("ibm866"), Some(Encoding::Ibm866)); 498 + assert_eq!(lookup("windows-1251"), Some(Encoding::Windows1251)); 499 + assert_eq!(lookup("windows-874"), Some(Encoding::Windows874)); 500 + assert_eq!(lookup("iso-8859-9"), Some(Encoding::Windows1254)); 501 + assert_eq!(lookup("x-mac-cyrillic"), Some(Encoding::XMacCyrillic)); 502 + } 503 + 504 + #[test] 207 505 fn lookup_with_whitespace() { 208 506 assert_eq!(lookup(" utf-8 "), Some(Encoding::Utf8)); 209 507 assert_eq!(lookup("\tutf-8\n"), Some(Encoding::Utf8)); 210 508 assert_eq!(lookup("\r\nutf-16le\r\n"), Some(Encoding::Utf16Le)); 509 + assert_eq!(lookup(" windows-1252 "), Some(Encoding::Windows1252)); 211 510 } 212 511 213 512 #[test] 214 513 fn lookup_unknown() { 215 - assert_eq!(lookup("latin1"), None); 216 514 assert_eq!(lookup(""), None); 217 515 assert_eq!(lookup(" "), None); 218 516 assert_eq!(lookup("utf-99"), None); 517 + assert_eq!(lookup("bogus-encoding"), None); 219 518 } 220 519 221 520 // -- BOM sniffing -- ··· 285 584 assert_eq!(decode(&[0x00, 0x41], Encoding::Utf16Be), "A"); 286 585 } 287 586 587 + #[test] 588 + fn decode_windows_1252_euro() { 589 + assert_eq!(decode(&[0x80], Encoding::Windows1252), "\u{20AC}"); 590 + } 591 + 592 + #[test] 593 + fn decode_windows_1252_cafe() { 594 + // "Café" in windows-1252 595 + assert_eq!( 596 + decode(&[0x43, 0x61, 0x66, 0xE9], Encoding::Windows1252), 597 + "Caf\u{00E9}" 598 + ); 599 + } 600 + 601 + #[test] 602 + fn decode_iso_8859_2() { 603 + // 0xA1 → Ą 604 + assert_eq!(decode(&[0xA1], Encoding::Iso8859_2), "\u{0104}"); 605 + } 606 + 607 + 
#[test] 608 + fn decode_koi8r_cyrillic() { 609 + // 0xE1 → А (U+0410) 610 + assert_eq!(decode(&[0xE1], Encoding::Koi8R), "\u{0410}"); 611 + } 612 + 613 + #[test] 614 + fn decode_windows_1251_cyrillic() { 615 + // 0xC0 → А (U+0410), 0xE0 → а (U+0430) 616 + assert_eq!( 617 + decode(&[0xC0, 0xE0], Encoding::Windows1251), 618 + "\u{0410}\u{0430}" 619 + ); 620 + } 621 + 288 622 // -- Top-level decode_strict -- 289 623 290 624 #[test] ··· 297 631 assert!(decode_strict(&[0xFF], Encoding::Utf8).is_err()); 298 632 } 299 633 634 + #[test] 635 + fn decode_strict_single_byte_unmapped() { 636 + // ISO-8859-3 byte 0xA5 is unmapped 637 + assert!(decode_strict(&[0xA5], Encoding::Iso8859_3).is_err()); 638 + } 639 + 640 + #[test] 641 + fn decode_strict_single_byte_valid() { 642 + assert_eq!( 643 + decode_strict(&[0x80], Encoding::Windows1252).unwrap(), 644 + "\u{20AC}" 645 + ); 646 + } 647 + 300 648 // -- Top-level encode -- 301 649 302 650 #[test] ··· 305 653 } 306 654 307 655 #[test] 308 - fn encode_utf16_not_supported() { 656 + fn encode_non_utf8_not_supported() { 309 657 assert!(matches!( 310 658 encode("Hello", Encoding::Utf16Le), 311 - Err(EncodingError::EncodeNotSupported { 312 - encoding: "UTF-16LE" 313 - }) 659 + Err(EncodingError::EncodeNotSupported { .. }) 314 660 )); 315 661 assert!(matches!( 316 662 encode("Hello", Encoding::Utf16Be), 317 - Err(EncodingError::EncodeNotSupported { 318 - encoding: "UTF-16BE" 319 - }) 663 + Err(EncodingError::EncodeNotSupported { .. }) 664 + )); 665 + assert!(matches!( 666 + encode("Hello", Encoding::Windows1252), 667 + Err(EncodingError::EncodeNotSupported { .. }) 320 668 )); 321 669 } 322 670
+858
crates/encoding/src/single_byte.rs
··· 1 + //! Single-byte encoding decoders per WHATWG Encoding Standard §14.1. 2 + //! 3 + //! Each single-byte encoding maps bytes 0x00–0x7F to ASCII and 4 + //! bytes 0x80–0xFF via a 128-entry lookup table to Unicode codepoints. 5 + 6 + use crate::error::{EncodingError, Result}; 7 + use crate::utf8::ErrorMode; 8 + 9 + /// Decode bytes using a single-byte encoding lookup table. 10 + /// 11 + /// Bytes 0x00–0x7F are ASCII. Bytes 0x80–0xFF are looked up in `table`. 12 + /// Table entries of `0xFFFD` indicate unmapped bytes. 13 + pub(crate) fn decode_single_byte( 14 + bytes: &[u8], 15 + table: &[u16; 128], 16 + encoding_name: &'static str, 17 + mode: ErrorMode, 18 + ) -> Result<String> { 19 + let mut output = String::with_capacity(bytes.len()); 20 + for (i, &byte) in bytes.iter().enumerate() { 21 + if byte < 0x80 { 22 + output.push(byte as char); 23 + } else { 24 + let cp = table[(byte - 0x80) as usize]; 25 + if cp == 0xFFFD { 26 + if mode == ErrorMode::Fatal { 27 + return Err(EncodingError::InvalidSequence { 28 + encoding: encoding_name, 29 + position: i, 30 + }); 31 + } 32 + output.push('\u{FFFD}'); 33 + } else { 34 + output.push(char::from_u32(cp as u32).unwrap_or('\u{FFFD}')); 35 + } 36 + } 37 + } 38 + Ok(output) 39 + } 40 + 41 + // --------------------------------------------------------------------------- 42 + // WHATWG single-byte encoding index tables 43 + // Each table maps bytes 0x80–0xFF (index 0–127) to Unicode codepoints. 44 + // 0xFFFD = unmapped byte position. 45 + // --------------------------------------------------------------------------- 46 + 47 + /// windows-1252 (WHATWG index) 48 + /// Also serves as the encoding for labels: iso-8859-1, us-ascii, latin1, etc. 
49 + pub(crate) const WINDOWS_1252: [u16; 128] = [ 50 + // 0x80–0x8F 51 + 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x02C6, 0x2030, 0x0160, 0x2039, 52 + 0x0152, 0x008D, 0x017D, 0x008F, // 0x90–0x9F 53 + 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x02DC, 0x2122, 0x0161, 0x203A, 54 + 0x0153, 0x009D, 0x017E, 0x0178, // 0xA0–0xAF 55 + 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x00AA, 0x00AB, 56 + 0x00AC, 0x00AD, 0x00AE, 0x00AF, // 0xB0–0xBF 57 + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x00BA, 0x00BB, 58 + 0x00BC, 0x00BD, 0x00BE, 0x00BF, // 0xC0–0xCF 59 + 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, 60 + 0x00CC, 0x00CD, 0x00CE, 0x00CF, // 0xD0–0xDF 61 + 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, 0x00D8, 0x00D9, 0x00DA, 0x00DB, 62 + 0x00DC, 0x00DD, 0x00DE, 0x00DF, // 0xE0–0xEF 63 + 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x00EB, 64 + 0x00EC, 0x00ED, 0x00EE, 0x00EF, // 0xF0–0xFF 65 + 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, 0x00F8, 0x00F9, 0x00FA, 0x00FB, 66 + 0x00FC, 0x00FD, 0x00FE, 0x00FF, 67 + ]; 68 + 69 + /// ISO-8859-2 (Latin-2, Central European) 70 + pub(crate) const ISO_8859_2: [u16; 128] = [ 71 + // 0x80–0x8F: C1 controls 72 + 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 73 + 0x008C, 0x008D, 0x008E, 0x008F, // 0x90–0x9F: C1 controls 74 + 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 75 + 0x009C, 0x009D, 0x009E, 0x009F, // 0xA0–0xAF 76 + 0x00A0, 0x0104, 0x02D8, 0x0141, 0x00A4, 0x013D, 0x015A, 0x00A7, 0x00A8, 0x0160, 0x015E, 0x0164, 77 + 0x0179, 0x00AD, 0x017D, 0x017B, // 0xB0–0xBF 78 + 0x00B0, 0x0105, 0x02DB, 0x0142, 0x00B4, 0x013E, 0x015B, 0x02C7, 0x00B8, 0x0161, 0x015F, 0x0165, 79 + 0x017A, 0x02DD, 
0x017E, 0x017C, // 0xC0–0xCF 80 + 0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7, 0x010C, 0x00C9, 0x0118, 0x00CB, 81 + 0x011A, 0x00CD, 0x00CE, 0x010E, // 0xD0–0xDF 82 + 0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7, 0x0158, 0x016E, 0x00DA, 0x0170, 83 + 0x00DC, 0x00DD, 0x0162, 0x00DF, // 0xE0–0xEF 84 + 0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7, 0x010D, 0x00E9, 0x0119, 0x00EB, 85 + 0x011B, 0x00ED, 0x00EE, 0x010F, // 0xF0–0xFF 86 + 0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7, 0x0159, 0x016F, 0x00FA, 0x0171, 87 + 0x00FC, 0x00FD, 0x0163, 0x02D9, 88 + ]; 89 + 90 + /// ISO-8859-3 (Latin-3, South European) 91 + pub(crate) const ISO_8859_3: [u16; 128] = [ 92 + // 0x80–0x9F: C1 controls 93 + 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 94 + 0x008C, 0x008D, 0x008E, 0x008F, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 95 + 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, // 0xA0–0xAF 96 + 0x00A0, 0x0126, 0x02D8, 0x00A3, 0x00A4, 0xFFFD, 0x0124, 0x00A7, 0x00A8, 0x0130, 0x015E, 0x011E, 97 + 0x0134, 0x00AD, 0xFFFD, 0x017B, // 0xB0–0xBF 98 + 0x00B0, 0x0127, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x0125, 0x00B7, 0x00B8, 0x0131, 0x015F, 0x011F, 99 + 0x0135, 0x00BD, 0xFFFD, 0x017C, // 0xC0–0xCF 100 + 0x00C0, 0x00C1, 0x00C2, 0xFFFD, 0x00C4, 0x010A, 0x0108, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, 101 + 0x00CC, 0x00CD, 0x00CE, 0x00CF, // 0xD0–0xDF 102 + 0xFFFD, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x0120, 0x00D6, 0x00D7, 0x011C, 0x00D9, 0x00DA, 0x00DB, 103 + 0x00DC, 0x016C, 0x015C, 0x00DF, // 0xE0–0xEF 104 + 0x00E0, 0x00E1, 0x00E2, 0xFFFD, 0x00E4, 0x010B, 0x0109, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x00EB, 105 + 0x00EC, 0x00ED, 0x00EE, 0x00EF, // 0xF0–0xFF 106 + 0xFFFD, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x0121, 0x00F6, 0x00F7, 0x011D, 0x00F9, 0x00FA, 0x00FB, 107 + 0x00FC, 0x016D, 0x015D, 0x02D9, 108 + ]; 109 + 110 + /// ISO-8859-4 (Latin-4, North 
European) 111 + pub(crate) const ISO_8859_4: [u16; 128] = [ 112 + // 0x80–0x9F: C1 controls 113 + 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 114 + 0x008C, 0x008D, 0x008E, 0x008F, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 115 + 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, // 0xA0–0xAF 116 + 0x00A0, 0x0104, 0x0138, 0x0156, 0x00A4, 0x0128, 0x013B, 0x00A7, 0x00A8, 0x0160, 0x0112, 0x0122, 117 + 0x0166, 0x00AD, 0x017D, 0x00AF, // 0xB0–0xBF 118 + 0x00B0, 0x0105, 0x02DB, 0x0157, 0x00B4, 0x0129, 0x013C, 0x02C7, 0x00B8, 0x0161, 0x0113, 0x0123, 119 + 0x0167, 0x014A, 0x017E, 0x014B, // 0xC0–0xCF 120 + 0x0100, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x012E, 0x010C, 0x00C9, 0x0118, 0x00CB, 121 + 0x0116, 0x00CD, 0x00CE, 0x012A, // 0xD0–0xDF 122 + 0x0110, 0x0145, 0x014C, 0x0136, 0x00D4, 0x00D5, 0x00D6, 0x00D7, 0x00D8, 0x0172, 0x00DA, 0x00DB, 123 + 0x00DC, 0x0168, 0x016A, 0x00DF, // 0xE0–0xEF 124 + 0x0101, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x012F, 0x010D, 0x00E9, 0x0119, 0x00EB, 125 + 0x0117, 0x00ED, 0x00EE, 0x012B, // 0xF0–0xFF 126 + 0x0111, 0x0146, 0x014D, 0x0137, 0x00F4, 0x00F5, 0x00F6, 0x00F7, 0x00F8, 0x0173, 0x00FA, 0x00FB, 127 + 0x00FC, 0x0169, 0x016B, 0x02D9, 128 + ]; 129 + 130 + /// ISO-8859-5 (Cyrillic) 131 + pub(crate) const ISO_8859_5: [u16; 128] = [ 132 + // 0x80–0x9F: C1 controls 133 + 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 134 + 0x008C, 0x008D, 0x008E, 0x008F, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 135 + 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, // 0xA0–0xAF 136 + 0x00A0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, 0x0408, 0x0409, 0x040A, 0x040B, 137 + 0x040C, 0x00AD, 0x040E, 0x040F, // 0xB0–0xBF 138 + 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 139 + 0x041C, 0x041D, 0x041E, 0x041F, // 
0xC0–0xCF 140 + 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 141 + 0x042C, 0x042D, 0x042E, 0x042F, // 0xD0–0xDF 142 + 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 143 + 0x043C, 0x043D, 0x043E, 0x043F, // 0xE0–0xEF 144 + 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 145 + 0x044C, 0x044D, 0x044E, 0x044F, // 0xF0–0xFF 146 + 0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, 0x0458, 0x0459, 0x045A, 0x045B, 147 + 0x045C, 0x00A7, 0x045E, 0x045F, 148 + ]; 149 + 150 + /// ISO-8859-6 (Arabic) 151 + pub(crate) const ISO_8859_6: [u16; 128] = [ 152 + // 0x80–0x9F: C1 controls 153 + 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 154 + 0x008C, 0x008D, 0x008E, 0x008F, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 155 + 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, // 0xA0–0xAF 156 + 0x00A0, 0xFFFD, 0xFFFD, 0xFFFD, 0x00A4, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 157 + 0x060C, 0x00AD, 0xFFFD, 0xFFFD, // 0xB0–0xBF 158 + 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x061B, 159 + 0xFFFD, 0xFFFD, 0xFFFD, 0x061F, // 0xC0–0xCF 160 + 0xFFFD, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627, 0x0628, 0x0629, 0x062A, 0x062B, 161 + 0x062C, 0x062D, 0x062E, 0x062F, // 0xD0–0xDF 162 + 0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637, 0x0638, 0x0639, 0x063A, 0xFFFD, 163 + 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, // 0xE0–0xEF 164 + 0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647, 0x0648, 0x0649, 0x064A, 0x064B, 165 + 0x064C, 0x064D, 0x064E, 0x064F, // 0xF0–0xFF 166 + 0x0650, 0x0651, 0x0652, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 167 + 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 168 + ]; 169 + 170 + /// ISO-8859-7 (Greek) 171 + pub(crate) const 
ISO_8859_7: [u16; 128] = [ 172 + // 0x80–0x9F: C1 controls 173 + 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 174 + 0x008C, 0x008D, 0x008E, 0x008F, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 175 + 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, // 0xA0–0xAF 176 + 0x00A0, 0x2018, 0x2019, 0x00A3, 0x20AC, 0x20AF, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x037A, 0x00AB, 177 + 0x00AC, 0x00AD, 0xFFFD, 0x2015, // 0xB0–0xBF 178 + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x0384, 0x0385, 0x0386, 0x00B7, 0x0388, 0x0389, 0x038A, 0x00BB, 179 + 0x038C, 0x00BD, 0x038E, 0x038F, // 0xC0–0xCF 180 + 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x0398, 0x0399, 0x039A, 0x039B, 181 + 0x039C, 0x039D, 0x039E, 0x039F, // 0xD0–0xDF 182 + 0x03A0, 0x03A1, 0xFFFD, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7, 0x03A8, 0x03A9, 0x03AA, 0x03AB, 183 + 0x03AC, 0x03AD, 0x03AE, 0x03AF, // 0xE0–0xEF 184 + 0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, 0x03B8, 0x03B9, 0x03BA, 0x03BB, 185 + 0x03BC, 0x03BD, 0x03BE, 0x03BF, // 0xF0–0xFF 186 + 0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, 0x03C8, 0x03C9, 0x03CA, 0x03CB, 187 + 0x03CC, 0x03CD, 0x03CE, 0xFFFD, 188 + ]; 189 + 190 + /// ISO-8859-8 (Hebrew) — also used for ISO-8859-8-I (logical order) 191 + pub(crate) const ISO_8859_8: [u16; 128] = [ 192 + // 0x80–0x9F: C1 controls 193 + 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 194 + 0x008C, 0x008D, 0x008E, 0x008F, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 195 + 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, // 0xA0–0xAF 196 + 0x00A0, 0xFFFD, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x00D7, 0x00AB, 197 + 0x00AC, 0x00AD, 0x00AE, 0x00AF, // 0xB0–0xBF 198 + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x00F7, 0x00BB, 199 + 0x00BC, 0x00BD, 0x00BE, 
0xFFFD, // 0xC0–0xCF 200 + 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 201 + 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, // 0xD0–0xDF 202 + 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 203 + 0xFFFD, 0xFFFD, 0xFFFD, 0x2017, // 0xE0–0xEF 204 + 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7, 0x05D8, 0x05D9, 0x05DA, 0x05DB, 205 + 0x05DC, 0x05DD, 0x05DE, 0x05DF, // 0xF0–0xFF 206 + 0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7, 0x05E8, 0x05E9, 0x05EA, 0xFFFD, 207 + 0xFFFD, 0x200E, 0x200F, 0xFFFD, 208 + ]; 209 + 210 + /// ISO-8859-10 (Latin-6, Nordic) 211 + pub(crate) const ISO_8859_10: [u16; 128] = [ 212 + // 0x80–0x9F: C1 controls 213 + 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 214 + 0x008C, 0x008D, 0x008E, 0x008F, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 215 + 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, // 0xA0–0xAF 216 + 0x00A0, 0x0104, 0x0112, 0x0122, 0x012A, 0x0128, 0x0136, 0x00A7, 0x013B, 0x0110, 0x0160, 0x0166, 217 + 0x017D, 0x00AD, 0x016A, 0x014A, // 0xB0–0xBF 218 + 0x00B0, 0x0105, 0x0113, 0x0123, 0x012B, 0x0129, 0x0137, 0x00B7, 0x013C, 0x0111, 0x0161, 0x0167, 219 + 0x017E, 0x2015, 0x016B, 0x014B, // 0xC0–0xCF 220 + 0x0100, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x012E, 0x010C, 0x00C9, 0x0118, 0x00CB, 221 + 0x0116, 0x00CD, 0x00CE, 0x00CF, // 0xD0–0xDF 222 + 0x00D0, 0x0145, 0x014C, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x0168, 0x00D8, 0x0172, 0x00DA, 0x00DB, 223 + 0x00DC, 0x00DD, 0x00DE, 0x00DF, // 0xE0–0xEF 224 + 0x0101, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x012F, 0x010D, 0x00E9, 0x0119, 0x00EB, 225 + 0x0117, 0x00ED, 0x00EE, 0x00EF, // 0xF0–0xFF 226 + 0x00F0, 0x0146, 0x014D, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x0169, 0x00F8, 0x0173, 0x00FA, 0x00FB, 227 + 0x00FC, 0x00FD, 0x00FE, 0x0138, 228 + ]; 229 + 230 + /// ISO-8859-13 (Latin-7, 
Baltic Rim) 231 + pub(crate) const ISO_8859_13: [u16; 128] = [ 232 + // 0x80–0x9F: C1 controls 233 + 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 234 + 0x008C, 0x008D, 0x008E, 0x008F, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 235 + 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, // 0xA0–0xAF 236 + 0x00A0, 0x201D, 0x00A2, 0x00A3, 0x00A4, 0x201E, 0x00A6, 0x00A7, 0x00D8, 0x00A9, 0x0156, 0x00AB, 237 + 0x00AC, 0x00AD, 0x00AE, 0x00C6, // 0xB0–0xBF 238 + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x201C, 0x00B5, 0x00B6, 0x00B7, 0x00F8, 0x00B9, 0x0157, 0x00BB, 239 + 0x00BC, 0x00BD, 0x00BE, 0x00E6, // 0xC0–0xCF 240 + 0x0104, 0x012E, 0x0100, 0x0106, 0x00C4, 0x00C5, 0x0118, 0x0112, 0x010C, 0x00C9, 0x0179, 0x0116, 241 + 0x0122, 0x0136, 0x012A, 0x013B, // 0xD0–0xDF 242 + 0x0160, 0x0143, 0x0145, 0x00D3, 0x014C, 0x00D5, 0x00D6, 0x00D7, 0x0172, 0x0141, 0x015A, 0x016A, 243 + 0x00DC, 0x017B, 0x017D, 0x00DF, // 0xE0–0xEF 244 + 0x0105, 0x012F, 0x0101, 0x0107, 0x00E4, 0x00E5, 0x0119, 0x0113, 0x010D, 0x00E9, 0x017A, 0x0117, 245 + 0x0123, 0x0137, 0x012B, 0x013C, // 0xF0–0xFF 246 + 0x0161, 0x0144, 0x0146, 0x00F3, 0x014D, 0x00F5, 0x00F6, 0x00F7, 0x0173, 0x0142, 0x015B, 0x016B, 247 + 0x00FC, 0x017C, 0x017E, 0x2019, 248 + ]; 249 + 250 + /// ISO-8859-14 (Latin-8, Celtic) 251 + pub(crate) const ISO_8859_14: [u16; 128] = [ 252 + // 0x80–0x9F: C1 controls 253 + 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 254 + 0x008C, 0x008D, 0x008E, 0x008F, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 255 + 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, // 0xA0–0xAF 256 + 0x00A0, 0x1E02, 0x1E03, 0x00A3, 0x010A, 0x010B, 0x1E0A, 0x00A7, 0x1E80, 0x00A9, 0x1E82, 0x1E0B, 257 + 0x1EF2, 0x00AD, 0x00AE, 0x0178, // 0xB0–0xBF 258 + 0x1E1E, 0x1E1F, 0x0120, 0x0121, 0x1E40, 0x1E41, 0x00B6, 0x1E56, 0x1E81, 0x1E57, 0x1E83, 0x1E60, 259 + 0x1EF3, 0x1E84, 0x1E85, 
0x1E61, // 0xC0–0xCF 260 + 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, 261 + 0x00CC, 0x00CD, 0x00CE, 0x00CF, // 0xD0–0xDF 262 + 0x0174, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x1E6A, 0x00D8, 0x00D9, 0x00DA, 0x00DB, 263 + 0x00DC, 0x00DD, 0x0176, 0x00DF, // 0xE0–0xEF 264 + 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x00EB, 265 + 0x00EC, 0x00ED, 0x00EE, 0x00EF, // 0xF0–0xFF 266 + 0x0175, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x1E6B, 0x00F8, 0x00F9, 0x00FA, 0x00FB, 267 + 0x00FC, 0x00FD, 0x0177, 0x00FF, 268 + ]; 269 + 270 + /// ISO-8859-15 (Latin-9, updated Western European) 271 + pub(crate) const ISO_8859_15: [u16; 128] = [ 272 + // 0x80–0x9F: C1 controls 273 + 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 274 + 0x008C, 0x008D, 0x008E, 0x008F, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 275 + 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, 276 + // 0xA0–0xAF: Differs from 8859-1 at A4, A6, A8, B4, B8, BC, BD, BE 277 + 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x20AC, 0x00A5, 0x0160, 0x00A7, 0x0161, 0x00A9, 0x00AA, 0x00AB, 278 + 0x00AC, 0x00AD, 0x00AE, 0x00AF, // 0xB0–0xBF 279 + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x017D, 0x00B5, 0x00B6, 0x00B7, 0x017E, 0x00B9, 0x00BA, 0x00BB, 280 + 0x0152, 0x0153, 0x0178, 0x00BF, // 0xC0–0xFF: same as ISO-8859-1 281 + 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, 282 + 0x00CC, 0x00CD, 0x00CE, 0x00CF, 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, 283 + 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, 0x00E0, 0x00E1, 0x00E2, 0x00E3, 284 + 0x00E4, 0x00E5, 0x00E6, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, 285 + 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, 0x00F8, 0x00F9, 0x00FA, 0x00FB, 286 + 0x00FC, 0x00FD, 0x00FE, 0x00FF, 
287 + ]; 288 + 289 + /// ISO-8859-16 (Latin-10, South-Eastern European) 290 + pub(crate) const ISO_8859_16: [u16; 128] = [ 291 + // 0x80–0x9F: C1 controls 292 + 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 293 + 0x008C, 0x008D, 0x008E, 0x008F, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 294 + 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, // 0xA0–0xAF 295 + 0x00A0, 0x0104, 0x0105, 0x0141, 0x20AC, 0x201E, 0x0160, 0x00A7, 0x0161, 0x00A9, 0x0218, 0x00AB, 296 + 0x0179, 0x00AD, 0x017A, 0x017B, // 0xB0–0xBF 297 + 0x00B0, 0x00B1, 0x010C, 0x0142, 0x017D, 0x201D, 0x00B6, 0x00B7, 0x017E, 0x010D, 0x0219, 0x00BB, 298 + 0x0152, 0x0153, 0x0178, 0x017C, // 0xC0–0xCF 299 + 0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0106, 0x00C6, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, 300 + 0x00CC, 0x00CD, 0x00CE, 0x00CF, // 0xD0–0xDF 301 + 0x0110, 0x0143, 0x00D2, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x015A, 0x0170, 0x00D9, 0x00DA, 0x00DB, 302 + 0x00DC, 0x0118, 0x021A, 0x00DF, // 0xE0–0xEF 303 + 0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x0107, 0x00E6, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x00EB, 304 + 0x00EC, 0x00ED, 0x00EE, 0x00EF, // 0xF0–0xFF 305 + 0x0111, 0x0144, 0x00F2, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x015B, 0x0171, 0x00F9, 0x00FA, 0x00FB, 306 + 0x00FC, 0x0119, 0x021B, 0x00FF, 307 + ]; 308 + 309 + /// KOI8-R (Russian) 310 + pub(crate) const KOI8_R: [u16; 128] = [ 311 + // 0x80–0x8F: box drawing 312 + 0x2500, 0x2502, 0x250C, 0x2510, 0x2514, 0x2518, 0x251C, 0x2524, 0x252C, 0x2534, 0x253C, 0x2580, 313 + 0x2584, 0x2588, 0x258C, 0x2590, // 0x90–0x9F: more box drawing + block elements 314 + 0x2591, 0x2592, 0x2593, 0x2320, 0x25A0, 0x2219, 0x221A, 0x2248, 0x2264, 0x2265, 0x00A0, 0x2321, 315 + 0x00B0, 0x00B2, 0x00B7, 0x00F7, // 0xA0–0xAF 316 + 0x2550, 0x2551, 0x2552, 0x0451, 0x2553, 0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255A, 317 + 0x255B, 0x255C, 0x255D, 0x255E, // 0xB0–0xBF 318 + 0x255F, 0x2560, 0x2561, 0x0401, 
0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567, 0x2568, 0x2569, 319 + 0x256A, 0x256B, 0x256C, 0x00A9, // 0xC0–0xCF: Cyrillic lowercase 320 + 0x044E, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433, 0x0445, 0x0438, 0x0439, 0x043A, 321 + 0x043B, 0x043C, 0x043D, 0x043E, // 0xD0–0xDF 322 + 0x043F, 0x044F, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432, 0x044C, 0x044B, 0x0437, 0x0448, 323 + 0x044D, 0x0449, 0x0447, 0x044A, // 0xE0–0xEF: Cyrillic uppercase 324 + 0x042E, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413, 0x0425, 0x0418, 0x0419, 0x041A, 325 + 0x041B, 0x041C, 0x041D, 0x041E, // 0xF0–0xFF 326 + 0x041F, 0x042F, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412, 0x042C, 0x042B, 0x0417, 0x0428, 327 + 0x042D, 0x0429, 0x0427, 0x042A, 328 + ]; 329 + 330 + /// KOI8-U (Ukrainian) 331 + pub(crate) const KOI8_U: [u16; 128] = [ 332 + // 0x80–0x8F: box drawing (same as KOI8-R) 333 + 0x2500, 0x2502, 0x250C, 0x2510, 0x2514, 0x2518, 0x251C, 0x2524, 0x252C, 0x2534, 0x253C, 0x2580, 334 + 0x2584, 0x2588, 0x258C, 0x2590, // 0x90–0x9F 335 + 0x2591, 0x2592, 0x2593, 0x2320, 0x25A0, 0x2219, 0x221A, 0x2248, 0x2264, 0x2265, 0x00A0, 0x2321, 336 + 0x00B0, 0x00B2, 0x00B7, 0x00F7, 337 + // 0xA0–0xAF: differs from KOI8-R at A3, A4, A6, A7 338 + 0x2550, 0x2551, 0x2552, 0x0451, 0x0454, 0x2554, 0x0456, 0x0457, 0x2557, 0x2558, 0x2559, 0x255A, 339 + 0x255B, 0x0491, 0x255D, 0x255E, // 0xB0–0xBF 340 + 0x255F, 0x2560, 0x2561, 0x0401, 0x0404, 0x2563, 0x0406, 0x0407, 0x2566, 0x2567, 0x2568, 0x2569, 341 + 0x256A, 0x0490, 0x256C, 0x00A9, // 0xC0–0xFF: Cyrillic (same as KOI8-R) 342 + 0x044E, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433, 0x0445, 0x0438, 0x0439, 0x043A, 343 + 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, 0x044F, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432, 344 + 0x044C, 0x044B, 0x0437, 0x0448, 0x044D, 0x0449, 0x0447, 0x044A, 0x042E, 0x0410, 0x0411, 0x0426, 345 + 0x0414, 0x0415, 0x0424, 0x0413, 0x0425, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 346 + 0x041F, 
0x042F, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412, 0x042C, 0x042B, 0x0417, 0x0428, 347 + 0x042D, 0x0429, 0x0427, 0x042A, 348 + ]; 349 + 350 + /// macintosh (Mac OS Roman) 351 + pub(crate) const MACINTOSH: [u16; 128] = [ 352 + // 0x80–0x8F 353 + 0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1, 0x00E0, 0x00E2, 0x00E4, 0x00E3, 354 + 0x00E5, 0x00E7, 0x00E9, 0x00E8, // 0x90–0x9F 355 + 0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3, 0x00F2, 0x00F4, 0x00F6, 0x00F5, 356 + 0x00FA, 0x00F9, 0x00FB, 0x00FC, // 0xA0–0xAF 357 + 0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF, 0x00AE, 0x00A9, 0x2122, 0x00B4, 358 + 0x00A8, 0x2260, 0x00C6, 0x00D8, // 0xB0–0xBF 359 + 0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211, 0x220F, 0x03C0, 0x222B, 0x00AA, 360 + 0x00BA, 0x2126, 0x00E6, 0x00F8, // 0xC0–0xCF 361 + 0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB, 0x00BB, 0x2026, 0x00A0, 0x00C0, 362 + 0x00C3, 0x00D5, 0x0152, 0x0153, // 0xD0–0xDF 363 + 0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA, 0x00FF, 0x0178, 0x2044, 0x20AC, 364 + 0x2039, 0x203A, 0xFB01, 0xFB02, // 0xE0–0xEF 365 + 0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1, 0x00CB, 0x00C8, 0x00CD, 0x00CE, 366 + 0x00CF, 0x00CC, 0x00D3, 0x00D4, // 0xF0–0xFF 367 + 0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC, 0x00AF, 0x02D8, 0x02D9, 0x02DA, 368 + 0x00B8, 0x02DD, 0x02DB, 0x02C7, 369 + ]; 370 + 371 + /// IBM866 (DOS Cyrillic) 372 + pub(crate) const IBM866: [u16; 128] = [ 373 + // 0x80–0x8F: Cyrillic uppercase А–П 374 + 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 375 + 0x041C, 0x041D, 0x041E, 0x041F, // 0x90–0x9F: Cyrillic uppercase Р–Я 376 + 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 377 + 0x042C, 0x042D, 0x042E, 0x042F, // 0xA0–0xAF: Cyrillic lowercase а–п 378 + 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 
0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 379 + 0x043C, 0x043D, 0x043E, 0x043F, // 0xB0–0xBF: box drawing light 380 + 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 381 + 0x255D, 0x255C, 0x255B, 0x2510, // 0xC0–0xCF: box drawing 382 + 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F, 0x255A, 0x2554, 0x2569, 0x2566, 383 + 0x2560, 0x2550, 0x256C, 0x2567, // 0xD0–0xDF: more box drawing 384 + 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B, 0x256A, 0x2518, 0x250C, 0x2588, 385 + 0x2584, 0x258C, 0x2590, 0x2580, // 0xE0–0xEF: Cyrillic lowercase р–я 386 + 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 387 + 0x044C, 0x044D, 0x044E, 0x044F, // 0xF0–0xFF 388 + 0x0401, 0x0451, 0x0404, 0x0454, 0x0407, 0x0457, 0x040E, 0x045E, 0x00B0, 0x2219, 0x00B7, 0x221A, 389 + 0x2116, 0x00A4, 0x25A0, 0x00A0, 390 + ]; 391 + 392 + /// windows-874 (Thai) 393 + pub(crate) const WINDOWS_874: [u16; 128] = [ 394 + // 0x80–0x8F 395 + 0x20AC, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x2026, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 396 + 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, // 0x90–0x9F 397 + 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 398 + 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, // 0xA0–0xAF 399 + 0x00A0, 0x0E01, 0x0E02, 0x0E03, 0x0E04, 0x0E05, 0x0E06, 0x0E07, 0x0E08, 0x0E09, 0x0E0A, 0x0E0B, 400 + 0x0E0C, 0x0E0D, 0x0E0E, 0x0E0F, // 0xB0–0xBF 401 + 0x0E10, 0x0E11, 0x0E12, 0x0E13, 0x0E14, 0x0E15, 0x0E16, 0x0E17, 0x0E18, 0x0E19, 0x0E1A, 0x0E1B, 402 + 0x0E1C, 0x0E1D, 0x0E1E, 0x0E1F, // 0xC0–0xCF 403 + 0x0E20, 0x0E21, 0x0E22, 0x0E23, 0x0E24, 0x0E25, 0x0E26, 0x0E27, 0x0E28, 0x0E29, 0x0E2A, 0x0E2B, 404 + 0x0E2C, 0x0E2D, 0x0E2E, 0x0E2F, // 0xD0–0xDF 405 + 0x0E30, 0x0E31, 0x0E32, 0x0E33, 0x0E34, 0x0E35, 0x0E36, 0x0E37, 0x0E38, 0x0E39, 0x0E3A, 0xFFFD, 406 + 0xFFFD, 0xFFFD, 0xFFFD, 0x0E3F, // 0xE0–0xEF 407 + 0x0E40, 0x0E41, 0x0E42, 0x0E43, 
0x0E44, 0x0E45, 0x0E46, 0x0E47, 0x0E48, 0x0E49, 0x0E4A, 0x0E4B, 408 + 0x0E4C, 0x0E4D, 0x0E4E, 0x0E4F, // 0xF0–0xFF 409 + 0x0E50, 0x0E51, 0x0E52, 0x0E53, 0x0E54, 0x0E55, 0x0E56, 0x0E57, 0x0E58, 0x0E59, 0x0E5A, 0x0E5B, 410 + 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 411 + ]; 412 + 413 + /// windows-1250 (Central European) 414 + pub(crate) const WINDOWS_1250: [u16; 128] = [ 415 + // 0x80–0x8F 416 + 0x20AC, 0xFFFD, 0x201A, 0xFFFD, 0x201E, 0x2026, 0x2020, 0x2021, 0xFFFD, 0x2030, 0x0160, 0x2039, 417 + 0x015A, 0x0164, 0x017D, 0x0179, // 0x90–0x9F 418 + 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0xFFFD, 0x2122, 0x0161, 0x203A, 419 + 0x015B, 0x0165, 0x017E, 0x017A, // 0xA0–0xAF 420 + 0x00A0, 0x02C7, 0x02D8, 0x0141, 0x00A4, 0x0104, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x015E, 0x00AB, 421 + 0x00AC, 0x00AD, 0x00AE, 0x017B, // 0xB0–0xBF 422 + 0x00B0, 0x00B1, 0x02DB, 0x0142, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x0105, 0x015F, 0x00BB, 423 + 0x013D, 0x02DD, 0x013E, 0x017C, // 0xC0–0xCF 424 + 0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7, 0x010C, 0x00C9, 0x0118, 0x00CB, 425 + 0x011A, 0x00CD, 0x00CE, 0x010E, // 0xD0–0xDF 426 + 0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7, 0x0158, 0x016E, 0x00DA, 0x0170, 427 + 0x00DC, 0x00DD, 0x0162, 0x00DF, // 0xE0–0xEF 428 + 0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7, 0x010D, 0x00E9, 0x0119, 0x00EB, 429 + 0x011B, 0x00ED, 0x00EE, 0x010F, // 0xF0–0xFF 430 + 0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7, 0x0159, 0x016F, 0x00FA, 0x0171, 431 + 0x00FC, 0x00FD, 0x0163, 0x02D9, 432 + ]; 433 + 434 + /// windows-1251 (Cyrillic) 435 + pub(crate) const WINDOWS_1251: [u16; 128] = [ 436 + // 0x80–0x8F 437 + 0x0402, 0x0403, 0x201A, 0x0453, 0x201E, 0x2026, 0x2020, 0x2021, 0x20AC, 0x2030, 0x0409, 0x2039, 438 + 0x040A, 0x040C, 0x040B, 0x040F, // 0x90–0x9F 439 + 0x0452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0xFFFD, 0x2122, 0x0459, 0x203A, 440 + 0x045A, 0x045C, 0x045B, 
0x045F, // 0xA0–0xAF 441 + 0x00A0, 0x040E, 0x045E, 0x0408, 0x00A4, 0x0490, 0x00A6, 0x00A7, 0x0401, 0x00A9, 0x0404, 0x00AB, 442 + 0x00AC, 0x00AD, 0x00AE, 0x0407, // 0xB0–0xBF 443 + 0x00B0, 0x00B1, 0x0406, 0x0456, 0x0491, 0x00B5, 0x00B6, 0x00B7, 0x0451, 0x2116, 0x0454, 0x00BB, 444 + 0x0458, 0x0405, 0x0455, 0x0457, // 0xC0–0xCF: А–П 445 + 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 446 + 0x041C, 0x041D, 0x041E, 0x041F, // 0xD0–0xDF: Р–Я 447 + 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 448 + 0x042C, 0x042D, 0x042E, 0x042F, // 0xE0–0xEF: а–п 449 + 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 450 + 0x043C, 0x043D, 0x043E, 0x043F, // 0xF0–0xFF: р–я 451 + 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 452 + 0x044C, 0x044D, 0x044E, 0x044F, 453 + ]; 454 + 455 + /// windows-1253 (Greek) 456 + pub(crate) const WINDOWS_1253: [u16; 128] = [ 457 + // 0x80–0x8F 458 + 0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0xFFFD, 0x2030, 0xFFFD, 0x2039, 459 + 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, // 0x90–0x9F 460 + 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0xFFFD, 0x2122, 0xFFFD, 0x203A, 461 + 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, // 0xA0–0xAF 462 + 0x00A0, 0x0385, 0x0386, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0xFFFD, 0x00AB, 463 + 0x00AC, 0x00AD, 0x00AE, 0x2015, // 0xB0–0xBF 464 + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x0384, 0x00B5, 0x00B6, 0x00B7, 0x0388, 0x0389, 0x038A, 0x00BB, 465 + 0x038C, 0x00BD, 0x038E, 0x038F, // 0xC0–0xCF 466 + 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x0398, 0x0399, 0x039A, 0x039B, 467 + 0x039C, 0x039D, 0x039E, 0x039F, // 0xD0–0xDF 468 + 0x03A0, 0x03A1, 0xFFFD, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7, 0x03A8, 0x03A9, 0x03AA, 0x03AB, 469 + 0x03AC, 0x03AD, 0x03AE, 0x03AF, // 0xE0–0xEF 470 + 0x03B0, 
0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, 0x03B8, 0x03B9, 0x03BA, 0x03BB, 471 + 0x03BC, 0x03BD, 0x03BE, 0x03BF, // 0xF0–0xFF 472 + 0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, 0x03C8, 0x03C9, 0x03CA, 0x03CB, 473 + 0x03CC, 0x03CD, 0x03CE, 0xFFFD, 474 + ]; 475 + 476 + /// windows-1254 (Turkish) 477 + pub(crate) const WINDOWS_1254: [u16; 128] = [ 478 + // 0x80–0x8F 479 + 0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x02C6, 0x2030, 0x0160, 0x2039, 480 + 0x0152, 0xFFFD, 0xFFFD, 0xFFFD, // 0x90–0x9F 481 + 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x02DC, 0x2122, 0x0161, 0x203A, 482 + 0x0153, 0xFFFD, 0xFFFD, 0x0178, // 0xA0–0xAF 483 + 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x00AA, 0x00AB, 484 + 0x00AC, 0x00AD, 0x00AE, 0x00AF, // 0xB0–0xBF 485 + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x00BA, 0x00BB, 486 + 0x00BC, 0x00BD, 0x00BE, 0x00BF, // 0xC0–0xCF 487 + 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, 488 + 0x00CC, 0x00CD, 0x00CE, 0x00CF, // 0xD0–0xDF: differs from 1252 at D0, DD, DE 489 + 0x011E, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, 0x00D8, 0x00D9, 0x00DA, 0x00DB, 490 + 0x00DC, 0x0130, 0x015E, 0x00DF, // 0xE0–0xEF 491 + 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x00EB, 492 + 0x00EC, 0x00ED, 0x00EE, 0x00EF, // 0xF0–0xFF: differs from 1252 at F0, FD, FE 493 + 0x011F, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, 0x00F8, 0x00F9, 0x00FA, 0x00FB, 494 + 0x00FC, 0x0131, 0x015F, 0x00FF, 495 + ]; 496 + 497 + /// windows-1255 (Hebrew) 498 + pub(crate) const WINDOWS_1255: [u16; 128] = [ 499 + // 0x80–0x8F 500 + 0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x02C6, 0x2030, 0xFFFD, 0x2039, 501 + 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, // 0x90–0x9F 502 + 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 
0x2013, 0x2014, 0x02DC, 0x2122, 0xFFFD, 0x203A, 503 + 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, // 0xA0–0xAF 504 + 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x20AA, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x00D7, 0x00AB, 505 + 0x00AC, 0x00AD, 0x00AE, 0x00AF, // 0xB0–0xBF 506 + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x00F7, 0x00BB, 507 + 0x00BC, 0x00BD, 0x00BE, 0x00BF, // 0xC0–0xCF 508 + 0x05B0, 0x05B1, 0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7, 0x05B8, 0x05B9, 0x05BA, 0x05BB, 509 + 0x05BC, 0x05BD, 0x05BE, 0x05BF, // 0xD0–0xDF 510 + 0x05C0, 0x05C1, 0x05C2, 0x05C3, 0x05F0, 0x05F1, 0x05F2, 0x05F3, 0x05F4, 0xFFFD, 0xFFFD, 0xFFFD, 511 + 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, // 0xE0–0xEF 512 + 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7, 0x05D8, 0x05D9, 0x05DA, 0x05DB, 513 + 0x05DC, 0x05DD, 0x05DE, 0x05DF, // 0xF0–0xFF 514 + 0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7, 0x05E8, 0x05E9, 0x05EA, 0xFFFD, 515 + 0xFFFD, 0x200E, 0x200F, 0xFFFD, 516 + ]; 517 + 518 + /// windows-1256 (Arabic) 519 + pub(crate) const WINDOWS_1256: [u16; 128] = [ 520 + // 0x80–0x8F 521 + 0x20AC, 0x067E, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x02C6, 0x2030, 0x0679, 0x2039, 522 + 0x0152, 0x0686, 0x0698, 0x0688, // 0x90–0x9F 523 + 0x06AF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x06A9, 0x2122, 0x0691, 0x203A, 524 + 0x0153, 0x200C, 0x200D, 0x06BA, // 0xA0–0xAF 525 + 0x00A0, 0x060C, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x06BE, 0x00AB, 526 + 0x00AC, 0x00AD, 0x00AE, 0x00AF, // 0xB0–0xBF 527 + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x061B, 0x00BB, 528 + 0x00BC, 0x00BD, 0x00BE, 0x061F, // 0xC0–0xCF 529 + 0x06C1, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627, 0x0628, 0x0629, 0x062A, 0x062B, 530 + 0x062C, 0x062D, 0x062E, 0x062F, // 0xD0–0xDF 531 + 0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x00D7, 0x0637, 0x0638, 0x0639, 0x063A, 532 + 
0x0640, 0x0641, 0x0642, 0x0643, // 0xE0–0xEF 533 + 0x00E0, 0x0644, 0x00E2, 0x0645, 0x0646, 0x0647, 0x0648, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x00EB, 534 + 0x0649, 0x064A, 0x00EE, 0x00EF, // 0xF0–0xFF 535 + 0x064B, 0x064C, 0x064D, 0x064E, 0x00F4, 0x064F, 0x0650, 0x00F7, 0x0651, 0x00F9, 0x0652, 0x00FB, 536 + 0x00FC, 0x200E, 0x200F, 0x06D2, 537 + ]; 538 + 539 + /// windows-1257 (Baltic) 540 + pub(crate) const WINDOWS_1257: [u16; 128] = [ 541 + // 0x80–0x8F 542 + 0x20AC, 0xFFFD, 0x201A, 0xFFFD, 0x201E, 0x2026, 0x2020, 0x2021, 0xFFFD, 0x2030, 0xFFFD, 0x2039, 543 + 0xFFFD, 0x00A8, 0x02C7, 0x00B8, // 0x90–0x9F 544 + 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0xFFFD, 0x2122, 0xFFFD, 0x203A, 545 + 0xFFFD, 0x00AF, 0x02DB, 0xFFFD, // 0xA0–0xAF 546 + 0x00A0, 0xFFFD, 0x00A2, 0x00A3, 0x00A4, 0xFFFD, 0x00A6, 0x00A7, 0x00D8, 0x00A9, 0x0156, 0x00AB, 547 + 0x00AC, 0x00AD, 0x00AE, 0x00C6, // 0xB0–0xBF 548 + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00F8, 0x00B9, 0x0157, 0x00BB, 549 + 0x00BC, 0x00BD, 0x00BE, 0x00E6, // 0xC0–0xCF 550 + 0x0104, 0x012E, 0x0100, 0x0106, 0x00C4, 0x00C5, 0x0118, 0x0112, 0x010C, 0x00C9, 0x0179, 0x0116, 551 + 0x0122, 0x0136, 0x012A, 0x013B, // 0xD0–0xDF 552 + 0x0160, 0x0143, 0x0145, 0x00D3, 0x014C, 0x00D5, 0x00D6, 0x00D7, 0x0172, 0x0141, 0x015A, 0x016A, 553 + 0x00DC, 0x017B, 0x017D, 0x00DF, // 0xE0–0xEF 554 + 0x0105, 0x012F, 0x0101, 0x0107, 0x00E4, 0x00E5, 0x0119, 0x0113, 0x010D, 0x00E9, 0x017A, 0x0117, 555 + 0x0123, 0x0137, 0x012B, 0x013C, // 0xF0–0xFF 556 + 0x0161, 0x0144, 0x0146, 0x00F3, 0x014D, 0x00F5, 0x00F6, 0x00F7, 0x0173, 0x0142, 0x015B, 0x016B, 557 + 0x00FC, 0x017C, 0x017E, 0x02D9, 558 + ]; 559 + 560 + /// windows-1258 (Vietnamese) 561 + pub(crate) const WINDOWS_1258: [u16; 128] = [ 562 + // 0x80–0x8F 563 + 0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x02C6, 0x2030, 0xFFFD, 0x2039, 564 + 0x0152, 0xFFFD, 0xFFFD, 0xFFFD, // 0x90–0x9F 565 + 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 
0x2013, 0x2014, 0x02DC, 0x2122, 0xFFFD, 0x203A, 566 + 0x0153, 0xFFFD, 0xFFFD, 0x0178, // 0xA0–0xAF 567 + 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x00AA, 0x00AB, 568 + 0x00AC, 0x00AD, 0x00AE, 0x00AF, // 0xB0–0xBF 569 + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x00BA, 0x00BB, 570 + 0x00BC, 0x00BD, 0x00BE, 0x00BF, // 0xC0–0xCF 571 + 0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x00C5, 0x00C6, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, 572 + 0x0300, 0x00CD, 0x00CE, 0x00CF, // 0xD0–0xDF 573 + 0x0110, 0x00D1, 0x0309, 0x00D3, 0x00D4, 0x01A0, 0x00D6, 0x00D7, 0x00D8, 0x00D9, 0x00DA, 0x00DB, 574 + 0x00DC, 0x01AF, 0x0303, 0x00DF, // 0xE0–0xEF 575 + 0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x00E5, 0x00E6, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x00EB, 576 + 0x0301, 0x00ED, 0x00EE, 0x00EF, // 0xF0–0xFF 577 + 0x0111, 0x00F1, 0x0323, 0x00F3, 0x00F4, 0x01A1, 0x00F6, 0x00F7, 0x00F8, 0x00F9, 0x00FA, 0x00FB, 578 + 0x00FC, 0x01B0, 0x20AB, 0x00FF, 579 + ]; 580 + 581 + /// x-mac-cyrillic 582 + pub(crate) const X_MAC_CYRILLIC: [u16; 128] = [ 583 + // 0x80–0x8F 584 + 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 585 + 0x041C, 0x041D, 0x041E, 0x041F, // 0x90–0x9F 586 + 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 587 + 0x042C, 0x042D, 0x042E, 0x042F, // 0xA0–0xAF 588 + 0x2020, 0x00B0, 0x0490, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x0406, 0x00AE, 0x00A9, 0x2122, 0x0402, 589 + 0x0452, 0x2260, 0x0403, 0x0453, // 0xB0–0xBF 590 + 0x221E, 0x00B1, 0x2264, 0x2265, 0x0456, 0x00B5, 0x0491, 0x0408, 0x0404, 0x0454, 0x0407, 0x0457, 591 + 0x0409, 0x0459, 0x040A, 0x045A, // 0xC0–0xCF 592 + 0x0458, 0x0405, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB, 0x00BB, 0x2026, 0x00A0, 0x040B, 593 + 0x045B, 0x040C, 0x045C, 0x0455, // 0xD0–0xDF 594 + 0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x201E, 0x040E, 0x045E, 0x040F, 0x045F, 595 + 0x2116, 
0x0401, 0x0451, 0x044F, // 0xE0–0xEF 596 + 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 597 + 0x043C, 0x043D, 0x043E, 0x043F, // 0xF0–0xFF 598 + 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 599 + 0x044C, 0x044D, 0x044E, 0x20AC, 600 + ]; 601 + 602 + /// Return the lookup table for a single-byte encoding variant. 603 + pub(crate) fn table_for(encoding: &crate::Encoding) -> Option<&'static [u16; 128]> { 604 + use crate::Encoding::*; 605 + match encoding { 606 + Windows1252 => Some(&WINDOWS_1252), 607 + Iso8859_2 => Some(&ISO_8859_2), 608 + Iso8859_3 => Some(&ISO_8859_3), 609 + Iso8859_4 => Some(&ISO_8859_4), 610 + Iso8859_5 => Some(&ISO_8859_5), 611 + Iso8859_6 => Some(&ISO_8859_6), 612 + Iso8859_7 => Some(&ISO_8859_7), 613 + Iso8859_8 | Iso8859_8I => Some(&ISO_8859_8), 614 + Iso8859_10 => Some(&ISO_8859_10), 615 + Iso8859_13 => Some(&ISO_8859_13), 616 + Iso8859_14 => Some(&ISO_8859_14), 617 + Iso8859_15 => Some(&ISO_8859_15), 618 + Iso8859_16 => Some(&ISO_8859_16), 619 + Koi8R => Some(&KOI8_R), 620 + Koi8U => Some(&KOI8_U), 621 + Macintosh => Some(&MACINTOSH), 622 + Ibm866 => Some(&IBM866), 623 + Windows874 => Some(&WINDOWS_874), 624 + Windows1250 => Some(&WINDOWS_1250), 625 + Windows1251 => Some(&WINDOWS_1251), 626 + Windows1253 => Some(&WINDOWS_1253), 627 + Windows1254 => Some(&WINDOWS_1254), 628 + Windows1255 => Some(&WINDOWS_1255), 629 + Windows1256 => Some(&WINDOWS_1256), 630 + Windows1257 => Some(&WINDOWS_1257), 631 + Windows1258 => Some(&WINDOWS_1258), 632 + XMacCyrillic => Some(&X_MAC_CYRILLIC), 633 + _ => None, 634 + } 635 + } 636 + 637 + // --------------------------------------------------------------------------- 638 + // Tests 639 + // --------------------------------------------------------------------------- 640 + 641 + #[cfg(test)] 642 + mod tests { 643 + use super::*; 644 + use crate::error::EncodingError; 645 + 646 + fn decode_replace(bytes: &[u8], 
table: &[u16; 128]) -> String { 647 + decode_single_byte(bytes, table, "test", ErrorMode::Replacement).unwrap() 648 + } 649 + 650 + // -- Basic ASCII passthrough -- 651 + 652 + #[test] 653 + fn ascii_passthrough() { 654 + assert_eq!( 655 + decode_replace(b"Hello, world!", &WINDOWS_1252), 656 + "Hello, world!" 657 + ); 658 + } 659 + 660 + #[test] 661 + fn empty_input() { 662 + assert_eq!(decode_replace(b"", &WINDOWS_1252), ""); 663 + } 664 + 665 + #[test] 666 + fn null_byte() { 667 + assert_eq!(decode_replace(&[0x00], &WINDOWS_1252), "\0"); 668 + } 669 + 670 + // -- Windows-1252 -- 671 + 672 + #[test] 673 + fn windows_1252_euro() { 674 + // 0x80 → U+20AC (€) 675 + assert_eq!(decode_replace(&[0x80], &WINDOWS_1252), "\u{20AC}"); 676 + } 677 + 678 + #[test] 679 + fn windows_1252_smart_quotes() { 680 + // 0x93 → U+201C (") and 0x94 → U+201D (") 681 + assert_eq!( 682 + decode_replace(&[0x93, 0x94], &WINDOWS_1252), 683 + "\u{201C}\u{201D}" 684 + ); 685 + } 686 + 687 + #[test] 688 + fn windows_1252_trademark() { 689 + // 0x99 → U+2122 (™) 690 + assert_eq!(decode_replace(&[0x99], &WINDOWS_1252), "\u{2122}"); 691 + } 692 + 693 + #[test] 694 + fn windows_1252_high_latin() { 695 + // 0xE9 → U+00E9 (é) 696 + assert_eq!(decode_replace(&[0xE9], &WINDOWS_1252), "\u{00E9}"); 697 + } 698 + 699 + #[test] 700 + fn windows_1252_mixed() { 701 + // "Caf" + 0xE9 → "Café" 702 + assert_eq!( 703 + decode_replace(&[0x43, 0x61, 0x66, 0xE9], &WINDOWS_1252), 704 + "Caf\u{00E9}" 705 + ); 706 + } 707 + 708 + // -- ISO-8859-2 (Central European) -- 709 + 710 + #[test] 711 + fn iso_8859_2_polish() { 712 + // 0xA1 → U+0104 (Ą), 0xB1 → U+0105 (ą) 713 + assert_eq!(decode_replace(&[0xA1], &ISO_8859_2), "\u{0104}"); 714 + assert_eq!(decode_replace(&[0xB1], &ISO_8859_2), "\u{0105}"); 715 + } 716 + 717 + #[test] 718 + fn iso_8859_2_czech() { 719 + // 0xC8 → U+010C (Č), 0xE8 → U+010D (č) 720 + assert_eq!(decode_replace(&[0xC8], &ISO_8859_2), "\u{010C}"); 721 + assert_eq!(decode_replace(&[0xE8], &ISO_8859_2), 
"\u{010D}"); 722 + } 723 + 724 + // -- ISO-8859-5 (Cyrillic) -- 725 + 726 + #[test] 727 + fn iso_8859_5_cyrillic() { 728 + // 0xB0 → U+0410 (А), 0xD0 → U+0430 (а) 729 + assert_eq!(decode_replace(&[0xB0], &ISO_8859_5), "\u{0410}"); 730 + assert_eq!(decode_replace(&[0xD0], &ISO_8859_5), "\u{0430}"); 731 + } 732 + 733 + // -- ISO-8859-7 (Greek) -- 734 + 735 + #[test] 736 + fn iso_8859_7_greek() { 737 + // 0xC1 → U+0391 (Α), 0xE1 → U+03B1 (α) 738 + assert_eq!(decode_replace(&[0xC1], &ISO_8859_7), "\u{0391}"); 739 + assert_eq!(decode_replace(&[0xE1], &ISO_8859_7), "\u{03B1}"); 740 + } 741 + 742 + // -- ISO-8859-15 (Latin-9) -- 743 + 744 + #[test] 745 + fn iso_8859_15_euro() { 746 + // 0xA4 → U+20AC (€) — differs from ISO-8859-1 747 + assert_eq!(decode_replace(&[0xA4], &ISO_8859_15), "\u{20AC}"); 748 + } 749 + 750 + #[test] 751 + fn iso_8859_15_oe_ligature() { 752 + // 0xBC → U+0152 (Œ), 0xBD → U+0153 (œ) 753 + assert_eq!(decode_replace(&[0xBC], &ISO_8859_15), "\u{0152}"); 754 + assert_eq!(decode_replace(&[0xBD], &ISO_8859_15), "\u{0153}"); 755 + } 756 + 757 + // -- KOI8-R (Russian) -- 758 + 759 + #[test] 760 + fn koi8_r_cyrillic() { 761 + // 0xC1 → U+0430 (а), 0xE1 → U+0410 (А) 762 + assert_eq!(decode_replace(&[0xC1], &KOI8_R), "\u{0430}"); 763 + assert_eq!(decode_replace(&[0xE1], &KOI8_R), "\u{0410}"); 764 + } 765 + 766 + #[test] 767 + fn koi8_r_copyright() { 768 + // 0xBF → U+00A9 (©) 769 + assert_eq!(decode_replace(&[0xBF], &KOI8_R), "\u{00A9}"); 770 + } 771 + 772 + // -- Windows-1251 (Cyrillic) -- 773 + 774 + #[test] 775 + fn windows_1251_cyrillic() { 776 + // 0xC0 → U+0410 (А), 0xE0 → U+0430 (а) 777 + assert_eq!(decode_replace(&[0xC0], &WINDOWS_1251), "\u{0410}"); 778 + assert_eq!(decode_replace(&[0xE0], &WINDOWS_1251), "\u{0430}"); 779 + } 780 + 781 + #[test] 782 + fn windows_1251_euro() { 783 + // 0x88 → U+20AC (€) 784 + assert_eq!(decode_replace(&[0x88], &WINDOWS_1251), "\u{20AC}"); 785 + } 786 + 787 + // -- macintosh -- 788 + 789 + #[test] 790 + fn 
macintosh_special() { 791 + // 0x80 → U+00C4 (Ä), 0xCA → U+00A0 (NBSP) 792 + assert_eq!(decode_replace(&[0x80], &MACINTOSH), "\u{00C4}"); 793 + assert_eq!(decode_replace(&[0xCA], &MACINTOSH), "\u{00A0}"); 794 + } 795 + 796 + // -- IBM866 -- 797 + 798 + #[test] 799 + fn ibm866_cyrillic() { 800 + // 0x80 → U+0410 (А), 0xA0 → U+0430 (а) 801 + assert_eq!(decode_replace(&[0x80], &IBM866), "\u{0410}"); 802 + assert_eq!(decode_replace(&[0xA0], &IBM866), "\u{0430}"); 803 + } 804 + 805 + // -- Unmapped bytes -- 806 + 807 + #[test] 808 + fn unmapped_replacement() { 809 + // ISO-8859-3 has unmapped bytes, e.g. 0xA5 810 + assert_eq!(decode_replace(&[0xA5], &ISO_8859_3), "\u{FFFD}"); 811 + } 812 + 813 + #[test] 814 + fn unmapped_fatal() { 815 + let err = decode_single_byte(&[0x41, 0xA5], &ISO_8859_3, "ISO-8859-3", ErrorMode::Fatal) 816 + .unwrap_err(); 817 + assert!(matches!( 818 + err, 819 + EncodingError::InvalidSequence { 820 + encoding: "ISO-8859-3", 821 + position: 1 822 + } 823 + )); 824 + } 825 + 826 + // -- Table sizes -- 827 + 828 + #[test] 829 + fn all_tables_128_entries() { 830 + assert_eq!(WINDOWS_1252.len(), 128); 831 + assert_eq!(ISO_8859_2.len(), 128); 832 + assert_eq!(ISO_8859_3.len(), 128); 833 + assert_eq!(ISO_8859_4.len(), 128); 834 + assert_eq!(ISO_8859_5.len(), 128); 835 + assert_eq!(ISO_8859_6.len(), 128); 836 + assert_eq!(ISO_8859_7.len(), 128); 837 + assert_eq!(ISO_8859_8.len(), 128); 838 + assert_eq!(ISO_8859_10.len(), 128); 839 + assert_eq!(ISO_8859_13.len(), 128); 840 + assert_eq!(ISO_8859_14.len(), 128); 841 + assert_eq!(ISO_8859_15.len(), 128); 842 + assert_eq!(ISO_8859_16.len(), 128); 843 + assert_eq!(KOI8_R.len(), 128); 844 + assert_eq!(KOI8_U.len(), 128); 845 + assert_eq!(MACINTOSH.len(), 128); 846 + assert_eq!(IBM866.len(), 128); 847 + assert_eq!(WINDOWS_874.len(), 128); 848 + assert_eq!(WINDOWS_1250.len(), 128); 849 + assert_eq!(WINDOWS_1251.len(), 128); 850 + assert_eq!(WINDOWS_1253.len(), 128); 851 + assert_eq!(WINDOWS_1254.len(), 128); 852 + 
assert_eq!(WINDOWS_1255.len(), 128); 853 + assert_eq!(WINDOWS_1256.len(), 128); 854 + assert_eq!(WINDOWS_1257.len(), 128); 855 + assert_eq!(WINDOWS_1258.len(), 128); 856 + assert_eq!(X_MAC_CYRILLIC.len(), 128); 857 + } 858 + }