Implement WHATWG Encoding: UTF-8 and UTF-16 codecs

+70

crates/encoding/src/error.rs

//! Encoding error types.

use std::fmt;

/// Errors that can occur during encoding/decoding operations.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum EncodingError {
    /// An invalid byte sequence was encountered in fatal (strict) mode.
    InvalidSequence {
        /// Name of the encoding that was being decoded (e.g. `"UTF-8"`).
        encoding: &'static str,
        /// Position at which the invalid sequence was reported.
        position: usize,
    },
    /// The requested encoding label is not recognized.
    UnknownLabel(String),
    /// The encoding does not support the encode operation.
    /// Per WHATWG spec, UTF-16BE/LE are decode-only.
    EncodeNotSupported {
        /// Name of the encoding for which encoding was requested.
        encoding: &'static str,
    },
}

impl fmt::Display for EncodingError {
    /// Writes a short, human-readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::InvalidSequence { encoding, position } => {
                write!(
                    f,
                    "invalid byte sequence in {encoding} at position {position}"
                )
            }
            Self::UnknownLabel(label) => {
                write!(f, "unknown encoding label: {label}")
            }
            Self::EncodeNotSupported { encoding } => {
                write!(f, "encode not supported for {encoding}")
            }
        }
    }
}

// Implementing the standard error trait lets callers box this type as
// `dyn Error` and propagate it with `?` through generic error plumbing.
// No variant wraps another error, so the default `source()` (None) is correct.
impl std::error::Error for EncodingError {}

/// Result alias used throughout the crate, with [`EncodingError`] as the error type.
pub type Result<T> = std::result::Result<T, EncodingError>;

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn display_invalid_sequence() {
        let err = EncodingError::InvalidSequence {
            encoding: "UTF-8",
            position: 5,
        };
        assert_eq!(
            err.to_string(),
            "invalid byte sequence in UTF-8 at position 5"
        );
    }

    #[test]
    fn display_unknown_label() {
        let err = EncodingError::UnknownLabel("bogus".to_string());
        assert_eq!(err.to_string(), "unknown encoding label: bogus");
    }

    #[test]
    fn display_encode_not_supported() {
        let err = EncodingError::EncodeNotSupported {
            encoding: "UTF-16LE",
        };
        assert_eq!(err.to_string(), "encode not supported for UTF-16LE");
    }

    #[test]
    fn implements_std_error() {
        let err: Box<dyn std::error::Error> = Box::new(EncodingError::EncodeNotSupported {
            encoding: "UTF-16BE",
        });
        assert_eq!(err.to_string(), "encode not supported for UTF-16BE");
        assert!(err.source().is_none());
    }
}

+341 -1

crates/encoding/src/lib.rs

··· 1 - //! WHATWG Encoding Standard — all encodings, pure Rust. 1 + //! WHATWG Encoding Standard — UTF-8 and UTF-16 codecs, pure Rust. 2 + 3 + pub mod error; 4 + mod utf16; 5 + mod utf8; 6 + 7 + use error::{EncodingError, Result}; 8 + use utf8::ErrorMode; 9 + 10 + // --------------------------------------------------------------------------- 11 + // Encoding enum 12 + // --------------------------------------------------------------------------- 13 + 14 + /// Supported text encodings per WHATWG Encoding Standard. 15 + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] 16 + pub enum Encoding { 17 + Utf8, 18 + Utf16Be, 19 + Utf16Le, 20 + } 21 + 22 + impl Encoding { 23 + /// Canonical name per WHATWG spec. 24 + pub fn name(&self) -> &'static str { 25 + match self { 26 + Self::Utf8 => "UTF-8", 27 + Self::Utf16Be => "UTF-16BE", 28 + Self::Utf16Le => "UTF-16LE", 29 + } 30 + } 31 + } 32 + 33 + // --------------------------------------------------------------------------- 34 + // Label lookup (WHATWG Encoding Standard §4.2) 35 + // --------------------------------------------------------------------------- 36 + 37 + /// WHATWG encoding label mappings. 38 + /// Labels are stored in lowercase; lookup normalizes input to lowercase. 
39 + const ENCODING_LABELS: &[(&str, Encoding)] = &[ 40 + // UTF-8 labels 41 + ("unicode-1-1-utf-8", Encoding::Utf8), 42 + ("unicode11utf8", Encoding::Utf8), 43 + ("unicode20utf8", Encoding::Utf8), 44 + ("utf-8", Encoding::Utf8), 45 + ("utf8", Encoding::Utf8), 46 + ("x-unicode20utf8", Encoding::Utf8), 47 + // UTF-16BE labels 48 + ("unicodefffe", Encoding::Utf16Be), 49 + ("utf-16be", Encoding::Utf16Be), 50 + // UTF-16LE labels 51 + ("csunicode", Encoding::Utf16Le), 52 + ("iso-10646-ucs-2", Encoding::Utf16Le), 53 + ("ucs-2", Encoding::Utf16Le), 54 + ("unicode", Encoding::Utf16Le), 55 + ("unicodefeff", Encoding::Utf16Le), 56 + ("utf-16", Encoding::Utf16Le), 57 + ("utf-16le", Encoding::Utf16Le), 58 + ]; 59 + 60 + /// Look up an encoding by its WHATWG label. 61 + /// 62 + /// Strips leading/trailing ASCII whitespace and compares case-insensitively, 63 + /// per the WHATWG Encoding Standard. 64 + pub fn lookup(label: &str) -> Option<Encoding> { 65 + let trimmed = trim_ascii_whitespace(label); 66 + if trimmed.is_empty() { 67 + return None; 68 + } 69 + for &(name, enc) in ENCODING_LABELS { 70 + if ascii_eq_ignore_case(trimmed, name) { 71 + return Some(enc); 72 + } 73 + } 74 + None 75 + } 76 + 77 + /// Sniff BOM from the start of a byte slice. 78 + /// 79 + /// Returns the detected encoding (if any) and the remaining bytes after the BOM. 
80 + pub fn bom_sniff(bytes: &[u8]) -> (Option<Encoding>, &[u8]) { 81 + if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF { 82 + (Some(Encoding::Utf8), &bytes[3..]) 83 + } else if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF { 84 + (Some(Encoding::Utf16Be), &bytes[2..]) 85 + } else if bytes.len() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE { 86 + (Some(Encoding::Utf16Le), &bytes[2..]) 87 + } else { 88 + (None, bytes) 89 + } 90 + } 91 + 92 + // --------------------------------------------------------------------------- 93 + // Public API 94 + // --------------------------------------------------------------------------- 95 + 96 + /// Decode bytes to a `String` using the given encoding. 97 + /// 98 + /// Invalid sequences are replaced with U+FFFD (replacement mode per WHATWG spec). 99 + pub fn decode(bytes: &[u8], encoding: Encoding) -> String { 100 + // Replacement mode never fails 101 + match encoding { 102 + Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Replacement).unwrap(), 103 + Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Replacement).unwrap(), 104 + Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Replacement).unwrap(), 105 + } 106 + } 107 + 108 + /// Decode bytes to a `String`, returning an error on any invalid sequence. 109 + /// 110 + /// Fatal mode per WHATWG spec — returns `Err` on the first invalid byte sequence. 111 + pub fn decode_strict(bytes: &[u8], encoding: Encoding) -> Result<String> { 112 + match encoding { 113 + Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Fatal), 114 + Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Fatal), 115 + Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Fatal), 116 + } 117 + } 118 + 119 + /// Encode a string to bytes using the given encoding. 120 + /// 121 + /// Only UTF-8 encoding is supported for encode. Per WHATWG spec, UTF-16 122 + /// encodings are decode-only. 
123 + pub fn encode(text: &str, encoding: Encoding) -> Result<Vec<u8>> { 124 + match encoding { 125 + Encoding::Utf8 => Ok(utf8::encode_utf8(text)), 126 + Encoding::Utf16Be => Err(EncodingError::EncodeNotSupported { 127 + encoding: "UTF-16BE", 128 + }), 129 + Encoding::Utf16Le => Err(EncodingError::EncodeNotSupported { 130 + encoding: "UTF-16LE", 131 + }), 132 + } 133 + } 134 + 135 + // --------------------------------------------------------------------------- 136 + // Internal helpers 137 + // --------------------------------------------------------------------------- 138 + 139 + /// ASCII whitespace per WHATWG spec: TAB, LF, FF, CR, SPACE. 140 + fn trim_ascii_whitespace(s: &str) -> &str { 141 + let bytes = s.as_bytes(); 142 + let start = bytes 143 + .iter() 144 + .position(|&b| !is_ascii_whitespace(b)) 145 + .unwrap_or(bytes.len()); 146 + let end = bytes 147 + .iter() 148 + .rposition(|&b| !is_ascii_whitespace(b)) 149 + .map(|p| p + 1) 150 + .unwrap_or(0); 151 + if start >= end { 152 + return ""; 153 + } 154 + &s[start..end] 155 + } 156 + 157 + fn is_ascii_whitespace(b: u8) -> bool { 158 + matches!(b, 0x09 | 0x0A | 0x0C | 0x0D | 0x20) 159 + } 160 + 161 + fn ascii_eq_ignore_case(a: &str, b: &str) -> bool { 162 + a.eq_ignore_ascii_case(b) 163 + } 164 + 165 + // --------------------------------------------------------------------------- 166 + // Tests 167 + // --------------------------------------------------------------------------- 168 + 169 + #[cfg(test)] 170 + mod tests { 171 + use super::*; 172 + 173 + // -- Encoding enum -- 174 + 175 + #[test] 176 + fn encoding_names() { 177 + assert_eq!(Encoding::Utf8.name(), "UTF-8"); 178 + assert_eq!(Encoding::Utf16Be.name(), "UTF-16BE"); 179 + assert_eq!(Encoding::Utf16Le.name(), "UTF-16LE"); 180 + } 181 + 182 + // -- Label lookup -- 183 + 184 + #[test] 185 + fn lookup_utf8_labels() { 186 + assert_eq!(lookup("utf-8"), Some(Encoding::Utf8)); 187 + assert_eq!(lookup("UTF-8"), Some(Encoding::Utf8)); 188 + 
assert_eq!(lookup("utf8"), Some(Encoding::Utf8)); 189 + assert_eq!(lookup("Utf8"), Some(Encoding::Utf8)); 190 + assert_eq!(lookup("unicode-1-1-utf-8"), Some(Encoding::Utf8)); 191 + assert_eq!(lookup("x-unicode20utf8"), Some(Encoding::Utf8)); 192 + } 193 + 194 + #[test] 195 + fn lookup_utf16_labels() { 196 + assert_eq!(lookup("utf-16be"), Some(Encoding::Utf16Be)); 197 + assert_eq!(lookup("UTF-16BE"), Some(Encoding::Utf16Be)); 198 + assert_eq!(lookup("unicodefffe"), Some(Encoding::Utf16Be)); 199 + assert_eq!(lookup("utf-16le"), Some(Encoding::Utf16Le)); 200 + assert_eq!(lookup("utf-16"), Some(Encoding::Utf16Le)); 201 + assert_eq!(lookup("unicode"), Some(Encoding::Utf16Le)); 202 + assert_eq!(lookup("ucs-2"), Some(Encoding::Utf16Le)); 203 + assert_eq!(lookup("iso-10646-ucs-2"), Some(Encoding::Utf16Le)); 204 + } 205 + 206 + #[test] 207 + fn lookup_with_whitespace() { 208 + assert_eq!(lookup(" utf-8 "), Some(Encoding::Utf8)); 209 + assert_eq!(lookup("\tutf-8\n"), Some(Encoding::Utf8)); 210 + assert_eq!(lookup("\r\nutf-16le\r\n"), Some(Encoding::Utf16Le)); 211 + } 212 + 213 + #[test] 214 + fn lookup_unknown() { 215 + assert_eq!(lookup("latin1"), None); 216 + assert_eq!(lookup(""), None); 217 + assert_eq!(lookup(" "), None); 218 + assert_eq!(lookup("utf-99"), None); 219 + } 220 + 221 + // -- BOM sniffing -- 222 + 223 + #[test] 224 + fn bom_utf8() { 225 + let (enc, rest) = bom_sniff(&[0xEF, 0xBB, 0xBF, 0x41]); 226 + assert_eq!(enc, Some(Encoding::Utf8)); 227 + assert_eq!(rest, &[0x41]); 228 + } 229 + 230 + #[test] 231 + fn bom_utf16be() { 232 + let (enc, rest) = bom_sniff(&[0xFE, 0xFF, 0x00, 0x41]); 233 + assert_eq!(enc, Some(Encoding::Utf16Be)); 234 + assert_eq!(rest, &[0x00, 0x41]); 235 + } 236 + 237 + #[test] 238 + fn bom_utf16le() { 239 + let (enc, rest) = bom_sniff(&[0xFF, 0xFE, 0x41, 0x00]); 240 + assert_eq!(enc, Some(Encoding::Utf16Le)); 241 + assert_eq!(rest, &[0x41, 0x00]); 242 + } 243 + 244 + #[test] 245 + fn bom_none() { 246 + let data = [0x41, 0x42, 0x43]; 247 + 
let (enc, rest) = bom_sniff(&data); 248 + assert_eq!(enc, None); 249 + assert_eq!(rest, &data); 250 + } 251 + 252 + #[test] 253 + fn bom_empty() { 254 + let (enc, rest) = bom_sniff(&[]); 255 + assert_eq!(enc, None); 256 + assert_eq!(rest, &[] as &[u8]); 257 + } 258 + 259 + #[test] 260 + fn bom_short() { 261 + let (enc, rest) = bom_sniff(&[0xEF, 0xBB]); 262 + assert_eq!(enc, None); 263 + assert_eq!(rest, &[0xEF, 0xBB]); 264 + } 265 + 266 + // -- Top-level decode -- 267 + 268 + #[test] 269 + fn decode_utf8_basic() { 270 + assert_eq!(decode(b"Hello", Encoding::Utf8), "Hello"); 271 + } 272 + 273 + #[test] 274 + fn decode_utf8_invalid_replaces() { 275 + assert_eq!(decode(&[0xFF], Encoding::Utf8), "\u{FFFD}"); 276 + } 277 + 278 + #[test] 279 + fn decode_utf16le_basic() { 280 + assert_eq!(decode(&[0x41, 0x00], Encoding::Utf16Le), "A"); 281 + } 282 + 283 + #[test] 284 + fn decode_utf16be_basic() { 285 + assert_eq!(decode(&[0x00, 0x41], Encoding::Utf16Be), "A"); 286 + } 287 + 288 + // -- Top-level decode_strict -- 289 + 290 + #[test] 291 + fn decode_strict_valid() { 292 + assert_eq!(decode_strict(b"Hello", Encoding::Utf8).unwrap(), "Hello"); 293 + } 294 + 295 + #[test] 296 + fn decode_strict_invalid() { 297 + assert!(decode_strict(&[0xFF], Encoding::Utf8).is_err()); 298 + } 299 + 300 + // -- Top-level encode -- 301 + 302 + #[test] 303 + fn encode_utf8_basic() { 304 + assert_eq!(encode("Hello", Encoding::Utf8).unwrap(), b"Hello"); 305 + } 306 + 307 + #[test] 308 + fn encode_utf16_not_supported() { 309 + assert!(matches!( 310 + encode("Hello", Encoding::Utf16Le), 311 + Err(EncodingError::EncodeNotSupported { 312 + encoding: "UTF-16LE" 313 + }) 314 + )); 315 + assert!(matches!( 316 + encode("Hello", Encoding::Utf16Be), 317 + Err(EncodingError::EncodeNotSupported { 318 + encoding: "UTF-16BE" 319 + }) 320 + )); 321 + } 322 + 323 + // -- Trim helpers -- 324 + 325 + #[test] 326 + fn trim_ascii_whitespace_basic() { 327 + assert_eq!(trim_ascii_whitespace(" hello "), "hello"); 328 + 
assert_eq!(trim_ascii_whitespace("hello"), "hello"); 329 + assert_eq!(trim_ascii_whitespace(""), ""); 330 + assert_eq!(trim_ascii_whitespace(" "), ""); 331 + assert_eq!(trim_ascii_whitespace("\t\nhello\r\n"), "hello"); 332 + } 333 + 334 + #[test] 335 + fn ascii_eq_ignore_case_basic() { 336 + assert!(ascii_eq_ignore_case("utf-8", "UTF-8")); 337 + assert!(ascii_eq_ignore_case("Utf-8", "utf-8")); 338 + assert!(!ascii_eq_ignore_case("utf-8", "utf-9")); 339 + assert!(!ascii_eq_ignore_case("utf-8", "utf-8x")); 340 + } 341 + }

+390

crates/encoding/src/utf16.rs

··· 1 + //! UTF-16 decoder per WHATWG Encoding Standard. 2 + 3 + use crate::error::{EncodingError, Result}; 4 + use crate::utf8::ErrorMode; 5 + 6 + /// Decode a byte slice as UTF-16LE. 7 + pub(crate) fn decode_utf16le(bytes: &[u8], mode: ErrorMode) -> Result<String> { 8 + decode_utf16(bytes, false, mode) 9 + } 10 + 11 + /// Decode a byte slice as UTF-16BE. 12 + pub(crate) fn decode_utf16be(bytes: &[u8], mode: ErrorMode) -> Result<String> { 13 + decode_utf16(bytes, true, mode) 14 + } 15 + 16 + /// Shared UTF-16 decoder (WHATWG Encoding Standard §14.2). 17 + fn decode_utf16(bytes: &[u8], big_endian: bool, mode: ErrorMode) -> Result<String> { 18 + let mut output = String::with_capacity(bytes.len() / 2); 19 + let mut i = 0; 20 + let mut lead_surrogate: Option<u16> = None; 21 + let mut bom_checked = false; 22 + 23 + while i + 1 < bytes.len() { 24 + let code_unit = if big_endian { 25 + ((bytes[i] as u16) << 8) | (bytes[i + 1] as u16) 26 + } else { 27 + ((bytes[i + 1] as u16) << 8) | (bytes[i] as u16) 28 + }; 29 + i += 2; 30 + 31 + // BOM handling: strip BOM matching our endianness at the start 32 + if !bom_checked { 33 + bom_checked = true; 34 + if code_unit == 0xFEFF { 35 + // BOM matches our endianness — consume it 36 + continue; 37 + } 38 + // 0xFFFE is NOT treated as a BOM — fall through to normal processing 39 + } 40 + 41 + if is_lead_surrogate(code_unit) { 42 + // If we already have an unpaired lead, emit error for it 43 + if let Some(_prev) = lead_surrogate { 44 + if mode == ErrorMode::Fatal { 45 + return Err(EncodingError::InvalidSequence { 46 + encoding: encoding_name(big_endian), 47 + position: i - 4, // position of the previous unpaired lead 48 + }); 49 + } 50 + output.push('\u{FFFD}'); 51 + } 52 + lead_surrogate = Some(code_unit); 53 + } else if is_trail_surrogate(code_unit) { 54 + if let Some(lead) = lead_surrogate.take() { 55 + // Valid surrogate pair — compute supplementary code point 56 + let cp = 0x10000 + ((lead as u32 - 0xD800) << 10) + (code_unit as 
u32 - 0xDC00); 57 + let ch = char::from_u32(cp).unwrap_or('\u{FFFD}'); 58 + output.push(ch); 59 + } else { 60 + // Trail surrogate without lead 61 + if mode == ErrorMode::Fatal { 62 + return Err(EncodingError::InvalidSequence { 63 + encoding: encoding_name(big_endian), 64 + position: i - 2, 65 + }); 66 + } 67 + output.push('\u{FFFD}'); 68 + } 69 + } else { 70 + // Regular BMP character 71 + if let Some(_lead) = lead_surrogate.take() { 72 + // Unpaired lead surrogate before this code unit 73 + if mode == ErrorMode::Fatal { 74 + return Err(EncodingError::InvalidSequence { 75 + encoding: encoding_name(big_endian), 76 + position: i - 4, 77 + }); 78 + } 79 + output.push('\u{FFFD}'); 80 + } 81 + let ch = char::from_u32(code_unit as u32).unwrap_or('\u{FFFD}'); 82 + output.push(ch); 83 + } 84 + } 85 + 86 + // Handle trailing single byte (odd byte count) 87 + if i < bytes.len() { 88 + // Flush any pending lead surrogate first 89 + if lead_surrogate.take().is_some() { 90 + if mode == ErrorMode::Fatal { 91 + return Err(EncodingError::InvalidSequence { 92 + encoding: encoding_name(big_endian), 93 + position: i - 2, 94 + }); 95 + } 96 + output.push('\u{FFFD}'); 97 + } 98 + if mode == ErrorMode::Fatal { 99 + return Err(EncodingError::InvalidSequence { 100 + encoding: encoding_name(big_endian), 101 + position: i, 102 + }); 103 + } 104 + output.push('\u{FFFD}'); 105 + } else if lead_surrogate.is_some() { 106 + // Unpaired lead surrogate at end of input 107 + if mode == ErrorMode::Fatal { 108 + return Err(EncodingError::InvalidSequence { 109 + encoding: encoding_name(big_endian), 110 + position: i - 2, 111 + }); 112 + } 113 + output.push('\u{FFFD}'); 114 + } 115 + 116 + Ok(output) 117 + } 118 + 119 + fn is_lead_surrogate(cu: u16) -> bool { 120 + (0xD800..=0xDBFF).contains(&cu) 121 + } 122 + 123 + fn is_trail_surrogate(cu: u16) -> bool { 124 + (0xDC00..=0xDFFF).contains(&cu) 125 + } 126 + 127 + fn encoding_name(big_endian: bool) -> &'static str { 128 + if big_endian { 129 + 
"UTF-16BE" 130 + } else { 131 + "UTF-16LE" 132 + } 133 + } 134 + 135 + // --------------------------------------------------------------------------- 136 + // Tests 137 + // --------------------------------------------------------------------------- 138 + 139 + #[cfg(test)] 140 + mod tests { 141 + use super::*; 142 + 143 + fn le(bytes: &[u8]) -> String { 144 + decode_utf16le(bytes, ErrorMode::Replacement).unwrap() 145 + } 146 + 147 + fn be(bytes: &[u8]) -> String { 148 + decode_utf16be(bytes, ErrorMode::Replacement).unwrap() 149 + } 150 + 151 + // -- Basic ASCII -- 152 + 153 + #[test] 154 + fn le_ascii() { 155 + assert_eq!(le(&[0x41, 0x00]), "A"); 156 + } 157 + 158 + #[test] 159 + fn be_ascii() { 160 + assert_eq!(be(&[0x00, 0x41]), "A"); 161 + } 162 + 163 + #[test] 164 + fn le_hello() { 165 + assert_eq!(le(&[0x48, 0x00, 0x69, 0x00]), "Hi"); 166 + } 167 + 168 + #[test] 169 + fn be_hello() { 170 + assert_eq!(be(&[0x00, 0x48, 0x00, 0x69]), "Hi"); 171 + } 172 + 173 + // -- BMP characters -- 174 + 175 + #[test] 176 + fn le_bmp() { 177 + // U+00E9 (e with acute) = 0xE9 0x00 in LE 178 + assert_eq!(le(&[0xE9, 0x00]), "\u{00E9}"); 179 + } 180 + 181 + #[test] 182 + fn be_bmp() { 183 + // U+00E9 in BE = 0x00 0xE9 184 + assert_eq!(be(&[0x00, 0xE9]), "\u{00E9}"); 185 + } 186 + 187 + #[test] 188 + fn le_cjk() { 189 + // U+4E16 = 0x16 0x4E in LE 190 + assert_eq!(le(&[0x16, 0x4E]), "\u{4E16}"); 191 + } 192 + 193 + // -- Surrogate pairs -- 194 + 195 + #[test] 196 + fn le_surrogate_pair() { 197 + // U+1F600 = D83D DE00 in UTF-16 198 + // LE: 3D D8 00 DE 199 + assert_eq!(le(&[0x3D, 0xD8, 0x00, 0xDE]), "\u{1F600}"); 200 + } 201 + 202 + #[test] 203 + fn be_surrogate_pair() { 204 + // U+1F600 = D83D DE00 in UTF-16 205 + // BE: D8 3D DE 00 206 + assert_eq!(be(&[0xD8, 0x3D, 0xDE, 0x00]), "\u{1F600}"); 207 + } 208 + 209 + #[test] 210 + fn le_supplementary_u10000() { 211 + // U+10000 = D800 DC00 212 + // LE: 00 D8 00 DC 213 + assert_eq!(le(&[0x00, 0xD8, 0x00, 0xDC]), "\u{10000}"); 214 + } 
215 + 216 + #[test] 217 + fn le_supplementary_u10ffff() { 218 + // U+10FFFF = DBFF DFFF 219 + // LE: FF DB FF DF 220 + assert_eq!(le(&[0xFF, 0xDB, 0xFF, 0xDF]), "\u{10FFFF}"); 221 + } 222 + 223 + // -- Unpaired surrogates -- 224 + 225 + #[test] 226 + fn le_unpaired_lead() { 227 + // Lead surrogate D800 followed by non-surrogate 0041 228 + // LE: 00 D8 41 00 229 + assert_eq!(le(&[0x00, 0xD8, 0x41, 0x00]), "\u{FFFD}A"); 230 + } 231 + 232 + #[test] 233 + fn le_unpaired_trail() { 234 + // Trail surrogate DC00 without lead 235 + // LE: 00 DC 236 + assert_eq!(le(&[0x00, 0xDC]), "\u{FFFD}"); 237 + } 238 + 239 + #[test] 240 + fn le_lead_at_end() { 241 + // Lead surrogate at end of input 242 + assert_eq!(le(&[0x00, 0xD8]), "\u{FFFD}"); 243 + } 244 + 245 + #[test] 246 + fn le_two_leads_in_a_row() { 247 + // Two lead surrogates: D800 D801 — first is unpaired, second is unpaired at end 248 + // LE: 00 D8 01 D8 249 + assert_eq!(le(&[0x00, 0xD8, 0x01, 0xD8]), "\u{FFFD}\u{FFFD}"); 250 + } 251 + 252 + // -- BOM handling -- 253 + 254 + #[test] 255 + fn le_bom_stripped() { 256 + // UTF-16LE BOM: FF FE 257 + assert_eq!(le(&[0xFF, 0xFE, 0x41, 0x00]), "A"); 258 + } 259 + 260 + #[test] 261 + fn be_bom_stripped() { 262 + // UTF-16BE BOM: FE FF 263 + assert_eq!(be(&[0xFE, 0xFF, 0x00, 0x41]), "A"); 264 + } 265 + 266 + #[test] 267 + fn le_wrong_bom_not_stripped() { 268 + // FE FF is NOT the LE BOM — it's U+FEFF (ZWNBSP) 269 + assert_eq!(le(&[0xFE, 0xFF]), "\u{FFFE}"); 270 + } 271 + 272 + #[test] 273 + fn be_wrong_bom_not_stripped() { 274 + // FF FE is NOT the BE BOM — it's U+FFFE 275 + assert_eq!(be(&[0xFF, 0xFE]), "\u{FFFE}"); 276 + } 277 + 278 + #[test] 279 + fn le_bom_only() { 280 + assert_eq!(le(&[0xFF, 0xFE]), ""); 281 + } 282 + 283 + #[test] 284 + fn be_bom_only() { 285 + assert_eq!(be(&[0xFE, 0xFF]), ""); 286 + } 287 + 288 + // -- Odd byte count -- 289 + 290 + #[test] 291 + fn le_odd_byte() { 292 + assert_eq!(le(&[0x41, 0x00, 0x42]), "A\u{FFFD}"); 293 + } 294 + 295 + #[test] 296 + fn 
be_odd_byte() { 297 + assert_eq!(be(&[0x00, 0x41, 0x42]), "A\u{FFFD}"); 298 + } 299 + 300 + #[test] 301 + fn single_byte() { 302 + assert_eq!(le(&[0x41]), "\u{FFFD}"); 303 + } 304 + 305 + // -- Empty input -- 306 + 307 + #[test] 308 + fn empty_le() { 309 + assert_eq!(le(&[]), ""); 310 + } 311 + 312 + #[test] 313 + fn empty_be() { 314 + assert_eq!(be(&[]), ""); 315 + } 316 + 317 + // -- Fatal mode -- 318 + 319 + #[test] 320 + fn fatal_valid_le() { 321 + assert_eq!( 322 + decode_utf16le(&[0x41, 0x00], ErrorMode::Fatal).unwrap(), 323 + "A" 324 + ); 325 + } 326 + 327 + #[test] 328 + fn fatal_unpaired_lead_le() { 329 + let err = decode_utf16le(&[0x00, 0xD8, 0x41, 0x00], ErrorMode::Fatal).unwrap_err(); 330 + assert!(matches!( 331 + err, 332 + EncodingError::InvalidSequence { 333 + encoding: "UTF-16LE", 334 + .. 335 + } 336 + )); 337 + } 338 + 339 + #[test] 340 + fn fatal_unpaired_trail_le() { 341 + let err = decode_utf16le(&[0x00, 0xDC], ErrorMode::Fatal).unwrap_err(); 342 + assert!(matches!( 343 + err, 344 + EncodingError::InvalidSequence { 345 + encoding: "UTF-16LE", 346 + .. 347 + } 348 + )); 349 + } 350 + 351 + #[test] 352 + fn fatal_odd_byte_le() { 353 + let err = decode_utf16le(&[0x41, 0x00, 0x42], ErrorMode::Fatal).unwrap_err(); 354 + assert!(matches!( 355 + err, 356 + EncodingError::InvalidSequence { 357 + encoding: "UTF-16LE", 358 + .. 
359 + } 360 + )); 361 + } 362 + 363 + // -- Mixed content -- 364 + 365 + #[test] 366 + fn le_mixed_bmp_and_supplementary() { 367 + // "A" + U+1F600 + "B" 368 + // LE: 41 00 | 3D D8 00 DE | 42 00 369 + assert_eq!( 370 + le(&[0x41, 0x00, 0x3D, 0xD8, 0x00, 0xDE, 0x42, 0x00]), 371 + "A\u{1F600}B" 372 + ); 373 + } 374 + 375 + #[test] 376 + fn be_mixed_bmp_and_supplementary() { 377 + // "A" + U+1F600 + "B" 378 + // BE: 00 41 | D8 3D DE 00 | 00 42 379 + assert_eq!( 380 + be(&[0x00, 0x41, 0xD8, 0x3D, 0xDE, 0x00, 0x00, 0x42]), 381 + "A\u{1F600}B" 382 + ); 383 + } 384 + 385 + #[test] 386 + fn le_null_character() { 387 + // U+0000 = 00 00 in LE 388 + assert_eq!(le(&[0x00, 0x00]), "\0"); 389 + } 390 + }

+486

crates/encoding/src/utf8.rs

··· 1 + //! UTF-8 decoder and encoder per WHATWG Encoding Standard. 2 + 3 + use crate::error::{EncodingError, Result}; 4 + 5 + /// Error handling mode. 6 + #[derive(Debug, Clone, Copy, PartialEq, Eq)] 7 + pub(crate) enum ErrorMode { 8 + Replacement, 9 + Fatal, 10 + } 11 + 12 + /// Decode a byte slice as UTF-8. 13 + /// 14 + /// In replacement mode, invalid sequences are replaced with U+FFFD. 15 + /// In fatal mode, the first invalid sequence causes an error. 16 + pub(crate) fn decode_utf8(bytes: &[u8], mode: ErrorMode) -> Result<String> { 17 + // Strip UTF-8 BOM if present 18 + let bytes = if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF { 19 + &bytes[3..] 20 + } else { 21 + bytes 22 + }; 23 + 24 + let mut output = String::with_capacity(bytes.len()); 25 + let mut decoder = Utf8Decoder::new(); 26 + let mut i = 0; 27 + 28 + while i < bytes.len() { 29 + match decoder.process_byte(bytes[i]) { 30 + DecoderResult::CodePoint(ch) => { 31 + output.push(ch); 32 + i += 1; 33 + } 34 + DecoderResult::Error(error_pos) => { 35 + if mode == ErrorMode::Fatal { 36 + return Err(EncodingError::InvalidSequence { 37 + encoding: "UTF-8", 38 + position: error_pos, 39 + }); 40 + } 41 + output.push('\u{FFFD}'); 42 + i += 1; 43 + } 44 + DecoderResult::ErrorPrepend(error_pos) => { 45 + if mode == ErrorMode::Fatal { 46 + return Err(EncodingError::InvalidSequence { 47 + encoding: "UTF-8", 48 + position: error_pos, 49 + }); 50 + } 51 + output.push('\u{FFFD}'); 52 + // Do NOT advance i — re-process this byte 53 + } 54 + DecoderResult::Continue => { 55 + i += 1; 56 + } 57 + } 58 + } 59 + 60 + // Handle incomplete sequence at end of input 61 + if decoder.bytes_needed > 0 { 62 + if mode == ErrorMode::Fatal { 63 + return Err(EncodingError::InvalidSequence { 64 + encoding: "UTF-8", 65 + position: bytes.len().saturating_sub(decoder.bytes_seen as usize), 66 + }); 67 + } 68 + output.push('\u{FFFD}'); 69 + } 70 + 71 + Ok(output) 72 + } 73 + 74 + /// Encode a string as UTF-8 
bytes. 75 + /// 76 + /// Since Rust strings are already valid UTF-8, this is a straightforward copy. 77 + pub(crate) fn encode_utf8(text: &str) -> Vec<u8> { 78 + text.as_bytes().to_vec() 79 + } 80 + 81 + // --------------------------------------------------------------------------- 82 + // Streaming UTF-8 decoder (WHATWG Encoding Standard §8.1.1) 83 + // --------------------------------------------------------------------------- 84 + 85 + enum DecoderResult { 86 + /// A valid code point was decoded. 87 + CodePoint(char), 88 + /// An error occurred at the given byte position; advance to next byte. 89 + Error(usize), 90 + /// An error occurred at the given byte position; re-process current byte. 91 + ErrorPrepend(usize), 92 + /// More bytes needed; continue feeding. 93 + Continue, 94 + } 95 + 96 + struct Utf8Decoder { 97 + code_point: u32, 98 + bytes_seen: u8, 99 + bytes_needed: u8, 100 + lower_boundary: u8, 101 + upper_boundary: u8, 102 + /// Position of the start of the current multi-byte sequence. 103 + sequence_start: usize, 104 + /// Total bytes processed so far. 
position: usize,
}

impl Utf8Decoder {
    /// Creates a decoder in its initial state: no multi-byte sequence in
    /// progress, continuation bounds at the default `0x80..=0xBF`, and the
    /// byte position at 0.
    fn new() -> Self {
        Self {
            code_point: 0,
            bytes_seen: 0,
            bytes_needed: 0,
            lower_boundary: 0x80,
            upper_boundary: 0xBF,
            sequence_start: 0,
            position: 0,
        }
    }

    /// Feeds one byte into the state machine and reports what it produced.
    ///
    /// Returns:
    /// - `DecoderResult::CodePoint(ch)` for an ASCII byte, or when `byte`
    ///   completes a multi-byte sequence.
    /// - `DecoderResult::Continue` when `byte` was accepted but the current
    ///   sequence needs more bytes.
    /// - `DecoderResult::Error(pos)` when `byte` is not a valid lead byte;
    ///   `pos` is the index of that byte.
    /// - `DecoderResult::ErrorPrepend(start)` when a sequence in progress is
    ///   interrupted by a byte outside the allowed continuation range;
    ///   `start` is the index of the sequence's lead byte. The decoder state
    ///   is reset and `position` is rewound by one, so the offending byte
    ///   keeps its index when it is processed again.
    fn process_byte(&mut self, byte: u8) -> DecoderResult {
        let pos = self.position;
        self.position += 1;

        if self.bytes_needed == 0 {
            // Lead byte: (continuation bytes needed, payload mask, lower bound,
            // upper bound for the *first* continuation byte). The narrowed
            // bounds reject overlong forms (0xE0, 0xF0), surrogates (0xED) and
            // code points above U+10FFFF (0xF4).
            let (needed, payload_mask, lower, upper): (_, u8, u8, u8) = match byte {
                0x00..=0x7F => return DecoderResult::CodePoint(char::from(byte)),
                0xC2..=0xDF => (1, 0x1F, 0x80, 0xBF),
                0xE0 => (2, 0x0F, 0xA0, 0xBF),
                0xE1..=0xEC | 0xEE..=0xEF => (2, 0x0F, 0x80, 0xBF),
                0xED => (2, 0x0F, 0x80, 0x9F),
                0xF0 => (3, 0x07, 0x90, 0xBF),
                0xF1..=0xF3 => (3, 0x07, 0x80, 0xBF),
                0xF4 => (3, 0x07, 0x80, 0x8F),
                // 0x80..=0xC1, 0xF5..=0xFF: invalid lead byte
                _ => return DecoderResult::Error(pos),
            };

            self.bytes_needed = needed;
            self.lower_boundary = lower;
            self.upper_boundary = upper;
            self.code_point = u32::from(byte & payload_mask);
            self.sequence_start = pos;
            return DecoderResult::Continue;
        }

        // Expecting continuation byte
        if !(self.lower_boundary..=self.upper_boundary).contains(&byte) {
            // Invalid continuation — reset and prepend byte
            let err_pos = self.sequence_start;
            self.reset();
            self.position -= 1; // will be re-processed
            return DecoderResult::ErrorPrepend(err_pos);
        }

        // Valid continuation byte: only the first continuation byte has
        // narrowed bounds, so restore the defaults for the remaining ones.
        self.lower_boundary = 0x80;
        self.upper_boundary = 0xBF;
        self.code_point = (self.code_point << 6) | u32::from(byte & 0x3F);
        self.bytes_seen += 1;

        if self.bytes_seen == self.bytes_needed {
            let cp = self.code_point;
            self.reset();
            // The WHATWG state machine guarantees valid scalar values here,
            // but use fallback for defense-in-depth.
            let ch = char::from_u32(cp).unwrap_or('\u{FFFD}');
            DecoderResult::CodePoint(ch)
        } else {
            DecoderResult::Continue
        }
    }

    /// Clears the in-progress sequence and restores the default continuation
    /// bounds. `position` and `sequence_start` are left untouched.
    fn reset(&mut self) {
        self.code_point = 0;
        self.bytes_seen = 0;
        self.bytes_needed = 0;
        self.lower_boundary = 0x80;
        self.upper_boundary = 0xBF;
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    /// Decodes in replacement mode; never expected to fail.
    fn decode_replace(bytes: &[u8]) -> String {
        decode_utf8(bytes, ErrorMode::Replacement).unwrap()
    }

    /// Decodes in fatal mode, surfacing the first error.
    fn decode_fatal(bytes: &[u8]) -> Result<String> {
        decode_utf8(bytes, ErrorMode::Fatal)
    }

    // -- Basic ASCII --

    #[test]
    fn ascii_roundtrip() {
        assert_eq!(decode_replace(b"Hello, world!"), "Hello, world!");
    }

    #[test]
    fn empty_input() {
        assert_eq!(decode_replace(b""), "");
    }

    #[test]
    fn null_byte() {
        assert_eq!(decode_replace(&[0x00]), "\0");
    }

    // -- Multi-byte sequences --

    #[test]
    fn two_byte_sequence() {
        // U+00E9 (e with acute) = 0xC3 0xA9
        assert_eq!(decode_replace(&[0xC3, 0xA9]), "\u{00E9}");
    }

    #[test]
    fn three_byte_sequence() {
        // U+4E16 (CJK character) = 0xE4 0xB8 0x96
        assert_eq!(decode_replace(&[0xE4, 0xB8, 0x96]), "\u{4E16}");
    }

    #[test]
    fn four_byte_sequence() {
        // U+1F600 (grinning face) = 0xF0 0x9F 0x98 0x80
        assert_eq!(decode_replace(&[0xF0, 0x9F, 0x98, 0x80]), "\u{1F600}");
    }

    #[test]
    fn mixed_ascii_and_multibyte() {
        // "Caf\u{00E9}" = [0x43, 0x61, 0x66, 0xC3, 0xA9]
        assert_eq!(
            decode_replace(&[0x43, 0x61, 0x66, 0xC3, 0xA9]),
            "Caf\u{00E9}"
        );
    }

    // -- BOM handling --

    #[test]
    fn bom_stripped() {
        // UTF-8 BOM + "A"
        assert_eq!(decode_replace(&[0xEF, 0xBB, 0xBF, 0x41]), "A");
    }

    #[test]
    fn bom_only() {
        assert_eq!(decode_replace(&[0xEF, 0xBB, 0xBF]), "");
    }

    // -- Invalid sequences (replacement mode) --

    #[test]
    fn invalid_byte_ff() {
        assert_eq!(decode_replace(&[0xFF]), "\u{FFFD}");
    }

    #[test]
    fn invalid_byte_fe() {
        assert_eq!(decode_replace(&[0xFE]), "\u{FFFD}");
    }

    #[test]
    fn invalid_continuation_byte_standalone() {
        // 0x80 without a lead byte
        assert_eq!(decode_replace(&[0x80]), "\u{FFFD}");
    }

    #[test]
    fn overlong_two_byte() {
        // 0xC0 0xAF is an overlong encoding of U+002F ('/')
        // 0xC0 is always invalid (lead byte rejected), 0xAF is a continuation
        // byte without a lead (also invalid) — both produce U+FFFD
        assert_eq!(decode_replace(&[0xC0, 0xAF]), "\u{FFFD}\u{FFFD}");
    }

    #[test]
    fn truncated_two_byte() {
        // 0xC3 without continuation
        assert_eq!(decode_replace(&[0xC3]), "\u{FFFD}");
    }

    #[test]
    fn truncated_three_byte() {
        // 0xE4 0xB8 without third byte
        assert_eq!(decode_replace(&[0xE4, 0xB8]), "\u{FFFD}");
    }

    #[test]
    fn truncated_four_byte() {
        // 0xF0 0x9F 0x98 without fourth byte
        assert_eq!(decode_replace(&[0xF0, 0x9F, 0x98]), "\u{FFFD}");
    }

    #[test]
    fn surrogate_half_rejected() {
        // U+D800 would encode as 0xED 0xA0 0x80, but surrogates are invalid in UTF-8
        // 0xED with upper_boundary 0x9F rejects 0xA0
        assert_eq!(
            decode_replace(&[0xED, 0xA0, 0x80]),
            "\u{FFFD}\u{FFFD}\u{FFFD}"
        );
    }

    #[test]
    fn invalid_continuation_mid_sequence() {
        // 0xE4 expects continuation, but 0x41 is ASCII — error + prepend
        assert_eq!(decode_replace(&[0xE4, 0x41]), "\u{FFFD}A");
    }

    #[test]
    fn invalid_between_valid() {
        // Valid 'A', invalid 0xFF, valid 'B'
        assert_eq!(decode_replace(&[0x41, 0xFF, 0x42]), "A\u{FFFD}B");
    }

    #[test]
    fn multiple_errors_in_a_row() {
        assert_eq!(
            decode_replace(&[0xFE, 0xFF, 0xFE]),
            "\u{FFFD}\u{FFFD}\u{FFFD}"
        );
    }

    // -- Fatal mode --

    #[test]
    fn fatal_valid() {
        assert_eq!(decode_fatal(b"Hello").unwrap(), "Hello");
    }

    #[test]
    fn fatal_invalid() {
        let err = decode_fatal(&[0x41, 0xFF]).unwrap_err();
        assert!(matches!(
            err,
            EncodingError::InvalidSequence {
                encoding: "UTF-8",
                position: 1
            }
        ));
    }

    #[test]
    fn fatal_truncated() {
        let err = decode_fatal(&[0xC3]).unwrap_err();
        assert!(matches!(
            err,
            EncodingError::InvalidSequence {
                encoding: "UTF-8",
                ..
            }
        ));
    }

    // -- Encoder --

    #[test]
    fn encode_ascii() {
        assert_eq!(encode_utf8("Hello"), b"Hello");
    }

    #[test]
    fn encode_multibyte() {
        assert_eq!(encode_utf8("\u{00E9}"), &[0xC3, 0xA9]);
    }

    #[test]
    fn encode_emoji() {
        assert_eq!(encode_utf8("\u{1F600}"), &[0xF0, 0x9F, 0x98, 0x80]);
    }

    #[test]
    fn encode_empty() {
        assert_eq!(encode_utf8(""), b"");
    }

    #[test]
    fn roundtrip() {
        let original = "Hello \u{4E16}\u{754C} \u{1F600}";
        let encoded = encode_utf8(original);
        let decoded = decode_replace(&encoded);
        assert_eq!(decoded, original);
    }

    // -- Edge cases --

    #[test]
    fn max_two_byte() {
        // U+07FF = 0xDF 0xBF
        assert_eq!(decode_replace(&[0xDF, 0xBF]), "\u{07FF}");
    }

    #[test]
    fn min_three_byte() {
        // U+0800 = 0xE0 0xA0 0x80
        assert_eq!(decode_replace(&[0xE0, 0xA0, 0x80]), "\u{0800}");
    }

    #[test]
    fn max_three_byte() {
        // U+FFFF = 0xEF 0xBF 0xBF
        assert_eq!(decode_replace(&[0xEF, 0xBF, 0xBF]), "\u{FFFF}");
    }

    #[test]
    fn min_four_byte() {
        // U+10000 = 0xF0 0x90 0x80 0x80
        assert_eq!(decode_replace(&[0xF0, 0x90, 0x80, 0x80]), "\u{10000}");
    }

    #[test]
    fn max_unicode() {
        // U+10FFFF = 0xF4 0x8F 0xBF 0xBF
        assert_eq!(decode_replace(&[0xF4, 0x8F, 0xBF, 0xBF]), "\u{10FFFF}");
    }

    #[test]
    fn above_max_unicode_rejected() {
        // 0xF4 0x90 would start U+110000, which is above max
        // 0xF4 has upper_boundary = 0x8F, so 0x90 is rejected
        assert_eq!(
            decode_replace(&[0xF4, 0x90, 0x80, 0x80]),
            "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"
        );
    }

    #[test]
    fn overlong_three_byte_rejected() {
        // 0xE0 requires lower_boundary = 0xA0, so 0xE0 0x80 0x80 is rejected
        assert_eq!(
            decode_replace(&[0xE0, 0x80, 0x80]),
            "\u{FFFD}\u{FFFD}\u{FFFD}"
        );
    }

    #[test]
    fn overlong_four_byte_rejected() {
        // 0xF0 requires lower_boundary = 0x90, so 0xF0 0x80 0x80 0x80 is rejected
        assert_eq!(
            decode_replace(&[0xF0, 0x80, 0x80, 0x80]),
            "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"
        );
    }
}

Configure Feed

Configure Feed