Implement encoding sniffing: BOM, HTTP charset, meta prescan

+714

2 changed files

expand all

crates

encoding

src

lib.rs

sniff.rs

crates/encoding/src/lib.rs

··· 2 2 3 3 pub mod error; 4 4 mod single_byte; 5 + pub mod sniff; 5 6 mod utf16; 6 7 mod utf8; 7 8

+713

crates/encoding/src/sniff.rs

··· 1 + //! Encoding sniffing per WHATWG Encoding Standard and HTML spec. 2 + //! 3 + //! Detects character encoding from BOM, HTTP Content-Type charset, or HTML meta prescan. 4 + 5 + use crate::{bom_sniff, lookup, Encoding}; 6 + 7 + /// How the encoding was determined. 8 + #[derive(Debug, Clone, Copy, PartialEq, Eq)] 9 + pub enum EncodingSource { 10 + /// Byte Order Mark at the start of the byte stream. 11 + Bom, 12 + /// `charset` parameter from the HTTP `Content-Type` header. 13 + HttpHeader, 14 + /// `<meta charset>` or `<meta http-equiv="Content-Type">` prescan. 15 + MetaPrescan, 16 + /// Default fallback (Windows-1252 for HTML). 17 + Default, 18 + } 19 + 20 + /// Sniff the encoding of a byte stream. 21 + /// 22 + /// Priority order per spec: BOM > HTTP Content-Type charset > HTML meta prescan > default. 23 + /// The default encoding is Windows-1252 per WHATWG spec for HTML. 24 + pub fn sniff_encoding(bytes: &[u8], http_content_type: Option<&str>) -> (Encoding, EncodingSource) { 25 + // 1. BOM sniffing (highest priority) 26 + let (bom_enc, _) = bom_sniff(bytes); 27 + if let Some(enc) = bom_enc { 28 + return (enc, EncodingSource::Bom); 29 + } 30 + 31 + // 2. HTTP Content-Type charset 32 + if let Some(ct) = http_content_type { 33 + if let Some(enc) = extract_charset_from_content_type(ct) { 34 + return (enc, EncodingSource::HttpHeader); 35 + } 36 + } 37 + 38 + // 3. HTML meta prescan (first 1024 bytes) 39 + if let Some(enc) = meta_prescan(bytes) { 40 + return (enc, EncodingSource::MetaPrescan); 41 + } 42 + 43 + // 4. Default: Windows-1252 44 + (Encoding::Windows1252, EncodingSource::Default) 45 + } 46 + 47 + /// Extract charset from an HTTP `Content-Type` header value. 48 + /// 49 + /// Handles formats like: 50 + /// - `text/html; charset=utf-8` 51 + /// - `text/html; charset="utf-8"` 52 + /// - `text/html;charset=utf-8` (no space) 53 + /// 54 + /// Per WHATWG spec, the charset parameter value is looked up via the encoding label table. 
55 + /// Returns `None` for UTF-16BE/LE from HTTP headers per spec (those are only valid via BOM). 56 + fn extract_charset_from_content_type(content_type: &str) -> Option<Encoding> { 57 + let charset_value = extract_charset_value(content_type)?; 58 + let enc = lookup(charset_value)?; 59 + // Per WHATWG: if the encoding from HTTP is UTF-16BE or UTF-16LE, use UTF-8 instead 60 + Some(match enc { 61 + Encoding::Utf16Be | Encoding::Utf16Le => Encoding::Utf8, 62 + other => other, 63 + }) 64 + } 65 + 66 + /// Extract the raw charset value from a Content-Type string. 67 + fn extract_charset_value(content_type: &str) -> Option<&str> { 68 + // Find "charset" (case-insensitive) after a ';' 69 + let lower = content_type.to_ascii_lowercase(); 70 + let idx = lower.find("charset")?; 71 + 72 + // Must be preceded by ';' or whitespace (or be in parameters section) 73 + let after_charset = &content_type[idx + 7..]; 74 + // Skip optional whitespace then '=' 75 + let after_charset = after_charset.trim_start(); 76 + let after_eq = after_charset.strip_prefix('=')?; 77 + let after_eq = after_eq.trim_start(); 78 + 79 + if let Some(inner) = after_eq.strip_prefix('"') { 80 + // Quoted value 81 + let end = inner.find('"')?; 82 + Some(&inner[..end]) 83 + } else { 84 + // Unquoted value: terminated by whitespace, ';', or end of string 85 + let end = after_eq 86 + .find(|c: char| c == ';' || c.is_ascii_whitespace()) 87 + .unwrap_or(after_eq.len()); 88 + if end == 0 { 89 + return None; 90 + } 91 + Some(&after_eq[..end]) 92 + } 93 + } 94 + 95 + /// Prescan the first 1024 bytes of an HTML document for encoding declarations. 96 + /// 97 + /// Per the HTML spec "prescan a byte stream to determine its encoding" algorithm. 
98 + /// Looks for: 99 + /// - `<meta charset="...">` 100 + /// - `<meta http-equiv="Content-Type" content="...;charset=...">` 101 + fn meta_prescan(bytes: &[u8]) -> Option<Encoding> { 102 + let limit = bytes.len().min(1024); 103 + let bytes = &bytes[..limit]; 104 + let mut pos = 0; 105 + 106 + while pos < bytes.len() { 107 + // Skip until we find '<' 108 + if bytes[pos] != b'<' { 109 + pos += 1; 110 + continue; 111 + } 112 + pos += 1; 113 + if pos >= bytes.len() { 114 + break; 115 + } 116 + 117 + // Check for comment "" 121 + while pos + 2 < bytes.len() { 122 + if bytes[pos] == b'-' && bytes[pos + 1] == b'-' && bytes[pos + 2] == b'>' { 123 + pos += 3; 124 + break; 125 + } 126 + pos += 1; 127 + } 128 + continue; 129 + } 130 + 131 + // Check for "<meta" (case-insensitive) 132 + if pos + 4 <= bytes.len() && ascii_ci_eq(&bytes[pos..pos + 4], b"meta") { 133 + let after_meta = pos + 4; 134 + if after_meta < bytes.len() && is_space_or_slash(bytes[after_meta]) { 135 + if let Some((enc, _tag_end)) = parse_meta_tag(bytes, after_meta) { 136 + // Per spec: override UTF-16 from meta to UTF-8 137 + let enc = match enc { 138 + Encoding::Utf16Be | Encoding::Utf16Le => Encoding::Utf8, 139 + other => other, 140 + }; 141 + return Some(enc); 142 + } else { 143 + pos = skip_tag(bytes, after_meta); 144 + continue; 145 + } 146 + } 147 + } 148 + 149 + // Skip other tags (like <!DOCTYPE>, <html>, etc.) 150 + if bytes[pos..].starts_with(b"!") || bytes[pos..].starts_with(b"/") || bytes[pos] == b'?' { 151 + pos = skip_tag(bytes, pos); 152 + continue; 153 + } 154 + 155 + // Check if it's a letter (start of a tag name) 156 + if pos < bytes.len() && bytes[pos].is_ascii_alphabetic() { 157 + pos = skip_tag(bytes, pos); 158 + continue; 159 + } 160 + 161 + // Not a tag, continue 162 + } 163 + 164 + None 165 + } 166 + 167 + /// Parse attributes of a `<meta` tag looking for charset declarations. 168 + /// 169 + /// Returns the encoding and position after the tag if found. 
170 + fn parse_meta_tag(bytes: &[u8], start: usize) -> Option<(Encoding, usize)> { 171 + let mut pos = start; 172 + let mut got_pragma = false; 173 + let mut need_pragma: Option<bool> = None; 174 + let mut charset: Option<Encoding> = None; 175 + 176 + loop { 177 + // Skip whitespace 178 + while pos < bytes.len() && bytes[pos].is_ascii_whitespace() { 179 + pos += 1; 180 + } 181 + if pos >= bytes.len() { 182 + break; 183 + } 184 + // End of tag? 185 + if bytes[pos] == b'>' 186 + || (bytes[pos] == b'/' && pos + 1 < bytes.len() && bytes[pos + 1] == b'>') 187 + { 188 + break; 189 + } 190 + 191 + let (attr_name, attr_value, new_pos) = parse_attribute(bytes, pos)?; 192 + pos = new_pos; 193 + 194 + if ascii_ci_eq_str(&attr_name, "http-equiv") { 195 + if ascii_ci_eq_str(&attr_value, "content-type") { 196 + got_pragma = true; 197 + } 198 + } else if ascii_ci_eq_str(&attr_name, "content") { 199 + if let Some(charset_val) = extract_charset_from_meta_content(&attr_value) { 200 + if let Some(enc) = lookup(&charset_val) { 201 + charset = Some(enc); 202 + need_pragma = Some(true); 203 + } 204 + } 205 + } else if ascii_ci_eq_str(&attr_name, "charset") { 206 + if let Some(enc) = lookup(&attr_value) { 207 + charset = Some(enc); 208 + need_pragma = Some(false); 209 + } 210 + } 211 + } 212 + 213 + // Determine result per spec 214 + match (need_pragma, charset) { 215 + (Some(true), Some(enc)) if got_pragma => Some((enc, pos)), 216 + (Some(false), Some(enc)) => Some((enc, pos)), 217 + _ => None, 218 + } 219 + } 220 + 221 + /// Parse a single HTML attribute (name=value pair). 222 + /// 223 + /// Returns (name, value, new_position). Returns None if we hit end of tag or input. 
224 + fn parse_attribute(bytes: &[u8], start: usize) -> Option<(String, String, usize)> { 225 + let mut pos = start; 226 + 227 + // Skip whitespace 228 + while pos < bytes.len() && bytes[pos].is_ascii_whitespace() { 229 + pos += 1; 230 + } 231 + if pos >= bytes.len() || bytes[pos] == b'>' { 232 + return None; 233 + } 234 + 235 + // Read attribute name 236 + let name_start = pos; 237 + while pos < bytes.len() 238 + && bytes[pos] != b'=' 239 + && bytes[pos] != b'>' 240 + && !bytes[pos].is_ascii_whitespace() 241 + && bytes[pos] != b'/' 242 + { 243 + pos += 1; 244 + } 245 + let name = to_ascii_lowercase(&bytes[name_start..pos]); 246 + if name.is_empty() { 247 + return None; 248 + } 249 + 250 + // Skip whitespace 251 + while pos < bytes.len() && bytes[pos].is_ascii_whitespace() { 252 + pos += 1; 253 + } 254 + 255 + // No value 256 + if pos >= bytes.len() || bytes[pos] != b'=' { 257 + return Some((name, String::new(), pos)); 258 + } 259 + pos += 1; // skip '=' 260 + 261 + // Skip whitespace 262 + while pos < bytes.len() && bytes[pos].is_ascii_whitespace() { 263 + pos += 1; 264 + } 265 + 266 + if pos >= bytes.len() { 267 + return Some((name, String::new(), pos)); 268 + } 269 + 270 + // Read value 271 + let value; 272 + if bytes[pos] == b'"' || bytes[pos] == b'\'' { 273 + let quote = bytes[pos]; 274 + pos += 1; 275 + let val_start = pos; 276 + while pos < bytes.len() && bytes[pos] != quote { 277 + pos += 1; 278 + } 279 + value = to_ascii_lowercase(&bytes[val_start..pos]); 280 + if pos < bytes.len() { 281 + pos += 1; // skip closing quote 282 + } 283 + } else { 284 + let val_start = pos; 285 + while pos < bytes.len() 286 + && !bytes[pos].is_ascii_whitespace() 287 + && bytes[pos] != b'>' 288 + && bytes[pos] != b';' 289 + { 290 + pos += 1; 291 + } 292 + value = to_ascii_lowercase(&bytes[val_start..pos]); 293 + } 294 + 295 + Some((name, value, pos)) 296 + } 297 + 298 + /// Extract charset value from a meta content attribute value. 
299 + /// 300 + /// Looks for `charset=` in strings like `text/html; charset=utf-8`. 301 + fn extract_charset_from_meta_content(content: &str) -> Option<String> { 302 + let lower = content.to_ascii_lowercase(); 303 + let idx = lower.find("charset")?; 304 + let rest = &content[idx + 7..]; 305 + // Skip whitespace 306 + let rest = rest.trim_start(); 307 + let rest = rest.strip_prefix('=')?; 308 + let rest = rest.trim_start(); 309 + 310 + if rest.is_empty() { 311 + return None; 312 + } 313 + 314 + // The value is terminated by ';', whitespace, or end 315 + if rest.starts_with('"') || rest.starts_with('\'') { 316 + let quote = rest.as_bytes()[0]; 317 + let inner = &rest[1..]; 318 + let end = inner.find(quote as char).unwrap_or(inner.len()); 319 + let val = inner[..end].trim(); 320 + if val.is_empty() { 321 + return None; 322 + } 323 + Some(val.to_string()) 324 + } else { 325 + let end = rest 326 + .find(|c: char| c == ';' || c.is_ascii_whitespace()) 327 + .unwrap_or(rest.len()); 328 + if end == 0 { 329 + return None; 330 + } 331 + Some(rest[..end].to_string()) 332 + } 333 + } 334 + 335 + /// Skip a tag (find the closing '>'). 
///
/// Returns the index just past the first `>` at or after `start`. If there is none, returns
/// `bytes.len()` (or `start` itself when `start` is already past the end).
fn skip_tag(bytes: &[u8], start: usize) -> usize {
    bytes
        .get(start..)
        .and_then(|rest| rest.iter().position(|&b| b == b'>'))
        .map_or_else(|| start.max(bytes.len()), |offset| start + offset + 1)
}

/// True for ASCII whitespace and `/`, the bytes that may follow a tag name.
fn is_space_or_slash(b: u8) -> bool {
    b == b'/' || b.is_ascii_whitespace()
}

/// ASCII case-insensitive comparison of two byte slices.
fn ascii_ci_eq(a: &[u8], b: &[u8]) -> bool {
    a.eq_ignore_ascii_case(b)
}

/// ASCII case-insensitive comparison of two strings.
fn ascii_ci_eq_str(a: &str, b: &str) -> bool {
    a.eq_ignore_ascii_case(b)
}

/// Converts bytes to a `String`, ASCII-lowercasing each byte and mapping every byte to the
/// `char` with the same code point.
fn to_ascii_lowercase(bytes: &[u8]) -> String {
    bytes
        .iter()
        .map(|b| char::from(b.to_ascii_lowercase()))
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that sniffing `bytes` (with an optional `Content-Type` header) yields
    /// `expected`.
    fn assert_sniffed(
        bytes: &[u8],
        content_type: Option<&str>,
        expected: (Encoding, EncodingSource),
    ) {
        assert_eq!(sniff_encoding(bytes, content_type), expected);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — BOM priority
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_bom_utf8() {
        assert_sniffed(b"\xEF\xBB\xBFHello", None, (Encoding::Utf8, EncodingSource::Bom));
    }

    #[test]
    fn sniff_bom_utf16be() {
        assert_sniffed(b"\xFE\xFF\x00A", None, (Encoding::Utf16Be, EncodingSource::Bom));
    }

    #[test]
    fn sniff_bom_utf16le() {
        assert_sniffed(b"\xFF\xFEA\x00", None, (Encoding::Utf16Le, EncodingSource::Bom));
    }

    #[test]
    fn sniff_bom_beats_http_header() {
        assert_sniffed(
            b"\xEF\xBB\xBFHello",
            Some("text/html; charset=iso-8859-2"),
            (Encoding::Utf8, EncodingSource::Bom),
        );
    }

    #[test]
    fn sniff_bom_beats_meta() {
        let mut bytes = vec![0xEF, 0xBB, 0xBF];
        bytes.extend_from_slice(b"<meta charset=\"iso-8859-5\">");
        assert_sniffed(&bytes, None, (Encoding::Utf8, EncodingSource::Bom));
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — HTTP Content-Type priority
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_http_charset_utf8() {
        assert_sniffed(
            b"Hello",
            Some("text/html; charset=utf-8"),
            (Encoding::Utf8, EncodingSource::HttpHeader),
        );
    }

    #[test]
    fn sniff_http_charset_quoted() {
        assert_sniffed(
            b"Hello",
            Some("text/html; charset=\"utf-8\""),
            (Encoding::Utf8, EncodingSource::HttpHeader),
        );
    }

    #[test]
    fn sniff_http_charset_case_insensitive() {
        assert_sniffed(
            b"Hello",
            Some("text/html; Charset=UTF-8"),
            (Encoding::Utf8, EncodingSource::HttpHeader),
        );
    }

    #[test]
    fn sniff_http_charset_no_space() {
        assert_sniffed(
            b"Hello",
            Some("text/html;charset=utf-8"),
            (Encoding::Utf8, EncodingSource::HttpHeader),
        );
    }

    #[test]
    fn sniff_http_charset_windows_1252() {
        assert_sniffed(
            b"Hello",
            Some("text/html; charset=windows-1252"),
            (Encoding::Windows1252, EncodingSource::HttpHeader),
        );
    }

    #[test]
    fn sniff_http_charset_iso_8859_1_maps_to_1252() {
        assert_sniffed(
            b"Hello",
            Some("text/html; charset=iso-8859-1"),
            (Encoding::Windows1252, EncodingSource::HttpHeader),
        );
    }

    #[test]
    fn sniff_http_utf16_override_to_utf8() {
        // Per WHATWG spec: UTF-16 from HTTP becomes UTF-8
        assert_sniffed(
            b"Hello",
            Some("text/html; charset=utf-16le"),
            (Encoding::Utf8, EncodingSource::HttpHeader),
        );
    }

    #[test]
    fn sniff_http_no_charset() {
        // Falls through to default
        assert_sniffed(
            b"Hello",
            Some("text/html"),
            (Encoding::Windows1252, EncodingSource::Default),
        );
    }

    #[test]
    fn sniff_http_beats_meta() {
        assert_sniffed(
            b"<meta charset=\"iso-8859-5\">",
            Some("text/html; charset=utf-8"),
            (Encoding::Utf8, EncodingSource::HttpHeader),
        );
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — meta prescan
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_meta_charset() {
        assert_sniffed(
            b"<meta charset=\"utf-8\">",
            None,
            (Encoding::Utf8, EncodingSource::MetaPrescan),
        );
    }

    #[test]
    fn sniff_meta_charset_single_quotes() {
        assert_sniffed(
            b"<meta charset='utf-8'>",
            None,
            (Encoding::Utf8, EncodingSource::MetaPrescan),
        );
    }

    #[test]
    fn sniff_meta_http_equiv() {
        assert_sniffed(
            b"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">",
            None,
            (Encoding::Utf8, EncodingSource::MetaPrescan),
        );
    }

    #[test]
    fn sniff_meta_http_equiv_case_insensitive() {
        assert_sniffed(
            b"<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=UTF-8\">",
            None,
            (Encoding::Utf8, EncodingSource::MetaPrescan),
        );
    }

    #[test]
    fn sniff_meta_charset_legacy_encoding() {
        assert_sniffed(
            b"<meta charset=\"windows-1251\">",
            None,
            (Encoding::Windows1251, EncodingSource::MetaPrescan),
        );
    }

    #[test]
    fn sniff_meta_utf16_override_to_utf8() {
        assert_sniffed(
            b"<meta charset=\"utf-16le\">",
            None,
            (Encoding::Utf8, EncodingSource::MetaPrescan),
        );
    }

    #[test]
    fn sniff_meta_with_doctype_and_html() {
        assert_sniffed(
            b"<!DOCTYPE html><html><head><meta charset=\"utf-8\"></head>",
            None,
            (Encoding::Utf8, EncodingSource::MetaPrescan),
        );
    }

    #[test]
    fn sniff_meta_with_comment_before() {
        assert_sniffed(
            b"<!-- a comment --><meta charset=\"utf-8\">",
            None,
            (Encoding::Utf8, EncodingSource::MetaPrescan),
        );
    }

    #[test]
    fn sniff_meta_beyond_1024_bytes_not_found() {
        let mut html = vec![b' '; 1024];
        html.extend_from_slice(b"<meta charset=\"utf-8\">");
        assert_sniffed(&html, None, (Encoding::Windows1252, EncodingSource::Default));
    }

    #[test]
    fn sniff_meta_within_1024_bytes() {
        let mut html = vec![b' '; 1000];
        html.extend_from_slice(b"<meta charset=\"utf-8\">");
        assert_sniffed(&html, None, (Encoding::Utf8, EncodingSource::MetaPrescan));
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — default fallback
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_default_no_signals() {
        assert_sniffed(
            b"Hello world",
            None,
            (Encoding::Windows1252, EncodingSource::Default),
        );
    }

    #[test]
    fn sniff_default_empty() {
        assert_sniffed(b"", None, (Encoding::Windows1252, EncodingSource::Default));
    }

    // -----------------------------------------------------------------------
    // extract_charset_from_content_type
    // -----------------------------------------------------------------------

    #[test]
    fn extract_charset_basic() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=utf-8"),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_quoted() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=\"utf-8\""),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_no_space() {
        assert_eq!(
            extract_charset_from_content_type("text/html;charset=utf-8"),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_uppercase() {
        assert_eq!(
            extract_charset_from_content_type("text/html; CHARSET=UTF-8"),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_missing() {
        assert_eq!(extract_charset_from_content_type("text/html"), None);
    }

    #[test]
    fn extract_charset_empty_value() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset="),
            None
        );
    }

    #[test]
    fn extract_charset_unknown_encoding() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=bogus"),
            None
        );
    }

    #[test]
    fn extract_charset_with_extra_params() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=utf-8; boundary=something"),
            Some(Encoding::Utf8)
        );
    }

    // -----------------------------------------------------------------------
    // meta_prescan internals
    // -----------------------------------------------------------------------

    #[test]
    fn meta_prescan_charset_attr() {
        let html = b"<meta charset=\"iso-8859-2\">";
        assert_eq!(meta_prescan(html), Some(Encoding::Iso8859_2));
    }

    #[test]
    fn meta_prescan_http_equiv_content() {
        let html = b"<meta http-equiv=\"content-type\" content=\"text/html; charset=koi8-r\">";
        assert_eq!(meta_prescan(html), Some(Encoding::Koi8R));
    }

    #[test]
    fn meta_prescan_no_meta() {
        let html = b"<html><head><title>Test</title></head></html>";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_meta_without_charset() {
        let html = b"<meta name=\"viewport\" content=\"width=device-width\">";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_http_equiv_without_content() {
        let html = b"<meta http-equiv=\"content-type\">";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_content_without_http_equiv() {
        // charset in content but no http-equiv="content-type" -> need_pragma is true but
        // got_pragma is false
        let html = b"<meta content=\"text/html; charset=utf-8\">";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_skips_comments() {
        let html = b"<!-- a comment --><meta charset=\"utf-8\">";
        assert_eq!(meta_prescan(html), Some(Encoding::Utf8));
    }

    #[test]
    fn meta_prescan_unquoted_charset() {
        let html = b"<meta charset=utf-8>";
        assert_eq!(meta_prescan(html), Some(Encoding::Utf8));
    }

    #[test]
    fn meta_prescan_self_closing() {
        let html = b"<meta charset=\"utf-8\" />";
        assert_eq!(meta_prescan(html), Some(Encoding::Utf8));
    }
}

Configure Feed

Configure Feed