(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy . All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** HTML5 Encoding Detection and Decoding

    Implementation of the WHATWG encoding sniffing and decoding algorithms
    for HTML5 documents. The character encoding is detected automatically
    from byte order marks (BOM), meta charset declarations, and transport
    layer hints.

    {2 Encoding Detection Algorithm}

    Detection proceeds through the following steps, as documented for this
    module:

    {ol
    {- Check for a BOM (UTF-8, UTF-16LE, UTF-16BE).}
    {- Prescan for [<meta charset>] or [<meta http-equiv>] declarations.}
    {- Use the transport layer encoding hint, if one was provided.}
    {- Fall back to UTF-8 as the default.}}

    @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
      WHATWG encoding sniffing algorithm *)

(** {1 Types} *)

(** Character encodings supported by the parser.

    The HTML5 specification requires support for a large number of
    encodings; this implementation concentrates on the most common ones.
    Other encodings are mapped to their closest equivalent.

    The type is re-exported from {!Encoding_types.t}, so values of both
    types are interchangeable. *)
type encoding = Encoding_types.t =
  | Utf8  (** UTF-8 encoding (default) *)
  | Utf16le  (** UTF-16 little-endian *)
  | Utf16be  (** UTF-16 big-endian *)
  | Windows_1252  (** Windows-1252 (Latin-1 superset) *)
  | Iso_8859_2  (** ISO-8859-2 (Central European) *)
  | Euc_jp  (** EUC-JP (Japanese) *)

(** {1 Encoding Utilities} *)

(** [encoding_to_string enc] is the canonical label of [enc].

    The result is the WHATWG canonical name, e.g., ["utf-8"] or
    ["utf-16le"]. Alias of {!Encoding_types.to_string}. *)
let encoding_to_string = Encoding_types.to_string

(** Detect an encoding from a byte order mark.

    The first bytes of the input are inspected for a BOM; on success the
    detected encoding is returned together with the number of bytes to
    skip. Alias of {!Encoding_bom.sniff}.

    @return [(Some (encoding, skip_bytes))] if a BOM is found,
            [None] otherwise. *)
let sniff_bom = Encoding_bom.sniff

(** Normalize an encoding label to its canonical form.

    Encoding labels (case-insensitive, with optional whitespace) are
    mapped onto the supported encoding types. Alias of
    {!Encoding_labels.normalize_label}.

    @return [Some encoding] if the label is recognized, [None] otherwise.

    {[
      normalize_label "UTF-8"   (* Some Utf8 *)
      normalize_label "utf8"    (* Some Utf8 *)
      normalize_label "latin1"  (* Some Windows_1252 *)
    ]} *)
let normalize_label = Encoding_labels.normalize_label

(** Prescan bytes to find a meta charset declaration.

    This is the WHATWG prescan algorithm, which looks for encoding
    declarations in the first 1024 bytes of an HTML document. Alias of
    {!Encoding_prescan.prescan_for_meta_charset}.

    @return [Some encoding] if a meta charset is found, [None] otherwise. *)
let prescan_for_meta_charset = Encoding_prescan.prescan_for_meta_charset

(** {1 Decoding} *)

(** Decode raw bytes to a UTF-8 string with automatic encoding detection.

    The full encoding sniffing algorithm is applied:

    {ol
    {- Check for a BOM.}
    {- Prescan for a meta charset.}
    {- Use the transport encoding hint, if provided.}
    {- Fall back to UTF-8.}}

    Alias of {!Encoding_decode.decode}.

    @param transport_encoding Encoding hint from HTTP Content-Type header
    @return [(decoded_string, detected_encoding)]

    {[
      let (html, enc) = decode raw_bytes ()
      (* html is now a UTF-8 string, enc is the detected encoding *)
    ]} *)
let decode = Encoding_decode.decode

(** [pp ppf encoding] prints the canonical label of [encoding], as given by
    {!encoding_to_string}, on the formatter [ppf]. *)
let pp ppf encoding =
  encoding |> encoding_to_string |> Format.pp_print_string ppf