OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 104 lines 3.7 kB view raw
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** HTML5 encoding detection and decoding.

    This module implements the WHATWG encoding sniffing and decoding
    algorithms for HTML5 documents. It handles automatic character
    encoding detection from byte order marks (BOM), meta charset
    declarations, and transport layer hints.

    {2 Encoding Detection Algorithm}

    Detection proceeds through the following steps, stopping at the first
    one that yields an encoding:

    + Check for a BOM (UTF-8, UTF-16LE, UTF-16BE).
    + Prescan for [<meta charset>] or [<meta http-equiv="content-type">].
    + Use the transport layer encoding hint, if provided.
    + Fall back to UTF-8 as the default.

    @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
      WHATWG encoding sniffing algorithm *)

(* NOTE(review): the WHATWG algorithm linked above consults the
   transport-layer encoding before running the meta prescan, whereas the
   order documented in this interface is prescan first. Confirm which order
   the implementation actually uses and align either the code or this
   text. *)

(** {1 Types} *)

(** Character encodings supported by the parser.

    The HTML5 specification requires support for a large number of
    encodings, but this implementation focuses on the most common ones.
    Other encodings are mapped to their closest equivalent.

    The type is re-exported from {!Encoding_types.t}, so the constructors
    below are interchangeable with the ones declared there. *)
type encoding = Encoding_types.t =
  | Utf8  (** UTF-8 encoding (default) *)
  | Utf16le  (** UTF-16 little-endian *)
  | Utf16be  (** UTF-16 big-endian *)
  | Windows_1252  (** Windows-1252 (Latin-1 superset) *)
  | Iso_8859_2  (** ISO-8859-2 (Central European) *)
  | Euc_jp  (** EUC-JP (Japanese) *)

val pp : Format.formatter -> encoding -> unit
(** [pp ppf enc] pretty-prints [enc] on [ppf] using its canonical label. *)

(** {1 Encoding Utilities} *)

val encoding_to_string : encoding -> string
(** [encoding_to_string enc] is the canonical label of [enc].

    Returns the WHATWG canonical name, e.g., ["utf-8"], ["utf-16le"]. *)

val sniff_bom : bytes -> (encoding * int) option
(** [sniff_bom data] detects an encoding from a byte order mark.

    Examines the first bytes of [data] for a BOM and returns the detected
    encoding together with the number of bytes to skip.

    @return [Some (encoding, skip_bytes)] if a BOM is found,
            [None] otherwise. *)

val normalize_label : string -> encoding option
(** [normalize_label label] normalizes an encoding label to its canonical
    form.

    Maps encoding labels (case-insensitive, with optional whitespace) to
    the supported encoding types.

    @return [Some encoding] if the label is recognized, [None] otherwise.

    {[
      normalize_label "UTF-8"   (* Some Utf8 *)
      normalize_label "utf8"    (* Some Utf8 *)
      normalize_label "latin1"  (* Some Windows_1252 *)
    ]} *)

val prescan_for_meta_charset : bytes -> encoding option
(** [prescan_for_meta_charset data] prescans [data] to find a meta charset
    declaration.

    Implements the WHATWG prescan algorithm that looks for encoding
    declarations in the first 1024 bytes of an HTML document.

    @return [Some encoding] if a meta charset is found, [None] otherwise. *)

(** {1 Decoding} *)

val decode : bytes -> ?transport_encoding:string -> unit -> string * encoding
(** [decode data ?transport_encoding ()] decodes raw bytes to a UTF-8 string
    with automatic encoding detection.

    This function implements the full encoding sniffing algorithm:

    + Check for a BOM.
    + Prescan for a meta charset.
    + Use the transport encoding hint, if provided.
    + Fall back to UTF-8.

    The trailing [unit] argument lets callers omit [?transport_encoding].

    @param transport_encoding Encoding hint from the HTTP Content-Type header.
    @return [(decoded_string, detected_encoding)]

    {[
      let (html, enc) = decode raw_bytes ()
      (* html is now a UTF-8 string, enc is the detected encoding *)
    ]} *)