(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** HTML5 Encoding Detection and Decoding

    This module implements the WHATWG encoding sniffing and decoding
    algorithms for HTML5 documents. It handles automatic character
    encoding detection from byte order marks (BOM), meta charset
    declarations, and transport layer hints.

    {2 Encoding Detection Algorithm}

    The encoding detection follows the WHATWG specification:

    + Check for a BOM (UTF-8, UTF-16LE, UTF-16BE)
    + Prescan for [<meta charset>] or [<meta http-equiv="content-type">]
    + Use transport layer encoding hint if provided
    + Fall back to UTF-8 as the default

    @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
    WHATWG encoding sniffing algorithm
*)

(** {1 Types} *)

(** Character encodings supported by the parser.

    The HTML5 specification requires support for a large number of
    encodings, but this implementation focuses on the most common ones.
    Other encodings are mapped to their closest equivalent.

    This type is a re-export of [Encoding_types.t]: the two types are
    equal, so values and constructors can be used interchangeably.
*)
type encoding = Encoding_types.t =
  | Utf8  (** UTF-8 encoding (default) *)
  | Utf16le  (** UTF-16 little-endian *)
  | Utf16be  (** UTF-16 big-endian *)
  | Windows_1252  (** Windows-1252 (Latin-1 superset) *)
  | Iso_8859_2  (** ISO-8859-2 (Central European) *)
  | Euc_jp  (** EUC-JP (Japanese) *)

val pp : Format.formatter -> encoding -> unit
(** [pp ppf enc] pretty-prints [enc] on the formatter [ppf] using its
    canonical label. *)

(** {1 Encoding Utilities} *)

val encoding_to_string : encoding -> string
(** [encoding_to_string enc] converts [enc] to its canonical label string.

    Returns the WHATWG canonical name, e.g., ["utf-8"], ["utf-16le"].
*)

val sniff_bom : bytes -> (encoding * int) option
(** [sniff_bom data] detects an encoding from a byte order mark.

    Examines the first bytes of [data] for a BOM and returns the
    detected encoding together with the number of bytes to skip.

    @return [Some (encoding, skip_bytes)] if a BOM is found,
    [None] otherwise.
*)

val normalize_label : string -> encoding option
(** [normalize_label label] normalizes an encoding label to its
    canonical form.

    Maps encoding labels (case-insensitive, with optional whitespace)
    to the supported encoding types.

    @return [Some encoding] if the label is recognized, [None] otherwise.

    {[
      normalize_label "UTF-8"   (* Some Utf8 *)
      normalize_label "utf8"    (* Some Utf8 *)
      normalize_label "latin1"  (* Some Windows_1252 *)
    ]}
*)

val prescan_for_meta_charset : bytes -> encoding option
(** [prescan_for_meta_charset data] prescans [data] to find a meta
    charset declaration.

    Implements the WHATWG prescan algorithm that looks for encoding
    declarations in the first 1024 bytes of an HTML document.

    @return [Some encoding] if a meta charset is found, [None] otherwise.
*)

(** {1 Decoding} *)

val decode : bytes -> ?transport_encoding:string -> unit -> string * encoding
(** [decode data ?transport_encoding ()] decodes raw bytes to a UTF-8
    string with automatic encoding detection.

    This function implements the full encoding sniffing algorithm:

    + Check for BOM
    + Prescan for meta charset
    + Use transport encoding hint if provided
    + Fall back to UTF-8

    The trailing [unit] argument lets the optional [transport_encoding]
    be omitted at the call site.

    @param transport_encoding Encoding hint from HTTP Content-Type header
    @return [(decoded_string, detected_encoding)]

    {[
      let (html, enc) = decode raw_bytes ()
      (* html is now a UTF-8 string, enc is the detected encoding *)
    ]}
*)

(* NOTE(review): the WHATWG sniffing algorithm consults the transport-layer
   encoding before prescanning the byte stream, whereas the steps documented
   above list the prescan first. The order above is carried over from the
   existing documentation -- confirm it against the implementation. *)