(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(** HTML5 Tokenizer
7
8 This module implements the WHATWG HTML5 tokenization algorithm. The
9 tokenizer converts an input byte stream into a sequence of tokens
10 (start tags, end tags, text, comments, doctypes) that can be consumed
11 by a tree builder.
12*)
13
14(** {1 Sub-modules} *)
15
(** Token types produced by the tokenizer. *)
module Token : sig
  type tag_kind = Tokenizer_token.tag_kind = Start | End
  (** Whether a tag token opens ([Start]) or closes ([End]) an element. *)

  type doctype = Tokenizer_token.doctype = {
    name : string option;
    public_id : string option;
    system_id : string option;
    force_quirks : bool;
  }
  (** A DOCTYPE token. [name], [public_id] and [system_id] are [None]
      when the corresponding part is absent; [force_quirks] flags the
      document for quirks-mode treatment. *)

  type tag = Tokenizer_token.tag = {
    kind : tag_kind;
    name : string;
    attrs : (string * string) list;
    self_closing : bool;
  }
  (** A start or end tag token. [attrs] is an association list of
      attribute name/value pairs; [self_closing] records a trailing
      [/] on the tag (e.g. [<br/>]). *)

  type t = Tokenizer_token.t =
    | Tag of tag
    | Character of string
    | Comment of string
    | Doctype of doctype
    | EOF
  (** A token produced by the tokenizer. The type equality with
      {!Tokenizer_token.t} re-exports the constructors so they are
      usable through either module path. *)

  val make_start_tag : string -> (string * string) list -> bool -> t
  (** [make_start_tag name attrs self_closing] is a {!Tag} token with
      [kind = Start]. *)

  val make_end_tag : string -> t
  (** [make_end_tag name] is a {!Tag} token with [kind = End]. End tags
      take no attributes here — presumably [attrs = []] and
      [self_closing = false]; confirm in the implementation. *)

  val make_doctype :
    ?name:string ->
    ?public_id:string ->
    ?system_id:string ->
    ?force_quirks:bool ->
    unit ->
    t
  (** [make_doctype ()] is a {!Doctype} token. An omitted optional
      argument leaves the corresponding field as [None]
      ([force_quirks] presumably defaults to [false]; confirm in the
      implementation). *)

  val make_comment : string -> t
  (** [make_comment data] is a {!Comment} token carrying [data]. *)

  val make_character : string -> t
  (** [make_character data] is a {!Character} token carrying [data]. *)

  val eof : t
  (** The {!EOF} token. *)

  val pp_tag_kind : Format.formatter -> tag_kind -> unit
  (** Pretty-print a tag kind (Start or End). *)

  val pp_doctype : Format.formatter -> doctype -> unit
  (** Pretty-print a DOCTYPE token. *)

  val pp_tag : Format.formatter -> tag -> unit
  (** Pretty-print a tag token. *)

  val pp : Format.formatter -> t -> unit
  (** Pretty-print a token. *)
end
66
(** Tokenizer states.

    One constructor per tokenizer state of the WHATWG HTML Standard
    (section 13.2.5, "Tokenization"); the constructor names mirror the
    spec's state names, so the spec text can be read side by side with
    the state machine. *)
module State : sig
  type t = Tokenizer_state.t =
    | Data
    | Rcdata
    | Rawtext
    | Script_data
    | Plaintext
    | Tag_open
    | End_tag_open
    | Tag_name
    | Rcdata_less_than_sign
    | Rcdata_end_tag_open
    | Rcdata_end_tag_name
    | Rawtext_less_than_sign
    | Rawtext_end_tag_open
    | Rawtext_end_tag_name
    | Script_data_less_than_sign
    | Script_data_end_tag_open
    | Script_data_end_tag_name
    | Script_data_escape_start
    | Script_data_escape_start_dash
    | Script_data_escaped
    | Script_data_escaped_dash
    | Script_data_escaped_dash_dash
    | Script_data_escaped_less_than_sign
    | Script_data_escaped_end_tag_open
    | Script_data_escaped_end_tag_name
    | Script_data_double_escape_start
    | Script_data_double_escaped
    | Script_data_double_escaped_dash
    | Script_data_double_escaped_dash_dash
    | Script_data_double_escaped_less_than_sign
    | Script_data_double_escape_end
    | Before_attribute_name
    | Attribute_name
    | After_attribute_name
    | Before_attribute_value
    | Attribute_value_double_quoted
    | Attribute_value_single_quoted
    | Attribute_value_unquoted
    | After_attribute_value_quoted
    | Self_closing_start_tag
    | Bogus_comment
    | Markup_declaration_open
    | Comment_start
    | Comment_start_dash
    | Comment
    | Comment_less_than_sign
    | Comment_less_than_sign_bang
    | Comment_less_than_sign_bang_dash
    | Comment_less_than_sign_bang_dash_dash
    | Comment_end_dash
    | Comment_end
    | Comment_end_bang
    | Doctype
    | Before_doctype_name
    | Doctype_name
    | After_doctype_name
    | After_doctype_public_keyword
    | Before_doctype_public_identifier
    | Doctype_public_identifier_double_quoted
    | Doctype_public_identifier_single_quoted
    | After_doctype_public_identifier
    | Between_doctype_public_and_system_identifiers
    | After_doctype_system_keyword
    | Before_doctype_system_identifier
    | Doctype_system_identifier_double_quoted
    | Doctype_system_identifier_single_quoted
    | After_doctype_system_identifier
    | Bogus_doctype
    | Cdata_section
    | Cdata_section_bracket
    | Cdata_section_end
    | Character_reference
    | Named_character_reference
    | Ambiguous_ampersand
    | Numeric_character_reference
    | Hexadecimal_character_reference_start
    | Decimal_character_reference_start
    | Hexadecimal_character_reference
    | Decimal_character_reference
    | Numeric_character_reference_end

  val pp : Format.formatter -> t -> unit
  (** Pretty-print a tokenizer state. *)
end
154
(** Parse error types. *)
module Errors : sig
  type t = Tokenizer_errors.t = {
    code : Parse_error_code.t;
    line : int;
    column : int;
  }
  (** A parse error: a typed error [code] together with the input
      position ([line], [column]) at which it was detected. *)

  val make : code:string -> line:int -> column:int -> t
  (** Create an error from a string code. The string is converted to
      {!Parse_error_code.t} using {!Parse_error_code.of_string}. *)

  val make_with_code : code:Parse_error_code.t -> line:int -> column:int -> t
  (** Create an error with a typed error code. *)

  val to_string : t -> string
  (** Render the error as a human-readable string. *)

  val pp : Format.formatter -> t -> unit
  (** Pretty-print a tokenizer error. *)
end
175
(** Input stream with position tracking. *)
module Stream : sig
  type t = Tokenizer_stream.t
  (** An input stream consumed by the tokenizer. *)

  val create : string -> t
  (** [create s] is a stream reading from the in-memory string [s]. *)

  val create_from_reader : Bytesrw.Bytes.Reader.t -> t
  (** [create_from_reader r] is a stream reading incrementally from the
      {!Bytesrw} byte reader [r]. *)

  val set_error_callback : t -> (string -> unit) -> unit
  (** Register a callback the stream invokes with an error-code string
      when it detects a problem in the input (presumably malformed
      bytes; confirm the exact conditions in {!Tokenizer_stream}). *)

  val position : t -> int * int
  (** Current position of the stream — presumably [(line, column)],
      matching the field order of {!Errors.t}; confirm in the
      implementation. *)
end
185
186(** {1 Token Sink Interface} *)
187
(** Interface for token consumers.

    The tokenizer calls [process] for each token it produces. The sink
    can return [`Continue] to keep tokenizing, or [`SwitchTo state] to
    change the tokenizer state (used by the tree builder for things like
    [<script>] and [<textarea>]).
*)
module type SINK = sig
  type t
  (** The sink's own state. *)

  val process : t -> Tokenizer_token.t -> line:int -> column:int -> [ `Continue | `SwitchTo of Tokenizer_state.t ]
  (** [process sink token ~line ~column] consumes one token; [line] and
      [column] give the input position associated with the token.
      Return [`SwitchTo s] to make the tokenizer continue in state [s]. *)

  val adjusted_current_node_in_html_namespace : t -> bool
  (** Whether the adjusted current node (a WHATWG tree-construction
      concept) is in the HTML namespace. NOTE(review): presumably the
      tokenizer consults this when handling markup inside foreign
      (SVG/MathML) content — confirm against the implementation. *)
end
200
201(** {1 Tokenizer} *)
202
(** The tokenizer type, parameterized by the sink type. *)
type 'sink t

val create :
  (module SINK with type t = 'sink) ->
  'sink ->
  ?collect_errors:bool ->
  ?xml_mode:bool ->
  unit ->
  'sink t
(** [create (module S) sink ?collect_errors ?xml_mode ()] is a new
    tokenizer that delivers tokens to [sink] via [S.process]. The
    trailing [unit] argument lets the optional arguments be erased at
    the call site.

    @param sink The token sink that will receive tokens
    @param collect_errors If [true], collect parse errors (default: [false])
    @param xml_mode If [true], apply XML compatibility transformations
*)
219
val run :
  'sink t ->
  (module SINK with type t = 'sink) ->
  Bytesrw.Bytes.Reader.t ->
  unit
(** Run the tokenizer on the given input.

    The tokenizer will read from the reader and call the sink's [process]
    function for each token until EOF is reached.

    NOTE(review): the SINK module is passed here as well as to
    {!create}; presumably this avoids storing the first-class module in
    ['sink t] — confirm whether the duplication is intentional. Callers
    must pass the same module in both places.
*)
230
val get_errors : 'sink t -> Tokenizer_errors.t list
(** Get the list of parse errors encountered during tokenization.

    Only populated if [collect_errors:true] was passed to {!create};
    presumably returns [[]] otherwise — confirm in the implementation.
*)
236
val set_state : 'sink t -> Tokenizer_state.t -> unit
(** Set the tokenizer state.

    Used by the tree builder to switch states for raw text elements —
    e.g. into {!State.Rawtext}, {!State.Rcdata} or {!State.Script_data}
    after emitting the corresponding start tag.
*)
242
val set_last_start_tag : 'sink t -> string -> unit
(** Set the last start tag name.

    Used by the tree builder to track the context for end tag matching:
    in the RCDATA/RAWTEXT/script-data end-tag states, an end tag only
    terminates the raw section when its name matches the last start tag
    (the WHATWG "appropriate end tag token" rule).
*)
247*)