(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(** HTML5 Tokenizer
7
8 This module implements the WHATWG HTML5 tokenization algorithm. The
9 tokenizer converts an input byte stream into a sequence of tokens
10 (start tags, end tags, text, comments, doctypes) that can be consumed
11 by a tree builder.
12*)
13
14(** {1 Sub-modules} *)
15
(** Token types produced by the tokenizer. *)
module Token : sig
  type tag_kind = Tokenizer_token.tag_kind = Start | End
  (** Whether a tag token opens ([Start]) or closes ([End]) an element. *)

  type doctype = Tokenizer_token.doctype = {
    name : string option;
    public_id : string option;
    system_id : string option;
    force_quirks : bool;
  }
  (** A DOCTYPE token. [name], [public_id] and [system_id] are [None]
      when the corresponding part is absent; [force_quirks] flags the
      document for quirks-mode treatment. *)

  type tag = Tokenizer_token.tag = {
    kind : tag_kind;
    name : string;
    attrs : (string * string) list;
    self_closing : bool;
  }
  (** A start or end tag token. [attrs] is an association list of
      attribute name/value pairs; [self_closing] records a trailing
      [/] on the tag (e.g. [<br/>]). *)

  type t = Tokenizer_token.t =
    | Tag of tag
    | Character of string
    | Comment of string
    | Doctype of doctype
    | EOF
  (** A token produced by the tokenizer. The type equality with
      {!Tokenizer_token.t} re-exports the constructors so they are
      usable through either module path. *)

  val make_start_tag : string -> (string * string) list -> bool -> t
  (** [make_start_tag name attrs self_closing] is a {!Tag} token with
      [kind = Start]. *)

  val make_end_tag : string -> t
  (** [make_end_tag name] is a {!Tag} token with [kind = End]. End tags
      take no attributes here — presumably [attrs = []] and
      [self_closing = false]; confirm in the implementation. *)

  val make_doctype :
    ?name:string ->
    ?public_id:string ->
    ?system_id:string ->
    ?force_quirks:bool ->
    unit ->
    t
  (** [make_doctype ()] is a {!Doctype} token. An omitted optional
      argument leaves the corresponding field as [None]
      ([force_quirks] presumably defaults to [false]; confirm in the
      implementation). *)

  val make_comment : string -> t
  (** [make_comment data] is a {!Comment} token carrying [data]. *)

  val make_character : string -> t
  (** [make_character data] is a {!Character} token carrying [data]. *)

  val eof : t
  (** The {!EOF} token. *)

  val pp_tag_kind : Format.formatter -> tag_kind -> unit
  (** Pretty-print a tag kind (Start or End). *)

  val pp_doctype : Format.formatter -> doctype -> unit
  (** Pretty-print a DOCTYPE token. *)

  val pp_tag : Format.formatter -> tag -> unit
  (** Pretty-print a tag token. *)

  val pp : Format.formatter -> t -> unit
  (** Pretty-print a token. *)
end
66
(** Tokenizer states.

    One constructor per tokenizer state of the WHATWG HTML Standard
    (section 13.2.5, "Tokenization"); the constructor names mirror the
    spec's state names, so the spec text can be read side by side with
    the state machine. *)
module State : sig
  type t = Tokenizer_state.t =
    | Data
    | Rcdata
    | Rawtext
    | Script_data
    | Plaintext
    | Tag_open
    | End_tag_open
    | Tag_name
    | Rcdata_less_than_sign
    | Rcdata_end_tag_open
    | Rcdata_end_tag_name
    | Rawtext_less_than_sign
    | Rawtext_end_tag_open
    | Rawtext_end_tag_name
    | Script_data_less_than_sign
    | Script_data_end_tag_open
    | Script_data_end_tag_name
    | Script_data_escape_start
    | Script_data_escape_start_dash
    | Script_data_escaped
    | Script_data_escaped_dash
    | Script_data_escaped_dash_dash
    | Script_data_escaped_less_than_sign
    | Script_data_escaped_end_tag_open
    | Script_data_escaped_end_tag_name
    | Script_data_double_escape_start
    | Script_data_double_escaped
    | Script_data_double_escaped_dash
    | Script_data_double_escaped_dash_dash
    | Script_data_double_escaped_less_than_sign
    | Script_data_double_escape_end
    | Before_attribute_name
    | Attribute_name
    | After_attribute_name
    | Before_attribute_value
    | Attribute_value_double_quoted
    | Attribute_value_single_quoted
    | Attribute_value_unquoted
    | After_attribute_value_quoted
    | Self_closing_start_tag
    | Bogus_comment
    | Markup_declaration_open
    | Comment_start
    | Comment_start_dash
    | Comment
    | Comment_less_than_sign
    | Comment_less_than_sign_bang
    | Comment_less_than_sign_bang_dash
    | Comment_less_than_sign_bang_dash_dash
    | Comment_end_dash
    | Comment_end
    | Comment_end_bang
    | Doctype
    | Before_doctype_name
    | Doctype_name
    | After_doctype_name
    | After_doctype_public_keyword
    | Before_doctype_public_identifier
    | Doctype_public_identifier_double_quoted
    | Doctype_public_identifier_single_quoted
    | After_doctype_public_identifier
    | Between_doctype_public_and_system_identifiers
    | After_doctype_system_keyword
    | Before_doctype_system_identifier
    | Doctype_system_identifier_double_quoted
    | Doctype_system_identifier_single_quoted
    | After_doctype_system_identifier
    | Bogus_doctype
    | Cdata_section
    | Cdata_section_bracket
    | Cdata_section_end
    | Character_reference
    | Named_character_reference
    | Ambiguous_ampersand
    | Numeric_character_reference
    | Hexadecimal_character_reference_start
    | Decimal_character_reference_start
    | Hexadecimal_character_reference
    | Decimal_character_reference
    | Numeric_character_reference_end

  val pp : Format.formatter -> t -> unit
  (** Pretty-print a tokenizer state. *)
end
154
(** Parse error types. *)
module Errors : sig
  type t = Tokenizer_errors.t = {
    code : Parse_error_code.t;
    line : int;
    column : int;
  }
  (** A parse error: a typed error [code] together with the input
      position ([line], [column]) at which it was detected. *)

  val make : code:string -> line:int -> column:int -> t
  (** Create an error from a string code. The string is converted to
      {!Parse_error_code.t} using {!Parse_error_code.of_string}. *)

  val make_with_code : code:Parse_error_code.t -> line:int -> column:int -> t
  (** Create an error with a typed error code. *)

  val to_string : t -> string
  (** Render the error as a human-readable string. *)

  val pp : Format.formatter -> t -> unit
  (** Pretty-print a tokenizer error. *)
end
175
(** Input stream with position tracking. *)
module Stream : sig
  type t = Tokenizer_stream.t
  (** An input stream consumed by the tokenizer. *)

  val create : string -> t
  (** [create s] is a stream reading from the in-memory string [s]. *)

  val create_from_reader : Bytesrw.Bytes.Reader.t -> t
  (** [create_from_reader r] is a stream reading incrementally from the
      {!Bytesrw} byte reader [r]. *)

  val set_error_callback : t -> (string -> unit) -> unit
  (** Register a callback the stream invokes with an error-code string
      when it detects a problem in the input (presumably malformed
      bytes; confirm the exact conditions in {!Tokenizer_stream}). *)

  val position : t -> int * int
  (** Current position of the stream — presumably [(line, column)],
      matching the field order of {!Errors.t}; confirm in the
      implementation. *)
end
185
186(** {1 Token Sink Interface} *)
187
(** Interface for token consumers.

    The tokenizer calls [process] for each token it produces. The sink
    can return [`Continue] to keep tokenizing, or [`SwitchTo state] to
    change the tokenizer state (used by the tree builder for things like
    [<script>] and [<textarea>]).
*)
module type SINK = sig
  type t
  (** The sink's own state. *)

  val process : t -> Tokenizer_token.t -> line:int -> column:int -> [ `Continue | `SwitchTo of Tokenizer_state.t ]
  (** [process sink token ~line ~column] consumes one token; [line] and
      [column] give the input position associated with the token.
      Return [`SwitchTo s] to make the tokenizer continue in state [s]. *)

  val adjusted_current_node_in_html_namespace : t -> bool
  (** Whether the adjusted current node (a WHATWG tree-construction
      concept) is in the HTML namespace. NOTE(review): presumably the
      tokenizer consults this when handling markup inside foreign
      (SVG/MathML) content — confirm against the implementation. *)
end
200
201(** {1 Tokenizer} *)
202
(** The tokenizer type, parameterized by the sink type. *)
type 'sink t

val create :
  (module SINK with type t = 'sink) ->
  'sink ->
  ?collect_errors:bool ->
  ?xml_mode:bool ->
  unit ->
  'sink t
(** [create (module S) sink ?collect_errors ?xml_mode ()] is a new
    tokenizer that delivers tokens to [sink] via [S.process]. The
    trailing [unit] argument lets the optional arguments be erased at
    the call site.

    @param sink The token sink that will receive tokens
    @param collect_errors If [true], collect parse errors (default: [false])
    @param xml_mode If [true], apply XML compatibility transformations
*)
219
val run :
  'sink t ->
  (module SINK with type t = 'sink) ->
  Bytesrw.Bytes.Reader.t ->
  unit
(** Run the tokenizer on the given input.

    The tokenizer will read from the reader and call the sink's [process]
    function for each token until EOF is reached.

    NOTE(review): the SINK module is passed here as well as to
    {!create}; presumably this avoids storing the first-class module in
    ['sink t] — confirm whether the duplication is intentional. Callers
    must pass the same module in both places.
*)
230
val get_errors : 'sink t -> Tokenizer_errors.t list
(** Get the list of parse errors encountered during tokenization.

    Only populated if [collect_errors:true] was passed to {!create};
    presumably returns [[]] otherwise — confirm in the implementation.
*)
236
val set_state : 'sink t -> Tokenizer_state.t -> unit
(** Set the tokenizer state.

    Used by the tree builder to switch states for raw text elements —
    e.g. into {!State.Rawtext}, {!State.Rcdata} or {!State.Script_data}
    after emitting the corresponding start tag.
*)
242
val set_last_start_tag : 'sink t -> string -> unit
(** Set the last start tag name.

    Used by the tree builder to track the context for end tag matching:
    in the RCDATA/RAWTEXT/script-data end-tag states, an end tag only
    terminates the raw section when its name matches the last start tag
    (the WHATWG "appropriate end tag token" rule).
*)
247*)