(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy . All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** HTML5 Tokenizer

    This module implements the WHATWG HTML5 tokenization algorithm. The
    tokenizer converts an input byte stream into a sequence of tokens
    (start tags, end tags, text, comments, doctypes) that can be consumed
    by a tree builder. *)

(** {1 Sub-modules} *)

(** Token types produced by the tokenizer. *)
module Token : sig
  type tag_kind = Tokenizer_token.tag_kind = Start | End
  (** Whether a tag token opens ([Start]) or closes ([End]) an element. *)

  type doctype = Tokenizer_token.doctype = {
    name : string option;
    public_id : string option;
    system_id : string option;
    force_quirks : bool;
  }
  (** Contents of a DOCTYPE token. Each identifier is [None] when it was
      absent from the input. *)

  type tag = Tokenizer_token.tag = {
    kind : tag_kind;
    name : string;
    attrs : (string * string) list;
    self_closing : bool;
  }
  (** Contents of a start- or end-tag token: its kind, name,
      attribute name/value pairs, and whether it was written with a
      self-closing slash. *)

  type t = Tokenizer_token.t =
    | Tag of tag
    | Character of string
    | Comment of string
    | Doctype of doctype
    | EOF
        (** A single token emitted by the tokenizer. *)

  val make_start_tag : string -> (string * string) list -> bool -> t
  (** [make_start_tag name attrs self_closing] builds a [Tag] token with
      kind [Start]. *)

  val make_end_tag : string -> t
  (** [make_end_tag name] builds a [Tag] token with kind [End] for [name]. *)

  val make_doctype :
    ?name:string ->
    ?public_id:string ->
    ?system_id:string ->
    ?force_quirks:bool ->
    unit ->
    t
  (** [make_doctype ()] builds a [Doctype] token from the optional
      components. *)

  val make_comment : string -> t
  (** [make_comment data] builds a [Comment] token carrying [data]. *)

  val make_character : string -> t
  (** [make_character data] builds a [Character] (text) token carrying
      [data]. *)

  val eof : t
  (** The [EOF] token. *)

  val pp_tag_kind : Format.formatter -> tag_kind -> unit
  (** Pretty-print a tag kind (Start or End). *)

  val pp_doctype : Format.formatter -> doctype -> unit
  (** Pretty-print a DOCTYPE token. *)

  val pp_tag : Format.formatter -> tag -> unit
  (** Pretty-print a tag token. *)

  val pp : Format.formatter -> t -> unit
  (** Pretty-print a token. *)
end

(** Tokenizer states.
    Constructor names mirror the state names of the WHATWG tokenization
    state machine. *)
module State : sig
  type t = Tokenizer_state.t =
    (* Content states *)
    | Data
    | Rcdata
    | Rawtext
    | Script_data
    | Plaintext
    (* Tag states *)
    | Tag_open
    | End_tag_open
    | Tag_name
    (* RCDATA / RAWTEXT end-tag states *)
    | Rcdata_less_than_sign
    | Rcdata_end_tag_open
    | Rcdata_end_tag_name
    | Rawtext_less_than_sign
    | Rawtext_end_tag_open
    | Rawtext_end_tag_name
    (* Script data (escaped / double-escaped) states *)
    | Script_data_less_than_sign
    | Script_data_end_tag_open
    | Script_data_end_tag_name
    | Script_data_escape_start
    | Script_data_escape_start_dash
    | Script_data_escaped
    | Script_data_escaped_dash
    | Script_data_escaped_dash_dash
    | Script_data_escaped_less_than_sign
    | Script_data_escaped_end_tag_open
    | Script_data_escaped_end_tag_name
    | Script_data_double_escape_start
    | Script_data_double_escaped
    | Script_data_double_escaped_dash
    | Script_data_double_escaped_dash_dash
    | Script_data_double_escaped_less_than_sign
    | Script_data_double_escape_end
    (* Attribute states *)
    | Before_attribute_name
    | Attribute_name
    | After_attribute_name
    | Before_attribute_value
    | Attribute_value_double_quoted
    | Attribute_value_single_quoted
    | Attribute_value_unquoted
    | After_attribute_value_quoted
    | Self_closing_start_tag
    (* Comment states *)
    | Bogus_comment
    | Markup_declaration_open
    | Comment_start
    | Comment_start_dash
    | Comment
    | Comment_less_than_sign
    | Comment_less_than_sign_bang
    | Comment_less_than_sign_bang_dash
    | Comment_less_than_sign_bang_dash_dash
    | Comment_end_dash
    | Comment_end
    | Comment_end_bang
    (* DOCTYPE states *)
    | Doctype
    | Before_doctype_name
    | Doctype_name
    | After_doctype_name
    | After_doctype_public_keyword
    | Before_doctype_public_identifier
    | Doctype_public_identifier_double_quoted
    | Doctype_public_identifier_single_quoted
    | After_doctype_public_identifier
    | Between_doctype_public_and_system_identifiers
    | After_doctype_system_keyword
    | Before_doctype_system_identifier
    | Doctype_system_identifier_double_quoted
    | Doctype_system_identifier_single_quoted
    | After_doctype_system_identifier
    | Bogus_doctype
    (* CDATA states *)
    | Cdata_section
    | Cdata_section_bracket
    | Cdata_section_end
    (* Character-reference states *)
    | Character_reference
    | Named_character_reference
    | Ambiguous_ampersand
    | Numeric_character_reference
    |
Hexadecimal_character_reference_start
    | Decimal_character_reference_start
    | Hexadecimal_character_reference
    | Decimal_character_reference
    | Numeric_character_reference_end

  val pp : Format.formatter -> t -> unit
  (** Pretty-print a tokenizer state. *)
end

(** Parse error types. *)
module Errors : sig
  type t = Tokenizer_errors.t = {
    code : Parse_error_code.t;
    line : int;
    column : int;
  }
  (** A parse error: its {!Parse_error_code.t} code and the source
      position at which it occurred.
      NOTE(review): whether [line]/[column] are 0- or 1-based is not
      visible here — confirm against [Tokenizer_errors]. *)

  val make : code:string -> line:int -> column:int -> t
  (** Create an error from a string code. The string is converted to
      {!Parse_error_code.t} using {!Parse_error_code.of_string}. *)

  val make_with_code : code:Parse_error_code.t -> line:int -> column:int -> t
  (** Create an error with a typed error code. *)

  val to_string : t -> string
  (** Render the error as a string. *)

  val pp : Format.formatter -> t -> unit
  (** Pretty-print a tokenizer error. *)
end

(** Input stream with position tracking. *)
module Stream : sig
  type t = Tokenizer_stream.t
  (** An input stream consumed by the tokenizer. *)

  val create : string -> t
  (** [create s] builds a stream reading from the in-memory string [s]. *)

  val create_from_reader : Bytesrw.Bytes.Reader.t -> t
  (** [create_from_reader r] builds a stream reading from the
      {!Bytesrw.Bytes.Reader.t} [r]. *)

  val set_error_callback : t -> (string -> unit) -> unit
  (** [set_error_callback t f] installs [f] as the stream's error callback.
      NOTE(review): the [string] passed to [f] is presumably an error
      description — confirm against [Tokenizer_stream]. *)

  val position : t -> int * int
  (** [position t] returns the current position as a pair of integers.
      NOTE(review): presumably [(line, column)] — confirm against
      [Tokenizer_stream]. *)
end

(** {1 Token Sink Interface} *)

(** Interface for token consumers. The tokenizer calls [process] for each
    token it produces. The sink can return [`Continue] to keep tokenizing,
    or [`SwitchTo state] to change the tokenizer state (used by the tree
    builder for things like [