(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy . All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** HTML5 Tokenizer

    This module implements the WHATWG HTML5 tokenization algorithm. The
    tokenizer converts an input byte stream into a sequence of tokens
    (start tags, end tags, text, comments, doctypes) that can be consumed
    by a tree builder. *)

(** {1 Sub-modules} *)

(** Token types produced by the tokenizer. *)
module Token : sig
  type tag_kind = Tokenizer_token.tag_kind = Start | End
  (** Whether a tag token opens ([Start]) or closes ([End]) an element. *)

  type doctype = Tokenizer_token.doctype = {
    name : string option;
    public_id : string option;
    system_id : string option;
    force_quirks : bool;
  }
  (** Contents of a DOCTYPE token. Each identifier is [None] when it was
      absent from the input. *)

  type tag = Tokenizer_token.tag = {
    kind : tag_kind;
    name : string;
    attrs : (string * string) list;
    self_closing : bool;
  }
  (** Contents of a start- or end-tag token: its kind, name,
      attribute name/value pairs, and whether it was written with a
      self-closing slash. *)

  type t = Tokenizer_token.t =
    | Tag of tag
    | Character of string
    | Comment of string
    | Doctype of doctype
    | EOF
        (** A single token emitted by the tokenizer. *)

  val make_start_tag : string -> (string * string) list -> bool -> t
  (** [make_start_tag name attrs self_closing] builds a [Tag] token with
      kind [Start]. *)

  val make_end_tag : string -> t
  (** [make_end_tag name] builds a [Tag] token with kind [End] for [name]. *)

  val make_doctype :
    ?name:string ->
    ?public_id:string ->
    ?system_id:string ->
    ?force_quirks:bool ->
    unit ->
    t
  (** [make_doctype ()] builds a [Doctype] token from the optional
      components. *)

  val make_comment : string -> t
  (** [make_comment data] builds a [Comment] token carrying [data]. *)

  val make_character : string -> t
  (** [make_character data] builds a [Character] (text) token carrying
      [data]. *)

  val eof : t
  (** The [EOF] token. *)

  val pp_tag_kind : Format.formatter -> tag_kind -> unit
  (** Pretty-print a tag kind (Start or End). *)

  val pp_doctype : Format.formatter -> doctype -> unit
  (** Pretty-print a DOCTYPE token. *)

  val pp_tag : Format.formatter -> tag -> unit
  (** Pretty-print a tag token. *)

  val pp : Format.formatter -> t -> unit
  (** Pretty-print a token. *)
end

(** Tokenizer states.
    Constructor names mirror the state names of the WHATWG tokenization
    state machine. *)
module State : sig
  type t = Tokenizer_state.t =
    (* Content states *)
    | Data
    | Rcdata
    | Rawtext
    | Script_data
    | Plaintext
    (* Tag states *)
    | Tag_open
    | End_tag_open
    | Tag_name
    (* RCDATA / RAWTEXT end-tag states *)
    | Rcdata_less_than_sign
    | Rcdata_end_tag_open
    | Rcdata_end_tag_name
    | Rawtext_less_than_sign
    | Rawtext_end_tag_open
    | Rawtext_end_tag_name
    (* Script data (escaped / double-escaped) states *)
    | Script_data_less_than_sign
    | Script_data_end_tag_open
    | Script_data_end_tag_name
    | Script_data_escape_start
    | Script_data_escape_start_dash
    | Script_data_escaped
    | Script_data_escaped_dash
    | Script_data_escaped_dash_dash
    | Script_data_escaped_less_than_sign
    | Script_data_escaped_end_tag_open
    | Script_data_escaped_end_tag_name
    | Script_data_double_escape_start
    | Script_data_double_escaped
    | Script_data_double_escaped_dash
    | Script_data_double_escaped_dash_dash
    | Script_data_double_escaped_less_than_sign
    | Script_data_double_escape_end
    (* Attribute states *)
    | Before_attribute_name
    | Attribute_name
    | After_attribute_name
    | Before_attribute_value
    | Attribute_value_double_quoted
    | Attribute_value_single_quoted
    | Attribute_value_unquoted
    | After_attribute_value_quoted
    | Self_closing_start_tag
    (* Comment states *)
    | Bogus_comment
    | Markup_declaration_open
    | Comment_start
    | Comment_start_dash
    | Comment
    | Comment_less_than_sign
    | Comment_less_than_sign_bang
    | Comment_less_than_sign_bang_dash
    | Comment_less_than_sign_bang_dash_dash
    | Comment_end_dash
    | Comment_end
    | Comment_end_bang
    (* DOCTYPE states *)
    | Doctype
    | Before_doctype_name
    | Doctype_name
    | After_doctype_name
    | After_doctype_public_keyword
    | Before_doctype_public_identifier
    | Doctype_public_identifier_double_quoted
    | Doctype_public_identifier_single_quoted
    | After_doctype_public_identifier
    | Between_doctype_public_and_system_identifiers
    | After_doctype_system_keyword
    | Before_doctype_system_identifier
    | Doctype_system_identifier_double_quoted
    | Doctype_system_identifier_single_quoted
    | After_doctype_system_identifier
    | Bogus_doctype
    (* CDATA states *)
    | Cdata_section
    | Cdata_section_bracket
    | Cdata_section_end
    (* Character-reference states *)
    | Character_reference
    | Named_character_reference
    | Ambiguous_ampersand
    | Numeric_character_reference
    |
Hexadecimal_character_reference_start
    | Decimal_character_reference_start
    | Hexadecimal_character_reference
    | Decimal_character_reference
    | Numeric_character_reference_end

  val pp : Format.formatter -> t -> unit
  (** Pretty-print a tokenizer state. *)
end

(** Parse error types. *)
module Errors : sig
  type t = Tokenizer_errors.t = {
    code : Parse_error_code.t;
    line : int;
    column : int;
  }
  (** A parse error: its {!Parse_error_code.t} code and the source
      position at which it occurred.
      NOTE(review): whether [line]/[column] are 0- or 1-based is not
      visible here — confirm against [Tokenizer_errors]. *)

  val make : code:string -> line:int -> column:int -> t
  (** Create an error from a string code. The string is converted to
      {!Parse_error_code.t} using {!Parse_error_code.of_string}. *)

  val make_with_code : code:Parse_error_code.t -> line:int -> column:int -> t
  (** Create an error with a typed error code. *)

  val to_string : t -> string
  (** Render the error as a string. *)

  val pp : Format.formatter -> t -> unit
  (** Pretty-print a tokenizer error. *)
end

(** Input stream with position tracking. *)
module Stream : sig
  type t = Tokenizer_stream.t
  (** An input stream consumed by the tokenizer. *)

  val create : string -> t
  (** [create s] builds a stream reading from the in-memory string [s]. *)

  val create_from_reader : Bytesrw.Bytes.Reader.t -> t
  (** [create_from_reader r] builds a stream reading from the
      {!Bytesrw.Bytes.Reader.t} [r]. *)

  val set_error_callback : t -> (string -> unit) -> unit
  (** [set_error_callback t f] installs [f] as the stream's error callback.
      NOTE(review): the [string] passed to [f] is presumably an error
      description — confirm against [Tokenizer_stream]. *)

  val position : t -> int * int
  (** [position t] returns the current position as a pair of integers.
      NOTE(review): presumably [(line, column)] — confirm against
      [Tokenizer_stream]. *)
end

(** {1 Token Sink Interface} *)

(** Interface for token consumers. The tokenizer calls [process] for each
    token it produces. The sink can return [`Continue] to keep tokenizing,
    or [`SwitchTo state] to change the tokenizer state (used by the tree
    builder for things like [