OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at f7c69be4eae5476a0985d55de71f2cc34c8d5361 247 lines 7.0 kB view raw
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** HTML5 Tokenizer

    This module implements the WHATWG HTML5 tokenization algorithm. The
    tokenizer converts an input byte stream into a sequence of tokens
    (start tags, end tags, text, comments, doctypes) that can be consumed
    by a tree builder. *)

(** {1 Sub-modules} *)

(** Token types produced by the tokenizer. *)
module Token : sig
  type tag_kind = Tokenizer_token.tag_kind = Start | End

  type doctype = Tokenizer_token.doctype = {
    name : string option;
    public_id : string option;
    system_id : string option;
    force_quirks : bool;
  }

  type tag = Tokenizer_token.tag = {
    kind : tag_kind;
    name : string;
    attrs : (string * string) list;
    self_closing : bool;
  }

  type t = Tokenizer_token.t =
    | Tag of tag
    | Character of string
    | Comment of string
    | Doctype of doctype
    | EOF

  val make_start_tag : string -> (string * string) list -> bool -> t
  (** [make_start_tag name attrs self_closing] builds a start-tag token. *)

  val make_end_tag : string -> t
  (** [make_end_tag name] builds an end-tag token. *)

  val make_doctype :
    ?name:string ->
    ?public_id:string ->
    ?system_id:string ->
    ?force_quirks:bool ->
    unit ->
    t
  (** Build a DOCTYPE token; omitted optional fields default to absent
      (and [force_quirks] presumably to [false] — see [Tokenizer_token]). *)

  val make_comment : string -> t
  (** Build a comment token from its text. *)

  val make_character : string -> t
  (** Build a character (text) token. *)

  val eof : t
  (** The end-of-file token. *)

  val pp_tag_kind : Format.formatter -> tag_kind -> unit
  (** Pretty-print a tag kind (Start or End). *)

  val pp_doctype : Format.formatter -> doctype -> unit
  (** Pretty-print a DOCTYPE token. *)

  val pp_tag : Format.formatter -> tag -> unit
  (** Pretty-print a tag token. *)

  val pp : Format.formatter -> t -> unit
  (** Pretty-print a token. *)
end

(** Tokenizer states.

    One constructor per state of the WHATWG HTML tokenization state
    machine. *)
module State : sig
  type t = Tokenizer_state.t =
    | Data
    | Rcdata
    | Rawtext
    | Script_data
    | Plaintext
    | Tag_open
    | End_tag_open
    | Tag_name
    | Rcdata_less_than_sign
    | Rcdata_end_tag_open
    | Rcdata_end_tag_name
    | Rawtext_less_than_sign
    | Rawtext_end_tag_open
    | Rawtext_end_tag_name
    | Script_data_less_than_sign
    | Script_data_end_tag_open
    | Script_data_end_tag_name
    | Script_data_escape_start
    | Script_data_escape_start_dash
    | Script_data_escaped
    | Script_data_escaped_dash
    | Script_data_escaped_dash_dash
    | Script_data_escaped_less_than_sign
    | Script_data_escaped_end_tag_open
    | Script_data_escaped_end_tag_name
    | Script_data_double_escape_start
    | Script_data_double_escaped
    | Script_data_double_escaped_dash
    | Script_data_double_escaped_dash_dash
    | Script_data_double_escaped_less_than_sign
    | Script_data_double_escape_end
    | Before_attribute_name
    | Attribute_name
    | After_attribute_name
    | Before_attribute_value
    | Attribute_value_double_quoted
    | Attribute_value_single_quoted
    | Attribute_value_unquoted
    | After_attribute_value_quoted
    | Self_closing_start_tag
    | Bogus_comment
    | Markup_declaration_open
    | Comment_start
    | Comment_start_dash
    | Comment
    | Comment_less_than_sign
    | Comment_less_than_sign_bang
    | Comment_less_than_sign_bang_dash
    | Comment_less_than_sign_bang_dash_dash
    | Comment_end_dash
    | Comment_end
    | Comment_end_bang
    | Doctype
    | Before_doctype_name
    | Doctype_name
    | After_doctype_name
    | After_doctype_public_keyword
    | Before_doctype_public_identifier
    | Doctype_public_identifier_double_quoted
    | Doctype_public_identifier_single_quoted
    | After_doctype_public_identifier
    | Between_doctype_public_and_system_identifiers
    | After_doctype_system_keyword
    | Before_doctype_system_identifier
    | Doctype_system_identifier_double_quoted
    | Doctype_system_identifier_single_quoted
    | After_doctype_system_identifier
    | Bogus_doctype
    | Cdata_section
    | Cdata_section_bracket
    | Cdata_section_end
    | Character_reference
    | Named_character_reference
    | Ambiguous_ampersand
    | Numeric_character_reference
    | Hexadecimal_character_reference_start
    | Decimal_character_reference_start
    | Hexadecimal_character_reference
    | Decimal_character_reference
    | Numeric_character_reference_end

  val pp : Format.formatter -> t -> unit
  (** Pretty-print a tokenizer state. *)
end

(** Parse error types. *)
module Errors : sig
  type t = Tokenizer_errors.t = {
    code : Parse_error_code.t;
    line : int;
    column : int;
  }

  val make : code:string -> line:int -> column:int -> t
  (** Create an error from a string code. The string is converted to
      {!Parse_error_code.t} using {!Parse_error_code.of_string}. *)

  val make_with_code : code:Parse_error_code.t -> line:int -> column:int -> t
  (** Create an error with a typed error code. *)

  val to_string : t -> string
  (** Render the error as a human-readable string. *)

  val pp : Format.formatter -> t -> unit
  (** Pretty-print a tokenizer error. *)
end

(** Input stream with position tracking. *)
module Stream : sig
  type t = Tokenizer_stream.t

  val create : string -> t
  (** Create a stream over an in-memory string. *)

  val create_from_reader : Bytesrw.Bytes.Reader.t -> t
  (** Create a stream over a byte reader. *)

  val set_error_callback : t -> (string -> unit) -> unit
  (** Install a callback invoked with stream-level error messages. *)

  val position : t -> int * int
  (** Current position as a [(line, column)] pair — TODO confirm order
      against [Tokenizer_stream]. *)
end

(** {1 Token Sink Interface} *)

(** Interface for token consumers.

    The tokenizer calls [process] for each token it produces. The sink
    can return [`Continue] to keep tokenizing, or [`SwitchTo state] to
    change the tokenizer state (used by the tree builder for things like
    [<script>] and [<textarea>]). *)
module type SINK = sig
  type t

  val process :
    t ->
    Tokenizer_token.t ->
    line:int ->
    column:int ->
    [ `Continue | `SwitchTo of Tokenizer_state.t ]

  val adjusted_current_node_in_html_namespace : t -> bool
end

(** {1 Tokenizer} *)

(** The tokenizer type, parameterized by the sink type. *)
type 'sink t

val create :
  (module SINK with type t = 'sink) ->
  'sink ->
  ?collect_errors:bool ->
  ?xml_mode:bool ->
  unit ->
  'sink t
(** Create a new tokenizer.

    @param sink The token sink that will receive tokens
    @param collect_errors If [true], collect parse errors (default: [false])
    @param xml_mode If [true], apply XML compatibility transformations *)

val run :
  'sink t ->
  (module SINK with type t = 'sink) ->
  Bytesrw.Bytes.Reader.t ->
  unit
(** Run the tokenizer on the given input.

    The tokenizer will read from the reader and call the sink's [process]
    function for each token until EOF is reached. *)

val get_errors : 'sink t -> Tokenizer_errors.t list
(** Get the list of parse errors encountered during tokenization.

    Only populated if [collect_errors:true] was passed to {!create}. *)

val set_state : 'sink t -> Tokenizer_state.t -> unit
(** Set the tokenizer state.

    Used by the tree builder to switch states for raw text elements. *)

val set_last_start_tag : 'sink t -> string -> unit
(** Set the last start tag name.

    Used by the tree builder to track the context for end tag matching. *)