(* HTML5 Tokenizer - implements WHATWG tokenization algorithm *) (* Character classification using Astring *) let is_ascii_alpha = Astring.Char.Ascii.is_letter let is_ascii_digit = Astring.Char.Ascii.is_digit let is_ascii_hex = Astring.Char.Ascii.is_hex_digit let is_ascii_alnum = Astring.Char.Ascii.is_alphanum let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' let ascii_lower = Astring.Char.Ascii.lowercase (* Token sink interface *) module type SINK = sig type t val process : t -> Tokenizer_token.t -> line:int -> column:int -> [ `Continue | `SwitchTo of Tokenizer_state.t ] val adjusted_current_node_in_html_namespace : t -> bool end type 'sink t = { mutable stream : Tokenizer_stream.t; sink : 'sink; mutable state : Tokenizer_state.t; mutable return_state : Tokenizer_state.t; mutable char_ref_code : int; mutable temp_buffer : Buffer.t; mutable last_start_tag : string; mutable current_tag_name : Buffer.t; mutable current_tag_kind : Tokenizer_token.tag_kind; mutable current_tag_self_closing : bool; mutable current_attr_name : Buffer.t; mutable current_attr_value : Buffer.t; mutable current_attrs : (string * string) list; mutable current_doctype_name : Buffer.t option; mutable current_doctype_public : Buffer.t option; mutable current_doctype_system : Buffer.t option; mutable current_doctype_force_quirks : bool; mutable current_comment : Buffer.t; mutable pending_chars : Buffer.t; mutable errors : Tokenizer_errors.t list; collect_errors : bool; xml_mode : bool; (* XML violation mode: transform chars for XML compatibility *) } let create (type s) (module S : SINK with type t = s) sink ?(collect_errors=false) ?(xml_mode=false) () = { stream = Tokenizer_stream.create ""; sink; state = Tokenizer_state.Data; return_state = Tokenizer_state.Data; char_ref_code = 0; temp_buffer = Buffer.create 64; last_start_tag = ""; current_tag_name = Buffer.create 32; current_tag_kind = Tokenizer_token.Start; current_tag_self_closing = false; current_attr_name = Buffer.create 32; current_attr_value = Buffer.create 64; current_attrs = []; current_doctype_name = None; current_doctype_public = None; current_doctype_system = None; current_doctype_force_quirks = false; current_comment = Buffer.create 64; pending_chars = Buffer.create 256; errors = []; collect_errors; xml_mode; } let error t code = if t.collect_errors then begin let (line, column) = Tokenizer_stream.position t.stream in t.errors <- Tokenizer_errors.make ~code ~line ~column :: t.errors end (* emit functions are defined locally inside run *) (* XML mode character transformation: form feed → space *) let emit_char t c = if t.xml_mode && c = '\x0C' then Buffer.add_char t.pending_chars ' ' else Buffer.add_char t.pending_chars c (* XML mode string transformation: U+FFFF → U+FFFD, form feed → space *) let emit_str t s = if t.xml_mode then begin (* Transform: \xEF\xBF\xBF (U+FFFF) → \xEF\xBF\xBD (U+FFFD), \x0C → space *) let len = String.length s in let i = ref 0 in while !i < len do let c = s.[!i] in if c = '\x0C' then begin Buffer.add_char t.pending_chars ' '; incr i end else if c = '\xEF' && !i + 2 < len && s.[!i+1] = '\xBF' && s.[!i+2] = '\xBF' then begin (* U+FFFF → U+FFFD *) Buffer.add_string t.pending_chars "\xEF\xBF\xBD"; i := !i + 3 end else begin Buffer.add_char t.pending_chars c; incr i end done end else Buffer.add_string t.pending_chars s let start_new_tag t kind = Buffer.clear t.current_tag_name; t.current_tag_kind <- kind; t.current_tag_self_closing <- false; t.current_attrs <- [] let start_new_attribute t = (* Save 
previous attribute if any *) let name = Buffer.contents t.current_attr_name in if String.length name > 0 then begin let value = Buffer.contents t.current_attr_value in (* Check for duplicates - only add if not already present *) if not (List.exists (fun (n, _) -> n = name) t.current_attrs) then t.current_attrs <- (name, value) :: t.current_attrs else error t "duplicate-attribute" end; Buffer.clear t.current_attr_name; Buffer.clear t.current_attr_value let finish_attribute t = start_new_attribute t let start_new_doctype t = t.current_doctype_name <- None; t.current_doctype_public <- None; t.current_doctype_system <- None; t.current_doctype_force_quirks <- false (* emit_current_tag, emit_current_doctype, emit_current_comment are defined locally inside run *) let is_appropriate_end_tag t = let name = Buffer.contents t.current_tag_name in String.length t.last_start_tag > 0 && name = t.last_start_tag let flush_code_points_consumed_as_char_ref t = let s = Buffer.contents t.temp_buffer in match t.return_state with | Tokenizer_state.Attribute_value_double_quoted | Tokenizer_state.Attribute_value_single_quoted | Tokenizer_state.Attribute_value_unquoted -> Buffer.add_string t.current_attr_value s | _ -> emit_str t s open Bytesrw (* Main tokenization loop *) let run (type s) t (module S : SINK with type t = s) (reader : Bytes.Reader.t) = t.stream <- Tokenizer_stream.create_from_reader reader; t.errors <- []; (* Set up error callback for surrogate/noncharacter detection in stream *) (* In XML mode, we don't report noncharacter errors - we transform them instead *) if not t.xml_mode then Tokenizer_stream.set_error_callback t.stream (fun code -> error t code); (* XML mode transformation for pending chars: U+FFFF → U+FFFD *) let transform_xml_chars data = let len = String.length data in let buf = Buffer.create len in let i = ref 0 in while !i < len do let c = data.[!i] in if c = '\xEF' && !i + 2 < len && data.[!i+1] = '\xBF' && data.[!i+2] = '\xBF' then begin (* U+FFFF → U+FFFD *) Buffer.add_string buf "\xEF\xBF\xBD"; i := !i + 3 end else begin Buffer.add_char buf c; incr i end done; Buffer.contents buf in (* Local emit functions with access to S *) let emit_pending_chars () = if Buffer.length t.pending_chars > 0 then begin let data = Buffer.contents t.pending_chars in Buffer.clear t.pending_chars; let data = if t.xml_mode then transform_xml_chars data else data in let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink (Tokenizer_token.Character data) ~line ~column) end in let emit token = emit_pending_chars (); let line, column = Tokenizer_stream.position t.stream in match S.process t.sink token ~line ~column with | `Continue -> () | `SwitchTo new_state -> t.state <- new_state in let emit_current_tag () = finish_attribute t; let name = Buffer.contents t.current_tag_name in let attrs = List.rev t.current_attrs in (* Check for end tag with attributes or self-closing flag *) if t.current_tag_kind = Tokenizer_token.End then begin if attrs <> [] then error t "end-tag-with-attributes"; if t.current_tag_self_closing then error t "end-tag-with-trailing-solidus" end; let tag = { Tokenizer_token.kind = t.current_tag_kind; name; attrs; self_closing = t.current_tag_self_closing; } in if t.current_tag_kind = Tokenizer_token.Start then t.last_start_tag <- name; emit (Tokenizer_token.Tag tag) in let emit_current_doctype () = let doctype = { Tokenizer_token.name = Option.map Buffer.contents t.current_doctype_name; public_id = Option.map Buffer.contents t.current_doctype_public; system_id = 
Option.map Buffer.contents t.current_doctype_system; force_quirks = t.current_doctype_force_quirks; } in
    emit (Tokenizer_token.Doctype doctype)
  in
  let emit_current_comment () =
    let content = Buffer.contents t.current_comment in
    let content =
      if t.xml_mode then begin
        (* XML mode: transform -- to - - in comments *)
        let buf = Buffer.create (String.length content + 10) in
        let len = String.length content in
        let i = ref 0 in
        while !i < len do
          if !i + 1 < len && content.[!i] = '-' && content.[!i+1] = '-' then begin
            Buffer.add_string buf "- -"; i := !i + 2
          end else begin
            Buffer.add_char buf content.[!i]; incr i
          end
        done;
        Buffer.contents buf
      end else content
    in
    emit (Tokenizer_token.Comment content)
  in
  (* Check for control characters and emit an error if needed. *)
  (* Only checks ASCII control chars; C1 controls (U+0080-U+009F) are 2-byte in UTF-8. *)
  let check_control_char c =
    let code = Char.code c in
    (* Control chars: U+0001-U+0008, U+000B, U+000E-U+001F, U+007F *)
    (* Allowed: U+0009 (tab), U+000A (LF), U+000C (FF), U+000D (CR) *)
    (* Note: U+0080-U+009F (C1 controls) are 2-byte UTF-8 sequences starting with 0xC2 *)
    (* Note: We only check single-byte control chars here; multi-byte checks are TODO *)
    if (code >= 0x01 && code <= 0x08) || code = 0x0B || (code >= 0x0E && code <= 0x1F) || code = 0x7F then
      error t (Printf.sprintf "control-character-in-input-stream:%04x" code)
  in
  (* Emit char with control character check *)
  let emit_char_checked c = check_control_char c; emit_char t c in
  let rec process_state () =
    if Tokenizer_stream.is_eof t.stream && t.state <> Tokenizer_state.Data then begin
      (* Handle EOF in various states *)
      handle_eof ()
    end else if Tokenizer_stream.is_eof t.stream then begin
      emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    end else begin
      step ();
      process_state ()
    end
  and handle_eof () = match t.state with
    | Tokenizer_state.Data ->
      emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Tag_open ->
      error t "eof-before-tag-name"; emit_char t '<'; emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.End_tag_open ->
      error t "eof-before-tag-name"; emit_str t "</"; emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Tag_name | Tokenizer_state.Before_attribute_name
    | Tokenizer_state.Attribute_name | Tokenizer_state.After_attribute_name
    | Tokenizer_state.Before_attribute_value
    | Tokenizer_state.Attribute_value_double_quoted
    | Tokenizer_state.Attribute_value_single_quoted
    | Tokenizer_state.Attribute_value_unquoted
    | Tokenizer_state.After_attribute_value_quoted
    | Tokenizer_state.Self_closing_start_tag ->
      error t "eof-in-tag"; emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Rawtext ->
      emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Rawtext_less_than_sign ->
      emit_char t '<'; emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Rawtext_end_tag_open ->
      emit_str t "</"; emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Rawtext_end_tag_name ->
      emit_str t "</"; emit_str t (Buffer.contents t.temp_buffer); emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Rcdata ->
      emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Rcdata_less_than_sign ->
      emit_char t '<'; emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Rcdata_end_tag_open ->
      emit_str t "</"; emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Rcdata_end_tag_name ->
      emit_str t "</"; emit_str t (Buffer.contents t.temp_buffer); emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Script_data ->
      emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Script_data_less_than_sign ->
      emit_char t '<'; emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Script_data_end_tag_open ->
      emit_str t "</"; emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Script_data_end_tag_name ->
      emit_str t "</"; emit_str t (Buffer.contents t.temp_buffer); emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Script_data_escape_start | Tokenizer_state.Script_data_escape_start_dash
    | Tokenizer_state.Script_data_escaped | Tokenizer_state.Script_data_escaped_dash
    | Tokenizer_state.Script_data_escaped_dash_dash ->
      error t "eof-in-script-html-comment-like-text"; emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Script_data_escaped_less_than_sign ->
      emit_char t '<'; emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Script_data_escaped_end_tag_open ->
      emit_str t "</"; emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Script_data_escaped_end_tag_name ->
      emit_str t "</"; emit_str t (Buffer.contents t.temp_buffer); emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Script_data_double_escape_start | Tokenizer_state.Script_data_double_escaped
    | Tokenizer_state.Script_data_double_escaped_dash
    | Tokenizer_state.Script_data_double_escaped_dash_dash ->
      error t "eof-in-script-html-comment-like-text"; emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Script_data_double_escaped_less_than_sign ->
      (* '<' was already emitted when entering this state from Script_data_double_escaped *)
      emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Script_data_double_escape_end ->
      emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Plaintext ->
      emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Comment_start | Tokenizer_state.Comment_start_dash | Tokenizer_state.Comment
    | Tokenizer_state.Comment_less_than_sign | Tokenizer_state.Comment_less_than_sign_bang
    | Tokenizer_state.Comment_less_than_sign_bang_dash
    | Tokenizer_state.Comment_less_than_sign_bang_dash_dash
    | Tokenizer_state.Comment_end_dash | Tokenizer_state.Comment_end
    | Tokenizer_state.Comment_end_bang ->
      error t "eof-in-comment"; emit_current_comment (); emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Bogus_comment ->
      emit_current_comment (); emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Markup_declaration_open ->
      error t "incorrectly-opened-comment"; Buffer.clear t.current_comment;
      emit_current_comment (); emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Doctype | Tokenizer_state.Before_doctype_name ->
      error t "eof-in-doctype"; start_new_doctype t; t.current_doctype_force_quirks <- true;
      emit_current_doctype (); emit_pending_chars ();
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
    | Tokenizer_state.Doctype_name | Tokenizer_state.After_doctype_name
    | Tokenizer_state.After_doctype_public_keyword
    | Tokenizer_state.Before_doctype_public_identifier
    | Tokenizer_state.Doctype_public_identifier_double_quoted
    | Tokenizer_state.Doctype_public_identifier_single_quoted
    | Tokenizer_state.After_doctype_public_identifier
    | Tokenizer_state.Between_doctype_public_and_system_identifiers
    | Tokenizer_state.After_doctype_system_keyword
    | Tokenizer_state.Before_doctype_system_identifier
    | Tokenizer_state.Doctype_system_identifier_double_quoted |
Tokenizer_state.Doctype_system_identifier_single_quoted | Tokenizer_state.After_doctype_system_identifier -> error t "eof-in-doctype"; t.current_doctype_force_quirks <- true; emit_current_doctype (); emit_pending_chars (); let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) | Tokenizer_state.Bogus_doctype -> emit_current_doctype (); emit_pending_chars (); let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) | Tokenizer_state.Cdata_section -> error t "eof-in-cdata"; emit_pending_chars (); let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) | Tokenizer_state.Cdata_section_bracket -> error t "eof-in-cdata"; emit_char t ']'; emit_pending_chars (); let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) | Tokenizer_state.Cdata_section_end -> error t "eof-in-cdata"; emit_str t "]]"; emit_pending_chars (); let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) | Tokenizer_state.Character_reference -> (* state_character_reference never ran, so initialize temp_buffer with & *) Buffer.clear t.temp_buffer; Buffer.add_char t.temp_buffer '&'; flush_code_points_consumed_as_char_ref t; t.state <- t.return_state; handle_eof () | Tokenizer_state.Named_character_reference -> flush_code_points_consumed_as_char_ref t; t.state <- t.return_state; handle_eof () | Tokenizer_state.Numeric_character_reference -> (* At EOF with just "&#" - no digits follow *) error t "absence-of-digits-in-numeric-character-reference"; flush_code_points_consumed_as_char_ref t; t.state <- t.return_state; handle_eof () | Tokenizer_state.Hexadecimal_character_reference_start | Tokenizer_state.Decimal_character_reference_start -> error t "absence-of-digits-in-numeric-character-reference"; flush_code_points_consumed_as_char_ref t; t.state <- t.return_state; handle_eof () | Tokenizer_state.Numeric_character_reference_end -> (* We have collected digits, just need to finalize the character reference *) step (); handle_eof () | Tokenizer_state.Ambiguous_ampersand -> (* Buffer was already flushed when entering this state, just transition *) t.state <- t.return_state; handle_eof () | Tokenizer_state.Hexadecimal_character_reference | Tokenizer_state.Decimal_character_reference -> (* At EOF with collected digits - convert the numeric reference *) error t "missing-semicolon-after-character-reference"; let code = t.char_ref_code in let replacement_char = "\xEF\xBF\xBD" in let result = if code = 0 then begin error t "null-character-reference"; replacement_char end else if code > 0x10FFFF then begin error t "character-reference-outside-unicode-range"; replacement_char end else if code >= 0xD800 && code <= 0xDFFF then begin error t "surrogate-character-reference"; replacement_char end else Entities.Numeric_ref.codepoint_to_utf8 code in Buffer.clear t.temp_buffer; Buffer.add_string t.temp_buffer result; flush_code_points_consumed_as_char_ref t; t.state <- t.return_state; handle_eof () and step () = match t.state with | Tokenizer_state.Data -> state_data () | Tokenizer_state.Rcdata -> state_rcdata () | Tokenizer_state.Rawtext -> state_rawtext () | Tokenizer_state.Script_data -> state_script_data () | Tokenizer_state.Plaintext -> state_plaintext () | Tokenizer_state.Tag_open -> state_tag_open () | Tokenizer_state.End_tag_open -> 
state_end_tag_open () | Tokenizer_state.Tag_name -> state_tag_name () | Tokenizer_state.Rcdata_less_than_sign -> state_rcdata_less_than_sign () | Tokenizer_state.Rcdata_end_tag_open -> state_rcdata_end_tag_open () | Tokenizer_state.Rcdata_end_tag_name -> state_rcdata_end_tag_name () | Tokenizer_state.Rawtext_less_than_sign -> state_rawtext_less_than_sign () | Tokenizer_state.Rawtext_end_tag_open -> state_rawtext_end_tag_open () | Tokenizer_state.Rawtext_end_tag_name -> state_rawtext_end_tag_name () | Tokenizer_state.Script_data_less_than_sign -> state_script_data_less_than_sign () | Tokenizer_state.Script_data_end_tag_open -> state_script_data_end_tag_open () | Tokenizer_state.Script_data_end_tag_name -> state_script_data_end_tag_name () | Tokenizer_state.Script_data_escape_start -> state_script_data_escape_start () | Tokenizer_state.Script_data_escape_start_dash -> state_script_data_escape_start_dash () | Tokenizer_state.Script_data_escaped -> state_script_data_escaped () | Tokenizer_state.Script_data_escaped_dash -> state_script_data_escaped_dash () | Tokenizer_state.Script_data_escaped_dash_dash -> state_script_data_escaped_dash_dash () | Tokenizer_state.Script_data_escaped_less_than_sign -> state_script_data_escaped_less_than_sign () | Tokenizer_state.Script_data_escaped_end_tag_open -> state_script_data_escaped_end_tag_open () | Tokenizer_state.Script_data_escaped_end_tag_name -> state_script_data_escaped_end_tag_name () | Tokenizer_state.Script_data_double_escape_start -> state_script_data_double_escape_start () | Tokenizer_state.Script_data_double_escaped -> state_script_data_double_escaped () | Tokenizer_state.Script_data_double_escaped_dash -> state_script_data_double_escaped_dash () | Tokenizer_state.Script_data_double_escaped_dash_dash -> state_script_data_double_escaped_dash_dash () | Tokenizer_state.Script_data_double_escaped_less_than_sign -> state_script_data_double_escaped_less_than_sign () | Tokenizer_state.Script_data_double_escape_end -> state_script_data_double_escape_end () | Tokenizer_state.Before_attribute_name -> state_before_attribute_name () | Tokenizer_state.Attribute_name -> state_attribute_name () | Tokenizer_state.After_attribute_name -> state_after_attribute_name () | Tokenizer_state.Before_attribute_value -> state_before_attribute_value () | Tokenizer_state.Attribute_value_double_quoted -> state_attribute_value_double_quoted () | Tokenizer_state.Attribute_value_single_quoted -> state_attribute_value_single_quoted () | Tokenizer_state.Attribute_value_unquoted -> state_attribute_value_unquoted () | Tokenizer_state.After_attribute_value_quoted -> state_after_attribute_value_quoted () | Tokenizer_state.Self_closing_start_tag -> state_self_closing_start_tag () | Tokenizer_state.Bogus_comment -> state_bogus_comment () | Tokenizer_state.Markup_declaration_open -> state_markup_declaration_open () | Tokenizer_state.Comment_start -> state_comment_start () | Tokenizer_state.Comment_start_dash -> state_comment_start_dash () | Tokenizer_state.Comment -> state_comment () | Tokenizer_state.Comment_less_than_sign -> state_comment_less_than_sign () | Tokenizer_state.Comment_less_than_sign_bang -> state_comment_less_than_sign_bang () | Tokenizer_state.Comment_less_than_sign_bang_dash -> state_comment_less_than_sign_bang_dash () | Tokenizer_state.Comment_less_than_sign_bang_dash_dash -> state_comment_less_than_sign_bang_dash_dash () | Tokenizer_state.Comment_end_dash -> state_comment_end_dash () | Tokenizer_state.Comment_end -> state_comment_end () | 
Tokenizer_state.Comment_end_bang -> state_comment_end_bang () | Tokenizer_state.Doctype -> state_doctype () | Tokenizer_state.Before_doctype_name -> state_before_doctype_name () | Tokenizer_state.Doctype_name -> state_doctype_name () | Tokenizer_state.After_doctype_name -> state_after_doctype_name () | Tokenizer_state.After_doctype_public_keyword -> state_after_doctype_public_keyword () | Tokenizer_state.Before_doctype_public_identifier -> state_before_doctype_public_identifier () | Tokenizer_state.Doctype_public_identifier_double_quoted -> state_doctype_public_identifier_double_quoted () | Tokenizer_state.Doctype_public_identifier_single_quoted -> state_doctype_public_identifier_single_quoted () | Tokenizer_state.After_doctype_public_identifier -> state_after_doctype_public_identifier () | Tokenizer_state.Between_doctype_public_and_system_identifiers -> state_between_doctype_public_and_system_identifiers () | Tokenizer_state.After_doctype_system_keyword -> state_after_doctype_system_keyword () | Tokenizer_state.Before_doctype_system_identifier -> state_before_doctype_system_identifier () | Tokenizer_state.Doctype_system_identifier_double_quoted -> state_doctype_system_identifier_double_quoted () | Tokenizer_state.Doctype_system_identifier_single_quoted -> state_doctype_system_identifier_single_quoted () | Tokenizer_state.After_doctype_system_identifier -> state_after_doctype_system_identifier () | Tokenizer_state.Bogus_doctype -> state_bogus_doctype () | Tokenizer_state.Cdata_section -> state_cdata_section () | Tokenizer_state.Cdata_section_bracket -> state_cdata_section_bracket () | Tokenizer_state.Cdata_section_end -> state_cdata_section_end () | Tokenizer_state.Character_reference -> state_character_reference () | Tokenizer_state.Named_character_reference -> state_named_character_reference () | Tokenizer_state.Ambiguous_ampersand -> state_ambiguous_ampersand () | Tokenizer_state.Numeric_character_reference -> state_numeric_character_reference () | Tokenizer_state.Hexadecimal_character_reference_start -> state_hexadecimal_character_reference_start () | Tokenizer_state.Decimal_character_reference_start -> state_decimal_character_reference_start () | Tokenizer_state.Hexadecimal_character_reference -> state_hexadecimal_character_reference () | Tokenizer_state.Decimal_character_reference -> state_decimal_character_reference () | Tokenizer_state.Numeric_character_reference_end -> state_numeric_character_reference_end () (* State implementations *) and state_data () = match Tokenizer_stream.consume t.stream with | Some '&' -> t.return_state <- Tokenizer_state.Data; t.state <- Tokenizer_state.Character_reference | Some '<' -> t.state <- Tokenizer_state.Tag_open | Some '\x00' -> (* Emit pending chars first, then emit null separately for proper tree builder handling *) emit_pending_chars (); error t "unexpected-null-character"; let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink (Tokenizer_token.Character "\x00") ~line ~column) | Some c -> emit_char_checked c | None -> () and state_rcdata () = match Tokenizer_stream.consume t.stream with | Some '&' -> t.return_state <- Tokenizer_state.Rcdata; t.state <- Tokenizer_state.Character_reference | Some '<' -> t.state <- Tokenizer_state.Rcdata_less_than_sign | Some '\x00' -> error t "unexpected-null-character"; emit_str t "\xEF\xBF\xBD" | Some c -> emit_char_checked c | None -> () and state_rawtext () = match Tokenizer_stream.consume t.stream with | Some '<' -> t.state <- Tokenizer_state.Rawtext_less_than_sign | Some '\x00' 
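(* A note on the "\xEF\xBF\xBD" literals used throughout: they are the UTF-8
   encoding of U+FFFD REPLACEMENT CHARACTER, which the WHATWG algorithm
   substitutes for NUL bytes in RCDATA, RAWTEXT, script data and PLAINTEXT.
   A minimal sketch of that equivalence using only the standard library
   (illustrative, not part of the tokenizer):
     let rep_utf8 =
       let b = Buffer.create 3 in
       Buffer.add_utf_8_uchar b Uchar.rep;
       Buffer.contents b
     (* rep_utf8 = "\xEF\xBF\xBD" *)
*)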
-> error t "unexpected-null-character"; emit_str t "\xEF\xBF\xBD"
  | Some c -> emit_char_checked c
  | None -> ()
and state_script_data () = match Tokenizer_stream.consume t.stream with
  | Some '<' -> t.state <- Tokenizer_state.Script_data_less_than_sign
  | Some '\x00' -> error t "unexpected-null-character"; emit_str t "\xEF\xBF\xBD"
  | Some c -> emit_char_checked c
  | None -> ()
and state_plaintext () = match Tokenizer_stream.consume t.stream with
  | Some '\x00' -> error t "unexpected-null-character"; emit_str t "\xEF\xBF\xBD"
  | Some c -> emit_char_checked c
  | None -> ()
and state_tag_open () = match Tokenizer_stream.peek t.stream with
  | Some '!' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Markup_declaration_open
  | Some '/' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.End_tag_open
  | Some c when is_ascii_alpha c -> start_new_tag t Tokenizer_token.Start; t.state <- Tokenizer_state.Tag_name
  | Some '?' -> error t "unexpected-question-mark-instead-of-tag-name"; Buffer.clear t.current_comment; t.state <- Tokenizer_state.Bogus_comment
  | None -> error t "eof-before-tag-name"; emit_char t '<'
  | Some _ -> error t "invalid-first-character-of-tag-name"; emit_char t '<'; t.state <- Tokenizer_state.Data
and state_end_tag_open () = match Tokenizer_stream.peek t.stream with
  | Some c when is_ascii_alpha c -> start_new_tag t Tokenizer_token.End; t.state <- Tokenizer_state.Tag_name
  | Some '>' -> Tokenizer_stream.advance t.stream; error t "missing-end-tag-name"; t.state <- Tokenizer_state.Data
  | None -> error t "eof-before-tag-name"; emit_str t "</"
  | Some _ -> error t "invalid-first-character-of-tag-name"; Buffer.clear t.current_comment; t.state <- Tokenizer_state.Bogus_comment
and state_tag_name () = match Tokenizer_stream.consume t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') -> t.state <- Tokenizer_state.Before_attribute_name
  | Some '/' -> t.state <- Tokenizer_state.Self_closing_start_tag
  | Some '>' -> t.state <- Tokenizer_state.Data; emit_current_tag ()
  | Some '\x00' -> error t "unexpected-null-character"; Buffer.add_string t.current_tag_name "\xEF\xBF\xBD"
  | Some c -> check_control_char c; Buffer.add_char t.current_tag_name (ascii_lower c)
  | None -> ()
and state_rcdata_less_than_sign () = match Tokenizer_stream.peek t.stream with
  | Some '/' -> Tokenizer_stream.advance t.stream; Buffer.clear t.temp_buffer; t.state <- Tokenizer_state.Rcdata_end_tag_open
  | _ -> emit_char t '<'; t.state <- Tokenizer_state.Rcdata
and state_rcdata_end_tag_open () = match Tokenizer_stream.peek t.stream with
  | Some c when is_ascii_alpha c -> start_new_tag t Tokenizer_token.End; t.state <- Tokenizer_state.Rcdata_end_tag_name
  | _ -> emit_str t "</"; t.state <- Tokenizer_state.Rcdata
and state_rcdata_end_tag_name () = match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Before_attribute_name
  | Some '/' when is_appropriate_end_tag t -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Self_closing_start_tag
  | Some '>' when is_appropriate_end_tag t -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Data; emit_current_tag ()
  | Some c when is_ascii_alpha c -> Tokenizer_stream.advance t.stream; Buffer.add_char t.current_tag_name (ascii_lower c); Buffer.add_char t.temp_buffer c
  | _ -> emit_str t "</"; emit_str t (Buffer.contents t.temp_buffer); t.state <- Tokenizer_state.Rcdata
and state_rawtext_less_than_sign () = match Tokenizer_stream.peek t.stream with
  | Some '/' -> Tokenizer_stream.advance t.stream; Buffer.clear t.temp_buffer; t.state <- Tokenizer_state.Rawtext_end_tag_open
  | _ -> emit_char t '<'; t.state <- Tokenizer_state.Rawtext
and state_rawtext_end_tag_open () = match Tokenizer_stream.peek t.stream with
  | Some c when is_ascii_alpha c -> start_new_tag t Tokenizer_token.End; t.state <- Tokenizer_state.Rawtext_end_tag_name
  | _ -> emit_str t "</"; t.state <- Tokenizer_state.Rawtext
and state_rawtext_end_tag_name () = match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Before_attribute_name
  | Some '/' when is_appropriate_end_tag t -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Self_closing_start_tag
  | Some '>' when is_appropriate_end_tag t -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Data; emit_current_tag ()
  | Some c when is_ascii_alpha c -> Tokenizer_stream.advance t.stream; Buffer.add_char t.current_tag_name (ascii_lower c); Buffer.add_char t.temp_buffer c
  | _ -> emit_str t "</"; emit_str t (Buffer.contents t.temp_buffer); t.state <- Tokenizer_state.Rawtext
and state_script_data_less_than_sign () = match Tokenizer_stream.peek t.stream with
  | Some '/' -> Tokenizer_stream.advance t.stream; Buffer.clear t.temp_buffer; t.state <- Tokenizer_state.Script_data_end_tag_open
  | Some '!' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Script_data_escape_start; emit_str t "<!"
  | _ -> emit_char t '<'; t.state <- Tokenizer_state.Script_data
and state_script_data_end_tag_open () = match Tokenizer_stream.peek t.stream with
  | Some c when is_ascii_alpha c -> start_new_tag t Tokenizer_token.End; t.state <- Tokenizer_state.Script_data_end_tag_name
  | _ -> emit_str t "</"; t.state <- Tokenizer_state.Script_data
and state_script_data_end_tag_name () = match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Before_attribute_name
  | Some '/' when is_appropriate_end_tag t -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Self_closing_start_tag
  | Some '>' when is_appropriate_end_tag t -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Data; emit_current_tag ()
  | Some c when is_ascii_alpha c -> Tokenizer_stream.advance t.stream; Buffer.add_char t.current_tag_name (ascii_lower c); Buffer.add_char t.temp_buffer c
  | _ -> emit_str t "</"; emit_str t (Buffer.contents t.temp_buffer); t.state <- Tokenizer_state.Script_data
and state_script_data_escape_start () = match Tokenizer_stream.peek t.stream with
  | Some '-' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Script_data_escape_start_dash; emit_char t '-'
  | _ -> t.state <- Tokenizer_state.Script_data
and state_script_data_escape_start_dash () = match Tokenizer_stream.peek t.stream with
  | Some '-' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Script_data_escaped_dash_dash; emit_char t '-'
  | _ -> t.state <- Tokenizer_state.Script_data
and state_script_data_escaped () = match Tokenizer_stream.consume t.stream with
  | Some '-' -> t.state <- Tokenizer_state.Script_data_escaped_dash; emit_char t '-'
  | Some '<' -> t.state <- Tokenizer_state.Script_data_escaped_less_than_sign
  | Some '\x00' -> error t "unexpected-null-character"; emit_str t "\xEF\xBF\xBD"
  | Some c -> emit_char_checked c
  | None -> ()
and state_script_data_escaped_dash () = match Tokenizer_stream.consume t.stream with
  | Some '-' -> t.state <- Tokenizer_state.Script_data_escaped_dash_dash; emit_char t '-'
  | Some '<' -> t.state <- Tokenizer_state.Script_data_escaped_less_than_sign
  | Some '\x00' -> error t "unexpected-null-character"; t.state <- Tokenizer_state.Script_data_escaped; emit_str t "\xEF\xBF\xBD"
  | Some c -> t.state <- Tokenizer_state.Script_data_escaped; emit_char_checked c
  | None -> ()
and state_script_data_escaped_dash_dash () = match Tokenizer_stream.consume t.stream with
  | Some '-' -> emit_char t '-'
  | Some '<' -> t.state <- Tokenizer_state.Script_data_escaped_less_than_sign
  | Some '>' -> t.state <- Tokenizer_state.Script_data; emit_char t '>'
  | Some '\x00' -> error t "unexpected-null-character"; t.state <- Tokenizer_state.Script_data_escaped; emit_str t "\xEF\xBF\xBD"
  | Some c -> t.state <- Tokenizer_state.Script_data_escaped; emit_char_checked c
  | None -> ()
and state_script_data_escaped_less_than_sign () = match Tokenizer_stream.peek t.stream with
  | Some '/' -> Tokenizer_stream.advance t.stream; Buffer.clear t.temp_buffer; t.state <- Tokenizer_state.Script_data_escaped_end_tag_open
  | Some c when is_ascii_alpha c -> Buffer.clear t.temp_buffer; emit_char t '<'; t.state <- Tokenizer_state.Script_data_double_escape_start
  | _ -> emit_char t '<'; t.state <- Tokenizer_state.Script_data_escaped
and state_script_data_escaped_end_tag_open () = match Tokenizer_stream.peek t.stream with
  | Some c when is_ascii_alpha c -> start_new_tag t Tokenizer_token.End; t.state <- Tokenizer_state.Script_data_escaped_end_tag_name
  | _ -> emit_str t "</"; t.state <- Tokenizer_state.Script_data_escaped
and state_script_data_escaped_end_tag_name () = match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Before_attribute_name
  | Some '/' when is_appropriate_end_tag t -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Self_closing_start_tag
  | Some '>' when is_appropriate_end_tag t -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Data; emit_current_tag ()
  | Some c when is_ascii_alpha c -> Tokenizer_stream.advance t.stream; Buffer.add_char t.current_tag_name (ascii_lower c); Buffer.add_char t.temp_buffer c
  | _ -> emit_str t "</"; emit_str t (Buffer.contents t.temp_buffer); t.state <- Tokenizer_state.Script_data_escaped
and state_script_data_double_escape_start () = match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt -> Tokenizer_stream.advance t.stream; let c = Option.get c_opt in if Buffer.contents t.temp_buffer = "script" then t.state <- Tokenizer_state.Script_data_double_escaped else t.state <- Tokenizer_state.Script_data_escaped; emit_char t c
  | Some c when is_ascii_alpha c -> Tokenizer_stream.advance t.stream; Buffer.add_char t.temp_buffer (ascii_lower c); emit_char t c
  | _ -> t.state <- Tokenizer_state.Script_data_escaped
and state_script_data_double_escaped () = match Tokenizer_stream.consume t.stream with
  | Some '-' -> t.state <- Tokenizer_state.Script_data_double_escaped_dash; emit_char t '-'
  | Some '<' -> t.state <- Tokenizer_state.Script_data_double_escaped_less_than_sign; emit_char t '<'
  | Some '\x00' -> error t "unexpected-null-character"; emit_str t "\xEF\xBF\xBD"
  | Some c -> emit_char_checked c
  | None -> ()
and state_script_data_double_escaped_dash () = match Tokenizer_stream.consume t.stream with
  | Some '-' -> t.state <- Tokenizer_state.Script_data_double_escaped_dash_dash; emit_char t '-'
  | Some '<' -> t.state <- Tokenizer_state.Script_data_double_escaped_less_than_sign; emit_char t '<'
  | Some '\x00' -> error t "unexpected-null-character"; t.state <- Tokenizer_state.Script_data_double_escaped; emit_str t "\xEF\xBF\xBD"
  | Some c -> t.state <- Tokenizer_state.Script_data_double_escaped; emit_char_checked c
  | None -> ()
and state_script_data_double_escaped_dash_dash () = match Tokenizer_stream.consume t.stream with
  | Some '-' -> emit_char t '-'
  | Some '<' -> t.state <- Tokenizer_state.Script_data_double_escaped_less_than_sign; emit_char t '<'
  | Some '>' -> t.state <- Tokenizer_state.Script_data; emit_char t '>'
  | Some '\x00' -> error t "unexpected-null-character"; t.state <- Tokenizer_state.Script_data_double_escaped; emit_str t "\xEF\xBF\xBD"
  | Some c -> t.state <- Tokenizer_state.Script_data_double_escaped; emit_char_checked c
  | None -> ()
and state_script_data_double_escaped_less_than_sign () = match Tokenizer_stream.peek t.stream with
  | Some '/' -> Tokenizer_stream.advance t.stream; Buffer.clear t.temp_buffer; t.state <- Tokenizer_state.Script_data_double_escape_end; emit_char t '/'
  | _ -> t.state <- Tokenizer_state.Script_data_double_escaped
and state_script_data_double_escape_end () = match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt -> Tokenizer_stream.advance t.stream; let c = Option.get c_opt in if Buffer.contents t.temp_buffer = "script" then t.state <- Tokenizer_state.Script_data_escaped else t.state <-
Tokenizer_state.Script_data_double_escaped; emit_char t c | Some c when is_ascii_alpha c -> Tokenizer_stream.advance t.stream; Buffer.add_char t.temp_buffer (ascii_lower c); emit_char t c | _ -> t.state <- Tokenizer_state.Script_data_double_escaped and state_before_attribute_name () = match Tokenizer_stream.peek t.stream with | Some ('\t' | '\n' | '\x0C' | ' ') -> Tokenizer_stream.advance t.stream | Some '/' | Some '>' | None -> t.state <- Tokenizer_state.After_attribute_name | Some '=' -> Tokenizer_stream.advance t.stream; error t "unexpected-equals-sign-before-attribute-name"; start_new_attribute t; Buffer.add_char t.current_attr_name '='; t.state <- Tokenizer_state.Attribute_name | Some _ -> start_new_attribute t; t.state <- Tokenizer_state.Attribute_name and state_attribute_name () = match Tokenizer_stream.peek t.stream with | Some ('\t' | '\n' | '\x0C' | ' ') -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.After_attribute_name | Some '/' | Some '>' | None -> t.state <- Tokenizer_state.After_attribute_name | Some '=' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Before_attribute_value | Some '\x00' -> Tokenizer_stream.advance t.stream; error t "unexpected-null-character"; Buffer.add_string t.current_attr_name "\xEF\xBF\xBD" | Some ('"' | '\'' | '<') as c_opt -> Tokenizer_stream.advance t.stream; error t "unexpected-character-in-attribute-name"; Buffer.add_char t.current_attr_name (Option.get c_opt) | Some c -> Tokenizer_stream.advance t.stream; check_control_char c; Buffer.add_char t.current_attr_name (ascii_lower c) and state_after_attribute_name () = match Tokenizer_stream.peek t.stream with | Some ('\t' | '\n' | '\x0C' | ' ') -> Tokenizer_stream.advance t.stream | Some '/' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Self_closing_start_tag | Some '=' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Before_attribute_value | Some '>' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Data; emit_current_tag () | None -> () | Some _ -> start_new_attribute t; t.state <- Tokenizer_state.Attribute_name and state_before_attribute_value () = match Tokenizer_stream.peek t.stream with | Some ('\t' | '\n' | '\x0C' | ' ') -> Tokenizer_stream.advance t.stream | Some '"' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Attribute_value_double_quoted | Some '\'' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Attribute_value_single_quoted | Some '>' -> Tokenizer_stream.advance t.stream; error t "missing-attribute-value"; t.state <- Tokenizer_state.Data; emit_current_tag () | _ -> t.state <- Tokenizer_state.Attribute_value_unquoted and state_attribute_value_double_quoted () = match Tokenizer_stream.consume t.stream with | Some '"' -> t.state <- Tokenizer_state.After_attribute_value_quoted | Some '&' -> t.return_state <- Tokenizer_state.Attribute_value_double_quoted; t.state <- Tokenizer_state.Character_reference | Some '\x00' -> error t "unexpected-null-character"; Buffer.add_string t.current_attr_value "\xEF\xBF\xBD" | Some c -> check_control_char c; Buffer.add_char t.current_attr_value c | None -> () and state_attribute_value_single_quoted () = match Tokenizer_stream.consume t.stream with | Some '\'' -> t.state <- Tokenizer_state.After_attribute_value_quoted | Some '&' -> t.return_state <- Tokenizer_state.Attribute_value_single_quoted; t.state <- Tokenizer_state.Character_reference | Some '\x00' -> error t "unexpected-null-character"; Buffer.add_string t.current_attr_value 
"\xEF\xBF\xBD" | Some c -> check_control_char c; Buffer.add_char t.current_attr_value c | None -> () and state_attribute_value_unquoted () = match Tokenizer_stream.peek t.stream with | Some ('\t' | '\n' | '\x0C' | ' ') -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Before_attribute_name | Some '&' -> Tokenizer_stream.advance t.stream; t.return_state <- Tokenizer_state.Attribute_value_unquoted; t.state <- Tokenizer_state.Character_reference | Some '>' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Data; emit_current_tag () | Some '\x00' -> Tokenizer_stream.advance t.stream; error t "unexpected-null-character"; Buffer.add_string t.current_attr_value "\xEF\xBF\xBD" | Some ('"' | '\'' | '<' | '=' | '`') as c_opt -> Tokenizer_stream.advance t.stream; error t "unexpected-character-in-unquoted-attribute-value"; Buffer.add_char t.current_attr_value (Option.get c_opt) | Some c -> Tokenizer_stream.advance t.stream; check_control_char c; Buffer.add_char t.current_attr_value c | None -> () and state_after_attribute_value_quoted () = match Tokenizer_stream.peek t.stream with | Some ('\t' | '\n' | '\x0C' | ' ') -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Before_attribute_name | Some '/' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Self_closing_start_tag | Some '>' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Data; emit_current_tag () | None -> () | Some _ -> error t "missing-whitespace-between-attributes"; t.state <- Tokenizer_state.Before_attribute_name and state_self_closing_start_tag () = match Tokenizer_stream.peek t.stream with | Some '>' -> Tokenizer_stream.advance t.stream; t.current_tag_self_closing <- true; t.state <- Tokenizer_state.Data; emit_current_tag () | None -> () | Some _ -> error t "unexpected-solidus-in-tag"; t.state <- Tokenizer_state.Before_attribute_name and state_bogus_comment () = match Tokenizer_stream.consume t.stream with | Some '>' -> t.state <- Tokenizer_state.Data; emit_current_comment () | Some '\x00' -> error t "unexpected-null-character"; Buffer.add_string t.current_comment "\xEF\xBF\xBD" | Some c -> check_control_char c; Buffer.add_char t.current_comment c | None -> () and state_markup_declaration_open () = if Tokenizer_stream.matches_ci t.stream "--" then begin ignore (Tokenizer_stream.consume_exact_ci t.stream "--"); Buffer.clear t.current_comment; t.state <- Tokenizer_state.Comment_start end else if Tokenizer_stream.matches_ci t.stream "DOCTYPE" then begin ignore (Tokenizer_stream.consume_exact_ci t.stream "DOCTYPE"); t.state <- Tokenizer_state.Doctype end else if Tokenizer_stream.matches_ci t.stream "[CDATA[" then begin ignore (Tokenizer_stream.consume_exact_ci t.stream "[CDATA["); (* CDATA only allowed in foreign content *) if S.adjusted_current_node_in_html_namespace t.sink then begin error t "cdata-in-html-content"; Buffer.clear t.current_comment; Buffer.add_string t.current_comment "[CDATA["; t.state <- Tokenizer_state.Bogus_comment end else t.state <- Tokenizer_state.Cdata_section end else begin error t "incorrectly-opened-comment"; Buffer.clear t.current_comment; t.state <- Tokenizer_state.Bogus_comment end and state_comment_start () = match Tokenizer_stream.peek t.stream with | Some '-' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Comment_start_dash | Some '>' -> Tokenizer_stream.advance t.stream; error t "abrupt-closing-of-empty-comment"; t.state <- Tokenizer_state.Data; emit_current_comment () | _ -> t.state <- Tokenizer_state.Comment and 
state_comment_start_dash () = match Tokenizer_stream.peek t.stream with | Some '-' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Comment_end | Some '>' -> Tokenizer_stream.advance t.stream; error t "abrupt-closing-of-empty-comment"; t.state <- Tokenizer_state.Data; emit_current_comment () | None -> () | Some _ -> Buffer.add_char t.current_comment '-'; t.state <- Tokenizer_state.Comment and state_comment () = match Tokenizer_stream.consume t.stream with | Some '<' -> Buffer.add_char t.current_comment '<'; t.state <- Tokenizer_state.Comment_less_than_sign | Some '-' -> t.state <- Tokenizer_state.Comment_end_dash | Some '\x00' -> error t "unexpected-null-character"; Buffer.add_string t.current_comment "\xEF\xBF\xBD" | Some c -> check_control_char c; Buffer.add_char t.current_comment c | None -> () and state_comment_less_than_sign () = match Tokenizer_stream.peek t.stream with | Some '!' -> Tokenizer_stream.advance t.stream; Buffer.add_char t.current_comment '!'; t.state <- Tokenizer_state.Comment_less_than_sign_bang | Some '<' -> Tokenizer_stream.advance t.stream; Buffer.add_char t.current_comment '<' | _ -> t.state <- Tokenizer_state.Comment and state_comment_less_than_sign_bang () = match Tokenizer_stream.peek t.stream with | Some '-' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Comment_less_than_sign_bang_dash | _ -> t.state <- Tokenizer_state.Comment and state_comment_less_than_sign_bang_dash () = match Tokenizer_stream.peek t.stream with | Some '-' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Comment_less_than_sign_bang_dash_dash | _ -> t.state <- Tokenizer_state.Comment_end_dash and state_comment_less_than_sign_bang_dash_dash () = match Tokenizer_stream.peek t.stream with | Some '>' | None -> t.state <- Tokenizer_state.Comment_end | Some _ -> error t "nested-comment"; t.state <- Tokenizer_state.Comment_end and state_comment_end_dash () = match Tokenizer_stream.peek t.stream with | Some '-' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Comment_end | None -> () | Some _ -> Buffer.add_char t.current_comment '-'; t.state <- Tokenizer_state.Comment and state_comment_end () = match Tokenizer_stream.peek t.stream with | Some '>' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Data; emit_current_comment () | Some '!' 
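(* The '!' arm below handles the "--!" sequence inside a comment. Per the
   WHATWG rules, "<!--x--!>" still closes the comment but is reported as
   incorrectly-closed-comment, while "--!" followed by ordinary text is kept
   literally. Illustrative expected tokens (assuming a sink that simply
   records what it is given):
     "<!--x--!>"    ==> Comment "x"      plus error incorrectly-closed-comment
     "<!--x--!y-->" ==> Comment "x--!y"
*)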
-> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Comment_end_bang | Some '-' -> Tokenizer_stream.advance t.stream; Buffer.add_char t.current_comment '-' | None -> () | Some _ -> Buffer.add_string t.current_comment "--"; t.state <- Tokenizer_state.Comment and state_comment_end_bang () = match Tokenizer_stream.peek t.stream with | Some '-' -> Tokenizer_stream.advance t.stream; Buffer.add_string t.current_comment "--!"; t.state <- Tokenizer_state.Comment_end_dash | Some '>' -> Tokenizer_stream.advance t.stream; error t "incorrectly-closed-comment"; t.state <- Tokenizer_state.Data; emit_current_comment () | None -> () | Some _ -> Buffer.add_string t.current_comment "--!"; t.state <- Tokenizer_state.Comment and state_doctype () = match Tokenizer_stream.peek t.stream with | Some ('\t' | '\n' | '\x0C' | ' ') -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Before_doctype_name | Some '>' -> t.state <- Tokenizer_state.Before_doctype_name | None -> () | Some _ -> error t "missing-whitespace-before-doctype-name"; t.state <- Tokenizer_state.Before_doctype_name and state_before_doctype_name () = match Tokenizer_stream.peek t.stream with | Some ('\t' | '\n' | '\x0C' | ' ') -> Tokenizer_stream.advance t.stream | Some '\x00' -> Tokenizer_stream.advance t.stream; error t "unexpected-null-character"; start_new_doctype t; t.current_doctype_name <- Some (Buffer.create 8); Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD"; t.state <- Tokenizer_state.Doctype_name | Some '>' -> Tokenizer_stream.advance t.stream; error t "missing-doctype-name"; start_new_doctype t; t.current_doctype_force_quirks <- true; t.state <- Tokenizer_state.Data; emit_current_doctype () | None -> () | Some c -> Tokenizer_stream.advance t.stream; check_control_char c; start_new_doctype t; t.current_doctype_name <- Some (Buffer.create 8); Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c); t.state <- Tokenizer_state.Doctype_name and state_doctype_name () = match Tokenizer_stream.consume t.stream with | Some ('\t' | '\n' | '\x0C' | ' ') -> t.state <- Tokenizer_state.After_doctype_name | Some '>' -> t.state <- Tokenizer_state.Data; emit_current_doctype () | Some '\x00' -> error t "unexpected-null-character"; Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD" | Some c -> check_control_char c; Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c) | None -> () and state_after_doctype_name () = match Tokenizer_stream.peek t.stream with | Some ('\t' | '\n' | '\x0C' | ' ') -> Tokenizer_stream.advance t.stream | Some '>' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Data; emit_current_doctype () | None -> () | Some _ -> (* Don't check control char here - bogus_doctype will check when it consumes *) if Tokenizer_stream.matches_ci t.stream "PUBLIC" then begin ignore (Tokenizer_stream.consume_exact_ci t.stream "PUBLIC"); t.state <- Tokenizer_state.After_doctype_public_keyword end else if Tokenizer_stream.matches_ci t.stream "SYSTEM" then begin ignore (Tokenizer_stream.consume_exact_ci t.stream "SYSTEM"); t.state <- Tokenizer_state.After_doctype_system_keyword end else begin error t "invalid-character-sequence-after-doctype-name"; t.current_doctype_force_quirks <- true; t.state <- Tokenizer_state.Bogus_doctype end and state_after_doctype_public_keyword () = match Tokenizer_stream.peek t.stream with | Some ('\t' | '\n' | '\x0C' | ' ') -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Before_doctype_public_identifier | Some 
'"' -> Tokenizer_stream.advance t.stream; error t "missing-whitespace-after-doctype-public-keyword"; t.current_doctype_public <- Some (Buffer.create 32); t.state <- Tokenizer_state.Doctype_public_identifier_double_quoted | Some '\'' -> Tokenizer_stream.advance t.stream; error t "missing-whitespace-after-doctype-public-keyword"; t.current_doctype_public <- Some (Buffer.create 32); t.state <- Tokenizer_state.Doctype_public_identifier_single_quoted | Some '>' -> Tokenizer_stream.advance t.stream; error t "missing-doctype-public-identifier"; t.current_doctype_force_quirks <- true; t.state <- Tokenizer_state.Data; emit_current_doctype () | None -> () | Some _ -> (* Don't check control char here - bogus_doctype will check when it consumes *) error t "missing-quote-before-doctype-public-identifier"; t.current_doctype_force_quirks <- true; t.state <- Tokenizer_state.Bogus_doctype and state_before_doctype_public_identifier () = match Tokenizer_stream.peek t.stream with | Some ('\t' | '\n' | '\x0C' | ' ') -> Tokenizer_stream.advance t.stream | Some '"' -> Tokenizer_stream.advance t.stream; t.current_doctype_public <- Some (Buffer.create 32); t.state <- Tokenizer_state.Doctype_public_identifier_double_quoted | Some '\'' -> Tokenizer_stream.advance t.stream; t.current_doctype_public <- Some (Buffer.create 32); t.state <- Tokenizer_state.Doctype_public_identifier_single_quoted | Some '>' -> Tokenizer_stream.advance t.stream; error t "missing-doctype-public-identifier"; t.current_doctype_force_quirks <- true; t.state <- Tokenizer_state.Data; emit_current_doctype () | None -> () | Some _ -> error t "missing-quote-before-doctype-public-identifier"; t.current_doctype_force_quirks <- true; t.state <- Tokenizer_state.Bogus_doctype and state_doctype_public_identifier_double_quoted () = match Tokenizer_stream.consume t.stream with | Some '"' -> t.state <- Tokenizer_state.After_doctype_public_identifier | Some '\x00' -> error t "unexpected-null-character"; Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD" | Some '>' -> error t "abrupt-doctype-public-identifier"; t.current_doctype_force_quirks <- true; t.state <- Tokenizer_state.Data; emit_current_doctype () | Some c -> check_control_char c; Buffer.add_char (Option.get t.current_doctype_public) c | None -> () and state_doctype_public_identifier_single_quoted () = match Tokenizer_stream.consume t.stream with | Some '\'' -> t.state <- Tokenizer_state.After_doctype_public_identifier | Some '\x00' -> error t "unexpected-null-character"; Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD" | Some '>' -> error t "abrupt-doctype-public-identifier"; t.current_doctype_force_quirks <- true; t.state <- Tokenizer_state.Data; emit_current_doctype () | Some c -> check_control_char c; Buffer.add_char (Option.get t.current_doctype_public) c | None -> () and state_after_doctype_public_identifier () = match Tokenizer_stream.peek t.stream with | Some ('\t' | '\n' | '\x0C' | ' ') -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Between_doctype_public_and_system_identifiers | Some '>' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Data; emit_current_doctype () | Some '"' -> Tokenizer_stream.advance t.stream; error t "missing-whitespace-between-doctype-public-and-system-identifiers"; t.current_doctype_system <- Some (Buffer.create 32); t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted | Some '\'' -> Tokenizer_stream.advance t.stream; error t 
"missing-whitespace-between-doctype-public-and-system-identifiers"; t.current_doctype_system <- Some (Buffer.create 32); t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted | None -> () | Some _ -> (* Don't check control char here - bogus_doctype will check when it consumes *) error t "missing-quote-before-doctype-system-identifier"; t.current_doctype_force_quirks <- true; t.state <- Tokenizer_state.Bogus_doctype and state_between_doctype_public_and_system_identifiers () = match Tokenizer_stream.peek t.stream with | Some ('\t' | '\n' | '\x0C' | ' ') -> Tokenizer_stream.advance t.stream | Some '>' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Data; emit_current_doctype () | Some '"' -> Tokenizer_stream.advance t.stream; t.current_doctype_system <- Some (Buffer.create 32); t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted | Some '\'' -> Tokenizer_stream.advance t.stream; t.current_doctype_system <- Some (Buffer.create 32); t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted | None -> () | Some _ -> (* Don't check control char here - bogus_doctype will check when it consumes *) error t "missing-quote-before-doctype-system-identifier"; t.current_doctype_force_quirks <- true; t.state <- Tokenizer_state.Bogus_doctype and state_after_doctype_system_keyword () = match Tokenizer_stream.peek t.stream with | Some ('\t' | '\n' | '\x0C' | ' ') -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Before_doctype_system_identifier | Some '"' -> Tokenizer_stream.advance t.stream; error t "missing-whitespace-after-doctype-system-keyword"; t.current_doctype_system <- Some (Buffer.create 32); t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted | Some '\'' -> Tokenizer_stream.advance t.stream; error t "missing-whitespace-after-doctype-system-keyword"; t.current_doctype_system <- Some (Buffer.create 32); t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted | Some '>' -> Tokenizer_stream.advance t.stream; error t "missing-doctype-system-identifier"; t.current_doctype_force_quirks <- true; t.state <- Tokenizer_state.Data; emit_current_doctype () | None -> () | Some _ -> (* Don't check control char here - bogus_doctype will check when it consumes *) error t "missing-quote-before-doctype-system-identifier"; t.current_doctype_force_quirks <- true; t.state <- Tokenizer_state.Bogus_doctype and state_before_doctype_system_identifier () = match Tokenizer_stream.peek t.stream with | Some ('\t' | '\n' | '\x0C' | ' ') -> Tokenizer_stream.advance t.stream | Some '"' -> Tokenizer_stream.advance t.stream; t.current_doctype_system <- Some (Buffer.create 32); t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted | Some '\'' -> Tokenizer_stream.advance t.stream; t.current_doctype_system <- Some (Buffer.create 32); t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted | Some '>' -> Tokenizer_stream.advance t.stream; error t "missing-doctype-system-identifier"; t.current_doctype_force_quirks <- true; t.state <- Tokenizer_state.Data; emit_current_doctype () | None -> () | Some _ -> (* Don't check control char here - bogus_doctype will check when it consumes *) error t "missing-quote-before-doctype-system-identifier"; t.current_doctype_force_quirks <- true; t.state <- Tokenizer_state.Bogus_doctype and state_doctype_system_identifier_double_quoted () = match Tokenizer_stream.consume t.stream with | Some '"' -> t.state <- Tokenizer_state.After_doctype_system_identifier | Some '\x00' -> error t 
"unexpected-null-character"; Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD" | Some '>' -> error t "abrupt-doctype-system-identifier"; t.current_doctype_force_quirks <- true; t.state <- Tokenizer_state.Data; emit_current_doctype () | Some c -> check_control_char c; Buffer.add_char (Option.get t.current_doctype_system) c | None -> () and state_doctype_system_identifier_single_quoted () = match Tokenizer_stream.consume t.stream with | Some '\'' -> t.state <- Tokenizer_state.After_doctype_system_identifier | Some '\x00' -> error t "unexpected-null-character"; Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD" | Some '>' -> error t "abrupt-doctype-system-identifier"; t.current_doctype_force_quirks <- true; t.state <- Tokenizer_state.Data; emit_current_doctype () | Some c -> check_control_char c; Buffer.add_char (Option.get t.current_doctype_system) c | None -> () and state_after_doctype_system_identifier () = match Tokenizer_stream.peek t.stream with | Some ('\t' | '\n' | '\x0C' | ' ') -> Tokenizer_stream.advance t.stream | Some '>' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Data; emit_current_doctype () | None -> () | Some _ -> (* Don't check control char here - bogus_doctype will check when it consumes *) error t "unexpected-character-after-doctype-system-identifier"; t.state <- Tokenizer_state.Bogus_doctype and state_bogus_doctype () = match Tokenizer_stream.consume t.stream with | Some '>' -> t.state <- Tokenizer_state.Data; emit_current_doctype () | Some '\x00' -> error t "unexpected-null-character" | Some c -> check_control_char c (* Check all chars in bogus doctype *) | None -> () and state_cdata_section () = match Tokenizer_stream.consume t.stream with | Some ']' -> t.state <- Tokenizer_state.Cdata_section_bracket | Some c -> (* CDATA section emits all characters as-is, including NUL, but still check for control chars *) emit_char_checked c | None -> () and state_cdata_section_bracket () = match Tokenizer_stream.peek t.stream with | Some ']' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Cdata_section_end | _ -> emit_char t ']'; t.state <- Tokenizer_state.Cdata_section and state_cdata_section_end () = match Tokenizer_stream.peek t.stream with | Some ']' -> Tokenizer_stream.advance t.stream; emit_char t ']' | Some '>' -> Tokenizer_stream.advance t.stream; t.state <- Tokenizer_state.Data | _ -> emit_str t "]]"; t.state <- Tokenizer_state.Cdata_section and state_character_reference () = Buffer.clear t.temp_buffer; Buffer.add_char t.temp_buffer '&'; match Tokenizer_stream.peek t.stream with | Some c when is_ascii_alnum c -> t.state <- Tokenizer_state.Named_character_reference | Some '#' -> Tokenizer_stream.advance t.stream; Buffer.add_char t.temp_buffer '#'; t.state <- Tokenizer_state.Numeric_character_reference | _ -> flush_code_points_consumed_as_char_ref t; t.state <- t.return_state and state_named_character_reference () = (* Collect alphanumeric characters *) let rec collect () = match Tokenizer_stream.peek t.stream with | Some c when is_ascii_alnum c -> Tokenizer_stream.advance t.stream; Buffer.add_char t.temp_buffer c; collect () | _ -> () in collect (); let has_semicolon = match Tokenizer_stream.peek t.stream with | Some ';' -> Tokenizer_stream.advance t.stream; Buffer.add_char t.temp_buffer ';'; true | _ -> false in (* Try to match entity - buffer contains "&name" or "&name;" *) let buf_contents = Buffer.contents t.temp_buffer in let name_start = 1 in (* Skip '&' *) let name_end = String.length 
  and state_named_character_reference () =
    (* Collect alphanumeric characters *)
    let rec collect () =
      match Tokenizer_stream.peek t.stream with
      | Some c when is_ascii_alnum c ->
        Tokenizer_stream.advance t.stream;
        Buffer.add_char t.temp_buffer c;
        collect ()
      | _ -> ()
    in
    collect ();
    let has_semicolon =
      match Tokenizer_stream.peek t.stream with
      | Some ';' ->
        Tokenizer_stream.advance t.stream;
        Buffer.add_char t.temp_buffer ';';
        true
      | _ -> false
    in
    (* Try to match entity - buffer contains "&name" or "&name;" *)
    let buf_contents = Buffer.contents t.temp_buffer in
    let name_start = 1 in (* Skip '&' *)
    let name_end = String.length buf_contents - (if has_semicolon then 1 else 0) in
    let entity_name = String.sub buf_contents name_start (name_end - name_start) in
    (* Try progressively shorter matches. Only match if:
       1. Full match with semicolon, OR
       2. Legacy entity (can be used without semicolon) *)
    let rec try_match len =
      if len <= 0 then None
      else
        let prefix = String.sub entity_name 0 len in
        let is_full = len = String.length entity_name in
        let would_have_semi = has_semicolon && is_full in
        (* Only use this match if it has semicolon or is a legacy entity *)
        if would_have_semi || Entities.is_legacy prefix then
          match Entities.lookup prefix with
          | Some decoded -> Some (decoded, len)
          | None -> try_match (len - 1)
        else try_match (len - 1)
    in
    match try_match (String.length entity_name) with
    | Some (decoded, matched_len) ->
      let full_match = matched_len = String.length entity_name in
      let ends_with_semi = has_semicolon && full_match in
      (* Check attribute context restrictions *)
      let in_attribute =
        match t.return_state with
        | Tokenizer_state.Attribute_value_double_quoted
        | Tokenizer_state.Attribute_value_single_quoted
        | Tokenizer_state.Attribute_value_unquoted -> true
        | _ -> false
      in
      let next_char =
        if full_match && not has_semicolon then Tokenizer_stream.peek t.stream
        else if not full_match then Some entity_name.[matched_len]
        else None
      in
      let blocked =
        in_attribute && not ends_with_semi
        && (match next_char with
            | Some '=' -> true
            | Some c when is_ascii_alnum c -> true
            | _ -> false)
      in
      if blocked then begin
        flush_code_points_consumed_as_char_ref t;
        t.state <- t.return_state
      end else begin
        if not ends_with_semi then error t "missing-semicolon-after-character-reference";
        Buffer.clear t.temp_buffer;
        Buffer.add_string t.temp_buffer decoded;
        flush_code_points_consumed_as_char_ref t;
        (* Emit unconsumed chars after partial match *)
        if not full_match then begin
          let unconsumed =
            String.sub entity_name matched_len (String.length entity_name - matched_len)
          in
          emit_str t unconsumed;
          (* If there was a semicolon in input but we didn't use the full match,
             emit the semicolon too *)
          if has_semicolon then emit_char t ';'
        end;
        t.state <- t.return_state
      end
    | None ->
      (* No match - check if we should report unknown-named-character-reference *)
      if String.length entity_name > 0 then begin
        (* If we have a semicolon, it's definitely an unknown named character reference *)
        if has_semicolon then error t "unknown-named-character-reference";
        (* Emit all the chars we consumed *)
        flush_code_points_consumed_as_char_ref t;
        t.state <- t.return_state
      end else begin
        flush_code_points_consumed_as_char_ref t;
        t.state <- t.return_state
      end

  and state_ambiguous_ampersand () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_alnum c ->
      Tokenizer_stream.advance t.stream;
      (match t.return_state with
       | Tokenizer_state.Attribute_value_double_quoted
       | Tokenizer_state.Attribute_value_single_quoted
       | Tokenizer_state.Attribute_value_unquoted -> Buffer.add_char t.current_attr_value c
       | _ -> emit_char t c)
    | Some ';' ->
      error t "unknown-named-character-reference";
      t.state <- t.return_state
    | _ -> t.state <- t.return_state

  and state_numeric_character_reference () =
    t.char_ref_code <- 0;
    match Tokenizer_stream.peek t.stream with
    | Some (('x' | 'X') as c) ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.temp_buffer c;
      t.state <- Tokenizer_state.Hexadecimal_character_reference_start
    | _ -> t.state <- Tokenizer_state.Decimal_character_reference_start
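  (* The digit loops below accumulate the code point incrementally; e.g. for
     "&#x1F4A9;" the hexadecimal state computes
       0 -> 0x1 -> 0x1F -> 0x1F4 -> 0x1F4A -> 0x1F4A9,
     clamping anything above 0x10FFFF to 0x110000 so that
     Numeric_character_reference_end reports it as out of range. *)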
  and state_hexadecimal_character_reference_start () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_hex c -> t.state <- Tokenizer_state.Hexadecimal_character_reference
    | _ ->
      error t "absence-of-digits-in-numeric-character-reference";
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state

  and state_decimal_character_reference_start () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_digit c -> t.state <- Tokenizer_state.Decimal_character_reference
    | _ ->
      error t "absence-of-digits-in-numeric-character-reference";
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state

  and state_hexadecimal_character_reference () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_digit c ->
      Tokenizer_stream.advance t.stream;
      t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code '0');
      if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
    | Some c when c >= 'A' && c <= 'F' ->
      Tokenizer_stream.advance t.stream;
      t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code 'A' + 10);
      if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
    | Some c when c >= 'a' && c <= 'f' ->
      Tokenizer_stream.advance t.stream;
      t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code 'a' + 10);
      if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
    | Some ';' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Numeric_character_reference_end
    | _ ->
      error t "missing-semicolon-after-character-reference";
      t.state <- Tokenizer_state.Numeric_character_reference_end

  and state_decimal_character_reference () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_digit c ->
      Tokenizer_stream.advance t.stream;
      t.char_ref_code <- t.char_ref_code * 10 + (Char.code c - Char.code '0');
      if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
    | Some ';' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Numeric_character_reference_end
    | _ ->
      error t "missing-semicolon-after-character-reference";
      t.state <- Tokenizer_state.Numeric_character_reference_end
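  (* Examples of how the end state below maps raw code points, assuming
     Entities.Numeric_ref implements the spec's Windows-1252 (C1) table:
     - "&#0;"     -> U+FFFD, null-character-reference error
     - "&#xD800;" -> U+FFFD, surrogate-character-reference error
     - "&#xFFFE;" -> U+FFFE is kept, but a noncharacter error is reported
     - "&#x80;"   -> U+20AC via the replacement table,
                     control-character-reference error *)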
  and state_numeric_character_reference_end () =
    let code = t.char_ref_code in
    let replacement_char = "\xEF\xBF\xBD" in
    let result =
      if code = 0 then begin
        error t "null-character-reference";
        replacement_char
      end
      else if code > 0x10FFFF then begin
        error t (Printf.sprintf "character-reference-outside-unicode-range:%x" code);
        replacement_char
      end
      else if code >= 0xD800 && code <= 0xDFFF then begin
        error t (Printf.sprintf "surrogate-character-reference:%04x" code);
        replacement_char
      end
      else if (code >= 0xFDD0 && code <= 0xFDEF)
              || (* Noncharacters end in 0xFFFE or 0xFFFF in each plane (0-16).
                    O(1) bitwise check instead of O(n) list membership. *)
                 (let low16 = code land 0xFFFF in
                  low16 = 0xFFFE || low16 = 0xFFFF)
      then begin
        error t (Printf.sprintf "noncharacter-character-reference:%05x" code);
        Entities.Numeric_ref.codepoint_to_utf8 code
      end
      else if (code >= 0x01 && code <= 0x08) || code = 0x0B
              || (code >= 0x0D && code <= 0x1F) || (code >= 0x7F && code <= 0x9F)
      then begin
        error t (Printf.sprintf "control-character-reference:%04x" code);
        (* Apply Windows-1252 replacement table for 0x80-0x9F *)
        match Entities.Numeric_ref.find_replacement code with
        | Some replacement -> Entities.Numeric_ref.codepoint_to_utf8 replacement
        | None -> Entities.Numeric_ref.codepoint_to_utf8 code
      end
      else Entities.Numeric_ref.codepoint_to_utf8 code
    in
    Buffer.clear t.temp_buffer;
    Buffer.add_string t.temp_buffer result;
    flush_code_points_consumed_as_char_ref t;
    t.state <- t.return_state
  in
  process_state ()

let get_errors t = List.rev t.errors

let set_state t state = t.state <- state

let set_last_start_tag t name = t.last_start_tag <- name
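(* Minimal usage sketch (illustrative only; Console_sink, Tokenizer_token.pp and
   Tokenizer_errors.to_string are hypothetical names, not part of this library):

     module Console_sink = struct
       type t = unit
       let process () token ~line:_ ~column:_ =
         Format.printf "%a@." Tokenizer_token.pp token;  (* assumed printer *)
         `Continue
       let adjusted_current_node_in_html_namespace () = true
     end

     let () =
       let t = create (module Console_sink) () ~collect_errors:true () in
       run t (module Console_sink) (Bytes.Reader.of_string "<p class='x'>hi</p>");
       List.iter
         (fun e -> prerr_endline (Tokenizer_errors.to_string e))  (* assumed *)
         (get_errors t)

   Substitute whatever pretty-printers the real Tokenizer_token and
   Tokenizer_errors modules provide. *)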