(* HTML5 Tokenizer - implements WHATWG tokenization algorithm *)
(* Character classification using Astring *)
(* ASCII classification predicates, delegated to Astring. *)
let is_ascii_alpha c = Astring.Char.Ascii.is_letter c
let is_ascii_digit c = Astring.Char.Ascii.is_digit c
let is_ascii_hex c = Astring.Char.Ascii.is_hex_digit c
let is_ascii_alnum c = Astring.Char.Ascii.is_alphanum c

(* HTML whitespace: space, tab, LF, FF, CR. *)
let is_whitespace = function
  | ' ' | '\t' | '\n' | '\x0C' | '\r' -> true
  | _ -> false

let ascii_lower c = Astring.Char.Ascii.lowercase c
(* Token sink interface *)
module type SINK = sig
  type t

  (** [process sink token ~line ~column] delivers one token to the consumer.
      The result tells the tokenizer whether to continue in its current state
      or switch to [`SwitchTo state] (e.g. a tree builder selecting RAWTEXT
      after certain start tags). *)
  val process : t -> Tokenizer_token.t -> line:int -> column:int -> [ `Continue | `SwitchTo of Tokenizer_state.t ]

  (** Whether the sink's adjusted current node is in the HTML namespace;
      consulted by the tokenizer for namespace-sensitive decisions. *)
  val adjusted_current_node_in_html_namespace : t -> bool
end
(* Tokenizer state, parameterized by the sink implementation's type. *)
type 'sink t = {
  mutable stream : Tokenizer_stream.t;  (* input character stream *)
  sink : 'sink;  (* token consumer *)
  mutable state : Tokenizer_state.t;  (* current state-machine state *)
  mutable return_state : Tokenizer_state.t;  (* state to resume after a character reference *)
  mutable char_ref_code : int;  (* accumulator for numeric character references *)
  mutable temp_buffer : Buffer.t;  (* the spec's "temporary buffer" *)
  mutable last_start_tag : string;  (* name of last emitted start tag ("appropriate end tag" check) *)
  mutable current_tag_name : Buffer.t;  (* name of the tag token under construction *)
  mutable current_tag_kind : Tokenizer_token.tag_kind;
  mutable current_tag_self_closing : bool;
  mutable current_attr_name : Buffer.t;  (* name of the attribute being collected *)
  mutable current_attr_value : Buffer.t;  (* value of the attribute being collected *)
  mutable current_attrs : (string * string) list;  (* completed attributes, reverse order *)
  mutable current_doctype_name : Buffer.t option;
  mutable current_doctype_public : Buffer.t option;
  mutable current_doctype_system : Buffer.t option;
  mutable current_doctype_force_quirks : bool;
  mutable current_comment : Buffer.t;  (* comment data under construction *)
  mutable pending_chars : Buffer.t;  (* character data batched until the next token *)
  mutable errors : Tokenizer_errors.t list;  (* collected parse errors, newest first *)
  collect_errors : bool;  (* when false, [error] is a no-op *)
  xml_mode : bool; (* XML violation mode: transform chars for XML compatibility *)
}
(** [create (module S) sink ()] builds a fresh tokenizer over an empty stream
    (the real stream is installed by [run]).
    [collect_errors] enables parse-error accumulation (default [false]);
    [xml_mode] enables XML-compatibility character rewriting (default [false]).
    The module argument only fixes the sink's type; it is not stored. *)
let create (type s) (module S : SINK with type t = s) sink ?(collect_errors=false) ?(xml_mode=false) () = {
  stream = Tokenizer_stream.create "";
  sink;
  state = Tokenizer_state.Data;
  return_state = Tokenizer_state.Data;
  char_ref_code = 0;
  temp_buffer = Buffer.create 64;
  last_start_tag = "";
  current_tag_name = Buffer.create 32;
  current_tag_kind = Tokenizer_token.Start;
  current_tag_self_closing = false;
  current_attr_name = Buffer.create 32;
  current_attr_value = Buffer.create 64;
  current_attrs = [];
  current_doctype_name = None;
  current_doctype_public = None;
  current_doctype_system = None;
  current_doctype_force_quirks = false;
  current_comment = Buffer.create 64;
  pending_chars = Buffer.create 256;
  errors = [];
  collect_errors;
  xml_mode;
}
(* Record parse error [code] at the current stream position.
   A no-op unless error collection was requested at creation time. *)
let error t code =
  if not t.collect_errors then ()
  else begin
    let line, column = Tokenizer_stream.position t.stream in
    t.errors <- Tokenizer_errors.make ~code ~line ~column :: t.errors
  end
(* emit functions are defined locally inside run *)
(* XML mode character transformation: form feed → space *)
(* Buffer one character of output; in XML mode a form feed becomes a space. *)
let emit_char t c =
  let c = if t.xml_mode && c = '\x0C' then ' ' else c in
  Buffer.add_char t.pending_chars c
(* XML mode string transformation: U+FFFF → U+FFFD, form feed → space *)
(* Buffer a string of output.  In XML mode, rewrite bytes on the fly:
   form feed (\x0C) -> space, and the UTF-8 encoding of U+FFFF
   (\xEF\xBF\xBF) -> U+FFFD (\xEF\xBF\xBD).  Plain append otherwise. *)
let emit_str t s =
  if not t.xml_mode then Buffer.add_string t.pending_chars s
  else begin
    let n = String.length s in
    let rec go pos =
      if pos < n then
        match s.[pos] with
        | '\x0C' ->
          Buffer.add_char t.pending_chars ' ';
          go (pos + 1)
        | '\xEF' when pos + 2 < n && s.[pos + 1] = '\xBF' && s.[pos + 2] = '\xBF' ->
          (* U+FFFF -> U+FFFD *)
          Buffer.add_string t.pending_chars "\xEF\xBF\xBD";
          go (pos + 3)
        | ch ->
          Buffer.add_char t.pending_chars ch;
          go (pos + 1)
    in
    go 0
  end
(* Begin a brand-new tag token of the given [kind].  Per the spec a new tag
   token starts with no attributes at all, so we also discard any
   half-collected attribute name/value left behind by a tag that was
   abandoned mid-attribute (e.g. dropped at EOF or error recovery) —
   otherwise that stale data would be committed into this tag's first
   attribute by [finish_attribute]. *)
let start_new_tag t kind =
  Buffer.clear t.current_tag_name;
  t.current_tag_kind <- kind;
  t.current_tag_self_closing <- false;
  t.current_attrs <- [];
  Buffer.clear t.current_attr_name;
  Buffer.clear t.current_attr_value
(* Commit the attribute currently being collected (if any) to the tag's
   attribute list, then reset the name/value buffers for the next one. *)
let start_new_attribute t =
  (* Save previous attribute if any *)
  let name = Buffer.contents t.current_attr_name in
  if String.length name > 0 then begin
    let value = Buffer.contents t.current_attr_value in
    (* Check for duplicates - only add if not already present.
       Per spec the first occurrence wins; later duplicates are dropped
       with a "duplicate-attribute" parse error. *)
    if not (List.exists (fun (n, _) -> n = name) t.current_attrs) then
      t.current_attrs <- (name, value) :: t.current_attrs
    else
      error t "duplicate-attribute"
  end;
  Buffer.clear t.current_attr_name;
  Buffer.clear t.current_attr_value
(* Finalize the in-progress attribute before the tag is emitted.
   Alias for [start_new_attribute], whose first action is exactly this. *)
let finish_attribute t =
  start_new_attribute t
(* Reset all DOCTYPE accumulators ready for a fresh doctype token. *)
let start_new_doctype t =
  t.current_doctype_name <- None;
  t.current_doctype_public <- None;
  t.current_doctype_system <- None;
  t.current_doctype_force_quirks <- false
(* emit_current_tag, emit_current_doctype, emit_current_comment are defined locally inside run *)
(* Spec's "appropriate end tag token" check: the end tag being built must
   match the name of the last start tag emitted (and one must exist). *)
let is_appropriate_end_tag t =
  t.last_start_tag <> ""
  && String.equal (Buffer.contents t.current_tag_name) t.last_start_tag
(* Spec: "flush code points consumed as a character reference".
   If the reference occurred inside an attribute value (any of the three
   attribute-value return states), the buffered code points are appended to
   the attribute value; otherwise they are emitted as character data. *)
let flush_code_points_consumed_as_char_ref t =
  let s = Buffer.contents t.temp_buffer in
  match t.return_state with
  | Tokenizer_state.Attribute_value_double_quoted
  | Tokenizer_state.Attribute_value_single_quoted
  | Tokenizer_state.Attribute_value_unquoted ->
    Buffer.add_string t.current_attr_value s
  | _ ->
    emit_str t s
open Bytesrw
(* Main tokenization loop *)
(* [run t (module S) reader] tokenizes the bytes from [reader], delivering
   tokens to [t.sink] until EOF.  Resets per-run state first. *)
let run (type s) t (module S : SINK with type t = s) (reader : Bytes.Reader.t) =
  t.stream <- Tokenizer_stream.create_from_reader reader;
  t.errors <- [];
  (* Set up error callback for surrogate/noncharacter detection in stream *)
  (* In XML mode, we don't report noncharacter errors - we transform them instead *)
  if not t.xml_mode then
    Tokenizer_stream.set_error_callback t.stream (fun code -> error t code);
  (* XML mode transformation for pending chars: U+FFFF → U+FFFD.
     Mirrors the byte-level rewrite in [emit_str], but applied to a whole
     batch of pending character data just before it reaches the sink. *)
  let transform_xml_chars data =
    let len = String.length data in
    let buf = Buffer.create len in
    let i = ref 0 in
    while !i < len do
      let c = data.[!i] in
      if c = '\xEF' && !i + 2 < len && data.[!i+1] = '\xBF' && data.[!i+2] = '\xBF' then begin
        (* U+FFFF → U+FFFD *)
        Buffer.add_string buf "\xEF\xBF\xBD";
        i := !i + 3
      end else begin
        Buffer.add_char buf c;
        incr i
      end
    done;
    Buffer.contents buf
  in
(* Local emit functions with access to S *)
(* Flush batched character data to the sink as a single Character token.
   Character tokens never trigger a state switch, so the sink's result is
   deliberately ignored here. *)
let emit_pending_chars () =
  if Buffer.length t.pending_chars > 0 then begin
    let data = Buffer.contents t.pending_chars in
    Buffer.clear t.pending_chars;
    (* In XML mode, rewrite U+FFFF to U+FFFD before handing off. *)
    let data = if t.xml_mode then transform_xml_chars data else data in
    let line, column = Tokenizer_stream.position t.stream in
    ignore (S.process t.sink (Tokenizer_token.Character data) ~line ~column)
  end
in
(* Emit [token], first flushing pending character data so tokens reach the
   sink in source order.  The sink may request a tokenizer state switch
   (e.g. the tree builder selecting RCDATA/RAWTEXT after a start tag). *)
let emit token =
  emit_pending_chars ();
  let line, column = Tokenizer_stream.position t.stream in
  match S.process t.sink token ~line ~column with
  | `Continue -> ()
  | `SwitchTo new_state -> t.state <- new_state
in
(* Finalize the tag token under construction and emit it. *)
let emit_current_tag () =
  (* Commit any attribute still being collected. *)
  finish_attribute t;
  let name = Buffer.contents t.current_tag_name in
  (* Attributes were accumulated in reverse order. *)
  let attrs = List.rev t.current_attrs in
  (* Check for end tag with attributes or self-closing flag *)
  if t.current_tag_kind = Tokenizer_token.End then begin
    if attrs <> [] then
      error t "end-tag-with-attributes";
    if t.current_tag_self_closing then
      error t "end-tag-with-trailing-solidus"
  end;
  let tag = {
    Tokenizer_token.kind = t.current_tag_kind;
    name;
    attrs;
    self_closing = t.current_tag_self_closing;
  } in
  (* Remember the last start tag name for "appropriate end tag" checks. *)
  if t.current_tag_kind = Tokenizer_token.Start then
    t.last_start_tag <- name;
  emit (Tokenizer_token.Tag tag)
in
(* Package the collected DOCTYPE pieces into a token and emit it. *)
let emit_current_doctype () =
  let doctype = {
    Tokenizer_token.name = Option.map Buffer.contents t.current_doctype_name;
    public_id = Option.map Buffer.contents t.current_doctype_public;
    system_id = Option.map Buffer.contents t.current_doctype_system;
    force_quirks = t.current_doctype_force_quirks;
  } in
  emit (Tokenizer_token.Doctype doctype)
in
(* Emit the accumulated comment.  In XML mode the data is rewritten first,
   since "--" is not permitted inside XML comments. *)
let emit_current_comment () =
  let content = Buffer.contents t.current_comment in
  let content =
    if t.xml_mode then begin
      (* XML mode: transform -- to - - in comments *)
      let buf = Buffer.create (String.length content + 10) in
      let len = String.length content in
      let i = ref 0 in
      while !i < len do
        if !i + 1 < len && content.[!i] = '-' && content.[!i+1] = '-' then begin
          Buffer.add_string buf "- -";
          i := !i + 2
        end else begin
          Buffer.add_char buf content.[!i];
          incr i
        end
      done;
      Buffer.contents buf
    end else content
  in
  emit (Tokenizer_token.Comment content)
in
(* Check for control characters and emit error if needed *)
(* Only checks ASCII control chars; C1 controls (U+0080-U+009F) are 2-byte in UTF-8 *)
(* Report a parse error for disallowed single-byte control characters. *)
let check_control_char c =
  let code = Char.code c in
  (* Control chars: U+0001-U+0008, U+000B, U+000E-U+001F, U+007F *)
  (* Allowed: U+0009 (tab), U+000A (LF), U+000C (FF), U+000D (CR) *)
  (* Note: U+0080-U+009F (C1 controls) are 2-byte UTF-8 sequences starting with 0xC2 *)
  (* Note: We only check single-byte control chars here; multi-byte checks are TODO *)
  if (code >= 0x01 && code <= 0x08) ||
     code = 0x0B ||
     (code >= 0x0E && code <= 0x1F) ||
     code = 0x7F then
    error t (Printf.sprintf "control-character-in-input-stream:%04x" code)
in
(* Emit char with control character check *)
let emit_char_checked c =
  check_control_char c;
  emit_char t c
in
(* Drive the state machine: one [step] per loop iteration until the stream
   reports EOF, then dispatch EOF handling.  Tail-recursive. *)
let rec process_state () =
  if Tokenizer_stream.is_eof t.stream && t.state <> Tokenizer_state.Data then begin
    (* Handle EOF in various states *)
    handle_eof ()
  end else if Tokenizer_stream.is_eof t.stream then begin
    (* EOF in the Data state: flush characters and emit the EOF token. *)
    emit_pending_chars ();
    let line, column = Tokenizer_stream.position t.stream in
    ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
  end else begin
    step ();
    process_state ()
  end
(* EOF handling for every tokenizer state (WHATWG 13.2.5, the "EOF" entry of
   each state).  The repeated flush-then-EOF tail is factored into a local
   helper.  Fix: the end-tag states must emit the literal "</" (less-than
   plus solidus) as character data before the EOF token; these string
   literals had been lost (they appeared as [emit_str t ""]). *)
and handle_eof () =
  let emit_eof () =
    emit_pending_chars ();
    let line, column = Tokenizer_stream.position t.stream in
    ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
  in
  match t.state with
  | Tokenizer_state.Data -> emit_eof ()
  | Tokenizer_state.Tag_open ->
    error t "eof-before-tag-name";
    emit_char t '<';
    emit_eof ()
  | Tokenizer_state.End_tag_open ->
    error t "eof-before-tag-name";
    (* Spec: emit '<' and '/' character tokens before EOF. *)
    emit_str t "</";
    emit_eof ()
  | Tokenizer_state.Tag_name
  | Tokenizer_state.Before_attribute_name
  | Tokenizer_state.Attribute_name
  | Tokenizer_state.After_attribute_name
  | Tokenizer_state.Before_attribute_value
  | Tokenizer_state.Attribute_value_double_quoted
  | Tokenizer_state.Attribute_value_single_quoted
  | Tokenizer_state.Attribute_value_unquoted
  | Tokenizer_state.After_attribute_value_quoted
  | Tokenizer_state.Self_closing_start_tag ->
    error t "eof-in-tag";
    emit_eof ()
  | Tokenizer_state.Rawtext -> emit_eof ()
  | Tokenizer_state.Rawtext_less_than_sign ->
    emit_char t '<';
    emit_eof ()
  | Tokenizer_state.Rawtext_end_tag_open ->
    emit_str t "</";
    emit_eof ()
  | Tokenizer_state.Rawtext_end_tag_name ->
    (* "</" plus whatever letters were buffered for the candidate end tag. *)
    emit_str t "</";
    emit_str t (Buffer.contents t.temp_buffer);
    emit_eof ()
  | Tokenizer_state.Rcdata -> emit_eof ()
  | Tokenizer_state.Rcdata_less_than_sign ->
    emit_char t '<';
    emit_eof ()
  | Tokenizer_state.Rcdata_end_tag_open ->
    emit_str t "</";
    emit_eof ()
  | Tokenizer_state.Rcdata_end_tag_name ->
    emit_str t "</";
    emit_str t (Buffer.contents t.temp_buffer);
    emit_eof ()
  | Tokenizer_state.Script_data -> emit_eof ()
  | Tokenizer_state.Script_data_less_than_sign ->
    emit_char t '<';
    emit_eof ()
  | Tokenizer_state.Script_data_end_tag_open ->
    emit_str t "</";
    emit_eof ()
  | Tokenizer_state.Script_data_end_tag_name ->
    emit_str t "</";
    emit_str t (Buffer.contents t.temp_buffer);
    emit_eof ()
  | Tokenizer_state.Script_data_escape_start
  | Tokenizer_state.Script_data_escape_start_dash
  | Tokenizer_state.Script_data_escaped
  | Tokenizer_state.Script_data_escaped_dash
  | Tokenizer_state.Script_data_escaped_dash_dash ->
    error t "eof-in-script-html-comment-like-text";
    emit_eof ()
  | Tokenizer_state.Script_data_escaped_less_than_sign ->
    emit_char t '<';
    emit_eof ()
  | Tokenizer_state.Script_data_escaped_end_tag_open ->
    emit_str t "</";
    emit_eof ()
  | Tokenizer_state.Script_data_escaped_end_tag_name ->
    emit_str t "</";
    emit_str t (Buffer.contents t.temp_buffer);
    emit_eof ()
  | Tokenizer_state.Script_data_double_escape_start
  | Tokenizer_state.Script_data_double_escaped
  | Tokenizer_state.Script_data_double_escaped_dash
  | Tokenizer_state.Script_data_double_escaped_dash_dash ->
    error t "eof-in-script-html-comment-like-text";
    emit_eof ()
  | Tokenizer_state.Script_data_double_escaped_less_than_sign ->
    (* '<' was already emitted when entering this state from
       Script_data_double_escaped *)
    emit_eof ()
  | Tokenizer_state.Script_data_double_escape_end -> emit_eof ()
  | Tokenizer_state.Plaintext -> emit_eof ()
  | Tokenizer_state.Comment_start
  | Tokenizer_state.Comment_start_dash
  | Tokenizer_state.Comment
  | Tokenizer_state.Comment_less_than_sign
  | Tokenizer_state.Comment_less_than_sign_bang
  | Tokenizer_state.Comment_less_than_sign_bang_dash
  | Tokenizer_state.Comment_less_than_sign_bang_dash_dash
  | Tokenizer_state.Comment_end_dash
  | Tokenizer_state.Comment_end
  | Tokenizer_state.Comment_end_bang ->
    error t "eof-in-comment";
    emit_current_comment ();
    emit_eof ()
  | Tokenizer_state.Bogus_comment ->
    emit_current_comment ();
    emit_eof ()
  | Tokenizer_state.Markup_declaration_open ->
    error t "incorrectly-opened-comment";
    (* Emit an empty comment token. *)
    Buffer.clear t.current_comment;
    emit_current_comment ();
    emit_eof ()
  | Tokenizer_state.Doctype
  | Tokenizer_state.Before_doctype_name ->
    error t "eof-in-doctype";
    (* No name was collected: emit a fresh, force-quirks doctype. *)
    start_new_doctype t;
    t.current_doctype_force_quirks <- true;
    emit_current_doctype ();
    emit_eof ()
  | Tokenizer_state.Doctype_name
  | Tokenizer_state.After_doctype_name
  | Tokenizer_state.After_doctype_public_keyword
  | Tokenizer_state.Before_doctype_public_identifier
  | Tokenizer_state.Doctype_public_identifier_double_quoted
  | Tokenizer_state.Doctype_public_identifier_single_quoted
  | Tokenizer_state.After_doctype_public_identifier
  | Tokenizer_state.Between_doctype_public_and_system_identifiers
  | Tokenizer_state.After_doctype_system_keyword
  | Tokenizer_state.Before_doctype_system_identifier
  | Tokenizer_state.Doctype_system_identifier_double_quoted
  | Tokenizer_state.Doctype_system_identifier_single_quoted
  | Tokenizer_state.After_doctype_system_identifier ->
    error t "eof-in-doctype";
    t.current_doctype_force_quirks <- true;
    emit_current_doctype ();
    emit_eof ()
  | Tokenizer_state.Bogus_doctype ->
    emit_current_doctype ();
    emit_eof ()
  | Tokenizer_state.Cdata_section ->
    error t "eof-in-cdata";
    emit_eof ()
  | Tokenizer_state.Cdata_section_bracket ->
    error t "eof-in-cdata";
    (* The pending ']' was literal data after all. *)
    emit_char t ']';
    emit_eof ()
  | Tokenizer_state.Cdata_section_end ->
    error t "eof-in-cdata";
    (* The pending "]]" was literal data after all. *)
    emit_str t "]]";
    emit_eof ()
  | Tokenizer_state.Character_reference ->
    (* state_character_reference never ran, so initialize temp_buffer with & *)
    Buffer.clear t.temp_buffer;
    Buffer.add_char t.temp_buffer '&';
    flush_code_points_consumed_as_char_ref t;
    t.state <- t.return_state;
    handle_eof ()
  | Tokenizer_state.Named_character_reference ->
    flush_code_points_consumed_as_char_ref t;
    t.state <- t.return_state;
    handle_eof ()
  | Tokenizer_state.Numeric_character_reference ->
    (* At EOF right after "&#" - no digits followed. *)
    error t "absence-of-digits-in-numeric-character-reference";
    flush_code_points_consumed_as_char_ref t;
    t.state <- t.return_state;
    handle_eof ()
  | Tokenizer_state.Hexadecimal_character_reference_start
  | Tokenizer_state.Decimal_character_reference_start ->
    error t "absence-of-digits-in-numeric-character-reference";
    flush_code_points_consumed_as_char_ref t;
    t.state <- t.return_state;
    handle_eof ()
  | Tokenizer_state.Numeric_character_reference_end ->
    (* We have collected digits; run the state once to finalize the
       character reference, then retry EOF handling. *)
    step ();
    handle_eof ()
  | Tokenizer_state.Ambiguous_ampersand ->
    (* Buffer was already flushed when entering this state, just transition *)
    t.state <- t.return_state;
    handle_eof ()
  | Tokenizer_state.Hexadecimal_character_reference
  | Tokenizer_state.Decimal_character_reference ->
    (* At EOF with collected digits - convert the numeric reference *)
    error t "missing-semicolon-after-character-reference";
    let code = t.char_ref_code in
    let replacement_char = "\xEF\xBF\xBD" in
    let result =
      if code = 0 then begin
        error t "null-character-reference";
        replacement_char
      end else if code > 0x10FFFF then begin
        error t "character-reference-outside-unicode-range";
        replacement_char
      end else if code >= 0xD800 && code <= 0xDFFF then begin
        error t "surrogate-character-reference";
        replacement_char
      end else
        Entities.Numeric_ref.codepoint_to_utf8 code
    in
    Buffer.clear t.temp_buffer;
    Buffer.add_string t.temp_buffer result;
    flush_code_points_consumed_as_char_ref t;
    t.state <- t.return_state;
    handle_eof ()
(* Dispatch: run exactly one state-handler for the current state. *)
and step () =
  match t.state with
  | Tokenizer_state.Data -> state_data ()
  | Tokenizer_state.Rcdata -> state_rcdata ()
  | Tokenizer_state.Rawtext -> state_rawtext ()
  | Tokenizer_state.Script_data -> state_script_data ()
  | Tokenizer_state.Plaintext -> state_plaintext ()
  | Tokenizer_state.Tag_open -> state_tag_open ()
  | Tokenizer_state.End_tag_open -> state_end_tag_open ()
  | Tokenizer_state.Tag_name -> state_tag_name ()
  | Tokenizer_state.Rcdata_less_than_sign -> state_rcdata_less_than_sign ()
  | Tokenizer_state.Rcdata_end_tag_open -> state_rcdata_end_tag_open ()
  | Tokenizer_state.Rcdata_end_tag_name -> state_rcdata_end_tag_name ()
  | Tokenizer_state.Rawtext_less_than_sign -> state_rawtext_less_than_sign ()
  | Tokenizer_state.Rawtext_end_tag_open -> state_rawtext_end_tag_open ()
  | Tokenizer_state.Rawtext_end_tag_name -> state_rawtext_end_tag_name ()
  | Tokenizer_state.Script_data_less_than_sign -> state_script_data_less_than_sign ()
  | Tokenizer_state.Script_data_end_tag_open -> state_script_data_end_tag_open ()
  | Tokenizer_state.Script_data_end_tag_name -> state_script_data_end_tag_name ()
  | Tokenizer_state.Script_data_escape_start -> state_script_data_escape_start ()
  | Tokenizer_state.Script_data_escape_start_dash -> state_script_data_escape_start_dash ()
  | Tokenizer_state.Script_data_escaped -> state_script_data_escaped ()
  | Tokenizer_state.Script_data_escaped_dash -> state_script_data_escaped_dash ()
  | Tokenizer_state.Script_data_escaped_dash_dash -> state_script_data_escaped_dash_dash ()
  | Tokenizer_state.Script_data_escaped_less_than_sign -> state_script_data_escaped_less_than_sign ()
  | Tokenizer_state.Script_data_escaped_end_tag_open -> state_script_data_escaped_end_tag_open ()
  | Tokenizer_state.Script_data_escaped_end_tag_name -> state_script_data_escaped_end_tag_name ()
  | Tokenizer_state.Script_data_double_escape_start -> state_script_data_double_escape_start ()
  | Tokenizer_state.Script_data_double_escaped -> state_script_data_double_escaped ()
  | Tokenizer_state.Script_data_double_escaped_dash -> state_script_data_double_escaped_dash ()
  | Tokenizer_state.Script_data_double_escaped_dash_dash -> state_script_data_double_escaped_dash_dash ()
  | Tokenizer_state.Script_data_double_escaped_less_than_sign -> state_script_data_double_escaped_less_than_sign ()
  | Tokenizer_state.Script_data_double_escape_end -> state_script_data_double_escape_end ()
  | Tokenizer_state.Before_attribute_name -> state_before_attribute_name ()
  | Tokenizer_state.Attribute_name -> state_attribute_name ()
  | Tokenizer_state.After_attribute_name -> state_after_attribute_name ()
  | Tokenizer_state.Before_attribute_value -> state_before_attribute_value ()
  | Tokenizer_state.Attribute_value_double_quoted -> state_attribute_value_double_quoted ()
  | Tokenizer_state.Attribute_value_single_quoted -> state_attribute_value_single_quoted ()
  | Tokenizer_state.Attribute_value_unquoted -> state_attribute_value_unquoted ()
  | Tokenizer_state.After_attribute_value_quoted -> state_after_attribute_value_quoted ()
  | Tokenizer_state.Self_closing_start_tag -> state_self_closing_start_tag ()
  | Tokenizer_state.Bogus_comment -> state_bogus_comment ()
  | Tokenizer_state.Markup_declaration_open -> state_markup_declaration_open ()
  | Tokenizer_state.Comment_start -> state_comment_start ()
  | Tokenizer_state.Comment_start_dash -> state_comment_start_dash ()
  | Tokenizer_state.Comment -> state_comment ()
  | Tokenizer_state.Comment_less_than_sign -> state_comment_less_than_sign ()
  | Tokenizer_state.Comment_less_than_sign_bang -> state_comment_less_than_sign_bang ()
  | Tokenizer_state.Comment_less_than_sign_bang_dash -> state_comment_less_than_sign_bang_dash ()
  | Tokenizer_state.Comment_less_than_sign_bang_dash_dash -> state_comment_less_than_sign_bang_dash_dash ()
  | Tokenizer_state.Comment_end_dash -> state_comment_end_dash ()
  | Tokenizer_state.Comment_end -> state_comment_end ()
  | Tokenizer_state.Comment_end_bang -> state_comment_end_bang ()
  | Tokenizer_state.Doctype -> state_doctype ()
  | Tokenizer_state.Before_doctype_name -> state_before_doctype_name ()
  | Tokenizer_state.Doctype_name -> state_doctype_name ()
  | Tokenizer_state.After_doctype_name -> state_after_doctype_name ()
  | Tokenizer_state.After_doctype_public_keyword -> state_after_doctype_public_keyword ()
  | Tokenizer_state.Before_doctype_public_identifier -> state_before_doctype_public_identifier ()
  | Tokenizer_state.Doctype_public_identifier_double_quoted -> state_doctype_public_identifier_double_quoted ()
  | Tokenizer_state.Doctype_public_identifier_single_quoted -> state_doctype_public_identifier_single_quoted ()
  | Tokenizer_state.After_doctype_public_identifier -> state_after_doctype_public_identifier ()
  | Tokenizer_state.Between_doctype_public_and_system_identifiers -> state_between_doctype_public_and_system_identifiers ()
  | Tokenizer_state.After_doctype_system_keyword -> state_after_doctype_system_keyword ()
  | Tokenizer_state.Before_doctype_system_identifier -> state_before_doctype_system_identifier ()
  | Tokenizer_state.Doctype_system_identifier_double_quoted -> state_doctype_system_identifier_double_quoted ()
  | Tokenizer_state.Doctype_system_identifier_single_quoted -> state_doctype_system_identifier_single_quoted ()
  | Tokenizer_state.After_doctype_system_identifier -> state_after_doctype_system_identifier ()
  | Tokenizer_state.Bogus_doctype -> state_bogus_doctype ()
  | Tokenizer_state.Cdata_section -> state_cdata_section ()
  | Tokenizer_state.Cdata_section_bracket -> state_cdata_section_bracket ()
  | Tokenizer_state.Cdata_section_end -> state_cdata_section_end ()
  | Tokenizer_state.Character_reference -> state_character_reference ()
  | Tokenizer_state.Named_character_reference -> state_named_character_reference ()
  | Tokenizer_state.Ambiguous_ampersand -> state_ambiguous_ampersand ()
  | Tokenizer_state.Numeric_character_reference -> state_numeric_character_reference ()
  | Tokenizer_state.Hexadecimal_character_reference_start -> state_hexadecimal_character_reference_start ()
  | Tokenizer_state.Decimal_character_reference_start -> state_decimal_character_reference_start ()
  | Tokenizer_state.Hexadecimal_character_reference -> state_hexadecimal_character_reference ()
  | Tokenizer_state.Decimal_character_reference -> state_decimal_character_reference ()
  | Tokenizer_state.Numeric_character_reference_end -> state_numeric_character_reference_end ()
(* State implementations *)
(* Data state: ordinary character data between tags. *)
and state_data () =
  match Tokenizer_stream.consume t.stream with
  | Some '&' ->
    t.return_state <- Tokenizer_state.Data;
    t.state <- Tokenizer_state.Character_reference
  | Some '<' ->
    t.state <- Tokenizer_state.Tag_open
  | Some '\x00' ->
    (* Emit pending chars first, then emit null separately for proper tree builder handling *)
    emit_pending_chars ();
    error t "unexpected-null-character";
    let line, column = Tokenizer_stream.position t.stream in
    ignore (S.process t.sink (Tokenizer_token.Character "\x00") ~line ~column)
  | Some c ->
    emit_char_checked c
  | None -> ()
(* RCDATA state: character references are recognized, tags are not (except
   the matching end tag); NULL becomes U+FFFD. *)
and state_rcdata () =
  match Tokenizer_stream.consume t.stream with
  | Some '&' ->
    t.return_state <- Tokenizer_state.Rcdata;
    t.state <- Tokenizer_state.Character_reference
  | Some '<' ->
    t.state <- Tokenizer_state.Rcdata_less_than_sign
  | Some '\x00' ->
    error t "unexpected-null-character";
    emit_str t "\xEF\xBF\xBD"
  | Some c ->
    emit_char_checked c
  | None -> ()
(* RAWTEXT state: no character references; only the matching end tag ends it. *)
and state_rawtext () =
  match Tokenizer_stream.consume t.stream with
  | Some '<' ->
    t.state <- Tokenizer_state.Rawtext_less_than_sign
  | Some '\x00' ->
    error t "unexpected-null-character";
    emit_str t "\xEF\xBF\xBD"
  | Some c ->
    emit_char_checked c
  | None -> ()
(* Script data state: like RAWTEXT but with the escaped/double-escaped
   comment-like sub-states. *)
and state_script_data () =
  match Tokenizer_stream.consume t.stream with
  | Some '<' ->
    t.state <- Tokenizer_state.Script_data_less_than_sign
  | Some '\x00' ->
    error t "unexpected-null-character";
    emit_str t "\xEF\xBF\xBD"
  | Some c ->
    emit_char_checked c
  | None -> ()
(* PLAINTEXT state: everything to EOF is character data. *)
and state_plaintext () =
  match Tokenizer_stream.consume t.stream with
  | Some '\x00' ->
    error t "unexpected-null-character";
    emit_str t "\xEF\xBF\xBD"
  | Some c ->
    emit_char_checked c
  | None -> ()
(* Tag open state: we have just consumed '<'. *)
and state_tag_open () =
  match Tokenizer_stream.peek t.stream with
  | Some '!' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Markup_declaration_open
  | Some '/' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.End_tag_open
  | Some c when is_ascii_alpha c ->
    (* The letter is not consumed here; Tag_name will reprocess it. *)
    start_new_tag t Tokenizer_token.Start;
    t.state <- Tokenizer_state.Tag_name
  | Some '?' ->
    (* '?' is reprocessed as the first comment character (not advanced). *)
    error t "unexpected-question-mark-instead-of-tag-name";
    Buffer.clear t.current_comment;
    t.state <- Tokenizer_state.Bogus_comment
  | None ->
    (* NOTE(review): handle_eof's Tag_open branch also reports this error
       and emits '<'; whether this branch is reachable (and whether the
       emission can double up) depends on Tokenizer_stream's is_eof/peek
       semantics - confirm against the stream implementation. *)
    error t "eof-before-tag-name";
    emit_char t '<'
  | Some _ ->
    (* Reprocess the offending character in Data. *)
    error t "invalid-first-character-of-tag-name";
    emit_char t '<';
    t.state <- Tokenizer_state.Data
(* End tag open state: we have just consumed "</".
   Fix: at EOF the spec requires emitting '<' and '/' as character data
   before the EOF token; the "</" literal had been lost ([emit_str t ""]). *)
and state_end_tag_open () =
  match Tokenizer_stream.peek t.stream with
  | Some c when is_ascii_alpha c ->
    start_new_tag t Tokenizer_token.End;
    t.state <- Tokenizer_state.Tag_name
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    error t "missing-end-tag-name";
    t.state <- Tokenizer_state.Data
  | None ->
    error t "eof-before-tag-name";
    emit_str t "</"
  | Some _ ->
    (* Reprocess the offending character as comment data. *)
    error t "invalid-first-character-of-tag-name";
    Buffer.clear t.current_comment;
    t.state <- Tokenizer_state.Bogus_comment
(* Tag name state: accumulate the (ASCII-lowercased) tag name. *)
and state_tag_name () =
  match Tokenizer_stream.consume t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') ->
    t.state <- Tokenizer_state.Before_attribute_name
  | Some '/' ->
    t.state <- Tokenizer_state.Self_closing_start_tag
  | Some '>' ->
    t.state <- Tokenizer_state.Data;
    emit_current_tag ()
  | Some '\x00' ->
    (* NULL in a tag name becomes U+FFFD per spec. *)
    error t "unexpected-null-character";
    Buffer.add_string t.current_tag_name "\xEF\xBF\xBD"
  | Some c ->
    check_control_char c;
    Buffer.add_char t.current_tag_name (ascii_lower c)
  | None -> ()
(* Saw '<' in RCDATA: only "</" can begin an end tag here. *)
and state_rcdata_less_than_sign () =
  match Tokenizer_stream.peek t.stream with
  | Some '/' ->
    Tokenizer_stream.advance t.stream;
    Buffer.clear t.temp_buffer;
    t.state <- Tokenizer_state.Rcdata_end_tag_open
  | _ ->
    (* The '<' was literal text; reprocess the current char in RCDATA. *)
    emit_char t '<';
    t.state <- Tokenizer_state.Rcdata
(* RCDATA end tag open state.  Fix: the anything-else branch must emit the
   literal "</" as character data (the string had been lost). *)
and state_rcdata_end_tag_open () =
  match Tokenizer_stream.peek t.stream with
  | Some c when is_ascii_alpha c ->
    start_new_tag t Tokenizer_token.End;
    t.state <- Tokenizer_state.Rcdata_end_tag_name
  | _ ->
    emit_str t "</";
    t.state <- Tokenizer_state.Rcdata
(* RCDATA end tag name state: only an "appropriate" end tag (matching the
   last start tag) terminates RCDATA.  Fix: the fallback branch must emit
   "</" plus the buffered letters as plain text (the "</" had been lost). *)
and state_rcdata_end_tag_name () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Before_attribute_name
  | Some '/' when is_appropriate_end_tag t ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Self_closing_start_tag
  | Some '>' when is_appropriate_end_tag t ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Data;
    emit_current_tag ()
  | Some c when is_ascii_alpha c ->
    (* Collect the lowercased name; keep the original bytes in temp_buffer
       in case this turns out not to be an end tag. *)
    Tokenizer_stream.advance t.stream;
    Buffer.add_char t.current_tag_name (ascii_lower c);
    Buffer.add_char t.temp_buffer c
  | _ ->
    emit_str t "</";
    emit_str t (Buffer.contents t.temp_buffer);
    t.state <- Tokenizer_state.Rcdata
(* Saw '<' in RAWTEXT: only "</" can begin an end tag here. *)
and state_rawtext_less_than_sign () =
  match Tokenizer_stream.peek t.stream with
  | Some '/' ->
    Tokenizer_stream.advance t.stream;
    Buffer.clear t.temp_buffer;
    t.state <- Tokenizer_state.Rawtext_end_tag_open
  | _ ->
    (* The '<' was literal text; reprocess the current char in RAWTEXT. *)
    emit_char t '<';
    t.state <- Tokenizer_state.Rawtext
(* RAWTEXT end tag open state.  Fix: the anything-else branch must emit the
   literal "</" as character data (the string had been lost). *)
and state_rawtext_end_tag_open () =
  match Tokenizer_stream.peek t.stream with
  | Some c when is_ascii_alpha c ->
    start_new_tag t Tokenizer_token.End;
    t.state <- Tokenizer_state.Rawtext_end_tag_name
  | _ ->
    emit_str t "</";
    t.state <- Tokenizer_state.Rawtext
(* RAWTEXT end tag name state.  Fix: the fallback branch must emit "</" plus
   the buffered letters as plain text (the "</" had been lost). *)
and state_rawtext_end_tag_name () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Before_attribute_name
  | Some '/' when is_appropriate_end_tag t ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Self_closing_start_tag
  | Some '>' when is_appropriate_end_tag t ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Data;
    emit_current_tag ()
  | Some c when is_ascii_alpha c ->
    Tokenizer_stream.advance t.stream;
    Buffer.add_char t.current_tag_name (ascii_lower c);
    Buffer.add_char t.temp_buffer c
  | _ ->
    emit_str t "</";
    emit_str t (Buffer.contents t.temp_buffer);
    t.state <- Tokenizer_state.Rawtext
and state_script_data_less_than_sign () =
  (* Script data less-than sign state (WHATWG 13.2.5.15).
     Fix: the '!' arm had a broken/unterminated string literal and the
     anything-else arm was missing. Per spec, '!' switches to the escape
     start state and emits "<!"; anything else emits '<' and reconsumes
     in the script data state. *)
  match Tokenizer_stream.peek t.stream with
  | Some '/' ->
    Tokenizer_stream.advance t.stream;
    Buffer.clear t.temp_buffer;
    t.state <- Tokenizer_state.Script_data_end_tag_open
  | Some '!' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Script_data_escape_start;
    emit_str t "<!"
  | _ ->
    emit_char t '<';
    t.state <- Tokenizer_state.Script_data
and state_script_data_end_tag_open () =
  (* Script data end tag open state (WHATWG 13.2.5.16). *)
  match Tokenizer_stream.peek t.stream with
  | Some c when is_ascii_alpha c ->
    start_new_tag t Tokenizer_token.End;
    t.state <- Tokenizer_state.Script_data_end_tag_name
  | _ ->
    (* Fix: the spec emits U+003C and U+002F here, not an empty string. *)
    emit_str t "</";
    t.state <- Tokenizer_state.Script_data
and state_script_data_end_tag_name () =
  (* Script data end tag name state (WHATWG 13.2.5.17); mirrors the
     RCDATA variant but falls back to the Script_data state. *)
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Before_attribute_name
  | Some '/' when is_appropriate_end_tag t ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Self_closing_start_tag
  | Some '>' when is_appropriate_end_tag t ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Data;
    emit_current_tag ()
  | Some c when is_ascii_alpha c ->
    Tokenizer_stream.advance t.stream;
    Buffer.add_char t.current_tag_name (ascii_lower c);
    Buffer.add_char t.temp_buffer c
  | _ ->
    (* Fix: re-emit the consumed "</" (was an empty string) plus the
       buffered characters, then reconsume in script data. *)
    emit_str t "</";
    emit_str t (Buffer.contents t.temp_buffer);
    t.state <- Tokenizer_state.Script_data
(* Script data escape start state (WHATWG 13.2.5.18): after "<!" in script
   data, a '-' moves toward the "<!--" escaped section. *)
and state_script_data_escape_start () =
  match Tokenizer_stream.peek t.stream with
  | Some '-' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Script_data_escape_start_dash;
    emit_char t '-'
  | _ ->
    (* Anything else: reconsume in the script data state (no advance). *)
    t.state <- Tokenizer_state.Script_data
(* Script data escape start dash state (13.2.5.19): a second '-' completes
   "<!--" and enters the escaped dash-dash state. *)
and state_script_data_escape_start_dash () =
  match Tokenizer_stream.peek t.stream with
  | Some '-' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Script_data_escaped_dash_dash;
    emit_char t '-'
  | _ ->
    t.state <- Tokenizer_state.Script_data
(* Script data escaped state (13.2.5.20): inside "<!-- ... -->" within a
   script element. NUL is replaced with U+FFFD per spec. *)
and state_script_data_escaped () =
  match Tokenizer_stream.consume t.stream with
  | Some '-' ->
    t.state <- Tokenizer_state.Script_data_escaped_dash;
    emit_char t '-'
  | Some '<' ->
    t.state <- Tokenizer_state.Script_data_escaped_less_than_sign
  | Some '\x00' ->
    error t "unexpected-null-character";
    emit_str t "\xEF\xBF\xBD"
  | Some c ->
    emit_char_checked c
  | None -> ()
(* Script data escaped dash state (13.2.5.21): one '-' seen. *)
and state_script_data_escaped_dash () =
  match Tokenizer_stream.consume t.stream with
  | Some '-' ->
    t.state <- Tokenizer_state.Script_data_escaped_dash_dash;
    emit_char t '-'
  | Some '<' ->
    t.state <- Tokenizer_state.Script_data_escaped_less_than_sign
  | Some '\x00' ->
    error t "unexpected-null-character";
    t.state <- Tokenizer_state.Script_data_escaped;
    emit_str t "\xEF\xBF\xBD"
  | Some c ->
    t.state <- Tokenizer_state.Script_data_escaped;
    emit_char_checked c
  | None -> ()
(* Script data escaped dash dash state (13.2.5.22): "--" seen; '>' ends the
   escaped section and returns to plain script data. Extra '-' characters
   stay in this state. *)
and state_script_data_escaped_dash_dash () =
  match Tokenizer_stream.consume t.stream with
  | Some '-' ->
    emit_char t '-'
  | Some '<' ->
    t.state <- Tokenizer_state.Script_data_escaped_less_than_sign
  | Some '>' ->
    t.state <- Tokenizer_state.Script_data;
    emit_char t '>'
  | Some '\x00' ->
    error t "unexpected-null-character";
    t.state <- Tokenizer_state.Script_data_escaped;
    emit_str t "\xEF\xBF\xBD"
  | Some c ->
    t.state <- Tokenizer_state.Script_data_escaped;
    emit_char_checked c
  | None -> ()
(* Script data escaped less-than sign state (13.2.5.23): "</" may close the
   script; a letter may begin a nested "<script" (double escaping). The
   letter is NOT consumed here — the double-escape-start state reconsumes. *)
and state_script_data_escaped_less_than_sign () =
  match Tokenizer_stream.peek t.stream with
  | Some '/' ->
    Tokenizer_stream.advance t.stream;
    Buffer.clear t.temp_buffer;
    t.state <- Tokenizer_state.Script_data_escaped_end_tag_open
  | Some c when is_ascii_alpha c ->
    Buffer.clear t.temp_buffer;
    emit_char t '<';
    t.state <- Tokenizer_state.Script_data_double_escape_start
  | _ ->
    emit_char t '<';
    t.state <- Tokenizer_state.Script_data_escaped
and state_script_data_escaped_end_tag_open () =
  (* Script data escaped end tag open state (WHATWG 13.2.5.24). *)
  match Tokenizer_stream.peek t.stream with
  | Some c when is_ascii_alpha c ->
    start_new_tag t Tokenizer_token.End;
    t.state <- Tokenizer_state.Script_data_escaped_end_tag_name
  | _ ->
    (* Fix: the spec emits U+003C and U+002F here, not an empty string. *)
    emit_str t "</";
    t.state <- Tokenizer_state.Script_data_escaped
and state_script_data_escaped_end_tag_name () =
  (* Script data escaped end tag name state (WHATWG 13.2.5.25); mirrors the
     other end-tag-name states, falling back to Script_data_escaped. *)
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Before_attribute_name
  | Some '/' when is_appropriate_end_tag t ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Self_closing_start_tag
  | Some '>' when is_appropriate_end_tag t ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Data;
    emit_current_tag ()
  | Some c when is_ascii_alpha c ->
    Tokenizer_stream.advance t.stream;
    Buffer.add_char t.current_tag_name (ascii_lower c);
    Buffer.add_char t.temp_buffer c
  | _ ->
    (* Fix: re-emit the consumed "</" (was an empty string) plus the
       buffered characters, then reconsume in the escaped state. *)
    emit_str t "</";
    emit_str t (Buffer.contents t.temp_buffer);
    t.state <- Tokenizer_state.Script_data_escaped
(* Script data double escape start state (WHATWG 13.2.5.26): decides whether
   "<script" inside an escaped section starts a double-escaped region. *)
and state_script_data_double_escape_start () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt ->
    Tokenizer_stream.advance t.stream;
    let c = Option.get c_opt in
    (* Note: the `;` after the if/else sequences it before emit_char, so the
       character is emitted regardless of which state was chosen (spec). *)
    if Buffer.contents t.temp_buffer = "script" then
      t.state <- Tokenizer_state.Script_data_double_escaped
    else
      t.state <- Tokenizer_state.Script_data_escaped;
    emit_char t c
  | Some c when is_ascii_alpha c ->
    Tokenizer_stream.advance t.stream;
    (* Lowercase into the temp buffer for the "script" comparison, but emit
       the original character. *)
    Buffer.add_char t.temp_buffer (ascii_lower c);
    emit_char t c
  | _ ->
    t.state <- Tokenizer_state.Script_data_escaped
(* Script data double escaped state (13.2.5.27): text inside
   "<!-- <script ..." — characters are emitted as-is (NUL -> U+FFFD). *)
and state_script_data_double_escaped () =
  match Tokenizer_stream.consume t.stream with
  | Some '-' ->
    t.state <- Tokenizer_state.Script_data_double_escaped_dash;
    emit_char t '-'
  | Some '<' ->
    t.state <- Tokenizer_state.Script_data_double_escaped_less_than_sign;
    emit_char t '<'
  | Some '\x00' ->
    error t "unexpected-null-character";
    emit_str t "\xEF\xBF\xBD"
  | Some c ->
    emit_char_checked c
  | None -> ()
(* Script data double escaped dash state (13.2.5.28). *)
and state_script_data_double_escaped_dash () =
  match Tokenizer_stream.consume t.stream with
  | Some '-' ->
    t.state <- Tokenizer_state.Script_data_double_escaped_dash_dash;
    emit_char t '-'
  | Some '<' ->
    t.state <- Tokenizer_state.Script_data_double_escaped_less_than_sign;
    emit_char t '<'
  | Some '\x00' ->
    error t "unexpected-null-character";
    t.state <- Tokenizer_state.Script_data_double_escaped;
    emit_str t "\xEF\xBF\xBD"
  | Some c ->
    t.state <- Tokenizer_state.Script_data_double_escaped;
    emit_char_checked c
  | None -> ()
(* Script data double escaped dash dash state (13.2.5.29): '>' ends the
   whole escaped construct and returns to plain script data. *)
and state_script_data_double_escaped_dash_dash () =
  match Tokenizer_stream.consume t.stream with
  | Some '-' ->
    emit_char t '-'
  | Some '<' ->
    t.state <- Tokenizer_state.Script_data_double_escaped_less_than_sign;
    emit_char t '<'
  | Some '>' ->
    t.state <- Tokenizer_state.Script_data;
    emit_char t '>'
  | Some '\x00' ->
    error t "unexpected-null-character";
    t.state <- Tokenizer_state.Script_data_double_escaped;
    emit_str t "\xEF\xBF\xBD"
  | Some c ->
    t.state <- Tokenizer_state.Script_data_double_escaped;
    emit_char_checked c
  | None -> ()
(* Script data double escaped less-than sign state (13.2.5.30): "</" may be
   the "</script" that ends double escaping. *)
and state_script_data_double_escaped_less_than_sign () =
  match Tokenizer_stream.peek t.stream with
  | Some '/' ->
    Tokenizer_stream.advance t.stream;
    Buffer.clear t.temp_buffer;
    t.state <- Tokenizer_state.Script_data_double_escape_end;
    emit_char t '/'
  | _ ->
    t.state <- Tokenizer_state.Script_data_double_escaped
(* Script data double escape end state (13.2.5.31): symmetric to the
   double-escape-start state — "script" drops back to single escaping. *)
and state_script_data_double_escape_end () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt ->
    Tokenizer_stream.advance t.stream;
    let c = Option.get c_opt in
    if Buffer.contents t.temp_buffer = "script" then
      t.state <- Tokenizer_state.Script_data_escaped
    else
      t.state <- Tokenizer_state.Script_data_double_escaped;
    emit_char t c
  | Some c when is_ascii_alpha c ->
    Tokenizer_stream.advance t.stream;
    Buffer.add_char t.temp_buffer (ascii_lower c);
    emit_char t c
  | _ ->
    t.state <- Tokenizer_state.Script_data_double_escaped
(* Before attribute name state (WHATWG 13.2.5.32). *)
and state_before_attribute_name () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') ->
    Tokenizer_stream.advance t.stream
  | Some '/' | Some '>' | None ->
    (* Reconsume (no advance) in the after-attribute-name state. *)
    t.state <- Tokenizer_state.After_attribute_name
  | Some '=' ->
    Tokenizer_stream.advance t.stream;
    error t "unexpected-equals-sign-before-attribute-name";
    (* Spec: start a new attribute whose name begins with '='. *)
    start_new_attribute t;
    Buffer.add_char t.current_attr_name '=';
    t.state <- Tokenizer_state.Attribute_name
  | Some _ ->
    start_new_attribute t;
    t.state <- Tokenizer_state.Attribute_name
(* Attribute name state (13.2.5.33). Names are ASCII-lowercased. *)
and state_attribute_name () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') ->
    (* Spec says "reconsume"; consuming the one whitespace char here is
       equivalent because after-attribute-name skips whitespace anyway. *)
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.After_attribute_name
  | Some '/' | Some '>' | None ->
    t.state <- Tokenizer_state.After_attribute_name
  | Some '=' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Before_attribute_value
  | Some '\x00' ->
    Tokenizer_stream.advance t.stream;
    error t "unexpected-null-character";
    (* NUL becomes U+FFFD (UTF-8 "\xEF\xBF\xBD"). *)
    Buffer.add_string t.current_attr_name "\xEF\xBF\xBD"
  | Some ('"' | '\'' | '<') as c_opt ->
    Tokenizer_stream.advance t.stream;
    error t "unexpected-character-in-attribute-name";
    Buffer.add_char t.current_attr_name (Option.get c_opt)
  | Some c ->
    Tokenizer_stream.advance t.stream;
    check_control_char c;
    Buffer.add_char t.current_attr_name (ascii_lower c)
(* After attribute name state (13.2.5.34). *)
and state_after_attribute_name () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') ->
    Tokenizer_stream.advance t.stream
  | Some '/' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Self_closing_start_tag
  | Some '=' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Before_attribute_value
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Data;
    emit_current_tag ()
  | None -> ()  (* EOF: presumably reported by the driver loop — verify. *)
  | Some _ ->
    (* Anything else starts a new attribute; reconsumed by attribute-name. *)
    start_new_attribute t;
    t.state <- Tokenizer_state.Attribute_name
(* Before attribute value state (13.2.5.35). *)
and state_before_attribute_value () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') ->
    Tokenizer_stream.advance t.stream
  | Some '"' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Attribute_value_double_quoted
  | Some '\'' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Attribute_value_single_quoted
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    error t "missing-attribute-value";
    t.state <- Tokenizer_state.Data;
    emit_current_tag ()
  | _ ->
    (* Reconsume in the unquoted-value state. *)
    t.state <- Tokenizer_state.Attribute_value_unquoted
(* Attribute value (double-quoted) state (13.2.5.36). '&' enters the
   character-reference machinery with this state as the return state. *)
and state_attribute_value_double_quoted () =
  match Tokenizer_stream.consume t.stream with
  | Some '"' ->
    t.state <- Tokenizer_state.After_attribute_value_quoted
  | Some '&' ->
    t.return_state <- Tokenizer_state.Attribute_value_double_quoted;
    t.state <- Tokenizer_state.Character_reference
  | Some '\x00' ->
    error t "unexpected-null-character";
    Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
  | Some c ->
    check_control_char c;
    Buffer.add_char t.current_attr_value c
  | None -> ()
(* Attribute value (single-quoted) state (13.2.5.37). *)
and state_attribute_value_single_quoted () =
  match Tokenizer_stream.consume t.stream with
  | Some '\'' ->
    t.state <- Tokenizer_state.After_attribute_value_quoted
  | Some '&' ->
    t.return_state <- Tokenizer_state.Attribute_value_single_quoted;
    t.state <- Tokenizer_state.Character_reference
  | Some '\x00' ->
    error t "unexpected-null-character";
    Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
  | Some c ->
    check_control_char c;
    Buffer.add_char t.current_attr_value c
  | None -> ()
(* Attribute value (unquoted) state (13.2.5.38). *)
and state_attribute_value_unquoted () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Before_attribute_name
  | Some '&' ->
    Tokenizer_stream.advance t.stream;
    t.return_state <- Tokenizer_state.Attribute_value_unquoted;
    t.state <- Tokenizer_state.Character_reference
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Data;
    emit_current_tag ()
  | Some '\x00' ->
    Tokenizer_stream.advance t.stream;
    error t "unexpected-null-character";
    Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
  | Some ('"' | '\'' | '<' | '=' | '`') as c_opt ->
    Tokenizer_stream.advance t.stream;
    error t "unexpected-character-in-unquoted-attribute-value";
    Buffer.add_char t.current_attr_value (Option.get c_opt)
  | Some c ->
    Tokenizer_stream.advance t.stream;
    check_control_char c;
    Buffer.add_char t.current_attr_value c
  | None -> ()
(* After attribute value (quoted) state (13.2.5.39). *)
and state_after_attribute_value_quoted () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Before_attribute_name
  | Some '/' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Self_closing_start_tag
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Data;
    emit_current_tag ()
  | None -> ()
  | Some _ ->
    error t "missing-whitespace-between-attributes";
    (* Reconsume in before-attribute-name. *)
    t.state <- Tokenizer_state.Before_attribute_name
(* Self-closing start tag state (13.2.5.40): "/>" marks the tag
   self-closing; anything else is an error and the '/' is dropped. *)
and state_self_closing_start_tag () =
  match Tokenizer_stream.peek t.stream with
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    t.current_tag_self_closing <- true;
    t.state <- Tokenizer_state.Data;
    emit_current_tag ()
  | None -> ()
  | Some _ ->
    error t "unexpected-solidus-in-tag";
    t.state <- Tokenizer_state.Before_attribute_name
(* Bogus comment state (13.2.5.41): swallow everything up to '>' into the
   current comment token. *)
and state_bogus_comment () =
  match Tokenizer_stream.consume t.stream with
  | Some '>' ->
    t.state <- Tokenizer_state.Data;
    emit_current_comment ()
  | Some '\x00' ->
    error t "unexpected-null-character";
    Buffer.add_string t.current_comment "\xEF\xBF\xBD"
  | Some c ->
    check_control_char c;
    Buffer.add_char t.current_comment c
  | None -> ()
(* Markup declaration open state (13.2.5.42): dispatch on "--" (comment),
   "DOCTYPE" (ASCII case-insensitive), or "[CDATA[" (foreign content only). *)
and state_markup_declaration_open () =
  if Tokenizer_stream.matches_ci t.stream "--" then begin
    ignore (Tokenizer_stream.consume_exact_ci t.stream "--");
    Buffer.clear t.current_comment;
    t.state <- Tokenizer_state.Comment_start
  end else if Tokenizer_stream.matches_ci t.stream "DOCTYPE" then begin
    ignore (Tokenizer_stream.consume_exact_ci t.stream "DOCTYPE");
    t.state <- Tokenizer_state.Doctype
  end else if Tokenizer_stream.matches_ci t.stream "[CDATA[" then begin
    ignore (Tokenizer_stream.consume_exact_ci t.stream "[CDATA[");
    (* CDATA only allowed in foreign content *)
    if S.adjusted_current_node_in_html_namespace t.sink then begin
      error t "cdata-in-html-content";
      Buffer.clear t.current_comment;
      Buffer.add_string t.current_comment "[CDATA[";
      t.state <- Tokenizer_state.Bogus_comment
    end else
      t.state <- Tokenizer_state.Cdata_section
  end else begin
    error t "incorrectly-opened-comment";
    Buffer.clear t.current_comment;
    t.state <- Tokenizer_state.Bogus_comment
  end
(* Comment start state (WHATWG 13.2.5.43): just after "<!--". *)
and state_comment_start () =
  match Tokenizer_stream.peek t.stream with
  | Some '-' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Comment_start_dash
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    error t "abrupt-closing-of-empty-comment";
    t.state <- Tokenizer_state.Data;
    emit_current_comment ()
  | _ ->
    (* Reconsume in the comment state. *)
    t.state <- Tokenizer_state.Comment
(* Comment start dash state (13.2.5.44): "<!---" seen. *)
and state_comment_start_dash () =
  match Tokenizer_stream.peek t.stream with
  | Some '-' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Comment_end
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    error t "abrupt-closing-of-empty-comment";
    t.state <- Tokenizer_state.Data;
    emit_current_comment ()
  | None -> ()
  | Some _ ->
    (* The lone '-' was comment data after all. *)
    Buffer.add_char t.current_comment '-';
    t.state <- Tokenizer_state.Comment
(* Comment state (13.2.5.45): accumulate comment data; '<' may begin a
   nested "<!--" (tracked for the nested-comment error). *)
and state_comment () =
  match Tokenizer_stream.consume t.stream with
  | Some '<' ->
    Buffer.add_char t.current_comment '<';
    t.state <- Tokenizer_state.Comment_less_than_sign
  | Some '-' ->
    t.state <- Tokenizer_state.Comment_end_dash
  | Some '\x00' ->
    error t "unexpected-null-character";
    Buffer.add_string t.current_comment "\xEF\xBF\xBD"
  | Some c ->
    check_control_char c;
    Buffer.add_char t.current_comment c
  | None -> ()
(* Comment less-than sign state (13.2.5.46). *)
and state_comment_less_than_sign () =
  match Tokenizer_stream.peek t.stream with
  | Some '!' ->
    Tokenizer_stream.advance t.stream;
    Buffer.add_char t.current_comment '!';
    t.state <- Tokenizer_state.Comment_less_than_sign_bang
  | Some '<' ->
    Tokenizer_stream.advance t.stream;
    Buffer.add_char t.current_comment '<'
  | _ ->
    t.state <- Tokenizer_state.Comment
(* Comment less-than sign bang state (13.2.5.47). *)
and state_comment_less_than_sign_bang () =
  match Tokenizer_stream.peek t.stream with
  | Some '-' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Comment_less_than_sign_bang_dash
  | _ ->
    t.state <- Tokenizer_state.Comment
(* Comment less-than sign bang dash state (13.2.5.48). *)
and state_comment_less_than_sign_bang_dash () =
  match Tokenizer_stream.peek t.stream with
  | Some '-' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Comment_less_than_sign_bang_dash_dash
  | _ ->
    t.state <- Tokenizer_state.Comment_end_dash
(* Comment less-than sign bang dash dash state (13.2.5.49): "<!--" inside a
   comment — a nested comment, which is a parse error unless it closes. *)
and state_comment_less_than_sign_bang_dash_dash () =
  match Tokenizer_stream.peek t.stream with
  | Some '>' | None ->
    t.state <- Tokenizer_state.Comment_end
  | Some _ ->
    error t "nested-comment";
    t.state <- Tokenizer_state.Comment_end
(* Comment end dash state (13.2.5.50): one '-' toward "-->". *)
and state_comment_end_dash () =
  match Tokenizer_stream.peek t.stream with
  | Some '-' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Comment_end
  | None -> ()
  | Some _ ->
    Buffer.add_char t.current_comment '-';
    t.state <- Tokenizer_state.Comment
(* Comment end state (13.2.5.51): "--" seen; '>' closes the comment. *)
and state_comment_end () =
  match Tokenizer_stream.peek t.stream with
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Data;
    emit_current_comment ()
  | Some '!' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Comment_end_bang
  | Some '-' ->
    (* "---": emit one '-' into data, stay in comment-end. *)
    Tokenizer_stream.advance t.stream;
    Buffer.add_char t.current_comment '-'
  | None -> ()
  | Some _ ->
    Buffer.add_string t.current_comment "--";
    t.state <- Tokenizer_state.Comment
(* Comment end bang state (13.2.5.52): "--!" seen. *)
and state_comment_end_bang () =
  match Tokenizer_stream.peek t.stream with
  | Some '-' ->
    Tokenizer_stream.advance t.stream;
    Buffer.add_string t.current_comment "--!";
    t.state <- Tokenizer_state.Comment_end_dash
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    error t "incorrectly-closed-comment";
    t.state <- Tokenizer_state.Data;
    emit_current_comment ()
  | None -> ()
  | Some _ ->
    Buffer.add_string t.current_comment "--!";
    t.state <- Tokenizer_state.Comment
(* DOCTYPE state (WHATWG 13.2.5.53): just after "<!DOCTYPE". *)
and state_doctype () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Before_doctype_name
  | Some '>' ->
    (* Reconsume '>' in the before-doctype-name state. *)
    t.state <- Tokenizer_state.Before_doctype_name
  | None -> ()
  | Some _ ->
    error t "missing-whitespace-before-doctype-name";
    t.state <- Tokenizer_state.Before_doctype_name
(* Before DOCTYPE name state (13.2.5.54): allocates the doctype token and
   its name buffer. The name is ASCII-lowercased. *)
and state_before_doctype_name () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') ->
    Tokenizer_stream.advance t.stream
  | Some '\x00' ->
    Tokenizer_stream.advance t.stream;
    error t "unexpected-null-character";
    start_new_doctype t;
    t.current_doctype_name <- Some (Buffer.create 8);
    Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD";
    t.state <- Tokenizer_state.Doctype_name
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    error t "missing-doctype-name";
    start_new_doctype t;
    (* A nameless doctype forces quirks mode. *)
    t.current_doctype_force_quirks <- true;
    t.state <- Tokenizer_state.Data;
    emit_current_doctype ()
  | None -> ()
  | Some c ->
    Tokenizer_stream.advance t.stream;
    check_control_char c;
    start_new_doctype t;
    t.current_doctype_name <- Some (Buffer.create 8);
    Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c);
    t.state <- Tokenizer_state.Doctype_name
(* DOCTYPE name state (13.2.5.55). Option.get is safe: the name buffer is
   always allocated before this state is entered. *)
and state_doctype_name () =
  match Tokenizer_stream.consume t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') ->
    t.state <- Tokenizer_state.After_doctype_name
  | Some '>' ->
    t.state <- Tokenizer_state.Data;
    emit_current_doctype ()
  | Some '\x00' ->
    error t "unexpected-null-character";
    Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD"
  | Some c ->
    check_control_char c;
    Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c)
  | None -> ()
(* After DOCTYPE name state (13.2.5.56): looks for the PUBLIC or SYSTEM
   keyword (ASCII case-insensitive). *)
and state_after_doctype_name () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') ->
    Tokenizer_stream.advance t.stream
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Data;
    emit_current_doctype ()
  | None -> ()
  | Some _ ->
    (* Don't check control char here - bogus_doctype will check when it consumes *)
    if Tokenizer_stream.matches_ci t.stream "PUBLIC" then begin
      ignore (Tokenizer_stream.consume_exact_ci t.stream "PUBLIC");
      t.state <- Tokenizer_state.After_doctype_public_keyword
    end else if Tokenizer_stream.matches_ci t.stream "SYSTEM" then begin
      ignore (Tokenizer_stream.consume_exact_ci t.stream "SYSTEM");
      t.state <- Tokenizer_state.After_doctype_system_keyword
    end else begin
      error t "invalid-character-sequence-after-doctype-name";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Bogus_doctype
    end
(* After DOCTYPE public keyword state (WHATWG 13.2.5.57): expects whitespace
   then a quoted public identifier. *)
and state_after_doctype_public_keyword () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Before_doctype_public_identifier
  | Some '"' ->
    Tokenizer_stream.advance t.stream;
    error t "missing-whitespace-after-doctype-public-keyword";
    t.current_doctype_public <- Some (Buffer.create 32);
    t.state <- Tokenizer_state.Doctype_public_identifier_double_quoted
  | Some '\'' ->
    Tokenizer_stream.advance t.stream;
    error t "missing-whitespace-after-doctype-public-keyword";
    t.current_doctype_public <- Some (Buffer.create 32);
    t.state <- Tokenizer_state.Doctype_public_identifier_single_quoted
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    error t "missing-doctype-public-identifier";
    t.current_doctype_force_quirks <- true;
    t.state <- Tokenizer_state.Data;
    emit_current_doctype ()
  | None -> ()
  | Some _ ->
    (* Don't check control char here - bogus_doctype will check when it consumes *)
    error t "missing-quote-before-doctype-public-identifier";
    t.current_doctype_force_quirks <- true;
    t.state <- Tokenizer_state.Bogus_doctype
(* Before DOCTYPE public identifier state (13.2.5.58). *)
and state_before_doctype_public_identifier () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') ->
    Tokenizer_stream.advance t.stream
  | Some '"' ->
    Tokenizer_stream.advance t.stream;
    t.current_doctype_public <- Some (Buffer.create 32);
    t.state <- Tokenizer_state.Doctype_public_identifier_double_quoted
  | Some '\'' ->
    Tokenizer_stream.advance t.stream;
    t.current_doctype_public <- Some (Buffer.create 32);
    t.state <- Tokenizer_state.Doctype_public_identifier_single_quoted
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    error t "missing-doctype-public-identifier";
    t.current_doctype_force_quirks <- true;
    t.state <- Tokenizer_state.Data;
    emit_current_doctype ()
  | None -> ()
  | Some _ ->
    error t "missing-quote-before-doctype-public-identifier";
    t.current_doctype_force_quirks <- true;
    t.state <- Tokenizer_state.Bogus_doctype
(* DOCTYPE public identifier (double-quoted) state (13.2.5.59). Option.get
   is safe: the public buffer is allocated before entering this state. *)
and state_doctype_public_identifier_double_quoted () =
  match Tokenizer_stream.consume t.stream with
  | Some '"' ->
    t.state <- Tokenizer_state.After_doctype_public_identifier
  | Some '\x00' ->
    error t "unexpected-null-character";
    Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD"
  | Some '>' ->
    error t "abrupt-doctype-public-identifier";
    t.current_doctype_force_quirks <- true;
    t.state <- Tokenizer_state.Data;
    emit_current_doctype ()
  | Some c ->
    check_control_char c;
    Buffer.add_char (Option.get t.current_doctype_public) c
  | None -> ()
(* DOCTYPE public identifier (single-quoted) state (13.2.5.60). *)
and state_doctype_public_identifier_single_quoted () =
  match Tokenizer_stream.consume t.stream with
  | Some '\'' ->
    t.state <- Tokenizer_state.After_doctype_public_identifier
  | Some '\x00' ->
    error t "unexpected-null-character";
    Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD"
  | Some '>' ->
    error t "abrupt-doctype-public-identifier";
    t.current_doctype_force_quirks <- true;
    t.state <- Tokenizer_state.Data;
    emit_current_doctype ()
  | Some c ->
    check_control_char c;
    Buffer.add_char (Option.get t.current_doctype_public) c
  | None -> ()
(* After DOCTYPE public identifier state (13.2.5.61): an optional system
   identifier may follow. *)
and state_after_doctype_public_identifier () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Between_doctype_public_and_system_identifiers
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Data;
    emit_current_doctype ()
  | Some '"' ->
    Tokenizer_stream.advance t.stream;
    error t "missing-whitespace-between-doctype-public-and-system-identifiers";
    t.current_doctype_system <- Some (Buffer.create 32);
    t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted
  | Some '\'' ->
    Tokenizer_stream.advance t.stream;
    error t "missing-whitespace-between-doctype-public-and-system-identifiers";
    t.current_doctype_system <- Some (Buffer.create 32);
    t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted
  | None -> ()
  | Some _ ->
    (* Don't check control char here - bogus_doctype will check when it consumes *)
    error t "missing-quote-before-doctype-system-identifier";
    t.current_doctype_force_quirks <- true;
    t.state <- Tokenizer_state.Bogus_doctype
(* Between DOCTYPE public and system identifiers state (13.2.5.62). *)
and state_between_doctype_public_and_system_identifiers () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') ->
    Tokenizer_stream.advance t.stream
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Data;
    emit_current_doctype ()
  | Some '"' ->
    Tokenizer_stream.advance t.stream;
    t.current_doctype_system <- Some (Buffer.create 32);
    t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted
  | Some '\'' ->
    Tokenizer_stream.advance t.stream;
    t.current_doctype_system <- Some (Buffer.create 32);
    t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted
  | None -> ()
  | Some _ ->
    (* Don't check control char here - bogus_doctype will check when it consumes *)
    error t "missing-quote-before-doctype-system-identifier";
    t.current_doctype_force_quirks <- true;
    t.state <- Tokenizer_state.Bogus_doctype
(* After DOCTYPE system keyword state (WHATWG 13.2.5.63). *)
and state_after_doctype_system_keyword () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Before_doctype_system_identifier
  | Some '"' ->
    Tokenizer_stream.advance t.stream;
    error t "missing-whitespace-after-doctype-system-keyword";
    t.current_doctype_system <- Some (Buffer.create 32);
    t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted
  | Some '\'' ->
    Tokenizer_stream.advance t.stream;
    error t "missing-whitespace-after-doctype-system-keyword";
    t.current_doctype_system <- Some (Buffer.create 32);
    t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    error t "missing-doctype-system-identifier";
    t.current_doctype_force_quirks <- true;
    t.state <- Tokenizer_state.Data;
    emit_current_doctype ()
  | None -> ()
  | Some _ ->
    (* Don't check control char here - bogus_doctype will check when it consumes *)
    error t "missing-quote-before-doctype-system-identifier";
    t.current_doctype_force_quirks <- true;
    t.state <- Tokenizer_state.Bogus_doctype
(* Before DOCTYPE system identifier state (13.2.5.64). *)
and state_before_doctype_system_identifier () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') ->
    Tokenizer_stream.advance t.stream
  | Some '"' ->
    Tokenizer_stream.advance t.stream;
    t.current_doctype_system <- Some (Buffer.create 32);
    t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted
  | Some '\'' ->
    Tokenizer_stream.advance t.stream;
    t.current_doctype_system <- Some (Buffer.create 32);
    t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    error t "missing-doctype-system-identifier";
    t.current_doctype_force_quirks <- true;
    t.state <- Tokenizer_state.Data;
    emit_current_doctype ()
  | None -> ()
  | Some _ ->
    (* Don't check control char here - bogus_doctype will check when it consumes *)
    error t "missing-quote-before-doctype-system-identifier";
    t.current_doctype_force_quirks <- true;
    t.state <- Tokenizer_state.Bogus_doctype
(* DOCTYPE system identifier (double-quoted) state (13.2.5.65). Option.get
   is safe: the system buffer is allocated before entering this state. *)
and state_doctype_system_identifier_double_quoted () =
  match Tokenizer_stream.consume t.stream with
  | Some '"' ->
    t.state <- Tokenizer_state.After_doctype_system_identifier
  | Some '\x00' ->
    error t "unexpected-null-character";
    Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD"
  | Some '>' ->
    error t "abrupt-doctype-system-identifier";
    t.current_doctype_force_quirks <- true;
    t.state <- Tokenizer_state.Data;
    emit_current_doctype ()
  | Some c ->
    check_control_char c;
    Buffer.add_char (Option.get t.current_doctype_system) c
  | None -> ()
(* DOCTYPE system identifier (single-quoted) state (13.2.5.66). *)
and state_doctype_system_identifier_single_quoted () =
  match Tokenizer_stream.consume t.stream with
  | Some '\'' ->
    t.state <- Tokenizer_state.After_doctype_system_identifier
  | Some '\x00' ->
    error t "unexpected-null-character";
    Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD"
  | Some '>' ->
    error t "abrupt-doctype-system-identifier";
    t.current_doctype_force_quirks <- true;
    t.state <- Tokenizer_state.Data;
    emit_current_doctype ()
  | Some c ->
    check_control_char c;
    Buffer.add_char (Option.get t.current_doctype_system) c
  | None -> ()
(* After DOCTYPE system identifier state (13.2.5.67). Note: the spec does
   NOT force quirks mode for trailing junk here — only an error. *)
and state_after_doctype_system_identifier () =
  match Tokenizer_stream.peek t.stream with
  | Some ('\t' | '\n' | '\x0C' | ' ') ->
    Tokenizer_stream.advance t.stream
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Data;
    emit_current_doctype ()
  | None -> ()
  | Some _ ->
    (* Don't check control char here - bogus_doctype will check when it consumes *)
    error t "unexpected-character-after-doctype-system-identifier";
    t.state <- Tokenizer_state.Bogus_doctype
(* Bogus DOCTYPE state (13.2.5.68): discard everything up to '>'. *)
and state_bogus_doctype () =
  match Tokenizer_stream.consume t.stream with
  | Some '>' ->
    t.state <- Tokenizer_state.Data;
    emit_current_doctype ()
  | Some '\x00' ->
    error t "unexpected-null-character"
  | Some c ->
    check_control_char c (* Check all chars in bogus doctype *)
  | None -> ()
(* CDATA section state (WHATWG 13.2.5.69): inside "<![CDATA[ ... ]]>". *)
and state_cdata_section () =
  match Tokenizer_stream.consume t.stream with
  | Some ']' ->
    t.state <- Tokenizer_state.Cdata_section_bracket
  | Some c ->
    (* CDATA section emits all characters as-is, including NUL, but still check for control chars *)
    emit_char_checked c
  | None -> ()
(* CDATA section bracket state (13.2.5.70): one ']' seen. *)
and state_cdata_section_bracket () =
  match Tokenizer_stream.peek t.stream with
  | Some ']' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Cdata_section_end
  | _ ->
    (* The ']' was data; emit it and reconsume in the CDATA section. *)
    emit_char t ']';
    t.state <- Tokenizer_state.Cdata_section
(* CDATA section end state (13.2.5.71): "]]" seen; '>' closes the section.
   Extra ']' characters each emit one ']' and stay here. *)
and state_cdata_section_end () =
  match Tokenizer_stream.peek t.stream with
  | Some ']' ->
    Tokenizer_stream.advance t.stream;
    emit_char t ']'
  | Some '>' ->
    Tokenizer_stream.advance t.stream;
    t.state <- Tokenizer_state.Data
  | _ ->
    emit_str t "]]";
    t.state <- Tokenizer_state.Cdata_section
(* Character reference state (WHATWG 13.2.5.72): just after '&'. The temp
   buffer collects the raw characters so they can be flushed verbatim to the
   return state (text or attribute value) if no reference matches. *)
and state_character_reference () =
  Buffer.clear t.temp_buffer;
  Buffer.add_char t.temp_buffer '&';
  match Tokenizer_stream.peek t.stream with
  | Some c when is_ascii_alnum c ->
    (* Alphanumeric: reconsume (no advance) in the named-reference state. *)
    t.state <- Tokenizer_state.Named_character_reference
  | Some '#' ->
    Tokenizer_stream.advance t.stream;
    Buffer.add_char t.temp_buffer '#';
    t.state <- Tokenizer_state.Numeric_character_reference
  | _ ->
    (* Bare '&': flush it as ordinary text/attr data and return. *)
    flush_code_points_consumed_as_char_ref t;
    t.state <- t.return_state
(* Named character reference state (WHATWG 13.2.5.73).
   Greedily consumes the alphanumeric run (plus an optional ';'), then finds
   the longest entity match, honouring two spec rules:
   - a match without ';' is only valid for "legacy" entities, and
   - in an attribute value, a legacy match followed by '=' or an
     alphanumeric is NOT treated as a reference (historical compat). *)
and state_named_character_reference () =
  (* Collect alphanumeric characters *)
  let rec collect () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_alnum c ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.temp_buffer c;
      collect ()
    | _ -> ()
  in
  collect ();
  let has_semicolon =
    match Tokenizer_stream.peek t.stream with
    | Some ';' -> Tokenizer_stream.advance t.stream; Buffer.add_char t.temp_buffer ';'; true
    | _ -> false
  in
  (* Try to match entity - buffer contains "&name" or "&name;" *)
  let buf_contents = Buffer.contents t.temp_buffer in
  let name_start = 1 in (* Skip '&' *)
  let name_end = String.length buf_contents - (if has_semicolon then 1 else 0) in
  let entity_name = String.sub buf_contents name_start (name_end - name_start) in
  (* Try progressively shorter matches *)
  (* Only match if:
     1. Full match with semicolon, OR
     2. Legacy entity (can be used without semicolon) *)
  let rec try_match len =
    if len <= 0 then None
    else
      let prefix = String.sub entity_name 0 len in
      let is_full = len = String.length entity_name in
      let would_have_semi = has_semicolon && is_full in
      (* Only use this match if it has semicolon or is a legacy entity *)
      if would_have_semi || Entities.is_legacy prefix then
        match Entities.lookup prefix with
        | Some decoded -> Some (decoded, len)
        | None -> try_match (len - 1)
      else
        try_match (len - 1)
  in
  match try_match (String.length entity_name) with
  | Some (decoded, matched_len) ->
    let full_match = matched_len = String.length entity_name in
    let ends_with_semi = has_semicolon && full_match in
    (* Check attribute context restrictions *)
    let in_attribute = match t.return_state with
      | Tokenizer_state.Attribute_value_double_quoted
      | Tokenizer_state.Attribute_value_single_quoted
      | Tokenizer_state.Attribute_value_unquoted -> true
      | _ -> false
    in
    (* The character immediately after the matched prefix: either still in
       the stream (full match, no ';') or inside entity_name (partial). *)
    let next_char =
      if full_match && not has_semicolon then
        Tokenizer_stream.peek t.stream
      else if not full_match then
        Some entity_name.[matched_len]
      else None
    in
    let blocked = in_attribute && not ends_with_semi &&
      match next_char with
      | Some '=' -> true
      | Some c when is_ascii_alnum c -> true
      | _ -> false
    in
    if blocked then begin
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state
    end else begin
      if not ends_with_semi then
        error t "missing-semicolon-after-character-reference";
      (* Replace temp buffer with the decoded text, then flush it to the
         return state (attr value or character tokens). *)
      Buffer.clear t.temp_buffer;
      Buffer.add_string t.temp_buffer decoded;
      flush_code_points_consumed_as_char_ref t;
      (* Emit unconsumed chars after partial match *)
      (* Note: a partial match in attribute context is always blocked above
         (entity_name is all-alnum), so emit_str is only reached for text. *)
      if not full_match then begin
        let unconsumed = String.sub entity_name matched_len (String.length entity_name - matched_len) in
        emit_str t unconsumed;
        (* If there was a semicolon in input but we didn't use the full match, emit the semicolon too *)
        if has_semicolon then
          emit_char t ';'
      end;
      t.state <- t.return_state
    end
  | None ->
    (* No match - check if we should report unknown-named-character-reference *)
    if String.length entity_name > 0 then begin
      (* If we have a semicolon, it's definitely an unknown named character reference *)
      if has_semicolon then
        error t "unknown-named-character-reference";
      (* Emit all the chars we consumed *)
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state
    end else begin
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state
    end
and state_ambiguous_ampersand () =
(* Ambiguous ampersand state: pass alphanumerics straight through
   (into the attribute value when the reference started inside one,
   otherwise as character tokens). A ';' here means the name was not
   a known reference; anything else just reconsumes in the return
   state. *)
match Tokenizer_stream.peek t.stream with
| Some ';' ->
  error t "unknown-named-character-reference";
  t.state <- t.return_state
| Some c when is_ascii_alnum c ->
  Tokenizer_stream.advance t.stream;
  let in_attr_value =
    match t.return_state with
    | Tokenizer_state.Attribute_value_double_quoted
    | Tokenizer_state.Attribute_value_single_quoted
    | Tokenizer_state.Attribute_value_unquoted -> true
    | _ -> false
  in
  if in_attr_value then Buffer.add_char t.current_attr_value c
  else emit_char t c
| Some _ | None ->
  t.state <- t.return_state
and state_numeric_character_reference () =
(* Numeric character reference state: reset the accumulated code
   point, then branch on an optional hex marker ('x' or 'X'), which
   is consumed and remembered for possible literal flushing. *)
t.char_ref_code <- 0;
(match Tokenizer_stream.peek t.stream with
 | Some ('x' as marker) | Some ('X' as marker) ->
   Tokenizer_stream.advance t.stream;
   Buffer.add_char t.temp_buffer marker;
   t.state <- Tokenizer_state.Hexadecimal_character_reference_start
 | Some _ | None ->
   t.state <- Tokenizer_state.Decimal_character_reference_start)
and state_hexadecimal_character_reference_start () =
(* "&#x" must be followed by at least one hex digit; otherwise flush
   the raw consumed characters and report the missing digits. *)
let has_hex_digit =
  match Tokenizer_stream.peek t.stream with
  | Some c -> is_ascii_hex c
  | None -> false
in
if has_hex_digit then
  t.state <- Tokenizer_state.Hexadecimal_character_reference
else begin
  error t "absence-of-digits-in-numeric-character-reference";
  flush_code_points_consumed_as_char_ref t;
  t.state <- t.return_state
end
and state_decimal_character_reference_start () =
(* "&#" must be followed by at least one decimal digit; otherwise
   flush the raw consumed characters and report the missing digits. *)
let has_digit =
  match Tokenizer_stream.peek t.stream with
  | Some c -> is_ascii_digit c
  | None -> false
in
if has_digit then
  t.state <- Tokenizer_state.Decimal_character_reference
else begin
  error t "absence-of-digits-in-numeric-character-reference";
  flush_code_points_consumed_as_char_ref t;
  t.state <- t.return_state
end
and state_hexadecimal_character_reference () =
(* Fold hex digits into char_ref_code.  The running value is clamped
   to one past U+10FFFF so the out-of-range case is still detected at
   the end without risking integer overflow on long digit runs. *)
let accumulate digit_value =
  Tokenizer_stream.advance t.stream;
  t.char_ref_code <- (t.char_ref_code * 16) + digit_value;
  if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
in
match Tokenizer_stream.peek t.stream with
| Some ('0' .. '9' as c) -> accumulate (Char.code c - Char.code '0')
| Some ('A' .. 'F' as c) -> accumulate (Char.code c - Char.code 'A' + 10)
| Some ('a' .. 'f' as c) -> accumulate (Char.code c - Char.code 'a' + 10)
| Some ';' ->
  Tokenizer_stream.advance t.stream;
  t.state <- Tokenizer_state.Numeric_character_reference_end
| Some _ | None ->
  (* Unterminated reference: error, then finish with what we have. *)
  error t "missing-semicolon-after-character-reference";
  t.state <- Tokenizer_state.Numeric_character_reference_end
and state_decimal_character_reference () =
(* Fold decimal digits into char_ref_code, with the same clamp past
   U+10FFFF used by the hexadecimal state to avoid overflow. *)
match Tokenizer_stream.peek t.stream with
| Some ('0' .. '9' as c) ->
  Tokenizer_stream.advance t.stream;
  let digit_value = Char.code c - Char.code '0' in
  t.char_ref_code <- (t.char_ref_code * 10) + digit_value;
  if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
| Some ';' ->
  Tokenizer_stream.advance t.stream;
  t.state <- Tokenizer_state.Numeric_character_reference_end
| Some _ | None ->
  (* Unterminated reference: error, then finish with what we have. *)
  error t "missing-semicolon-after-character-reference";
  t.state <- Tokenizer_state.Numeric_character_reference_end
and state_numeric_character_reference_end () =
(* Numeric character reference end state (WHATWG §13.2.5.80):
   validate the accumulated code point, map the spec's error classes
   to their required replacements, then flush the resulting UTF-8
   text via the temp buffer. *)
let code = t.char_ref_code in
(* U+FFFD REPLACEMENT CHARACTER, pre-encoded as UTF-8. *)
let replacement_char = "\xEF\xBF\xBD" in
let result =
if code = 0 then begin
error t "null-character-reference";
replacement_char
end else if code > 0x10FFFF then begin
(* Includes values clamped to 0x10FFFF + 1 by the digit states. *)
error t (Printf.sprintf "character-reference-outside-unicode-range:%x" code);
replacement_char
end else if code >= 0xD800 && code <= 0xDFFF then begin
error t (Printf.sprintf "surrogate-character-reference:%04x" code);
replacement_char
end else if (code >= 0xFDD0 && code <= 0xFDEF) ||
(* Noncharacters end in 0xFFFE or 0xFFFF in each plane (0-16).
O(1) bitwise check instead of O(n) list membership. *)
(let low16 = code land 0xFFFF in low16 = 0xFFFE || low16 = 0xFFFF) then begin
(* Noncharacters are an error but are emitted as-is, per spec. *)
error t (Printf.sprintf "noncharacter-character-reference:%05x" code);
Entities.Numeric_ref.codepoint_to_utf8 code
end else if (code >= 0x01 && code <= 0x08) || code = 0x0B ||
(code >= 0x0D && code <= 0x1F) ||
(code >= 0x7F && code <= 0x9F) then begin
(* Controls that are not ASCII whitespace, plus 0x0D explicitly;
   TAB (0x09), LF (0x0A), and FF (0x0C) are excluded above. *)
error t (Printf.sprintf "control-character-reference:%04x" code);
(* Apply Windows-1252 replacement table for 0x80-0x9F *)
match Entities.Numeric_ref.find_replacement code with
| Some replacement -> Entities.Numeric_ref.codepoint_to_utf8 replacement
| None -> Entities.Numeric_ref.codepoint_to_utf8 code
end else
Entities.Numeric_ref.codepoint_to_utf8 code
in
(* Reuse the temp-buffer flush path so the text lands in the attribute
   value or the character stream according to the return state. *)
Buffer.clear t.temp_buffer;
Buffer.add_string t.temp_buffer result;
flush_code_points_consumed_as_char_ref t;
t.state <- t.return_state
in
process_state ()
(** [get_errors t] returns the parse errors recorded so far, oldest
    first ([t.errors] accumulates in reverse order). *)
let get_errors t = List.rev t.errors
(** [set_state t state] forces the tokenizer into [state], overriding
    its current state. *)
let set_state t state = t.state <- state
(** [set_last_start_tag t name] records [name] as the most recently
    emitted start tag name (used when matching end tags — see
    [t.last_start_tag]). *)
let set_last_start_tag t name = t.last_start_tag <- name