(* HTML5 Tree Builder *)
module Dom = Dom
module Token = Tokenizer_token
module State = Tokenizer_state
type fragment_context = {
tag_name : string;
namespace : string option;
}
type formatting_entry =
| Marker
| Entry of {
name : string;
attrs : (string * string) list;
node : Dom.node;
}
type parse_error = {
code : Parse_error_code.t;
line : int;
column : int;
}
type t = {
mutable document : Dom.node;
mutable mode : Parser_insertion_mode.t;
mutable original_mode : Parser_insertion_mode.t option;
mutable open_elements : Dom.node list;
mutable active_formatting : formatting_entry list;
mutable head_element : Dom.node option;
mutable form_element : Dom.node option;
mutable frameset_ok : bool;
mutable ignore_lf : bool;
mutable foster_parenting : bool;
mutable pending_table_chars : string list;
mutable template_modes : Parser_insertion_mode.t list;
mutable quirks_mode : Dom.quirks_mode;
mutable errors : parse_error list;
collect_errors : bool;
fragment_context : fragment_context option;
mutable fragment_context_element : Dom.node option;
iframe_srcdoc : bool;
mutable current_line : int;
mutable current_column : int;
}
let create ?(collect_errors=false) ?fragment_context ?(iframe_srcdoc=false) () =
let is_fragment = fragment_context <> None in
let doc = if is_fragment then Dom.create_document_fragment () else Dom.create_document () in
let t = {
document = doc;
mode = Parser_insertion_mode.Initial;
original_mode = None;
open_elements = [];
active_formatting = [];
head_element = None;
form_element = None;
frameset_ok = true;
ignore_lf = false;
foster_parenting = false;
pending_table_chars = [];
template_modes = [];
quirks_mode = Dom.No_quirks;
errors = [];
collect_errors;
fragment_context;
fragment_context_element = None;
iframe_srcdoc;
current_line = 1;
current_column = 1;
} in
(* Initialize fragment parsing *)
(match fragment_context with
| Some ctx ->
let name = String.lowercase_ascii ctx.tag_name in
let ns = ctx.namespace in
(* Create html root *)
let root = Dom.create_element "html" () in
Dom.append_child doc root;
t.open_elements <- [root];
(* For foreign content contexts, create context element *)
(match ns with
| Some namespace when namespace <> "html" ->
let context_elem = Dom.create_element ctx.tag_name ~namespace:ns () in
Dom.append_child root context_elem;
t.open_elements <- [context_elem; root];
t.fragment_context_element <- Some context_elem
| _ -> ());
(* Set initial mode based on context *)
t.mode <- (
if name = "html" then Parser_insertion_mode.Before_head
else if Parser_constants.is_table_section_element name && (ns = None || ns = Some "html") then
Parser_insertion_mode.In_table_body
else if name = "tr" && (ns = None || ns = Some "html") then
Parser_insertion_mode.In_row
else if Parser_constants.is_table_cell_element name && (ns = None || ns = Some "html") then
Parser_insertion_mode.In_cell
else if name = "caption" && (ns = None || ns = Some "html") then
Parser_insertion_mode.In_caption
else if name = "colgroup" && (ns = None || ns = Some "html") then
Parser_insertion_mode.In_column_group
else if name = "table" && (ns = None || ns = Some "html") then
Parser_insertion_mode.In_table
else if name = "template" && (ns = None || ns = Some "html") then begin
t.template_modes <- [Parser_insertion_mode.In_template];
Parser_insertion_mode.In_template
end
else
Parser_insertion_mode.In_body
);
t.frameset_ok <- false
| None -> ());
t
(* Position tracking for error reporting *)
let set_position t ~line ~column =
t.current_line <- line;
t.current_column <- column
(* Error handling *)
let parse_error t code =
if t.collect_errors then
t.errors <- { code = Parse_error_code.of_string code; line = t.current_line; column = t.current_column } :: t.errors
(* Stack helpers *)
let current_node t =
match t.open_elements with
| [] -> None
| x :: _ -> Some x
let adjusted_current_node t =
match t.fragment_context, t.open_elements with
| Some ctx, [_] ->
(* Fragment case: use context element info *)
Some (Dom.create_element ctx.tag_name ~namespace:ctx.namespace ())
| _, x :: _ -> Some x
| _, [] -> None
let is_in_html_namespace node =
node.Dom.namespace = None || node.Dom.namespace = Some "html"
(* Namespace-aware check for "special" elements per WHATWG spec *)
let is_special_element node =
let name = String.lowercase_ascii node.Dom.name in
match node.Dom.namespace with
| None | Some "html" -> Parser_constants.is_special name
| Some "mathml" -> List.mem name ["mi"; "mo"; "mn"; "ms"; "mtext"; "annotation-xml"]
| Some "svg" -> List.mem name ["foreignobject"; "desc"; "title"]
| _ -> false
let adjusted_current_node_in_html_namespace t =
match adjusted_current_node t with
| Some node -> is_in_html_namespace node
| None -> true
(* Insertion helpers *)
let appropriate_insertion_place t =
match current_node t with
| None -> (t.document, None)
| Some target ->
if t.foster_parenting && Parser_constants.is_foster_parenting_element target.Dom.name then begin
(* Foster parenting per WHATWG spec *)
(* Step 1: Find last (most recent) template and table in stack *)
(* Note: index 0 = top of stack = most recently added *)
let last_template_idx = ref None in
let last_table_idx = ref None in
List.iteri (fun i n ->
(* Take first match (most recent = lowest index) *)
if n.Dom.name = "template" && !last_template_idx = None then last_template_idx := Some i;
if n.Dom.name = "table" && !last_table_idx = None then last_table_idx := Some i
) t.open_elements;
(* Step 2-3: If last template is more recent than last table (lower index = more recent) *)
match !last_template_idx, !last_table_idx with
| Some ti, None ->
(* No table, use template content *)
let template = List.nth t.open_elements ti in
(match template.Dom.template_content with
| Some tc -> (tc, None)
| None -> (template, None))
| Some ti, Some tbi when ti < tbi ->
(* Template is more recent than table, use template content *)
let template = List.nth t.open_elements ti in
(match template.Dom.template_content with
| Some tc -> (tc, None)
| None -> (template, None))
| _, Some tbi ->
(* Use table's parent as foster parent *)
let table = List.nth t.open_elements tbi in
(match table.Dom.parent with
| Some parent -> (parent, Some table)
| None ->
(* Step 6: element above table in stack (index + 1 since 0 is top) *)
if tbi + 1 < List.length t.open_elements then
(List.nth t.open_elements (tbi + 1), None)
else
(t.document, None))
| None, None ->
(* No table or template, use document *)
(t.document, None)
end else begin
(* If target is a template, insert into its content document fragment *)
match target.Dom.template_content with
| Some tc -> (tc, None)
| None -> (target, None)
end
let insert_element t name ?(namespace=None) ?(push=false) attrs =
let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in
let node = Dom.create_element name ~namespace ~attrs ~location () in
let (parent, before) = appropriate_insertion_place t in
(match before with
| None -> Dom.append_child parent node
| Some ref -> Dom.insert_before parent node ref);
if push then t.open_elements <- node :: t.open_elements;
node
let insert_element_for_token t (tag : Token.tag) =
insert_element t tag.name ~push:true tag.attrs
let insert_foreign_element t (tag : Token.tag) namespace =
let attrs =
if namespace = Some "svg" then
Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs tag.attrs)
else
Parser_constants.adjust_foreign_attrs tag.attrs
in
let name =
if namespace = Some "svg" then Parser_constants.adjust_svg_tag_name tag.name
else tag.name
in
let node = insert_element t name ~namespace attrs in
t.open_elements <- node :: t.open_elements;
node
let insert_character t data =
if t.ignore_lf && String.length data > 0 && data.[0] = '\n' then begin
t.ignore_lf <- false;
if String.length data > 1 then begin
let rest = String.sub data 1 (String.length data - 1) in
let (parent, before) = appropriate_insertion_place t in
Dom.insert_text_at parent rest before
end
end else begin
t.ignore_lf <- false;
let (parent, before) = appropriate_insertion_place t in
Dom.insert_text_at parent data before
end
let insert_comment t data =
let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in
let node = Dom.create_comment ~location data in
let (parent, _) = appropriate_insertion_place t in
Dom.append_child parent node
let insert_comment_to_document t data =
let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in
let node = Dom.create_comment ~location data in
Dom.append_child t.document node
(* Stack manipulation *)
let pop_current t =
match t.open_elements with
| [] -> ()
| _ :: rest -> t.open_elements <- rest
let pop_until t pred =
let rec loop () =
match t.open_elements with
| [] -> ()
| x :: rest ->
t.open_elements <- rest;
if not (pred x) then loop ()
in
loop ()
let pop_until_tag t name =
pop_until t (fun n -> n.Dom.name = name)
(* Pop until HTML namespace element with given name *)
let pop_until_html_tag t name =
pop_until t (fun n -> n.Dom.name = name && is_in_html_namespace n)
let pop_until_one_of t names =
pop_until t (fun n -> List.mem n.Dom.name names)
(* Pop until HTML namespace element with one of given names *)
let pop_until_html_one_of t names =
pop_until t (fun n -> List.mem n.Dom.name names && is_in_html_namespace n)
(* Check if element is an HTML integration point *)
let is_html_integration_point node =
(* SVG foreignObject, desc, and title are always HTML integration points *)
if node.Dom.namespace = Some "svg" &&
Parser_constants.is_svg_html_integration node.Dom.name then true
(* annotation-xml is an HTML integration point only with specific encoding values *)
else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
match List.assoc_opt "encoding" node.Dom.attrs with
| Some enc ->
let enc_lower = String.lowercase_ascii enc in
enc_lower = "text/html" || enc_lower = "application/xhtml+xml"
| None -> false
else false
(* Check if element is a MathML text integration point *)
let is_mathml_text_integration_point node =
node.Dom.namespace = Some "mathml" &&
Parser_constants.is_mathml_text_integration node.Dom.name
(* Scope checks - integration points also terminate scope (except for table scope) *)
(* Per WHATWG spec, scope checks only consider HTML namespace elements for the target names *)
let has_element_in_scope_impl t names exclude_list ~check_integration_points =
let rec check = function
| [] -> false
| n :: rest ->
(* Target elements must be in HTML namespace *)
if is_in_html_namespace n && List.mem n.Dom.name names then true
else if is_in_html_namespace n && List.mem n.Dom.name exclude_list then false
(* Integration points terminate scope (unless we're checking table scope) *)
else if check_integration_points && (is_html_integration_point n || is_mathml_text_integration_point n) then false
else check rest
in
check t.open_elements
let has_element_in_scope t name =
has_element_in_scope_impl t [name] Parser_constants.default_scope ~check_integration_points:true
let has_element_in_button_scope t name =
has_element_in_scope_impl t [name] Parser_constants.button_scope ~check_integration_points:true
let has_element_in_list_item_scope t name =
has_element_in_scope_impl t [name] Parser_constants.list_item_scope ~check_integration_points:true
let has_element_in_table_scope t name =
has_element_in_scope_impl t [name] Parser_constants.table_scope ~check_integration_points:false
let has_element_in_select_scope t name =
let rec check = function
| [] -> false
| n :: rest ->
if n.Dom.name = name then true
else if not (Parser_constants.is_select_scope_exclude n.Dom.name) then false
else check rest
in
check t.open_elements
(* Implied end tags *)
let generate_implied_end_tags t ?except () =
let rec loop () =
match current_node t with
| Some n when Parser_constants.is_implied_end_tag n.Dom.name ->
(match except with
| Some ex when n.Dom.name = ex -> ()
| _ -> pop_current t; loop ())
| _ -> ()
in
loop ()
let generate_all_implied_end_tags t =
let rec loop () =
match current_node t with
| Some n when Parser_constants.is_thoroughly_implied_end_tag n.Dom.name ->
pop_current t; loop ()
| _ -> ()
in
loop ()
(* Active formatting elements *)
let push_formatting_marker t =
t.active_formatting <- Marker :: t.active_formatting
let push_formatting_element t node name attrs =
(* Noah's Ark: remove earlier identical elements (up to 3) *)
let rec count_and_remove same acc = function
| [] -> List.rev acc
| Marker :: rest -> List.rev acc @ (Marker :: rest)
| Entry e :: rest when e.name = name && e.attrs = attrs ->
if same >= 2 then
count_and_remove same acc rest (* Remove this one *)
else
count_and_remove (same + 1) (Entry e :: acc) rest
| x :: rest -> count_and_remove same (x :: acc) rest
in
t.active_formatting <- count_and_remove 0 [] t.active_formatting;
t.active_formatting <- Entry { name; attrs; node } :: t.active_formatting
let clear_active_formatting_to_marker t =
let rec loop = function
| [] -> []
| Marker :: rest -> rest
| _ :: rest -> loop rest
in
t.active_formatting <- loop t.active_formatting
let reconstruct_active_formatting t =
let rec find_to_reconstruct acc = function
| [] -> acc
| Marker :: _ -> acc
| Entry e :: rest ->
if List.exists (fun n -> n == e.node) t.open_elements then acc
else find_to_reconstruct (Entry e :: acc) rest
in
let to_reconstruct = find_to_reconstruct [] t.active_formatting in
List.iter (fun entry ->
match entry with
| Entry e ->
let node = insert_element t e.name e.attrs in
t.open_elements <- node :: t.open_elements;
(* Update the entry to point to new node *)
t.active_formatting <- List.map (fun x ->
if x == entry then Entry { e with node }
else x
) t.active_formatting
| Marker -> ()
) to_reconstruct
(* Adoption agency algorithm - follows WHATWG spec *)
let adoption_agency t tag_name =
(* Step 1: If current node is subject and not in active formatting list, just pop *)
(match current_node t with
| Some n when n.Dom.name = tag_name ->
let in_active = List.exists (function
| Entry e -> e.name = tag_name
| Marker -> false
) t.active_formatting in
if not in_active then begin
pop_current t;
() (* Return early - this case is handled *)
end
| _ -> ());
(* Step 2: Outer loop *)
let outer_loop_counter = ref 0 in
let done_flag = ref false in
while !outer_loop_counter < 8 && not !done_flag do
incr outer_loop_counter;
(* Step 3: Find formatting element in active formatting list *)
let rec find_formatting_index idx = function
| [] -> None
| Marker :: _ -> None
| Entry e :: rest ->
if e.name = tag_name then Some (idx, e.node, e.attrs)
else find_formatting_index (idx + 1) rest
in
let formatting_entry = find_formatting_index 0 t.active_formatting in
match formatting_entry with
| None ->
(* No formatting element found - done *)
done_flag := true
| Some (fmt_idx, fmt_node, fmt_attrs) ->
(* Step 4: Check if formatting element is in open elements *)
if not (List.exists (fun n -> n == fmt_node) t.open_elements) then begin
parse_error t "adoption-agency-1.2";
t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting;
done_flag := true
end
(* Step 5: Check if formatting element is in scope *)
else if not (has_element_in_scope t tag_name) then begin
parse_error t "adoption-agency-1.3";
done_flag := true
end else begin
(* Step 6: Parse error if not current node *)
(match current_node t with
| Some n when n != fmt_node -> parse_error t "adoption-agency-1.3"
| _ -> ());
(* Step 7: Find furthest block - first special element BELOW formatting element *)
(* open_elements is [current(top)...html(bottom)], formatting element is somewhere in the middle *)
(* We need the first special element going from formatting element toward current *)
(* This is the "topmost" (closest to formatting element) special element that is "lower" (closer to current) *)
let fmt_stack_idx = ref (-1) in
List.iteri (fun i n -> if n == fmt_node then fmt_stack_idx := i) t.open_elements;
let furthest_block =
if !fmt_stack_idx <= 0 then None
else begin
(* Look from fmt_stack_idx-1 down to 0, find first special element *)
let rec find_from_idx idx =
if idx < 0 then None
else
let n = List.nth t.open_elements idx in
if is_special_element n then Some n
else find_from_idx (idx - 1)
in
find_from_idx (!fmt_stack_idx - 1)
end
in
match furthest_block with
| None ->
(* Step 8: No furthest block - pop elements including formatting element *)
pop_until t (fun n -> n == fmt_node);
t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting;
done_flag := true
| Some fb ->
(* Step 9: Let common ancestor be element immediately above formatting element *)
let rec find_common_ancestor = function
| [] -> None
| n :: rest when n == fmt_node ->
(match rest with x :: _ -> Some x | [] -> None)
| _ :: rest -> find_common_ancestor rest
in
let common_ancestor = find_common_ancestor t.open_elements in
(* Step 10: Bookmark starts after formatting element *)
let bookmark = ref (fmt_idx + 1) in
(* Step 11: Let last_node = furthest block *)
let last_node = ref fb in
(* Step 12: Inner loop *)
(* The inner loop processes elements between furthest_block and formatting_element,
removing non-formatting elements and reparenting formatting elements *)
let inner_loop_counter = ref 0 in
(* Get index of furthest block in open elements *)
let fb_idx = ref 0 in
List.iteri (fun i n -> if n == fb then fb_idx := i) t.open_elements;
(* Start from element after furthest block (toward formatting element) *)
let node_idx = ref (!fb_idx + 1) in
while !node_idx < List.length t.open_elements &&
(List.nth t.open_elements !node_idx) != fmt_node do
incr inner_loop_counter;
let current_node = List.nth t.open_elements !node_idx in
(* Step 12.3: Find node in active formatting list *)
let rec find_node_in_formatting idx = function
| [] -> None
| Entry e :: _rest when e.node == current_node -> Some idx
| _ :: rest -> find_node_in_formatting (idx + 1) rest
in
let node_fmt_idx = find_node_in_formatting 0 t.active_formatting in
(* Step 12.4: If inner loop counter > 3 and node in active formatting, remove it *)
let node_fmt_idx =
match node_fmt_idx with
| Some idx when !inner_loop_counter > 3 ->
t.active_formatting <- List.filteri (fun i _ -> i <> idx) t.active_formatting;
if idx < !bookmark then decr bookmark;
None
| x -> x
in
(* Step 12.5: If node not in active formatting, remove from stack and continue *)
match node_fmt_idx with
| None ->
(* Remove from stack - this shifts indices *)
t.open_elements <- List.filteri (fun i _ -> i <> !node_idx) t.open_elements
(* Don't increment node_idx since we removed an element *)
| Some af_idx ->
(* Step 12.6: Create new element for node *)
let (node_name, node_attrs) = match List.nth t.active_formatting af_idx with
| Entry e -> (e.name, e.attrs)
| Marker -> failwith "unexpected marker"
in
let new_node_elem = Dom.create_element node_name ~attrs:node_attrs () in
(* Update active formatting with new node *)
t.active_formatting <- List.mapi (fun i entry ->
if i = af_idx then Entry { name = node_name; node = new_node_elem; attrs = node_attrs }
else entry
) t.active_formatting;
(* Replace node in open elements *)
t.open_elements <- List.mapi (fun i n ->
if i = !node_idx then new_node_elem else n
) t.open_elements;
(* Step 12.7: If last_node is furthest block, update bookmark *)
if !last_node == fb then
bookmark := af_idx + 1;
(* Step 12.8: Reparent last_node to new node *)
(match !last_node.Dom.parent with
| Some p -> Dom.remove_child p !last_node
| None -> ());
Dom.append_child new_node_elem !last_node;
(* Step 12.9: Let last_node = new node *)
last_node := new_node_elem;
(* Move to next element *)
incr node_idx
done;
(* Step 13: Insert last_node into common ancestor *)
(match common_ancestor with
| Some ca ->
(match !last_node.Dom.parent with
| Some p -> Dom.remove_child p !last_node
| None -> ());
(* Check if we need foster parenting *)
if t.foster_parenting && Parser_constants.is_foster_parenting_element ca.Dom.name then begin
(* Find table and insert before it *)
let rec find_table = function
| [] -> None
| n :: rest when n.Dom.name = "table" -> Some (n, rest)
| _ :: rest -> find_table rest
in
match find_table t.open_elements with
| Some (table, _) ->
(match table.Dom.parent with
| Some parent -> Dom.insert_before parent !last_node table
| None -> Dom.append_child ca !last_node)
| None -> Dom.append_child ca !last_node
end else begin
(* If common ancestor is template, insert into its content *)
match ca.Dom.template_content with
| Some tc -> Dom.append_child tc !last_node
| None -> Dom.append_child ca !last_node
end
| None -> ());
(* Step 14: Create new formatting element *)
let new_formatting = Dom.create_element tag_name ~attrs:fmt_attrs () in
(* Step 15: Move children of furthest block to new formatting element *)
let fb_children = fb.Dom.children in
List.iter (fun child ->
Dom.remove_child fb child;
Dom.append_child new_formatting child
) fb_children;
(* Step 16: Append new formatting element to furthest block *)
Dom.append_child fb new_formatting;
(* Step 17: Remove old from active formatting, insert new at bookmark *)
let new_entry = Entry { name = tag_name; node = new_formatting; attrs = fmt_attrs } in
t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting;
(* Adjust bookmark since we removed an element *)
let adjusted_bookmark = if fmt_idx < !bookmark then !bookmark - 1 else !bookmark in
let rec insert_at_bookmark idx acc = function
| [] -> List.rev (new_entry :: acc)
| x :: rest when idx = adjusted_bookmark ->
List.rev_append acc (new_entry :: x :: rest)
| x :: rest -> insert_at_bookmark (idx + 1) (x :: acc) rest
in
t.active_formatting <- insert_at_bookmark 0 [] t.active_formatting;
(* Step 18: Remove formatting element from open elements, insert new after furthest block *)
(* "After" in stack terms means new_formatting should be between fb and current node *)
(* In our list orientation (current at index 0), this means new_formatting at lower index than fb *)
t.open_elements <- List.filter (fun n -> n != fmt_node) t.open_elements;
(* Find fb and insert new_formatting before it (lower index = closer to current) *)
let rec insert_before acc = function
| [] -> List.rev (new_formatting :: acc)
| n :: rest when n == fb ->
(* Insert new_formatting before fb: acc reversed, then new_formatting, then fb, then rest *)
List.rev_append acc (new_formatting :: n :: rest)
| n :: rest -> insert_before (n :: acc) rest
in
t.open_elements <- insert_before [] t.open_elements
(* Continue outer loop *)
end
done
(* Close p element *)
let close_p_element t =
generate_implied_end_tags t ~except:"p" ();
(match current_node t with
| Some n when n.Dom.name <> "p" -> parse_error t "end-tag-p-implied-but-open-elements"
| _ -> ());
pop_until_tag t "p"
(* Reset insertion mode *)
let reset_insertion_mode t =
let rec check_node last = function
| [] -> t.mode <- Parser_insertion_mode.In_body
| node :: rest ->
let is_last = rest = [] in
let node_to_check =
if is_last then
match t.fragment_context with
| Some ctx -> Dom.create_element ctx.tag_name ~namespace:ctx.namespace ()
| None -> node
else node
in
let name = node_to_check.Dom.name in
if name = "select" then begin
if not is_last then begin
let rec find_table_or_template = function
| [] -> ()
| n :: rest ->
if n.Dom.name = "template" then t.mode <- Parser_insertion_mode.In_select
else if n.Dom.name = "table" then t.mode <- Parser_insertion_mode.In_select_in_table
else find_table_or_template rest
in
find_table_or_template rest
end;
if t.mode <> Parser_insertion_mode.In_select_in_table then
t.mode <- Parser_insertion_mode.In_select
end else if Parser_constants.is_table_cell_element name && not is_last then
t.mode <- Parser_insertion_mode.In_cell
else if name = "tr" then
t.mode <- Parser_insertion_mode.In_row
else if Parser_constants.is_table_section_element name then
t.mode <- Parser_insertion_mode.In_table_body
else if name = "caption" then
t.mode <- Parser_insertion_mode.In_caption
else if name = "colgroup" then
t.mode <- Parser_insertion_mode.In_column_group
else if name = "table" then
t.mode <- Parser_insertion_mode.In_table
else if name = "template" then
t.mode <- (match t.template_modes with m :: _ -> m | [] -> Parser_insertion_mode.In_template)
else if name = "head" && not is_last then
t.mode <- Parser_insertion_mode.In_head
else if name = "body" then
t.mode <- Parser_insertion_mode.In_body
else if name = "frameset" then
t.mode <- Parser_insertion_mode.In_frameset
else if name = "html" then
t.mode <- (if t.head_element = None then Parser_insertion_mode.Before_head else Parser_insertion_mode.After_head)
else if is_last then
t.mode <- Parser_insertion_mode.In_body
else
check_node last rest
in
check_node false t.open_elements
let is_whitespace s =
let ws = [' '; '\t'; '\n'; '\x0C'; '\r'] in
String.for_all (fun c -> List.mem c ws) s
(* Mode handlers *)
let rec process_initial t token =
match token with
| Token.Character data when is_whitespace data -> ()
| Token.Comment data -> insert_comment_to_document t data
| Token.Doctype dt ->
let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in
let node = Dom.create_doctype ?name:dt.name ?public_id:dt.public_id ?system_id:dt.system_id ~location () in
Dom.append_child t.document node;
(* Quirks mode detection *)
if dt.force_quirks then
t.quirks_mode <- Dom.Quirks
else if dt.name <> Some "html" then
t.quirks_mode <- Dom.Quirks
else begin
let pub = Option.map String.lowercase_ascii dt.public_id in
let sys = Option.map String.lowercase_ascii dt.system_id in
let is_quirky =
(match pub with
| Some p -> List.mem p Parser_constants.quirky_public_matches ||
List.exists (fun prefix -> String.length p >= String.length prefix &&
String.sub p 0 (String.length prefix) = prefix) Parser_constants.quirky_public_prefixes
| None -> false) ||
(match sys with
| Some s -> List.mem s Parser_constants.quirky_system_matches
| None -> false)
in
if is_quirky then t.quirks_mode <- Dom.Quirks
else begin
let is_limited_quirky =
match pub with
| Some p -> List.exists (fun prefix -> String.length p >= String.length prefix &&
String.sub p 0 (String.length prefix) = prefix)
Parser_constants.limited_quirky_public_prefixes
| None -> false
in
if is_limited_quirky then t.quirks_mode <- Dom.Limited_quirks
end
end;
t.mode <- Parser_insertion_mode.Before_html
| _ ->
parse_error t "expected-doctype-but-got-other";
t.quirks_mode <- Dom.Quirks;
t.mode <- Parser_insertion_mode.Before_html;
process_token t token
and process_before_html t token =
match token with
| Token.Doctype _ -> parse_error t "unexpected-doctype"
| Token.Comment data -> insert_comment_to_document t data
| Token.Character data when is_whitespace data -> ()
| Token.Tag { kind = Token.Start; name = "html"; attrs; _ } ->
let html = insert_element t "html" attrs in
t.open_elements <- [html];
t.mode <- Parser_insertion_mode.Before_head
| Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] ->
let html = insert_element t "html" [] in
t.open_elements <- [html];
t.mode <- Parser_insertion_mode.Before_head;
process_token t token
| Token.Tag { kind = Token.End; name; _ } ->
parse_error t ("unexpected-end-tag:" ^ name)
| _ ->
let html = insert_element t "html" [] in
t.open_elements <- [html];
t.mode <- Parser_insertion_mode.Before_head;
process_token t token
and process_before_head t token =
match token with
| Token.Character data when is_whitespace data -> ()
| Token.Comment data -> insert_comment t data
| Token.Doctype _ -> parse_error t "unexpected-doctype"
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
process_in_body t token
| Token.Tag { kind = Token.Start; name = "head"; attrs; _ } ->
let head = insert_element t "head" attrs in
t.open_elements <- head :: t.open_elements;
t.head_element <- Some head;
t.mode <- Parser_insertion_mode.In_head
| Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] ->
let head = insert_element t "head" [] in
t.open_elements <- head :: t.open_elements;
t.head_element <- Some head;
t.mode <- Parser_insertion_mode.In_head;
process_token t token
| Token.Tag { kind = Token.End; name; _ } ->
parse_error t ("unexpected-end-tag:" ^ name)
| _ ->
let head = insert_element t "head" [] in
t.open_elements <- head :: t.open_elements;
t.head_element <- Some head;
t.mode <- Parser_insertion_mode.In_head;
process_token t token
and process_in_head t token =
match token with
| Token.Character data when is_whitespace data ->
insert_character t data
| Token.Character data ->
(* Extract leading whitespace *)
let rec count_leading_ws i =
if i >= String.length data then i
else match data.[i] with
| '\t' | '\n' | '\x0C' | '\r' | ' ' -> count_leading_ws (i + 1)
| _ -> i
in
let ws_count = count_leading_ws 0 in
let leading_ws = String.sub data 0 ws_count in
let remaining = String.sub data ws_count (String.length data - ws_count) in
(* If there's leading whitespace and current element has children, insert it *)
if leading_ws <> "" then
(match current_node t with
| Some n when n.Dom.children <> [] -> insert_character t leading_ws
| _ -> ());
pop_current t;
t.mode <- Parser_insertion_mode.After_head;
process_token t (Token.Character remaining)
| Token.Comment data ->
insert_comment t data
| Token.Doctype _ ->
parse_error t "unexpected-doctype"
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
process_in_body t token
| Token.Tag { kind = Token.Start; name; attrs; _ }
when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"] ->
ignore (insert_element t name attrs)
| Token.Tag { kind = Token.Start; name = "title"; attrs; self_closing } ->
ignore (insert_element_for_token t { kind = Token.Start; name = "title"; attrs; self_closing });
t.original_mode <- Some t.mode;
t.mode <- Parser_insertion_mode.Text
| Token.Tag { kind = Token.Start; name; attrs; self_closing }
when List.mem name ["noframes"; "style"] ->
ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing });
t.original_mode <- Some t.mode;
t.mode <- Parser_insertion_mode.Text
| Token.Tag { kind = Token.Start; name = "noscript"; attrs; self_closing } ->
(* Scripting is disabled: parse noscript content as HTML *)
ignore (insert_element_for_token t { kind = Token.Start; name = "noscript"; attrs; self_closing });
t.mode <- Parser_insertion_mode.In_head_noscript
| Token.Tag { kind = Token.Start; name = "script"; attrs; self_closing } ->
ignore (insert_element_for_token t { kind = Token.Start; name = "script"; attrs; self_closing });
t.original_mode <- Some t.mode;
t.mode <- Parser_insertion_mode.Text
| Token.Tag { kind = Token.End; name = "head"; _ } ->
pop_current t;
t.mode <- Parser_insertion_mode.After_head
| Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] ->
pop_current t;
t.mode <- Parser_insertion_mode.After_head;
process_token t token
| Token.Tag { kind = Token.Start; name = "template"; attrs; _ } ->
let node = Dom.create_template ~attrs () in
let (parent, _) = appropriate_insertion_place t in
Dom.append_child parent node;
t.open_elements <- node :: t.open_elements;
push_formatting_marker t;
t.frameset_ok <- false;
t.mode <- Parser_insertion_mode.In_template;
t.template_modes <- Parser_insertion_mode.In_template :: t.template_modes
| Token.Tag { kind = Token.End; name = "template"; _ } ->
if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then
parse_error t "unexpected-end-tag"
else begin
generate_all_implied_end_tags t;
(match current_node t with
| Some n when not (n.Dom.name = "template" && is_in_html_namespace n) -> parse_error t "unexpected-end-tag"
| _ -> ());
pop_until_html_tag t "template";
clear_active_formatting_to_marker t;
t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
reset_insertion_mode t
end
| Token.Tag { kind = Token.Start; name = "head"; _ } ->
parse_error t "unexpected-start-tag"
| Token.Tag { kind = Token.End; name; _ } ->
parse_error t ("unexpected-end-tag:" ^ name)
| _ ->
pop_current t;
t.mode <- Parser_insertion_mode.After_head;
process_token t token
and process_in_head_noscript t token =
match token with
| Token.Character data when is_whitespace data ->
process_in_head t token
| Token.Character _ ->
parse_error t "unexpected-char-in-noscript";
pop_current t; (* Pop noscript *)
t.mode <- Parser_insertion_mode.In_head;
process_token t token
| Token.Comment _ ->
process_in_head t token
| Token.Doctype _ ->
parse_error t "unexpected-doctype"
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
process_in_body t token
| Token.Tag { kind = Token.Start; name; _ }
when List.mem name ["basefont"; "bgsound"; "link"; "meta"; "noframes"; "style"] ->
process_in_head t token
| Token.Tag { kind = Token.Start; name; _ }
when List.mem name ["head"; "noscript"] ->
parse_error t "unexpected-start-tag"
| Token.Tag { kind = Token.Start; name; _ } ->
parse_error t ("bad-start-tag-in-head-noscript:" ^ name);
pop_current t; (* Pop noscript *)
t.mode <- Parser_insertion_mode.In_head;
process_token t token
| Token.Tag { kind = Token.End; name = "noscript"; _ } ->
pop_current t; (* Pop noscript *)
t.mode <- Parser_insertion_mode.In_head
| Token.Tag { kind = Token.End; name = "br"; _ } ->
parse_error t "unexpected-end-tag";
pop_current t; (* Pop noscript *)
t.mode <- Parser_insertion_mode.In_head;
process_token t token
| Token.Tag { kind = Token.End; name; _ } ->
parse_error t ("unexpected-end-tag:" ^ name)
| Token.EOF ->
parse_error t "expected-closing-tag-but-got-eof";
pop_current t; (* Pop noscript *)
t.mode <- Parser_insertion_mode.In_head;
process_token t token
and process_after_head t token =
match token with
| Token.Character data when is_whitespace data ->
insert_character t data
| Token.Comment data ->
insert_comment t data
| Token.Doctype _ ->
parse_error t "unexpected-doctype"
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
process_in_body t token
| Token.Tag { kind = Token.Start; name = "body"; attrs; _ } ->
ignore (insert_element t "body" ~push:true attrs);
t.frameset_ok <- false;
t.mode <- Parser_insertion_mode.In_body
| Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } ->
ignore (insert_element t "frameset" ~push:true attrs);
t.mode <- Parser_insertion_mode.In_frameset
| Token.Tag { kind = Token.Start; name = "input"; attrs; _ } ->
(* Special handling for input type="hidden" - parse error, ignore *)
let input_type = List.assoc_opt "type" attrs in
(match input_type with
| Some typ when String.lowercase_ascii typ = "hidden" ->
parse_error t "unexpected-hidden-input-after-head"
| _ ->
(* Non-hidden input creates body *)
let body = insert_element t "body" [] in
t.open_elements <- body :: t.open_elements;
t.mode <- Parser_insertion_mode.In_body;
process_token t token)
| Token.Tag { kind = Token.Start; name; _ }
when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] ->
parse_error t "unexpected-start-tag";
(match t.head_element with
| Some head ->
t.open_elements <- head :: t.open_elements;
process_in_head t token;
t.open_elements <- List.filter (fun n -> n != head) t.open_elements
| None -> ())
| Token.Tag { kind = Token.End; name = "template"; _ } ->
process_in_head t token
| Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] ->
let body = insert_element t "body" [] in
t.open_elements <- body :: t.open_elements;
t.mode <- Parser_insertion_mode.In_body;
process_token t token
| Token.Tag { kind = Token.Start; name = "head"; _ } ->
parse_error t "unexpected-start-tag"
| Token.Tag { kind = Token.End; name; _ } ->
parse_error t ("unexpected-end-tag:" ^ name)
| _ ->
let body = insert_element t "body" [] in
t.open_elements <- body :: t.open_elements;
t.mode <- Parser_insertion_mode.In_body;
process_token t token
and process_in_body t token =
match token with
| Token.Character "\x00" ->
parse_error t "unexpected-null-character"
| Token.Character data ->
reconstruct_active_formatting t;
insert_character t data;
if not (is_whitespace data) then t.frameset_ok <- false
| Token.Comment data ->
insert_comment t data
| Token.Doctype _ ->
parse_error t "unexpected-doctype"
| Token.Tag { kind = Token.Start; name = "html"; attrs; _ } ->
parse_error t "unexpected-start-tag";
if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then
(* Find the html element (at the bottom of the stack) *)
let html_elem = List.find_opt (fun n -> n.Dom.name = "html") t.open_elements in
(match html_elem with
| Some html ->
List.iter (fun (k, v) ->
if not (Dom.has_attr html k) then Dom.set_attr html k v
) attrs
| None -> ())
| Token.Tag { kind = Token.Start; name; _ }
when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] ->
process_in_head t token
| Token.Tag { kind = Token.End; name = "template"; _ } ->
process_in_head t token
| Token.Tag { kind = Token.Start; name = "body"; attrs; _ } ->
parse_error t "unexpected-start-tag";
(* Find body element on stack - it should be near the end (html is last) *)
let body = List.find_opt (fun n -> n.Dom.name = "body") t.open_elements in
(match body with
| Some body when not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) ->
t.frameset_ok <- false;
List.iter (fun (k, v) ->
if not (Dom.has_attr body k) then Dom.set_attr body k v
) attrs
| _ -> ())
| Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } ->
if not t.frameset_ok then
parse_error t "unexpected-start-tag-ignored"
else begin
(* Find body element on the stack *)
let rec find_body_index idx = function
| [] -> None
| n :: rest ->
if n.Dom.name = "body" then Some (idx, n)
else find_body_index (idx + 1) rest
in
match find_body_index 0 t.open_elements with
| None ->
parse_error t "unexpected-start-tag-ignored"
| Some (idx, body_elem) ->
(* Remove body from its parent (the html element) *)
(match body_elem.Dom.parent with
| Some parent -> Dom.remove_child parent body_elem
| None -> ());
(* Pop all elements up to and including body - keep only elements after body_idx *)
let rec drop n lst = if n <= 0 then lst else match lst with [] -> [] | _ :: rest -> drop (n - 1) rest in
t.open_elements <- drop (idx + 1) t.open_elements;
(* Insert frameset element *)
ignore (insert_element t "frameset" ~push:true attrs);
t.mode <- Parser_insertion_mode.In_frameset
end
| Token.EOF ->
if t.template_modes <> [] then
process_in_template t token
else begin
let has_unclosed = List.exists (fun n ->
not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"])
) t.open_elements in
if has_unclosed then parse_error t "expected-closing-tag-but-got-eof"
end
| Token.Tag { kind = Token.End; name = "body"; _ } ->
if not (has_element_in_scope t "body") then
parse_error t "unexpected-end-tag"
else begin
let has_unclosed = List.exists (fun n ->
not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"])
) t.open_elements in
if has_unclosed then parse_error t "end-tag-too-early";
t.mode <- Parser_insertion_mode.After_body
end
| Token.Tag { kind = Token.End; name = "html"; _ } ->
if not (has_element_in_scope t "body") then
parse_error t "unexpected-end-tag"
else begin
t.mode <- Parser_insertion_mode.After_body;
process_token t token
end
| Token.Tag { kind = Token.Start; name; attrs; _ }
when List.mem name ["address"; "article"; "aside"; "blockquote"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "main"; "menu"; "nav"; "ol"; "p"; "search"; "section"; "summary"; "ul"] ->
if has_element_in_button_scope t "p" then close_p_element t;
ignore (insert_element t name ~push:true attrs)
| Token.Tag { kind = Token.Start; name; attrs; _ } when Parser_constants.is_heading_element name ->
if has_element_in_button_scope t "p" then close_p_element t;
(match current_node t with
| Some n when Parser_constants.is_heading_element n.Dom.name ->
parse_error t "unexpected-start-tag";
pop_current t
| _ -> ());
ignore (insert_element t name ~push:true attrs)
| Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["pre"; "listing"] ->
if has_element_in_button_scope t "p" then close_p_element t;
ignore (insert_element t name ~push:true attrs);
t.ignore_lf <- true;
t.frameset_ok <- false
| Token.Tag { kind = Token.Start; name = "form"; attrs; _ } ->
if t.form_element <> None && not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then
parse_error t "unexpected-start-tag"
else begin
if has_element_in_button_scope t "p" then close_p_element t;
let form = insert_element t "form" attrs in
t.open_elements <- form :: t.open_elements;
if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then
t.form_element <- Some form
end
| Token.Tag { kind = Token.Start; name = "li"; attrs; _ } ->
t.frameset_ok <- false;
let rec check = function
| [] -> ()
| n :: rest ->
if n.Dom.name = "li" then begin
generate_implied_end_tags t ~except:"li" ();
(match current_node t with
| Some cn when cn.Dom.name <> "li" -> parse_error t "unexpected-start-tag"
| _ -> ());
pop_until_tag t "li"
end else if is_special_element n && not (List.mem (String.lowercase_ascii n.Dom.name) ["address"; "div"; "p"]) then
()
else
check rest
in
check t.open_elements;
if has_element_in_button_scope t "p" then close_p_element t;
ignore (insert_element t "li" ~push:true attrs)
| Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["dd"; "dt"] ->
t.frameset_ok <- false;
let rec check = function
| [] -> ()
| n :: rest ->
if List.mem n.Dom.name ["dd"; "dt"] then begin
generate_implied_end_tags t ~except:n.Dom.name ();
(match current_node t with
| Some cn when cn.Dom.name <> n.Dom.name -> parse_error t "unexpected-start-tag"
| _ -> ());
pop_until_one_of t ["dd"; "dt"]
end else if is_special_element n && not (List.mem (String.lowercase_ascii n.Dom.name) ["address"; "div"; "p"]) then
()
else
check rest
in
check t.open_elements;
if has_element_in_button_scope t "p" then close_p_element t;
ignore (insert_element t name ~push:true attrs)
| Token.Tag { kind = Token.Start; name = "plaintext"; _ } ->
if has_element_in_button_scope t "p" then close_p_element t;
ignore (insert_element t "plaintext" ~push:true [])
(* Tokenizer should switch to PLAINTEXT state *)
| Token.Tag { kind = Token.Start; name = "button"; attrs; _ } ->
if has_element_in_scope t "button" then begin
parse_error t "unexpected-start-tag";
generate_implied_end_tags t ();
pop_until_tag t "button"
end;
reconstruct_active_formatting t;
ignore (insert_element t "button" ~push:true attrs);
t.frameset_ok <- false
| Token.Tag { kind = Token.End; name; _ }
when List.mem name ["address"; "article"; "aside"; "blockquote"; "button"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "listing"; "main"; "menu"; "nav"; "ol"; "pre"; "search"; "section"; "summary"; "ul"] ->
if not (has_element_in_scope t name) then
parse_error t ("unexpected-end-tag:" ^ name)
else begin
generate_implied_end_tags t ();
(match current_node t with
| Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
| _ -> ());
pop_until_tag t name
end
| Token.Tag { kind = Token.End; name = "form"; _ } ->
if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then begin
let node = t.form_element in
t.form_element <- None;
match node with
| None -> parse_error t "unexpected-end-tag"
| Some form_node ->
if not (has_element_in_scope t "form") then
parse_error t "unexpected-end-tag"
else begin
generate_implied_end_tags t ();
(match current_node t with
| Some n when n != form_node -> parse_error t "end-tag-too-early"
| _ -> ());
t.open_elements <- List.filter (fun n -> n != form_node) t.open_elements
end
end else begin
if not (has_element_in_scope t "form") then
parse_error t "unexpected-end-tag"
else begin
generate_implied_end_tags t ();
(match current_node t with
| Some n when n.Dom.name <> "form" -> parse_error t "end-tag-too-early"
| _ -> ());
pop_until_tag t "form"
end
end
| Token.Tag { kind = Token.End; name = "p"; _ } ->
if not (has_element_in_button_scope t "p") then begin
parse_error t "no-p-element-in-scope";
ignore (insert_element t "p" ~push:true [])
end;
close_p_element t
| Token.Tag { kind = Token.End; name = "li"; _ } ->
if not (has_element_in_list_item_scope t "li") then
parse_error t "unexpected-end-tag"
else begin
generate_implied_end_tags t ~except:"li" ();
(match current_node t with
| Some n when n.Dom.name <> "li" -> parse_error t "end-tag-too-early"
| _ -> ());
pop_until_tag t "li"
end
| Token.Tag { kind = Token.End; name; _ } when List.mem name ["dd"; "dt"] ->
if not (has_element_in_scope t name) then
parse_error t "unexpected-end-tag"
else begin
generate_implied_end_tags t ~except:name ();
(match current_node t with
| Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
| _ -> ());
pop_until_tag t name
end
| Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_heading_element name ->
if not (has_element_in_scope_impl t Parser_constants.heading_elements Parser_constants.default_scope ~check_integration_points:true) then
parse_error t "unexpected-end-tag"
else begin
generate_implied_end_tags t ();
(match current_node t with
| Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
| _ -> ());
pop_until_one_of t Parser_constants.heading_elements
end
| Token.Tag { kind = Token.Start; name = "a"; attrs; _ } ->
(* Check for existing in active formatting *)
let rec find_a = function
| [] -> None
| Marker :: _ -> None
| Entry e :: _ when e.name = "a" -> Some e.node
| _ :: rest -> find_a rest
in
(match find_a t.active_formatting with
| Some existing ->
parse_error t "unexpected-start-tag";
adoption_agency t "a";
t.active_formatting <- List.filter (function
| Entry e -> e.node != existing
| _ -> true
) t.active_formatting;
t.open_elements <- List.filter (fun n -> n != existing) t.open_elements
| None -> ());
reconstruct_active_formatting t;
let node = insert_element t "a" attrs in
t.open_elements <- node :: t.open_elements;
push_formatting_element t node "a" attrs
| Token.Tag { kind = Token.Start; name; attrs; _ }
when List.mem name ["b"; "big"; "code"; "em"; "font"; "i"; "s"; "small"; "strike"; "strong"; "tt"; "u"] ->
reconstruct_active_formatting t;
let node = insert_element t name attrs in
t.open_elements <- node :: t.open_elements;
push_formatting_element t node name attrs
| Token.Tag { kind = Token.Start; name = "nobr"; attrs; _ } ->
if has_element_in_scope t "nobr" then begin
parse_error t "unexpected-start-tag";
adoption_agency t "nobr";
(* Remove nobr from active formatting *)
t.active_formatting <- List.filter (function
| Entry e -> e.name <> "nobr"
| Marker -> true
) t.active_formatting;
(* Remove nobr from open elements *)
t.open_elements <- List.filter (fun n -> n.Dom.name <> "nobr") t.open_elements
end;
reconstruct_active_formatting t;
let node = insert_element t "nobr" attrs in
t.open_elements <- node :: t.open_elements;
push_formatting_element t node "nobr" attrs
| Token.Tag { kind = Token.End; name; _ }
when List.mem name ["a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small"; "strike"; "strong"; "tt"; "u"] ->
adoption_agency t name
| Token.Tag { kind = Token.Start; name; attrs; _ }
when List.mem name ["applet"; "marquee"; "object"] ->
reconstruct_active_formatting t;
ignore (insert_element t name ~push:true attrs);
push_formatting_marker t;
t.frameset_ok <- false
| Token.Tag { kind = Token.End; name; _ }
when List.mem name ["applet"; "marquee"; "object"] ->
if not (has_element_in_scope t name) then
parse_error t "unexpected-end-tag"
else begin
generate_implied_end_tags t ();
(match current_node t with
| Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
| _ -> ());
pop_until_tag t name;
clear_active_formatting_to_marker t
end
| Token.Tag { kind = Token.Start; name = "table"; attrs; _ } ->
if t.quirks_mode <> Dom.Quirks && has_element_in_button_scope t "p" then
close_p_element t;
ignore (insert_element t "table" ~push:true attrs);
t.frameset_ok <- false;
t.mode <- Parser_insertion_mode.In_table
| Token.Tag { kind = Token.End; name = "br"; _ } ->
parse_error t "end-tag-br";
reconstruct_active_formatting t;
ignore (insert_element t "br" ~push:true []);
pop_current t;
t.frameset_ok <- false
| Token.Tag { kind = Token.Start; name; attrs; _ }
when List.mem name ["area"; "br"; "embed"; "img"; "keygen"; "wbr"] ->
reconstruct_active_formatting t;
ignore (insert_element t name ~push:true attrs);
pop_current t;
t.frameset_ok <- false
| Token.Tag { kind = Token.Start; name = "input"; attrs; _ } ->
reconstruct_active_formatting t;
ignore (insert_element t "input" ~push:true attrs);
pop_current t;
let is_hidden = List.exists (fun (k, v) ->
String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden"
) attrs in
if not is_hidden then t.frameset_ok <- false
| Token.Tag { kind = Token.Start; name; attrs; _ }
when List.mem name ["param"; "source"; "track"] ->
ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing = false });
pop_current t
| Token.Tag { kind = Token.Start; name = "hr"; _ } ->
if has_element_in_button_scope t "p" then close_p_element t;
ignore (insert_element t "hr" ~push:true []);
pop_current t;
t.frameset_ok <- false
| Token.Tag { kind = Token.Start; name = "image"; attrs; _ } ->
parse_error t "unexpected-start-tag";
(* Treat *)
reconstruct_active_formatting t;
ignore (insert_element t "img" ~push:true attrs);
pop_current t;
t.frameset_ok <- false
| Token.Tag { kind = Token.Start; name = "textarea"; attrs; _ } ->
ignore (insert_element t "textarea" ~push:true attrs);
t.ignore_lf <- true;
t.original_mode <- Some t.mode;
t.frameset_ok <- false;
t.mode <- Parser_insertion_mode.Text
| Token.Tag { kind = Token.Start; name = "xmp"; attrs; _ } ->
if has_element_in_button_scope t "p" then close_p_element t;
reconstruct_active_formatting t;
t.frameset_ok <- false;
ignore (insert_element_for_token t { kind = Token.Start; name = "xmp"; attrs; self_closing = false });
t.original_mode <- Some t.mode;
t.mode <- Parser_insertion_mode.Text
| Token.Tag { kind = Token.Start; name = "iframe"; attrs; _ } ->
t.frameset_ok <- false;
ignore (insert_element_for_token t { kind = Token.Start; name = "iframe"; attrs; self_closing = false });
t.original_mode <- Some t.mode;
t.mode <- Parser_insertion_mode.Text
| Token.Tag { kind = Token.Start; name = "noembed"; attrs; _ } ->
ignore (insert_element_for_token t { kind = Token.Start; name = "noembed"; attrs; self_closing = false });
t.original_mode <- Some t.mode;
t.mode <- Parser_insertion_mode.Text
| Token.Tag { kind = Token.Start; name = "select"; attrs; _ } ->
reconstruct_active_formatting t;
ignore (insert_element t "select" ~push:true attrs);
t.frameset_ok <- false;
if List.mem t.mode [Parser_insertion_mode.In_table; Parser_insertion_mode.In_caption; Parser_insertion_mode.In_table_body; Parser_insertion_mode.In_row; Parser_insertion_mode.In_cell] then
t.mode <- Parser_insertion_mode.In_select_in_table
else
t.mode <- Parser_insertion_mode.In_select
| Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["optgroup"; "option"] ->
(match current_node t with
| Some n when n.Dom.name = "option" -> pop_current t
| _ -> ());
reconstruct_active_formatting t;
ignore (insert_element t name ~push:true attrs)
| Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["rb"; "rtc"] ->
if has_element_in_scope t "ruby" then begin
generate_implied_end_tags t ()
end;
(match current_node t with
| Some n when n.Dom.name <> "ruby" && n.Dom.name <> "rtc" -> parse_error t "unexpected-start-tag"
| _ -> ());
ignore (insert_element t name ~push:true attrs)
| Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["rp"; "rt"] ->
if has_element_in_scope t "ruby" then begin
generate_implied_end_tags t ~except:"rtc" ()
end;
(match current_node t with
| Some n when n.Dom.name <> "ruby" && n.Dom.name <> "rtc" -> parse_error t "unexpected-start-tag"
| _ -> ());
ignore (insert_element t name ~push:true attrs)
| Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } ->
reconstruct_active_formatting t;
let adjusted_attrs = Parser_constants.adjust_mathml_attrs (Parser_constants.adjust_foreign_attrs attrs) in
ignore (insert_foreign_element t { kind = Token.Start; name = "math"; attrs = adjusted_attrs; self_closing } (Some "mathml"));
if self_closing then pop_current t
| Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } ->
reconstruct_active_formatting t;
let adjusted_attrs = Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs attrs) in
ignore (insert_foreign_element t { kind = Token.Start; name = "svg"; attrs = adjusted_attrs; self_closing } (Some "svg"));
if self_closing then pop_current t
| Token.Tag { kind = Token.Start; name; attrs; _ }
when List.mem name ["col"; "frame"] ->
(* In fragment context, insert these; otherwise ignore *)
if t.fragment_context = None then
parse_error t "unexpected-start-tag-ignored"
else
ignore (insert_element t name attrs)
| Token.Tag { kind = Token.Start; name; _ }
when List.mem name ["caption"; "colgroup"; "head"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
parse_error t "unexpected-start-tag"
| Token.Tag { kind = Token.Start; name; attrs; self_closing } ->
(* Any other start tag *)
reconstruct_active_formatting t;
ignore (insert_element t name ~push:true attrs);
(* Check for self-closing on non-void HTML element *)
if self_closing && not (Parser_constants.is_void_element name) then
parse_error t "non-void-html-element-start-tag-with-trailing-solidus"
| Token.Tag { kind = Token.End; name; _ } ->
(* Any other end tag *)
let rec check = function
| [] -> ()
| node :: rest ->
if node.Dom.name = name then begin
generate_implied_end_tags t ~except:name ();
(match current_node t with
| Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
| _ -> ());
pop_until t (fun n -> n == node)
end else if is_special_element node then
parse_error t ("unexpected-end-tag:" ^ name)
else
check rest
in
check t.open_elements
and process_text t token =
match token with
| Token.Character data ->
insert_character t data
| Token.EOF ->
parse_error t "expected-closing-tag-but-got-eof";
pop_current t;
t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_body;
process_token t token
| Token.Tag { kind = Token.End; _ } ->
pop_current t;
t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_body
| _ -> ()
and process_in_table t token =
match token with
| Token.Character _ when (match current_node t with Some n -> Parser_constants.is_foster_parenting_element n.Dom.name | None -> false) ->
t.pending_table_chars <- [];
t.original_mode <- Some t.mode;
t.mode <- Parser_insertion_mode.In_table_text;
process_token t token
| Token.Comment data ->
insert_comment t data
| Token.Doctype _ ->
parse_error t "unexpected-doctype"
| Token.Tag { kind = Token.Start; name = "caption"; attrs; _ } ->
clear_stack_back_to_table_context t;
push_formatting_marker t;
ignore (insert_element t "caption" ~push:true attrs);
t.mode <- Parser_insertion_mode.In_caption
| Token.Tag { kind = Token.Start; name = "colgroup"; attrs; _ } ->
clear_stack_back_to_table_context t;
ignore (insert_element t "colgroup" ~push:true attrs);
t.mode <- Parser_insertion_mode.In_column_group
| Token.Tag { kind = Token.Start; name = "col"; _ } ->
clear_stack_back_to_table_context t;
ignore (insert_element t "colgroup" ~push:true []);
t.mode <- Parser_insertion_mode.In_column_group;
process_token t token
| Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
clear_stack_back_to_table_context t;
ignore (insert_element t name ~push:true attrs);
t.mode <- Parser_insertion_mode.In_table_body
| Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"; "tr"] ->
clear_stack_back_to_table_context t;
ignore (insert_element t "tbody" ~push:true []);
t.mode <- Parser_insertion_mode.In_table_body;
process_token t token
| Token.Tag { kind = Token.Start; name = "table"; _ } ->
parse_error t "unexpected-start-tag";
if has_element_in_table_scope t "table" then begin
pop_until_tag t "table";
reset_insertion_mode t;
process_token t token
end
| Token.Tag { kind = Token.End; name = "table"; _ } ->
if not (has_element_in_table_scope t "table") then
parse_error t "unexpected-end-tag"
else begin
pop_until_tag t "table";
reset_insertion_mode t
end
| Token.Tag { kind = Token.End; name; _ }
when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
parse_error t "unexpected-end-tag"
| Token.Tag { kind = Token.Start; name; _ } when List.mem name ["style"; "script"; "template"] ->
process_in_head t token
| Token.Tag { kind = Token.End; name = "template"; _ } ->
process_in_head t token
| Token.Tag { kind = Token.Start; name = "input"; attrs; _ } ->
let is_hidden = List.exists (fun (k, v) ->
String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden"
) attrs in
if not is_hidden then begin
parse_error t "start-tag-in-table:input";
t.foster_parenting <- true;
process_in_body t token;
t.foster_parenting <- false
end else begin
parse_error t "start-tag-in-table:input";
ignore (insert_element t "input" ~push:true attrs);
pop_current t
end
| Token.Tag { kind = Token.Start; name = "form"; attrs; _ } ->
parse_error t "unexpected-start-tag";
if t.form_element = None && not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then begin
let form = insert_element t "form" attrs in
t.open_elements <- form :: t.open_elements;
t.form_element <- Some form;
pop_current t
end
| Token.EOF ->
process_in_body t token
| _ ->
parse_error t "unexpected-token-in-table";
t.foster_parenting <- true;
process_in_body t token;
t.foster_parenting <- false
and clear_stack_back_to_table_context t =
let rec loop () =
match current_node t with
| Some n when not (List.mem n.Dom.name ["table"; "template"; "html"]) ->
pop_current t;
loop ()
| _ -> ()
in
loop ()
and process_in_table_text t token =
match token with
| Token.Character data ->
if String.contains data '\x00' then
parse_error t "unexpected-null-character"
else
t.pending_table_chars <- data :: t.pending_table_chars
| _ ->
let pending = String.concat "" (List.rev t.pending_table_chars) in
t.pending_table_chars <- [];
if not (is_whitespace pending) then begin
parse_error t "unexpected-character-in-table";
t.foster_parenting <- true;
reconstruct_active_formatting t;
insert_character t pending;
t.foster_parenting <- false
end else
insert_character t pending;
t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_table;
process_token t token
and process_in_caption t token =
match token with
| Token.Tag { kind = Token.End; name = "caption"; _ } ->
if not (has_element_in_table_scope t "caption") then
parse_error t "unexpected-end-tag"
else begin
generate_implied_end_tags t ();
(match current_node t with
| Some n when n.Dom.name <> "caption" -> parse_error t "end-tag-too-early"
| _ -> ());
pop_until_tag t "caption";
clear_active_formatting_to_marker t;
t.mode <- Parser_insertion_mode.In_table
end
| Token.Tag { kind = Token.Start; name; _ }
when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
if not (has_element_in_table_scope t "caption") then
parse_error t "unexpected-start-tag"
else begin
generate_implied_end_tags t ();
pop_until_tag t "caption";
clear_active_formatting_to_marker t;
t.mode <- Parser_insertion_mode.In_table;
process_token t token
end
| Token.Tag { kind = Token.End; name = "table"; _ } ->
if not (has_element_in_table_scope t "caption") then
parse_error t "unexpected-end-tag"
else begin
generate_implied_end_tags t ();
pop_until_tag t "caption";
clear_active_formatting_to_marker t;
t.mode <- Parser_insertion_mode.In_table;
process_token t token
end
| Token.Tag { kind = Token.End; name; _ }
when List.mem name ["body"; "col"; "colgroup"; "html"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
parse_error t "unexpected-end-tag"
| _ ->
process_in_body t token
and process_in_column_group t token =
match token with
| Token.Character data when is_whitespace data ->
insert_character t data
| Token.Character data ->
(* Split leading whitespace from non-whitespace *)
let ws_chars = [' '; '\t'; '\n'; '\x0C'; '\r'] in
let len = String.length data in
let ws_end = ref 0 in
while !ws_end < len && List.mem data.[!ws_end] ws_chars do incr ws_end done;
if !ws_end > 0 then
insert_character t (String.sub data 0 !ws_end);
if !ws_end < len then begin
let remaining = String.sub data !ws_end (len - !ws_end) in
(match current_node t with
| Some n when n.Dom.name = "colgroup" ->
pop_current t;
t.mode <- Parser_insertion_mode.In_table;
process_token t (Token.Character remaining)
| _ ->
parse_error t "unexpected-token")
end
| Token.Comment data ->
insert_comment t data
| Token.Doctype _ ->
parse_error t "unexpected-doctype"
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
process_in_body t token
| Token.Tag { kind = Token.Start; name = "col"; attrs; _ } ->
ignore (insert_element t "col" ~push:true attrs);
pop_current t
| Token.Tag { kind = Token.End; name = "colgroup"; _ } ->
(match current_node t with
| Some n when n.Dom.name <> "colgroup" -> parse_error t "unexpected-end-tag"
| Some _ -> pop_current t; t.mode <- Parser_insertion_mode.In_table
| None -> parse_error t "unexpected-end-tag")
| Token.Tag { kind = Token.End; name = "col"; _ } ->
parse_error t "unexpected-end-tag"
| Token.Tag { kind = Token.Start; name = "template"; _ }
| Token.Tag { kind = Token.End; name = "template"; _ } ->
process_in_head t token
| Token.EOF ->
process_in_body t token
| _ ->
(match current_node t with
| Some n when n.Dom.name = "colgroup" ->
pop_current t;
t.mode <- Parser_insertion_mode.In_table;
process_token t token
| _ ->
parse_error t "unexpected-token")
and process_in_table_body t token =
match token with
| Token.Tag { kind = Token.Start; name = "tr"; attrs; _ } ->
clear_stack_back_to_table_body_context t;
ignore (insert_element t "tr" ~push:true attrs);
t.mode <- Parser_insertion_mode.In_row
| Token.Tag { kind = Token.Start; name; _ } when List.mem name ["th"; "td"] ->
parse_error t "unexpected-start-tag";
clear_stack_back_to_table_body_context t;
ignore (insert_element t "tr" ~push:true []);
t.mode <- Parser_insertion_mode.In_row;
process_token t token
| Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
if not (has_element_in_table_scope t name) then
parse_error t "unexpected-end-tag"
else begin
clear_stack_back_to_table_body_context t;
pop_current t;
t.mode <- Parser_insertion_mode.In_table
end
| Token.Tag { kind = Token.Start; name; _ }
when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"] ->
if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Parser_constants.table_scope ~check_integration_points:false) then
parse_error t "unexpected-start-tag"
else begin
clear_stack_back_to_table_body_context t;
pop_current t;
t.mode <- Parser_insertion_mode.In_table;
process_token t token
end
| Token.Tag { kind = Token.End; name = "table"; _ } ->
if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Parser_constants.table_scope ~check_integration_points:false) then
parse_error t "unexpected-end-tag"
else begin
clear_stack_back_to_table_body_context t;
pop_current t;
t.mode <- Parser_insertion_mode.In_table;
process_token t token
end
| Token.Tag { kind = Token.End; name; _ }
when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "td"; "th"; "tr"] ->
parse_error t "unexpected-end-tag"
| _ ->
process_in_table t token
and clear_stack_back_to_table_body_context t =
let rec loop () =
match current_node t with
| Some n when not (List.mem n.Dom.name ["tbody"; "tfoot"; "thead"; "template"; "html"]) ->
pop_current t;
loop ()
| _ -> ()
in
loop ()
and process_in_row t token =
match token with
| Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["th"; "td"] ->
clear_stack_back_to_table_row_context t;
ignore (insert_element t name ~push:true attrs);
t.mode <- Parser_insertion_mode.In_cell;
push_formatting_marker t
| Token.Tag { kind = Token.End; name = "tr"; _ } ->
if not (has_element_in_table_scope t "tr") then
parse_error t "unexpected-end-tag"
else begin
clear_stack_back_to_table_row_context t;
pop_current t;
t.mode <- Parser_insertion_mode.In_table_body
end
| Token.Tag { kind = Token.Start; name; _ }
when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"; "tr"] ->
if not (has_element_in_table_scope t "tr") then
parse_error t "unexpected-start-tag"
else begin
clear_stack_back_to_table_row_context t;
pop_current t;
t.mode <- Parser_insertion_mode.In_table_body;
process_token t token
end
| Token.Tag { kind = Token.End; name = "table"; _ } ->
if not (has_element_in_table_scope t "tr") then
parse_error t "unexpected-end-tag"
else begin
clear_stack_back_to_table_row_context t;
pop_current t;
t.mode <- Parser_insertion_mode.In_table_body;
process_token t token
end
| Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
if not (has_element_in_table_scope t name) then
parse_error t "unexpected-end-tag"
else if not (has_element_in_table_scope t "tr") then
parse_error t "unexpected-end-tag"
else begin
clear_stack_back_to_table_row_context t;
pop_current t;
t.mode <- Parser_insertion_mode.In_table_body;
process_token t token
end
| Token.Tag { kind = Token.End; name; _ }
when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "td"; "th"] ->
parse_error t "unexpected-end-tag"
| _ ->
process_in_table t token
and clear_stack_back_to_table_row_context t =
let rec loop () =
match current_node t with
| Some n when not (List.mem n.Dom.name ["tr"; "template"; "html"]) ->
pop_current t;
loop ()
| _ -> ()
in
loop ()
and process_in_cell t token =
match token with
| Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_table_cell_element name ->
if not (has_element_in_table_scope t name) then
parse_error t "unexpected-end-tag"
else begin
generate_implied_end_tags t ();
(match current_node t with
| Some n when not (n.Dom.name = name && is_in_html_namespace n) -> parse_error t "end-tag-too-early"
| _ -> ());
pop_until_html_tag t name;
clear_active_formatting_to_marker t;
t.mode <- Parser_insertion_mode.In_row
end
| Token.Tag { kind = Token.Start; name; _ }
when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
if not (has_element_in_scope_impl t ["td"; "th"] Parser_constants.table_scope ~check_integration_points:false) then
parse_error t "unexpected-start-tag"
else begin
close_cell t;
process_token t token
end
| Token.Tag { kind = Token.End; name; _ }
when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"] ->
parse_error t "unexpected-end-tag"
| Token.Tag { kind = Token.End; name; _ }
when Parser_constants.is_foster_parenting_element name ->
if not (has_element_in_table_scope t name) then
parse_error t "unexpected-end-tag"
else begin
close_cell t;
process_token t token
end
| _ ->
process_in_body t token
and close_cell t =
generate_implied_end_tags t ();
(match current_node t with
| Some n when not (Parser_constants.is_table_cell_element n.Dom.name && is_in_html_namespace n) -> parse_error t "end-tag-too-early"
| _ -> ());
pop_until_html_one_of t ["td"; "th"];
clear_active_formatting_to_marker t;
t.mode <- Parser_insertion_mode.In_row
and process_in_select t token =
match token with
| Token.Character "\x00" ->
parse_error t "unexpected-null-character"
| Token.Character data ->
reconstruct_active_formatting t;
insert_character t data
| Token.Comment data ->
insert_comment t data
| Token.Doctype _ ->
parse_error t "unexpected-doctype"
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
process_in_body t token
| Token.Tag { kind = Token.Start; name = "option"; attrs; _ } ->
(match current_node t with
| Some n when n.Dom.name = "option" -> pop_current t
| _ -> ());
reconstruct_active_formatting t;
ignore (insert_element t "option" ~push:true attrs)
| Token.Tag { kind = Token.Start; name = "optgroup"; attrs; _ } ->
(match current_node t with
| Some n when n.Dom.name = "option" -> pop_current t
| _ -> ());
(match current_node t with
| Some n when n.Dom.name = "optgroup" -> pop_current t
| _ -> ());
reconstruct_active_formatting t;
ignore (insert_element t "optgroup" ~push:true attrs)
| Token.Tag { kind = Token.Start; name = "hr"; _ } ->
(match current_node t with
| Some n when n.Dom.name = "option" -> pop_current t
| _ -> ());
(match current_node t with
| Some n when n.Dom.name = "optgroup" -> pop_current t
| _ -> ());
ignore (insert_element t "hr" ~push:true []);
pop_current t
| Token.Tag { kind = Token.End; name = "optgroup"; _ } ->
(match t.open_elements with
| opt :: optg :: _ when opt.Dom.name = "option" && optg.Dom.name = "optgroup" ->
pop_current t
| _ -> ());
(match current_node t with
| Some n when n.Dom.name = "optgroup" -> pop_current t
| _ -> parse_error t "unexpected-end-tag")
| Token.Tag { kind = Token.End; name = "option"; _ } ->
(match current_node t with
| Some n when n.Dom.name = "option" -> pop_current t
| _ -> parse_error t "unexpected-end-tag")
| Token.Tag { kind = Token.End; name = "select"; _ } ->
if not (has_element_in_select_scope t "select") then
parse_error t "unexpected-end-tag"
else begin
pop_until_tag t "select";
reset_insertion_mode t
end
| Token.Tag { kind = Token.Start; name = "select"; _ } ->
parse_error t "unexpected-start-tag";
(* Per spec: in IN_SELECT mode, select is always on the stack - just pop *)
pop_until_tag t "select";
reset_insertion_mode t
| Token.Tag { kind = Token.Start; name; _ } when List.mem name ["input"; "textarea"] ->
parse_error t "unexpected-start-tag";
(* Per spec: in IN_SELECT mode, select is always on the stack - just pop *)
pop_until_tag t "select";
reset_insertion_mode t;
process_token t token
| Token.Tag { kind = Token.Start; name = "plaintext"; attrs; _ } ->
(* plaintext is allowed in select - creates element, parser will switch tokenizer to PLAINTEXT mode *)
reconstruct_active_formatting t;
ignore (insert_element t "plaintext" ~push:true attrs)
| Token.Tag { kind = Token.Start; name = "menuitem"; attrs; _ } ->
(* menuitem is allowed in select *)
reconstruct_active_formatting t;
ignore (insert_element t "menuitem" ~push:true attrs)
| Token.Tag { kind = Token.Start; name = "keygen"; attrs; _ } ->
(* keygen is handled specially in select - inserted directly *)
reconstruct_active_formatting t;
ignore (insert_element t "keygen" attrs)
(* Void element, don't push to stack *)
| Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } ->
reconstruct_active_formatting t;
let node = insert_foreign_element t { kind = Token.Start; name = "svg"; attrs; self_closing } (Some "svg") in
if not self_closing then t.open_elements <- node :: t.open_elements
| Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } ->
reconstruct_active_formatting t;
let node = insert_foreign_element t { kind = Token.Start; name = "math"; attrs; self_closing } (Some "mathml") in
if not self_closing then t.open_elements <- node :: t.open_elements
| Token.Tag { kind = Token.Start; name; _ } when List.mem name ["script"; "template"] ->
process_in_head t token
| Token.Tag { kind = Token.End; name = "template"; _ } ->
process_in_head t token
(* Allow certain HTML elements in select - newer spec behavior *)
| Token.Tag { kind = Token.Start; name; attrs; self_closing } when List.mem name ["p"; "div"; "span"; "button"; "datalist"; "selectedcontent"] ->
reconstruct_active_formatting t;
let node = insert_element t name attrs in
if not self_closing then t.open_elements <- node :: t.open_elements
| Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["br"; "img"] ->
reconstruct_active_formatting t;
ignore (insert_element t name attrs)
(* Don't push to stack - void elements *)
(* Handle formatting elements in select *)
| Token.Tag { kind = Token.Start; name; attrs; _ } when Parser_constants.is_formatting_element name ->
reconstruct_active_formatting t;
let node = insert_element t name ~push:true attrs in
push_formatting_element t node name attrs
| Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_formatting_element name ->
(* Find select element and check if formatting element is inside select *)
let select_idx = ref None in
let fmt_idx = ref None in
List.iteri (fun i n ->
if n.Dom.name = "select" && !select_idx = None then select_idx := Some i;
if n.Dom.name = name then fmt_idx := Some i
) t.open_elements;
(match !fmt_idx, !select_idx with
| Some fi, Some si when fi < si ->
(* Formatting element is inside select, run adoption agency *)
adoption_agency t name
| Some _, Some _ ->
(* Formatting element is outside select boundary - parse error, ignore *)
parse_error t "unexpected-end-tag"
| Some _, None ->
adoption_agency t name
| None, _ ->
parse_error t "unexpected-end-tag")
(* End tags for HTML elements allowed in select *)
| Token.Tag { kind = Token.End; name; _ } when List.mem name ["p"; "div"; "span"; "button"; "datalist"; "selectedcontent"] ->
(* Find select and target indices *)
let select_idx = ref None in
let target_idx = ref None in
List.iteri (fun i n ->
if n.Dom.name = "select" && !select_idx = None then select_idx := Some i;
if n.Dom.name = name then target_idx := Some i
) t.open_elements;
(* Only pop if target exists and is inside select (lower index = closer to current) *)
(match !target_idx, !select_idx with
| Some ti, Some si when ti < si ->
(* Pop until we reach the target *)
let rec pop_to_target () =
match t.open_elements with
| [] -> ()
| n :: rest ->
t.open_elements <- rest;
if n.Dom.name <> name then pop_to_target ()
in
pop_to_target ()
| Some _, Some _ ->
parse_error t "unexpected-end-tag"
| Some _, None ->
(* No select on stack, just pop to target *)
let rec pop_to_target () =
match t.open_elements with
| [] -> ()
| n :: rest ->
t.open_elements <- rest;
if n.Dom.name <> name then pop_to_target ()
in
pop_to_target ()
| None, _ ->
parse_error t "unexpected-end-tag")
| Token.EOF ->
process_in_body t token
| _ ->
parse_error t "unexpected-token-in-select"
and process_in_select_in_table t token =
match token with
| Token.Tag { kind = Token.Start; name; _ }
when List.mem name ["caption"; "table"; "tbody"; "tfoot"; "thead"; "tr"; "td"; "th"] ->
parse_error t "unexpected-start-tag";
pop_until_tag t "select";
reset_insertion_mode t;
process_token t token
| Token.Tag { kind = Token.End; name; _ }
when List.mem name ["caption"; "table"; "tbody"; "tfoot"; "thead"; "tr"; "td"; "th"] ->
parse_error t "unexpected-end-tag";
if has_element_in_table_scope t name then begin
pop_until_tag t "select";
reset_insertion_mode t;
process_token t token
end
| _ ->
process_in_select t token
and process_in_template t token =
match token with
| Token.Character _ | Token.Comment _ | Token.Doctype _ ->
process_in_body t token
| Token.Tag { kind = Token.Start; name; _ }
when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] ->
process_in_head t token
| Token.Tag { kind = Token.End; name = "template"; _ } ->
process_in_head t token
| Token.Tag { kind = Token.Start; name; _ }
when List.mem name ["caption"; "colgroup"; "tbody"; "tfoot"; "thead"] ->
t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
t.template_modes <- Parser_insertion_mode.In_table :: t.template_modes;
t.mode <- Parser_insertion_mode.In_table;
process_token t token
| Token.Tag { kind = Token.Start; name = "col"; _ } ->
t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
t.template_modes <- Parser_insertion_mode.In_column_group :: t.template_modes;
t.mode <- Parser_insertion_mode.In_column_group;
process_token t token
| Token.Tag { kind = Token.Start; name = "tr"; _ } ->
t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
t.template_modes <- Parser_insertion_mode.In_table_body :: t.template_modes;
t.mode <- Parser_insertion_mode.In_table_body;
process_token t token
| Token.Tag { kind = Token.Start; name; _ } when Parser_constants.is_table_cell_element name ->
t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
t.template_modes <- Parser_insertion_mode.In_row :: t.template_modes;
t.mode <- Parser_insertion_mode.In_row;
process_token t token
| Token.Tag { kind = Token.Start; _ } ->
t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
t.template_modes <- Parser_insertion_mode.In_body :: t.template_modes;
t.mode <- Parser_insertion_mode.In_body;
process_token t token
| Token.Tag { kind = Token.End; name; _ } ->
parse_error t ("unexpected-end-tag:" ^ name)
| Token.EOF ->
if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then
() (* Stop parsing *)
else begin
parse_error t "expected-closing-tag-but-got-eof";
pop_until_html_tag t "template";
clear_active_formatting_to_marker t;
t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
reset_insertion_mode t;
process_token t token
end
and process_after_body t token =
match token with
| Token.Character data when is_whitespace data ->
process_in_body t token
| Token.Comment data ->
(* Insert as last child of html element - html is at bottom of stack *)
let html_opt = List.find_opt (fun n -> n.Dom.name = "html") t.open_elements in
(match html_opt with
| Some html ->
let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in
Dom.append_child html (Dom.create_comment ~location data)
| None -> ())
| Token.Doctype _ ->
parse_error t "unexpected-doctype"
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
process_in_body t token
| Token.Tag { kind = Token.End; name = "html"; _ } ->
if t.fragment_context <> None then
parse_error t "unexpected-end-tag"
else
t.mode <- Parser_insertion_mode.After_after_body
| Token.EOF ->
() (* Stop parsing *)
| _ ->
parse_error t "unexpected-token-after-body";
t.mode <- Parser_insertion_mode.In_body;
process_token t token
and process_in_frameset t token =
match token with
| Token.Character data ->
(* Extract only whitespace characters and insert them *)
let whitespace = String.to_seq data
|> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' '])
|> String.of_seq in
if whitespace <> "" then insert_character t whitespace;
if not (is_whitespace data) then
parse_error t "unexpected-char-in-frameset"
| Token.Comment data ->
insert_comment t data
| Token.Doctype _ ->
parse_error t "unexpected-doctype"
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
process_in_body t token
| Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } ->
ignore (insert_element t "frameset" ~push:true attrs)
| Token.Tag { kind = Token.End; name = "frameset"; _ } ->
(match current_node t with
| Some n when n.Dom.name = "html" -> parse_error t "unexpected-end-tag"
| _ ->
pop_current t;
if t.fragment_context = None then
(match current_node t with
| Some n when n.Dom.name <> "frameset" -> t.mode <- Parser_insertion_mode.After_frameset
| _ -> ()))
| Token.Tag { kind = Token.Start; name = "frame"; attrs; _ } ->
ignore (insert_element t "frame" ~push:true attrs);
pop_current t
| Token.Tag { kind = Token.Start; name = "noframes"; _ } ->
process_in_head t token
| Token.EOF ->
(match current_node t with
| Some n when n.Dom.name <> "html" -> parse_error t "expected-closing-tag-but-got-eof"
| _ -> ())
| _ ->
parse_error t "unexpected-token-in-frameset"
and process_after_frameset t token =
match token with
| Token.Character data ->
(* Extract only whitespace characters and insert them *)
let whitespace = String.to_seq data
|> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' '])
|> String.of_seq in
if whitespace <> "" then insert_character t whitespace;
if not (is_whitespace data) then
parse_error t "unexpected-char-after-frameset"
| Token.Comment data ->
insert_comment t data
| Token.Doctype _ ->
parse_error t "unexpected-doctype"
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
process_in_body t token
| Token.Tag { kind = Token.End; name = "html"; _ } ->
t.mode <- Parser_insertion_mode.After_after_frameset
| Token.Tag { kind = Token.Start; name = "noframes"; _ } ->
process_in_head t token
| Token.EOF ->
() (* Stop parsing *)
| _ ->
parse_error t "unexpected-token-after-frameset"
and process_after_after_body t token =
match token with
| Token.Comment data ->
insert_comment_to_document t data
| Token.Doctype _ ->
process_in_body t token
| Token.Character data when is_whitespace data ->
process_in_body t token
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
process_in_body t token
| Token.EOF ->
() (* Stop parsing *)
| _ ->
parse_error t "unexpected-token-after-after-body";
t.mode <- Parser_insertion_mode.In_body;
process_token t token
and process_after_after_frameset t token =
match token with
| Token.Comment data ->
insert_comment_to_document t data
| Token.Doctype _ ->
process_in_body t token
| Token.Character data ->
(* Extract only whitespace characters and process using in_body rules *)
let whitespace = String.to_seq data
|> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' '])
|> String.of_seq in
if whitespace <> "" then process_in_body t (Token.Character whitespace);
if not (is_whitespace data) then
parse_error t "unexpected-char-after-after-frameset"
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
process_in_body t token
| Token.EOF ->
() (* Stop parsing *)
| Token.Tag { kind = Token.Start; name = "noframes"; _ } ->
process_in_head t token
| _ ->
parse_error t "unexpected-token-after-after-frameset"
and process_token t token =
(* Check for HTML integration points (SVG foreignObject, desc, title) *)
let is_html_integration_point node =
(* SVG foreignObject, desc, and title are always HTML integration points *)
if node.Dom.namespace = Some "svg" &&
Parser_constants.is_svg_html_integration node.Dom.name then true
(* annotation-xml is an HTML integration point only with specific encoding values *)
else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
match List.assoc_opt "encoding" node.Dom.attrs with
| Some enc ->
let enc_lower = String.lowercase_ascii enc in
enc_lower = "text/html" || enc_lower = "application/xhtml+xml"
| None -> false
else false
in
(* Check for MathML text integration points *)
let is_mathml_text_integration_point node =
node.Dom.namespace = Some "mathml" &&
Parser_constants.is_mathml_text_integration node.Dom.name
in
(* Foreign content handling *)
let in_foreign =
match adjusted_current_node t with
| None -> false
| Some node ->
if is_in_html_namespace node then false
else begin
(* At HTML integration points, characters and start tags (except mglyph/malignmark) use HTML rules *)
if is_html_integration_point node then begin
match token with
| Token.Character _ -> false
| Token.Tag { kind = Token.Start; _ } -> false
| _ -> true
end
(* At MathML text integration points, characters and start tags (except mglyph/malignmark) use HTML rules *)
else if is_mathml_text_integration_point node then begin
match token with
| Token.Character _ -> false
| Token.Tag { kind = Token.Start; name; _ } ->
name = "mglyph" || name = "malignmark"
| _ -> true
end
(* Special case: