(* HTML5 Tree Builder *) module Dom = Dom module Token = Tokenizer_token module State = Tokenizer_state type fragment_context = { tag_name : string; namespace : string option; } type formatting_entry = | Marker | Entry of { name : string; attrs : (string * string) list; node : Dom.node; } type parse_error = { code : Parse_error_code.t; line : int; column : int; } type t = { mutable document : Dom.node; mutable mode : Parser_insertion_mode.t; mutable original_mode : Parser_insertion_mode.t option; mutable open_elements : Dom.node list; mutable active_formatting : formatting_entry list; mutable head_element : Dom.node option; mutable form_element : Dom.node option; mutable frameset_ok : bool; mutable ignore_lf : bool; mutable foster_parenting : bool; mutable pending_table_chars : string list; mutable template_modes : Parser_insertion_mode.t list; mutable quirks_mode : Dom.quirks_mode; mutable errors : parse_error list; collect_errors : bool; fragment_context : fragment_context option; mutable fragment_context_element : Dom.node option; iframe_srcdoc : bool; mutable current_line : int; mutable current_column : int; } let create ?(collect_errors=false) ?fragment_context ?(iframe_srcdoc=false) () = let is_fragment = fragment_context <> None in let doc = if is_fragment then Dom.create_document_fragment () else Dom.create_document () in let t = { document = doc; mode = Parser_insertion_mode.Initial; original_mode = None; open_elements = []; active_formatting = []; head_element = None; form_element = None; frameset_ok = true; ignore_lf = false; foster_parenting = false; pending_table_chars = []; template_modes = []; quirks_mode = Dom.No_quirks; errors = []; collect_errors; fragment_context; fragment_context_element = None; iframe_srcdoc; current_line = 1; current_column = 1; } in (* Initialize fragment parsing *) (match fragment_context with | Some ctx -> let name = String.lowercase_ascii ctx.tag_name in let ns = ctx.namespace in (* Create html root *) let root = Dom.create_element "html" () in Dom.append_child doc root; t.open_elements <- [root]; (* For foreign content contexts, create context element *) (match ns with | Some namespace when namespace <> "html" -> let context_elem = Dom.create_element ctx.tag_name ~namespace:ns () in Dom.append_child root context_elem; t.open_elements <- [context_elem; root]; t.fragment_context_element <- Some context_elem | _ -> ()); (* Set initial mode based on context *) t.mode <- ( if name = "html" then Parser_insertion_mode.Before_head else if Parser_constants.is_table_section_element name && (ns = None || ns = Some "html") then Parser_insertion_mode.In_table_body else if name = "tr" && (ns = None || ns = Some "html") then Parser_insertion_mode.In_row else if Parser_constants.is_table_cell_element name && (ns = None || ns = Some "html") then Parser_insertion_mode.In_cell else if name = "caption" && (ns = None || ns = Some "html") then Parser_insertion_mode.In_caption else if name = "colgroup" && (ns = None || ns = Some "html") then Parser_insertion_mode.In_column_group else if name = "table" && (ns = None || ns = Some "html") then Parser_insertion_mode.In_table else if name = "template" && (ns = None || ns = Some "html") then begin t.template_modes <- [Parser_insertion_mode.In_template]; Parser_insertion_mode.In_template end else Parser_insertion_mode.In_body ); t.frameset_ok <- false | None -> ()); t (* Position tracking for error reporting *) let set_position t ~line ~column = t.current_line <- line; t.current_column <- column (* Error handling *) let parse_error t code = if t.collect_errors then t.errors <- { code = Parse_error_code.of_string code; line = t.current_line; column = t.current_column } :: t.errors (* Stack helpers *) let current_node t = match t.open_elements with | [] -> None | x :: _ -> Some x let adjusted_current_node t = match t.fragment_context, t.open_elements with | Some ctx, [_] -> (* Fragment case: use context element info *) Some (Dom.create_element ctx.tag_name ~namespace:ctx.namespace ()) | _, x :: _ -> Some x | _, [] -> None let is_in_html_namespace node = node.Dom.namespace = None || node.Dom.namespace = Some "html" (* Namespace-aware check for "special" elements per WHATWG spec *) let is_special_element node = let name = String.lowercase_ascii node.Dom.name in match node.Dom.namespace with | None | Some "html" -> Parser_constants.is_special name | Some "mathml" -> List.mem name ["mi"; "mo"; "mn"; "ms"; "mtext"; "annotation-xml"] | Some "svg" -> List.mem name ["foreignobject"; "desc"; "title"] | _ -> false let adjusted_current_node_in_html_namespace t = match adjusted_current_node t with | Some node -> is_in_html_namespace node | None -> true (* Insertion helpers *) let appropriate_insertion_place t = match current_node t with | None -> (t.document, None) | Some target -> if t.foster_parenting && Parser_constants.is_foster_parenting_element target.Dom.name then begin (* Foster parenting per WHATWG spec *) (* Step 1: Find last (most recent) template and table in stack *) (* Note: index 0 = top of stack = most recently added *) let last_template_idx = ref None in let last_table_idx = ref None in List.iteri (fun i n -> (* Take first match (most recent = lowest index) *) if n.Dom.name = "template" && !last_template_idx = None then last_template_idx := Some i; if n.Dom.name = "table" && !last_table_idx = None then last_table_idx := Some i ) t.open_elements; (* Step 2-3: If last template is more recent than last table (lower index = more recent) *) match !last_template_idx, !last_table_idx with | Some ti, None -> (* No table, use template content *) let template = List.nth t.open_elements ti in (match template.Dom.template_content with | Some tc -> (tc, None) | None -> (template, None)) | Some ti, Some tbi when ti < tbi -> (* Template is more recent than table, use template content *) let template = List.nth t.open_elements ti in (match template.Dom.template_content with | Some tc -> (tc, None) | None -> (template, None)) | _, Some tbi -> (* Use table's parent as foster parent *) let table = List.nth t.open_elements tbi in (match table.Dom.parent with | Some parent -> (parent, Some table) | None -> (* Step 6: element above table in stack (index + 1 since 0 is top) *) if tbi + 1 < List.length t.open_elements then (List.nth t.open_elements (tbi + 1), None) else (t.document, None)) | None, None -> (* No table or template, use document *) (t.document, None) end else begin (* If target is a template, insert into its content document fragment *) match target.Dom.template_content with | Some tc -> (tc, None) | None -> (target, None) end let insert_element t name ?(namespace=None) ?(push=false) attrs = let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in let node = Dom.create_element name ~namespace ~attrs ~location () in let (parent, before) = appropriate_insertion_place t in (match before with | None -> Dom.append_child parent node | Some ref -> Dom.insert_before parent node ref); if push then t.open_elements <- node :: t.open_elements; node let insert_element_for_token t (tag : Token.tag) = insert_element t tag.name ~push:true tag.attrs let insert_foreign_element t (tag : Token.tag) namespace = let attrs = if namespace = Some "svg" then Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs tag.attrs) else Parser_constants.adjust_foreign_attrs tag.attrs in let name = if namespace = Some "svg" then Parser_constants.adjust_svg_tag_name tag.name else tag.name in let node = insert_element t name ~namespace attrs in t.open_elements <- node :: t.open_elements; node let insert_character t data = if t.ignore_lf && String.length data > 0 && data.[0] = '\n' then begin t.ignore_lf <- false; if String.length data > 1 then begin let rest = String.sub data 1 (String.length data - 1) in let (parent, before) = appropriate_insertion_place t in Dom.insert_text_at parent rest before end end else begin t.ignore_lf <- false; let (parent, before) = appropriate_insertion_place t in Dom.insert_text_at parent data before end let insert_comment t data = let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in let node = Dom.create_comment ~location data in let (parent, _) = appropriate_insertion_place t in Dom.append_child parent node let insert_comment_to_document t data = let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in let node = Dom.create_comment ~location data in Dom.append_child t.document node (* Stack manipulation *) let pop_current t = match t.open_elements with | [] -> () | _ :: rest -> t.open_elements <- rest let pop_until t pred = let rec loop () = match t.open_elements with | [] -> () | x :: rest -> t.open_elements <- rest; if not (pred x) then loop () in loop () let pop_until_tag t name = pop_until t (fun n -> n.Dom.name = name) (* Pop until HTML namespace element with given name *) let pop_until_html_tag t name = pop_until t (fun n -> n.Dom.name = name && is_in_html_namespace n) let pop_until_one_of t names = pop_until t (fun n -> List.mem n.Dom.name names) (* Pop until HTML namespace element with one of given names *) let pop_until_html_one_of t names = pop_until t (fun n -> List.mem n.Dom.name names && is_in_html_namespace n) (* Check if element is an HTML integration point *) let is_html_integration_point node = (* SVG foreignObject, desc, and title are always HTML integration points *) if node.Dom.namespace = Some "svg" && Parser_constants.is_svg_html_integration node.Dom.name then true (* annotation-xml is an HTML integration point only with specific encoding values *) else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then match List.assoc_opt "encoding" node.Dom.attrs with | Some enc -> let enc_lower = String.lowercase_ascii enc in enc_lower = "text/html" || enc_lower = "application/xhtml+xml" | None -> false else false (* Check if element is a MathML text integration point *) let is_mathml_text_integration_point node = node.Dom.namespace = Some "mathml" && Parser_constants.is_mathml_text_integration node.Dom.name (* Scope checks - integration points also terminate scope (except for table scope) *) (* Per WHATWG spec, scope checks only consider HTML namespace elements for the target names *) let has_element_in_scope_impl t names exclude_list ~check_integration_points = let rec check = function | [] -> false | n :: rest -> (* Target elements must be in HTML namespace *) if is_in_html_namespace n && List.mem n.Dom.name names then true else if is_in_html_namespace n && List.mem n.Dom.name exclude_list then false (* Integration points terminate scope (unless we're checking table scope) *) else if check_integration_points && (is_html_integration_point n || is_mathml_text_integration_point n) then false else check rest in check t.open_elements let has_element_in_scope t name = has_element_in_scope_impl t [name] Parser_constants.default_scope ~check_integration_points:true let has_element_in_button_scope t name = has_element_in_scope_impl t [name] Parser_constants.button_scope ~check_integration_points:true let has_element_in_list_item_scope t name = has_element_in_scope_impl t [name] Parser_constants.list_item_scope ~check_integration_points:true let has_element_in_table_scope t name = has_element_in_scope_impl t [name] Parser_constants.table_scope ~check_integration_points:false let has_element_in_select_scope t name = let rec check = function | [] -> false | n :: rest -> if n.Dom.name = name then true else if not (Parser_constants.is_select_scope_exclude n.Dom.name) then false else check rest in check t.open_elements (* Implied end tags *) let generate_implied_end_tags t ?except () = let rec loop () = match current_node t with | Some n when Parser_constants.is_implied_end_tag n.Dom.name -> (match except with | Some ex when n.Dom.name = ex -> () | _ -> pop_current t; loop ()) | _ -> () in loop () let generate_all_implied_end_tags t = let rec loop () = match current_node t with | Some n when Parser_constants.is_thoroughly_implied_end_tag n.Dom.name -> pop_current t; loop () | _ -> () in loop () (* Active formatting elements *) let push_formatting_marker t = t.active_formatting <- Marker :: t.active_formatting let push_formatting_element t node name attrs = (* Noah's Ark: remove earlier identical elements (up to 3) *) let rec count_and_remove same acc = function | [] -> List.rev acc | Marker :: rest -> List.rev acc @ (Marker :: rest) | Entry e :: rest when e.name = name && e.attrs = attrs -> if same >= 2 then count_and_remove same acc rest (* Remove this one *) else count_and_remove (same + 1) (Entry e :: acc) rest | x :: rest -> count_and_remove same (x :: acc) rest in t.active_formatting <- count_and_remove 0 [] t.active_formatting; t.active_formatting <- Entry { name; attrs; node } :: t.active_formatting let clear_active_formatting_to_marker t = let rec loop = function | [] -> [] | Marker :: rest -> rest | _ :: rest -> loop rest in t.active_formatting <- loop t.active_formatting let reconstruct_active_formatting t = let rec find_to_reconstruct acc = function | [] -> acc | Marker :: _ -> acc | Entry e :: rest -> if List.exists (fun n -> n == e.node) t.open_elements then acc else find_to_reconstruct (Entry e :: acc) rest in let to_reconstruct = find_to_reconstruct [] t.active_formatting in List.iter (fun entry -> match entry with | Entry e -> let node = insert_element t e.name e.attrs in t.open_elements <- node :: t.open_elements; (* Update the entry to point to new node *) t.active_formatting <- List.map (fun x -> if x == entry then Entry { e with node } else x ) t.active_formatting | Marker -> () ) to_reconstruct (* Adoption agency algorithm - follows WHATWG spec *) let adoption_agency t tag_name = (* Step 1: If current node is subject and not in active formatting list, just pop *) (match current_node t with | Some n when n.Dom.name = tag_name -> let in_active = List.exists (function | Entry e -> e.name = tag_name | Marker -> false ) t.active_formatting in if not in_active then begin pop_current t; () (* Return early - this case is handled *) end | _ -> ()); (* Step 2: Outer loop *) let outer_loop_counter = ref 0 in let done_flag = ref false in while !outer_loop_counter < 8 && not !done_flag do incr outer_loop_counter; (* Step 3: Find formatting element in active formatting list *) let rec find_formatting_index idx = function | [] -> None | Marker :: _ -> None | Entry e :: rest -> if e.name = tag_name then Some (idx, e.node, e.attrs) else find_formatting_index (idx + 1) rest in let formatting_entry = find_formatting_index 0 t.active_formatting in match formatting_entry with | None -> (* No formatting element found - done *) done_flag := true | Some (fmt_idx, fmt_node, fmt_attrs) -> (* Step 4: Check if formatting element is in open elements *) if not (List.exists (fun n -> n == fmt_node) t.open_elements) then begin parse_error t "adoption-agency-1.2"; t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting; done_flag := true end (* Step 5: Check if formatting element is in scope *) else if not (has_element_in_scope t tag_name) then begin parse_error t "adoption-agency-1.3"; done_flag := true end else begin (* Step 6: Parse error if not current node *) (match current_node t with | Some n when n != fmt_node -> parse_error t "adoption-agency-1.3" | _ -> ()); (* Step 7: Find furthest block - first special element BELOW formatting element *) (* open_elements is [current(top)...html(bottom)], formatting element is somewhere in the middle *) (* We need the first special element going from formatting element toward current *) (* This is the "topmost" (closest to formatting element) special element that is "lower" (closer to current) *) let fmt_stack_idx = ref (-1) in List.iteri (fun i n -> if n == fmt_node then fmt_stack_idx := i) t.open_elements; let furthest_block = if !fmt_stack_idx <= 0 then None else begin (* Look from fmt_stack_idx-1 down to 0, find first special element *) let rec find_from_idx idx = if idx < 0 then None else let n = List.nth t.open_elements idx in if is_special_element n then Some n else find_from_idx (idx - 1) in find_from_idx (!fmt_stack_idx - 1) end in match furthest_block with | None -> (* Step 8: No furthest block - pop elements including formatting element *) pop_until t (fun n -> n == fmt_node); t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting; done_flag := true | Some fb -> (* Step 9: Let common ancestor be element immediately above formatting element *) let rec find_common_ancestor = function | [] -> None | n :: rest when n == fmt_node -> (match rest with x :: _ -> Some x | [] -> None) | _ :: rest -> find_common_ancestor rest in let common_ancestor = find_common_ancestor t.open_elements in (* Step 10: Bookmark starts after formatting element *) let bookmark = ref (fmt_idx + 1) in (* Step 11: Let last_node = furthest block *) let last_node = ref fb in (* Step 12: Inner loop *) (* The inner loop processes elements between furthest_block and formatting_element, removing non-formatting elements and reparenting formatting elements *) let inner_loop_counter = ref 0 in (* Get index of furthest block in open elements *) let fb_idx = ref 0 in List.iteri (fun i n -> if n == fb then fb_idx := i) t.open_elements; (* Start from element after furthest block (toward formatting element) *) let node_idx = ref (!fb_idx + 1) in while !node_idx < List.length t.open_elements && (List.nth t.open_elements !node_idx) != fmt_node do incr inner_loop_counter; let current_node = List.nth t.open_elements !node_idx in (* Step 12.3: Find node in active formatting list *) let rec find_node_in_formatting idx = function | [] -> None | Entry e :: _rest when e.node == current_node -> Some idx | _ :: rest -> find_node_in_formatting (idx + 1) rest in let node_fmt_idx = find_node_in_formatting 0 t.active_formatting in (* Step 12.4: If inner loop counter > 3 and node in active formatting, remove it *) let node_fmt_idx = match node_fmt_idx with | Some idx when !inner_loop_counter > 3 -> t.active_formatting <- List.filteri (fun i _ -> i <> idx) t.active_formatting; if idx < !bookmark then decr bookmark; None | x -> x in (* Step 12.5: If node not in active formatting, remove from stack and continue *) match node_fmt_idx with | None -> (* Remove from stack - this shifts indices *) t.open_elements <- List.filteri (fun i _ -> i <> !node_idx) t.open_elements (* Don't increment node_idx since we removed an element *) | Some af_idx -> (* Step 12.6: Create new element for node *) let (node_name, node_attrs) = match List.nth t.active_formatting af_idx with | Entry e -> (e.name, e.attrs) | Marker -> failwith "unexpected marker" in let new_node_elem = Dom.create_element node_name ~attrs:node_attrs () in (* Update active formatting with new node *) t.active_formatting <- List.mapi (fun i entry -> if i = af_idx then Entry { name = node_name; node = new_node_elem; attrs = node_attrs } else entry ) t.active_formatting; (* Replace node in open elements *) t.open_elements <- List.mapi (fun i n -> if i = !node_idx then new_node_elem else n ) t.open_elements; (* Step 12.7: If last_node is furthest block, update bookmark *) if !last_node == fb then bookmark := af_idx + 1; (* Step 12.8: Reparent last_node to new node *) (match !last_node.Dom.parent with | Some p -> Dom.remove_child p !last_node | None -> ()); Dom.append_child new_node_elem !last_node; (* Step 12.9: Let last_node = new node *) last_node := new_node_elem; (* Move to next element *) incr node_idx done; (* Step 13: Insert last_node into common ancestor *) (match common_ancestor with | Some ca -> (match !last_node.Dom.parent with | Some p -> Dom.remove_child p !last_node | None -> ()); (* Check if we need foster parenting *) if t.foster_parenting && Parser_constants.is_foster_parenting_element ca.Dom.name then begin (* Find table and insert before it *) let rec find_table = function | [] -> None | n :: rest when n.Dom.name = "table" -> Some (n, rest) | _ :: rest -> find_table rest in match find_table t.open_elements with | Some (table, _) -> (match table.Dom.parent with | Some parent -> Dom.insert_before parent !last_node table | None -> Dom.append_child ca !last_node) | None -> Dom.append_child ca !last_node end else begin (* If common ancestor is template, insert into its content *) match ca.Dom.template_content with | Some tc -> Dom.append_child tc !last_node | None -> Dom.append_child ca !last_node end | None -> ()); (* Step 14: Create new formatting element *) let new_formatting = Dom.create_element tag_name ~attrs:fmt_attrs () in (* Step 15: Move children of furthest block to new formatting element *) let fb_children = fb.Dom.children in List.iter (fun child -> Dom.remove_child fb child; Dom.append_child new_formatting child ) fb_children; (* Step 16: Append new formatting element to furthest block *) Dom.append_child fb new_formatting; (* Step 17: Remove old from active formatting, insert new at bookmark *) let new_entry = Entry { name = tag_name; node = new_formatting; attrs = fmt_attrs } in t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting; (* Adjust bookmark since we removed an element *) let adjusted_bookmark = if fmt_idx < !bookmark then !bookmark - 1 else !bookmark in let rec insert_at_bookmark idx acc = function | [] -> List.rev (new_entry :: acc) | x :: rest when idx = adjusted_bookmark -> List.rev_append acc (new_entry :: x :: rest) | x :: rest -> insert_at_bookmark (idx + 1) (x :: acc) rest in t.active_formatting <- insert_at_bookmark 0 [] t.active_formatting; (* Step 18: Remove formatting element from open elements, insert new after furthest block *) (* "After" in stack terms means new_formatting should be between fb and current node *) (* In our list orientation (current at index 0), this means new_formatting at lower index than fb *) t.open_elements <- List.filter (fun n -> n != fmt_node) t.open_elements; (* Find fb and insert new_formatting before it (lower index = closer to current) *) let rec insert_before acc = function | [] -> List.rev (new_formatting :: acc) | n :: rest when n == fb -> (* Insert new_formatting before fb: acc reversed, then new_formatting, then fb, then rest *) List.rev_append acc (new_formatting :: n :: rest) | n :: rest -> insert_before (n :: acc) rest in t.open_elements <- insert_before [] t.open_elements (* Continue outer loop *) end done (* Close p element *) let close_p_element t = generate_implied_end_tags t ~except:"p" (); (match current_node t with | Some n when n.Dom.name <> "p" -> parse_error t "end-tag-p-implied-but-open-elements" | _ -> ()); pop_until_tag t "p" (* Reset insertion mode *) let reset_insertion_mode t = let rec check_node last = function | [] -> t.mode <- Parser_insertion_mode.In_body | node :: rest -> let is_last = rest = [] in let node_to_check = if is_last then match t.fragment_context with | Some ctx -> Dom.create_element ctx.tag_name ~namespace:ctx.namespace () | None -> node else node in let name = node_to_check.Dom.name in if name = "select" then begin if not is_last then begin let rec find_table_or_template = function | [] -> () | n :: rest -> if n.Dom.name = "template" then t.mode <- Parser_insertion_mode.In_select else if n.Dom.name = "table" then t.mode <- Parser_insertion_mode.In_select_in_table else find_table_or_template rest in find_table_or_template rest end; if t.mode <> Parser_insertion_mode.In_select_in_table then t.mode <- Parser_insertion_mode.In_select end else if Parser_constants.is_table_cell_element name && not is_last then t.mode <- Parser_insertion_mode.In_cell else if name = "tr" then t.mode <- Parser_insertion_mode.In_row else if Parser_constants.is_table_section_element name then t.mode <- Parser_insertion_mode.In_table_body else if name = "caption" then t.mode <- Parser_insertion_mode.In_caption else if name = "colgroup" then t.mode <- Parser_insertion_mode.In_column_group else if name = "table" then t.mode <- Parser_insertion_mode.In_table else if name = "template" then t.mode <- (match t.template_modes with m :: _ -> m | [] -> Parser_insertion_mode.In_template) else if name = "head" && not is_last then t.mode <- Parser_insertion_mode.In_head else if name = "body" then t.mode <- Parser_insertion_mode.In_body else if name = "frameset" then t.mode <- Parser_insertion_mode.In_frameset else if name = "html" then t.mode <- (if t.head_element = None then Parser_insertion_mode.Before_head else Parser_insertion_mode.After_head) else if is_last then t.mode <- Parser_insertion_mode.In_body else check_node last rest in check_node false t.open_elements let is_whitespace s = let ws = [' '; '\t'; '\n'; '\x0C'; '\r'] in String.for_all (fun c -> List.mem c ws) s (* Mode handlers *) let rec process_initial t token = match token with | Token.Character data when is_whitespace data -> () | Token.Comment data -> insert_comment_to_document t data | Token.Doctype dt -> let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in let node = Dom.create_doctype ?name:dt.name ?public_id:dt.public_id ?system_id:dt.system_id ~location () in Dom.append_child t.document node; (* Quirks mode detection *) if dt.force_quirks then t.quirks_mode <- Dom.Quirks else if dt.name <> Some "html" then t.quirks_mode <- Dom.Quirks else begin let pub = Option.map String.lowercase_ascii dt.public_id in let sys = Option.map String.lowercase_ascii dt.system_id in let is_quirky = (match pub with | Some p -> List.mem p Parser_constants.quirky_public_matches || List.exists (fun prefix -> String.length p >= String.length prefix && String.sub p 0 (String.length prefix) = prefix) Parser_constants.quirky_public_prefixes | None -> false) || (match sys with | Some s -> List.mem s Parser_constants.quirky_system_matches | None -> false) in if is_quirky then t.quirks_mode <- Dom.Quirks else begin let is_limited_quirky = match pub with | Some p -> List.exists (fun prefix -> String.length p >= String.length prefix && String.sub p 0 (String.length prefix) = prefix) Parser_constants.limited_quirky_public_prefixes | None -> false in if is_limited_quirky then t.quirks_mode <- Dom.Limited_quirks end end; t.mode <- Parser_insertion_mode.Before_html | _ -> parse_error t "expected-doctype-but-got-other"; t.quirks_mode <- Dom.Quirks; t.mode <- Parser_insertion_mode.Before_html; process_token t token and process_before_html t token = match token with | Token.Doctype _ -> parse_error t "unexpected-doctype" | Token.Comment data -> insert_comment_to_document t data | Token.Character data when is_whitespace data -> () | Token.Tag { kind = Token.Start; name = "html"; attrs; _ } -> let html = insert_element t "html" attrs in t.open_elements <- [html]; t.mode <- Parser_insertion_mode.Before_head | Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] -> let html = insert_element t "html" [] in t.open_elements <- [html]; t.mode <- Parser_insertion_mode.Before_head; process_token t token | Token.Tag { kind = Token.End; name; _ } -> parse_error t ("unexpected-end-tag:" ^ name) | _ -> let html = insert_element t "html" [] in t.open_elements <- [html]; t.mode <- Parser_insertion_mode.Before_head; process_token t token and process_before_head t token = match token with | Token.Character data when is_whitespace data -> () | Token.Comment data -> insert_comment t data | Token.Doctype _ -> parse_error t "unexpected-doctype" | Token.Tag { kind = Token.Start; name = "html"; _ } -> process_in_body t token | Token.Tag { kind = Token.Start; name = "head"; attrs; _ } -> let head = insert_element t "head" attrs in t.open_elements <- head :: t.open_elements; t.head_element <- Some head; t.mode <- Parser_insertion_mode.In_head | Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] -> let head = insert_element t "head" [] in t.open_elements <- head :: t.open_elements; t.head_element <- Some head; t.mode <- Parser_insertion_mode.In_head; process_token t token | Token.Tag { kind = Token.End; name; _ } -> parse_error t ("unexpected-end-tag:" ^ name) | _ -> let head = insert_element t "head" [] in t.open_elements <- head :: t.open_elements; t.head_element <- Some head; t.mode <- Parser_insertion_mode.In_head; process_token t token and process_in_head t token = match token with | Token.Character data when is_whitespace data -> insert_character t data | Token.Character data -> (* Extract leading whitespace *) let rec count_leading_ws i = if i >= String.length data then i else match data.[i] with | '\t' | '\n' | '\x0C' | '\r' | ' ' -> count_leading_ws (i + 1) | _ -> i in let ws_count = count_leading_ws 0 in let leading_ws = String.sub data 0 ws_count in let remaining = String.sub data ws_count (String.length data - ws_count) in (* If there's leading whitespace and current element has children, insert it *) if leading_ws <> "" then (match current_node t with | Some n when n.Dom.children <> [] -> insert_character t leading_ws | _ -> ()); pop_current t; t.mode <- Parser_insertion_mode.After_head; process_token t (Token.Character remaining) | Token.Comment data -> insert_comment t data | Token.Doctype _ -> parse_error t "unexpected-doctype" | Token.Tag { kind = Token.Start; name = "html"; _ } -> process_in_body t token | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"] -> ignore (insert_element t name attrs) | Token.Tag { kind = Token.Start; name = "title"; attrs; self_closing } -> ignore (insert_element_for_token t { kind = Token.Start; name = "title"; attrs; self_closing }); t.original_mode <- Some t.mode; t.mode <- Parser_insertion_mode.Text | Token.Tag { kind = Token.Start; name; attrs; self_closing } when List.mem name ["noframes"; "style"] -> ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing }); t.original_mode <- Some t.mode; t.mode <- Parser_insertion_mode.Text | Token.Tag { kind = Token.Start; name = "noscript"; attrs; self_closing } -> (* Scripting is disabled: parse noscript content as HTML *) ignore (insert_element_for_token t { kind = Token.Start; name = "noscript"; attrs; self_closing }); t.mode <- Parser_insertion_mode.In_head_noscript | Token.Tag { kind = Token.Start; name = "script"; attrs; self_closing } -> ignore (insert_element_for_token t { kind = Token.Start; name = "script"; attrs; self_closing }); t.original_mode <- Some t.mode; t.mode <- Parser_insertion_mode.Text | Token.Tag { kind = Token.End; name = "head"; _ } -> pop_current t; t.mode <- Parser_insertion_mode.After_head | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] -> pop_current t; t.mode <- Parser_insertion_mode.After_head; process_token t token | Token.Tag { kind = Token.Start; name = "template"; attrs; _ } -> let node = Dom.create_template ~attrs () in let (parent, _) = appropriate_insertion_place t in Dom.append_child parent node; t.open_elements <- node :: t.open_elements; push_formatting_marker t; t.frameset_ok <- false; t.mode <- Parser_insertion_mode.In_template; t.template_modes <- Parser_insertion_mode.In_template :: t.template_modes | Token.Tag { kind = Token.End; name = "template"; _ } -> if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then parse_error t "unexpected-end-tag" else begin generate_all_implied_end_tags t; (match current_node t with | Some n when not (n.Dom.name = "template" && is_in_html_namespace n) -> parse_error t "unexpected-end-tag" | _ -> ()); pop_until_html_tag t "template"; clear_active_formatting_to_marker t; t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); reset_insertion_mode t end | Token.Tag { kind = Token.Start; name = "head"; _ } -> parse_error t "unexpected-start-tag" | Token.Tag { kind = Token.End; name; _ } -> parse_error t ("unexpected-end-tag:" ^ name) | _ -> pop_current t; t.mode <- Parser_insertion_mode.After_head; process_token t token and process_in_head_noscript t token = match token with | Token.Character data when is_whitespace data -> process_in_head t token | Token.Character _ -> parse_error t "unexpected-char-in-noscript"; pop_current t; (* Pop noscript *) t.mode <- Parser_insertion_mode.In_head; process_token t token | Token.Comment _ -> process_in_head t token | Token.Doctype _ -> parse_error t "unexpected-doctype" | Token.Tag { kind = Token.Start; name = "html"; _ } -> process_in_body t token | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["basefont"; "bgsound"; "link"; "meta"; "noframes"; "style"] -> process_in_head t token | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["head"; "noscript"] -> parse_error t "unexpected-start-tag" | Token.Tag { kind = Token.Start; name; _ } -> parse_error t ("bad-start-tag-in-head-noscript:" ^ name); pop_current t; (* Pop noscript *) t.mode <- Parser_insertion_mode.In_head; process_token t token | Token.Tag { kind = Token.End; name = "noscript"; _ } -> pop_current t; (* Pop noscript *) t.mode <- Parser_insertion_mode.In_head | Token.Tag { kind = Token.End; name = "br"; _ } -> parse_error t "unexpected-end-tag"; pop_current t; (* Pop noscript *) t.mode <- Parser_insertion_mode.In_head; process_token t token | Token.Tag { kind = Token.End; name; _ } -> parse_error t ("unexpected-end-tag:" ^ name) | Token.EOF -> parse_error t "expected-closing-tag-but-got-eof"; pop_current t; (* Pop noscript *) t.mode <- Parser_insertion_mode.In_head; process_token t token and process_after_head t token = match token with | Token.Character data when is_whitespace data -> insert_character t data | Token.Comment data -> insert_comment t data | Token.Doctype _ -> parse_error t "unexpected-doctype" | Token.Tag { kind = Token.Start; name = "html"; _ } -> process_in_body t token | Token.Tag { kind = Token.Start; name = "body"; attrs; _ } -> ignore (insert_element t "body" ~push:true attrs); t.frameset_ok <- false; t.mode <- Parser_insertion_mode.In_body | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } -> ignore (insert_element t "frameset" ~push:true attrs); t.mode <- Parser_insertion_mode.In_frameset | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } -> (* Special handling for input type="hidden" - parse error, ignore *) let input_type = List.assoc_opt "type" attrs in (match input_type with | Some typ when String.lowercase_ascii typ = "hidden" -> parse_error t "unexpected-hidden-input-after-head" | _ -> (* Non-hidden input creates body *) let body = insert_element t "body" [] in t.open_elements <- body :: t.open_elements; t.mode <- Parser_insertion_mode.In_body; process_token t token) | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] -> parse_error t "unexpected-start-tag"; (match t.head_element with | Some head -> t.open_elements <- head :: t.open_elements; process_in_head t token; t.open_elements <- List.filter (fun n -> n != head) t.open_elements | None -> ()) | Token.Tag { kind = Token.End; name = "template"; _ } -> process_in_head t token | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] -> let body = insert_element t "body" [] in t.open_elements <- body :: t.open_elements; t.mode <- Parser_insertion_mode.In_body; process_token t token | Token.Tag { kind = Token.Start; name = "head"; _ } -> parse_error t "unexpected-start-tag" | Token.Tag { kind = Token.End; name; _ } -> parse_error t ("unexpected-end-tag:" ^ name) | _ -> let body = insert_element t "body" [] in t.open_elements <- body :: t.open_elements; t.mode <- Parser_insertion_mode.In_body; process_token t token and process_in_body t token = match token with | Token.Character "\x00" -> parse_error t "unexpected-null-character" | Token.Character data -> reconstruct_active_formatting t; insert_character t data; if not (is_whitespace data) then t.frameset_ok <- false | Token.Comment data -> insert_comment t data | Token.Doctype _ -> parse_error t "unexpected-doctype" | Token.Tag { kind = Token.Start; name = "html"; attrs; _ } -> parse_error t "unexpected-start-tag"; if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then (* Find the html element (at the bottom of the stack) *) let html_elem = List.find_opt (fun n -> n.Dom.name = "html") t.open_elements in (match html_elem with | Some html -> List.iter (fun (k, v) -> if not (Dom.has_attr html k) then Dom.set_attr html k v ) attrs | None -> ()) | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] -> process_in_head t token | Token.Tag { kind = Token.End; name = "template"; _ } -> process_in_head t token | Token.Tag { kind = Token.Start; name = "body"; attrs; _ } -> parse_error t "unexpected-start-tag"; (* Find body element on stack - it should be near the end (html is last) *) let body = List.find_opt (fun n -> n.Dom.name = "body") t.open_elements in (match body with | Some body when not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) -> t.frameset_ok <- false; List.iter (fun (k, v) -> if not (Dom.has_attr body k) then Dom.set_attr body k v ) attrs | _ -> ()) | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } -> if not t.frameset_ok then parse_error t "unexpected-start-tag-ignored" else begin (* Find body element on the stack *) let rec find_body_index idx = function | [] -> None | n :: rest -> if n.Dom.name = "body" then Some (idx, n) else find_body_index (idx + 1) rest in match find_body_index 0 t.open_elements with | None -> parse_error t "unexpected-start-tag-ignored" | Some (idx, body_elem) -> (* Remove body from its parent (the html element) *) (match body_elem.Dom.parent with | Some parent -> Dom.remove_child parent body_elem | None -> ()); (* Pop all elements up to and including body - keep only elements after body_idx *) let rec drop n lst = if n <= 0 then lst else match lst with [] -> [] | _ :: rest -> drop (n - 1) rest in t.open_elements <- drop (idx + 1) t.open_elements; (* Insert frameset element *) ignore (insert_element t "frameset" ~push:true attrs); t.mode <- Parser_insertion_mode.In_frameset end | Token.EOF -> if t.template_modes <> [] then process_in_template t token else begin let has_unclosed = List.exists (fun n -> not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"]) ) t.open_elements in if has_unclosed then parse_error t "expected-closing-tag-but-got-eof" end | Token.Tag { kind = Token.End; name = "body"; _ } -> if not (has_element_in_scope t "body") then parse_error t "unexpected-end-tag" else begin let has_unclosed = List.exists (fun n -> not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"]) ) t.open_elements in if has_unclosed then parse_error t "end-tag-too-early"; t.mode <- Parser_insertion_mode.After_body end | Token.Tag { kind = Token.End; name = "html"; _ } -> if not (has_element_in_scope t "body") then parse_error t "unexpected-end-tag" else begin t.mode <- Parser_insertion_mode.After_body; process_token t token end | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["address"; "article"; "aside"; "blockquote"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "main"; "menu"; "nav"; "ol"; "p"; "search"; "section"; "summary"; "ul"] -> if has_element_in_button_scope t "p" then close_p_element t; ignore (insert_element t name ~push:true attrs) | Token.Tag { kind = Token.Start; name; attrs; _ } when Parser_constants.is_heading_element name -> if has_element_in_button_scope t "p" then close_p_element t; (match current_node t with | Some n when Parser_constants.is_heading_element n.Dom.name -> parse_error t "unexpected-start-tag"; pop_current t | _ -> ()); ignore (insert_element t name ~push:true attrs) | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["pre"; "listing"] -> if has_element_in_button_scope t "p" then close_p_element t; ignore (insert_element t name ~push:true attrs); t.ignore_lf <- true; t.frameset_ok <- false | Token.Tag { kind = Token.Start; name = "form"; attrs; _ } -> if t.form_element <> None && not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then parse_error t "unexpected-start-tag" else begin if has_element_in_button_scope t "p" then close_p_element t; let form = insert_element t "form" attrs in t.open_elements <- form :: t.open_elements; if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then t.form_element <- Some form end | Token.Tag { kind = Token.Start; name = "li"; attrs; _ } -> t.frameset_ok <- false; let rec check = function | [] -> () | n :: rest -> if n.Dom.name = "li" then begin generate_implied_end_tags t ~except:"li" (); (match current_node t with | Some cn when cn.Dom.name <> "li" -> parse_error t "unexpected-start-tag" | _ -> ()); pop_until_tag t "li" end else if is_special_element n && not (List.mem (String.lowercase_ascii n.Dom.name) ["address"; "div"; "p"]) then () else check rest in check t.open_elements; if has_element_in_button_scope t "p" then close_p_element t; ignore (insert_element t "li" ~push:true attrs) | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["dd"; "dt"] -> t.frameset_ok <- false; let rec check = function | [] -> () | n :: rest -> if List.mem n.Dom.name ["dd"; "dt"] then begin generate_implied_end_tags t ~except:n.Dom.name (); (match current_node t with | Some cn when cn.Dom.name <> n.Dom.name -> parse_error t "unexpected-start-tag" | _ -> ()); pop_until_one_of t ["dd"; "dt"] end else if is_special_element n && not (List.mem (String.lowercase_ascii n.Dom.name) ["address"; "div"; "p"]) then () else check rest in check t.open_elements; if has_element_in_button_scope t "p" then close_p_element t; ignore (insert_element t name ~push:true attrs) | Token.Tag { kind = Token.Start; name = "plaintext"; _ } -> if has_element_in_button_scope t "p" then close_p_element t; ignore (insert_element t "plaintext" ~push:true []) (* Tokenizer should switch to PLAINTEXT state *) | Token.Tag { kind = Token.Start; name = "button"; attrs; _ } -> if has_element_in_scope t "button" then begin parse_error t "unexpected-start-tag"; generate_implied_end_tags t (); pop_until_tag t "button" end; reconstruct_active_formatting t; ignore (insert_element t "button" ~push:true attrs); t.frameset_ok <- false | Token.Tag { kind = Token.End; name; _ } when List.mem name ["address"; "article"; "aside"; "blockquote"; "button"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "listing"; "main"; "menu"; "nav"; "ol"; "pre"; "search"; "section"; "summary"; "ul"] -> if not (has_element_in_scope t name) then parse_error t ("unexpected-end-tag:" ^ name) else begin generate_implied_end_tags t (); (match current_node t with | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" | _ -> ()); pop_until_tag t name end | Token.Tag { kind = Token.End; name = "form"; _ } -> if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then begin let node = t.form_element in t.form_element <- None; match node with | None -> parse_error t "unexpected-end-tag" | Some form_node -> if not (has_element_in_scope t "form") then parse_error t "unexpected-end-tag" else begin generate_implied_end_tags t (); (match current_node t with | Some n when n != form_node -> parse_error t "end-tag-too-early" | _ -> ()); t.open_elements <- List.filter (fun n -> n != form_node) t.open_elements end end else begin if not (has_element_in_scope t "form") then parse_error t "unexpected-end-tag" else begin generate_implied_end_tags t (); (match current_node t with | Some n when n.Dom.name <> "form" -> parse_error t "end-tag-too-early" | _ -> ()); pop_until_tag t "form" end end | Token.Tag { kind = Token.End; name = "p"; _ } -> if not (has_element_in_button_scope t "p") then begin parse_error t "no-p-element-in-scope"; ignore (insert_element t "p" ~push:true []) end; close_p_element t | Token.Tag { kind = Token.End; name = "li"; _ } -> if not (has_element_in_list_item_scope t "li") then parse_error t "unexpected-end-tag" else begin generate_implied_end_tags t ~except:"li" (); (match current_node t with | Some n when n.Dom.name <> "li" -> parse_error t "end-tag-too-early" | _ -> ()); pop_until_tag t "li" end | Token.Tag { kind = Token.End; name; _ } when List.mem name ["dd"; "dt"] -> if not (has_element_in_scope t name) then parse_error t "unexpected-end-tag" else begin generate_implied_end_tags t ~except:name (); (match current_node t with | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" | _ -> ()); pop_until_tag t name end | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_heading_element name -> if not (has_element_in_scope_impl t Parser_constants.heading_elements Parser_constants.default_scope ~check_integration_points:true) then parse_error t "unexpected-end-tag" else begin generate_implied_end_tags t (); (match current_node t with | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" | _ -> ()); pop_until_one_of t Parser_constants.heading_elements end | Token.Tag { kind = Token.Start; name = "a"; attrs; _ } -> (* Check for existing in active formatting *) let rec find_a = function | [] -> None | Marker :: _ -> None | Entry e :: _ when e.name = "a" -> Some e.node | _ :: rest -> find_a rest in (match find_a t.active_formatting with | Some existing -> parse_error t "unexpected-start-tag"; adoption_agency t "a"; t.active_formatting <- List.filter (function | Entry e -> e.node != existing | _ -> true ) t.active_formatting; t.open_elements <- List.filter (fun n -> n != existing) t.open_elements | None -> ()); reconstruct_active_formatting t; let node = insert_element t "a" attrs in t.open_elements <- node :: t.open_elements; push_formatting_element t node "a" attrs | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["b"; "big"; "code"; "em"; "font"; "i"; "s"; "small"; "strike"; "strong"; "tt"; "u"] -> reconstruct_active_formatting t; let node = insert_element t name attrs in t.open_elements <- node :: t.open_elements; push_formatting_element t node name attrs | Token.Tag { kind = Token.Start; name = "nobr"; attrs; _ } -> if has_element_in_scope t "nobr" then begin parse_error t "unexpected-start-tag"; adoption_agency t "nobr"; (* Remove nobr from active formatting *) t.active_formatting <- List.filter (function | Entry e -> e.name <> "nobr" | Marker -> true ) t.active_formatting; (* Remove nobr from open elements *) t.open_elements <- List.filter (fun n -> n.Dom.name <> "nobr") t.open_elements end; reconstruct_active_formatting t; let node = insert_element t "nobr" attrs in t.open_elements <- node :: t.open_elements; push_formatting_element t node "nobr" attrs | Token.Tag { kind = Token.End; name; _ } when List.mem name ["a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small"; "strike"; "strong"; "tt"; "u"] -> adoption_agency t name | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["applet"; "marquee"; "object"] -> reconstruct_active_formatting t; ignore (insert_element t name ~push:true attrs); push_formatting_marker t; t.frameset_ok <- false | Token.Tag { kind = Token.End; name; _ } when List.mem name ["applet"; "marquee"; "object"] -> if not (has_element_in_scope t name) then parse_error t "unexpected-end-tag" else begin generate_implied_end_tags t (); (match current_node t with | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" | _ -> ()); pop_until_tag t name; clear_active_formatting_to_marker t end | Token.Tag { kind = Token.Start; name = "table"; attrs; _ } -> if t.quirks_mode <> Dom.Quirks && has_element_in_button_scope t "p" then close_p_element t; ignore (insert_element t "table" ~push:true attrs); t.frameset_ok <- false; t.mode <- Parser_insertion_mode.In_table | Token.Tag { kind = Token.End; name = "br"; _ } -> parse_error t "end-tag-br"; reconstruct_active_formatting t; ignore (insert_element t "br" ~push:true []); pop_current t; t.frameset_ok <- false | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["area"; "br"; "embed"; "img"; "keygen"; "wbr"] -> reconstruct_active_formatting t; ignore (insert_element t name ~push:true attrs); pop_current t; t.frameset_ok <- false | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } -> reconstruct_active_formatting t; ignore (insert_element t "input" ~push:true attrs); pop_current t; let is_hidden = List.exists (fun (k, v) -> String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden" ) attrs in if not is_hidden then t.frameset_ok <- false | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["param"; "source"; "track"] -> ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing = false }); pop_current t | Token.Tag { kind = Token.Start; name = "hr"; _ } -> if has_element_in_button_scope t "p" then close_p_element t; ignore (insert_element t "hr" ~push:true []); pop_current t; t.frameset_ok <- false | Token.Tag { kind = Token.Start; name = "image"; attrs; _ } -> parse_error t "unexpected-start-tag"; (* Treat as *) reconstruct_active_formatting t; ignore (insert_element t "img" ~push:true attrs); pop_current t; t.frameset_ok <- false | Token.Tag { kind = Token.Start; name = "textarea"; attrs; _ } -> ignore (insert_element t "textarea" ~push:true attrs); t.ignore_lf <- true; t.original_mode <- Some t.mode; t.frameset_ok <- false; t.mode <- Parser_insertion_mode.Text | Token.Tag { kind = Token.Start; name = "xmp"; attrs; _ } -> if has_element_in_button_scope t "p" then close_p_element t; reconstruct_active_formatting t; t.frameset_ok <- false; ignore (insert_element_for_token t { kind = Token.Start; name = "xmp"; attrs; self_closing = false }); t.original_mode <- Some t.mode; t.mode <- Parser_insertion_mode.Text | Token.Tag { kind = Token.Start; name = "iframe"; attrs; _ } -> t.frameset_ok <- false; ignore (insert_element_for_token t { kind = Token.Start; name = "iframe"; attrs; self_closing = false }); t.original_mode <- Some t.mode; t.mode <- Parser_insertion_mode.Text | Token.Tag { kind = Token.Start; name = "noembed"; attrs; _ } -> ignore (insert_element_for_token t { kind = Token.Start; name = "noembed"; attrs; self_closing = false }); t.original_mode <- Some t.mode; t.mode <- Parser_insertion_mode.Text | Token.Tag { kind = Token.Start; name = "select"; attrs; _ } -> reconstruct_active_formatting t; ignore (insert_element t "select" ~push:true attrs); t.frameset_ok <- false; if List.mem t.mode [Parser_insertion_mode.In_table; Parser_insertion_mode.In_caption; Parser_insertion_mode.In_table_body; Parser_insertion_mode.In_row; Parser_insertion_mode.In_cell] then t.mode <- Parser_insertion_mode.In_select_in_table else t.mode <- Parser_insertion_mode.In_select | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["optgroup"; "option"] -> (match current_node t with | Some n when n.Dom.name = "option" -> pop_current t | _ -> ()); reconstruct_active_formatting t; ignore (insert_element t name ~push:true attrs) | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["rb"; "rtc"] -> if has_element_in_scope t "ruby" then begin generate_implied_end_tags t () end; (match current_node t with | Some n when n.Dom.name <> "ruby" && n.Dom.name <> "rtc" -> parse_error t "unexpected-start-tag" | _ -> ()); ignore (insert_element t name ~push:true attrs) | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["rp"; "rt"] -> if has_element_in_scope t "ruby" then begin generate_implied_end_tags t ~except:"rtc" () end; (match current_node t with | Some n when n.Dom.name <> "ruby" && n.Dom.name <> "rtc" -> parse_error t "unexpected-start-tag" | _ -> ()); ignore (insert_element t name ~push:true attrs) | Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } -> reconstruct_active_formatting t; let adjusted_attrs = Parser_constants.adjust_mathml_attrs (Parser_constants.adjust_foreign_attrs attrs) in ignore (insert_foreign_element t { kind = Token.Start; name = "math"; attrs = adjusted_attrs; self_closing } (Some "mathml")); if self_closing then pop_current t | Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } -> reconstruct_active_formatting t; let adjusted_attrs = Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs attrs) in ignore (insert_foreign_element t { kind = Token.Start; name = "svg"; attrs = adjusted_attrs; self_closing } (Some "svg")); if self_closing then pop_current t | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["col"; "frame"] -> (* In fragment context, insert these; otherwise ignore *) if t.fragment_context = None then parse_error t "unexpected-start-tag-ignored" else ignore (insert_element t name attrs) | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["caption"; "colgroup"; "head"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] -> parse_error t "unexpected-start-tag" | Token.Tag { kind = Token.Start; name; attrs; self_closing } -> (* Any other start tag *) reconstruct_active_formatting t; ignore (insert_element t name ~push:true attrs); (* Check for self-closing on non-void HTML element *) if self_closing && not (Parser_constants.is_void_element name) then parse_error t "non-void-html-element-start-tag-with-trailing-solidus" | Token.Tag { kind = Token.End; name; _ } -> (* Any other end tag *) let rec check = function | [] -> () | node :: rest -> if node.Dom.name = name then begin generate_implied_end_tags t ~except:name (); (match current_node t with | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" | _ -> ()); pop_until t (fun n -> n == node) end else if is_special_element node then parse_error t ("unexpected-end-tag:" ^ name) else check rest in check t.open_elements and process_text t token = match token with | Token.Character data -> insert_character t data | Token.EOF -> parse_error t "expected-closing-tag-but-got-eof"; pop_current t; t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_body; process_token t token | Token.Tag { kind = Token.End; _ } -> pop_current t; t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_body | _ -> () and process_in_table t token = match token with | Token.Character _ when (match current_node t with Some n -> Parser_constants.is_foster_parenting_element n.Dom.name | None -> false) -> t.pending_table_chars <- []; t.original_mode <- Some t.mode; t.mode <- Parser_insertion_mode.In_table_text; process_token t token | Token.Comment data -> insert_comment t data | Token.Doctype _ -> parse_error t "unexpected-doctype" | Token.Tag { kind = Token.Start; name = "caption"; attrs; _ } -> clear_stack_back_to_table_context t; push_formatting_marker t; ignore (insert_element t "caption" ~push:true attrs); t.mode <- Parser_insertion_mode.In_caption | Token.Tag { kind = Token.Start; name = "colgroup"; attrs; _ } -> clear_stack_back_to_table_context t; ignore (insert_element t "colgroup" ~push:true attrs); t.mode <- Parser_insertion_mode.In_column_group | Token.Tag { kind = Token.Start; name = "col"; _ } -> clear_stack_back_to_table_context t; ignore (insert_element t "colgroup" ~push:true []); t.mode <- Parser_insertion_mode.In_column_group; process_token t token | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["tbody"; "tfoot"; "thead"] -> clear_stack_back_to_table_context t; ignore (insert_element t name ~push:true attrs); t.mode <- Parser_insertion_mode.In_table_body | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"; "tr"] -> clear_stack_back_to_table_context t; ignore (insert_element t "tbody" ~push:true []); t.mode <- Parser_insertion_mode.In_table_body; process_token t token | Token.Tag { kind = Token.Start; name = "table"; _ } -> parse_error t "unexpected-start-tag"; if has_element_in_table_scope t "table" then begin pop_until_tag t "table"; reset_insertion_mode t; process_token t token end | Token.Tag { kind = Token.End; name = "table"; _ } -> if not (has_element_in_table_scope t "table") then parse_error t "unexpected-end-tag" else begin pop_until_tag t "table"; reset_insertion_mode t end | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] -> parse_error t "unexpected-end-tag" | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["style"; "script"; "template"] -> process_in_head t token | Token.Tag { kind = Token.End; name = "template"; _ } -> process_in_head t token | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } -> let is_hidden = List.exists (fun (k, v) -> String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden" ) attrs in if not is_hidden then begin parse_error t "start-tag-in-table:input"; t.foster_parenting <- true; process_in_body t token; t.foster_parenting <- false end else begin parse_error t "start-tag-in-table:input"; ignore (insert_element t "input" ~push:true attrs); pop_current t end | Token.Tag { kind = Token.Start; name = "form"; attrs; _ } -> parse_error t "unexpected-start-tag"; if t.form_element = None && not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then begin let form = insert_element t "form" attrs in t.open_elements <- form :: t.open_elements; t.form_element <- Some form; pop_current t end | Token.EOF -> process_in_body t token | _ -> parse_error t "unexpected-token-in-table"; t.foster_parenting <- true; process_in_body t token; t.foster_parenting <- false and clear_stack_back_to_table_context t = let rec loop () = match current_node t with | Some n when not (List.mem n.Dom.name ["table"; "template"; "html"]) -> pop_current t; loop () | _ -> () in loop () and process_in_table_text t token = match token with | Token.Character data -> if String.contains data '\x00' then parse_error t "unexpected-null-character" else t.pending_table_chars <- data :: t.pending_table_chars | _ -> let pending = String.concat "" (List.rev t.pending_table_chars) in t.pending_table_chars <- []; if not (is_whitespace pending) then begin parse_error t "unexpected-character-in-table"; t.foster_parenting <- true; reconstruct_active_formatting t; insert_character t pending; t.foster_parenting <- false end else insert_character t pending; t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_table; process_token t token and process_in_caption t token = match token with | Token.Tag { kind = Token.End; name = "caption"; _ } -> if not (has_element_in_table_scope t "caption") then parse_error t "unexpected-end-tag" else begin generate_implied_end_tags t (); (match current_node t with | Some n when n.Dom.name <> "caption" -> parse_error t "end-tag-too-early" | _ -> ()); pop_until_tag t "caption"; clear_active_formatting_to_marker t; t.mode <- Parser_insertion_mode.In_table end | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] -> if not (has_element_in_table_scope t "caption") then parse_error t "unexpected-start-tag" else begin generate_implied_end_tags t (); pop_until_tag t "caption"; clear_active_formatting_to_marker t; t.mode <- Parser_insertion_mode.In_table; process_token t token end | Token.Tag { kind = Token.End; name = "table"; _ } -> if not (has_element_in_table_scope t "caption") then parse_error t "unexpected-end-tag" else begin generate_implied_end_tags t (); pop_until_tag t "caption"; clear_active_formatting_to_marker t; t.mode <- Parser_insertion_mode.In_table; process_token t token end | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "col"; "colgroup"; "html"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] -> parse_error t "unexpected-end-tag" | _ -> process_in_body t token and process_in_column_group t token = match token with | Token.Character data when is_whitespace data -> insert_character t data | Token.Character data -> (* Split leading whitespace from non-whitespace *) let ws_chars = [' '; '\t'; '\n'; '\x0C'; '\r'] in let len = String.length data in let ws_end = ref 0 in while !ws_end < len && List.mem data.[!ws_end] ws_chars do incr ws_end done; if !ws_end > 0 then insert_character t (String.sub data 0 !ws_end); if !ws_end < len then begin let remaining = String.sub data !ws_end (len - !ws_end) in (match current_node t with | Some n when n.Dom.name = "colgroup" -> pop_current t; t.mode <- Parser_insertion_mode.In_table; process_token t (Token.Character remaining) | _ -> parse_error t "unexpected-token") end | Token.Comment data -> insert_comment t data | Token.Doctype _ -> parse_error t "unexpected-doctype" | Token.Tag { kind = Token.Start; name = "html"; _ } -> process_in_body t token | Token.Tag { kind = Token.Start; name = "col"; attrs; _ } -> ignore (insert_element t "col" ~push:true attrs); pop_current t | Token.Tag { kind = Token.End; name = "colgroup"; _ } -> (match current_node t with | Some n when n.Dom.name <> "colgroup" -> parse_error t "unexpected-end-tag" | Some _ -> pop_current t; t.mode <- Parser_insertion_mode.In_table | None -> parse_error t "unexpected-end-tag") | Token.Tag { kind = Token.End; name = "col"; _ } -> parse_error t "unexpected-end-tag" | Token.Tag { kind = Token.Start; name = "template"; _ } | Token.Tag { kind = Token.End; name = "template"; _ } -> process_in_head t token | Token.EOF -> process_in_body t token | _ -> (match current_node t with | Some n when n.Dom.name = "colgroup" -> pop_current t; t.mode <- Parser_insertion_mode.In_table; process_token t token | _ -> parse_error t "unexpected-token") and process_in_table_body t token = match token with | Token.Tag { kind = Token.Start; name = "tr"; attrs; _ } -> clear_stack_back_to_table_body_context t; ignore (insert_element t "tr" ~push:true attrs); t.mode <- Parser_insertion_mode.In_row | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["th"; "td"] -> parse_error t "unexpected-start-tag"; clear_stack_back_to_table_body_context t; ignore (insert_element t "tr" ~push:true []); t.mode <- Parser_insertion_mode.In_row; process_token t token | Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] -> if not (has_element_in_table_scope t name) then parse_error t "unexpected-end-tag" else begin clear_stack_back_to_table_body_context t; pop_current t; t.mode <- Parser_insertion_mode.In_table end | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"] -> if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Parser_constants.table_scope ~check_integration_points:false) then parse_error t "unexpected-start-tag" else begin clear_stack_back_to_table_body_context t; pop_current t; t.mode <- Parser_insertion_mode.In_table; process_token t token end | Token.Tag { kind = Token.End; name = "table"; _ } -> if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Parser_constants.table_scope ~check_integration_points:false) then parse_error t "unexpected-end-tag" else begin clear_stack_back_to_table_body_context t; pop_current t; t.mode <- Parser_insertion_mode.In_table; process_token t token end | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "td"; "th"; "tr"] -> parse_error t "unexpected-end-tag" | _ -> process_in_table t token and clear_stack_back_to_table_body_context t = let rec loop () = match current_node t with | Some n when not (List.mem n.Dom.name ["tbody"; "tfoot"; "thead"; "template"; "html"]) -> pop_current t; loop () | _ -> () in loop () and process_in_row t token = match token with | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["th"; "td"] -> clear_stack_back_to_table_row_context t; ignore (insert_element t name ~push:true attrs); t.mode <- Parser_insertion_mode.In_cell; push_formatting_marker t | Token.Tag { kind = Token.End; name = "tr"; _ } -> if not (has_element_in_table_scope t "tr") then parse_error t "unexpected-end-tag" else begin clear_stack_back_to_table_row_context t; pop_current t; t.mode <- Parser_insertion_mode.In_table_body end | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"; "tr"] -> if not (has_element_in_table_scope t "tr") then parse_error t "unexpected-start-tag" else begin clear_stack_back_to_table_row_context t; pop_current t; t.mode <- Parser_insertion_mode.In_table_body; process_token t token end | Token.Tag { kind = Token.End; name = "table"; _ } -> if not (has_element_in_table_scope t "tr") then parse_error t "unexpected-end-tag" else begin clear_stack_back_to_table_row_context t; pop_current t; t.mode <- Parser_insertion_mode.In_table_body; process_token t token end | Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] -> if not (has_element_in_table_scope t name) then parse_error t "unexpected-end-tag" else if not (has_element_in_table_scope t "tr") then parse_error t "unexpected-end-tag" else begin clear_stack_back_to_table_row_context t; pop_current t; t.mode <- Parser_insertion_mode.In_table_body; process_token t token end | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "td"; "th"] -> parse_error t "unexpected-end-tag" | _ -> process_in_table t token and clear_stack_back_to_table_row_context t = let rec loop () = match current_node t with | Some n when not (List.mem n.Dom.name ["tr"; "template"; "html"]) -> pop_current t; loop () | _ -> () in loop () and process_in_cell t token = match token with | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_table_cell_element name -> if not (has_element_in_table_scope t name) then parse_error t "unexpected-end-tag" else begin generate_implied_end_tags t (); (match current_node t with | Some n when not (n.Dom.name = name && is_in_html_namespace n) -> parse_error t "end-tag-too-early" | _ -> ()); pop_until_html_tag t name; clear_active_formatting_to_marker t; t.mode <- Parser_insertion_mode.In_row end | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] -> if not (has_element_in_scope_impl t ["td"; "th"] Parser_constants.table_scope ~check_integration_points:false) then parse_error t "unexpected-start-tag" else begin close_cell t; process_token t token end | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"] -> parse_error t "unexpected-end-tag" | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_foster_parenting_element name -> if not (has_element_in_table_scope t name) then parse_error t "unexpected-end-tag" else begin close_cell t; process_token t token end | _ -> process_in_body t token and close_cell t = generate_implied_end_tags t (); (match current_node t with | Some n when not (Parser_constants.is_table_cell_element n.Dom.name && is_in_html_namespace n) -> parse_error t "end-tag-too-early" | _ -> ()); pop_until_html_one_of t ["td"; "th"]; clear_active_formatting_to_marker t; t.mode <- Parser_insertion_mode.In_row and process_in_select t token = match token with | Token.Character "\x00" -> parse_error t "unexpected-null-character" | Token.Character data -> reconstruct_active_formatting t; insert_character t data | Token.Comment data -> insert_comment t data | Token.Doctype _ -> parse_error t "unexpected-doctype" | Token.Tag { kind = Token.Start; name = "html"; _ } -> process_in_body t token | Token.Tag { kind = Token.Start; name = "option"; attrs; _ } -> (match current_node t with | Some n when n.Dom.name = "option" -> pop_current t | _ -> ()); reconstruct_active_formatting t; ignore (insert_element t "option" ~push:true attrs) | Token.Tag { kind = Token.Start; name = "optgroup"; attrs; _ } -> (match current_node t with | Some n when n.Dom.name = "option" -> pop_current t | _ -> ()); (match current_node t with | Some n when n.Dom.name = "optgroup" -> pop_current t | _ -> ()); reconstruct_active_formatting t; ignore (insert_element t "optgroup" ~push:true attrs) | Token.Tag { kind = Token.Start; name = "hr"; _ } -> (match current_node t with | Some n when n.Dom.name = "option" -> pop_current t | _ -> ()); (match current_node t with | Some n when n.Dom.name = "optgroup" -> pop_current t | _ -> ()); ignore (insert_element t "hr" ~push:true []); pop_current t | Token.Tag { kind = Token.End; name = "optgroup"; _ } -> (match t.open_elements with | opt :: optg :: _ when opt.Dom.name = "option" && optg.Dom.name = "optgroup" -> pop_current t | _ -> ()); (match current_node t with | Some n when n.Dom.name = "optgroup" -> pop_current t | _ -> parse_error t "unexpected-end-tag") | Token.Tag { kind = Token.End; name = "option"; _ } -> (match current_node t with | Some n when n.Dom.name = "option" -> pop_current t | _ -> parse_error t "unexpected-end-tag") | Token.Tag { kind = Token.End; name = "select"; _ } -> if not (has_element_in_select_scope t "select") then parse_error t "unexpected-end-tag" else begin pop_until_tag t "select"; reset_insertion_mode t end | Token.Tag { kind = Token.Start; name = "select"; _ } -> parse_error t "unexpected-start-tag"; (* Per spec: in IN_SELECT mode, select is always on the stack - just pop *) pop_until_tag t "select"; reset_insertion_mode t | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["input"; "textarea"] -> parse_error t "unexpected-start-tag"; (* Per spec: in IN_SELECT mode, select is always on the stack - just pop *) pop_until_tag t "select"; reset_insertion_mode t; process_token t token | Token.Tag { kind = Token.Start; name = "plaintext"; attrs; _ } -> (* plaintext is allowed in select - creates element, parser will switch tokenizer to PLAINTEXT mode *) reconstruct_active_formatting t; ignore (insert_element t "plaintext" ~push:true attrs) | Token.Tag { kind = Token.Start; name = "menuitem"; attrs; _ } -> (* menuitem is allowed in select *) reconstruct_active_formatting t; ignore (insert_element t "menuitem" ~push:true attrs) | Token.Tag { kind = Token.Start; name = "keygen"; attrs; _ } -> (* keygen is handled specially in select - inserted directly *) reconstruct_active_formatting t; ignore (insert_element t "keygen" attrs) (* Void element, don't push to stack *) | Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } -> reconstruct_active_formatting t; let node = insert_foreign_element t { kind = Token.Start; name = "svg"; attrs; self_closing } (Some "svg") in if not self_closing then t.open_elements <- node :: t.open_elements | Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } -> reconstruct_active_formatting t; let node = insert_foreign_element t { kind = Token.Start; name = "math"; attrs; self_closing } (Some "mathml") in if not self_closing then t.open_elements <- node :: t.open_elements | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["script"; "template"] -> process_in_head t token | Token.Tag { kind = Token.End; name = "template"; _ } -> process_in_head t token (* Allow certain HTML elements in select - newer spec behavior *) | Token.Tag { kind = Token.Start; name; attrs; self_closing } when List.mem name ["p"; "div"; "span"; "button"; "datalist"; "selectedcontent"] -> reconstruct_active_formatting t; let node = insert_element t name attrs in if not self_closing then t.open_elements <- node :: t.open_elements | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["br"; "img"] -> reconstruct_active_formatting t; ignore (insert_element t name attrs) (* Don't push to stack - void elements *) (* Handle formatting elements in select *) | Token.Tag { kind = Token.Start; name; attrs; _ } when Parser_constants.is_formatting_element name -> reconstruct_active_formatting t; let node = insert_element t name ~push:true attrs in push_formatting_element t node name attrs | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_formatting_element name -> (* Find select element and check if formatting element is inside select *) let select_idx = ref None in let fmt_idx = ref None in List.iteri (fun i n -> if n.Dom.name = "select" && !select_idx = None then select_idx := Some i; if n.Dom.name = name then fmt_idx := Some i ) t.open_elements; (match !fmt_idx, !select_idx with | Some fi, Some si when fi < si -> (* Formatting element is inside select, run adoption agency *) adoption_agency t name | Some _, Some _ -> (* Formatting element is outside select boundary - parse error, ignore *) parse_error t "unexpected-end-tag" | Some _, None -> adoption_agency t name | None, _ -> parse_error t "unexpected-end-tag") (* End tags for HTML elements allowed in select *) | Token.Tag { kind = Token.End; name; _ } when List.mem name ["p"; "div"; "span"; "button"; "datalist"; "selectedcontent"] -> (* Find select and target indices *) let select_idx = ref None in let target_idx = ref None in List.iteri (fun i n -> if n.Dom.name = "select" && !select_idx = None then select_idx := Some i; if n.Dom.name = name then target_idx := Some i ) t.open_elements; (* Only pop if target exists and is inside select (lower index = closer to current) *) (match !target_idx, !select_idx with | Some ti, Some si when ti < si -> (* Pop until we reach the target *) let rec pop_to_target () = match t.open_elements with | [] -> () | n :: rest -> t.open_elements <- rest; if n.Dom.name <> name then pop_to_target () in pop_to_target () | Some _, Some _ -> parse_error t "unexpected-end-tag" | Some _, None -> (* No select on stack, just pop to target *) let rec pop_to_target () = match t.open_elements with | [] -> () | n :: rest -> t.open_elements <- rest; if n.Dom.name <> name then pop_to_target () in pop_to_target () | None, _ -> parse_error t "unexpected-end-tag") | Token.EOF -> process_in_body t token | _ -> parse_error t "unexpected-token-in-select" and process_in_select_in_table t token = match token with | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["caption"; "table"; "tbody"; "tfoot"; "thead"; "tr"; "td"; "th"] -> parse_error t "unexpected-start-tag"; pop_until_tag t "select"; reset_insertion_mode t; process_token t token | Token.Tag { kind = Token.End; name; _ } when List.mem name ["caption"; "table"; "tbody"; "tfoot"; "thead"; "tr"; "td"; "th"] -> parse_error t "unexpected-end-tag"; if has_element_in_table_scope t name then begin pop_until_tag t "select"; reset_insertion_mode t; process_token t token end | _ -> process_in_select t token and process_in_template t token = match token with | Token.Character _ | Token.Comment _ | Token.Doctype _ -> process_in_body t token | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] -> process_in_head t token | Token.Tag { kind = Token.End; name = "template"; _ } -> process_in_head t token | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["caption"; "colgroup"; "tbody"; "tfoot"; "thead"] -> t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); t.template_modes <- Parser_insertion_mode.In_table :: t.template_modes; t.mode <- Parser_insertion_mode.In_table; process_token t token | Token.Tag { kind = Token.Start; name = "col"; _ } -> t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); t.template_modes <- Parser_insertion_mode.In_column_group :: t.template_modes; t.mode <- Parser_insertion_mode.In_column_group; process_token t token | Token.Tag { kind = Token.Start; name = "tr"; _ } -> t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); t.template_modes <- Parser_insertion_mode.In_table_body :: t.template_modes; t.mode <- Parser_insertion_mode.In_table_body; process_token t token | Token.Tag { kind = Token.Start; name; _ } when Parser_constants.is_table_cell_element name -> t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); t.template_modes <- Parser_insertion_mode.In_row :: t.template_modes; t.mode <- Parser_insertion_mode.In_row; process_token t token | Token.Tag { kind = Token.Start; _ } -> t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); t.template_modes <- Parser_insertion_mode.In_body :: t.template_modes; t.mode <- Parser_insertion_mode.In_body; process_token t token | Token.Tag { kind = Token.End; name; _ } -> parse_error t ("unexpected-end-tag:" ^ name) | Token.EOF -> if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then () (* Stop parsing *) else begin parse_error t "expected-closing-tag-but-got-eof"; pop_until_html_tag t "template"; clear_active_formatting_to_marker t; t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); reset_insertion_mode t; process_token t token end and process_after_body t token = match token with | Token.Character data when is_whitespace data -> process_in_body t token | Token.Comment data -> (* Insert as last child of html element - html is at bottom of stack *) let html_opt = List.find_opt (fun n -> n.Dom.name = "html") t.open_elements in (match html_opt with | Some html -> let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in Dom.append_child html (Dom.create_comment ~location data) | None -> ()) | Token.Doctype _ -> parse_error t "unexpected-doctype" | Token.Tag { kind = Token.Start; name = "html"; _ } -> process_in_body t token | Token.Tag { kind = Token.End; name = "html"; _ } -> if t.fragment_context <> None then parse_error t "unexpected-end-tag" else t.mode <- Parser_insertion_mode.After_after_body | Token.EOF -> () (* Stop parsing *) | _ -> parse_error t "unexpected-token-after-body"; t.mode <- Parser_insertion_mode.In_body; process_token t token and process_in_frameset t token = match token with | Token.Character data -> (* Extract only whitespace characters and insert them *) let whitespace = String.to_seq data |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' ']) |> String.of_seq in if whitespace <> "" then insert_character t whitespace; if not (is_whitespace data) then parse_error t "unexpected-char-in-frameset" | Token.Comment data -> insert_comment t data | Token.Doctype _ -> parse_error t "unexpected-doctype" | Token.Tag { kind = Token.Start; name = "html"; _ } -> process_in_body t token | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } -> ignore (insert_element t "frameset" ~push:true attrs) | Token.Tag { kind = Token.End; name = "frameset"; _ } -> (match current_node t with | Some n when n.Dom.name = "html" -> parse_error t "unexpected-end-tag" | _ -> pop_current t; if t.fragment_context = None then (match current_node t with | Some n when n.Dom.name <> "frameset" -> t.mode <- Parser_insertion_mode.After_frameset | _ -> ())) | Token.Tag { kind = Token.Start; name = "frame"; attrs; _ } -> ignore (insert_element t "frame" ~push:true attrs); pop_current t | Token.Tag { kind = Token.Start; name = "noframes"; _ } -> process_in_head t token | Token.EOF -> (match current_node t with | Some n when n.Dom.name <> "html" -> parse_error t "expected-closing-tag-but-got-eof" | _ -> ()) | _ -> parse_error t "unexpected-token-in-frameset" and process_after_frameset t token = match token with | Token.Character data -> (* Extract only whitespace characters and insert them *) let whitespace = String.to_seq data |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' ']) |> String.of_seq in if whitespace <> "" then insert_character t whitespace; if not (is_whitespace data) then parse_error t "unexpected-char-after-frameset" | Token.Comment data -> insert_comment t data | Token.Doctype _ -> parse_error t "unexpected-doctype" | Token.Tag { kind = Token.Start; name = "html"; _ } -> process_in_body t token | Token.Tag { kind = Token.End; name = "html"; _ } -> t.mode <- Parser_insertion_mode.After_after_frameset | Token.Tag { kind = Token.Start; name = "noframes"; _ } -> process_in_head t token | Token.EOF -> () (* Stop parsing *) | _ -> parse_error t "unexpected-token-after-frameset" and process_after_after_body t token = match token with | Token.Comment data -> insert_comment_to_document t data | Token.Doctype _ -> process_in_body t token | Token.Character data when is_whitespace data -> process_in_body t token | Token.Tag { kind = Token.Start; name = "html"; _ } -> process_in_body t token | Token.EOF -> () (* Stop parsing *) | _ -> parse_error t "unexpected-token-after-after-body"; t.mode <- Parser_insertion_mode.In_body; process_token t token and process_after_after_frameset t token = match token with | Token.Comment data -> insert_comment_to_document t data | Token.Doctype _ -> process_in_body t token | Token.Character data -> (* Extract only whitespace characters and process using in_body rules *) let whitespace = String.to_seq data |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' ']) |> String.of_seq in if whitespace <> "" then process_in_body t (Token.Character whitespace); if not (is_whitespace data) then parse_error t "unexpected-char-after-after-frameset" | Token.Tag { kind = Token.Start; name = "html"; _ } -> process_in_body t token | Token.EOF -> () (* Stop parsing *) | Token.Tag { kind = Token.Start; name = "noframes"; _ } -> process_in_head t token | _ -> parse_error t "unexpected-token-after-after-frameset" and process_token t token = (* Check for HTML integration points (SVG foreignObject, desc, title) *) let is_html_integration_point node = (* SVG foreignObject, desc, and title are always HTML integration points *) if node.Dom.namespace = Some "svg" && Parser_constants.is_svg_html_integration node.Dom.name then true (* annotation-xml is an HTML integration point only with specific encoding values *) else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then match List.assoc_opt "encoding" node.Dom.attrs with | Some enc -> let enc_lower = String.lowercase_ascii enc in enc_lower = "text/html" || enc_lower = "application/xhtml+xml" | None -> false else false in (* Check for MathML text integration points *) let is_mathml_text_integration_point node = node.Dom.namespace = Some "mathml" && Parser_constants.is_mathml_text_integration node.Dom.name in (* Foreign content handling *) let in_foreign = match adjusted_current_node t with | None -> false | Some node -> if is_in_html_namespace node then false else begin (* At HTML integration points, characters and start tags (except mglyph/malignmark) use HTML rules *) if is_html_integration_point node then begin match token with | Token.Character _ -> false | Token.Tag { kind = Token.Start; _ } -> false | _ -> true end (* At MathML text integration points, characters and start tags (except mglyph/malignmark) use HTML rules *) else if is_mathml_text_integration_point node then begin match token with | Token.Character _ -> false | Token.Tag { kind = Token.Start; name; _ } -> name = "mglyph" || name = "malignmark" | _ -> true end (* Special case: inside annotation-xml uses HTML rules (creates svg in svg namespace) *) else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then begin match token with | Token.Tag { kind = Token.Start; name; _ } when String.lowercase_ascii name = "svg" -> false | _ -> true end (* Not at integration point - use foreign content rules *) (* Breakout handling is done inside process_foreign_content *) else true end in (* Check if at HTML integration point for special table mode handling *) let at_integration_point = match adjusted_current_node t with | Some node -> is_html_integration_point node || is_mathml_text_integration_point node | None -> false in if in_foreign then process_foreign_content t token else if at_integration_point then begin (* At integration points, check if in table mode without table in scope *) let is_table_mode = List.mem t.mode [In_table; In_table_body; In_row; In_cell; In_caption; In_column_group] in let has_table = has_element_in_table_scope t "table" in if is_table_mode && not has_table then begin match token with | Token.Tag { kind = Token.Start; _ } -> (* Temporarily use IN_BODY for start tags in table mode without table *) let saved_mode = t.mode in t.mode <- In_body; process_by_mode t token; if t.mode = In_body then t.mode <- saved_mode | _ -> process_by_mode t token end else process_by_mode t token end else process_by_mode t token (* Pop foreign elements until HTML or integration point *) and pop_until_html_or_integration_point t = let is_html_integration_point node = (* SVG foreignObject, desc, and title are always HTML integration points *) if node.Dom.namespace = Some "svg" && Parser_constants.is_svg_html_integration node.Dom.name then true (* annotation-xml is an HTML integration point only with specific encoding values *) else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then match List.assoc_opt "encoding" node.Dom.attrs with | Some enc -> let enc_lower = String.lowercase_ascii enc in enc_lower = "text/html" || enc_lower = "application/xhtml+xml" | None -> false else false in (* Get fragment context element - only for foreign namespace fragment contexts *) let fragment_context_elem = t.fragment_context_element in let rec pop () = match current_node t with | None -> () | Some node -> if is_in_html_namespace node then () else if is_html_integration_point node then () (* Don't pop past fragment context element *) else (match fragment_context_elem with | Some ctx when node == ctx -> () | _ -> pop_current t; pop ()) in pop () (* Foreign breakout elements - these break out of foreign content *) and is_foreign_breakout_element name = List.mem (String.lowercase_ascii name) ["b"; "big"; "blockquote"; "body"; "br"; "center"; "code"; "dd"; "div"; "dl"; "dt"; "em"; "embed"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "head"; "hr"; "i"; "img"; "li"; "listing"; "menu"; "meta"; "nobr"; "ol"; "p"; "pre"; "ruby"; "s"; "small"; "span"; "strong"; "strike"; "sub"; "sup"; "table"; "tt"; "u"; "ul"; "var"] and process_foreign_content t token = match token with | Token.Character data when String.contains data '\x00' -> (* Replace NUL characters with U+FFFD replacement character *) parse_error t "unexpected-null-character"; let buf = Buffer.create (String.length data) in let has_non_ws_non_nul = ref false in String.iter (fun c -> if c = '\x00' then Buffer.add_string buf "\xEF\xBF\xBD" else begin Buffer.add_char buf c; if not (c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r') then has_non_ws_non_nul := true end ) data; let replaced = Buffer.contents buf in insert_character t replaced; (* Only set frameset_ok to false if there's actual non-whitespace non-NUL content *) if !has_non_ws_non_nul then t.frameset_ok <- false | Token.Character data when is_whitespace data -> insert_character t data | Token.Character data -> insert_character t data; t.frameset_ok <- false | Token.Comment data -> insert_comment t data | Token.Doctype _ -> parse_error t "unexpected-doctype" | Token.Tag { kind = Token.Start; name; _ } when is_foreign_breakout_element name -> (* Breakout from foreign content - pop until HTML or integration point, reprocess in HTML mode *) parse_error t "unexpected-html-element-in-foreign-content"; pop_until_html_or_integration_point t; reset_insertion_mode t; (* Use process_by_mode to force HTML mode processing and avoid infinite loop *) process_by_mode t token | Token.Tag { kind = Token.Start; name = "font"; attrs; _ } when List.exists (fun (n, _) -> let n = String.lowercase_ascii n in n = "color" || n = "face" || n = "size") attrs -> (* font with color/face/size breaks out of foreign content *) parse_error t "unexpected-html-element-in-foreign-content"; pop_until_html_or_integration_point t; reset_insertion_mode t; process_by_mode t token | Token.Tag { kind = Token.Start; name; attrs; self_closing } -> let name = match adjusted_current_node t with | Some n when n.Dom.namespace = Some "svg" -> Parser_constants.adjust_svg_tag_name name | _ -> name in let attrs = match adjusted_current_node t with | Some n when n.Dom.namespace = Some "svg" -> Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs attrs) | Some n when n.Dom.namespace = Some "mathml" -> Parser_constants.adjust_mathml_attrs (Parser_constants.adjust_foreign_attrs attrs) | _ -> Parser_constants.adjust_foreign_attrs attrs in let namespace = match adjusted_current_node t with | Some n -> n.Dom.namespace | None -> None in let node = insert_element t name ~namespace attrs in t.open_elements <- node :: t.open_elements; if self_closing then pop_current t | Token.Tag { kind = Token.End; name; _ } when List.mem (String.lowercase_ascii name) ["br"; "p"] -> (* Special case:
and

end tags trigger breakout from foreign content *) parse_error t "unexpected-html-element-in-foreign-content"; pop_until_html_or_integration_point t; reset_insertion_mode t; (* Use process_by_mode to force HTML mode processing and avoid infinite loop *) process_by_mode t token | Token.Tag { kind = Token.End; name; _ } -> (* Find matching element per WHATWG spec for foreign content *) let is_fragment_context n = match t.fragment_context_element with | Some ctx -> n == ctx | None -> false in let name_lower = String.lowercase_ascii name in (* Walk through stack looking for matching element *) let rec find_and_process first_node idx = function | [] -> () (* Stack exhausted - ignore tag *) | n :: rest -> let node_name_lower = String.lowercase_ascii n.Dom.name in let is_html = is_in_html_namespace n in let name_matches = node_name_lower = name_lower in (* If first node doesn't match tag name, it's a parse error *) if first_node && not name_matches then parse_error t "unexpected-end-tag-in-foreign-content"; (* Check if this node matches the end tag *) if name_matches then begin (* Fragment context check *) if is_fragment_context n then parse_error t "unexpected-end-tag-in-fragment-context" (* If matched element is in HTML namespace, reprocess via HTML mode *) else if is_html then process_by_mode t token (* Otherwise it's a foreign element - pop everything from this point up *) else begin (* Pop all elements from current down to and including the matched element *) let rec pop_to_idx current_idx = if current_idx >= idx then begin pop_current t; pop_to_idx (current_idx - 1) end in pop_to_idx (List.length t.open_elements - 1) end end (* If we hit an HTML element that doesn't match, process via HTML mode *) else if is_html then process_by_mode t token (* Continue searching in the stack *) else find_and_process false (idx - 1) rest in find_and_process true (List.length t.open_elements - 1) t.open_elements | Token.EOF -> process_by_mode t token and process_by_mode t token = match t.mode with | Parser_insertion_mode.Initial -> process_initial t token | Parser_insertion_mode.Before_html -> process_before_html t token | Parser_insertion_mode.Before_head -> process_before_head t token | Parser_insertion_mode.In_head -> process_in_head t token | Parser_insertion_mode.In_head_noscript -> process_in_head_noscript t token | Parser_insertion_mode.After_head -> process_after_head t token | Parser_insertion_mode.In_body -> process_in_body t token | Parser_insertion_mode.Text -> process_text t token | Parser_insertion_mode.In_table -> process_in_table t token | Parser_insertion_mode.In_table_text -> process_in_table_text t token | Parser_insertion_mode.In_caption -> process_in_caption t token | Parser_insertion_mode.In_column_group -> process_in_column_group t token | Parser_insertion_mode.In_table_body -> process_in_table_body t token | Parser_insertion_mode.In_row -> process_in_row t token | Parser_insertion_mode.In_cell -> process_in_cell t token | Parser_insertion_mode.In_select -> process_in_select t token | Parser_insertion_mode.In_select_in_table -> process_in_select_in_table t token | Parser_insertion_mode.In_template -> process_in_template t token | Parser_insertion_mode.After_body -> process_after_body t token | Parser_insertion_mode.In_frameset -> process_in_frameset t token | Parser_insertion_mode.After_frameset -> process_after_frameset t token | Parser_insertion_mode.After_after_body -> process_after_after_body t token | Parser_insertion_mode.After_after_frameset -> process_after_after_frameset t token (* Populate selectedcontent elements with content from selected option *) let find_elements name node = let result = ref [] in let rec find n = if n.Dom.name = name then result := n :: !result; List.iter find n.Dom.children in find node; List.rev !result (* Reverse to maintain document order *) let find_element name node = let rec find n = if n.Dom.name = name then Some n else List.find_map find n.Dom.children in find node let populate_selectedcontent document = let selects = find_elements "select" document in List.iter (fun select -> match find_element "selectedcontent" select with | None -> () | Some selectedcontent -> let options = find_elements "option" select in if options <> [] then begin (* Find selected option or use first *) let selected_option = match List.find_opt (fun opt -> Dom.has_attr opt "selected") options with | Some opt -> opt | None -> List.hd options in (* Clone children from selected option to selectedcontent *) List.iter (fun child -> let cloned = Dom.clone ~deep:true child in Dom.append_child selectedcontent cloned ) selected_option.Dom.children end ) selects let finish t = (* Populate selectedcontent elements *) populate_selectedcontent t.document; (* For fragment parsing, remove the html wrapper and promote children *) if t.fragment_context <> None then begin match t.document.Dom.children with | [root] when root.Dom.name = "html" -> (* Move context element's children to root if applicable *) (match t.fragment_context_element with | Some ctx_elem -> (match ctx_elem.Dom.parent with | Some p when p == root -> let ctx_children = ctx_elem.Dom.children in List.iter (fun child -> Dom.remove_child ctx_elem child; Dom.append_child root child ) ctx_children; Dom.remove_child root ctx_elem | _ -> ()) | None -> ()); (* Promote root's children to document - preserve order *) let children_copy = root.Dom.children in List.iter (fun child -> Dom.remove_child root child; Dom.append_child t.document child ) children_copy; Dom.remove_child t.document root | _ -> () end; t.document let get_errors t = List.rev t.errors