OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at fuzz 2555 lines 107 kB view raw
1(* HTML5 Tree Builder *) 2 3module Dom = Dom 4module Token = Tokenizer_token 5module State = Tokenizer_state 6 7type fragment_context = { 8 tag_name : string; 9 namespace : string option; 10} 11 12type formatting_entry = 13 | Marker 14 | Entry of { 15 name : string; 16 attrs : (string * string) list; 17 node : Dom.node; 18 } 19 20type parse_error = { 21 code : Parse_error_code.t; 22 line : int; 23 column : int; 24} 25 26type t = { 27 mutable document : Dom.node; 28 mutable mode : Parser_insertion_mode.t; 29 mutable original_mode : Parser_insertion_mode.t option; 30 mutable open_elements : Dom.node list; 31 mutable active_formatting : formatting_entry list; 32 mutable head_element : Dom.node option; 33 mutable form_element : Dom.node option; 34 mutable frameset_ok : bool; 35 mutable ignore_lf : bool; 36 mutable foster_parenting : bool; 37 mutable pending_table_chars : string list; 38 mutable template_modes : Parser_insertion_mode.t list; 39 mutable quirks_mode : Dom.quirks_mode; 40 mutable errors : parse_error list; 41 collect_errors : bool; 42 fragment_context : fragment_context option; 43 mutable fragment_context_element : Dom.node option; 44 iframe_srcdoc : bool; 45 mutable current_line : int; 46 mutable current_column : int; 47} 48 49let create ?(collect_errors=false) ?fragment_context ?(iframe_srcdoc=false) () = 50 let is_fragment = fragment_context <> None in 51 let doc = if is_fragment then Dom.create_document_fragment () else Dom.create_document () in 52 let t = { 53 document = doc; 54 mode = Parser_insertion_mode.Initial; 55 original_mode = None; 56 open_elements = []; 57 active_formatting = []; 58 head_element = None; 59 form_element = None; 60 frameset_ok = true; 61 ignore_lf = false; 62 foster_parenting = false; 63 pending_table_chars = []; 64 template_modes = []; 65 quirks_mode = Dom.No_quirks; 66 errors = []; 67 collect_errors; 68 fragment_context; 69 fragment_context_element = None; 70 iframe_srcdoc; 71 current_line = 1; 72 current_column = 1; 73 } in 74 (* Initialize fragment parsing *) 75 (match fragment_context with 76 | Some ctx -> 77 let name = String.lowercase_ascii ctx.tag_name in 78 let ns = ctx.namespace in 79 (* Create html root *) 80 let root = Dom.create_element "html" () in 81 Dom.append_child doc root; 82 t.open_elements <- [root]; 83 (* For foreign content contexts, create context element *) 84 (match ns with 85 | Some namespace when namespace <> "html" -> 86 let context_elem = Dom.create_element ctx.tag_name ~namespace:ns () in 87 Dom.append_child root context_elem; 88 t.open_elements <- [context_elem; root]; 89 t.fragment_context_element <- Some context_elem 90 | _ -> ()); 91 (* Set initial mode based on context *) 92 t.mode <- ( 93 if name = "html" then Parser_insertion_mode.Before_head 94 else if Parser_constants.is_table_section_element name && (ns = None || ns = Some "html") then 95 Parser_insertion_mode.In_table_body 96 else if name = "tr" && (ns = None || ns = Some "html") then 97 Parser_insertion_mode.In_row 98 else if Parser_constants.is_table_cell_element name && (ns = None || ns = Some "html") then 99 Parser_insertion_mode.In_cell 100 else if name = "caption" && (ns = None || ns = Some "html") then 101 Parser_insertion_mode.In_caption 102 else if name = "colgroup" && (ns = None || ns = Some "html") then 103 Parser_insertion_mode.In_column_group 104 else if name = "table" && (ns = None || ns = Some "html") then 105 Parser_insertion_mode.In_table 106 else if name = "template" && (ns = None || ns = Some "html") then begin 107 t.template_modes <- [Parser_insertion_mode.In_template]; 108 Parser_insertion_mode.In_template 109 end 110 else 111 Parser_insertion_mode.In_body 112 ); 113 t.frameset_ok <- false 114 | None -> ()); 115 t 116 117(* Position tracking for error reporting *) 118let set_position t ~line ~column = 119 t.current_line <- line; 120 t.current_column <- column 121 122(* Error handling *) 123let parse_error t code = 124 if t.collect_errors then 125 t.errors <- { code = Parse_error_code.of_string code; line = t.current_line; column = t.current_column } :: t.errors 126 127(* Stack helpers *) 128let current_node t = 129 match t.open_elements with 130 | [] -> None 131 | x :: _ -> Some x 132 133let adjusted_current_node t = 134 match t.fragment_context, t.open_elements with 135 | Some ctx, [_] -> 136 (* Fragment case: use context element info *) 137 Some (Dom.create_element ctx.tag_name ~namespace:ctx.namespace ()) 138 | _, x :: _ -> Some x 139 | _, [] -> None 140 141let is_in_html_namespace node = 142 node.Dom.namespace = None || node.Dom.namespace = Some "html" 143 144(* Namespace-aware check for "special" elements per WHATWG spec *) 145let is_special_element node = 146 let name = String.lowercase_ascii node.Dom.name in 147 match node.Dom.namespace with 148 | None | Some "html" -> Parser_constants.is_special name 149 | Some "mathml" -> List.mem name ["mi"; "mo"; "mn"; "ms"; "mtext"; "annotation-xml"] 150 | Some "svg" -> List.mem name ["foreignobject"; "desc"; "title"] 151 | _ -> false 152 153let adjusted_current_node_in_html_namespace t = 154 match adjusted_current_node t with 155 | Some node -> is_in_html_namespace node 156 | None -> true 157 158(* Insertion helpers *) 159let appropriate_insertion_place t = 160 match current_node t with 161 | None -> (t.document, None) 162 | Some target -> 163 if t.foster_parenting && Parser_constants.is_foster_parenting_element target.Dom.name then begin 164 (* Foster parenting per WHATWG spec *) 165 (* Step 1: Find last (most recent) template and table in stack *) 166 (* Note: index 0 = top of stack = most recently added *) 167 let last_template_idx = ref None in 168 let last_table_idx = ref None in 169 List.iteri (fun i n -> 170 (* Take first match (most recent = lowest index) *) 171 if n.Dom.name = "template" && !last_template_idx = None then last_template_idx := Some i; 172 if n.Dom.name = "table" && !last_table_idx = None then last_table_idx := Some i 173 ) t.open_elements; 174 175 (* Step 2-3: If last template is more recent than last table (lower index = more recent) *) 176 match !last_template_idx, !last_table_idx with 177 | Some ti, None -> 178 (* No table, use template content *) 179 let template = List.nth t.open_elements ti in 180 (match template.Dom.template_content with 181 | Some tc -> (tc, None) 182 | None -> (template, None)) 183 | Some ti, Some tbi when ti < tbi -> 184 (* Template is more recent than table, use template content *) 185 let template = List.nth t.open_elements ti in 186 (match template.Dom.template_content with 187 | Some tc -> (tc, None) 188 | None -> (template, None)) 189 | _, Some tbi -> 190 (* Use table's parent as foster parent *) 191 let table = List.nth t.open_elements tbi in 192 (match table.Dom.parent with 193 | Some parent -> (parent, Some table) 194 | None -> 195 (* Step 6: element above table in stack (index + 1 since 0 is top) *) 196 if tbi + 1 < List.length t.open_elements then 197 (List.nth t.open_elements (tbi + 1), None) 198 else 199 (t.document, None)) 200 | None, None -> 201 (* No table or template, use document *) 202 (t.document, None) 203 end else begin 204 (* If target is a template, insert into its content document fragment *) 205 match target.Dom.template_content with 206 | Some tc -> (tc, None) 207 | None -> (target, None) 208 end 209 210let insert_element t name ?(namespace=None) ?(push=false) attrs = 211 (* Reset ignore_lf flag - per HTML5 spec, only the immediately next token 212 after pre/textarea/listing should be checked for leading LF *) 213 t.ignore_lf <- false; 214 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in 215 let node = Dom.create_element name ~namespace ~attrs ~location () in 216 let (parent, before) = appropriate_insertion_place t in 217 (match before with 218 | None -> Dom.append_child parent node 219 | Some ref -> Dom.insert_before parent node ref); 220 if push then t.open_elements <- node :: t.open_elements; 221 node 222 223let insert_element_for_token t (tag : Token.tag) = 224 insert_element t tag.name ~push:true tag.attrs 225 226let insert_foreign_element t (tag : Token.tag) namespace = 227 let attrs = 228 if namespace = Some "svg" then 229 Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs tag.attrs) 230 else 231 Parser_constants.adjust_foreign_attrs tag.attrs 232 in 233 let name = 234 if namespace = Some "svg" then Parser_constants.adjust_svg_tag_name tag.name 235 else tag.name 236 in 237 let node = insert_element t name ~namespace attrs in 238 t.open_elements <- node :: t.open_elements; 239 node 240 241let insert_character t data = 242 if t.ignore_lf && String.length data > 0 && data.[0] = '\n' then begin 243 t.ignore_lf <- false; 244 if String.length data > 1 then begin 245 let rest = String.sub data 1 (String.length data - 1) in 246 let (parent, before) = appropriate_insertion_place t in 247 Dom.insert_text_at parent rest before 248 end 249 end else begin 250 t.ignore_lf <- false; 251 let (parent, before) = appropriate_insertion_place t in 252 Dom.insert_text_at parent data before 253 end 254 255let insert_comment t data = 256 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in 257 let node = Dom.create_comment ~location data in 258 let (parent, _) = appropriate_insertion_place t in 259 Dom.append_child parent node 260 261let insert_comment_to_document t data = 262 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in 263 let node = Dom.create_comment ~location data in 264 Dom.append_child t.document node 265 266(* Stack manipulation *) 267let pop_current t = 268 match t.open_elements with 269 | [] -> () 270 | _ :: rest -> t.open_elements <- rest 271 272let pop_until t pred = 273 let rec loop () = 274 match t.open_elements with 275 | [] -> () 276 | x :: rest -> 277 t.open_elements <- rest; 278 if not (pred x) then loop () 279 in 280 loop () 281 282let pop_until_tag t name = 283 pop_until t (fun n -> n.Dom.name = name) 284 285(* Pop until HTML namespace element with given name *) 286let pop_until_html_tag t name = 287 pop_until t (fun n -> n.Dom.name = name && is_in_html_namespace n) 288 289let pop_until_one_of t names = 290 pop_until t (fun n -> List.mem n.Dom.name names) 291 292(* Pop until HTML namespace element with one of given names *) 293let pop_until_html_one_of t names = 294 pop_until t (fun n -> List.mem n.Dom.name names && is_in_html_namespace n) 295 296(* Check if element is an HTML integration point *) 297let is_html_integration_point node = 298 (* SVG foreignObject, desc, and title are always HTML integration points *) 299 if node.Dom.namespace = Some "svg" && 300 Parser_constants.is_svg_html_integration node.Dom.name then true 301 (* annotation-xml is an HTML integration point only with specific encoding values *) 302 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then 303 match List.assoc_opt "encoding" node.Dom.attrs with 304 | Some enc -> 305 let enc_lower = String.lowercase_ascii enc in 306 enc_lower = "text/html" || enc_lower = "application/xhtml+xml" 307 | None -> false 308 else false 309 310(* Check if element is a MathML text integration point *) 311let is_mathml_text_integration_point node = 312 node.Dom.namespace = Some "mathml" && 313 Parser_constants.is_mathml_text_integration node.Dom.name 314 315(* Scope checks - integration points also terminate scope (except for table scope) *) 316(* Per WHATWG spec, scope checks only consider HTML namespace elements for the target names *) 317let has_element_in_scope_impl t names exclude_list ~check_integration_points = 318 let rec check = function 319 | [] -> false 320 | n :: rest -> 321 (* Target elements must be in HTML namespace *) 322 if is_in_html_namespace n && List.mem n.Dom.name names then true 323 else if is_in_html_namespace n && List.mem n.Dom.name exclude_list then false 324 (* Integration points terminate scope (unless we're checking table scope) *) 325 else if check_integration_points && (is_html_integration_point n || is_mathml_text_integration_point n) then false 326 else check rest 327 in 328 check t.open_elements 329 330let has_element_in_scope t name = 331 has_element_in_scope_impl t [name] Parser_constants.default_scope ~check_integration_points:true 332 333let has_element_in_button_scope t name = 334 has_element_in_scope_impl t [name] Parser_constants.button_scope ~check_integration_points:true 335 336let has_element_in_list_item_scope t name = 337 has_element_in_scope_impl t [name] Parser_constants.list_item_scope ~check_integration_points:true 338 339let has_element_in_table_scope t name = 340 has_element_in_scope_impl t [name] Parser_constants.table_scope ~check_integration_points:false 341 342let has_element_in_select_scope t name = 343 let rec check = function 344 | [] -> false 345 | n :: rest -> 346 if n.Dom.name = name then true 347 else if not (Parser_constants.is_select_scope_exclude n.Dom.name) then false 348 else check rest 349 in 350 check t.open_elements 351 352(* Implied end tags *) 353let generate_implied_end_tags t ?except () = 354 let rec loop () = 355 match current_node t with 356 | Some n when Parser_constants.is_implied_end_tag n.Dom.name -> 357 (match except with 358 | Some ex when n.Dom.name = ex -> () 359 | _ -> pop_current t; loop ()) 360 | _ -> () 361 in 362 loop () 363 364let generate_all_implied_end_tags t = 365 let rec loop () = 366 match current_node t with 367 | Some n when Parser_constants.is_thoroughly_implied_end_tag n.Dom.name -> 368 pop_current t; loop () 369 | _ -> () 370 in 371 loop () 372 373(* Active formatting elements *) 374let push_formatting_marker t = 375 t.active_formatting <- Marker :: t.active_formatting 376 377let push_formatting_element t node name attrs = 378 (* Noah's Ark: remove earlier identical elements (up to 3) *) 379 let rec count_and_remove same acc = function 380 | [] -> List.rev acc 381 | Marker :: rest -> List.rev acc @ (Marker :: rest) 382 | Entry e :: rest when e.name = name && e.attrs = attrs -> 383 if same >= 2 then 384 count_and_remove same acc rest (* Remove this one *) 385 else 386 count_and_remove (same + 1) (Entry e :: acc) rest 387 | x :: rest -> count_and_remove same (x :: acc) rest 388 in 389 t.active_formatting <- count_and_remove 0 [] t.active_formatting; 390 t.active_formatting <- Entry { name; attrs; node } :: t.active_formatting 391 392let clear_active_formatting_to_marker t = 393 let rec loop = function 394 | [] -> [] 395 | Marker :: rest -> rest 396 | _ :: rest -> loop rest 397 in 398 t.active_formatting <- loop t.active_formatting 399 400let reconstruct_active_formatting t = 401 let rec find_to_reconstruct acc = function 402 | [] -> acc 403 | Marker :: _ -> acc 404 | Entry e :: rest -> 405 if List.exists (fun n -> n == e.node) t.open_elements then acc 406 else find_to_reconstruct (Entry e :: acc) rest 407 in 408 let to_reconstruct = find_to_reconstruct [] t.active_formatting in 409 List.iter (fun entry -> 410 match entry with 411 | Entry e -> 412 let node = insert_element t e.name e.attrs in 413 t.open_elements <- node :: t.open_elements; 414 (* Update the entry to point to new node *) 415 t.active_formatting <- List.map (fun x -> 416 if x == entry then Entry { e with node } 417 else x 418 ) t.active_formatting 419 | Marker -> () 420 ) to_reconstruct 421 422(* Adoption agency algorithm - follows WHATWG spec *) 423let adoption_agency t tag_name = 424 (* Step 1: If current node is subject and not in active formatting list, just pop *) 425 (match current_node t with 426 | Some n when n.Dom.name = tag_name -> 427 let in_active = List.exists (function 428 | Entry e -> e.name = tag_name 429 | Marker -> false 430 ) t.active_formatting in 431 if not in_active then begin 432 pop_current t; 433 () (* Return early - this case is handled *) 434 end 435 | _ -> ()); 436 437 (* Step 2: Outer loop *) 438 let outer_loop_counter = ref 0 in 439 let done_flag = ref false in 440 441 while !outer_loop_counter < 8 && not !done_flag do 442 incr outer_loop_counter; 443 444 (* Step 3: Find formatting element in active formatting list *) 445 let rec find_formatting_index idx = function 446 | [] -> None 447 | Marker :: _ -> None 448 | Entry e :: rest -> 449 if e.name = tag_name then Some (idx, e.node, e.attrs) 450 else find_formatting_index (idx + 1) rest 451 in 452 let formatting_entry = find_formatting_index 0 t.active_formatting in 453 454 match formatting_entry with 455 | None -> 456 (* No formatting element found - done *) 457 done_flag := true 458 | Some (fmt_idx, fmt_node, fmt_attrs) -> 459 460 (* Step 4: Check if formatting element is in open elements *) 461 if not (List.exists (fun n -> n == fmt_node) t.open_elements) then begin 462 parse_error t "adoption-agency-1.2"; 463 t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting; 464 done_flag := true 465 end 466 (* Step 5: Check if formatting element is in scope *) 467 else if not (has_element_in_scope t tag_name) then begin 468 parse_error t "adoption-agency-1.3"; 469 done_flag := true 470 end else begin 471 (* Step 6: Parse error if not current node *) 472 (match current_node t with 473 | Some n when n != fmt_node -> parse_error t "adoption-agency-1.3" 474 | _ -> ()); 475 476 (* Step 7: Find furthest block - first special element BELOW formatting element *) 477 (* open_elements is [current(top)...html(bottom)], formatting element is somewhere in the middle *) 478 (* We need the first special element going from formatting element toward current *) 479 (* This is the "topmost" (closest to formatting element) special element that is "lower" (closer to current) *) 480 let fmt_stack_idx = ref (-1) in 481 List.iteri (fun i n -> if n == fmt_node then fmt_stack_idx := i) t.open_elements; 482 let furthest_block = 483 if !fmt_stack_idx <= 0 then None 484 else begin 485 (* Look from fmt_stack_idx-1 down to 0, find first special element *) 486 let rec find_from_idx idx = 487 if idx < 0 then None 488 else 489 let n = List.nth t.open_elements idx in 490 if is_special_element n then Some n 491 else find_from_idx (idx - 1) 492 in 493 find_from_idx (!fmt_stack_idx - 1) 494 end 495 in 496 497 match furthest_block with 498 | None -> 499 (* Step 8: No furthest block - pop elements including formatting element *) 500 pop_until t (fun n -> n == fmt_node); 501 t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting; 502 done_flag := true 503 504 | Some fb -> 505 (* Step 9: Let common ancestor be element immediately above formatting element *) 506 let rec find_common_ancestor = function 507 | [] -> None 508 | n :: rest when n == fmt_node -> 509 (match rest with x :: _ -> Some x | [] -> None) 510 | _ :: rest -> find_common_ancestor rest 511 in 512 let common_ancestor = find_common_ancestor t.open_elements in 513 514 (* Step 10: Bookmark starts after formatting element *) 515 let bookmark = ref (fmt_idx + 1) in 516 517 (* Step 11: Let last_node = furthest block *) 518 let last_node = ref fb in 519 520 (* Step 12: Inner loop *) 521 (* The inner loop processes elements between furthest_block and formatting_element, 522 removing non-formatting elements and reparenting formatting elements *) 523 let inner_loop_counter = ref 0 in 524 525 (* Get index of furthest block in open elements *) 526 let fb_idx = ref 0 in 527 List.iteri (fun i n -> if n == fb then fb_idx := i) t.open_elements; 528 529 (* Start from element after furthest block (toward formatting element) *) 530 let node_idx = ref (!fb_idx + 1) in 531 532 while !node_idx < List.length t.open_elements && 533 (List.nth t.open_elements !node_idx) != fmt_node do 534 incr inner_loop_counter; 535 let current_node = List.nth t.open_elements !node_idx in 536 537 (* Step 12.3: Find node in active formatting list *) 538 let rec find_node_in_formatting idx = function 539 | [] -> None 540 | Entry e :: _rest when e.node == current_node -> Some idx 541 | _ :: rest -> find_node_in_formatting (idx + 1) rest 542 in 543 let node_fmt_idx = find_node_in_formatting 0 t.active_formatting in 544 545 (* Step 12.4: If inner loop counter > 3 and node in active formatting, remove it *) 546 let node_fmt_idx = 547 match node_fmt_idx with 548 | Some idx when !inner_loop_counter > 3 -> 549 t.active_formatting <- List.filteri (fun i _ -> i <> idx) t.active_formatting; 550 if idx < !bookmark then decr bookmark; 551 None 552 | x -> x 553 in 554 555 (* Step 12.5: If node not in active formatting, remove from stack and continue *) 556 match node_fmt_idx with 557 | None -> 558 (* Remove from stack - this shifts indices *) 559 t.open_elements <- List.filteri (fun i _ -> i <> !node_idx) t.open_elements 560 (* Don't increment node_idx since we removed an element *) 561 562 | Some af_idx -> 563 (* Step 12.6: Create new element for node *) 564 let (node_name, node_attrs) = match List.nth t.active_formatting af_idx with 565 | Entry e -> (e.name, e.attrs) 566 | Marker -> failwith "unexpected marker" 567 in 568 let new_node_elem = Dom.create_element node_name ~attrs:node_attrs () in 569 570 (* Update active formatting with new node *) 571 t.active_formatting <- List.mapi (fun i entry -> 572 if i = af_idx then Entry { name = node_name; node = new_node_elem; attrs = node_attrs } 573 else entry 574 ) t.active_formatting; 575 576 (* Replace node in open elements *) 577 t.open_elements <- List.mapi (fun i n -> 578 if i = !node_idx then new_node_elem else n 579 ) t.open_elements; 580 581 (* Step 12.7: If last_node is furthest block, update bookmark *) 582 if !last_node == fb then 583 bookmark := af_idx + 1; 584 585 (* Step 12.8: Reparent last_node to new node *) 586 (match !last_node.Dom.parent with 587 | Some p -> Dom.remove_child p !last_node 588 | None -> ()); 589 Dom.append_child new_node_elem !last_node; 590 591 (* Step 12.9: Let last_node = new node *) 592 last_node := new_node_elem; 593 594 (* Move to next element *) 595 incr node_idx 596 done; 597 598 (* Step 13: Insert last_node into common ancestor *) 599 (match common_ancestor with 600 | Some ca -> 601 (match !last_node.Dom.parent with 602 | Some p -> Dom.remove_child p !last_node 603 | None -> ()); 604 (* Check if we need foster parenting *) 605 if t.foster_parenting && Parser_constants.is_foster_parenting_element ca.Dom.name then begin 606 (* Find table and insert before it *) 607 let rec find_table = function 608 | [] -> None 609 | n :: rest when n.Dom.name = "table" -> Some (n, rest) 610 | _ :: rest -> find_table rest 611 in 612 match find_table t.open_elements with 613 | Some (table, _) -> 614 (match table.Dom.parent with 615 | Some parent -> Dom.insert_before parent !last_node table 616 | None -> Dom.append_child ca !last_node) 617 | None -> Dom.append_child ca !last_node 618 end else begin 619 (* If common ancestor is template, insert into its content *) 620 match ca.Dom.template_content with 621 | Some tc -> Dom.append_child tc !last_node 622 | None -> Dom.append_child ca !last_node 623 end 624 | None -> ()); 625 626 (* Step 14: Create new formatting element *) 627 let new_formatting = Dom.create_element tag_name ~attrs:fmt_attrs () in 628 629 (* Step 15: Move children of furthest block to new formatting element *) 630 let fb_children = fb.Dom.children in 631 List.iter (fun child -> 632 Dom.remove_child fb child; 633 Dom.append_child new_formatting child 634 ) fb_children; 635 636 (* Step 16: Append new formatting element to furthest block *) 637 Dom.append_child fb new_formatting; 638 639 (* Step 17: Remove old from active formatting, insert new at bookmark *) 640 let new_entry = Entry { name = tag_name; node = new_formatting; attrs = fmt_attrs } in 641 t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting; 642 (* Adjust bookmark since we removed an element *) 643 let adjusted_bookmark = if fmt_idx < !bookmark then !bookmark - 1 else !bookmark in 644 let rec insert_at_bookmark idx acc = function 645 | [] -> List.rev (new_entry :: acc) 646 | x :: rest when idx = adjusted_bookmark -> 647 List.rev_append acc (new_entry :: x :: rest) 648 | x :: rest -> insert_at_bookmark (idx + 1) (x :: acc) rest 649 in 650 t.active_formatting <- insert_at_bookmark 0 [] t.active_formatting; 651 652 (* Step 18: Remove formatting element from open elements, insert new after furthest block *) 653 (* "After" in stack terms means new_formatting should be between fb and current node *) 654 (* In our list orientation (current at index 0), this means new_formatting at lower index than fb *) 655 t.open_elements <- List.filter (fun n -> n != fmt_node) t.open_elements; 656 (* Find fb and insert new_formatting before it (lower index = closer to current) *) 657 let rec insert_before acc = function 658 | [] -> List.rev (new_formatting :: acc) 659 | n :: rest when n == fb -> 660 (* Insert new_formatting before fb: acc reversed, then new_formatting, then fb, then rest *) 661 List.rev_append acc (new_formatting :: n :: rest) 662 | n :: rest -> insert_before (n :: acc) rest 663 in 664 t.open_elements <- insert_before [] t.open_elements 665 (* Continue outer loop *) 666 end 667 done 668 669(* Close p element *) 670let close_p_element t = 671 generate_implied_end_tags t ~except:"p" (); 672 (match current_node t with 673 | Some n when n.Dom.name <> "p" -> parse_error t "end-tag-p-implied-but-open-elements" 674 | _ -> ()); 675 pop_until_tag t "p" 676 677(* Reset insertion mode *) 678let reset_insertion_mode t = 679 let rec check_node last = function 680 | [] -> t.mode <- Parser_insertion_mode.In_body 681 | node :: rest -> 682 let is_last = rest = [] in 683 let node_to_check = 684 if is_last then 685 match t.fragment_context with 686 | Some ctx -> Dom.create_element ctx.tag_name ~namespace:ctx.namespace () 687 | None -> node 688 else node 689 in 690 let name = node_to_check.Dom.name in 691 if name = "select" then begin 692 if not is_last then begin 693 let rec find_table_or_template = function 694 | [] -> () 695 | n :: rest -> 696 if n.Dom.name = "template" then t.mode <- Parser_insertion_mode.In_select 697 else if n.Dom.name = "table" then t.mode <- Parser_insertion_mode.In_select_in_table 698 else find_table_or_template rest 699 in 700 find_table_or_template rest 701 end; 702 if t.mode <> Parser_insertion_mode.In_select_in_table then 703 t.mode <- Parser_insertion_mode.In_select 704 end else if Parser_constants.is_table_cell_element name && not is_last then 705 t.mode <- Parser_insertion_mode.In_cell 706 else if name = "tr" then 707 t.mode <- Parser_insertion_mode.In_row 708 else if Parser_constants.is_table_section_element name then 709 t.mode <- Parser_insertion_mode.In_table_body 710 else if name = "caption" then 711 t.mode <- Parser_insertion_mode.In_caption 712 else if name = "colgroup" then 713 t.mode <- Parser_insertion_mode.In_column_group 714 else if name = "table" then 715 t.mode <- Parser_insertion_mode.In_table 716 else if name = "template" then 717 t.mode <- (match t.template_modes with m :: _ -> m | [] -> Parser_insertion_mode.In_template) 718 else if name = "head" && not is_last then 719 t.mode <- Parser_insertion_mode.In_head 720 else if name = "body" then 721 t.mode <- Parser_insertion_mode.In_body 722 else if name = "frameset" then 723 t.mode <- Parser_insertion_mode.In_frameset 724 else if name = "html" then 725 t.mode <- (if t.head_element = None then Parser_insertion_mode.Before_head else Parser_insertion_mode.After_head) 726 else if is_last then 727 t.mode <- Parser_insertion_mode.In_body 728 else 729 check_node last rest 730 in 731 check_node false t.open_elements 732 733let is_whitespace s = 734 let ws = [' '; '\t'; '\n'; '\x0C'; '\r'] in 735 String.for_all (fun c -> List.mem c ws) s 736 737(* Mode handlers *) 738let rec process_initial t token = 739 match token with 740 | Token.Character data when is_whitespace data -> () 741 | Token.Comment data -> insert_comment_to_document t data 742 | Token.Doctype dt -> 743 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in 744 let node = Dom.create_doctype ?name:dt.name ?public_id:dt.public_id ?system_id:dt.system_id ~location () in 745 Dom.append_child t.document node; 746 (* Quirks mode detection *) 747 if dt.force_quirks then 748 t.quirks_mode <- Dom.Quirks 749 else if dt.name <> Some "html" then 750 t.quirks_mode <- Dom.Quirks 751 else begin 752 let pub = Option.map String.lowercase_ascii dt.public_id in 753 let sys = Option.map String.lowercase_ascii dt.system_id in 754 let is_quirky = 755 (match pub with 756 | Some p -> List.mem p Parser_constants.quirky_public_matches || 757 List.exists (fun prefix -> String.length p >= String.length prefix && 758 String.sub p 0 (String.length prefix) = prefix) Parser_constants.quirky_public_prefixes 759 | None -> false) || 760 (match sys with 761 | Some s -> List.mem s Parser_constants.quirky_system_matches 762 | None -> false) 763 in 764 if is_quirky then t.quirks_mode <- Dom.Quirks 765 else begin 766 let is_limited_quirky = 767 match pub with 768 | Some p -> List.exists (fun prefix -> String.length p >= String.length prefix && 769 String.sub p 0 (String.length prefix) = prefix) 770 Parser_constants.limited_quirky_public_prefixes 771 | None -> false 772 in 773 if is_limited_quirky then t.quirks_mode <- Dom.Limited_quirks 774 end 775 end; 776 t.mode <- Parser_insertion_mode.Before_html 777 | _ -> 778 parse_error t "expected-doctype-but-got-other"; 779 t.quirks_mode <- Dom.Quirks; 780 t.mode <- Parser_insertion_mode.Before_html; 781 process_token t token 782 783and process_before_html t token = 784 match token with 785 | Token.Doctype _ -> parse_error t "unexpected-doctype" 786 | Token.Comment data -> insert_comment_to_document t data 787 | Token.Character data when is_whitespace data -> () 788 | Token.Tag { kind = Token.Start; name = "html"; attrs; _ } -> 789 let html = insert_element t "html" attrs in 790 t.open_elements <- [html]; 791 t.mode <- Parser_insertion_mode.Before_head 792 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] -> 793 let html = insert_element t "html" [] in 794 t.open_elements <- [html]; 795 t.mode <- Parser_insertion_mode.Before_head; 796 process_token t token 797 | Token.Tag { kind = Token.End; name; _ } -> 798 parse_error t ("unexpected-end-tag:" ^ name) 799 | _ -> 800 let html = insert_element t "html" [] in 801 t.open_elements <- [html]; 802 t.mode <- Parser_insertion_mode.Before_head; 803 process_token t token 804 805and process_before_head t token = 806 match token with 807 | Token.Character data when is_whitespace data -> () 808 | Token.Comment data -> insert_comment t data 809 | Token.Doctype _ -> parse_error t "unexpected-doctype" 810 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 811 process_in_body t token 812 | Token.Tag { kind = Token.Start; name = "head"; attrs; _ } -> 813 let head = insert_element t "head" attrs in 814 t.open_elements <- head :: t.open_elements; 815 t.head_element <- Some head; 816 t.mode <- Parser_insertion_mode.In_head 817 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] -> 818 let head = insert_element t "head" [] in 819 t.open_elements <- head :: t.open_elements; 820 t.head_element <- Some head; 821 t.mode <- Parser_insertion_mode.In_head; 822 process_token t token 823 | Token.Tag { kind = Token.End; name; _ } -> 824 parse_error t ("unexpected-end-tag:" ^ name) 825 | _ -> 826 let head = insert_element t "head" [] in 827 t.open_elements <- head :: t.open_elements; 828 t.head_element <- Some head; 829 t.mode <- Parser_insertion_mode.In_head; 830 process_token t token 831 832and process_in_head t token = 833 match token with 834 | Token.Character data when is_whitespace data -> 835 insert_character t data 836 | Token.Character data -> 837 (* Extract leading whitespace *) 838 let rec count_leading_ws i = 839 if i >= String.length data then i 840 else match data.[i] with 841 | '\t' | '\n' | '\x0C' | '\r' | ' ' -> count_leading_ws (i + 1) 842 | _ -> i 843 in 844 let ws_count = count_leading_ws 0 in 845 let leading_ws = String.sub data 0 ws_count in 846 let remaining = String.sub data ws_count (String.length data - ws_count) in 847 (* If there's leading whitespace and current element has children, insert it *) 848 if leading_ws <> "" then 849 (match current_node t with 850 | Some n when n.Dom.children <> [] -> insert_character t leading_ws 851 | _ -> ()); 852 pop_current t; 853 t.mode <- Parser_insertion_mode.After_head; 854 process_token t (Token.Character remaining) 855 | Token.Comment data -> 856 insert_comment t data 857 | Token.Doctype _ -> 858 parse_error t "unexpected-doctype" 859 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 860 process_in_body t token 861 | Token.Tag { kind = Token.Start; name; attrs; _ } 862 when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"] -> 863 ignore (insert_element t name attrs) 864 | Token.Tag { kind = Token.Start; name = "title"; attrs; self_closing } -> 865 ignore (insert_element_for_token t { kind = Token.Start; name = "title"; attrs; self_closing }); 866 t.original_mode <- Some t.mode; 867 t.mode <- Parser_insertion_mode.Text 868 | Token.Tag { kind = Token.Start; name; attrs; self_closing } 869 when List.mem name ["noframes"; "style"] -> 870 ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing }); 871 t.original_mode <- Some t.mode; 872 t.mode <- Parser_insertion_mode.Text 873 | Token.Tag { kind = Token.Start; name = "noscript"; attrs; self_closing } -> 874 (* Scripting is disabled: parse noscript content as HTML *) 875 ignore (insert_element_for_token t { kind = Token.Start; name = "noscript"; attrs; self_closing }); 876 t.mode <- Parser_insertion_mode.In_head_noscript 877 | Token.Tag { kind = Token.Start; name = "script"; attrs; self_closing } -> 878 ignore (insert_element_for_token t { kind = Token.Start; name = "script"; attrs; self_closing }); 879 t.original_mode <- Some t.mode; 880 t.mode <- Parser_insertion_mode.Text 881 | Token.Tag { kind = Token.End; name = "head"; _ } -> 882 pop_current t; 883 t.mode <- Parser_insertion_mode.After_head 884 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] -> 885 pop_current t; 886 t.mode <- Parser_insertion_mode.After_head; 887 process_token t token 888 | Token.Tag { kind = Token.Start; name = "template"; attrs; _ } -> 889 let node = Dom.create_template ~attrs () in 890 let (parent, _) = appropriate_insertion_place t in 891 Dom.append_child parent node; 892 t.open_elements <- node :: t.open_elements; 893 push_formatting_marker t; 894 t.frameset_ok <- false; 895 t.mode <- Parser_insertion_mode.In_template; 896 t.template_modes <- Parser_insertion_mode.In_template :: t.template_modes 897 | Token.Tag { kind = Token.End; name = "template"; _ } -> 898 if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then 899 parse_error t "unexpected-end-tag" 900 else begin 901 generate_all_implied_end_tags t; 902 (match current_node t with 903 | Some n when not (n.Dom.name = "template" && is_in_html_namespace n) -> parse_error t "unexpected-end-tag" 904 | _ -> ()); 905 pop_until_html_tag t "template"; 906 clear_active_formatting_to_marker t; 907 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 908 reset_insertion_mode t 909 end 910 | Token.Tag { kind = Token.Start; name = "head"; _ } -> 911 parse_error t "unexpected-start-tag" 912 | Token.Tag { kind = Token.End; name; _ } -> 913 parse_error t ("unexpected-end-tag:" ^ name) 914 | _ -> 915 pop_current t; 916 t.mode <- Parser_insertion_mode.After_head; 917 process_token t token 918 919and process_in_head_noscript t token = 920 match token with 921 | Token.Character data when is_whitespace data -> 922 process_in_head t token 923 | Token.Character _ -> 924 parse_error t "unexpected-char-in-noscript"; 925 pop_current t; (* Pop noscript *) 926 t.mode <- Parser_insertion_mode.In_head; 927 process_token t token 928 | Token.Comment _ -> 929 process_in_head t token 930 | Token.Doctype _ -> 931 parse_error t "unexpected-doctype" 932 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 933 process_in_body t token 934 | Token.Tag { kind = Token.Start; name; _ } 935 when List.mem name ["basefont"; "bgsound"; "link"; "meta"; "noframes"; "style"] -> 936 process_in_head t token 937 | Token.Tag { kind = Token.Start; name; _ } 938 when List.mem name ["head"; "noscript"] -> 939 parse_error t "unexpected-start-tag" 940 | Token.Tag { kind = Token.Start; name; _ } -> 941 parse_error t ("bad-start-tag-in-head-noscript:" ^ name); 942 pop_current t; (* Pop noscript *) 943 t.mode <- Parser_insertion_mode.In_head; 944 process_token t token 945 | Token.Tag { kind = Token.End; name = "noscript"; _ } -> 946 pop_current t; (* Pop noscript *) 947 t.mode <- Parser_insertion_mode.In_head 948 | Token.Tag { kind = Token.End; name = "br"; _ } -> 949 parse_error t "unexpected-end-tag"; 950 pop_current t; (* Pop noscript *) 951 t.mode <- Parser_insertion_mode.In_head; 952 process_token t token 953 | Token.Tag { kind = Token.End; name; _ } -> 954 parse_error t ("unexpected-end-tag:" ^ name) 955 | Token.EOF -> 956 parse_error t "expected-closing-tag-but-got-eof"; 957 pop_current t; (* Pop noscript *) 958 t.mode <- Parser_insertion_mode.In_head; 959 process_token t token 960 961and process_after_head t token = 962 match token with 963 | Token.Character data when is_whitespace data -> 964 insert_character t data 965 | Token.Comment data -> 966 insert_comment t data 967 | Token.Doctype _ -> 968 parse_error t "unexpected-doctype" 969 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 970 process_in_body t token 971 | Token.Tag { kind = Token.Start; name = "body"; attrs; _ } -> 972 ignore (insert_element t "body" ~push:true attrs); 973 t.frameset_ok <- false; 974 t.mode <- Parser_insertion_mode.In_body 975 | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } -> 976 ignore (insert_element t "frameset" ~push:true attrs); 977 t.mode <- Parser_insertion_mode.In_frameset 978 | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } -> 979 (* Special handling for input type="hidden" - parse error, ignore *) 980 let input_type = List.assoc_opt "type" attrs in 981 (match input_type with 982 | Some typ when String.lowercase_ascii typ = "hidden" -> 983 parse_error t "unexpected-hidden-input-after-head" 984 | _ -> 985 (* Non-hidden input creates body *) 986 let body = insert_element t "body" [] in 987 t.open_elements <- body :: t.open_elements; 988 t.mode <- Parser_insertion_mode.In_body; 989 process_token t token) 990 | Token.Tag { kind = Token.Start; name; _ } 991 when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] -> 992 parse_error t "unexpected-start-tag"; 993 (match t.head_element with 994 | Some head -> 995 t.open_elements <- head :: t.open_elements; 996 process_in_head t token; 997 t.open_elements <- List.filter (fun n -> n != head) t.open_elements 998 | None -> ()) 999 | Token.Tag { kind = Token.End; name = "template"; _ } -> 1000 process_in_head t token 1001 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] -> 1002 let body = insert_element t "body" [] in 1003 t.open_elements <- body :: t.open_elements; 1004 t.mode <- Parser_insertion_mode.In_body; 1005 process_token t token 1006 | Token.Tag { kind = Token.Start; name = "head"; _ } -> 1007 parse_error t "unexpected-start-tag" 1008 | Token.Tag { kind = Token.End; name; _ } -> 1009 parse_error t ("unexpected-end-tag:" ^ name) 1010 | _ -> 1011 let body = insert_element t "body" [] in 1012 t.open_elements <- body :: t.open_elements; 1013 t.mode <- Parser_insertion_mode.In_body; 1014 process_token t token 1015 1016and process_in_body t token = 1017 match token with 1018 | Token.Character "\x00" -> 1019 parse_error t "unexpected-null-character" 1020 | Token.Character data -> 1021 reconstruct_active_formatting t; 1022 insert_character t data; 1023 if not (is_whitespace data) then t.frameset_ok <- false 1024 | Token.Comment data -> 1025 insert_comment t data 1026 | Token.Doctype _ -> 1027 parse_error t "unexpected-doctype" 1028 | Token.Tag { kind = Token.Start; name = "html"; attrs; _ } -> 1029 parse_error t "unexpected-start-tag"; 1030 if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then 1031 (* Find the html element (at the bottom of the stack) *) 1032 let html_elem = List.find_opt (fun n -> n.Dom.name = "html") t.open_elements in 1033 (match html_elem with 1034 | Some html -> 1035 List.iter (fun (k, v) -> 1036 if not (Dom.has_attr html k) then Dom.set_attr html k v 1037 ) attrs 1038 | None -> ()) 1039 | Token.Tag { kind = Token.Start; name; _ } 1040 when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] -> 1041 process_in_head t token 1042 | Token.Tag { kind = Token.End; name = "template"; _ } -> 1043 process_in_head t token 1044 | Token.Tag { kind = Token.Start; name = "body"; attrs; _ } -> 1045 parse_error t "unexpected-start-tag"; 1046 (* Find body element on stack - it should be near the end (html is last) *) 1047 let body = List.find_opt (fun n -> n.Dom.name = "body") t.open_elements in 1048 (match body with 1049 | Some body when not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) -> 1050 t.frameset_ok <- false; 1051 List.iter (fun (k, v) -> 1052 if not (Dom.has_attr body k) then Dom.set_attr body k v 1053 ) attrs 1054 | _ -> ()) 1055 | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } -> 1056 if not t.frameset_ok then 1057 parse_error t "unexpected-start-tag-ignored" 1058 else begin 1059 (* Find body element on the stack *) 1060 let rec find_body_index idx = function 1061 | [] -> None 1062 | n :: rest -> 1063 if n.Dom.name = "body" then Some (idx, n) 1064 else find_body_index (idx + 1) rest 1065 in 1066 match find_body_index 0 t.open_elements with 1067 | None -> 1068 parse_error t "unexpected-start-tag-ignored" 1069 | Some (idx, body_elem) -> 1070 (* Remove body from its parent (the html element) *) 1071 (match body_elem.Dom.parent with 1072 | Some parent -> Dom.remove_child parent body_elem 1073 | None -> ()); 1074 (* Pop all elements up to and including body - keep only elements after body_idx *) 1075 let rec drop n lst = if n <= 0 then lst else match lst with [] -> [] | _ :: rest -> drop (n - 1) rest in 1076 t.open_elements <- drop (idx + 1) t.open_elements; 1077 (* Insert frameset element *) 1078 ignore (insert_element t "frameset" ~push:true attrs); 1079 t.mode <- Parser_insertion_mode.In_frameset 1080 end 1081 | Token.EOF -> 1082 if t.template_modes <> [] then 1083 process_in_template t token 1084 else begin 1085 let has_unclosed = List.exists (fun n -> 1086 not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"]) 1087 ) t.open_elements in 1088 if has_unclosed then parse_error t "expected-closing-tag-but-got-eof" 1089 end 1090 | Token.Tag { kind = Token.End; name = "body"; _ } -> 1091 if not (has_element_in_scope t "body") then 1092 parse_error t "unexpected-end-tag" 1093 else begin 1094 let has_unclosed = List.exists (fun n -> 1095 not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"]) 1096 ) t.open_elements in 1097 if has_unclosed then parse_error t "end-tag-too-early"; 1098 t.mode <- Parser_insertion_mode.After_body 1099 end 1100 | Token.Tag { kind = Token.End; name = "html"; _ } -> 1101 if not (has_element_in_scope t "body") then 1102 parse_error t "unexpected-end-tag" 1103 else begin 1104 t.mode <- Parser_insertion_mode.After_body; 1105 process_token t token 1106 end 1107 | Token.Tag { kind = Token.Start; name; attrs; _ } 1108 when List.mem name ["address"; "article"; "aside"; "blockquote"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "main"; "menu"; "nav"; "ol"; "p"; "search"; "section"; "summary"; "ul"] -> 1109 if has_element_in_button_scope t "p" then close_p_element t; 1110 ignore (insert_element t name ~push:true attrs) 1111 | Token.Tag { kind = Token.Start; name; attrs; _ } when Parser_constants.is_heading_element name -> 1112 if has_element_in_button_scope t "p" then close_p_element t; 1113 (match current_node t with 1114 | Some n when Parser_constants.is_heading_element n.Dom.name -> 1115 parse_error t "unexpected-start-tag"; 1116 pop_current t 1117 | _ -> ()); 1118 ignore (insert_element t name ~push:true attrs) 1119 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["pre"; "listing"] -> 1120 if has_element_in_button_scope t "p" then close_p_element t; 1121 ignore (insert_element t name ~push:true attrs); 1122 t.ignore_lf <- true; 1123 t.frameset_ok <- false 1124 | Token.Tag { kind = Token.Start; name = "form"; attrs; _ } -> 1125 if t.form_element <> None && not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then 1126 parse_error t "unexpected-start-tag" 1127 else begin 1128 if has_element_in_button_scope t "p" then close_p_element t; 1129 let form = insert_element t "form" attrs in 1130 t.open_elements <- form :: t.open_elements; 1131 if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then 1132 t.form_element <- Some form 1133 end 1134 | Token.Tag { kind = Token.Start; name = "li"; attrs; _ } -> 1135 t.frameset_ok <- false; 1136 let rec check = function 1137 | [] -> () 1138 | n :: rest -> 1139 if n.Dom.name = "li" then begin 1140 generate_implied_end_tags t ~except:"li" (); 1141 (match current_node t with 1142 | Some cn when cn.Dom.name <> "li" -> parse_error t "unexpected-start-tag" 1143 | _ -> ()); 1144 pop_until_tag t "li" 1145 end else if is_special_element n && not (List.mem (String.lowercase_ascii n.Dom.name) ["address"; "div"; "p"]) then 1146 () 1147 else 1148 check rest 1149 in 1150 check t.open_elements; 1151 if has_element_in_button_scope t "p" then close_p_element t; 1152 ignore (insert_element t "li" ~push:true attrs) 1153 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["dd"; "dt"] -> 1154 t.frameset_ok <- false; 1155 let rec check = function 1156 | [] -> () 1157 | n :: rest -> 1158 if List.mem n.Dom.name ["dd"; "dt"] then begin 1159 generate_implied_end_tags t ~except:n.Dom.name (); 1160 (match current_node t with 1161 | Some cn when cn.Dom.name <> n.Dom.name -> parse_error t "unexpected-start-tag" 1162 | _ -> ()); 1163 pop_until_one_of t ["dd"; "dt"] 1164 end else if is_special_element n && not (List.mem (String.lowercase_ascii n.Dom.name) ["address"; "div"; "p"]) then 1165 () 1166 else 1167 check rest 1168 in 1169 check t.open_elements; 1170 if has_element_in_button_scope t "p" then close_p_element t; 1171 ignore (insert_element t name ~push:true attrs) 1172 | Token.Tag { kind = Token.Start; name = "plaintext"; _ } -> 1173 if has_element_in_button_scope t "p" then close_p_element t; 1174 ignore (insert_element t "plaintext" ~push:true []) 1175 (* Tokenizer should switch to PLAINTEXT state *) 1176 | Token.Tag { kind = Token.Start; name = "button"; attrs; _ } -> 1177 if has_element_in_scope t "button" then begin 1178 parse_error t "unexpected-start-tag"; 1179 generate_implied_end_tags t (); 1180 pop_until_tag t "button" 1181 end; 1182 reconstruct_active_formatting t; 1183 ignore (insert_element t "button" ~push:true attrs); 1184 t.frameset_ok <- false 1185 | Token.Tag { kind = Token.End; name; _ } 1186 when List.mem name ["address"; "article"; "aside"; "blockquote"; "button"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "listing"; "main"; "menu"; "nav"; "ol"; "pre"; "search"; "section"; "summary"; "ul"] -> 1187 if not (has_element_in_scope t name) then 1188 parse_error t ("unexpected-end-tag:" ^ name) 1189 else begin 1190 generate_implied_end_tags t (); 1191 (match current_node t with 1192 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" 1193 | _ -> ()); 1194 pop_until_tag t name 1195 end 1196 | Token.Tag { kind = Token.End; name = "form"; _ } -> 1197 if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then begin 1198 let node = t.form_element in 1199 t.form_element <- None; 1200 match node with 1201 | None -> parse_error t "unexpected-end-tag" 1202 | Some form_node -> 1203 if not (has_element_in_scope t "form") then 1204 parse_error t "unexpected-end-tag" 1205 else begin 1206 generate_implied_end_tags t (); 1207 (match current_node t with 1208 | Some n when n != form_node -> parse_error t "end-tag-too-early" 1209 | _ -> ()); 1210 t.open_elements <- List.filter (fun n -> n != form_node) t.open_elements 1211 end 1212 end else begin 1213 if not (has_element_in_scope t "form") then 1214 parse_error t "unexpected-end-tag" 1215 else begin 1216 generate_implied_end_tags t (); 1217 (match current_node t with 1218 | Some n when n.Dom.name <> "form" -> parse_error t "end-tag-too-early" 1219 | _ -> ()); 1220 pop_until_tag t "form" 1221 end 1222 end 1223 | Token.Tag { kind = Token.End; name = "p"; _ } -> 1224 if not (has_element_in_button_scope t "p") then begin 1225 parse_error t "no-p-element-in-scope"; 1226 ignore (insert_element t "p" ~push:true []) 1227 end; 1228 close_p_element t 1229 | Token.Tag { kind = Token.End; name = "li"; _ } -> 1230 if not (has_element_in_list_item_scope t "li") then 1231 parse_error t "unexpected-end-tag" 1232 else begin 1233 generate_implied_end_tags t ~except:"li" (); 1234 (match current_node t with 1235 | Some n when n.Dom.name <> "li" -> parse_error t "end-tag-too-early" 1236 | _ -> ()); 1237 pop_until_tag t "li" 1238 end 1239 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["dd"; "dt"] -> 1240 if not (has_element_in_scope t name) then 1241 parse_error t "unexpected-end-tag" 1242 else begin 1243 generate_implied_end_tags t ~except:name (); 1244 (match current_node t with 1245 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" 1246 | _ -> ()); 1247 pop_until_tag t name 1248 end 1249 | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_heading_element name -> 1250 if not (has_element_in_scope_impl t Parser_constants.heading_elements Parser_constants.default_scope ~check_integration_points:true) then 1251 parse_error t "unexpected-end-tag" 1252 else begin 1253 generate_implied_end_tags t (); 1254 (match current_node t with 1255 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" 1256 | _ -> ()); 1257 pop_until_one_of t Parser_constants.heading_elements 1258 end 1259 | Token.Tag { kind = Token.Start; name = "a"; attrs; _ } -> 1260 (* Check for existing <a> in active formatting *) 1261 let rec find_a = function 1262 | [] -> None 1263 | Marker :: _ -> None 1264 | Entry e :: _ when e.name = "a" -> Some e.node 1265 | _ :: rest -> find_a rest 1266 in 1267 (match find_a t.active_formatting with 1268 | Some existing -> 1269 parse_error t "unexpected-start-tag"; 1270 adoption_agency t "a"; 1271 t.active_formatting <- List.filter (function 1272 | Entry e -> e.node != existing 1273 | _ -> true 1274 ) t.active_formatting; 1275 t.open_elements <- List.filter (fun n -> n != existing) t.open_elements 1276 | None -> ()); 1277 reconstruct_active_formatting t; 1278 let node = insert_element t "a" attrs in 1279 t.open_elements <- node :: t.open_elements; 1280 push_formatting_element t node "a" attrs 1281 | Token.Tag { kind = Token.Start; name; attrs; _ } 1282 when List.mem name ["b"; "big"; "code"; "em"; "font"; "i"; "s"; "small"; "strike"; "strong"; "tt"; "u"] -> 1283 reconstruct_active_formatting t; 1284 let node = insert_element t name attrs in 1285 t.open_elements <- node :: t.open_elements; 1286 push_formatting_element t node name attrs 1287 | Token.Tag { kind = Token.Start; name = "nobr"; attrs; _ } -> 1288 if has_element_in_scope t "nobr" then begin 1289 parse_error t "unexpected-start-tag"; 1290 adoption_agency t "nobr"; 1291 (* Remove nobr from active formatting *) 1292 t.active_formatting <- List.filter (function 1293 | Entry e -> e.name <> "nobr" 1294 | Marker -> true 1295 ) t.active_formatting; 1296 (* Remove nobr from open elements *) 1297 t.open_elements <- List.filter (fun n -> n.Dom.name <> "nobr") t.open_elements 1298 end; 1299 reconstruct_active_formatting t; 1300 let node = insert_element t "nobr" attrs in 1301 t.open_elements <- node :: t.open_elements; 1302 push_formatting_element t node "nobr" attrs 1303 | Token.Tag { kind = Token.End; name; _ } 1304 when List.mem name ["a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small"; "strike"; "strong"; "tt"; "u"] -> 1305 adoption_agency t name 1306 | Token.Tag { kind = Token.Start; name; attrs; _ } 1307 when List.mem name ["applet"; "marquee"; "object"] -> 1308 reconstruct_active_formatting t; 1309 ignore (insert_element t name ~push:true attrs); 1310 push_formatting_marker t; 1311 t.frameset_ok <- false 1312 | Token.Tag { kind = Token.End; name; _ } 1313 when List.mem name ["applet"; "marquee"; "object"] -> 1314 if not (has_element_in_scope t name) then 1315 parse_error t "unexpected-end-tag" 1316 else begin 1317 generate_implied_end_tags t (); 1318 (match current_node t with 1319 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" 1320 | _ -> ()); 1321 pop_until_tag t name; 1322 clear_active_formatting_to_marker t 1323 end 1324 | Token.Tag { kind = Token.Start; name = "table"; attrs; _ } -> 1325 if t.quirks_mode <> Dom.Quirks && has_element_in_button_scope t "p" then 1326 close_p_element t; 1327 ignore (insert_element t "table" ~push:true attrs); 1328 t.frameset_ok <- false; 1329 t.mode <- Parser_insertion_mode.In_table 1330 | Token.Tag { kind = Token.End; name = "br"; _ } -> 1331 parse_error t "end-tag-br"; 1332 reconstruct_active_formatting t; 1333 ignore (insert_element t "br" ~push:true []); 1334 pop_current t; 1335 t.frameset_ok <- false 1336 | Token.Tag { kind = Token.Start; name; attrs; _ } 1337 when List.mem name ["area"; "br"; "embed"; "img"; "keygen"; "wbr"] -> 1338 reconstruct_active_formatting t; 1339 ignore (insert_element t name ~push:true attrs); 1340 pop_current t; 1341 t.frameset_ok <- false 1342 | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } -> 1343 reconstruct_active_formatting t; 1344 ignore (insert_element t "input" ~push:true attrs); 1345 pop_current t; 1346 let is_hidden = List.exists (fun (k, v) -> 1347 String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden" 1348 ) attrs in 1349 if not is_hidden then t.frameset_ok <- false 1350 | Token.Tag { kind = Token.Start; name; attrs; _ } 1351 when List.mem name ["param"; "source"; "track"] -> 1352 ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing = false }); 1353 pop_current t 1354 | Token.Tag { kind = Token.Start; name = "hr"; _ } -> 1355 if has_element_in_button_scope t "p" then close_p_element t; 1356 ignore (insert_element t "hr" ~push:true []); 1357 pop_current t; 1358 t.frameset_ok <- false 1359 | Token.Tag { kind = Token.Start; name = "image"; attrs; _ } -> 1360 parse_error t "unexpected-start-tag"; 1361 (* Treat <image> as <img> *) 1362 reconstruct_active_formatting t; 1363 ignore (insert_element t "img" ~push:true attrs); 1364 pop_current t; 1365 t.frameset_ok <- false 1366 | Token.Tag { kind = Token.Start; name = "textarea"; attrs; _ } -> 1367 ignore (insert_element t "textarea" ~push:true attrs); 1368 t.ignore_lf <- true; 1369 t.original_mode <- Some t.mode; 1370 t.frameset_ok <- false; 1371 t.mode <- Parser_insertion_mode.Text 1372 | Token.Tag { kind = Token.Start; name = "xmp"; attrs; _ } -> 1373 if has_element_in_button_scope t "p" then close_p_element t; 1374 reconstruct_active_formatting t; 1375 t.frameset_ok <- false; 1376 ignore (insert_element_for_token t { kind = Token.Start; name = "xmp"; attrs; self_closing = false }); 1377 t.original_mode <- Some t.mode; 1378 t.mode <- Parser_insertion_mode.Text 1379 | Token.Tag { kind = Token.Start; name = "iframe"; attrs; _ } -> 1380 t.frameset_ok <- false; 1381 ignore (insert_element_for_token t { kind = Token.Start; name = "iframe"; attrs; self_closing = false }); 1382 t.original_mode <- Some t.mode; 1383 t.mode <- Parser_insertion_mode.Text 1384 | Token.Tag { kind = Token.Start; name = "noembed"; attrs; _ } -> 1385 ignore (insert_element_for_token t { kind = Token.Start; name = "noembed"; attrs; self_closing = false }); 1386 t.original_mode <- Some t.mode; 1387 t.mode <- Parser_insertion_mode.Text 1388 | Token.Tag { kind = Token.Start; name = "select"; attrs; _ } -> 1389 reconstruct_active_formatting t; 1390 ignore (insert_element t "select" ~push:true attrs); 1391 t.frameset_ok <- false; 1392 if List.mem t.mode [Parser_insertion_mode.In_table; Parser_insertion_mode.In_caption; Parser_insertion_mode.In_table_body; Parser_insertion_mode.In_row; Parser_insertion_mode.In_cell] then 1393 t.mode <- Parser_insertion_mode.In_select_in_table 1394 else 1395 t.mode <- Parser_insertion_mode.In_select 1396 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["optgroup"; "option"] -> 1397 (match current_node t with 1398 | Some n when n.Dom.name = "option" -> pop_current t 1399 | _ -> ()); 1400 reconstruct_active_formatting t; 1401 ignore (insert_element t name ~push:true attrs) 1402 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["rb"; "rtc"] -> 1403 if has_element_in_scope t "ruby" then begin 1404 generate_implied_end_tags t () 1405 end; 1406 (match current_node t with 1407 | Some n when n.Dom.name <> "ruby" && n.Dom.name <> "rtc" -> parse_error t "unexpected-start-tag" 1408 | _ -> ()); 1409 ignore (insert_element t name ~push:true attrs) 1410 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["rp"; "rt"] -> 1411 if has_element_in_scope t "ruby" then begin 1412 generate_implied_end_tags t ~except:"rtc" () 1413 end; 1414 (match current_node t with 1415 | Some n when n.Dom.name <> "ruby" && n.Dom.name <> "rtc" -> parse_error t "unexpected-start-tag" 1416 | _ -> ()); 1417 ignore (insert_element t name ~push:true attrs) 1418 | Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } -> 1419 reconstruct_active_formatting t; 1420 let adjusted_attrs = Parser_constants.adjust_mathml_attrs (Parser_constants.adjust_foreign_attrs attrs) in 1421 ignore (insert_foreign_element t { kind = Token.Start; name = "math"; attrs = adjusted_attrs; self_closing } (Some "mathml")); 1422 if self_closing then pop_current t 1423 | Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } -> 1424 reconstruct_active_formatting t; 1425 let adjusted_attrs = Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs attrs) in 1426 ignore (insert_foreign_element t { kind = Token.Start; name = "svg"; attrs = adjusted_attrs; self_closing } (Some "svg")); 1427 if self_closing then pop_current t 1428 | Token.Tag { kind = Token.Start; name; attrs; _ } 1429 when List.mem name ["col"; "frame"] -> 1430 (* In fragment context, insert these; otherwise ignore *) 1431 if t.fragment_context = None then 1432 parse_error t "unexpected-start-tag-ignored" 1433 else 1434 ignore (insert_element t name attrs) 1435 | Token.Tag { kind = Token.Start; name; _ } 1436 when List.mem name ["caption"; "colgroup"; "head"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] -> 1437 parse_error t "unexpected-start-tag" 1438 | Token.Tag { kind = Token.Start; name; attrs; self_closing } -> 1439 (* Any other start tag *) 1440 reconstruct_active_formatting t; 1441 ignore (insert_element t name ~push:true attrs); 1442 (* Check for self-closing on non-void HTML element *) 1443 if self_closing && not (Parser_constants.is_void_element name) then 1444 parse_error t "non-void-html-element-start-tag-with-trailing-solidus" 1445 | Token.Tag { kind = Token.End; name; _ } -> 1446 (* Any other end tag *) 1447 let rec check = function 1448 | [] -> () 1449 | node :: rest -> 1450 if node.Dom.name = name then begin 1451 generate_implied_end_tags t ~except:name (); 1452 (match current_node t with 1453 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" 1454 | _ -> ()); 1455 pop_until t (fun n -> n == node) 1456 end else if is_special_element node then 1457 parse_error t ("unexpected-end-tag:" ^ name) 1458 else 1459 check rest 1460 in 1461 check t.open_elements 1462 1463and process_text t token = 1464 match token with 1465 | Token.Character data -> 1466 insert_character t data 1467 | Token.EOF -> 1468 parse_error t "expected-closing-tag-but-got-eof"; 1469 pop_current t; 1470 t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_body; 1471 process_token t token 1472 | Token.Tag { kind = Token.End; _ } -> 1473 pop_current t; 1474 t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_body 1475 | _ -> () 1476 1477and process_in_table t token = 1478 match token with 1479 | Token.Character _ when (match current_node t with Some n -> Parser_constants.is_foster_parenting_element n.Dom.name | None -> false) -> 1480 t.pending_table_chars <- []; 1481 t.original_mode <- Some t.mode; 1482 t.mode <- Parser_insertion_mode.In_table_text; 1483 process_token t token 1484 | Token.Comment data -> 1485 insert_comment t data 1486 | Token.Doctype _ -> 1487 parse_error t "unexpected-doctype" 1488 | Token.Tag { kind = Token.Start; name = "caption"; attrs; _ } -> 1489 clear_stack_back_to_table_context t; 1490 push_formatting_marker t; 1491 ignore (insert_element t "caption" ~push:true attrs); 1492 t.mode <- Parser_insertion_mode.In_caption 1493 | Token.Tag { kind = Token.Start; name = "colgroup"; attrs; _ } -> 1494 clear_stack_back_to_table_context t; 1495 ignore (insert_element t "colgroup" ~push:true attrs); 1496 t.mode <- Parser_insertion_mode.In_column_group 1497 | Token.Tag { kind = Token.Start; name = "col"; _ } -> 1498 clear_stack_back_to_table_context t; 1499 ignore (insert_element t "colgroup" ~push:true []); 1500 t.mode <- Parser_insertion_mode.In_column_group; 1501 process_token t token 1502 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["tbody"; "tfoot"; "thead"] -> 1503 clear_stack_back_to_table_context t; 1504 ignore (insert_element t name ~push:true attrs); 1505 t.mode <- Parser_insertion_mode.In_table_body 1506 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"; "tr"] -> 1507 clear_stack_back_to_table_context t; 1508 ignore (insert_element t "tbody" ~push:true []); 1509 t.mode <- Parser_insertion_mode.In_table_body; 1510 process_token t token 1511 | Token.Tag { kind = Token.Start; name = "table"; _ } -> 1512 parse_error t "unexpected-start-tag"; 1513 if has_element_in_table_scope t "table" then begin 1514 pop_until_tag t "table"; 1515 reset_insertion_mode t; 1516 process_token t token 1517 end 1518 | Token.Tag { kind = Token.End; name = "table"; _ } -> 1519 if not (has_element_in_table_scope t "table") then 1520 parse_error t "unexpected-end-tag" 1521 else begin 1522 pop_until_tag t "table"; 1523 reset_insertion_mode t 1524 end 1525 | Token.Tag { kind = Token.End; name; _ } 1526 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] -> 1527 parse_error t "unexpected-end-tag" 1528 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["style"; "script"; "template"] -> 1529 process_in_head t token 1530 | Token.Tag { kind = Token.End; name = "template"; _ } -> 1531 process_in_head t token 1532 | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } -> 1533 let is_hidden = List.exists (fun (k, v) -> 1534 String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden" 1535 ) attrs in 1536 if not is_hidden then begin 1537 parse_error t "start-tag-in-table:input"; 1538 t.foster_parenting <- true; 1539 process_in_body t token; 1540 t.foster_parenting <- false 1541 end else begin 1542 parse_error t "start-tag-in-table:input"; 1543 ignore (insert_element t "input" ~push:true attrs); 1544 pop_current t 1545 end 1546 | Token.Tag { kind = Token.Start; name = "form"; attrs; _ } -> 1547 parse_error t "unexpected-start-tag"; 1548 if t.form_element = None && not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then begin 1549 let form = insert_element t "form" attrs in 1550 t.open_elements <- form :: t.open_elements; 1551 t.form_element <- Some form; 1552 pop_current t 1553 end 1554 | Token.EOF -> 1555 process_in_body t token 1556 | _ -> 1557 parse_error t "unexpected-token-in-table"; 1558 t.foster_parenting <- true; 1559 process_in_body t token; 1560 t.foster_parenting <- false 1561 1562and clear_stack_back_to_table_context t = 1563 let rec loop () = 1564 match current_node t with 1565 | Some n when not (List.mem n.Dom.name ["table"; "template"; "html"]) -> 1566 pop_current t; 1567 loop () 1568 | _ -> () 1569 in 1570 loop () 1571 1572and process_in_table_text t token = 1573 match token with 1574 | Token.Character data -> 1575 if String.contains data '\x00' then 1576 parse_error t "unexpected-null-character" 1577 else 1578 t.pending_table_chars <- data :: t.pending_table_chars 1579 | _ -> 1580 let pending = String.concat "" (List.rev t.pending_table_chars) in 1581 t.pending_table_chars <- []; 1582 if not (is_whitespace pending) then begin 1583 parse_error t "unexpected-character-in-table"; 1584 t.foster_parenting <- true; 1585 reconstruct_active_formatting t; 1586 insert_character t pending; 1587 t.foster_parenting <- false 1588 end else 1589 insert_character t pending; 1590 t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_table; 1591 process_token t token 1592 1593and process_in_caption t token = 1594 match token with 1595 | Token.Tag { kind = Token.End; name = "caption"; _ } -> 1596 if not (has_element_in_table_scope t "caption") then 1597 parse_error t "unexpected-end-tag" 1598 else begin 1599 generate_implied_end_tags t (); 1600 (match current_node t with 1601 | Some n when n.Dom.name <> "caption" -> parse_error t "end-tag-too-early" 1602 | _ -> ()); 1603 pop_until_tag t "caption"; 1604 clear_active_formatting_to_marker t; 1605 t.mode <- Parser_insertion_mode.In_table 1606 end 1607 | Token.Tag { kind = Token.Start; name; _ } 1608 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] -> 1609 if not (has_element_in_table_scope t "caption") then 1610 parse_error t "unexpected-start-tag" 1611 else begin 1612 generate_implied_end_tags t (); 1613 pop_until_tag t "caption"; 1614 clear_active_formatting_to_marker t; 1615 t.mode <- Parser_insertion_mode.In_table; 1616 process_token t token 1617 end 1618 | Token.Tag { kind = Token.End; name = "table"; _ } -> 1619 if not (has_element_in_table_scope t "caption") then 1620 parse_error t "unexpected-end-tag" 1621 else begin 1622 generate_implied_end_tags t (); 1623 pop_until_tag t "caption"; 1624 clear_active_formatting_to_marker t; 1625 t.mode <- Parser_insertion_mode.In_table; 1626 process_token t token 1627 end 1628 | Token.Tag { kind = Token.End; name; _ } 1629 when List.mem name ["body"; "col"; "colgroup"; "html"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] -> 1630 parse_error t "unexpected-end-tag" 1631 | _ -> 1632 process_in_body t token 1633 1634and process_in_column_group t token = 1635 match token with 1636 | Token.Character data when is_whitespace data -> 1637 insert_character t data 1638 | Token.Character data -> 1639 (* Split leading whitespace from non-whitespace *) 1640 let ws_chars = [' '; '\t'; '\n'; '\x0C'; '\r'] in 1641 let len = String.length data in 1642 let ws_end = ref 0 in 1643 while !ws_end < len && List.mem data.[!ws_end] ws_chars do incr ws_end done; 1644 if !ws_end > 0 then 1645 insert_character t (String.sub data 0 !ws_end); 1646 if !ws_end < len then begin 1647 let remaining = String.sub data !ws_end (len - !ws_end) in 1648 (match current_node t with 1649 | Some n when n.Dom.name = "colgroup" -> 1650 pop_current t; 1651 t.mode <- Parser_insertion_mode.In_table; 1652 process_token t (Token.Character remaining) 1653 | _ -> 1654 parse_error t "unexpected-token") 1655 end 1656 | Token.Comment data -> 1657 insert_comment t data 1658 | Token.Doctype _ -> 1659 parse_error t "unexpected-doctype" 1660 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 1661 process_in_body t token 1662 | Token.Tag { kind = Token.Start; name = "col"; attrs; _ } -> 1663 ignore (insert_element t "col" ~push:true attrs); 1664 pop_current t 1665 | Token.Tag { kind = Token.End; name = "colgroup"; _ } -> 1666 (match current_node t with 1667 | Some n when n.Dom.name <> "colgroup" -> parse_error t "unexpected-end-tag" 1668 | Some _ -> pop_current t; t.mode <- Parser_insertion_mode.In_table 1669 | None -> parse_error t "unexpected-end-tag") 1670 | Token.Tag { kind = Token.End; name = "col"; _ } -> 1671 parse_error t "unexpected-end-tag" 1672 | Token.Tag { kind = Token.Start; name = "template"; _ } 1673 | Token.Tag { kind = Token.End; name = "template"; _ } -> 1674 process_in_head t token 1675 | Token.EOF -> 1676 process_in_body t token 1677 | _ -> 1678 (match current_node t with 1679 | Some n when n.Dom.name = "colgroup" -> 1680 pop_current t; 1681 t.mode <- Parser_insertion_mode.In_table; 1682 process_token t token 1683 | _ -> 1684 parse_error t "unexpected-token") 1685 1686and process_in_table_body t token = 1687 match token with 1688 | Token.Tag { kind = Token.Start; name = "tr"; attrs; _ } -> 1689 clear_stack_back_to_table_body_context t; 1690 ignore (insert_element t "tr" ~push:true attrs); 1691 t.mode <- Parser_insertion_mode.In_row 1692 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["th"; "td"] -> 1693 parse_error t "unexpected-start-tag"; 1694 clear_stack_back_to_table_body_context t; 1695 ignore (insert_element t "tr" ~push:true []); 1696 t.mode <- Parser_insertion_mode.In_row; 1697 process_token t token 1698 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] -> 1699 if not (has_element_in_table_scope t name) then 1700 parse_error t "unexpected-end-tag" 1701 else begin 1702 clear_stack_back_to_table_body_context t; 1703 pop_current t; 1704 t.mode <- Parser_insertion_mode.In_table 1705 end 1706 | Token.Tag { kind = Token.Start; name; _ } 1707 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"] -> 1708 if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Parser_constants.table_scope ~check_integration_points:false) then 1709 parse_error t "unexpected-start-tag" 1710 else begin 1711 clear_stack_back_to_table_body_context t; 1712 pop_current t; 1713 t.mode <- Parser_insertion_mode.In_table; 1714 process_token t token 1715 end 1716 | Token.Tag { kind = Token.End; name = "table"; _ } -> 1717 if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Parser_constants.table_scope ~check_integration_points:false) then 1718 parse_error t "unexpected-end-tag" 1719 else begin 1720 clear_stack_back_to_table_body_context t; 1721 pop_current t; 1722 t.mode <- Parser_insertion_mode.In_table; 1723 process_token t token 1724 end 1725 | Token.Tag { kind = Token.End; name; _ } 1726 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "td"; "th"; "tr"] -> 1727 parse_error t "unexpected-end-tag" 1728 | _ -> 1729 process_in_table t token 1730 1731and clear_stack_back_to_table_body_context t = 1732 let rec loop () = 1733 match current_node t with 1734 | Some n when not (List.mem n.Dom.name ["tbody"; "tfoot"; "thead"; "template"; "html"]) -> 1735 pop_current t; 1736 loop () 1737 | _ -> () 1738 in 1739 loop () 1740 1741and process_in_row t token = 1742 match token with 1743 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["th"; "td"] -> 1744 clear_stack_back_to_table_row_context t; 1745 ignore (insert_element t name ~push:true attrs); 1746 t.mode <- Parser_insertion_mode.In_cell; 1747 push_formatting_marker t 1748 | Token.Tag { kind = Token.End; name = "tr"; _ } -> 1749 if not (has_element_in_table_scope t "tr") then 1750 parse_error t "unexpected-end-tag" 1751 else begin 1752 clear_stack_back_to_table_row_context t; 1753 pop_current t; 1754 t.mode <- Parser_insertion_mode.In_table_body 1755 end 1756 | Token.Tag { kind = Token.Start; name; _ } 1757 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"; "tr"] -> 1758 if not (has_element_in_table_scope t "tr") then 1759 parse_error t "unexpected-start-tag" 1760 else begin 1761 clear_stack_back_to_table_row_context t; 1762 pop_current t; 1763 t.mode <- Parser_insertion_mode.In_table_body; 1764 process_token t token 1765 end 1766 | Token.Tag { kind = Token.End; name = "table"; _ } -> 1767 if not (has_element_in_table_scope t "tr") then 1768 parse_error t "unexpected-end-tag" 1769 else begin 1770 clear_stack_back_to_table_row_context t; 1771 pop_current t; 1772 t.mode <- Parser_insertion_mode.In_table_body; 1773 process_token t token 1774 end 1775 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] -> 1776 if not (has_element_in_table_scope t name) then 1777 parse_error t "unexpected-end-tag" 1778 else if not (has_element_in_table_scope t "tr") then 1779 parse_error t "unexpected-end-tag" 1780 else begin 1781 clear_stack_back_to_table_row_context t; 1782 pop_current t; 1783 t.mode <- Parser_insertion_mode.In_table_body; 1784 process_token t token 1785 end 1786 | Token.Tag { kind = Token.End; name; _ } 1787 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "td"; "th"] -> 1788 parse_error t "unexpected-end-tag" 1789 | _ -> 1790 process_in_table t token 1791 1792and clear_stack_back_to_table_row_context t = 1793 let rec loop () = 1794 match current_node t with 1795 | Some n when not (List.mem n.Dom.name ["tr"; "template"; "html"]) -> 1796 pop_current t; 1797 loop () 1798 | _ -> () 1799 in 1800 loop () 1801 1802and process_in_cell t token = 1803 match token with 1804 | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_table_cell_element name -> 1805 if not (has_element_in_table_scope t name) then 1806 parse_error t "unexpected-end-tag" 1807 else begin 1808 generate_implied_end_tags t (); 1809 (match current_node t with 1810 | Some n when not (n.Dom.name = name && is_in_html_namespace n) -> parse_error t "end-tag-too-early" 1811 | _ -> ()); 1812 pop_until_html_tag t name; 1813 clear_active_formatting_to_marker t; 1814 t.mode <- Parser_insertion_mode.In_row 1815 end 1816 | Token.Tag { kind = Token.Start; name; _ } 1817 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] -> 1818 if not (has_element_in_scope_impl t ["td"; "th"] Parser_constants.table_scope ~check_integration_points:false) then 1819 parse_error t "unexpected-start-tag" 1820 else begin 1821 close_cell t; 1822 process_token t token 1823 end 1824 | Token.Tag { kind = Token.End; name; _ } 1825 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"] -> 1826 parse_error t "unexpected-end-tag" 1827 | Token.Tag { kind = Token.End; name; _ } 1828 when Parser_constants.is_foster_parenting_element name -> 1829 if not (has_element_in_table_scope t name) then 1830 parse_error t "unexpected-end-tag" 1831 else begin 1832 close_cell t; 1833 process_token t token 1834 end 1835 | _ -> 1836 process_in_body t token 1837 1838and close_cell t = 1839 generate_implied_end_tags t (); 1840 (match current_node t with 1841 | Some n when not (Parser_constants.is_table_cell_element n.Dom.name && is_in_html_namespace n) -> parse_error t "end-tag-too-early" 1842 | _ -> ()); 1843 pop_until_html_one_of t ["td"; "th"]; 1844 clear_active_formatting_to_marker t; 1845 t.mode <- Parser_insertion_mode.In_row 1846 1847and process_in_select t token = 1848 match token with 1849 | Token.Character "\x00" -> 1850 parse_error t "unexpected-null-character" 1851 | Token.Character data -> 1852 reconstruct_active_formatting t; 1853 insert_character t data 1854 | Token.Comment data -> 1855 insert_comment t data 1856 | Token.Doctype _ -> 1857 parse_error t "unexpected-doctype" 1858 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 1859 process_in_body t token 1860 | Token.Tag { kind = Token.Start; name = "option"; attrs; _ } -> 1861 (match current_node t with 1862 | Some n when n.Dom.name = "option" -> pop_current t 1863 | _ -> ()); 1864 reconstruct_active_formatting t; 1865 ignore (insert_element t "option" ~push:true attrs) 1866 | Token.Tag { kind = Token.Start; name = "optgroup"; attrs; _ } -> 1867 (match current_node t with 1868 | Some n when n.Dom.name = "option" -> pop_current t 1869 | _ -> ()); 1870 (match current_node t with 1871 | Some n when n.Dom.name = "optgroup" -> pop_current t 1872 | _ -> ()); 1873 reconstruct_active_formatting t; 1874 ignore (insert_element t "optgroup" ~push:true attrs) 1875 | Token.Tag { kind = Token.Start; name = "hr"; _ } -> 1876 (match current_node t with 1877 | Some n when n.Dom.name = "option" -> pop_current t 1878 | _ -> ()); 1879 (match current_node t with 1880 | Some n when n.Dom.name = "optgroup" -> pop_current t 1881 | _ -> ()); 1882 ignore (insert_element t "hr" ~push:true []); 1883 pop_current t 1884 | Token.Tag { kind = Token.End; name = "optgroup"; _ } -> 1885 (match t.open_elements with 1886 | opt :: optg :: _ when opt.Dom.name = "option" && optg.Dom.name = "optgroup" -> 1887 pop_current t 1888 | _ -> ()); 1889 (match current_node t with 1890 | Some n when n.Dom.name = "optgroup" -> pop_current t 1891 | _ -> parse_error t "unexpected-end-tag") 1892 | Token.Tag { kind = Token.End; name = "option"; _ } -> 1893 (match current_node t with 1894 | Some n when n.Dom.name = "option" -> pop_current t 1895 | _ -> parse_error t "unexpected-end-tag") 1896 | Token.Tag { kind = Token.End; name = "select"; _ } -> 1897 if not (has_element_in_select_scope t "select") then 1898 parse_error t "unexpected-end-tag" 1899 else begin 1900 pop_until_tag t "select"; 1901 reset_insertion_mode t 1902 end 1903 | Token.Tag { kind = Token.Start; name = "select"; _ } -> 1904 parse_error t "unexpected-start-tag"; 1905 (* Per spec: in IN_SELECT mode, select is always on the stack - just pop *) 1906 pop_until_tag t "select"; 1907 reset_insertion_mode t 1908 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["input"; "textarea"] -> 1909 parse_error t "unexpected-start-tag"; 1910 (* Per spec: in IN_SELECT mode, select is always on the stack - just pop *) 1911 pop_until_tag t "select"; 1912 reset_insertion_mode t; 1913 process_token t token 1914 | Token.Tag { kind = Token.Start; name = "plaintext"; attrs; _ } -> 1915 (* plaintext is allowed in select - creates element, parser will switch tokenizer to PLAINTEXT mode *) 1916 reconstruct_active_formatting t; 1917 ignore (insert_element t "plaintext" ~push:true attrs) 1918 | Token.Tag { kind = Token.Start; name = "menuitem"; attrs; _ } -> 1919 (* menuitem is allowed in select *) 1920 reconstruct_active_formatting t; 1921 ignore (insert_element t "menuitem" ~push:true attrs) 1922 | Token.Tag { kind = Token.Start; name = "keygen"; attrs; _ } -> 1923 (* keygen is handled specially in select - inserted directly *) 1924 reconstruct_active_formatting t; 1925 ignore (insert_element t "keygen" attrs) 1926 (* Void element, don't push to stack *) 1927 | Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } -> 1928 reconstruct_active_formatting t; 1929 let node = insert_foreign_element t { kind = Token.Start; name = "svg"; attrs; self_closing } (Some "svg") in 1930 if not self_closing then t.open_elements <- node :: t.open_elements 1931 | Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } -> 1932 reconstruct_active_formatting t; 1933 let node = insert_foreign_element t { kind = Token.Start; name = "math"; attrs; self_closing } (Some "mathml") in 1934 if not self_closing then t.open_elements <- node :: t.open_elements 1935 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["script"; "template"] -> 1936 process_in_head t token 1937 | Token.Tag { kind = Token.End; name = "template"; _ } -> 1938 process_in_head t token 1939 (* Allow certain HTML elements in select - newer spec behavior *) 1940 | Token.Tag { kind = Token.Start; name; attrs; self_closing } when List.mem name ["p"; "div"; "span"; "button"; "datalist"; "selectedcontent"] -> 1941 reconstruct_active_formatting t; 1942 let node = insert_element t name attrs in 1943 if not self_closing then t.open_elements <- node :: t.open_elements 1944 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["br"; "img"] -> 1945 reconstruct_active_formatting t; 1946 ignore (insert_element t name attrs) 1947 (* Don't push to stack - void elements *) 1948 (* Handle formatting elements in select *) 1949 | Token.Tag { kind = Token.Start; name; attrs; _ } when Parser_constants.is_formatting_element name -> 1950 reconstruct_active_formatting t; 1951 let node = insert_element t name ~push:true attrs in 1952 push_formatting_element t node name attrs 1953 | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_formatting_element name -> 1954 (* Find select element and check if formatting element is inside select *) 1955 let select_idx = ref None in 1956 let fmt_idx = ref None in 1957 List.iteri (fun i n -> 1958 if n.Dom.name = "select" && !select_idx = None then select_idx := Some i; 1959 if n.Dom.name = name then fmt_idx := Some i 1960 ) t.open_elements; 1961 (match !fmt_idx, !select_idx with 1962 | Some fi, Some si when fi < si -> 1963 (* Formatting element is inside select, run adoption agency *) 1964 adoption_agency t name 1965 | Some _, Some _ -> 1966 (* Formatting element is outside select boundary - parse error, ignore *) 1967 parse_error t "unexpected-end-tag" 1968 | Some _, None -> 1969 adoption_agency t name 1970 | None, _ -> 1971 parse_error t "unexpected-end-tag") 1972 (* End tags for HTML elements allowed in select *) 1973 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["p"; "div"; "span"; "button"; "datalist"; "selectedcontent"] -> 1974 (* Find select and target indices *) 1975 let select_idx = ref None in 1976 let target_idx = ref None in 1977 List.iteri (fun i n -> 1978 if n.Dom.name = "select" && !select_idx = None then select_idx := Some i; 1979 if n.Dom.name = name then target_idx := Some i 1980 ) t.open_elements; 1981 (* Only pop if target exists and is inside select (lower index = closer to current) *) 1982 (match !target_idx, !select_idx with 1983 | Some ti, Some si when ti < si -> 1984 (* Pop until we reach the target *) 1985 let rec pop_to_target () = 1986 match t.open_elements with 1987 | [] -> () 1988 | n :: rest -> 1989 t.open_elements <- rest; 1990 if n.Dom.name <> name then pop_to_target () 1991 in 1992 pop_to_target () 1993 | Some _, Some _ -> 1994 parse_error t "unexpected-end-tag" 1995 | Some _, None -> 1996 (* No select on stack, just pop to target *) 1997 let rec pop_to_target () = 1998 match t.open_elements with 1999 | [] -> () 2000 | n :: rest -> 2001 t.open_elements <- rest; 2002 if n.Dom.name <> name then pop_to_target () 2003 in 2004 pop_to_target () 2005 | None, _ -> 2006 parse_error t "unexpected-end-tag") 2007 | Token.EOF -> 2008 process_in_body t token 2009 | _ -> 2010 parse_error t "unexpected-token-in-select" 2011 2012and process_in_select_in_table t token = 2013 match token with 2014 | Token.Tag { kind = Token.Start; name; _ } 2015 when List.mem name ["caption"; "table"; "tbody"; "tfoot"; "thead"; "tr"; "td"; "th"] -> 2016 parse_error t "unexpected-start-tag"; 2017 pop_until_tag t "select"; 2018 reset_insertion_mode t; 2019 process_token t token 2020 | Token.Tag { kind = Token.End; name; _ } 2021 when List.mem name ["caption"; "table"; "tbody"; "tfoot"; "thead"; "tr"; "td"; "th"] -> 2022 parse_error t "unexpected-end-tag"; 2023 if has_element_in_table_scope t name then begin 2024 pop_until_tag t "select"; 2025 reset_insertion_mode t; 2026 process_token t token 2027 end 2028 | _ -> 2029 process_in_select t token 2030 2031and process_in_template t token = 2032 match token with 2033 | Token.Character _ | Token.Comment _ | Token.Doctype _ -> 2034 process_in_body t token 2035 | Token.Tag { kind = Token.Start; name; _ } 2036 when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] -> 2037 process_in_head t token 2038 | Token.Tag { kind = Token.End; name = "template"; _ } -> 2039 process_in_head t token 2040 | Token.Tag { kind = Token.Start; name; _ } 2041 when List.mem name ["caption"; "colgroup"; "tbody"; "tfoot"; "thead"] -> 2042 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2043 t.template_modes <- Parser_insertion_mode.In_table :: t.template_modes; 2044 t.mode <- Parser_insertion_mode.In_table; 2045 process_token t token 2046 | Token.Tag { kind = Token.Start; name = "col"; _ } -> 2047 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2048 t.template_modes <- Parser_insertion_mode.In_column_group :: t.template_modes; 2049 t.mode <- Parser_insertion_mode.In_column_group; 2050 process_token t token 2051 | Token.Tag { kind = Token.Start; name = "tr"; _ } -> 2052 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2053 t.template_modes <- Parser_insertion_mode.In_table_body :: t.template_modes; 2054 t.mode <- Parser_insertion_mode.In_table_body; 2055 process_token t token 2056 | Token.Tag { kind = Token.Start; name; _ } when Parser_constants.is_table_cell_element name -> 2057 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2058 t.template_modes <- Parser_insertion_mode.In_row :: t.template_modes; 2059 t.mode <- Parser_insertion_mode.In_row; 2060 process_token t token 2061 | Token.Tag { kind = Token.Start; _ } -> 2062 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2063 t.template_modes <- Parser_insertion_mode.In_body :: t.template_modes; 2064 t.mode <- Parser_insertion_mode.In_body; 2065 process_token t token 2066 | Token.Tag { kind = Token.End; name; _ } -> 2067 parse_error t ("unexpected-end-tag:" ^ name) 2068 | Token.EOF -> 2069 if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then 2070 () (* Stop parsing *) 2071 else begin 2072 parse_error t "expected-closing-tag-but-got-eof"; 2073 pop_until_html_tag t "template"; 2074 clear_active_formatting_to_marker t; 2075 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2076 reset_insertion_mode t; 2077 process_token t token 2078 end 2079 2080and process_after_body t token = 2081 match token with 2082 | Token.Character data when is_whitespace data -> 2083 process_in_body t token 2084 | Token.Comment data -> 2085 (* Insert as last child of html element - html is at bottom of stack *) 2086 let html_opt = List.find_opt (fun n -> n.Dom.name = "html") t.open_elements in 2087 (match html_opt with 2088 | Some html -> 2089 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in 2090 Dom.append_child html (Dom.create_comment ~location data) 2091 | None -> ()) 2092 | Token.Doctype _ -> 2093 parse_error t "unexpected-doctype" 2094 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 2095 process_in_body t token 2096 | Token.Tag { kind = Token.End; name = "html"; _ } -> 2097 if t.fragment_context <> None then 2098 parse_error t "unexpected-end-tag" 2099 else 2100 t.mode <- Parser_insertion_mode.After_after_body 2101 | Token.EOF -> 2102 () (* Stop parsing *) 2103 | _ -> 2104 parse_error t "unexpected-token-after-body"; 2105 t.mode <- Parser_insertion_mode.In_body; 2106 process_token t token 2107 2108and process_in_frameset t token = 2109 match token with 2110 | Token.Character data -> 2111 (* Extract only whitespace characters and insert them *) 2112 let whitespace = String.to_seq data 2113 |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' ']) 2114 |> String.of_seq in 2115 if whitespace <> "" then insert_character t whitespace; 2116 if not (is_whitespace data) then 2117 parse_error t "unexpected-char-in-frameset" 2118 | Token.Comment data -> 2119 insert_comment t data 2120 | Token.Doctype _ -> 2121 parse_error t "unexpected-doctype" 2122 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 2123 process_in_body t token 2124 | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } -> 2125 ignore (insert_element t "frameset" ~push:true attrs) 2126 | Token.Tag { kind = Token.End; name = "frameset"; _ } -> 2127 (match current_node t with 2128 | Some n when n.Dom.name = "html" -> parse_error t "unexpected-end-tag" 2129 | _ -> 2130 pop_current t; 2131 if t.fragment_context = None then 2132 (match current_node t with 2133 | Some n when n.Dom.name <> "frameset" -> t.mode <- Parser_insertion_mode.After_frameset 2134 | _ -> ())) 2135 | Token.Tag { kind = Token.Start; name = "frame"; attrs; _ } -> 2136 ignore (insert_element t "frame" ~push:true attrs); 2137 pop_current t 2138 | Token.Tag { kind = Token.Start; name = "noframes"; _ } -> 2139 process_in_head t token 2140 | Token.EOF -> 2141 (match current_node t with 2142 | Some n when n.Dom.name <> "html" -> parse_error t "expected-closing-tag-but-got-eof" 2143 | _ -> ()) 2144 | _ -> 2145 parse_error t "unexpected-token-in-frameset" 2146 2147and process_after_frameset t token = 2148 match token with 2149 | Token.Character data -> 2150 (* Extract only whitespace characters and insert them *) 2151 let whitespace = String.to_seq data 2152 |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' ']) 2153 |> String.of_seq in 2154 if whitespace <> "" then insert_character t whitespace; 2155 if not (is_whitespace data) then 2156 parse_error t "unexpected-char-after-frameset" 2157 | Token.Comment data -> 2158 insert_comment t data 2159 | Token.Doctype _ -> 2160 parse_error t "unexpected-doctype" 2161 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 2162 process_in_body t token 2163 | Token.Tag { kind = Token.End; name = "html"; _ } -> 2164 t.mode <- Parser_insertion_mode.After_after_frameset 2165 | Token.Tag { kind = Token.Start; name = "noframes"; _ } -> 2166 process_in_head t token 2167 | Token.EOF -> 2168 () (* Stop parsing *) 2169 | _ -> 2170 parse_error t "unexpected-token-after-frameset" 2171 2172and process_after_after_body t token = 2173 match token with 2174 | Token.Comment data -> 2175 insert_comment_to_document t data 2176 | Token.Doctype _ -> 2177 process_in_body t token 2178 | Token.Character data when is_whitespace data -> 2179 process_in_body t token 2180 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 2181 process_in_body t token 2182 | Token.EOF -> 2183 () (* Stop parsing *) 2184 | _ -> 2185 parse_error t "unexpected-token-after-after-body"; 2186 t.mode <- Parser_insertion_mode.In_body; 2187 process_token t token 2188 2189and process_after_after_frameset t token = 2190 match token with 2191 | Token.Comment data -> 2192 insert_comment_to_document t data 2193 | Token.Doctype _ -> 2194 process_in_body t token 2195 | Token.Character data -> 2196 (* Extract only whitespace characters and process using in_body rules *) 2197 let whitespace = String.to_seq data 2198 |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' ']) 2199 |> String.of_seq in 2200 if whitespace <> "" then process_in_body t (Token.Character whitespace); 2201 if not (is_whitespace data) then 2202 parse_error t "unexpected-char-after-after-frameset" 2203 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 2204 process_in_body t token 2205 | Token.EOF -> 2206 () (* Stop parsing *) 2207 | Token.Tag { kind = Token.Start; name = "noframes"; _ } -> 2208 process_in_head t token 2209 | _ -> 2210 parse_error t "unexpected-token-after-after-frameset" 2211 2212and process_token t token = 2213 (* Check for HTML integration points (SVG foreignObject, desc, title) *) 2214 let is_html_integration_point node = 2215 (* SVG foreignObject, desc, and title are always HTML integration points *) 2216 if node.Dom.namespace = Some "svg" && 2217 Parser_constants.is_svg_html_integration node.Dom.name then true 2218 (* annotation-xml is an HTML integration point only with specific encoding values *) 2219 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then 2220 match List.assoc_opt "encoding" node.Dom.attrs with 2221 | Some enc -> 2222 let enc_lower = String.lowercase_ascii enc in 2223 enc_lower = "text/html" || enc_lower = "application/xhtml+xml" 2224 | None -> false 2225 else false 2226 in 2227 (* Check for MathML text integration points *) 2228 let is_mathml_text_integration_point node = 2229 node.Dom.namespace = Some "mathml" && 2230 Parser_constants.is_mathml_text_integration node.Dom.name 2231 in 2232 (* Foreign content handling *) 2233 let in_foreign = 2234 match adjusted_current_node t with 2235 | None -> false 2236 | Some node -> 2237 if is_in_html_namespace node then false 2238 else begin 2239 (* At HTML integration points, characters and start tags (except mglyph/malignmark) use HTML rules *) 2240 if is_html_integration_point node then begin 2241 match token with 2242 | Token.Character _ -> false 2243 | Token.Tag { kind = Token.Start; _ } -> false 2244 | _ -> true 2245 end 2246 (* At MathML text integration points, characters and start tags (except mglyph/malignmark) use HTML rules *) 2247 else if is_mathml_text_integration_point node then begin 2248 match token with 2249 | Token.Character _ -> false 2250 | Token.Tag { kind = Token.Start; name; _ } -> 2251 name = "mglyph" || name = "malignmark" 2252 | _ -> true 2253 end 2254 (* Special case: <svg> inside annotation-xml uses HTML rules (creates svg in svg namespace) *) 2255 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then begin 2256 match token with 2257 | Token.Tag { kind = Token.Start; name; _ } when String.lowercase_ascii name = "svg" -> false 2258 | _ -> true 2259 end 2260 (* Not at integration point - use foreign content rules *) 2261 (* Breakout handling is done inside process_foreign_content *) 2262 else true 2263 end 2264 in 2265 2266 (* Check if at HTML integration point for special table mode handling *) 2267 let at_integration_point = 2268 match adjusted_current_node t with 2269 | Some node -> 2270 is_html_integration_point node || is_mathml_text_integration_point node 2271 | None -> false 2272 in 2273 2274 if in_foreign then 2275 process_foreign_content t token 2276 else if at_integration_point then begin 2277 (* At integration points, check if in table mode without table in scope *) 2278 let is_table_mode = List.mem t.mode [In_table; In_table_body; In_row; In_cell; In_caption; In_column_group] in 2279 let has_table = has_element_in_table_scope t "table" in 2280 if is_table_mode && not has_table then begin 2281 match token with 2282 | Token.Tag { kind = Token.Start; _ } -> 2283 (* Temporarily use IN_BODY for start tags in table mode without table *) 2284 let saved_mode = t.mode in 2285 t.mode <- In_body; 2286 process_by_mode t token; 2287 if t.mode = In_body then t.mode <- saved_mode 2288 | _ -> process_by_mode t token 2289 end else 2290 process_by_mode t token 2291 end else 2292 process_by_mode t token 2293 2294(* Pop foreign elements until HTML or integration point *) 2295and pop_until_html_or_integration_point t = 2296 let is_html_integration_point node = 2297 (* SVG foreignObject, desc, and title are always HTML integration points *) 2298 if node.Dom.namespace = Some "svg" && 2299 Parser_constants.is_svg_html_integration node.Dom.name then true 2300 (* annotation-xml is an HTML integration point only with specific encoding values *) 2301 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then 2302 match List.assoc_opt "encoding" node.Dom.attrs with 2303 | Some enc -> 2304 let enc_lower = String.lowercase_ascii enc in 2305 enc_lower = "text/html" || enc_lower = "application/xhtml+xml" 2306 | None -> false 2307 else false 2308 in 2309 (* Get fragment context element - only for foreign namespace fragment contexts *) 2310 let fragment_context_elem = t.fragment_context_element in 2311 let rec pop () = 2312 match current_node t with 2313 | None -> () 2314 | Some node -> 2315 if is_in_html_namespace node then () 2316 else if is_html_integration_point node then () 2317 (* Don't pop past fragment context element *) 2318 else (match fragment_context_elem with 2319 | Some ctx when node == ctx -> () 2320 | _ -> 2321 pop_current t; 2322 pop ()) 2323 in 2324 pop () 2325 2326(* Foreign breakout elements - these break out of foreign content *) 2327and is_foreign_breakout_element name = 2328 List.mem (String.lowercase_ascii name) 2329 ["b"; "big"; "blockquote"; "body"; "br"; "center"; "code"; "dd"; "div"; "dl"; "dt"; 2330 "em"; "embed"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "head"; "hr"; "i"; "img"; "li"; 2331 "listing"; "menu"; "meta"; "nobr"; "ol"; "p"; "pre"; "ruby"; "s"; "small"; "span"; 2332 "strong"; "strike"; "sub"; "sup"; "table"; "tt"; "u"; "ul"; "var"] 2333 2334and process_foreign_content t token = 2335 match token with 2336 | Token.Character data when String.contains data '\x00' -> 2337 (* Replace NUL characters with U+FFFD replacement character *) 2338 parse_error t "unexpected-null-character"; 2339 let buf = Buffer.create (String.length data) in 2340 let has_non_ws_non_nul = ref false in 2341 String.iter (fun c -> 2342 if c = '\x00' then Buffer.add_string buf "\xEF\xBF\xBD" 2343 else begin 2344 Buffer.add_char buf c; 2345 if not (c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r') then 2346 has_non_ws_non_nul := true 2347 end 2348 ) data; 2349 let replaced = Buffer.contents buf in 2350 insert_character t replaced; 2351 (* Only set frameset_ok to false if there's actual non-whitespace non-NUL content *) 2352 if !has_non_ws_non_nul then t.frameset_ok <- false 2353 | Token.Character data when is_whitespace data -> 2354 insert_character t data 2355 | Token.Character data -> 2356 insert_character t data; 2357 t.frameset_ok <- false 2358 | Token.Comment data -> 2359 insert_comment t data 2360 | Token.Doctype _ -> 2361 parse_error t "unexpected-doctype" 2362 | Token.Tag { kind = Token.Start; name; _ } when is_foreign_breakout_element name -> 2363 (* Breakout from foreign content - pop until HTML or integration point, reprocess in HTML mode *) 2364 parse_error t "unexpected-html-element-in-foreign-content"; 2365 pop_until_html_or_integration_point t; 2366 reset_insertion_mode t; 2367 (* Use process_by_mode to force HTML mode processing and avoid infinite loop *) 2368 process_by_mode t token 2369 | Token.Tag { kind = Token.Start; name = "font"; attrs; _ } 2370 when List.exists (fun (n, _) -> 2371 let n = String.lowercase_ascii n in 2372 n = "color" || n = "face" || n = "size") attrs -> 2373 (* font with color/face/size breaks out of foreign content *) 2374 parse_error t "unexpected-html-element-in-foreign-content"; 2375 pop_until_html_or_integration_point t; 2376 reset_insertion_mode t; 2377 process_by_mode t token 2378 | Token.Tag { kind = Token.Start; name; attrs; self_closing } -> 2379 let name = 2380 match adjusted_current_node t with 2381 | Some n when n.Dom.namespace = Some "svg" -> Parser_constants.adjust_svg_tag_name name 2382 | _ -> name 2383 in 2384 let attrs = 2385 match adjusted_current_node t with 2386 | Some n when n.Dom.namespace = Some "svg" -> 2387 Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs attrs) 2388 | Some n when n.Dom.namespace = Some "mathml" -> 2389 Parser_constants.adjust_mathml_attrs (Parser_constants.adjust_foreign_attrs attrs) 2390 | _ -> Parser_constants.adjust_foreign_attrs attrs 2391 in 2392 let namespace = 2393 match adjusted_current_node t with 2394 | Some n -> n.Dom.namespace 2395 | None -> None 2396 in 2397 let node = insert_element t name ~namespace attrs in 2398 t.open_elements <- node :: t.open_elements; 2399 if self_closing then pop_current t 2400 | Token.Tag { kind = Token.End; name; _ } when List.mem (String.lowercase_ascii name) ["br"; "p"] -> 2401 (* Special case: </br> and </p> end tags trigger breakout from foreign content *) 2402 parse_error t "unexpected-html-element-in-foreign-content"; 2403 pop_until_html_or_integration_point t; 2404 reset_insertion_mode t; 2405 (* Use process_by_mode to force HTML mode processing and avoid infinite loop *) 2406 process_by_mode t token 2407 | Token.Tag { kind = Token.End; name; _ } -> 2408 (* Find matching element per WHATWG spec for foreign content *) 2409 let is_fragment_context n = 2410 match t.fragment_context_element with 2411 | Some ctx -> n == ctx 2412 | None -> false 2413 in 2414 let name_lower = String.lowercase_ascii name in 2415 (* Walk through stack looking for matching element *) 2416 let rec find_and_process first_node idx = function 2417 | [] -> () (* Stack exhausted - ignore tag *) 2418 | n :: rest -> 2419 let node_name_lower = String.lowercase_ascii n.Dom.name in 2420 let is_html = is_in_html_namespace n in 2421 let name_matches = node_name_lower = name_lower in 2422 2423 (* If first node doesn't match tag name, it's a parse error *) 2424 if first_node && not name_matches then 2425 parse_error t "unexpected-end-tag-in-foreign-content"; 2426 2427 (* Check if this node matches the end tag *) 2428 if name_matches then begin 2429 (* Fragment context check *) 2430 if is_fragment_context n then 2431 parse_error t "unexpected-end-tag-in-fragment-context" 2432 (* If matched element is in HTML namespace, reprocess via HTML mode *) 2433 else if is_html then 2434 process_by_mode t token 2435 (* Otherwise it's a foreign element - pop everything from this point up *) 2436 else begin 2437 (* Pop all elements from current down to and including the matched element *) 2438 let rec pop_to_idx current_idx = 2439 if current_idx >= idx then begin 2440 pop_current t; 2441 pop_to_idx (current_idx - 1) 2442 end 2443 in 2444 pop_to_idx (List.length t.open_elements - 1) 2445 end 2446 end 2447 (* If we hit an HTML element that doesn't match, process via HTML mode *) 2448 else if is_html then 2449 process_by_mode t token 2450 (* Continue searching in the stack *) 2451 else 2452 find_and_process false (idx - 1) rest 2453 in 2454 find_and_process true (List.length t.open_elements - 1) t.open_elements 2455 | Token.EOF -> 2456 process_by_mode t token 2457 2458and process_by_mode t token = 2459 match t.mode with 2460 | Parser_insertion_mode.Initial -> process_initial t token 2461 | Parser_insertion_mode.Before_html -> process_before_html t token 2462 | Parser_insertion_mode.Before_head -> process_before_head t token 2463 | Parser_insertion_mode.In_head -> process_in_head t token 2464 | Parser_insertion_mode.In_head_noscript -> process_in_head_noscript t token 2465 | Parser_insertion_mode.After_head -> process_after_head t token 2466 | Parser_insertion_mode.In_body -> process_in_body t token 2467 | Parser_insertion_mode.Text -> process_text t token 2468 | Parser_insertion_mode.In_table -> process_in_table t token 2469 | Parser_insertion_mode.In_table_text -> process_in_table_text t token 2470 | Parser_insertion_mode.In_caption -> process_in_caption t token 2471 | Parser_insertion_mode.In_column_group -> process_in_column_group t token 2472 | Parser_insertion_mode.In_table_body -> process_in_table_body t token 2473 | Parser_insertion_mode.In_row -> process_in_row t token 2474 | Parser_insertion_mode.In_cell -> process_in_cell t token 2475 | Parser_insertion_mode.In_select -> process_in_select t token 2476 | Parser_insertion_mode.In_select_in_table -> process_in_select_in_table t token 2477 | Parser_insertion_mode.In_template -> process_in_template t token 2478 | Parser_insertion_mode.After_body -> process_after_body t token 2479 | Parser_insertion_mode.In_frameset -> process_in_frameset t token 2480 | Parser_insertion_mode.After_frameset -> process_after_frameset t token 2481 | Parser_insertion_mode.After_after_body -> process_after_after_body t token 2482 | Parser_insertion_mode.After_after_frameset -> process_after_after_frameset t token 2483 2484(* Populate selectedcontent elements with content from selected option *) 2485let find_elements name node = 2486 let result = ref [] in 2487 let rec find n = 2488 if n.Dom.name = name then result := n :: !result; 2489 List.iter find n.Dom.children 2490 in 2491 find node; 2492 List.rev !result (* Reverse to maintain document order *) 2493 2494let find_element name node = 2495 let rec find n = 2496 if n.Dom.name = name then Some n 2497 else 2498 List.find_map find n.Dom.children 2499 in 2500 find node 2501 2502let populate_selectedcontent document = 2503 let selects = find_elements "select" document in 2504 List.iter (fun select -> 2505 match find_element "selectedcontent" select with 2506 | None -> () 2507 | Some selectedcontent -> 2508 let options = find_elements "option" select in 2509 if options <> [] then begin 2510 (* Find selected option or use first *) 2511 let selected_option = 2512 match List.find_opt (fun opt -> Dom.has_attr opt "selected") options with 2513 | Some opt -> opt 2514 | None -> List.hd options 2515 in 2516 (* Clone children from selected option to selectedcontent *) 2517 List.iter (fun child -> 2518 let cloned = Dom.clone ~deep:true child in 2519 Dom.append_child selectedcontent cloned 2520 ) selected_option.Dom.children 2521 end 2522 ) selects 2523 2524let finish t = 2525 (* Populate selectedcontent elements *) 2526 populate_selectedcontent t.document; 2527 (* For fragment parsing, remove the html wrapper and promote children *) 2528 if t.fragment_context <> None then begin 2529 match t.document.Dom.children with 2530 | [root] when root.Dom.name = "html" -> 2531 (* Move context element's children to root if applicable *) 2532 (match t.fragment_context_element with 2533 | Some ctx_elem -> 2534 (match ctx_elem.Dom.parent with 2535 | Some p when p == root -> 2536 let ctx_children = ctx_elem.Dom.children in 2537 List.iter (fun child -> 2538 Dom.remove_child ctx_elem child; 2539 Dom.append_child root child 2540 ) ctx_children; 2541 Dom.remove_child root ctx_elem 2542 | _ -> ()) 2543 | None -> ()); 2544 (* Promote root's children to document - preserve order *) 2545 let children_copy = root.Dom.children in 2546 List.iter (fun child -> 2547 Dom.remove_child root child; 2548 Dom.append_child t.document child 2549 ) children_copy; 2550 Dom.remove_child t.document root 2551 | _ -> () 2552 end; 2553 t.document 2554 2555let get_errors t = List.rev t.errors