OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* HTML5 Tree Builder *)
2
3module Dom = Dom
4module Token = Tokenizer_token
5module State = Tokenizer_state
6
7type fragment_context = {
8 tag_name : string;
9 namespace : string option;
10}
11
12type formatting_entry =
13 | Marker
14 | Entry of {
15 name : string;
16 attrs : (string * string) list;
17 node : Dom.node;
18 }
19
20type parse_error = {
21 code : Parse_error_code.t;
22 line : int;
23 column : int;
24}
25
26type t = {
27 mutable document : Dom.node;
28 mutable mode : Parser_insertion_mode.t;
29 mutable original_mode : Parser_insertion_mode.t option;
30 mutable open_elements : Dom.node list;
31 mutable active_formatting : formatting_entry list;
32 mutable head_element : Dom.node option;
33 mutable form_element : Dom.node option;
34 mutable frameset_ok : bool;
35 mutable ignore_lf : bool;
36 mutable foster_parenting : bool;
37 mutable pending_table_chars : string list;
38 mutable template_modes : Parser_insertion_mode.t list;
39 mutable quirks_mode : Dom.quirks_mode;
40 mutable errors : parse_error list;
41 collect_errors : bool;
42 fragment_context : fragment_context option;
43 mutable fragment_context_element : Dom.node option;
44 iframe_srcdoc : bool;
45 mutable current_line : int;
46 mutable current_column : int;
47}
48
49let create ?(collect_errors=false) ?fragment_context ?(iframe_srcdoc=false) () =
50 let is_fragment = fragment_context <> None in
51 let doc = if is_fragment then Dom.create_document_fragment () else Dom.create_document () in
52 let t = {
53 document = doc;
54 mode = Parser_insertion_mode.Initial;
55 original_mode = None;
56 open_elements = [];
57 active_formatting = [];
58 head_element = None;
59 form_element = None;
60 frameset_ok = true;
61 ignore_lf = false;
62 foster_parenting = false;
63 pending_table_chars = [];
64 template_modes = [];
65 quirks_mode = Dom.No_quirks;
66 errors = [];
67 collect_errors;
68 fragment_context;
69 fragment_context_element = None;
70 iframe_srcdoc;
71 current_line = 1;
72 current_column = 1;
73 } in
74 (* Initialize fragment parsing *)
75 (match fragment_context with
76 | Some ctx ->
77 let name = String.lowercase_ascii ctx.tag_name in
78 let ns = ctx.namespace in
79 (* Create html root *)
80 let root = Dom.create_element "html" () in
81 Dom.append_child doc root;
82 t.open_elements <- [root];
83 (* For foreign content contexts, create context element *)
84 (match ns with
85 | Some namespace when namespace <> "html" ->
86 let context_elem = Dom.create_element ctx.tag_name ~namespace:ns () in
87 Dom.append_child root context_elem;
88 t.open_elements <- [context_elem; root];
89 t.fragment_context_element <- Some context_elem
90 | _ -> ());
91 (* Set initial mode based on context *)
92 t.mode <- (
93 if name = "html" then Parser_insertion_mode.Before_head
94 else if Parser_constants.is_table_section_element name && (ns = None || ns = Some "html") then
95 Parser_insertion_mode.In_table_body
96 else if name = "tr" && (ns = None || ns = Some "html") then
97 Parser_insertion_mode.In_row
98 else if Parser_constants.is_table_cell_element name && (ns = None || ns = Some "html") then
99 Parser_insertion_mode.In_cell
100 else if name = "caption" && (ns = None || ns = Some "html") then
101 Parser_insertion_mode.In_caption
102 else if name = "colgroup" && (ns = None || ns = Some "html") then
103 Parser_insertion_mode.In_column_group
104 else if name = "table" && (ns = None || ns = Some "html") then
105 Parser_insertion_mode.In_table
106 else if name = "template" && (ns = None || ns = Some "html") then begin
107 t.template_modes <- [Parser_insertion_mode.In_template];
108 Parser_insertion_mode.In_template
109 end
110 else
111 Parser_insertion_mode.In_body
112 );
113 t.frameset_ok <- false
114 | None -> ());
115 t
116
117(* Position tracking for error reporting *)
118let set_position t ~line ~column =
119 t.current_line <- line;
120 t.current_column <- column
121
122(* Error handling *)
123let parse_error t code =
124 if t.collect_errors then
125 t.errors <- { code = Parse_error_code.of_string code; line = t.current_line; column = t.current_column } :: t.errors
126
127(* Stack helpers *)
128let current_node t =
129 match t.open_elements with
130 | [] -> None
131 | x :: _ -> Some x
132
133let adjusted_current_node t =
134 match t.fragment_context, t.open_elements with
135 | Some ctx, [_] ->
136 (* Fragment case: use context element info *)
137 Some (Dom.create_element ctx.tag_name ~namespace:ctx.namespace ())
138 | _, x :: _ -> Some x
139 | _, [] -> None
140
141let is_in_html_namespace node =
142 node.Dom.namespace = None || node.Dom.namespace = Some "html"
143
144(* Namespace-aware check for "special" elements per WHATWG spec *)
145let is_special_element node =
146 let name = String.lowercase_ascii node.Dom.name in
147 match node.Dom.namespace with
148 | None | Some "html" -> Parser_constants.is_special name
149 | Some "mathml" -> List.mem name ["mi"; "mo"; "mn"; "ms"; "mtext"; "annotation-xml"]
150 | Some "svg" -> List.mem name ["foreignobject"; "desc"; "title"]
151 | _ -> false
152
153let adjusted_current_node_in_html_namespace t =
154 match adjusted_current_node t with
155 | Some node -> is_in_html_namespace node
156 | None -> true
157
158(* Insertion helpers *)
159let appropriate_insertion_place t =
160 match current_node t with
161 | None -> (t.document, None)
162 | Some target ->
163 if t.foster_parenting && Parser_constants.is_foster_parenting_element target.Dom.name then begin
164 (* Foster parenting per WHATWG spec *)
165 (* Step 1: Find last (most recent) template and table in stack *)
166 (* Note: index 0 = top of stack = most recently added *)
167 let last_template_idx = ref None in
168 let last_table_idx = ref None in
169 List.iteri (fun i n ->
170 (* Take first match (most recent = lowest index) *)
171 if n.Dom.name = "template" && !last_template_idx = None then last_template_idx := Some i;
172 if n.Dom.name = "table" && !last_table_idx = None then last_table_idx := Some i
173 ) t.open_elements;
174
175 (* Step 2-3: If last template is more recent than last table (lower index = more recent) *)
176 match !last_template_idx, !last_table_idx with
177 | Some ti, None ->
178 (* No table, use template content *)
179 let template = List.nth t.open_elements ti in
180 (match template.Dom.template_content with
181 | Some tc -> (tc, None)
182 | None -> (template, None))
183 | Some ti, Some tbi when ti < tbi ->
184 (* Template is more recent than table, use template content *)
185 let template = List.nth t.open_elements ti in
186 (match template.Dom.template_content with
187 | Some tc -> (tc, None)
188 | None -> (template, None))
189 | _, Some tbi ->
190 (* Use table's parent as foster parent *)
191 let table = List.nth t.open_elements tbi in
192 (match table.Dom.parent with
193 | Some parent -> (parent, Some table)
194 | None ->
195 (* Step 6: element above table in stack (index + 1 since 0 is top) *)
196 if tbi + 1 < List.length t.open_elements then
197 (List.nth t.open_elements (tbi + 1), None)
198 else
199 (t.document, None))
200 | None, None ->
201 (* No table or template, use document *)
202 (t.document, None)
203 end else begin
204 (* If target is a template, insert into its content document fragment *)
205 match target.Dom.template_content with
206 | Some tc -> (tc, None)
207 | None -> (target, None)
208 end
209
210let insert_element t name ?(namespace=None) ?(push=false) attrs =
211 (* Reset ignore_lf flag - per HTML5 spec, only the immediately next token
212 after pre/textarea/listing should be checked for leading LF *)
213 t.ignore_lf <- false;
214 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in
215 let node = Dom.create_element name ~namespace ~attrs ~location () in
216 let (parent, before) = appropriate_insertion_place t in
217 (match before with
218 | None -> Dom.append_child parent node
219 | Some ref -> Dom.insert_before parent node ref);
220 if push then t.open_elements <- node :: t.open_elements;
221 node
222
223let insert_element_for_token t (tag : Token.tag) =
224 insert_element t tag.name ~push:true tag.attrs
225
226let insert_foreign_element t (tag : Token.tag) namespace =
227 let attrs =
228 if namespace = Some "svg" then
229 Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs tag.attrs)
230 else
231 Parser_constants.adjust_foreign_attrs tag.attrs
232 in
233 let name =
234 if namespace = Some "svg" then Parser_constants.adjust_svg_tag_name tag.name
235 else tag.name
236 in
237 let node = insert_element t name ~namespace attrs in
238 t.open_elements <- node :: t.open_elements;
239 node
240
241let insert_character t data =
242 if t.ignore_lf && String.length data > 0 && data.[0] = '\n' then begin
243 t.ignore_lf <- false;
244 if String.length data > 1 then begin
245 let rest = String.sub data 1 (String.length data - 1) in
246 let (parent, before) = appropriate_insertion_place t in
247 Dom.insert_text_at parent rest before
248 end
249 end else begin
250 t.ignore_lf <- false;
251 let (parent, before) = appropriate_insertion_place t in
252 Dom.insert_text_at parent data before
253 end
254
255let insert_comment t data =
256 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in
257 let node = Dom.create_comment ~location data in
258 let (parent, _) = appropriate_insertion_place t in
259 Dom.append_child parent node
260
261let insert_comment_to_document t data =
262 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in
263 let node = Dom.create_comment ~location data in
264 Dom.append_child t.document node
265
266(* Stack manipulation *)
267let pop_current t =
268 match t.open_elements with
269 | [] -> ()
270 | _ :: rest -> t.open_elements <- rest
271
272let pop_until t pred =
273 let rec loop () =
274 match t.open_elements with
275 | [] -> ()
276 | x :: rest ->
277 t.open_elements <- rest;
278 if not (pred x) then loop ()
279 in
280 loop ()
281
282let pop_until_tag t name =
283 pop_until t (fun n -> n.Dom.name = name)
284
285(* Pop until HTML namespace element with given name *)
286let pop_until_html_tag t name =
287 pop_until t (fun n -> n.Dom.name = name && is_in_html_namespace n)
288
289let pop_until_one_of t names =
290 pop_until t (fun n -> List.mem n.Dom.name names)
291
292(* Pop until HTML namespace element with one of given names *)
293let pop_until_html_one_of t names =
294 pop_until t (fun n -> List.mem n.Dom.name names && is_in_html_namespace n)
295
296(* Check if element is an HTML integration point *)
297let is_html_integration_point node =
298 (* SVG foreignObject, desc, and title are always HTML integration points *)
299 if node.Dom.namespace = Some "svg" &&
300 Parser_constants.is_svg_html_integration node.Dom.name then true
301 (* annotation-xml is an HTML integration point only with specific encoding values *)
302 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
303 match List.assoc_opt "encoding" node.Dom.attrs with
304 | Some enc ->
305 let enc_lower = String.lowercase_ascii enc in
306 enc_lower = "text/html" || enc_lower = "application/xhtml+xml"
307 | None -> false
308 else false
309
310(* Check if element is a MathML text integration point *)
311let is_mathml_text_integration_point node =
312 node.Dom.namespace = Some "mathml" &&
313 Parser_constants.is_mathml_text_integration node.Dom.name
314
315(* Scope checks - integration points also terminate scope (except for table scope) *)
316(* Per WHATWG spec, scope checks only consider HTML namespace elements for the target names *)
317let has_element_in_scope_impl t names exclude_list ~check_integration_points =
318 let rec check = function
319 | [] -> false
320 | n :: rest ->
321 (* Target elements must be in HTML namespace *)
322 if is_in_html_namespace n && List.mem n.Dom.name names then true
323 else if is_in_html_namespace n && List.mem n.Dom.name exclude_list then false
324 (* Integration points terminate scope (unless we're checking table scope) *)
325 else if check_integration_points && (is_html_integration_point n || is_mathml_text_integration_point n) then false
326 else check rest
327 in
328 check t.open_elements
329
330let has_element_in_scope t name =
331 has_element_in_scope_impl t [name] Parser_constants.default_scope ~check_integration_points:true
332
333let has_element_in_button_scope t name =
334 has_element_in_scope_impl t [name] Parser_constants.button_scope ~check_integration_points:true
335
336let has_element_in_list_item_scope t name =
337 has_element_in_scope_impl t [name] Parser_constants.list_item_scope ~check_integration_points:true
338
339let has_element_in_table_scope t name =
340 has_element_in_scope_impl t [name] Parser_constants.table_scope ~check_integration_points:false
341
342let has_element_in_select_scope t name =
343 let rec check = function
344 | [] -> false
345 | n :: rest ->
346 if n.Dom.name = name then true
347 else if not (Parser_constants.is_select_scope_exclude n.Dom.name) then false
348 else check rest
349 in
350 check t.open_elements
351
352(* Implied end tags *)
353let generate_implied_end_tags t ?except () =
354 let rec loop () =
355 match current_node t with
356 | Some n when Parser_constants.is_implied_end_tag n.Dom.name ->
357 (match except with
358 | Some ex when n.Dom.name = ex -> ()
359 | _ -> pop_current t; loop ())
360 | _ -> ()
361 in
362 loop ()
363
364let generate_all_implied_end_tags t =
365 let rec loop () =
366 match current_node t with
367 | Some n when Parser_constants.is_thoroughly_implied_end_tag n.Dom.name ->
368 pop_current t; loop ()
369 | _ -> ()
370 in
371 loop ()
372
373(* Active formatting elements *)
374let push_formatting_marker t =
375 t.active_formatting <- Marker :: t.active_formatting
376
377let push_formatting_element t node name attrs =
378 (* Noah's Ark: remove earlier identical elements (up to 3) *)
379 let rec count_and_remove same acc = function
380 | [] -> List.rev acc
381 | Marker :: rest -> List.rev acc @ (Marker :: rest)
382 | Entry e :: rest when e.name = name && e.attrs = attrs ->
383 if same >= 2 then
384 count_and_remove same acc rest (* Remove this one *)
385 else
386 count_and_remove (same + 1) (Entry e :: acc) rest
387 | x :: rest -> count_and_remove same (x :: acc) rest
388 in
389 t.active_formatting <- count_and_remove 0 [] t.active_formatting;
390 t.active_formatting <- Entry { name; attrs; node } :: t.active_formatting
391
392let clear_active_formatting_to_marker t =
393 let rec loop = function
394 | [] -> []
395 | Marker :: rest -> rest
396 | _ :: rest -> loop rest
397 in
398 t.active_formatting <- loop t.active_formatting
399
400let reconstruct_active_formatting t =
401 let rec find_to_reconstruct acc = function
402 | [] -> acc
403 | Marker :: _ -> acc
404 | Entry e :: rest ->
405 if List.exists (fun n -> n == e.node) t.open_elements then acc
406 else find_to_reconstruct (Entry e :: acc) rest
407 in
408 let to_reconstruct = find_to_reconstruct [] t.active_formatting in
409 List.iter (fun entry ->
410 match entry with
411 | Entry e ->
412 let node = insert_element t e.name e.attrs in
413 t.open_elements <- node :: t.open_elements;
414 (* Update the entry to point to new node *)
415 t.active_formatting <- List.map (fun x ->
416 if x == entry then Entry { e with node }
417 else x
418 ) t.active_formatting
419 | Marker -> ()
420 ) to_reconstruct
421
422(* Adoption agency algorithm - follows WHATWG spec *)
423let adoption_agency t tag_name =
424 (* Step 1: If current node is subject and not in active formatting list, just pop *)
425 (match current_node t with
426 | Some n when n.Dom.name = tag_name ->
427 let in_active = List.exists (function
428 | Entry e -> e.name = tag_name
429 | Marker -> false
430 ) t.active_formatting in
431 if not in_active then begin
432 pop_current t;
433 () (* Return early - this case is handled *)
434 end
435 | _ -> ());
436
437 (* Step 2: Outer loop *)
438 let outer_loop_counter = ref 0 in
439 let done_flag = ref false in
440
441 while !outer_loop_counter < 8 && not !done_flag do
442 incr outer_loop_counter;
443
444 (* Step 3: Find formatting element in active formatting list *)
445 let rec find_formatting_index idx = function
446 | [] -> None
447 | Marker :: _ -> None
448 | Entry e :: rest ->
449 if e.name = tag_name then Some (idx, e.node, e.attrs)
450 else find_formatting_index (idx + 1) rest
451 in
452 let formatting_entry = find_formatting_index 0 t.active_formatting in
453
454 match formatting_entry with
455 | None ->
456 (* No formatting element found - done *)
457 done_flag := true
458 | Some (fmt_idx, fmt_node, fmt_attrs) ->
459
460 (* Step 4: Check if formatting element is in open elements *)
461 if not (List.exists (fun n -> n == fmt_node) t.open_elements) then begin
462 parse_error t "adoption-agency-1.2";
463 t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting;
464 done_flag := true
465 end
466 (* Step 5: Check if formatting element is in scope *)
467 else if not (has_element_in_scope t tag_name) then begin
468 parse_error t "adoption-agency-1.3";
469 done_flag := true
470 end else begin
471 (* Step 6: Parse error if not current node *)
472 (match current_node t with
473 | Some n when n != fmt_node -> parse_error t "adoption-agency-1.3"
474 | _ -> ());
475
476 (* Step 7: Find furthest block - first special element BELOW formatting element *)
477 (* open_elements is [current(top)...html(bottom)], formatting element is somewhere in the middle *)
478 (* We need the first special element going from formatting element toward current *)
479 (* This is the "topmost" (closest to formatting element) special element that is "lower" (closer to current) *)
480 let fmt_stack_idx = ref (-1) in
481 List.iteri (fun i n -> if n == fmt_node then fmt_stack_idx := i) t.open_elements;
482 let furthest_block =
483 if !fmt_stack_idx <= 0 then None
484 else begin
485 (* Look from fmt_stack_idx-1 down to 0, find first special element *)
486 let rec find_from_idx idx =
487 if idx < 0 then None
488 else
489 let n = List.nth t.open_elements idx in
490 if is_special_element n then Some n
491 else find_from_idx (idx - 1)
492 in
493 find_from_idx (!fmt_stack_idx - 1)
494 end
495 in
496
497 match furthest_block with
498 | None ->
499 (* Step 8: No furthest block - pop elements including formatting element *)
500 pop_until t (fun n -> n == fmt_node);
501 t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting;
502 done_flag := true
503
504 | Some fb ->
505 (* Step 9: Let common ancestor be element immediately above formatting element *)
506 let rec find_common_ancestor = function
507 | [] -> None
508 | n :: rest when n == fmt_node ->
509 (match rest with x :: _ -> Some x | [] -> None)
510 | _ :: rest -> find_common_ancestor rest
511 in
512 let common_ancestor = find_common_ancestor t.open_elements in
513
514 (* Step 10: Bookmark starts after formatting element *)
515 let bookmark = ref (fmt_idx + 1) in
516
517 (* Step 11: Let last_node = furthest block *)
518 let last_node = ref fb in
519
520 (* Step 12: Inner loop *)
521 (* The inner loop processes elements between furthest_block and formatting_element,
522 removing non-formatting elements and reparenting formatting elements *)
523 let inner_loop_counter = ref 0 in
524
525 (* Get index of furthest block in open elements *)
526 let fb_idx = ref 0 in
527 List.iteri (fun i n -> if n == fb then fb_idx := i) t.open_elements;
528
529 (* Start from element after furthest block (toward formatting element) *)
530 let node_idx = ref (!fb_idx + 1) in
531
532 while !node_idx < List.length t.open_elements &&
533 (List.nth t.open_elements !node_idx) != fmt_node do
534 incr inner_loop_counter;
535 let current_node = List.nth t.open_elements !node_idx in
536
537 (* Step 12.3: Find node in active formatting list *)
538 let rec find_node_in_formatting idx = function
539 | [] -> None
540 | Entry e :: _rest when e.node == current_node -> Some idx
541 | _ :: rest -> find_node_in_formatting (idx + 1) rest
542 in
543 let node_fmt_idx = find_node_in_formatting 0 t.active_formatting in
544
545 (* Step 12.4: If inner loop counter > 3 and node in active formatting, remove it *)
546 let node_fmt_idx =
547 match node_fmt_idx with
548 | Some idx when !inner_loop_counter > 3 ->
549 t.active_formatting <- List.filteri (fun i _ -> i <> idx) t.active_formatting;
550 if idx < !bookmark then decr bookmark;
551 None
552 | x -> x
553 in
554
555 (* Step 12.5: If node not in active formatting, remove from stack and continue *)
556 match node_fmt_idx with
557 | None ->
558 (* Remove from stack - this shifts indices *)
559 t.open_elements <- List.filteri (fun i _ -> i <> !node_idx) t.open_elements
560 (* Don't increment node_idx since we removed an element *)
561
562 | Some af_idx ->
563 (* Step 12.6: Create new element for node *)
564 let (node_name, node_attrs) = match List.nth t.active_formatting af_idx with
565 | Entry e -> (e.name, e.attrs)
566 | Marker -> failwith "unexpected marker"
567 in
568 let new_node_elem = Dom.create_element node_name ~attrs:node_attrs () in
569
570 (* Update active formatting with new node *)
571 t.active_formatting <- List.mapi (fun i entry ->
572 if i = af_idx then Entry { name = node_name; node = new_node_elem; attrs = node_attrs }
573 else entry
574 ) t.active_formatting;
575
576 (* Replace node in open elements *)
577 t.open_elements <- List.mapi (fun i n ->
578 if i = !node_idx then new_node_elem else n
579 ) t.open_elements;
580
581 (* Step 12.7: If last_node is furthest block, update bookmark *)
582 if !last_node == fb then
583 bookmark := af_idx + 1;
584
585 (* Step 12.8: Reparent last_node to new node *)
586 (match !last_node.Dom.parent with
587 | Some p -> Dom.remove_child p !last_node
588 | None -> ());
589 Dom.append_child new_node_elem !last_node;
590
591 (* Step 12.9: Let last_node = new node *)
592 last_node := new_node_elem;
593
594 (* Move to next element *)
595 incr node_idx
596 done;
597
598 (* Step 13: Insert last_node into common ancestor *)
599 (match common_ancestor with
600 | Some ca ->
601 (match !last_node.Dom.parent with
602 | Some p -> Dom.remove_child p !last_node
603 | None -> ());
604 (* Check if we need foster parenting *)
605 if t.foster_parenting && Parser_constants.is_foster_parenting_element ca.Dom.name then begin
606 (* Find table and insert before it *)
607 let rec find_table = function
608 | [] -> None
609 | n :: rest when n.Dom.name = "table" -> Some (n, rest)
610 | _ :: rest -> find_table rest
611 in
612 match find_table t.open_elements with
613 | Some (table, _) ->
614 (match table.Dom.parent with
615 | Some parent -> Dom.insert_before parent !last_node table
616 | None -> Dom.append_child ca !last_node)
617 | None -> Dom.append_child ca !last_node
618 end else begin
619 (* If common ancestor is template, insert into its content *)
620 match ca.Dom.template_content with
621 | Some tc -> Dom.append_child tc !last_node
622 | None -> Dom.append_child ca !last_node
623 end
624 | None -> ());
625
626 (* Step 14: Create new formatting element *)
627 let new_formatting = Dom.create_element tag_name ~attrs:fmt_attrs () in
628
629 (* Step 15: Move children of furthest block to new formatting element *)
630 let fb_children = fb.Dom.children in
631 List.iter (fun child ->
632 Dom.remove_child fb child;
633 Dom.append_child new_formatting child
634 ) fb_children;
635
636 (* Step 16: Append new formatting element to furthest block *)
637 Dom.append_child fb new_formatting;
638
639 (* Step 17: Remove old from active formatting, insert new at bookmark *)
640 let new_entry = Entry { name = tag_name; node = new_formatting; attrs = fmt_attrs } in
641 t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting;
642 (* Adjust bookmark since we removed an element *)
643 let adjusted_bookmark = if fmt_idx < !bookmark then !bookmark - 1 else !bookmark in
644 let rec insert_at_bookmark idx acc = function
645 | [] -> List.rev (new_entry :: acc)
646 | x :: rest when idx = adjusted_bookmark ->
647 List.rev_append acc (new_entry :: x :: rest)
648 | x :: rest -> insert_at_bookmark (idx + 1) (x :: acc) rest
649 in
650 t.active_formatting <- insert_at_bookmark 0 [] t.active_formatting;
651
652 (* Step 18: Remove formatting element from open elements, insert new after furthest block *)
653 (* "After" in stack terms means new_formatting should be between fb and current node *)
654 (* In our list orientation (current at index 0), this means new_formatting at lower index than fb *)
655 t.open_elements <- List.filter (fun n -> n != fmt_node) t.open_elements;
656 (* Find fb and insert new_formatting before it (lower index = closer to current) *)
657 let rec insert_before acc = function
658 | [] -> List.rev (new_formatting :: acc)
659 | n :: rest when n == fb ->
660 (* Insert new_formatting before fb: acc reversed, then new_formatting, then fb, then rest *)
661 List.rev_append acc (new_formatting :: n :: rest)
662 | n :: rest -> insert_before (n :: acc) rest
663 in
664 t.open_elements <- insert_before [] t.open_elements
665 (* Continue outer loop *)
666 end
667 done
668
669(* Close p element *)
670let close_p_element t =
671 generate_implied_end_tags t ~except:"p" ();
672 (match current_node t with
673 | Some n when n.Dom.name <> "p" -> parse_error t "end-tag-p-implied-but-open-elements"
674 | _ -> ());
675 pop_until_tag t "p"
676
677(* Reset insertion mode *)
678let reset_insertion_mode t =
679 let rec check_node last = function
680 | [] -> t.mode <- Parser_insertion_mode.In_body
681 | node :: rest ->
682 let is_last = rest = [] in
683 let node_to_check =
684 if is_last then
685 match t.fragment_context with
686 | Some ctx -> Dom.create_element ctx.tag_name ~namespace:ctx.namespace ()
687 | None -> node
688 else node
689 in
690 let name = node_to_check.Dom.name in
691 if name = "select" then begin
692 if not is_last then begin
693 let rec find_table_or_template = function
694 | [] -> ()
695 | n :: rest ->
696 if n.Dom.name = "template" then t.mode <- Parser_insertion_mode.In_select
697 else if n.Dom.name = "table" then t.mode <- Parser_insertion_mode.In_select_in_table
698 else find_table_or_template rest
699 in
700 find_table_or_template rest
701 end;
702 if t.mode <> Parser_insertion_mode.In_select_in_table then
703 t.mode <- Parser_insertion_mode.In_select
704 end else if Parser_constants.is_table_cell_element name && not is_last then
705 t.mode <- Parser_insertion_mode.In_cell
706 else if name = "tr" then
707 t.mode <- Parser_insertion_mode.In_row
708 else if Parser_constants.is_table_section_element name then
709 t.mode <- Parser_insertion_mode.In_table_body
710 else if name = "caption" then
711 t.mode <- Parser_insertion_mode.In_caption
712 else if name = "colgroup" then
713 t.mode <- Parser_insertion_mode.In_column_group
714 else if name = "table" then
715 t.mode <- Parser_insertion_mode.In_table
716 else if name = "template" then
717 t.mode <- (match t.template_modes with m :: _ -> m | [] -> Parser_insertion_mode.In_template)
718 else if name = "head" && not is_last then
719 t.mode <- Parser_insertion_mode.In_head
720 else if name = "body" then
721 t.mode <- Parser_insertion_mode.In_body
722 else if name = "frameset" then
723 t.mode <- Parser_insertion_mode.In_frameset
724 else if name = "html" then
725 t.mode <- (if t.head_element = None then Parser_insertion_mode.Before_head else Parser_insertion_mode.After_head)
726 else if is_last then
727 t.mode <- Parser_insertion_mode.In_body
728 else
729 check_node last rest
730 in
731 check_node false t.open_elements
732
733let is_whitespace s =
734 let ws = [' '; '\t'; '\n'; '\x0C'; '\r'] in
735 String.for_all (fun c -> List.mem c ws) s
736
737(* Mode handlers *)
738let rec process_initial t token =
739 match token with
740 | Token.Character data when is_whitespace data -> ()
741 | Token.Comment data -> insert_comment_to_document t data
742 | Token.Doctype dt ->
743 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in
744 let node = Dom.create_doctype ?name:dt.name ?public_id:dt.public_id ?system_id:dt.system_id ~location () in
745 Dom.append_child t.document node;
746 (* Quirks mode detection *)
747 if dt.force_quirks then
748 t.quirks_mode <- Dom.Quirks
749 else if dt.name <> Some "html" then
750 t.quirks_mode <- Dom.Quirks
751 else begin
752 let pub = Option.map String.lowercase_ascii dt.public_id in
753 let sys = Option.map String.lowercase_ascii dt.system_id in
754 let is_quirky =
755 (match pub with
756 | Some p -> List.mem p Parser_constants.quirky_public_matches ||
757 List.exists (fun prefix -> String.length p >= String.length prefix &&
758 String.sub p 0 (String.length prefix) = prefix) Parser_constants.quirky_public_prefixes
759 | None -> false) ||
760 (match sys with
761 | Some s -> List.mem s Parser_constants.quirky_system_matches
762 | None -> false)
763 in
764 if is_quirky then t.quirks_mode <- Dom.Quirks
765 else begin
766 let is_limited_quirky =
767 match pub with
768 | Some p -> List.exists (fun prefix -> String.length p >= String.length prefix &&
769 String.sub p 0 (String.length prefix) = prefix)
770 Parser_constants.limited_quirky_public_prefixes
771 | None -> false
772 in
773 if is_limited_quirky then t.quirks_mode <- Dom.Limited_quirks
774 end
775 end;
776 t.mode <- Parser_insertion_mode.Before_html
777 | _ ->
778 parse_error t "expected-doctype-but-got-other";
779 t.quirks_mode <- Dom.Quirks;
780 t.mode <- Parser_insertion_mode.Before_html;
781 process_token t token
782
783and process_before_html t token =
784 match token with
785 | Token.Doctype _ -> parse_error t "unexpected-doctype"
786 | Token.Comment data -> insert_comment_to_document t data
787 | Token.Character data when is_whitespace data -> ()
788 | Token.Tag { kind = Token.Start; name = "html"; attrs; _ } ->
789 let html = insert_element t "html" attrs in
790 t.open_elements <- [html];
791 t.mode <- Parser_insertion_mode.Before_head
792 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] ->
793 let html = insert_element t "html" [] in
794 t.open_elements <- [html];
795 t.mode <- Parser_insertion_mode.Before_head;
796 process_token t token
797 | Token.Tag { kind = Token.End; name; _ } ->
798 parse_error t ("unexpected-end-tag:" ^ name)
799 | _ ->
800 let html = insert_element t "html" [] in
801 t.open_elements <- [html];
802 t.mode <- Parser_insertion_mode.Before_head;
803 process_token t token
804
805and process_before_head t token =
806 match token with
807 | Token.Character data when is_whitespace data -> ()
808 | Token.Comment data -> insert_comment t data
809 | Token.Doctype _ -> parse_error t "unexpected-doctype"
810 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
811 process_in_body t token
812 | Token.Tag { kind = Token.Start; name = "head"; attrs; _ } ->
813 let head = insert_element t "head" attrs in
814 t.open_elements <- head :: t.open_elements;
815 t.head_element <- Some head;
816 t.mode <- Parser_insertion_mode.In_head
817 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] ->
818 let head = insert_element t "head" [] in
819 t.open_elements <- head :: t.open_elements;
820 t.head_element <- Some head;
821 t.mode <- Parser_insertion_mode.In_head;
822 process_token t token
823 | Token.Tag { kind = Token.End; name; _ } ->
824 parse_error t ("unexpected-end-tag:" ^ name)
825 | _ ->
826 let head = insert_element t "head" [] in
827 t.open_elements <- head :: t.open_elements;
828 t.head_element <- Some head;
829 t.mode <- Parser_insertion_mode.In_head;
830 process_token t token
831
832and process_in_head t token =
833 match token with
834 | Token.Character data when is_whitespace data ->
835 insert_character t data
836 | Token.Character data ->
837 (* Extract leading whitespace *)
838 let rec count_leading_ws i =
839 if i >= String.length data then i
840 else match data.[i] with
841 | '\t' | '\n' | '\x0C' | '\r' | ' ' -> count_leading_ws (i + 1)
842 | _ -> i
843 in
844 let ws_count = count_leading_ws 0 in
845 let leading_ws = String.sub data 0 ws_count in
846 let remaining = String.sub data ws_count (String.length data - ws_count) in
847 (* If there's leading whitespace and current element has children, insert it *)
848 if leading_ws <> "" then
849 (match current_node t with
850 | Some n when n.Dom.children <> [] -> insert_character t leading_ws
851 | _ -> ());
852 pop_current t;
853 t.mode <- Parser_insertion_mode.After_head;
854 process_token t (Token.Character remaining)
855 | Token.Comment data ->
856 insert_comment t data
857 | Token.Doctype _ ->
858 parse_error t "unexpected-doctype"
859 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
860 process_in_body t token
861 | Token.Tag { kind = Token.Start; name; attrs; _ }
862 when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"] ->
863 ignore (insert_element t name attrs)
864 | Token.Tag { kind = Token.Start; name = "title"; attrs; self_closing } ->
865 ignore (insert_element_for_token t { kind = Token.Start; name = "title"; attrs; self_closing });
866 t.original_mode <- Some t.mode;
867 t.mode <- Parser_insertion_mode.Text
868 | Token.Tag { kind = Token.Start; name; attrs; self_closing }
869 when List.mem name ["noframes"; "style"] ->
870 ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing });
871 t.original_mode <- Some t.mode;
872 t.mode <- Parser_insertion_mode.Text
873 | Token.Tag { kind = Token.Start; name = "noscript"; attrs; self_closing } ->
874 (* Scripting is disabled: parse noscript content as HTML *)
875 ignore (insert_element_for_token t { kind = Token.Start; name = "noscript"; attrs; self_closing });
876 t.mode <- Parser_insertion_mode.In_head_noscript
877 | Token.Tag { kind = Token.Start; name = "script"; attrs; self_closing } ->
878 ignore (insert_element_for_token t { kind = Token.Start; name = "script"; attrs; self_closing });
879 t.original_mode <- Some t.mode;
880 t.mode <- Parser_insertion_mode.Text
881 | Token.Tag { kind = Token.End; name = "head"; _ } ->
882 pop_current t;
883 t.mode <- Parser_insertion_mode.After_head
884 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] ->
885 pop_current t;
886 t.mode <- Parser_insertion_mode.After_head;
887 process_token t token
888 | Token.Tag { kind = Token.Start; name = "template"; attrs; _ } ->
889 let node = Dom.create_template ~attrs () in
890 let (parent, _) = appropriate_insertion_place t in
891 Dom.append_child parent node;
892 t.open_elements <- node :: t.open_elements;
893 push_formatting_marker t;
894 t.frameset_ok <- false;
895 t.mode <- Parser_insertion_mode.In_template;
896 t.template_modes <- Parser_insertion_mode.In_template :: t.template_modes
897 | Token.Tag { kind = Token.End; name = "template"; _ } ->
898 if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then
899 parse_error t "unexpected-end-tag"
900 else begin
901 generate_all_implied_end_tags t;
902 (match current_node t with
903 | Some n when not (n.Dom.name = "template" && is_in_html_namespace n) -> parse_error t "unexpected-end-tag"
904 | _ -> ());
905 pop_until_html_tag t "template";
906 clear_active_formatting_to_marker t;
907 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
908 reset_insertion_mode t
909 end
910 | Token.Tag { kind = Token.Start; name = "head"; _ } ->
911 parse_error t "unexpected-start-tag"
912 | Token.Tag { kind = Token.End; name; _ } ->
913 parse_error t ("unexpected-end-tag:" ^ name)
914 | _ ->
915 pop_current t;
916 t.mode <- Parser_insertion_mode.After_head;
917 process_token t token
918
919and process_in_head_noscript t token =
920 match token with
921 | Token.Character data when is_whitespace data ->
922 process_in_head t token
923 | Token.Character _ ->
924 parse_error t "unexpected-char-in-noscript";
925 pop_current t; (* Pop noscript *)
926 t.mode <- Parser_insertion_mode.In_head;
927 process_token t token
928 | Token.Comment _ ->
929 process_in_head t token
930 | Token.Doctype _ ->
931 parse_error t "unexpected-doctype"
932 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
933 process_in_body t token
934 | Token.Tag { kind = Token.Start; name; _ }
935 when List.mem name ["basefont"; "bgsound"; "link"; "meta"; "noframes"; "style"] ->
936 process_in_head t token
937 | Token.Tag { kind = Token.Start; name; _ }
938 when List.mem name ["head"; "noscript"] ->
939 parse_error t "unexpected-start-tag"
940 | Token.Tag { kind = Token.Start; name; _ } ->
941 parse_error t ("bad-start-tag-in-head-noscript:" ^ name);
942 pop_current t; (* Pop noscript *)
943 t.mode <- Parser_insertion_mode.In_head;
944 process_token t token
945 | Token.Tag { kind = Token.End; name = "noscript"; _ } ->
946 pop_current t; (* Pop noscript *)
947 t.mode <- Parser_insertion_mode.In_head
948 | Token.Tag { kind = Token.End; name = "br"; _ } ->
949 parse_error t "unexpected-end-tag";
950 pop_current t; (* Pop noscript *)
951 t.mode <- Parser_insertion_mode.In_head;
952 process_token t token
953 | Token.Tag { kind = Token.End; name; _ } ->
954 parse_error t ("unexpected-end-tag:" ^ name)
955 | Token.EOF ->
956 parse_error t "expected-closing-tag-but-got-eof";
957 pop_current t; (* Pop noscript *)
958 t.mode <- Parser_insertion_mode.In_head;
959 process_token t token
960
961and process_after_head t token =
962 match token with
963 | Token.Character data when is_whitespace data ->
964 insert_character t data
965 | Token.Comment data ->
966 insert_comment t data
967 | Token.Doctype _ ->
968 parse_error t "unexpected-doctype"
969 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
970 process_in_body t token
971 | Token.Tag { kind = Token.Start; name = "body"; attrs; _ } ->
972 ignore (insert_element t "body" ~push:true attrs);
973 t.frameset_ok <- false;
974 t.mode <- Parser_insertion_mode.In_body
975 | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } ->
976 ignore (insert_element t "frameset" ~push:true attrs);
977 t.mode <- Parser_insertion_mode.In_frameset
978 | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } ->
979 (* Special handling for input type="hidden" - parse error, ignore *)
980 let input_type = List.assoc_opt "type" attrs in
981 (match input_type with
982 | Some typ when String.lowercase_ascii typ = "hidden" ->
983 parse_error t "unexpected-hidden-input-after-head"
984 | _ ->
985 (* Non-hidden input creates body *)
986 let body = insert_element t "body" [] in
987 t.open_elements <- body :: t.open_elements;
988 t.mode <- Parser_insertion_mode.In_body;
989 process_token t token)
990 | Token.Tag { kind = Token.Start; name; _ }
991 when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] ->
992 parse_error t "unexpected-start-tag";
993 (match t.head_element with
994 | Some head ->
995 t.open_elements <- head :: t.open_elements;
996 process_in_head t token;
997 t.open_elements <- List.filter (fun n -> n != head) t.open_elements
998 | None -> ())
999 | Token.Tag { kind = Token.End; name = "template"; _ } ->
1000 process_in_head t token
1001 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] ->
1002 let body = insert_element t "body" [] in
1003 t.open_elements <- body :: t.open_elements;
1004 t.mode <- Parser_insertion_mode.In_body;
1005 process_token t token
1006 | Token.Tag { kind = Token.Start; name = "head"; _ } ->
1007 parse_error t "unexpected-start-tag"
1008 | Token.Tag { kind = Token.End; name; _ } ->
1009 parse_error t ("unexpected-end-tag:" ^ name)
1010 | _ ->
1011 let body = insert_element t "body" [] in
1012 t.open_elements <- body :: t.open_elements;
1013 t.mode <- Parser_insertion_mode.In_body;
1014 process_token t token
1015
1016and process_in_body t token =
1017 match token with
1018 | Token.Character "\x00" ->
1019 parse_error t "unexpected-null-character"
1020 | Token.Character data ->
1021 reconstruct_active_formatting t;
1022 insert_character t data;
1023 if not (is_whitespace data) then t.frameset_ok <- false
1024 | Token.Comment data ->
1025 insert_comment t data
1026 | Token.Doctype _ ->
1027 parse_error t "unexpected-doctype"
1028 | Token.Tag { kind = Token.Start; name = "html"; attrs; _ } ->
1029 parse_error t "unexpected-start-tag";
1030 if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then
1031 (* Find the html element (at the bottom of the stack) *)
1032 let html_elem = List.find_opt (fun n -> n.Dom.name = "html") t.open_elements in
1033 (match html_elem with
1034 | Some html ->
1035 List.iter (fun (k, v) ->
1036 if not (Dom.has_attr html k) then Dom.set_attr html k v
1037 ) attrs
1038 | None -> ())
1039 | Token.Tag { kind = Token.Start; name; _ }
1040 when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] ->
1041 process_in_head t token
1042 | Token.Tag { kind = Token.End; name = "template"; _ } ->
1043 process_in_head t token
1044 | Token.Tag { kind = Token.Start; name = "body"; attrs; _ } ->
1045 parse_error t "unexpected-start-tag";
1046 (* Find body element on stack - it should be near the end (html is last) *)
1047 let body = List.find_opt (fun n -> n.Dom.name = "body") t.open_elements in
1048 (match body with
1049 | Some body when not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) ->
1050 t.frameset_ok <- false;
1051 List.iter (fun (k, v) ->
1052 if not (Dom.has_attr body k) then Dom.set_attr body k v
1053 ) attrs
1054 | _ -> ())
1055 | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } ->
1056 if not t.frameset_ok then
1057 parse_error t "unexpected-start-tag-ignored"
1058 else begin
1059 (* Find body element on the stack *)
1060 let rec find_body_index idx = function
1061 | [] -> None
1062 | n :: rest ->
1063 if n.Dom.name = "body" then Some (idx, n)
1064 else find_body_index (idx + 1) rest
1065 in
1066 match find_body_index 0 t.open_elements with
1067 | None ->
1068 parse_error t "unexpected-start-tag-ignored"
1069 | Some (idx, body_elem) ->
1070 (* Remove body from its parent (the html element) *)
1071 (match body_elem.Dom.parent with
1072 | Some parent -> Dom.remove_child parent body_elem
1073 | None -> ());
1074 (* Pop all elements up to and including body - keep only elements after body_idx *)
1075 let rec drop n lst = if n <= 0 then lst else match lst with [] -> [] | _ :: rest -> drop (n - 1) rest in
1076 t.open_elements <- drop (idx + 1) t.open_elements;
1077 (* Insert frameset element *)
1078 ignore (insert_element t "frameset" ~push:true attrs);
1079 t.mode <- Parser_insertion_mode.In_frameset
1080 end
1081 | Token.EOF ->
1082 if t.template_modes <> [] then
1083 process_in_template t token
1084 else begin
1085 let has_unclosed = List.exists (fun n ->
1086 not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"])
1087 ) t.open_elements in
1088 if has_unclosed then parse_error t "expected-closing-tag-but-got-eof"
1089 end
1090 | Token.Tag { kind = Token.End; name = "body"; _ } ->
1091 if not (has_element_in_scope t "body") then
1092 parse_error t "unexpected-end-tag"
1093 else begin
1094 let has_unclosed = List.exists (fun n ->
1095 not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"])
1096 ) t.open_elements in
1097 if has_unclosed then parse_error t "end-tag-too-early";
1098 t.mode <- Parser_insertion_mode.After_body
1099 end
1100 | Token.Tag { kind = Token.End; name = "html"; _ } ->
1101 if not (has_element_in_scope t "body") then
1102 parse_error t "unexpected-end-tag"
1103 else begin
1104 t.mode <- Parser_insertion_mode.After_body;
1105 process_token t token
1106 end
1107 | Token.Tag { kind = Token.Start; name; attrs; _ }
1108 when List.mem name ["address"; "article"; "aside"; "blockquote"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "main"; "menu"; "nav"; "ol"; "p"; "search"; "section"; "summary"; "ul"] ->
1109 if has_element_in_button_scope t "p" then close_p_element t;
1110 ignore (insert_element t name ~push:true attrs)
1111 | Token.Tag { kind = Token.Start; name; attrs; _ } when Parser_constants.is_heading_element name ->
1112 if has_element_in_button_scope t "p" then close_p_element t;
1113 (match current_node t with
1114 | Some n when Parser_constants.is_heading_element n.Dom.name ->
1115 parse_error t "unexpected-start-tag";
1116 pop_current t
1117 | _ -> ());
1118 ignore (insert_element t name ~push:true attrs)
1119 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["pre"; "listing"] ->
1120 if has_element_in_button_scope t "p" then close_p_element t;
1121 ignore (insert_element t name ~push:true attrs);
1122 t.ignore_lf <- true;
1123 t.frameset_ok <- false
1124 | Token.Tag { kind = Token.Start; name = "form"; attrs; _ } ->
1125 if t.form_element <> None && not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then
1126 parse_error t "unexpected-start-tag"
1127 else begin
1128 if has_element_in_button_scope t "p" then close_p_element t;
1129 let form = insert_element t "form" attrs in
1130 t.open_elements <- form :: t.open_elements;
1131 if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then
1132 t.form_element <- Some form
1133 end
1134 | Token.Tag { kind = Token.Start; name = "li"; attrs; _ } ->
1135 t.frameset_ok <- false;
1136 let rec check = function
1137 | [] -> ()
1138 | n :: rest ->
1139 if n.Dom.name = "li" then begin
1140 generate_implied_end_tags t ~except:"li" ();
1141 (match current_node t with
1142 | Some cn when cn.Dom.name <> "li" -> parse_error t "unexpected-start-tag"
1143 | _ -> ());
1144 pop_until_tag t "li"
1145 end else if is_special_element n && not (List.mem (String.lowercase_ascii n.Dom.name) ["address"; "div"; "p"]) then
1146 ()
1147 else
1148 check rest
1149 in
1150 check t.open_elements;
1151 if has_element_in_button_scope t "p" then close_p_element t;
1152 ignore (insert_element t "li" ~push:true attrs)
1153 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["dd"; "dt"] ->
1154 t.frameset_ok <- false;
1155 let rec check = function
1156 | [] -> ()
1157 | n :: rest ->
1158 if List.mem n.Dom.name ["dd"; "dt"] then begin
1159 generate_implied_end_tags t ~except:n.Dom.name ();
1160 (match current_node t with
1161 | Some cn when cn.Dom.name <> n.Dom.name -> parse_error t "unexpected-start-tag"
1162 | _ -> ());
1163 pop_until_one_of t ["dd"; "dt"]
1164 end else if is_special_element n && not (List.mem (String.lowercase_ascii n.Dom.name) ["address"; "div"; "p"]) then
1165 ()
1166 else
1167 check rest
1168 in
1169 check t.open_elements;
1170 if has_element_in_button_scope t "p" then close_p_element t;
1171 ignore (insert_element t name ~push:true attrs)
1172 | Token.Tag { kind = Token.Start; name = "plaintext"; _ } ->
1173 if has_element_in_button_scope t "p" then close_p_element t;
1174 ignore (insert_element t "plaintext" ~push:true [])
1175 (* Tokenizer should switch to PLAINTEXT state *)
1176 | Token.Tag { kind = Token.Start; name = "button"; attrs; _ } ->
1177 if has_element_in_scope t "button" then begin
1178 parse_error t "unexpected-start-tag";
1179 generate_implied_end_tags t ();
1180 pop_until_tag t "button"
1181 end;
1182 reconstruct_active_formatting t;
1183 ignore (insert_element t "button" ~push:true attrs);
1184 t.frameset_ok <- false
1185 | Token.Tag { kind = Token.End; name; _ }
1186 when List.mem name ["address"; "article"; "aside"; "blockquote"; "button"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "listing"; "main"; "menu"; "nav"; "ol"; "pre"; "search"; "section"; "summary"; "ul"] ->
1187 if not (has_element_in_scope t name) then
1188 parse_error t ("unexpected-end-tag:" ^ name)
1189 else begin
1190 generate_implied_end_tags t ();
1191 (match current_node t with
1192 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
1193 | _ -> ());
1194 pop_until_tag t name
1195 end
1196 | Token.Tag { kind = Token.End; name = "form"; _ } ->
1197 if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then begin
1198 let node = t.form_element in
1199 t.form_element <- None;
1200 match node with
1201 | None -> parse_error t "unexpected-end-tag"
1202 | Some form_node ->
1203 if not (has_element_in_scope t "form") then
1204 parse_error t "unexpected-end-tag"
1205 else begin
1206 generate_implied_end_tags t ();
1207 (match current_node t with
1208 | Some n when n != form_node -> parse_error t "end-tag-too-early"
1209 | _ -> ());
1210 t.open_elements <- List.filter (fun n -> n != form_node) t.open_elements
1211 end
1212 end else begin
1213 if not (has_element_in_scope t "form") then
1214 parse_error t "unexpected-end-tag"
1215 else begin
1216 generate_implied_end_tags t ();
1217 (match current_node t with
1218 | Some n when n.Dom.name <> "form" -> parse_error t "end-tag-too-early"
1219 | _ -> ());
1220 pop_until_tag t "form"
1221 end
1222 end
1223 | Token.Tag { kind = Token.End; name = "p"; _ } ->
1224 if not (has_element_in_button_scope t "p") then begin
1225 parse_error t "no-p-element-in-scope";
1226 ignore (insert_element t "p" ~push:true [])
1227 end;
1228 close_p_element t
1229 | Token.Tag { kind = Token.End; name = "li"; _ } ->
1230 if not (has_element_in_list_item_scope t "li") then
1231 parse_error t "unexpected-end-tag"
1232 else begin
1233 generate_implied_end_tags t ~except:"li" ();
1234 (match current_node t with
1235 | Some n when n.Dom.name <> "li" -> parse_error t "end-tag-too-early"
1236 | _ -> ());
1237 pop_until_tag t "li"
1238 end
1239 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["dd"; "dt"] ->
1240 if not (has_element_in_scope t name) then
1241 parse_error t "unexpected-end-tag"
1242 else begin
1243 generate_implied_end_tags t ~except:name ();
1244 (match current_node t with
1245 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
1246 | _ -> ());
1247 pop_until_tag t name
1248 end
1249 | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_heading_element name ->
1250 if not (has_element_in_scope_impl t Parser_constants.heading_elements Parser_constants.default_scope ~check_integration_points:true) then
1251 parse_error t "unexpected-end-tag"
1252 else begin
1253 generate_implied_end_tags t ();
1254 (match current_node t with
1255 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
1256 | _ -> ());
1257 pop_until_one_of t Parser_constants.heading_elements
1258 end
1259 | Token.Tag { kind = Token.Start; name = "a"; attrs; _ } ->
1260 (* Check for existing <a> in active formatting *)
1261 let rec find_a = function
1262 | [] -> None
1263 | Marker :: _ -> None
1264 | Entry e :: _ when e.name = "a" -> Some e.node
1265 | _ :: rest -> find_a rest
1266 in
1267 (match find_a t.active_formatting with
1268 | Some existing ->
1269 parse_error t "unexpected-start-tag";
1270 adoption_agency t "a";
1271 t.active_formatting <- List.filter (function
1272 | Entry e -> e.node != existing
1273 | _ -> true
1274 ) t.active_formatting;
1275 t.open_elements <- List.filter (fun n -> n != existing) t.open_elements
1276 | None -> ());
1277 reconstruct_active_formatting t;
1278 let node = insert_element t "a" attrs in
1279 t.open_elements <- node :: t.open_elements;
1280 push_formatting_element t node "a" attrs
1281 | Token.Tag { kind = Token.Start; name; attrs; _ }
1282 when List.mem name ["b"; "big"; "code"; "em"; "font"; "i"; "s"; "small"; "strike"; "strong"; "tt"; "u"] ->
1283 reconstruct_active_formatting t;
1284 let node = insert_element t name attrs in
1285 t.open_elements <- node :: t.open_elements;
1286 push_formatting_element t node name attrs
1287 | Token.Tag { kind = Token.Start; name = "nobr"; attrs; _ } ->
1288 if has_element_in_scope t "nobr" then begin
1289 parse_error t "unexpected-start-tag";
1290 adoption_agency t "nobr";
1291 (* Remove nobr from active formatting *)
1292 t.active_formatting <- List.filter (function
1293 | Entry e -> e.name <> "nobr"
1294 | Marker -> true
1295 ) t.active_formatting;
1296 (* Remove nobr from open elements *)
1297 t.open_elements <- List.filter (fun n -> n.Dom.name <> "nobr") t.open_elements
1298 end;
1299 reconstruct_active_formatting t;
1300 let node = insert_element t "nobr" attrs in
1301 t.open_elements <- node :: t.open_elements;
1302 push_formatting_element t node "nobr" attrs
1303 | Token.Tag { kind = Token.End; name; _ }
1304 when List.mem name ["a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small"; "strike"; "strong"; "tt"; "u"] ->
1305 adoption_agency t name
1306 | Token.Tag { kind = Token.Start; name; attrs; _ }
1307 when List.mem name ["applet"; "marquee"; "object"] ->
1308 reconstruct_active_formatting t;
1309 ignore (insert_element t name ~push:true attrs);
1310 push_formatting_marker t;
1311 t.frameset_ok <- false
1312 | Token.Tag { kind = Token.End; name; _ }
1313 when List.mem name ["applet"; "marquee"; "object"] ->
1314 if not (has_element_in_scope t name) then
1315 parse_error t "unexpected-end-tag"
1316 else begin
1317 generate_implied_end_tags t ();
1318 (match current_node t with
1319 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
1320 | _ -> ());
1321 pop_until_tag t name;
1322 clear_active_formatting_to_marker t
1323 end
1324 | Token.Tag { kind = Token.Start; name = "table"; attrs; _ } ->
1325 if t.quirks_mode <> Dom.Quirks && has_element_in_button_scope t "p" then
1326 close_p_element t;
1327 ignore (insert_element t "table" ~push:true attrs);
1328 t.frameset_ok <- false;
1329 t.mode <- Parser_insertion_mode.In_table
1330 | Token.Tag { kind = Token.End; name = "br"; _ } ->
1331 parse_error t "end-tag-br";
1332 reconstruct_active_formatting t;
1333 ignore (insert_element t "br" ~push:true []);
1334 pop_current t;
1335 t.frameset_ok <- false
1336 | Token.Tag { kind = Token.Start; name; attrs; _ }
1337 when List.mem name ["area"; "br"; "embed"; "img"; "keygen"; "wbr"] ->
1338 reconstruct_active_formatting t;
1339 ignore (insert_element t name ~push:true attrs);
1340 pop_current t;
1341 t.frameset_ok <- false
1342 | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } ->
1343 reconstruct_active_formatting t;
1344 ignore (insert_element t "input" ~push:true attrs);
1345 pop_current t;
1346 let is_hidden = List.exists (fun (k, v) ->
1347 String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden"
1348 ) attrs in
1349 if not is_hidden then t.frameset_ok <- false
1350 | Token.Tag { kind = Token.Start; name; attrs; _ }
1351 when List.mem name ["param"; "source"; "track"] ->
1352 ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing = false });
1353 pop_current t
1354 | Token.Tag { kind = Token.Start; name = "hr"; _ } ->
1355 if has_element_in_button_scope t "p" then close_p_element t;
1356 ignore (insert_element t "hr" ~push:true []);
1357 pop_current t;
1358 t.frameset_ok <- false
1359 | Token.Tag { kind = Token.Start; name = "image"; attrs; _ } ->
1360 parse_error t "unexpected-start-tag";
1361 (* Treat <image> as <img> *)
1362 reconstruct_active_formatting t;
1363 ignore (insert_element t "img" ~push:true attrs);
1364 pop_current t;
1365 t.frameset_ok <- false
1366 | Token.Tag { kind = Token.Start; name = "textarea"; attrs; _ } ->
1367 ignore (insert_element t "textarea" ~push:true attrs);
1368 t.ignore_lf <- true;
1369 t.original_mode <- Some t.mode;
1370 t.frameset_ok <- false;
1371 t.mode <- Parser_insertion_mode.Text
1372 | Token.Tag { kind = Token.Start; name = "xmp"; attrs; _ } ->
1373 if has_element_in_button_scope t "p" then close_p_element t;
1374 reconstruct_active_formatting t;
1375 t.frameset_ok <- false;
1376 ignore (insert_element_for_token t { kind = Token.Start; name = "xmp"; attrs; self_closing = false });
1377 t.original_mode <- Some t.mode;
1378 t.mode <- Parser_insertion_mode.Text
1379 | Token.Tag { kind = Token.Start; name = "iframe"; attrs; _ } ->
1380 t.frameset_ok <- false;
1381 ignore (insert_element_for_token t { kind = Token.Start; name = "iframe"; attrs; self_closing = false });
1382 t.original_mode <- Some t.mode;
1383 t.mode <- Parser_insertion_mode.Text
1384 | Token.Tag { kind = Token.Start; name = "noembed"; attrs; _ } ->
1385 ignore (insert_element_for_token t { kind = Token.Start; name = "noembed"; attrs; self_closing = false });
1386 t.original_mode <- Some t.mode;
1387 t.mode <- Parser_insertion_mode.Text
1388 | Token.Tag { kind = Token.Start; name = "select"; attrs; _ } ->
1389 reconstruct_active_formatting t;
1390 ignore (insert_element t "select" ~push:true attrs);
1391 t.frameset_ok <- false;
1392 if List.mem t.mode [Parser_insertion_mode.In_table; Parser_insertion_mode.In_caption; Parser_insertion_mode.In_table_body; Parser_insertion_mode.In_row; Parser_insertion_mode.In_cell] then
1393 t.mode <- Parser_insertion_mode.In_select_in_table
1394 else
1395 t.mode <- Parser_insertion_mode.In_select
1396 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["optgroup"; "option"] ->
1397 (match current_node t with
1398 | Some n when n.Dom.name = "option" -> pop_current t
1399 | _ -> ());
1400 reconstruct_active_formatting t;
1401 ignore (insert_element t name ~push:true attrs)
1402 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["rb"; "rtc"] ->
1403 if has_element_in_scope t "ruby" then begin
1404 generate_implied_end_tags t ()
1405 end;
1406 (match current_node t with
1407 | Some n when n.Dom.name <> "ruby" && n.Dom.name <> "rtc" -> parse_error t "unexpected-start-tag"
1408 | _ -> ());
1409 ignore (insert_element t name ~push:true attrs)
1410 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["rp"; "rt"] ->
1411 if has_element_in_scope t "ruby" then begin
1412 generate_implied_end_tags t ~except:"rtc" ()
1413 end;
1414 (match current_node t with
1415 | Some n when n.Dom.name <> "ruby" && n.Dom.name <> "rtc" -> parse_error t "unexpected-start-tag"
1416 | _ -> ());
1417 ignore (insert_element t name ~push:true attrs)
1418 | Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } ->
1419 reconstruct_active_formatting t;
1420 let adjusted_attrs = Parser_constants.adjust_mathml_attrs (Parser_constants.adjust_foreign_attrs attrs) in
1421 ignore (insert_foreign_element t { kind = Token.Start; name = "math"; attrs = adjusted_attrs; self_closing } (Some "mathml"));
1422 if self_closing then pop_current t
1423 | Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } ->
1424 reconstruct_active_formatting t;
1425 let adjusted_attrs = Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs attrs) in
1426 ignore (insert_foreign_element t { kind = Token.Start; name = "svg"; attrs = adjusted_attrs; self_closing } (Some "svg"));
1427 if self_closing then pop_current t
1428 | Token.Tag { kind = Token.Start; name; attrs; _ }
1429 when List.mem name ["col"; "frame"] ->
1430 (* In fragment context, insert these; otherwise ignore *)
1431 if t.fragment_context = None then
1432 parse_error t "unexpected-start-tag-ignored"
1433 else
1434 ignore (insert_element t name attrs)
1435 | Token.Tag { kind = Token.Start; name; _ }
1436 when List.mem name ["caption"; "colgroup"; "head"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
1437 parse_error t "unexpected-start-tag"
1438 | Token.Tag { kind = Token.Start; name; attrs; self_closing } ->
1439 (* Any other start tag *)
1440 reconstruct_active_formatting t;
1441 ignore (insert_element t name ~push:true attrs);
1442 (* Check for self-closing on non-void HTML element *)
1443 if self_closing && not (Parser_constants.is_void_element name) then
1444 parse_error t "non-void-html-element-start-tag-with-trailing-solidus"
1445 | Token.Tag { kind = Token.End; name; _ } ->
1446 (* Any other end tag *)
1447 let rec check = function
1448 | [] -> ()
1449 | node :: rest ->
1450 if node.Dom.name = name then begin
1451 generate_implied_end_tags t ~except:name ();
1452 (match current_node t with
1453 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
1454 | _ -> ());
1455 pop_until t (fun n -> n == node)
1456 end else if is_special_element node then
1457 parse_error t ("unexpected-end-tag:" ^ name)
1458 else
1459 check rest
1460 in
1461 check t.open_elements
1462
1463and process_text t token =
1464 match token with
1465 | Token.Character data ->
1466 insert_character t data
1467 | Token.EOF ->
1468 parse_error t "expected-closing-tag-but-got-eof";
1469 pop_current t;
1470 t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_body;
1471 process_token t token
1472 | Token.Tag { kind = Token.End; _ } ->
1473 pop_current t;
1474 t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_body
1475 | _ -> ()
1476
1477and process_in_table t token =
1478 match token with
1479 | Token.Character _ when (match current_node t with Some n -> Parser_constants.is_foster_parenting_element n.Dom.name | None -> false) ->
1480 t.pending_table_chars <- [];
1481 t.original_mode <- Some t.mode;
1482 t.mode <- Parser_insertion_mode.In_table_text;
1483 process_token t token
1484 | Token.Comment data ->
1485 insert_comment t data
1486 | Token.Doctype _ ->
1487 parse_error t "unexpected-doctype"
1488 | Token.Tag { kind = Token.Start; name = "caption"; attrs; _ } ->
1489 clear_stack_back_to_table_context t;
1490 push_formatting_marker t;
1491 ignore (insert_element t "caption" ~push:true attrs);
1492 t.mode <- Parser_insertion_mode.In_caption
1493 | Token.Tag { kind = Token.Start; name = "colgroup"; attrs; _ } ->
1494 clear_stack_back_to_table_context t;
1495 ignore (insert_element t "colgroup" ~push:true attrs);
1496 t.mode <- Parser_insertion_mode.In_column_group
1497 | Token.Tag { kind = Token.Start; name = "col"; _ } ->
1498 clear_stack_back_to_table_context t;
1499 ignore (insert_element t "colgroup" ~push:true []);
1500 t.mode <- Parser_insertion_mode.In_column_group;
1501 process_token t token
1502 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
1503 clear_stack_back_to_table_context t;
1504 ignore (insert_element t name ~push:true attrs);
1505 t.mode <- Parser_insertion_mode.In_table_body
1506 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"; "tr"] ->
1507 clear_stack_back_to_table_context t;
1508 ignore (insert_element t "tbody" ~push:true []);
1509 t.mode <- Parser_insertion_mode.In_table_body;
1510 process_token t token
1511 | Token.Tag { kind = Token.Start; name = "table"; _ } ->
1512 parse_error t "unexpected-start-tag";
1513 if has_element_in_table_scope t "table" then begin
1514 pop_until_tag t "table";
1515 reset_insertion_mode t;
1516 process_token t token
1517 end
1518 | Token.Tag { kind = Token.End; name = "table"; _ } ->
1519 if not (has_element_in_table_scope t "table") then
1520 parse_error t "unexpected-end-tag"
1521 else begin
1522 pop_until_tag t "table";
1523 reset_insertion_mode t
1524 end
1525 | Token.Tag { kind = Token.End; name; _ }
1526 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
1527 parse_error t "unexpected-end-tag"
1528 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["style"; "script"; "template"] ->
1529 process_in_head t token
1530 | Token.Tag { kind = Token.End; name = "template"; _ } ->
1531 process_in_head t token
1532 | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } ->
1533 let is_hidden = List.exists (fun (k, v) ->
1534 String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden"
1535 ) attrs in
1536 if not is_hidden then begin
1537 parse_error t "start-tag-in-table:input";
1538 t.foster_parenting <- true;
1539 process_in_body t token;
1540 t.foster_parenting <- false
1541 end else begin
1542 parse_error t "start-tag-in-table:input";
1543 ignore (insert_element t "input" ~push:true attrs);
1544 pop_current t
1545 end
1546 | Token.Tag { kind = Token.Start; name = "form"; attrs; _ } ->
1547 parse_error t "unexpected-start-tag";
1548 if t.form_element = None && not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then begin
1549 let form = insert_element t "form" attrs in
1550 t.open_elements <- form :: t.open_elements;
1551 t.form_element <- Some form;
1552 pop_current t
1553 end
1554 | Token.EOF ->
1555 process_in_body t token
1556 | _ ->
1557 parse_error t "unexpected-token-in-table";
1558 t.foster_parenting <- true;
1559 process_in_body t token;
1560 t.foster_parenting <- false
1561
1562and clear_stack_back_to_table_context t =
1563 let rec loop () =
1564 match current_node t with
1565 | Some n when not (List.mem n.Dom.name ["table"; "template"; "html"]) ->
1566 pop_current t;
1567 loop ()
1568 | _ -> ()
1569 in
1570 loop ()
1571
1572and process_in_table_text t token =
1573 match token with
1574 | Token.Character data ->
1575 if String.contains data '\x00' then
1576 parse_error t "unexpected-null-character"
1577 else
1578 t.pending_table_chars <- data :: t.pending_table_chars
1579 | _ ->
1580 let pending = String.concat "" (List.rev t.pending_table_chars) in
1581 t.pending_table_chars <- [];
1582 if not (is_whitespace pending) then begin
1583 parse_error t "unexpected-character-in-table";
1584 t.foster_parenting <- true;
1585 reconstruct_active_formatting t;
1586 insert_character t pending;
1587 t.foster_parenting <- false
1588 end else
1589 insert_character t pending;
1590 t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_table;
1591 process_token t token
1592
1593and process_in_caption t token =
1594 match token with
1595 | Token.Tag { kind = Token.End; name = "caption"; _ } ->
1596 if not (has_element_in_table_scope t "caption") then
1597 parse_error t "unexpected-end-tag"
1598 else begin
1599 generate_implied_end_tags t ();
1600 (match current_node t with
1601 | Some n when n.Dom.name <> "caption" -> parse_error t "end-tag-too-early"
1602 | _ -> ());
1603 pop_until_tag t "caption";
1604 clear_active_formatting_to_marker t;
1605 t.mode <- Parser_insertion_mode.In_table
1606 end
1607 | Token.Tag { kind = Token.Start; name; _ }
1608 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
1609 if not (has_element_in_table_scope t "caption") then
1610 parse_error t "unexpected-start-tag"
1611 else begin
1612 generate_implied_end_tags t ();
1613 pop_until_tag t "caption";
1614 clear_active_formatting_to_marker t;
1615 t.mode <- Parser_insertion_mode.In_table;
1616 process_token t token
1617 end
1618 | Token.Tag { kind = Token.End; name = "table"; _ } ->
1619 if not (has_element_in_table_scope t "caption") then
1620 parse_error t "unexpected-end-tag"
1621 else begin
1622 generate_implied_end_tags t ();
1623 pop_until_tag t "caption";
1624 clear_active_formatting_to_marker t;
1625 t.mode <- Parser_insertion_mode.In_table;
1626 process_token t token
1627 end
1628 | Token.Tag { kind = Token.End; name; _ }
1629 when List.mem name ["body"; "col"; "colgroup"; "html"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
1630 parse_error t "unexpected-end-tag"
1631 | _ ->
1632 process_in_body t token
1633
1634and process_in_column_group t token =
1635 match token with
1636 | Token.Character data when is_whitespace data ->
1637 insert_character t data
1638 | Token.Character data ->
1639 (* Split leading whitespace from non-whitespace *)
1640 let ws_chars = [' '; '\t'; '\n'; '\x0C'; '\r'] in
1641 let len = String.length data in
1642 let ws_end = ref 0 in
1643 while !ws_end < len && List.mem data.[!ws_end] ws_chars do incr ws_end done;
1644 if !ws_end > 0 then
1645 insert_character t (String.sub data 0 !ws_end);
1646 if !ws_end < len then begin
1647 let remaining = String.sub data !ws_end (len - !ws_end) in
1648 (match current_node t with
1649 | Some n when n.Dom.name = "colgroup" ->
1650 pop_current t;
1651 t.mode <- Parser_insertion_mode.In_table;
1652 process_token t (Token.Character remaining)
1653 | _ ->
1654 parse_error t "unexpected-token")
1655 end
1656 | Token.Comment data ->
1657 insert_comment t data
1658 | Token.Doctype _ ->
1659 parse_error t "unexpected-doctype"
1660 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
1661 process_in_body t token
1662 | Token.Tag { kind = Token.Start; name = "col"; attrs; _ } ->
1663 ignore (insert_element t "col" ~push:true attrs);
1664 pop_current t
1665 | Token.Tag { kind = Token.End; name = "colgroup"; _ } ->
1666 (match current_node t with
1667 | Some n when n.Dom.name <> "colgroup" -> parse_error t "unexpected-end-tag"
1668 | Some _ -> pop_current t; t.mode <- Parser_insertion_mode.In_table
1669 | None -> parse_error t "unexpected-end-tag")
1670 | Token.Tag { kind = Token.End; name = "col"; _ } ->
1671 parse_error t "unexpected-end-tag"
1672 | Token.Tag { kind = Token.Start; name = "template"; _ }
1673 | Token.Tag { kind = Token.End; name = "template"; _ } ->
1674 process_in_head t token
1675 | Token.EOF ->
1676 process_in_body t token
1677 | _ ->
1678 (match current_node t with
1679 | Some n when n.Dom.name = "colgroup" ->
1680 pop_current t;
1681 t.mode <- Parser_insertion_mode.In_table;
1682 process_token t token
1683 | _ ->
1684 parse_error t "unexpected-token")
1685
1686and process_in_table_body t token =
1687 match token with
1688 | Token.Tag { kind = Token.Start; name = "tr"; attrs; _ } ->
1689 clear_stack_back_to_table_body_context t;
1690 ignore (insert_element t "tr" ~push:true attrs);
1691 t.mode <- Parser_insertion_mode.In_row
1692 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["th"; "td"] ->
1693 parse_error t "unexpected-start-tag";
1694 clear_stack_back_to_table_body_context t;
1695 ignore (insert_element t "tr" ~push:true []);
1696 t.mode <- Parser_insertion_mode.In_row;
1697 process_token t token
1698 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
1699 if not (has_element_in_table_scope t name) then
1700 parse_error t "unexpected-end-tag"
1701 else begin
1702 clear_stack_back_to_table_body_context t;
1703 pop_current t;
1704 t.mode <- Parser_insertion_mode.In_table
1705 end
1706 | Token.Tag { kind = Token.Start; name; _ }
1707 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"] ->
1708 if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Parser_constants.table_scope ~check_integration_points:false) then
1709 parse_error t "unexpected-start-tag"
1710 else begin
1711 clear_stack_back_to_table_body_context t;
1712 pop_current t;
1713 t.mode <- Parser_insertion_mode.In_table;
1714 process_token t token
1715 end
1716 | Token.Tag { kind = Token.End; name = "table"; _ } ->
1717 if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Parser_constants.table_scope ~check_integration_points:false) then
1718 parse_error t "unexpected-end-tag"
1719 else begin
1720 clear_stack_back_to_table_body_context t;
1721 pop_current t;
1722 t.mode <- Parser_insertion_mode.In_table;
1723 process_token t token
1724 end
1725 | Token.Tag { kind = Token.End; name; _ }
1726 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "td"; "th"; "tr"] ->
1727 parse_error t "unexpected-end-tag"
1728 | _ ->
1729 process_in_table t token
1730
1731and clear_stack_back_to_table_body_context t =
1732 let rec loop () =
1733 match current_node t with
1734 | Some n when not (List.mem n.Dom.name ["tbody"; "tfoot"; "thead"; "template"; "html"]) ->
1735 pop_current t;
1736 loop ()
1737 | _ -> ()
1738 in
1739 loop ()
1740
1741and process_in_row t token =
1742 match token with
1743 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["th"; "td"] ->
1744 clear_stack_back_to_table_row_context t;
1745 ignore (insert_element t name ~push:true attrs);
1746 t.mode <- Parser_insertion_mode.In_cell;
1747 push_formatting_marker t
1748 | Token.Tag { kind = Token.End; name = "tr"; _ } ->
1749 if not (has_element_in_table_scope t "tr") then
1750 parse_error t "unexpected-end-tag"
1751 else begin
1752 clear_stack_back_to_table_row_context t;
1753 pop_current t;
1754 t.mode <- Parser_insertion_mode.In_table_body
1755 end
1756 | Token.Tag { kind = Token.Start; name; _ }
1757 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"; "tr"] ->
1758 if not (has_element_in_table_scope t "tr") then
1759 parse_error t "unexpected-start-tag"
1760 else begin
1761 clear_stack_back_to_table_row_context t;
1762 pop_current t;
1763 t.mode <- Parser_insertion_mode.In_table_body;
1764 process_token t token
1765 end
1766 | Token.Tag { kind = Token.End; name = "table"; _ } ->
1767 if not (has_element_in_table_scope t "tr") then
1768 parse_error t "unexpected-end-tag"
1769 else begin
1770 clear_stack_back_to_table_row_context t;
1771 pop_current t;
1772 t.mode <- Parser_insertion_mode.In_table_body;
1773 process_token t token
1774 end
1775 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
1776 if not (has_element_in_table_scope t name) then
1777 parse_error t "unexpected-end-tag"
1778 else if not (has_element_in_table_scope t "tr") then
1779 parse_error t "unexpected-end-tag"
1780 else begin
1781 clear_stack_back_to_table_row_context t;
1782 pop_current t;
1783 t.mode <- Parser_insertion_mode.In_table_body;
1784 process_token t token
1785 end
1786 | Token.Tag { kind = Token.End; name; _ }
1787 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "td"; "th"] ->
1788 parse_error t "unexpected-end-tag"
1789 | _ ->
1790 process_in_table t token
1791
1792and clear_stack_back_to_table_row_context t =
1793 let rec loop () =
1794 match current_node t with
1795 | Some n when not (List.mem n.Dom.name ["tr"; "template"; "html"]) ->
1796 pop_current t;
1797 loop ()
1798 | _ -> ()
1799 in
1800 loop ()
1801
1802and process_in_cell t token =
1803 match token with
1804 | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_table_cell_element name ->
1805 if not (has_element_in_table_scope t name) then
1806 parse_error t "unexpected-end-tag"
1807 else begin
1808 generate_implied_end_tags t ();
1809 (match current_node t with
1810 | Some n when not (n.Dom.name = name && is_in_html_namespace n) -> parse_error t "end-tag-too-early"
1811 | _ -> ());
1812 pop_until_html_tag t name;
1813 clear_active_formatting_to_marker t;
1814 t.mode <- Parser_insertion_mode.In_row
1815 end
1816 | Token.Tag { kind = Token.Start; name; _ }
1817 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
1818 if not (has_element_in_scope_impl t ["td"; "th"] Parser_constants.table_scope ~check_integration_points:false) then
1819 parse_error t "unexpected-start-tag"
1820 else begin
1821 close_cell t;
1822 process_token t token
1823 end
1824 | Token.Tag { kind = Token.End; name; _ }
1825 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"] ->
1826 parse_error t "unexpected-end-tag"
1827 | Token.Tag { kind = Token.End; name; _ }
1828 when Parser_constants.is_foster_parenting_element name ->
1829 if not (has_element_in_table_scope t name) then
1830 parse_error t "unexpected-end-tag"
1831 else begin
1832 close_cell t;
1833 process_token t token
1834 end
1835 | _ ->
1836 process_in_body t token
1837
1838and close_cell t =
1839 generate_implied_end_tags t ();
1840 (match current_node t with
1841 | Some n when not (Parser_constants.is_table_cell_element n.Dom.name && is_in_html_namespace n) -> parse_error t "end-tag-too-early"
1842 | _ -> ());
1843 pop_until_html_one_of t ["td"; "th"];
1844 clear_active_formatting_to_marker t;
1845 t.mode <- Parser_insertion_mode.In_row
1846
1847and process_in_select t token =
1848 match token with
1849 | Token.Character "\x00" ->
1850 parse_error t "unexpected-null-character"
1851 | Token.Character data ->
1852 reconstruct_active_formatting t;
1853 insert_character t data
1854 | Token.Comment data ->
1855 insert_comment t data
1856 | Token.Doctype _ ->
1857 parse_error t "unexpected-doctype"
1858 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
1859 process_in_body t token
1860 | Token.Tag { kind = Token.Start; name = "option"; attrs; _ } ->
1861 (match current_node t with
1862 | Some n when n.Dom.name = "option" -> pop_current t
1863 | _ -> ());
1864 reconstruct_active_formatting t;
1865 ignore (insert_element t "option" ~push:true attrs)
1866 | Token.Tag { kind = Token.Start; name = "optgroup"; attrs; _ } ->
1867 (match current_node t with
1868 | Some n when n.Dom.name = "option" -> pop_current t
1869 | _ -> ());
1870 (match current_node t with
1871 | Some n when n.Dom.name = "optgroup" -> pop_current t
1872 | _ -> ());
1873 reconstruct_active_formatting t;
1874 ignore (insert_element t "optgroup" ~push:true attrs)
1875 | Token.Tag { kind = Token.Start; name = "hr"; _ } ->
1876 (match current_node t with
1877 | Some n when n.Dom.name = "option" -> pop_current t
1878 | _ -> ());
1879 (match current_node t with
1880 | Some n when n.Dom.name = "optgroup" -> pop_current t
1881 | _ -> ());
1882 ignore (insert_element t "hr" ~push:true []);
1883 pop_current t
1884 | Token.Tag { kind = Token.End; name = "optgroup"; _ } ->
1885 (match t.open_elements with
1886 | opt :: optg :: _ when opt.Dom.name = "option" && optg.Dom.name = "optgroup" ->
1887 pop_current t
1888 | _ -> ());
1889 (match current_node t with
1890 | Some n when n.Dom.name = "optgroup" -> pop_current t
1891 | _ -> parse_error t "unexpected-end-tag")
1892 | Token.Tag { kind = Token.End; name = "option"; _ } ->
1893 (match current_node t with
1894 | Some n when n.Dom.name = "option" -> pop_current t
1895 | _ -> parse_error t "unexpected-end-tag")
1896 | Token.Tag { kind = Token.End; name = "select"; _ } ->
1897 if not (has_element_in_select_scope t "select") then
1898 parse_error t "unexpected-end-tag"
1899 else begin
1900 pop_until_tag t "select";
1901 reset_insertion_mode t
1902 end
1903 | Token.Tag { kind = Token.Start; name = "select"; _ } ->
1904 parse_error t "unexpected-start-tag";
1905 (* Per spec: in IN_SELECT mode, select is always on the stack - just pop *)
1906 pop_until_tag t "select";
1907 reset_insertion_mode t
1908 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["input"; "textarea"] ->
1909 parse_error t "unexpected-start-tag";
1910 (* Per spec: in IN_SELECT mode, select is always on the stack - just pop *)
1911 pop_until_tag t "select";
1912 reset_insertion_mode t;
1913 process_token t token
1914 | Token.Tag { kind = Token.Start; name = "plaintext"; attrs; _ } ->
1915 (* plaintext is allowed in select - creates element, parser will switch tokenizer to PLAINTEXT mode *)
1916 reconstruct_active_formatting t;
1917 ignore (insert_element t "plaintext" ~push:true attrs)
1918 | Token.Tag { kind = Token.Start; name = "menuitem"; attrs; _ } ->
1919 (* menuitem is allowed in select *)
1920 reconstruct_active_formatting t;
1921 ignore (insert_element t "menuitem" ~push:true attrs)
1922 | Token.Tag { kind = Token.Start; name = "keygen"; attrs; _ } ->
1923 (* keygen is handled specially in select - inserted directly *)
1924 reconstruct_active_formatting t;
1925 ignore (insert_element t "keygen" attrs)
1926 (* Void element, don't push to stack *)
1927 | Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } ->
1928 reconstruct_active_formatting t;
1929 let node = insert_foreign_element t { kind = Token.Start; name = "svg"; attrs; self_closing } (Some "svg") in
1930 if not self_closing then t.open_elements <- node :: t.open_elements
1931 | Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } ->
1932 reconstruct_active_formatting t;
1933 let node = insert_foreign_element t { kind = Token.Start; name = "math"; attrs; self_closing } (Some "mathml") in
1934 if not self_closing then t.open_elements <- node :: t.open_elements
1935 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["script"; "template"] ->
1936 process_in_head t token
1937 | Token.Tag { kind = Token.End; name = "template"; _ } ->
1938 process_in_head t token
1939 (* Allow certain HTML elements in select - newer spec behavior *)
1940 | Token.Tag { kind = Token.Start; name; attrs; self_closing } when List.mem name ["p"; "div"; "span"; "button"; "datalist"; "selectedcontent"] ->
1941 reconstruct_active_formatting t;
1942 let node = insert_element t name attrs in
1943 if not self_closing then t.open_elements <- node :: t.open_elements
1944 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["br"; "img"] ->
1945 reconstruct_active_formatting t;
1946 ignore (insert_element t name attrs)
1947 (* Don't push to stack - void elements *)
1948 (* Handle formatting elements in select *)
1949 | Token.Tag { kind = Token.Start; name; attrs; _ } when Parser_constants.is_formatting_element name ->
1950 reconstruct_active_formatting t;
1951 let node = insert_element t name ~push:true attrs in
1952 push_formatting_element t node name attrs
1953 | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_formatting_element name ->
1954 (* Find select element and check if formatting element is inside select *)
1955 let select_idx = ref None in
1956 let fmt_idx = ref None in
1957 List.iteri (fun i n ->
1958 if n.Dom.name = "select" && !select_idx = None then select_idx := Some i;
1959 if n.Dom.name = name then fmt_idx := Some i
1960 ) t.open_elements;
1961 (match !fmt_idx, !select_idx with
1962 | Some fi, Some si when fi < si ->
1963 (* Formatting element is inside select, run adoption agency *)
1964 adoption_agency t name
1965 | Some _, Some _ ->
1966 (* Formatting element is outside select boundary - parse error, ignore *)
1967 parse_error t "unexpected-end-tag"
1968 | Some _, None ->
1969 adoption_agency t name
1970 | None, _ ->
1971 parse_error t "unexpected-end-tag")
1972 (* End tags for HTML elements allowed in select *)
1973 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["p"; "div"; "span"; "button"; "datalist"; "selectedcontent"] ->
1974 (* Find select and target indices *)
1975 let select_idx = ref None in
1976 let target_idx = ref None in
1977 List.iteri (fun i n ->
1978 if n.Dom.name = "select" && !select_idx = None then select_idx := Some i;
1979 if n.Dom.name = name then target_idx := Some i
1980 ) t.open_elements;
1981 (* Only pop if target exists and is inside select (lower index = closer to current) *)
1982 (match !target_idx, !select_idx with
1983 | Some ti, Some si when ti < si ->
1984 (* Pop until we reach the target *)
1985 let rec pop_to_target () =
1986 match t.open_elements with
1987 | [] -> ()
1988 | n :: rest ->
1989 t.open_elements <- rest;
1990 if n.Dom.name <> name then pop_to_target ()
1991 in
1992 pop_to_target ()
1993 | Some _, Some _ ->
1994 parse_error t "unexpected-end-tag"
1995 | Some _, None ->
1996 (* No select on stack, just pop to target *)
1997 let rec pop_to_target () =
1998 match t.open_elements with
1999 | [] -> ()
2000 | n :: rest ->
2001 t.open_elements <- rest;
2002 if n.Dom.name <> name then pop_to_target ()
2003 in
2004 pop_to_target ()
2005 | None, _ ->
2006 parse_error t "unexpected-end-tag")
2007 | Token.EOF ->
2008 process_in_body t token
2009 | _ ->
2010 parse_error t "unexpected-token-in-select"
2011
2012and process_in_select_in_table t token =
2013 match token with
2014 | Token.Tag { kind = Token.Start; name; _ }
2015 when List.mem name ["caption"; "table"; "tbody"; "tfoot"; "thead"; "tr"; "td"; "th"] ->
2016 parse_error t "unexpected-start-tag";
2017 pop_until_tag t "select";
2018 reset_insertion_mode t;
2019 process_token t token
2020 | Token.Tag { kind = Token.End; name; _ }
2021 when List.mem name ["caption"; "table"; "tbody"; "tfoot"; "thead"; "tr"; "td"; "th"] ->
2022 parse_error t "unexpected-end-tag";
2023 if has_element_in_table_scope t name then begin
2024 pop_until_tag t "select";
2025 reset_insertion_mode t;
2026 process_token t token
2027 end
2028 | _ ->
2029 process_in_select t token
2030
2031and process_in_template t token =
2032 match token with
2033 | Token.Character _ | Token.Comment _ | Token.Doctype _ ->
2034 process_in_body t token
2035 | Token.Tag { kind = Token.Start; name; _ }
2036 when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] ->
2037 process_in_head t token
2038 | Token.Tag { kind = Token.End; name = "template"; _ } ->
2039 process_in_head t token
2040 | Token.Tag { kind = Token.Start; name; _ }
2041 when List.mem name ["caption"; "colgroup"; "tbody"; "tfoot"; "thead"] ->
2042 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2043 t.template_modes <- Parser_insertion_mode.In_table :: t.template_modes;
2044 t.mode <- Parser_insertion_mode.In_table;
2045 process_token t token
2046 | Token.Tag { kind = Token.Start; name = "col"; _ } ->
2047 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2048 t.template_modes <- Parser_insertion_mode.In_column_group :: t.template_modes;
2049 t.mode <- Parser_insertion_mode.In_column_group;
2050 process_token t token
2051 | Token.Tag { kind = Token.Start; name = "tr"; _ } ->
2052 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2053 t.template_modes <- Parser_insertion_mode.In_table_body :: t.template_modes;
2054 t.mode <- Parser_insertion_mode.In_table_body;
2055 process_token t token
2056 | Token.Tag { kind = Token.Start; name; _ } when Parser_constants.is_table_cell_element name ->
2057 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2058 t.template_modes <- Parser_insertion_mode.In_row :: t.template_modes;
2059 t.mode <- Parser_insertion_mode.In_row;
2060 process_token t token
2061 | Token.Tag { kind = Token.Start; _ } ->
2062 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2063 t.template_modes <- Parser_insertion_mode.In_body :: t.template_modes;
2064 t.mode <- Parser_insertion_mode.In_body;
2065 process_token t token
2066 | Token.Tag { kind = Token.End; name; _ } ->
2067 parse_error t ("unexpected-end-tag:" ^ name)
2068 | Token.EOF ->
2069 if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then
2070 () (* Stop parsing *)
2071 else begin
2072 parse_error t "expected-closing-tag-but-got-eof";
2073 pop_until_html_tag t "template";
2074 clear_active_formatting_to_marker t;
2075 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2076 reset_insertion_mode t;
2077 process_token t token
2078 end
2079
2080and process_after_body t token =
2081 match token with
2082 | Token.Character data when is_whitespace data ->
2083 process_in_body t token
2084 | Token.Comment data ->
2085 (* Insert as last child of html element - html is at bottom of stack *)
2086 let html_opt = List.find_opt (fun n -> n.Dom.name = "html") t.open_elements in
2087 (match html_opt with
2088 | Some html ->
2089 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in
2090 Dom.append_child html (Dom.create_comment ~location data)
2091 | None -> ())
2092 | Token.Doctype _ ->
2093 parse_error t "unexpected-doctype"
2094 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
2095 process_in_body t token
2096 | Token.Tag { kind = Token.End; name = "html"; _ } ->
2097 if t.fragment_context <> None then
2098 parse_error t "unexpected-end-tag"
2099 else
2100 t.mode <- Parser_insertion_mode.After_after_body
2101 | Token.EOF ->
2102 () (* Stop parsing *)
2103 | _ ->
2104 parse_error t "unexpected-token-after-body";
2105 t.mode <- Parser_insertion_mode.In_body;
2106 process_token t token
2107
2108and process_in_frameset t token =
2109 match token with
2110 | Token.Character data ->
2111 (* Extract only whitespace characters and insert them *)
2112 let whitespace = String.to_seq data
2113 |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' '])
2114 |> String.of_seq in
2115 if whitespace <> "" then insert_character t whitespace;
2116 if not (is_whitespace data) then
2117 parse_error t "unexpected-char-in-frameset"
2118 | Token.Comment data ->
2119 insert_comment t data
2120 | Token.Doctype _ ->
2121 parse_error t "unexpected-doctype"
2122 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
2123 process_in_body t token
2124 | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } ->
2125 ignore (insert_element t "frameset" ~push:true attrs)
2126 | Token.Tag { kind = Token.End; name = "frameset"; _ } ->
2127 (match current_node t with
2128 | Some n when n.Dom.name = "html" -> parse_error t "unexpected-end-tag"
2129 | _ ->
2130 pop_current t;
2131 if t.fragment_context = None then
2132 (match current_node t with
2133 | Some n when n.Dom.name <> "frameset" -> t.mode <- Parser_insertion_mode.After_frameset
2134 | _ -> ()))
2135 | Token.Tag { kind = Token.Start; name = "frame"; attrs; _ } ->
2136 ignore (insert_element t "frame" ~push:true attrs);
2137 pop_current t
2138 | Token.Tag { kind = Token.Start; name = "noframes"; _ } ->
2139 process_in_head t token
2140 | Token.EOF ->
2141 (match current_node t with
2142 | Some n when n.Dom.name <> "html" -> parse_error t "expected-closing-tag-but-got-eof"
2143 | _ -> ())
2144 | _ ->
2145 parse_error t "unexpected-token-in-frameset"
2146
2147and process_after_frameset t token =
2148 match token with
2149 | Token.Character data ->
2150 (* Extract only whitespace characters and insert them *)
2151 let whitespace = String.to_seq data
2152 |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' '])
2153 |> String.of_seq in
2154 if whitespace <> "" then insert_character t whitespace;
2155 if not (is_whitespace data) then
2156 parse_error t "unexpected-char-after-frameset"
2157 | Token.Comment data ->
2158 insert_comment t data
2159 | Token.Doctype _ ->
2160 parse_error t "unexpected-doctype"
2161 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
2162 process_in_body t token
2163 | Token.Tag { kind = Token.End; name = "html"; _ } ->
2164 t.mode <- Parser_insertion_mode.After_after_frameset
2165 | Token.Tag { kind = Token.Start; name = "noframes"; _ } ->
2166 process_in_head t token
2167 | Token.EOF ->
2168 () (* Stop parsing *)
2169 | _ ->
2170 parse_error t "unexpected-token-after-frameset"
2171
2172and process_after_after_body t token =
2173 match token with
2174 | Token.Comment data ->
2175 insert_comment_to_document t data
2176 | Token.Doctype _ ->
2177 process_in_body t token
2178 | Token.Character data when is_whitespace data ->
2179 process_in_body t token
2180 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
2181 process_in_body t token
2182 | Token.EOF ->
2183 () (* Stop parsing *)
2184 | _ ->
2185 parse_error t "unexpected-token-after-after-body";
2186 t.mode <- Parser_insertion_mode.In_body;
2187 process_token t token
2188
2189and process_after_after_frameset t token =
2190 match token with
2191 | Token.Comment data ->
2192 insert_comment_to_document t data
2193 | Token.Doctype _ ->
2194 process_in_body t token
2195 | Token.Character data ->
2196 (* Extract only whitespace characters and process using in_body rules *)
2197 let whitespace = String.to_seq data
2198 |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' '])
2199 |> String.of_seq in
2200 if whitespace <> "" then process_in_body t (Token.Character whitespace);
2201 if not (is_whitespace data) then
2202 parse_error t "unexpected-char-after-after-frameset"
2203 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
2204 process_in_body t token
2205 | Token.EOF ->
2206 () (* Stop parsing *)
2207 | Token.Tag { kind = Token.Start; name = "noframes"; _ } ->
2208 process_in_head t token
2209 | _ ->
2210 parse_error t "unexpected-token-after-after-frameset"
2211
2212and process_token t token =
2213 (* Check for HTML integration points (SVG foreignObject, desc, title) *)
2214 let is_html_integration_point node =
2215 (* SVG foreignObject, desc, and title are always HTML integration points *)
2216 if node.Dom.namespace = Some "svg" &&
2217 Parser_constants.is_svg_html_integration node.Dom.name then true
2218 (* annotation-xml is an HTML integration point only with specific encoding values *)
2219 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
2220 match List.assoc_opt "encoding" node.Dom.attrs with
2221 | Some enc ->
2222 let enc_lower = String.lowercase_ascii enc in
2223 enc_lower = "text/html" || enc_lower = "application/xhtml+xml"
2224 | None -> false
2225 else false
2226 in
2227 (* Check for MathML text integration points *)
2228 let is_mathml_text_integration_point node =
2229 node.Dom.namespace = Some "mathml" &&
2230 Parser_constants.is_mathml_text_integration node.Dom.name
2231 in
2232 (* Foreign content handling *)
2233 let in_foreign =
2234 match adjusted_current_node t with
2235 | None -> false
2236 | Some node ->
2237 if is_in_html_namespace node then false
2238 else begin
2239 (* At HTML integration points, characters and start tags (except mglyph/malignmark) use HTML rules *)
2240 if is_html_integration_point node then begin
2241 match token with
2242 | Token.Character _ -> false
2243 | Token.Tag { kind = Token.Start; _ } -> false
2244 | _ -> true
2245 end
2246 (* At MathML text integration points, characters and start tags (except mglyph/malignmark) use HTML rules *)
2247 else if is_mathml_text_integration_point node then begin
2248 match token with
2249 | Token.Character _ -> false
2250 | Token.Tag { kind = Token.Start; name; _ } ->
2251 name = "mglyph" || name = "malignmark"
2252 | _ -> true
2253 end
2254 (* Special case: <svg> inside annotation-xml uses HTML rules (creates svg in svg namespace) *)
2255 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then begin
2256 match token with
2257 | Token.Tag { kind = Token.Start; name; _ } when String.lowercase_ascii name = "svg" -> false
2258 | _ -> true
2259 end
2260 (* Not at integration point - use foreign content rules *)
2261 (* Breakout handling is done inside process_foreign_content *)
2262 else true
2263 end
2264 in
2265
2266 (* Check if at HTML integration point for special table mode handling *)
2267 let at_integration_point =
2268 match adjusted_current_node t with
2269 | Some node ->
2270 is_html_integration_point node || is_mathml_text_integration_point node
2271 | None -> false
2272 in
2273
2274 if in_foreign then
2275 process_foreign_content t token
2276 else if at_integration_point then begin
2277 (* At integration points, check if in table mode without table in scope *)
2278 let is_table_mode = List.mem t.mode [In_table; In_table_body; In_row; In_cell; In_caption; In_column_group] in
2279 let has_table = has_element_in_table_scope t "table" in
2280 if is_table_mode && not has_table then begin
2281 match token with
2282 | Token.Tag { kind = Token.Start; _ } ->
2283 (* Temporarily use IN_BODY for start tags in table mode without table *)
2284 let saved_mode = t.mode in
2285 t.mode <- In_body;
2286 process_by_mode t token;
2287 if t.mode = In_body then t.mode <- saved_mode
2288 | _ -> process_by_mode t token
2289 end else
2290 process_by_mode t token
2291 end else
2292 process_by_mode t token
2293
2294(* Pop foreign elements until HTML or integration point *)
2295and pop_until_html_or_integration_point t =
2296 let is_html_integration_point node =
2297 (* SVG foreignObject, desc, and title are always HTML integration points *)
2298 if node.Dom.namespace = Some "svg" &&
2299 Parser_constants.is_svg_html_integration node.Dom.name then true
2300 (* annotation-xml is an HTML integration point only with specific encoding values *)
2301 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
2302 match List.assoc_opt "encoding" node.Dom.attrs with
2303 | Some enc ->
2304 let enc_lower = String.lowercase_ascii enc in
2305 enc_lower = "text/html" || enc_lower = "application/xhtml+xml"
2306 | None -> false
2307 else false
2308 in
2309 (* Get fragment context element - only for foreign namespace fragment contexts *)
2310 let fragment_context_elem = t.fragment_context_element in
2311 let rec pop () =
2312 match current_node t with
2313 | None -> ()
2314 | Some node ->
2315 if is_in_html_namespace node then ()
2316 else if is_html_integration_point node then ()
2317 (* Don't pop past fragment context element *)
2318 else (match fragment_context_elem with
2319 | Some ctx when node == ctx -> ()
2320 | _ ->
2321 pop_current t;
2322 pop ())
2323 in
2324 pop ()
2325
2326(* Foreign breakout elements - these break out of foreign content *)
2327and is_foreign_breakout_element name =
2328 List.mem (String.lowercase_ascii name)
2329 ["b"; "big"; "blockquote"; "body"; "br"; "center"; "code"; "dd"; "div"; "dl"; "dt";
2330 "em"; "embed"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "head"; "hr"; "i"; "img"; "li";
2331 "listing"; "menu"; "meta"; "nobr"; "ol"; "p"; "pre"; "ruby"; "s"; "small"; "span";
2332 "strong"; "strike"; "sub"; "sup"; "table"; "tt"; "u"; "ul"; "var"]
2333
2334and process_foreign_content t token =
2335 match token with
2336 | Token.Character data when String.contains data '\x00' ->
2337 (* Replace NUL characters with U+FFFD replacement character *)
2338 parse_error t "unexpected-null-character";
2339 let buf = Buffer.create (String.length data) in
2340 let has_non_ws_non_nul = ref false in
2341 String.iter (fun c ->
2342 if c = '\x00' then Buffer.add_string buf "\xEF\xBF\xBD"
2343 else begin
2344 Buffer.add_char buf c;
2345 if not (c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r') then
2346 has_non_ws_non_nul := true
2347 end
2348 ) data;
2349 let replaced = Buffer.contents buf in
2350 insert_character t replaced;
2351 (* Only set frameset_ok to false if there's actual non-whitespace non-NUL content *)
2352 if !has_non_ws_non_nul then t.frameset_ok <- false
2353 | Token.Character data when is_whitespace data ->
2354 insert_character t data
2355 | Token.Character data ->
2356 insert_character t data;
2357 t.frameset_ok <- false
2358 | Token.Comment data ->
2359 insert_comment t data
2360 | Token.Doctype _ ->
2361 parse_error t "unexpected-doctype"
2362 | Token.Tag { kind = Token.Start; name; _ } when is_foreign_breakout_element name ->
2363 (* Breakout from foreign content - pop until HTML or integration point, reprocess in HTML mode *)
2364 parse_error t "unexpected-html-element-in-foreign-content";
2365 pop_until_html_or_integration_point t;
2366 reset_insertion_mode t;
2367 (* Use process_by_mode to force HTML mode processing and avoid infinite loop *)
2368 process_by_mode t token
2369 | Token.Tag { kind = Token.Start; name = "font"; attrs; _ }
2370 when List.exists (fun (n, _) ->
2371 let n = String.lowercase_ascii n in
2372 n = "color" || n = "face" || n = "size") attrs ->
2373 (* font with color/face/size breaks out of foreign content *)
2374 parse_error t "unexpected-html-element-in-foreign-content";
2375 pop_until_html_or_integration_point t;
2376 reset_insertion_mode t;
2377 process_by_mode t token
2378 | Token.Tag { kind = Token.Start; name; attrs; self_closing } ->
2379 let name =
2380 match adjusted_current_node t with
2381 | Some n when n.Dom.namespace = Some "svg" -> Parser_constants.adjust_svg_tag_name name
2382 | _ -> name
2383 in
2384 let attrs =
2385 match adjusted_current_node t with
2386 | Some n when n.Dom.namespace = Some "svg" ->
2387 Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs attrs)
2388 | Some n when n.Dom.namespace = Some "mathml" ->
2389 Parser_constants.adjust_mathml_attrs (Parser_constants.adjust_foreign_attrs attrs)
2390 | _ -> Parser_constants.adjust_foreign_attrs attrs
2391 in
2392 let namespace =
2393 match adjusted_current_node t with
2394 | Some n -> n.Dom.namespace
2395 | None -> None
2396 in
2397 let node = insert_element t name ~namespace attrs in
2398 t.open_elements <- node :: t.open_elements;
2399 if self_closing then pop_current t
2400 | Token.Tag { kind = Token.End; name; _ } when List.mem (String.lowercase_ascii name) ["br"; "p"] ->
2401 (* Special case: </br> and </p> end tags trigger breakout from foreign content *)
2402 parse_error t "unexpected-html-element-in-foreign-content";
2403 pop_until_html_or_integration_point t;
2404 reset_insertion_mode t;
2405 (* Use process_by_mode to force HTML mode processing and avoid infinite loop *)
2406 process_by_mode t token
2407 | Token.Tag { kind = Token.End; name; _ } ->
2408 (* Find matching element per WHATWG spec for foreign content *)
2409 let is_fragment_context n =
2410 match t.fragment_context_element with
2411 | Some ctx -> n == ctx
2412 | None -> false
2413 in
2414 let name_lower = String.lowercase_ascii name in
2415 (* Walk through stack looking for matching element *)
2416 let rec find_and_process first_node idx = function
2417 | [] -> () (* Stack exhausted - ignore tag *)
2418 | n :: rest ->
2419 let node_name_lower = String.lowercase_ascii n.Dom.name in
2420 let is_html = is_in_html_namespace n in
2421 let name_matches = node_name_lower = name_lower in
2422
2423 (* If first node doesn't match tag name, it's a parse error *)
2424 if first_node && not name_matches then
2425 parse_error t "unexpected-end-tag-in-foreign-content";
2426
2427 (* Check if this node matches the end tag *)
2428 if name_matches then begin
2429 (* Fragment context check *)
2430 if is_fragment_context n then
2431 parse_error t "unexpected-end-tag-in-fragment-context"
2432 (* If matched element is in HTML namespace, reprocess via HTML mode *)
2433 else if is_html then
2434 process_by_mode t token
2435 (* Otherwise it's a foreign element - pop everything from this point up *)
2436 else begin
2437 (* Pop all elements from current down to and including the matched element *)
2438 let rec pop_to_idx current_idx =
2439 if current_idx >= idx then begin
2440 pop_current t;
2441 pop_to_idx (current_idx - 1)
2442 end
2443 in
2444 pop_to_idx (List.length t.open_elements - 1)
2445 end
2446 end
2447 (* If we hit an HTML element that doesn't match, process via HTML mode *)
2448 else if is_html then
2449 process_by_mode t token
2450 (* Continue searching in the stack *)
2451 else
2452 find_and_process false (idx - 1) rest
2453 in
2454 find_and_process true (List.length t.open_elements - 1) t.open_elements
2455 | Token.EOF ->
2456 process_by_mode t token
2457
2458and process_by_mode t token =
2459 match t.mode with
2460 | Parser_insertion_mode.Initial -> process_initial t token
2461 | Parser_insertion_mode.Before_html -> process_before_html t token
2462 | Parser_insertion_mode.Before_head -> process_before_head t token
2463 | Parser_insertion_mode.In_head -> process_in_head t token
2464 | Parser_insertion_mode.In_head_noscript -> process_in_head_noscript t token
2465 | Parser_insertion_mode.After_head -> process_after_head t token
2466 | Parser_insertion_mode.In_body -> process_in_body t token
2467 | Parser_insertion_mode.Text -> process_text t token
2468 | Parser_insertion_mode.In_table -> process_in_table t token
2469 | Parser_insertion_mode.In_table_text -> process_in_table_text t token
2470 | Parser_insertion_mode.In_caption -> process_in_caption t token
2471 | Parser_insertion_mode.In_column_group -> process_in_column_group t token
2472 | Parser_insertion_mode.In_table_body -> process_in_table_body t token
2473 | Parser_insertion_mode.In_row -> process_in_row t token
2474 | Parser_insertion_mode.In_cell -> process_in_cell t token
2475 | Parser_insertion_mode.In_select -> process_in_select t token
2476 | Parser_insertion_mode.In_select_in_table -> process_in_select_in_table t token
2477 | Parser_insertion_mode.In_template -> process_in_template t token
2478 | Parser_insertion_mode.After_body -> process_after_body t token
2479 | Parser_insertion_mode.In_frameset -> process_in_frameset t token
2480 | Parser_insertion_mode.After_frameset -> process_after_frameset t token
2481 | Parser_insertion_mode.After_after_body -> process_after_after_body t token
2482 | Parser_insertion_mode.After_after_frameset -> process_after_after_frameset t token
2483
2484(* Populate selectedcontent elements with content from selected option *)
2485let find_elements name node =
2486 let result = ref [] in
2487 let rec find n =
2488 if n.Dom.name = name then result := n :: !result;
2489 List.iter find n.Dom.children
2490 in
2491 find node;
2492 List.rev !result (* Reverse to maintain document order *)
2493
2494let find_element name node =
2495 let rec find n =
2496 if n.Dom.name = name then Some n
2497 else
2498 List.find_map find n.Dom.children
2499 in
2500 find node
2501
2502let populate_selectedcontent document =
2503 let selects = find_elements "select" document in
2504 List.iter (fun select ->
2505 match find_element "selectedcontent" select with
2506 | None -> ()
2507 | Some selectedcontent ->
2508 let options = find_elements "option" select in
2509 if options <> [] then begin
2510 (* Find selected option or use first *)
2511 let selected_option =
2512 match List.find_opt (fun opt -> Dom.has_attr opt "selected") options with
2513 | Some opt -> opt
2514 | None -> List.hd options
2515 in
2516 (* Clone children from selected option to selectedcontent *)
2517 List.iter (fun child ->
2518 let cloned = Dom.clone ~deep:true child in
2519 Dom.append_child selectedcontent cloned
2520 ) selected_option.Dom.children
2521 end
2522 ) selects
2523
2524let finish t =
2525 (* Populate selectedcontent elements *)
2526 populate_selectedcontent t.document;
2527 (* For fragment parsing, remove the html wrapper and promote children *)
2528 if t.fragment_context <> None then begin
2529 match t.document.Dom.children with
2530 | [root] when root.Dom.name = "html" ->
2531 (* Move context element's children to root if applicable *)
2532 (match t.fragment_context_element with
2533 | Some ctx_elem ->
2534 (match ctx_elem.Dom.parent with
2535 | Some p when p == root ->
2536 let ctx_children = ctx_elem.Dom.children in
2537 List.iter (fun child ->
2538 Dom.remove_child ctx_elem child;
2539 Dom.append_child root child
2540 ) ctx_children;
2541 Dom.remove_child root ctx_elem
2542 | _ -> ())
2543 | None -> ());
2544 (* Promote root's children to document - preserve order *)
2545 let children_copy = root.Dom.children in
2546 List.iter (fun child ->
2547 Dom.remove_child root child;
2548 Dom.append_child t.document child
2549 ) children_copy;
2550 Dom.remove_child t.document root
2551 | _ -> ()
2552 end;
2553 t.document
2554
2555let get_errors t = List.rev t.errors