we (web engine): Experimental web browser project to understand the limits of Claude
1//! HTML tree builder: construct a DOM tree from tokenizer output.
2//!
3//! Implements a simplified subset of the WHATWG HTML5 tree construction
4//! algorithm for Phase 3 of the browser engine.
5
6use we_dom::{Document, NodeId};
7
8use crate::{Token, Tokenizer};
9
/// Insertion modes of the tree builder state machine.
///
/// `TreeBuilder::process_token` hands each token to the `handle_*` method that
/// corresponds to the current mode.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum InsertionMode {
    /// Start state, before a DOCTYPE or any other content has been seen.
    Initial,
    /// Waiting for the `<html>` element (created implicitly if it is missing).
    BeforeHtml,
    /// Waiting for the `<head>` element (created implicitly if it is missing).
    BeforeHead,
    /// Inside `<head>`.
    InHead,
    /// Buffering character data for an element such as `<title>` or `<textarea>`.
    Text,
    /// After `<head>` was closed, waiting for `<body>` (created implicitly if missing).
    AfterHead,
    /// Inside `<body>`.
    InBody,
    /// Inside a `<select>` element.
    InSelect,
    /// After the body has been closed.
    AfterBody,
    /// After `</html>` was seen in `AfterBody` mode.
    AfterAfterBody,
}
24
/// Reports whether `tag` names a void element, i.e. one that is never given an end tag.
///
/// The comparison is exact, so callers must pass the tag name in lowercase.
fn is_void_element(tag: &str) -> bool {
    const VOID_ELEMENTS: &[&str] = &[
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];
    VOID_ELEMENTS.contains(&tag)
}
45
46/// SVG namespace URI.
47const SVG_NAMESPACE: &str = "http://www.w3.org/2000/svg";
48
49/// HTML tree builder that processes tokens and constructs a DOM tree.
50pub struct TreeBuilder {
51 document: Document,
52 /// Stack of open elements (the current nesting context).
53 open_elements: Vec<NodeId>,
54 head_element: Option<NodeId>,
55 body_element: Option<NodeId>,
56 /// The form element pointer (per HTML spec §13.2.4.1).
57 form_element: Option<NodeId>,
58 insertion_mode: InsertionMode,
59 /// Original insertion mode, saved when switching to Text mode.
60 original_insertion_mode: Option<InsertionMode>,
61 /// Pending text for the Text insertion mode (e.g., inside `<title>`).
62 pending_text: String,
63 /// Depth counter for SVG foreign content. >0 means we are inside an `<svg>` element.
64 svg_depth: usize,
65}
66
67impl TreeBuilder {
68 /// Create a new tree builder with an empty document.
69 pub fn new() -> Self {
70 TreeBuilder {
71 document: Document::new(),
72 open_elements: Vec::new(),
73 head_element: None,
74 body_element: None,
75 form_element: None,
76 insertion_mode: InsertionMode::Initial,
77 original_insertion_mode: None,
78 pending_text: String::new(),
79 svg_depth: 0,
80 }
81 }
82
83 /// Process a single token, updating the DOM tree.
84 pub fn process_token(&mut self, token: Token) {
85 // Handle SVG foreign content.
86 if self.svg_depth > 0 {
87 self.handle_svg_content(token);
88 return;
89 }
90
91 match self.insertion_mode {
92 InsertionMode::Initial => self.handle_initial(token),
93 InsertionMode::BeforeHtml => self.handle_before_html(token),
94 InsertionMode::BeforeHead => self.handle_before_head(token),
95 InsertionMode::InHead => self.handle_in_head(token),
96 InsertionMode::Text => self.handle_text(token),
97 InsertionMode::AfterHead => self.handle_after_head(token),
98 InsertionMode::InBody => self.handle_in_body(token),
99 InsertionMode::InSelect => self.handle_in_select(token),
100 InsertionMode::AfterBody => self.handle_after_body(token),
101 InsertionMode::AfterAfterBody => self.handle_after_after_body(token),
102 }
103 }
104
105 /// Finish building and return the constructed DOM document.
106 pub fn finish(self) -> Document {
107 self.document
108 }
109
110 // --- Insertion mode handlers ---
111
112 fn handle_initial(&mut self, token: Token) {
113 match token {
114 Token::Doctype { .. } => {
115 // For Phase 3, we just acknowledge the DOCTYPE and move on.
116 self.insertion_mode = InsertionMode::BeforeHtml;
117 }
118 Token::Comment(data) => {
119 let comment = self.document.create_comment(&data);
120 let root = self.document.root();
121 self.document.append_child(root, comment);
122 }
123 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
124 // Ignore whitespace in Initial mode.
125 }
126 _ => {
127 // Anything else: switch to BeforeHtml and reprocess.
128 self.insertion_mode = InsertionMode::BeforeHtml;
129 self.handle_before_html(token);
130 }
131 }
132 }
133
134 fn handle_before_html(&mut self, token: Token) {
135 match token {
136 Token::Doctype { .. } => { /* ignore */ }
137 Token::Comment(data) => {
138 let comment = self.document.create_comment(&data);
139 let root = self.document.root();
140 self.document.append_child(root, comment);
141 }
142 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
143 // Ignore whitespace.
144 }
145 Token::StartTag { ref name, .. } if name == "html" => {
146 let html = self.create_element_from_token(&token);
147 let root = self.document.root();
148 self.document.append_child(root, html);
149 self.open_elements.push(html);
150 self.insertion_mode = InsertionMode::BeforeHead;
151 }
152 Token::EndTag { ref name }
153 if name != "head" && name != "body" && name != "html" && name != "br" =>
154 {
155 // Parse error, ignore.
156 }
157 _ => {
158 // Create an implicit <html> element.
159 let html = self.document.create_element("html");
160 let root = self.document.root();
161 self.document.append_child(root, html);
162 self.open_elements.push(html);
163 self.insertion_mode = InsertionMode::BeforeHead;
164 self.handle_before_head(token);
165 }
166 }
167 }
168
169 fn handle_before_head(&mut self, token: Token) {
170 match token {
171 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
172 // Ignore whitespace.
173 }
174 Token::Comment(data) => {
175 self.insert_comment(&data);
176 }
177 Token::Doctype { .. } => { /* ignore */ }
178 Token::StartTag { ref name, .. } if name == "html" => {
179 // Process as if InBody.
180 self.handle_in_body(token);
181 }
182 Token::StartTag { ref name, .. } if name == "head" => {
183 let head = self.create_element_from_token(&token);
184 self.insert_node(head);
185 self.open_elements.push(head);
186 self.head_element = Some(head);
187 self.insertion_mode = InsertionMode::InHead;
188 }
189 Token::EndTag { ref name }
190 if name != "head" && name != "body" && name != "html" && name != "br" =>
191 {
192 // Parse error, ignore.
193 }
194 _ => {
195 // Implied <head>.
196 let head = self.document.create_element("head");
197 self.insert_node(head);
198 self.open_elements.push(head);
199 self.head_element = Some(head);
200 self.insertion_mode = InsertionMode::InHead;
201 self.handle_in_head(token);
202 }
203 }
204 }
205
206 fn handle_in_head(&mut self, token: Token) {
207 match token {
208 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
209 self.insert_text(s);
210 }
211 Token::Comment(data) => {
212 self.insert_comment(&data);
213 }
214 Token::Doctype { .. } => { /* ignore */ }
215 Token::StartTag { ref name, .. } if name == "title" => {
216 let elem = self.create_element_from_token(&token);
217 self.insert_node(elem);
218 self.open_elements.push(elem);
219 self.original_insertion_mode = Some(self.insertion_mode);
220 self.insertion_mode = InsertionMode::Text;
221 }
222 Token::StartTag { ref name, .. }
223 if name == "style" || name == "script" || name == "noscript" =>
224 {
225 let elem = self.create_element_from_token(&token);
226 self.insert_node(elem);
227 self.open_elements.push(elem);
228 self.original_insertion_mode = Some(self.insertion_mode);
229 self.insertion_mode = InsertionMode::Text;
230 }
231 Token::StartTag { ref name, .. } if name == "meta" || name == "link" => {
232 let elem = self.create_element_from_token(&token);
233 self.insert_node(elem);
234 // Void elements: don't push onto stack.
235 }
236 Token::StartTag { ref name, .. } if name == "head" => {
237 // Ignore duplicate <head>.
238 }
239 Token::EndTag { ref name } if name == "head" => {
240 self.pop_until("head");
241 self.insertion_mode = InsertionMode::AfterHead;
242 }
243 Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => {
244 // Parse error, ignore.
245 }
246 _ => {
247 // Pop <head> and switch to AfterHead, then reprocess.
248 self.pop_until("head");
249 self.insertion_mode = InsertionMode::AfterHead;
250 self.handle_after_head(token);
251 }
252 }
253 }
254
255 fn handle_text(&mut self, token: Token) {
256 match token {
257 Token::Character(s) => {
258 self.pending_text.push_str(&s);
259 }
260 Token::EndTag { .. } => {
261 // Flush pending text.
262 if !self.pending_text.is_empty() {
263 let text = self.pending_text.clone();
264 self.pending_text.clear();
265 self.insert_text(&text);
266 }
267 // Pop the element (e.g., <title>).
268 self.open_elements.pop();
269 self.insertion_mode = self
270 .original_insertion_mode
271 .unwrap_or(InsertionMode::InBody);
272 self.original_insertion_mode = None;
273 }
274 Token::Eof => {
275 // Flush pending text.
276 if !self.pending_text.is_empty() {
277 let text = self.pending_text.clone();
278 self.pending_text.clear();
279 self.insert_text(&text);
280 }
281 self.open_elements.pop();
282 self.insertion_mode = self
283 .original_insertion_mode
284 .unwrap_or(InsertionMode::InBody);
285 self.original_insertion_mode = None;
286 self.process_token(Token::Eof);
287 }
288 _ => {}
289 }
290 }
291
292 fn handle_after_head(&mut self, token: Token) {
293 match token {
294 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
295 self.insert_text(s);
296 }
297 Token::Comment(data) => {
298 self.insert_comment(&data);
299 }
300 Token::Doctype { .. } => { /* ignore */ }
301 Token::StartTag { ref name, .. } if name == "html" => {
302 self.handle_in_body(token);
303 }
304 Token::StartTag { ref name, .. } if name == "body" => {
305 let body = self.create_element_from_token(&token);
306 self.insert_node(body);
307 self.open_elements.push(body);
308 self.body_element = Some(body);
309 self.insertion_mode = InsertionMode::InBody;
310 }
311 Token::StartTag { ref name, .. } if name == "head" => {
312 // Ignore.
313 }
314 Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => {
315 // Ignore.
316 }
317 _ => {
318 // Implied <body>.
319 let body = self.document.create_element("body");
320 self.insert_node(body);
321 self.open_elements.push(body);
322 self.body_element = Some(body);
323 self.insertion_mode = InsertionMode::InBody;
324 self.handle_in_body(token);
325 }
326 }
327 }
328
329 fn handle_in_body(&mut self, token: Token) {
330 match token {
331 Token::Character(s) => {
332 self.insert_text(&s);
333 }
334 Token::Comment(data) => {
335 self.insert_comment(&data);
336 }
337 Token::Doctype { .. } => { /* ignore */ }
338 Token::StartTag { ref name, .. } if name == "html" => {
339 // Merge attributes onto existing <html> element.
340 if let Token::StartTag { attributes, .. } = &token {
341 if let Some(&html_id) = self.open_elements.first() {
342 for (attr_name, attr_value) in attributes {
343 if self.document.get_attribute(html_id, attr_name).is_none() {
344 self.document.set_attribute(html_id, attr_name, attr_value);
345 }
346 }
347 }
348 }
349 }
350 Token::StartTag { ref name, .. }
351 if name == "body"
352 || name == "head"
353 || name == "title"
354 || name == "style"
355 || name == "script" =>
356 {
357 match name.as_str() {
358 "body" => {
359 // Ignore duplicate <body>.
360 }
361 "head" => {
362 // Ignore <head> in body.
363 }
364 _ => {
365 // title/style/script: process using InHead rules
366 self.handle_in_head(token);
367 }
368 }
369 }
370 Token::StartTag { ref name, .. }
371 if name == "p"
372 || name == "div"
373 || name == "h1"
374 || name == "h2"
375 || name == "h3"
376 || name == "h4"
377 || name == "h5"
378 || name == "h6"
379 || name == "pre"
380 || name == "blockquote"
381 || name == "ul"
382 || name == "ol"
383 || name == "li" =>
384 {
385 // If there's a <p> in button scope, close it first.
386 if self.has_element_in_button_scope("p") {
387 self.close_p_element();
388 }
389 let elem = self.create_element_from_token(&token);
390 self.insert_node(elem);
391 self.open_elements.push(elem);
392 }
393 // --- Form elements (Phase 16) ---
394 Token::StartTag { ref name, .. } if name == "form" => {
395 // Per spec: if the form element pointer is not null, ignore.
396 if self.form_element.is_some() {
397 // Parse error, ignore the token.
398 } else {
399 if self.has_element_in_button_scope("p") {
400 self.close_p_element();
401 }
402 let elem = self.create_element_from_token(&token);
403 self.insert_node(elem);
404 self.open_elements.push(elem);
405 self.form_element = Some(elem);
406 }
407 }
408 Token::StartTag { ref name, .. } if name == "fieldset" => {
409 if self.has_element_in_button_scope("p") {
410 self.close_p_element();
411 }
412 let elem = self.create_element_from_token(&token);
413 self.insert_node(elem);
414 self.open_elements.push(elem);
415 }
416 Token::StartTag { ref name, .. } if name == "button" => {
417 // If there's a button in scope, close it first.
418 if self.has_element_in_scope("button") {
419 self.generate_implied_end_tags(None);
420 self.pop_until("button");
421 }
422 let elem = self.create_element_from_token(&token);
423 self.insert_node(elem);
424 self.open_elements.push(elem);
425 }
426 Token::StartTag { ref name, .. } if name == "textarea" => {
427 let elem = self.create_element_from_token(&token);
428 self.insert_node(elem);
429 self.open_elements.push(elem);
430 // Switch to Text mode to collect raw text content.
431 self.original_insertion_mode = Some(self.insertion_mode);
432 self.insertion_mode = InsertionMode::Text;
433 }
434 Token::StartTag { ref name, .. } if name == "select" => {
435 let elem = self.create_element_from_token(&token);
436 self.insert_node(elem);
437 self.open_elements.push(elem);
438 self.insertion_mode = InsertionMode::InSelect;
439 }
440 Token::StartTag { ref name, .. } if name == "optgroup" || name == "option" => {
441 // Close any currently open <option>.
442 if let Some(&top) = self.open_elements.last() {
443 if self.document.tag_name(top) == Some("option") {
444 self.open_elements.pop();
445 }
446 }
447 // Also close open <optgroup> when a new <optgroup> starts.
448 if name == "optgroup" {
449 if let Some(&top) = self.open_elements.last() {
450 if self.document.tag_name(top) == Some("optgroup") {
451 self.open_elements.pop();
452 }
453 }
454 }
455 let elem = self.create_element_from_token(&token);
456 self.insert_node(elem);
457 self.open_elements.push(elem);
458 }
459 // --- SVG / iframe / void / generic start tags ---
460 Token::StartTag { ref name, .. } if name == "svg" => {
461 let elem = self.create_svg_element_from_token(&token);
462 self.insert_node(elem);
463 self.open_elements.push(elem);
464 self.svg_depth = 1;
465 }
466 Token::StartTag { ref name, .. } if name == "iframe" => {
467 // Per HTML spec, <iframe> uses RAWTEXT parsing: content between
468 // <iframe> and </iframe> is raw text (fallback content), not HTML.
469 let elem = self.create_element_from_token(&token);
470 self.insert_node(elem);
471 self.open_elements.push(elem);
472 self.original_insertion_mode = Some(self.insertion_mode);
473 self.insertion_mode = InsertionMode::Text;
474 }
475 Token::StartTag { ref name, .. } if is_void_element(name) => {
476 let elem = self.create_element_from_token(&token);
477 self.insert_node(elem);
478 // Don't push void elements onto the stack.
479 }
480 Token::StartTag { .. } => {
481 // Generic start tag: create element and push onto stack.
482 let elem = self.create_element_from_token(&token);
483 self.insert_node(elem);
484 self.open_elements.push(elem);
485 }
486 Token::EndTag { ref name } if name == "body" => {
487 if self.has_element_in_scope("body") {
488 self.insertion_mode = InsertionMode::AfterBody;
489 }
490 }
491 Token::EndTag { ref name } if name == "html" => {
492 if self.has_element_in_scope("body") {
493 self.insertion_mode = InsertionMode::AfterBody;
494 self.handle_after_body(token);
495 }
496 }
497 Token::EndTag { ref name } if name == "p" => {
498 if !self.has_element_in_button_scope("p") {
499 // No matching <p>: insert an empty one, then close it.
500 let p = self.document.create_element("p");
501 self.insert_node(p);
502 self.open_elements.push(p);
503 }
504 self.close_p_element();
505 }
506 // --- Form end tags (Phase 16) ---
507 Token::EndTag { ref name } if name == "form" => {
508 // Per spec: reset the form element pointer, then pop.
509 self.form_element = None;
510 if self.has_element_in_scope("form") {
511 self.generate_implied_end_tags(Some("form"));
512 self.pop_until("form");
513 }
514 }
515 Token::EndTag { ref name } if name == "button" => {
516 if self.has_element_in_scope("button") {
517 self.generate_implied_end_tags(None);
518 self.pop_until("button");
519 }
520 }
521 Token::EndTag { ref name } if name == "fieldset" => {
522 if self.has_element_in_scope("fieldset") {
523 self.generate_implied_end_tags(None);
524 self.pop_until("fieldset");
525 }
526 }
527 Token::EndTag { ref name } if name == "optgroup" || name == "option" => {
528 if self.has_element_in_scope(name) {
529 self.generate_implied_end_tags(Some(name));
530 self.pop_until(name);
531 }
532 }
533 // --- End of form end tags ---
534 Token::EndTag { ref name }
535 if name == "div"
536 || name == "pre"
537 || name == "blockquote"
538 || name == "ul"
539 || name == "ol"
540 || name == "li" =>
541 {
542 if self.has_element_in_scope(name) {
543 self.generate_implied_end_tags(Some(name));
544 self.pop_until(name);
545 }
546 }
547 Token::EndTag { ref name }
548 if name == "h1"
549 || name == "h2"
550 || name == "h3"
551 || name == "h4"
552 || name == "h5"
553 || name == "h6" =>
554 {
555 if self.has_heading_in_scope() {
556 self.generate_implied_end_tags(None);
557 // Pop until we find a heading element.
558 while let Some(id) = self.open_elements.pop() {
559 if let Some(tag) = self.document.tag_name(id) {
560 if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
561 break;
562 }
563 }
564 }
565 }
566 }
567 Token::EndTag { ref name } => {
568 // Generic end tag: walk back through open elements.
569 self.handle_any_other_end_tag(name);
570 }
571 Token::Eof => {
572 // Stop parsing.
573 }
574 }
575 }
576
577 /// Handle tokens inside a `<select>` element (InSelect insertion mode).
578 fn handle_in_select(&mut self, token: Token) {
579 match token {
580 Token::Character(s) => {
581 self.insert_text(&s);
582 }
583 Token::Comment(data) => {
584 self.insert_comment(&data);
585 }
586 Token::StartTag { ref name, .. } if name == "option" => {
587 // Close any currently open <option>.
588 if let Some(&top) = self.open_elements.last() {
589 if self.document.tag_name(top) == Some("option") {
590 self.open_elements.pop();
591 }
592 }
593 let elem = self.create_element_from_token(&token);
594 self.insert_node(elem);
595 self.open_elements.push(elem);
596 }
597 Token::StartTag { ref name, .. } if name == "optgroup" => {
598 // Close any open <option>, then close any open <optgroup>.
599 if let Some(&top) = self.open_elements.last() {
600 if self.document.tag_name(top) == Some("option") {
601 self.open_elements.pop();
602 }
603 }
604 if let Some(&top) = self.open_elements.last() {
605 if self.document.tag_name(top) == Some("optgroup") {
606 self.open_elements.pop();
607 }
608 }
609 let elem = self.create_element_from_token(&token);
610 self.insert_node(elem);
611 self.open_elements.push(elem);
612 }
613 Token::EndTag { ref name } if name == "option" => {
614 if let Some(&top) = self.open_elements.last() {
615 if self.document.tag_name(top) == Some("option") {
616 self.open_elements.pop();
617 }
618 }
619 }
620 Token::EndTag { ref name } if name == "optgroup" => {
621 // If the top is <option>, pop it first.
622 if let Some(&top) = self.open_elements.last() {
623 if self.document.tag_name(top) == Some("option") {
624 self.open_elements.pop();
625 }
626 }
627 if let Some(&top) = self.open_elements.last() {
628 if self.document.tag_name(top) == Some("optgroup") {
629 self.open_elements.pop();
630 }
631 }
632 }
633 Token::EndTag { ref name } if name == "select" => {
634 self.pop_until("select");
635 self.insertion_mode = InsertionMode::InBody;
636 }
637 Token::StartTag { ref name, .. } if name == "select" => {
638 // Nested <select>: close the current one (parse error).
639 self.pop_until("select");
640 self.insertion_mode = InsertionMode::InBody;
641 }
642 Token::StartTag { ref name, .. } if name == "input" || name == "textarea" => {
643 // Per spec: these close the <select> and reprocess in InBody.
644 self.pop_until("select");
645 self.insertion_mode = InsertionMode::InBody;
646 self.handle_in_body(token);
647 }
648 Token::Eof => {
649 // Stop parsing.
650 }
651 _ => {
652 // Ignore anything else in select.
653 }
654 }
655 }
656
657 fn handle_after_body(&mut self, token: Token) {
658 match token {
659 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
660 // Process whitespace as in InBody.
661 self.handle_in_body(token);
662 }
663 Token::Comment(data) => {
664 // Insert as last child of the first element (html).
665 let comment = self.document.create_comment(&data);
666 if let Some(&html) = self.open_elements.first() {
667 self.document.append_child(html, comment);
668 }
669 }
670 Token::Doctype { .. } => { /* ignore */ }
671 Token::EndTag { ref name } if name == "html" => {
672 self.insertion_mode = InsertionMode::AfterAfterBody;
673 }
674 Token::Eof => {
675 // Stop parsing.
676 }
677 _ => {
678 // Anything else: switch back to InBody and reprocess.
679 self.insertion_mode = InsertionMode::InBody;
680 self.handle_in_body(token);
681 }
682 }
683 }
684
685 fn handle_after_after_body(&mut self, token: Token) {
686 match token {
687 Token::Comment(data) => {
688 let comment = self.document.create_comment(&data);
689 let root = self.document.root();
690 self.document.append_child(root, comment);
691 }
692 Token::Doctype { .. } => { /* ignore */ }
693 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
694 self.handle_in_body(token);
695 }
696 Token::Eof => {
697 // Stop.
698 }
699 _ => {
700 self.insertion_mode = InsertionMode::InBody;
701 self.handle_in_body(token);
702 }
703 }
704 }
705
706 // --- SVG foreign content ---
707
708 /// Handle tokens while inside an SVG subtree.
709 fn handle_svg_content(&mut self, token: Token) {
710 match token {
711 Token::Character(s) => {
712 self.insert_text(&s);
713 }
714 Token::Comment(data) => {
715 self.insert_comment(&data);
716 }
717 Token::StartTag { ref name, .. } if name == "svg" => {
718 // Nested <svg>.
719 let elem = self.create_svg_element_from_token(&token);
720 self.insert_node(elem);
721 self.open_elements.push(elem);
722 self.svg_depth += 1;
723 }
724 Token::StartTag { self_closing, .. } => {
725 let elem = self.create_svg_element_from_token(&token);
726 self.insert_node(elem);
727 if !self_closing {
728 self.open_elements.push(elem);
729 }
730 }
731 Token::EndTag { ref name } if name == "svg" => {
732 self.pop_until("svg");
733 self.svg_depth -= 1;
734 }
735 Token::EndTag { ref name } => {
736 // Pop matching element from the stack.
737 self.handle_any_other_end_tag(name);
738 }
739 Token::Eof => {}
740 _ => {}
741 }
742 }
743
744 // --- Helper methods ---
745
746 /// Create a DOM element from a StartTag token with SVG namespace, setting attributes.
747 fn create_svg_element_from_token(&mut self, token: &Token) -> NodeId {
748 if let Token::StartTag {
749 name, attributes, ..
750 } = token
751 {
752 let id = self.document.create_element_ns(name, Some(SVG_NAMESPACE));
753 for (attr_name, attr_value) in attributes {
754 self.document.set_attribute(id, attr_name, attr_value);
755 }
756 id
757 } else {
758 self.document
759 .create_element_ns("unknown", Some(SVG_NAMESPACE))
760 }
761 }
762
763 /// Create a DOM element from a StartTag token, setting attributes.
764 fn create_element_from_token(&mut self, token: &Token) -> NodeId {
765 if let Token::StartTag {
766 name, attributes, ..
767 } = token
768 {
769 let id = self.document.create_element(name);
770 for (attr_name, attr_value) in attributes {
771 self.document.set_attribute(id, attr_name, attr_value);
772 }
773 id
774 } else {
775 // Should only be called with StartTag tokens.
776 self.document.create_element("unknown")
777 }
778 }
779
780 /// Insert a node at the current insertion point (last open element).
781 fn insert_node(&mut self, node: NodeId) {
782 let parent = self
783 .open_elements
784 .last()
785 .copied()
786 .unwrap_or_else(|| self.document.root());
787 self.document.append_child(parent, node);
788 }
789
790 /// Insert a text node at the current insertion point.
791 /// If the last child is already a text node, append to it.
792 fn insert_text(&mut self, data: &str) {
793 let parent = self
794 .open_elements
795 .last()
796 .copied()
797 .unwrap_or_else(|| self.document.root());
798
799 // Try to merge with existing text node.
800 if let Some(last_child) = self.document.last_child(parent) {
801 if let we_dom::NodeData::Text { data: ref existing } =
802 *self.document.node_data(last_child)
803 {
804 let mut merged = existing.clone();
805 merged.push_str(data);
806 self.document.set_text_content(last_child, &merged);
807 return;
808 }
809 }
810
811 let text = self.document.create_text(data);
812 self.document.append_child(parent, text);
813 }
814
815 /// Insert a comment node at the current insertion point.
816 fn insert_comment(&mut self, data: &str) {
817 let comment = self.document.create_comment(data);
818 self.insert_node(comment);
819 }
820
821 /// Pop elements from the stack until we find one with the given tag name.
822 /// The matching element is also popped.
823 fn pop_until(&mut self, tag_name: &str) {
824 while let Some(id) = self.open_elements.pop() {
825 if self.document.tag_name(id) == Some(tag_name) {
826 return;
827 }
828 }
829 }
830
831 /// Check if the given tag name is "in scope" (simplified).
832 /// In scope means there's an element with that tag on the stack,
833 /// and no scope barrier element between it and the top.
834 fn has_element_in_scope(&self, target: &str) -> bool {
835 for &id in self.open_elements.iter().rev() {
836 if let Some(tag) = self.document.tag_name(id) {
837 if tag == target {
838 return true;
839 }
840 // Scope barrier elements.
841 if matches!(
842 tag,
843 "applet"
844 | "caption"
845 | "html"
846 | "table"
847 | "td"
848 | "th"
849 | "marquee"
850 | "object"
851 | "template"
852 ) {
853 return false;
854 }
855 }
856 }
857 false
858 }
859
860 /// Check if the given tag name is "in button scope".
861 fn has_element_in_button_scope(&self, target: &str) -> bool {
862 for &id in self.open_elements.iter().rev() {
863 if let Some(tag) = self.document.tag_name(id) {
864 if tag == target {
865 return true;
866 }
867 // Button scope includes all regular scope barriers plus <button>.
868 if matches!(
869 tag,
870 "applet"
871 | "button"
872 | "caption"
873 | "html"
874 | "table"
875 | "td"
876 | "th"
877 | "marquee"
878 | "object"
879 | "template"
880 ) {
881 return false;
882 }
883 }
884 }
885 false
886 }
887
888 /// Check if any heading element (h1-h6) is in scope.
889 fn has_heading_in_scope(&self) -> bool {
890 for &id in self.open_elements.iter().rev() {
891 if let Some(tag) = self.document.tag_name(id) {
892 if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
893 return true;
894 }
895 if matches!(
896 tag,
897 "applet"
898 | "caption"
899 | "html"
900 | "table"
901 | "td"
902 | "th"
903 | "marquee"
904 | "object"
905 | "template"
906 ) {
907 return false;
908 }
909 }
910 }
911 false
912 }
913
914 /// Close a `<p>` element: generate implied end tags (excluding p),
915 /// then pop until we find the `<p>`.
916 fn close_p_element(&mut self) {
917 self.generate_implied_end_tags(Some("p"));
918 self.pop_until("p");
919 }
920
921 /// Generate implied end tags. If `exclude` is provided, don't generate
922 /// an end tag for that element.
923 fn generate_implied_end_tags(&mut self, exclude: Option<&str>) {
924 loop {
925 let should_pop = self
926 .open_elements
927 .last()
928 .and_then(|&id| self.document.tag_name(id))
929 .map(|tag| {
930 if let Some(excl) = exclude {
931 if tag == excl {
932 return false;
933 }
934 }
935 matches!(
936 tag,
937 "dd" | "dt"
938 | "li"
939 | "optgroup"
940 | "option"
941 | "p"
942 | "rb"
943 | "rp"
944 | "rt"
945 | "rtc"
946 )
947 })
948 .unwrap_or(false);
949 if should_pop {
950 self.open_elements.pop();
951 } else {
952 break;
953 }
954 }
955 }
956
957 /// Handle a generic end tag by walking back through open elements
958 /// using the "any other end tag" algorithm.
959 fn handle_any_other_end_tag(&mut self, name: &str) {
960 // Walk backwards through the stack.
961 let mut i = self.open_elements.len();
962 while i > 0 {
963 i -= 1;
964 let id = self.open_elements[i];
965 if self.document.tag_name(id) == Some(name) {
966 // Pop everything above and including this element.
967 self.open_elements.truncate(i);
968 return;
969 }
970 // If this is a "special" element, stop.
971 if let Some(tag) = self.document.tag_name(id) {
972 if is_special_element(tag) {
973 return;
974 }
975 }
976 }
977 }
978}
979
980impl Default for TreeBuilder {
981 fn default() -> Self {
982 Self::new()
983 }
984}
985
/// Reports whether `tag` belongs to the HTML spec's "special" element category.
///
/// The comparison is exact, so callers must pass the tag name in lowercase.
fn is_special_element(tag: &str) -> bool {
    const SPECIAL_ELEMENTS: &[&str] = &[
        "address",
        "applet",
        "area",
        "article",
        "aside",
        "base",
        "basefont",
        "bgsound",
        "blockquote",
        "body",
        "br",
        "button",
        "caption",
        "center",
        "col",
        "colgroup",
        "dd",
        "details",
        "dir",
        "div",
        "dl",
        "dt",
        "embed",
        "fieldset",
        "figcaption",
        "figure",
        "footer",
        "form",
        "frame",
        "frameset",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "head",
        "header",
        "hgroup",
        "hr",
        "html",
        "iframe",
        "img",
        "input",
        "legend",
        "li",
        "link",
        "listing",
        "main",
        "marquee",
        "menu",
        "meta",
        "nav",
        "noembed",
        "noframes",
        "noscript",
        "object",
        "ol",
        "p",
        "param",
        "plaintext",
        "pre",
        "script",
        "section",
        "select",
        "source",
        "style",
        "summary",
        "table",
        "tbody",
        "td",
        "template",
        "textarea",
        "tfoot",
        "th",
        "thead",
        "title",
        "tr",
        "track",
        "ul",
        "wbr",
        "xmp",
    ];
    SPECIAL_ELEMENTS.contains(&tag)
}
1074
1075/// Parse an HTML string into a DOM document.
1076///
1077/// This is a convenience function that tokenizes the input and builds
1078/// a DOM tree using the tree builder.
1079pub fn parse_html(input: &str) -> Document {
1080 let mut builder = TreeBuilder::new();
1081 let mut tokenizer = Tokenizer::new(input);
1082 loop {
1083 let token = tokenizer.next_token();
1084 let is_eof = token == Token::Eof;
1085 builder.process_token(token);
1086 if is_eof {
1087 break;
1088 }
1089 }
1090 builder.finish()
1091}
1092
1093#[cfg(test)]
1094mod tests {
1095 use super::*;
1096 use we_dom::NodeData;
1097
1098 /// Helper: collect tag names of direct children of a node.
1099 fn child_tags(doc: &Document, node: NodeId) -> Vec<String> {
1100 doc.children(node)
1101 .filter_map(|id| doc.tag_name(id).map(String::from))
1102 .collect()
1103 }
1104
1105 /// Helper: get the text content of all text node children, concatenated.
1106 fn text_of_children(doc: &Document, node: NodeId) -> String {
1107 let mut result = String::new();
1108 for child in doc.children(node) {
1109 if let Some(text) = doc.text_content(child) {
1110 result.push_str(text);
1111 }
1112 }
1113 result
1114 }
1115
#[test]
fn parse_full_document() {
    let markup =
        "<!DOCTYPE html><html><head><title>Test</title></head><body><p>Hello</p></body></html>";
    let doc = parse_html(markup);

    // The document root has exactly one child: <html>.
    let top_level: Vec<NodeId> = doc.children(doc.root()).collect();
    assert_eq!(top_level.len(), 1);
    let html = top_level[0];
    assert_eq!(doc.tag_name(html), Some("html"));

    // <html> contains <head> followed by <body>.
    assert_eq!(child_tags(&doc, html), vec!["head", "body"]);
    let head = doc.children(html).next().unwrap();
    let body = doc.children(html).nth(1).unwrap();

    // <head> holds a <title> whose text is "Test".
    assert_eq!(child_tags(&doc, head), vec!["title"]);
    let title = doc.children(head).next().unwrap();
    assert_eq!(text_of_children(&doc, title), "Test");

    // <body> holds a <p> whose text is "Hello".
    assert_eq!(child_tags(&doc, body), vec!["p"]);
    let paragraph = doc.children(body).next().unwrap();
    assert_eq!(text_of_children(&doc, paragraph), "Hello");
}
1151
1152 #[test]
1153 fn implicit_html_head_body() {
1154 // Minimal document: just <p>Hello
1155 let doc = parse_html("<p>Hello");
1156 let root = doc.root();
1157
1158 let html: Vec<NodeId> = doc.children(root).collect();
1159 assert_eq!(html.len(), 1);
1160 assert_eq!(doc.tag_name(html[0]), Some("html"));
1161
1162 let html_tags = child_tags(&doc, html[0]);
1163 assert_eq!(html_tags, vec!["head", "body"]);
1164
1165 let body = doc.children(html[0]).nth(1).unwrap();
1166 let body_tags = child_tags(&doc, body);
1167 assert_eq!(body_tags, vec!["p"]);
1168
1169 let p = doc.children(body).next().unwrap();
1170 assert_eq!(text_of_children(&doc, p), "Hello");
1171 }
1172
1173 #[test]
1174 fn void_element_br() {
1175 let doc = parse_html("<p>Line 1<br>Line 2</p>");
1176 let root = doc.root();
1177 let html = doc.children(root).next().unwrap();
1178 let body = doc.children(html).nth(1).unwrap();
1179 let p = doc.children(body).next().unwrap();
1180
1181 // <p> should have: text("Line 1"), <br>, text("Line 2")
1182 let children: Vec<NodeId> = doc.children(p).collect();
1183 assert_eq!(children.len(), 3);
1184 assert_eq!(doc.text_content(children[0]), Some("Line 1"));
1185 assert_eq!(doc.tag_name(children[1]), Some("br"));
1186 assert_eq!(doc.text_content(children[2]), Some("Line 2"));
1187 }
1188
1189 #[test]
1190 fn p_inside_p_closes_outer() {
1191 let doc = parse_html("<p>First<p>Second");
1192 let root = doc.root();
1193 let html = doc.children(root).next().unwrap();
1194 let body = doc.children(html).nth(1).unwrap();
1195
1196 // Should have two sibling <p> elements, not nested.
1197 let body_tags = child_tags(&doc, body);
1198 assert_eq!(body_tags, vec!["p", "p"]);
1199
1200 let children: Vec<NodeId> = doc.children(body).collect();
1201 assert_eq!(text_of_children(&doc, children[0]), "First");
1202 assert_eq!(text_of_children(&doc, children[1]), "Second");
1203 }
1204
1205 #[test]
1206 fn nested_div_elements() {
1207 let doc = parse_html("<div><div>inner</div></div>");
1208 let root = doc.root();
1209 let html = doc.children(root).next().unwrap();
1210 let body = doc.children(html).nth(1).unwrap();
1211
1212 let outer_div = doc.children(body).next().unwrap();
1213 assert_eq!(doc.tag_name(outer_div), Some("div"));
1214
1215 let inner_div = doc.children(outer_div).next().unwrap();
1216 assert_eq!(doc.tag_name(inner_div), Some("div"));
1217 assert_eq!(text_of_children(&doc, inner_div), "inner");
1218 }
1219
1220 #[test]
1221 fn inline_elements_nest_properly() {
1222 let doc = parse_html("<p><span><a href=\"#\">link</a></span></p>");
1223 let root = doc.root();
1224 let html = doc.children(root).next().unwrap();
1225 let body = doc.children(html).nth(1).unwrap();
1226
1227 let p = doc.children(body).next().unwrap();
1228 let span = doc.children(p).next().unwrap();
1229 assert_eq!(doc.tag_name(span), Some("span"));
1230
1231 let a = doc.children(span).next().unwrap();
1232 assert_eq!(doc.tag_name(a), Some("a"));
1233 assert_eq!(doc.get_attribute(a, "href"), Some("#"));
1234 assert_eq!(text_of_children(&doc, a), "link");
1235 }
1236
1237 #[test]
1238 fn headings() {
1239 let doc = parse_html("<h1>Title</h1><h2>Subtitle</h2><p>Body text</p>");
1240 let root = doc.root();
1241 let html = doc.children(root).next().unwrap();
1242 let body = doc.children(html).nth(1).unwrap();
1243
1244 let tags = child_tags(&doc, body);
1245 assert_eq!(tags, vec!["h1", "h2", "p"]);
1246 }
1247
1248 #[test]
1249 fn comment_nodes() {
1250 let doc = parse_html("<body><!-- a comment --><p>text</p></body>");
1251 let root = doc.root();
1252 let html = doc.children(root).next().unwrap();
1253 let body = doc.children(html).nth(1).unwrap();
1254
1255 let children: Vec<NodeId> = doc.children(body).collect();
1256 assert!(children.len() >= 2);
1257
1258 // First child should be a comment.
1259 match doc.node_data(children[0]) {
1260 NodeData::Comment { data } => assert_eq!(data, " a comment "),
1261 other => panic!("expected comment, got {:?}", other),
1262 }
1263 }
1264
1265 #[test]
1266 fn pre_element() {
1267 let doc = parse_html("<pre>line 1\nline 2</pre>");
1268 let root = doc.root();
1269 let html = doc.children(root).next().unwrap();
1270 let body = doc.children(html).nth(1).unwrap();
1271
1272 let pre = doc.children(body).next().unwrap();
1273 assert_eq!(doc.tag_name(pre), Some("pre"));
1274 assert_eq!(text_of_children(&doc, pre), "line 1\nline 2");
1275 }
1276
1277 #[test]
1278 fn attributes_preserved() {
1279 let doc =
1280 parse_html("<div id=\"main\" class=\"container\"><a href=\"/page\">link</a></div>");
1281 let root = doc.root();
1282 let html = doc.children(root).next().unwrap();
1283 let body = doc.children(html).nth(1).unwrap();
1284
1285 let div = doc.children(body).next().unwrap();
1286 assert_eq!(doc.get_attribute(div, "id"), Some("main"));
1287 assert_eq!(doc.get_attribute(div, "class"), Some("container"));
1288
1289 let a = doc.children(div).next().unwrap();
1290 assert_eq!(doc.get_attribute(a, "href"), Some("/page"));
1291 }
1292
1293 #[test]
1294 fn empty_document() {
1295 let doc = parse_html("");
1296 let root = doc.root();
1297 // Even an empty doc should get html/head/body from EOF handling.
1298 // The tree builder creates implicit elements.
1299 assert!(doc.children(root).next().is_some());
1300 }
1301
1302 #[test]
1303 fn just_text() {
1304 let doc = parse_html("Hello, world!");
1305 let root = doc.root();
1306 let html = doc.children(root).next().unwrap();
1307 let body = doc.children(html).nth(1).unwrap();
1308
1309 assert_eq!(text_of_children(&doc, body), "Hello, world!");
1310 }
1311
1312 #[test]
1313 fn heading_closes_open_p() {
1314 let doc = parse_html("<p>text<h1>heading</h1>");
1315 let root = doc.root();
1316 let html = doc.children(root).next().unwrap();
1317 let body = doc.children(html).nth(1).unwrap();
1318
1319 // <p> should be closed by <h1>, so they're siblings.
1320 let tags = child_tags(&doc, body);
1321 assert_eq!(tags, vec!["p", "h1"]);
1322 }
1323
1324 #[test]
1325 fn self_closing_void_elements() {
1326 let doc = parse_html("<p>before<br/>after</p>");
1327 let root = doc.root();
1328 let html = doc.children(root).next().unwrap();
1329 let body = doc.children(html).nth(1).unwrap();
1330 let p = doc.children(body).next().unwrap();
1331
1332 let children: Vec<NodeId> = doc.children(p).collect();
1333 assert_eq!(children.len(), 3);
1334 assert_eq!(doc.tag_name(children[1]), Some("br"));
1335 }
1336
1337 #[test]
1338 fn doctype_is_handled() {
1339 let doc = parse_html("<!DOCTYPE html><html><body></body></html>");
1340 let root = doc.root();
1341 let html = doc.children(root).next().unwrap();
1342 assert_eq!(doc.tag_name(html), Some("html"));
1343 }
1344
1345 #[test]
1346 fn tree_builder_step_by_step() {
1347 let mut builder = TreeBuilder::new();
1348 builder.process_token(Token::Doctype {
1349 name: Some("html".into()),
1350 public_id: None,
1351 system_id: None,
1352 force_quirks: false,
1353 });
1354 builder.process_token(Token::StartTag {
1355 name: "html".into(),
1356 attributes: vec![],
1357 self_closing: false,
1358 });
1359 builder.process_token(Token::StartTag {
1360 name: "head".into(),
1361 attributes: vec![],
1362 self_closing: false,
1363 });
1364 builder.process_token(Token::EndTag {
1365 name: "head".into(),
1366 });
1367 builder.process_token(Token::StartTag {
1368 name: "body".into(),
1369 attributes: vec![],
1370 self_closing: false,
1371 });
1372 builder.process_token(Token::StartTag {
1373 name: "p".into(),
1374 attributes: vec![],
1375 self_closing: false,
1376 });
1377 builder.process_token(Token::Character("Hello".into()));
1378 builder.process_token(Token::EndTag { name: "p".into() });
1379 builder.process_token(Token::EndTag {
1380 name: "body".into(),
1381 });
1382 builder.process_token(Token::EndTag {
1383 name: "html".into(),
1384 });
1385 builder.process_token(Token::Eof);
1386
1387 let doc = builder.finish();
1388 let root = doc.root();
1389 let html = doc.children(root).next().unwrap();
1390 assert_eq!(doc.tag_name(html), Some("html"));
1391
1392 let body = doc.children(html).nth(1).unwrap();
1393 let p = doc.children(body).next().unwrap();
1394 assert_eq!(text_of_children(&doc, p), "Hello");
1395 }
1396
1397 #[test]
1398 fn multiple_text_children_merge() {
1399 // When consecutive character tokens arrive, they should merge.
1400 let mut builder = TreeBuilder::new();
1401 builder.process_token(Token::StartTag {
1402 name: "p".into(),
1403 attributes: vec![],
1404 self_closing: false,
1405 });
1406 builder.process_token(Token::Character("Hello ".into()));
1407 builder.process_token(Token::Character("world".into()));
1408 builder.process_token(Token::EndTag { name: "p".into() });
1409 builder.process_token(Token::Eof);
1410
1411 let doc = builder.finish();
1412 let root = doc.root();
1413 let html = doc.children(root).next().unwrap();
1414 let body = doc.children(html).nth(1).unwrap();
1415 let p = doc.children(body).next().unwrap();
1416
1417 // Should be a single text node.
1418 let children: Vec<NodeId> = doc.children(p).collect();
1419 assert_eq!(children.len(), 1);
1420 assert_eq!(doc.text_content(children[0]), Some("Hello world"));
1421 }
1422
1423 #[test]
1424 fn parse_inline_svg() {
1425 let doc = parse_html(
1426 "<html><body><svg width=\"100\" height=\"100\"><rect width=\"50\" height=\"50\" fill=\"red\"/></svg></body></html>",
1427 );
1428 let root = doc.root();
1429 let html = doc.children(root).next().unwrap();
1430 let body = doc.children(html).nth(1).unwrap();
1431 let svg = doc.children(body).next().unwrap();
1432
1433 // SVG element should have SVG namespace.
1434 if let NodeData::Element {
1435 ref namespace,
1436 ref tag_name,
1437 ..
1438 } = *doc.node_data(svg)
1439 {
1440 assert_eq!(tag_name, "svg");
1441 assert_eq!(namespace.as_deref(), Some(SVG_NAMESPACE));
1442 } else {
1443 panic!("Expected Element node");
1444 }
1445
1446 // SVG should have width/height attributes.
1447 assert_eq!(doc.get_attribute(svg, "width"), Some("100"));
1448 assert_eq!(doc.get_attribute(svg, "height"), Some("100"));
1449
1450 // Rect child should also have SVG namespace.
1451 let rect = doc.children(svg).next().unwrap();
1452 if let NodeData::Element {
1453 ref namespace,
1454 ref tag_name,
1455 ..
1456 } = *doc.node_data(rect)
1457 {
1458 assert_eq!(tag_name, "rect");
1459 assert_eq!(namespace.as_deref(), Some(SVG_NAMESPACE));
1460 } else {
1461 panic!("Expected Element node");
1462 }
1463 assert_eq!(doc.get_attribute(rect, "fill"), Some("red"));
1464 }
1465
1466 #[test]
1467 fn parse_svg_with_nested_elements() {
1468 let doc = parse_html(
1469 "<body><svg width=\"200\" height=\"200\"><g><circle cx=\"50\" cy=\"50\" r=\"40\"/><text x=\"10\" y=\"80\">Hello</text></g></svg></body>",
1470 );
1471 let root = doc.root();
1472 let html = doc.children(root).next().unwrap();
1473 let body = doc.children(html).nth(1).unwrap();
1474 let svg = doc.children(body).next().unwrap();
1475
1476 assert_eq!(doc.tag_name(svg), Some("svg"));
1477 let g = doc.children(svg).next().unwrap();
1478 assert_eq!(doc.tag_name(g), Some("g"));
1479
1480 let children: Vec<String> = child_tags(&doc, g);
1481 assert_eq!(children, vec!["circle", "text"]);
1482
1483 // Text element should contain text content.
1484 let text_el = doc.children(g).nth(1).unwrap();
1485 assert_eq!(doc.deep_text_content(text_el), "Hello");
1486 }
1487
1488 #[test]
1489 fn svg_content_followed_by_html() {
1490 let doc = parse_html(
1491 "<body><svg width=\"50\" height=\"50\"><rect fill=\"blue\"/></svg><p>After SVG</p></body>",
1492 );
1493 let root = doc.root();
1494 let html = doc.children(root).next().unwrap();
1495 let body = doc.children(html).nth(1).unwrap();
1496
1497 let children: Vec<String> = child_tags(&doc, body);
1498 assert_eq!(children, vec!["svg", "p"]);
1499
1500 // SVG children should be in SVG namespace.
1501 let svg = doc.children(body).next().unwrap();
1502 let rect = doc.children(svg).next().unwrap();
1503 if let NodeData::Element { ref namespace, .. } = *doc.node_data(rect) {
1504 assert_eq!(namespace.as_deref(), Some(SVG_NAMESPACE));
1505 }
1506
1507 // Paragraph after SVG should be in HTML namespace (no namespace).
1508 let p = doc.children(body).nth(1).unwrap();
1509 if let NodeData::Element { ref namespace, .. } = *doc.node_data(p) {
1510 assert_eq!(namespace.as_deref(), None);
1511 }
1512 }
1513
1514 // --- Form element parsing tests (Phase 16) ---
1515
1516 #[test]
1517 fn parse_form_with_inputs() {
1518 let doc = parse_html(
1519 r#"<form action="/submit" method="post"><input type="text" name="user"><input type="password" name="pass"><button type="submit">Login</button></form>"#,
1520 );
1521 let root = doc.root();
1522 let html = doc.children(root).next().unwrap();
1523 let body = doc.children(html).nth(1).unwrap();
1524 let form = doc.children(body).next().unwrap();
1525
1526 assert_eq!(doc.tag_name(form), Some("form"));
1527 assert_eq!(doc.get_attribute(form, "action"), Some("/submit"));
1528 assert_eq!(doc.get_attribute(form, "method"), Some("post"));
1529
1530 let tags = child_tags(&doc, form);
1531 assert_eq!(tags, vec!["input", "input", "button"]);
1532
1533 // Check input attributes.
1534 let children: Vec<NodeId> = doc.children(form).collect();
1535 assert_eq!(doc.get_attribute(children[0], "type"), Some("text"));
1536 assert_eq!(doc.get_attribute(children[0], "name"), Some("user"));
1537 assert_eq!(doc.get_attribute(children[1], "type"), Some("password"));
1538 assert_eq!(doc.get_attribute(children[1], "name"), Some("pass"));
1539
1540 // Button contains text.
1541 assert_eq!(doc.get_attribute(children[2], "type"), Some("submit"));
1542 assert_eq!(text_of_children(&doc, children[2]), "Login");
1543 }
1544
1545 #[test]
1546 fn parse_textarea() {
1547 let doc = parse_html(
1548 r#"<form><textarea name="bio" rows="4" cols="50">Default text here</textarea></form>"#,
1549 );
1550 let root = doc.root();
1551 let html = doc.children(root).next().unwrap();
1552 let body = doc.children(html).nth(1).unwrap();
1553 let form = doc.children(body).next().unwrap();
1554 let textarea = doc.children(form).next().unwrap();
1555
1556 assert_eq!(doc.tag_name(textarea), Some("textarea"));
1557 assert_eq!(doc.get_attribute(textarea, "name"), Some("bio"));
1558 assert_eq!(doc.get_attribute(textarea, "rows"), Some("4"));
1559 assert_eq!(doc.get_attribute(textarea, "cols"), Some("50"));
1560 assert_eq!(text_of_children(&doc, textarea), "Default text here");
1561 }
1562
1563 #[test]
1564 fn parse_select_with_options() {
1565 let doc = parse_html(
1566 r#"<form><select name="color"><option value="r">Red</option><option value="g" selected>Green</option><option value="b">Blue</option></select></form>"#,
1567 );
1568 let root = doc.root();
1569 let html = doc.children(root).next().unwrap();
1570 let body = doc.children(html).nth(1).unwrap();
1571 let form = doc.children(body).next().unwrap();
1572 let select = doc.children(form).next().unwrap();
1573
1574 assert_eq!(doc.tag_name(select), Some("select"));
1575 assert_eq!(doc.get_attribute(select, "name"), Some("color"));
1576
1577 let options: Vec<NodeId> = doc.children(select).collect();
1578 assert_eq!(options.len(), 3);
1579 assert_eq!(doc.get_attribute(options[0], "value"), Some("r"));
1580 assert_eq!(text_of_children(&doc, options[0]), "Red");
1581 assert_eq!(doc.get_attribute(options[1], "value"), Some("g"));
1582 assert_eq!(doc.get_attribute(options[1], "selected"), Some(""));
1583 assert_eq!(doc.get_attribute(options[2], "value"), Some("b"));
1584 }
1585
1586 #[test]
1587 fn parse_select_with_optgroups() {
1588 let doc = parse_html(
1589 r#"<select><optgroup label="Primary"><option>Red</option><option>Blue</option></optgroup><optgroup label="Secondary"><option>Orange</option></optgroup></select>"#,
1590 );
1591 let root = doc.root();
1592 let html = doc.children(root).next().unwrap();
1593 let body = doc.children(html).nth(1).unwrap();
1594 let select = doc.children(body).next().unwrap();
1595
1596 assert_eq!(doc.tag_name(select), Some("select"));
1597 let groups = child_tags(&doc, select);
1598 assert_eq!(groups, vec!["optgroup", "optgroup"]);
1599
1600 let group1 = doc.children(select).next().unwrap();
1601 assert_eq!(doc.get_attribute(group1, "label"), Some("Primary"));
1602 let options: Vec<String> = child_tags(&doc, group1);
1603 assert_eq!(options, vec!["option", "option"]);
1604 }
1605
1606 #[test]
1607 fn parse_fieldset_and_legend() {
1608 let doc = parse_html(
1609 r#"<form><fieldset><legend>Personal Info</legend><input type="text" name="name"><input type="email" name="email"></fieldset></form>"#,
1610 );
1611 let root = doc.root();
1612 let html = doc.children(root).next().unwrap();
1613 let body = doc.children(html).nth(1).unwrap();
1614 let form = doc.children(body).next().unwrap();
1615 let fieldset = doc.children(form).next().unwrap();
1616
1617 assert_eq!(doc.tag_name(fieldset), Some("fieldset"));
1618
1619 let fieldset_tags = child_tags(&doc, fieldset);
1620 assert_eq!(fieldset_tags, vec!["legend", "input", "input"]);
1621
1622 let legend = doc.children(fieldset).next().unwrap();
1623 assert_eq!(text_of_children(&doc, legend), "Personal Info");
1624 }
1625
1626 #[test]
1627 fn parse_label_with_for_attribute() {
1628 let doc = parse_html(
1629 r#"<form><label for="name">Name:</label><input type="text" id="name" name="name"></form>"#,
1630 );
1631 let root = doc.root();
1632 let html = doc.children(root).next().unwrap();
1633 let body = doc.children(html).nth(1).unwrap();
1634 let form = doc.children(body).next().unwrap();
1635
1636 let tags = child_tags(&doc, form);
1637 assert_eq!(tags, vec!["label", "input"]);
1638
1639 let label = doc.children(form).next().unwrap();
1640 assert_eq!(doc.get_attribute(label, "for"), Some("name"));
1641 assert_eq!(text_of_children(&doc, label), "Name:");
1642
1643 // Verify label_control resolves via `for` attribute.
1644 let input = doc.children(form).nth(1).unwrap();
1645 assert_eq!(doc.label_control(label), Some(input));
1646 }
1647
1648 #[test]
1649 fn parse_label_implicit_association() {
1650 let doc = parse_html(r#"<label>Name: <input type="text" name="name"></label>"#);
1651 let root = doc.root();
1652 let html = doc.children(root).next().unwrap();
1653 let body = doc.children(html).nth(1).unwrap();
1654 let label = doc.children(body).next().unwrap();
1655
1656 assert_eq!(doc.tag_name(label), Some("label"));
1657 let input = doc.label_control(label).unwrap();
1658 assert_eq!(doc.tag_name(input), Some("input"));
1659 assert_eq!(doc.get_attribute(input, "name"), Some("name"));
1660 }
1661
1662 #[test]
1663 fn form_closes_p_in_button_scope() {
1664 let doc = parse_html("<p>text<form><input></form>");
1665 let root = doc.root();
1666 let html = doc.children(root).next().unwrap();
1667 let body = doc.children(html).nth(1).unwrap();
1668
1669 // <p> should be closed by <form>, so they're siblings.
1670 let tags = child_tags(&doc, body);
1671 assert_eq!(tags, vec!["p", "form"]);
1672 }
1673
1674 #[test]
1675 fn nested_form_is_ignored() {
1676 let doc = parse_html("<form id=\"outer\"><form id=\"inner\"><input></form></form>");
1677 let root = doc.root();
1678 let html = doc.children(root).next().unwrap();
1679 let body = doc.children(html).nth(1).unwrap();
1680
1681 // Only one <form> should exist (nested form is ignored).
1682 let tags = child_tags(&doc, body);
1683 assert_eq!(tags, vec!["form"]);
1684
1685 let form = doc.children(body).next().unwrap();
1686 assert_eq!(doc.get_attribute(form, "id"), Some("outer"));
1687
1688 // Input should be a child of the outer form.
1689 let form_tags = child_tags(&doc, form);
1690 assert_eq!(form_tags, vec!["input"]);
1691 }
1692
1693 #[test]
1694 fn button_scope_handling() {
1695 let doc = parse_html("<button>First</button><button>Second</button>");
1696 let root = doc.root();
1697 let html = doc.children(root).next().unwrap();
1698 let body = doc.children(html).nth(1).unwrap();
1699
1700 let tags = child_tags(&doc, body);
1701 assert_eq!(tags, vec!["button", "button"]);
1702 }
1703
1704 #[test]
1705 fn form_elements_collection() {
1706 let doc = parse_html(
1707 r#"<form><input name="a"><div><input name="b"></div><select name="c"><option>X</option></select><textarea name="d"></textarea></form>"#,
1708 );
1709 let root = doc.root();
1710 let html = doc.children(root).next().unwrap();
1711 let body = doc.children(html).nth(1).unwrap();
1712 let form = doc.children(body).next().unwrap();
1713
1714 let elements = doc.form_elements(form);
1715 assert_eq!(elements.len(), 4);
1716
1717 // Verify they're the right elements.
1718 assert_eq!(doc.get_attribute(elements[0], "name"), Some("a"));
1719 assert_eq!(doc.get_attribute(elements[1], "name"), Some("b"));
1720 assert_eq!(doc.get_attribute(elements[2], "name"), Some("c"));
1721 assert_eq!(doc.get_attribute(elements[3], "name"), Some("d"));
1722 }
1723
1724 #[test]
1725 fn form_owner_from_parsed_tree() {
1726 let doc = parse_html(r#"<form id="f"><div><input id="i"></div></form>"#);
1727 let root = doc.root();
1728 let html = doc.children(root).next().unwrap();
1729 let body = doc.children(html).nth(1).unwrap();
1730 let form = doc.children(body).next().unwrap();
1731
1732 let input = doc.get_element_by_id("i").unwrap();
1733 assert_eq!(doc.form_owner(input), Some(form));
1734 }
1735
1736 #[test]
1737 fn parse_input_types() {
1738 let doc = parse_html(
1739 r#"<form>
1740 <input type="text">
1741 <input type="password">
1742 <input type="checkbox" checked>
1743 <input type="radio" name="choice" value="a">
1744 <input type="submit" value="Go">
1745 <input type="reset">
1746 <input type="hidden" name="token" value="abc">
1747 <input type="number" min="0" max="100">
1748 <input type="email">
1749 <input type="url">
1750 <input type="search">
1751 <input type="tel">
1752 </form>"#,
1753 );
1754 let root = doc.root();
1755 let html = doc.children(root).next().unwrap();
1756 let body = doc.children(html).nth(1).unwrap();
1757 let form = doc.children(body).next().unwrap();
1758
1759 let inputs: Vec<NodeId> = doc
1760 .children(form)
1761 .filter(|&id| doc.tag_name(id) == Some("input"))
1762 .collect();
1763 assert_eq!(inputs.len(), 12);
1764
1765 assert_eq!(doc.get_attribute(inputs[0], "type"), Some("text"));
1766 assert_eq!(doc.get_attribute(inputs[1], "type"), Some("password"));
1767 assert_eq!(doc.get_attribute(inputs[2], "type"), Some("checkbox"));
1768 assert_eq!(doc.get_attribute(inputs[2], "checked"), Some(""));
1769 assert_eq!(doc.get_attribute(inputs[3], "type"), Some("radio"));
1770 assert_eq!(doc.get_attribute(inputs[3], "name"), Some("choice"));
1771 assert_eq!(doc.get_attribute(inputs[4], "type"), Some("submit"));
1772 assert_eq!(doc.get_attribute(inputs[4], "value"), Some("Go"));
1773 assert_eq!(doc.get_attribute(inputs[5], "type"), Some("reset"));
1774 assert_eq!(doc.get_attribute(inputs[6], "type"), Some("hidden"));
1775 assert_eq!(doc.get_attribute(inputs[6], "value"), Some("abc"));
1776 assert_eq!(doc.get_attribute(inputs[7], "type"), Some("number"));
1777 assert_eq!(doc.get_attribute(inputs[7], "min"), Some("0"));
1778 assert_eq!(doc.get_attribute(inputs[7], "max"), Some("100"));
1779 assert_eq!(doc.get_attribute(inputs[8], "type"), Some("email"));
1780 assert_eq!(doc.get_attribute(inputs[9], "type"), Some("url"));
1781 assert_eq!(doc.get_attribute(inputs[10], "type"), Some("search"));
1782 assert_eq!(doc.get_attribute(inputs[11], "type"), Some("tel"));
1783 }
1784
1785 #[test]
1786 fn select_closes_on_input() {
1787 // An <input> inside a <select> should close the select.
1788 let doc = parse_html("<select><option>A</option><input type=\"text\"></select>");
1789 let root = doc.root();
1790 let html = doc.children(root).next().unwrap();
1791 let body = doc.children(html).nth(1).unwrap();
1792
1793 let tags = child_tags(&doc, body);
1794 // Select should be closed before input, making them siblings.
1795 assert_eq!(tags, vec!["select", "input"]);
1796 }
1797}