//! HTML tree builder: construct a DOM tree from tokenizer output. //! //! Implements a simplified subset of the WHATWG HTML5 tree construction //! algorithm for Phase 3 of the browser engine. use we_dom::{Document, NodeId}; use crate::{Token, Tokenizer}; /// Insertion modes for the tree builder state machine. #[derive(Debug, Clone, Copy, PartialEq)] enum InsertionMode { Initial, BeforeHtml, BeforeHead, InHead, Text, AfterHead, InBody, InSelect, AfterBody, AfterAfterBody, } /// Returns true if the given tag name is a void element (self-closing, no end tag). fn is_void_element(tag: &str) -> bool { matches!( tag, "area" | "base" | "br" | "col" | "embed" | "hr" | "img" | "input" | "link" | "meta" | "param" | "source" | "track" | "wbr" ) } /// SVG namespace URI. const SVG_NAMESPACE: &str = "http://www.w3.org/2000/svg"; /// HTML tree builder that processes tokens and constructs a DOM tree. pub struct TreeBuilder { document: Document, /// Stack of open elements (the current nesting context). open_elements: Vec, head_element: Option, body_element: Option, /// The form element pointer (per HTML spec ยง13.2.4.1). form_element: Option, insertion_mode: InsertionMode, /// Original insertion mode, saved when switching to Text mode. original_insertion_mode: Option, /// Pending text for the Text insertion mode (e.g., inside ``). pending_text: String, /// Depth counter for SVG foreign content. >0 means we are inside an `<svg>` element. svg_depth: usize, } impl TreeBuilder { /// Create a new tree builder with an empty document. pub fn new() -> Self { TreeBuilder { document: Document::new(), open_elements: Vec::new(), head_element: None, body_element: None, form_element: None, insertion_mode: InsertionMode::Initial, original_insertion_mode: None, pending_text: String::new(), svg_depth: 0, } } /// Process a single token, updating the DOM tree. pub fn process_token(&mut self, token: Token) { // Handle SVG foreign content. 
if self.svg_depth > 0 {
            // While an <svg> is open, foreign-content rules win over the
            // insertion mode.
            return self.handle_svg_content(token);
        }

        let mode = self.insertion_mode;
        match mode {
            InsertionMode::Initial => self.handle_initial(token),
            InsertionMode::BeforeHtml => self.handle_before_html(token),
            InsertionMode::BeforeHead => self.handle_before_head(token),
            InsertionMode::InHead => self.handle_in_head(token),
            InsertionMode::Text => self.handle_text(token),
            InsertionMode::AfterHead => self.handle_after_head(token),
            InsertionMode::InBody => self.handle_in_body(token),
            InsertionMode::InSelect => self.handle_in_select(token),
            InsertionMode::AfterBody => self.handle_after_body(token),
            InsertionMode::AfterAfterBody => self.handle_after_after_body(token),
        }
    }

    /// Consume the builder and hand back the document it has built.
    pub fn finish(self) -> Document {
        self.document
    }

    // --- Insertion mode handlers ---

    /// Initial mode: drop whitespace, attach comments to the document root,
    /// and move on to BeforeHtml for a DOCTYPE or anything else (non-DOCTYPE
    /// tokens are reprocessed there).
    fn handle_initial(&mut self, token: Token) {
        match token {
            Token::Character(ref text) if text.chars().all(|ch| ch.is_ascii_whitespace()) => {
                // Leading whitespace is dropped.
            }
            Token::Comment(body) => {
                let node = self.document.create_comment(&body);
                let doc_root = self.document.root();
                self.document.append_child(doc_root, node);
            }
            Token::Doctype { .. } => {
                // Phase 3 only acknowledges the DOCTYPE; nothing is stored.
                self.insertion_mode = InsertionMode::BeforeHtml;
            }
            other => {
                // Everything else is reprocessed under BeforeHtml rules.
                self.insertion_mode = InsertionMode::BeforeHtml;
                self.handle_before_html(other);
            }
        }
    }

    /// BeforeHtml mode: open the `<html>` element, either from an explicit
    /// start tag or implicitly, in which case the token is reprocessed in
    /// BeforeHead.
    fn handle_before_html(&mut self, token: Token) {
        match token {
            Token::Character(ref text) if text.chars().all(|ch| ch.is_ascii_whitespace()) => {
                // Whitespace is dropped.
            }
            Token::Doctype { .. } => {
                // A DOCTYPE here is ignored.
            }
            Token::Comment(body) => {
                let node = self.document.create_comment(&body);
                let doc_root = self.document.root();
                self.document.append_child(doc_root, node);
            }
            Token::StartTag { ref name, .. } if name.as_str() == "html" => {
                let html_node = self.create_element_from_token(&token);
                let doc_root = self.document.root();
                self.document.append_child(doc_root, html_node);
                self.open_elements.push(html_node);
                self.insertion_mode = InsertionMode::BeforeHead;
            }
            Token::EndTag { ref name }
                if !matches!(name.as_str(), "head" | "body" | "html" | "br") =>
            {
                // Parse error: stray end tag, dropped.
            }
            other => {
                // No explicit <html>: synthesise one, then reprocess.
                let html_node = self.document.create_element("html");
                let doc_root = self.document.root();
                self.document.append_child(doc_root, html_node);
                self.open_elements.push(html_node);
                self.insertion_mode = InsertionMode::BeforeHead;
                self.handle_before_head(other);
            }
        }
    }

    /// BeforeHead mode: open the `<head>` element, explicitly or implicitly,
    /// and record it in `head_element`.
    fn handle_before_head(&mut self, token: Token) {
        match token {
            Token::Character(ref text) if text.chars().all(|ch| ch.is_ascii_whitespace()) => {
                // Whitespace is dropped.
            }
            Token::Doctype { .. } => {
                // A DOCTYPE here is ignored.
            }
            Token::Comment(body) => self.insert_comment(&body),
            Token::StartTag { ref name, .. } if name.as_str() == "html" => {
                // A second <html> is handled by the InBody rules.
                self.handle_in_body(token);
            }
            Token::StartTag { ref name, .. } if name.as_str() == "head" => {
                let head_node = self.create_element_from_token(&token);
                self.insert_node(head_node);
                self.open_elements.push(head_node);
                self.head_element = Some(head_node);
                self.insertion_mode = InsertionMode::InHead;
            }
            Token::EndTag { ref name }
                if !matches!(name.as_str(), "head" | "body" | "html" | "br") =>
            {
                // Parse error: stray end tag, dropped.
            }
            other => {
                // No explicit <head>: synthesise one, then reprocess.
                let head_node = self.document.create_element("head");
                self.insert_node(head_node);
                self.open_elements.push(head_node);
                self.head_element = Some(head_node);
                self.insertion_mode = InsertionMode::InHead;
                self.handle_in_head(other);
            }
        }
    }

    /// InHead mode: handle head-level elements; anything unrecognised closes
    /// `<head>` and is reprocessed in AfterHead.
    fn handle_in_head(&mut self, token: Token) {
        match token {
            Token::Character(ref text) if text.chars().all(|ch| ch.is_ascii_whitespace()) => {
                self.insert_text(text);
            }
            Token::Doctype { .. } => {
                // A DOCTYPE here is ignored.
            }
            Token::Comment(body) => self.insert_comment(&body),
            Token::StartTag { ref name, .. }
                if matches!(name.as_str(), "title" | "style" | "script" | "noscript") =>
            {
                // These elements collect their content as text: push the
                // element, remember where to return to, and enter Text mode.
                let node = self.create_element_from_token(&token);
                self.insert_node(node);
                self.open_elements.push(node);
                self.original_insertion_mode = Some(self.insertion_mode);
                self.insertion_mode = InsertionMode::Text;
            }
            Token::StartTag { ref name, .. } if matches!(name.as_str(), "meta" | "link") => {
                // Void elements are inserted but never pushed on the stack.
                let node = self.create_element_from_token(&token);
                self.insert_node(node);
            }
            Token::StartTag { ref name, .. } if name.as_str() == "head" => {
                // A duplicate <head> is dropped.
            }
            Token::EndTag { ref name } if name.as_str() == "head" => {
                self.pop_until("head");
                self.insertion_mode = InsertionMode::AfterHead;
            }
            Token::EndTag { ref name } if !matches!(name.as_str(), "body" | "html" | "br") => {
                // Parse error: stray end tag, dropped.
            }
            other => {
                // Close <head>, then reprocess the token in AfterHead.
                self.pop_until("head");
                self.insertion_mode = InsertionMode::AfterHead;
                self.handle_after_head(other);
            }
        }
    }

    /// Text mode: buffer character data until an end tag (or EOF) arrives,
    /// then flush it into the open element and restore the saved mode.
    fn handle_text(&mut self, token: Token) {
        match token {
            Token::Character(chunk) => self.pending_text.push_str(&chunk),
            Token::EndTag { .. } => {
                // Flush the buffered text into the element being closed.
                if !self.pending_text.is_empty() {
                    let buffered = std::mem::take(&mut self.pending_text);
                    self.insert_text(&buffered);
                }
                // Close the element (e.g., <title>) and return to the mode
                // that was active before Text mode was entered.
                self.open_elements.pop();
                self.insertion_mode = self
                    .original_insertion_mode
                    .take()
                    .unwrap_or(InsertionMode::InBody);
            }
            Token::Eof => {
                // Flush pending text.
if !self.pending_text.is_empty() {
                    // Move the buffer out rather than cloning it; this also
                    // leaves `pending_text` empty.
                    let text = std::mem::take(&mut self.pending_text);
                    self.insert_text(&text);
                }
                self.open_elements.pop();
                self.insertion_mode = self
                    .original_insertion_mode
                    .take()
                    .unwrap_or(InsertionMode::InBody);
                // Hand the EOF to the restored insertion mode as well.
                self.process_token(Token::Eof);
            }
            _ => {}
        }
    }

    /// AfterHead mode: open the `<body>` element, explicitly or implicitly,
    /// and record it in `body_element`.
    ///
    /// Whitespace and comments are inserted at the current insertion point;
    /// any token without a dedicated arm creates an implied `<body>` and is
    /// reprocessed with the InBody rules.
    fn handle_after_head(&mut self, token: Token) {
        match token {
            Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
                self.insert_text(s);
            }
            Token::Comment(data) => {
                self.insert_comment(&data);
            }
            Token::Doctype { .. } => { /* ignore */ }
            Token::StartTag { ref name, .. } if name == "html" => {
                self.handle_in_body(token);
            }
            Token::StartTag { ref name, .. } if name == "body" => {
                let body = self.create_element_from_token(&token);
                self.insert_node(body);
                self.open_elements.push(body);
                self.body_element = Some(body);
                self.insertion_mode = InsertionMode::InBody;
            }
            Token::StartTag { ref name, .. } if name == "head" => {
                // Ignore.
            }
            Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => {
                // Ignore.
            }
            _ => {
                // Implied <body>.
                let body = self.document.create_element("body");
                self.insert_node(body);
                self.open_elements.push(body);
                self.body_element = Some(body);
                self.insertion_mode = InsertionMode::InBody;
                self.handle_in_body(token);
            }
        }
    }

    /// InBody mode: the main content handler.
    ///
    /// Start tags create elements (closing an open `<p>` first where the arm
    /// says so), end tags pop the stack of open elements, and `<svg>`
    /// switches the builder into SVG foreign-content handling by making
    /// `svg_depth` non-zero.
    fn handle_in_body(&mut self, token: Token) {
        match token {
            Token::Character(s) => {
                self.insert_text(&s);
            }
            Token::Comment(data) => {
                self.insert_comment(&data);
            }
            Token::Doctype { .. } => { /* ignore */ }
            Token::StartTag { ref name, .. } if name == "html" => {
                // Merge attributes onto existing <html> element; attributes
                // already present on it are left untouched.
                if let Token::StartTag { attributes, .. } = &token {
                    if let Some(&html_id) = self.open_elements.first() {
                        for (attr_name, attr_value) in attributes {
                            if self.document.get_attribute(html_id, attr_name).is_none() {
                                self.document.set_attribute(html_id, attr_name, attr_value);
                            }
                        }
                    }
                }
            }
            Token::StartTag { ref name, .. }
                if matches!(
                    name.as_str(),
                    "body" | "head" | "title" | "style" | "script"
                ) =>
            {
                match name.as_str() {
                    "body" => {
                        // Ignore duplicate <body>.
                    }
                    "head" => {
                        // Ignore <head> in body.
                    }
                    _ => {
                        // title/style/script: process using InHead rules
                        self.handle_in_head(token);
                    }
                }
            }
            Token::StartTag { ref name, .. }
                if matches!(
                    name.as_str(),
                    "p" | "div"
                        | "h1"
                        | "h2"
                        | "h3"
                        | "h4"
                        | "h5"
                        | "h6"
                        | "pre"
                        | "blockquote"
                        | "ul"
                        | "ol"
                        | "li"
                ) =>
            {
                // If there's a <p> in button scope, close it first.
                if self.has_element_in_button_scope("p") {
                    self.close_p_element();
                }
                let elem = self.create_element_from_token(&token);
                self.insert_node(elem);
                self.open_elements.push(elem);
            }

            // --- Form elements (Phase 16) ---
            Token::StartTag { ref name, .. } if name == "form" => {
                // Per spec: if the form element pointer is not null, ignore.
                if self.form_element.is_some() {
                    // Parse error, ignore the token.
                } else {
                    if self.has_element_in_button_scope("p") {
                        self.close_p_element();
                    }
                    let elem = self.create_element_from_token(&token);
                    self.insert_node(elem);
                    self.open_elements.push(elem);
                    self.form_element = Some(elem);
                }
            }
            Token::StartTag { ref name, .. } if name == "fieldset" => {
                if self.has_element_in_button_scope("p") {
                    self.close_p_element();
                }
                let elem = self.create_element_from_token(&token);
                self.insert_node(elem);
                self.open_elements.push(elem);
            }
            Token::StartTag { ref name, .. } if name == "button" => {
                // If there's a button in scope, close it first.
                if self.has_element_in_scope("button") {
                    self.generate_implied_end_tags(None);
                    self.pop_until("button");
                }
                let elem = self.create_element_from_token(&token);
                self.insert_node(elem);
                self.open_elements.push(elem);
            }
            Token::StartTag { ref name, .. } if name == "textarea" => {
                let elem = self.create_element_from_token(&token);
                self.insert_node(elem);
                self.open_elements.push(elem);
                // Switch to Text mode to collect raw text content.
                self.original_insertion_mode = Some(self.insertion_mode);
                self.insertion_mode = InsertionMode::Text;
            }
            Token::StartTag { ref name, .. } if name == "select" => {
                let elem = self.create_element_from_token(&token);
                self.insert_node(elem);
                self.open_elements.push(elem);
                self.insertion_mode = InsertionMode::InSelect;
            }
            Token::StartTag { ref name, .. } if name == "optgroup" || name == "option" => {
                // Close any currently open <option>.
                if let Some(&top) = self.open_elements.last() {
                    if self.document.tag_name(top) == Some("option") {
                        self.open_elements.pop();
                    }
                }
                // Also close open <optgroup> when a new <optgroup> starts.
                if name == "optgroup" {
                    if let Some(&top) = self.open_elements.last() {
                        if self.document.tag_name(top) == Some("optgroup") {
                            self.open_elements.pop();
                        }
                    }
                }
                let elem = self.create_element_from_token(&token);
                self.insert_node(elem);
                self.open_elements.push(elem);
            }

            // --- SVG / iframe / void / generic start tags ---
            Token::StartTag {
                ref name,
                self_closing,
                ..
            } if name == "svg" => {
                let elem = self.create_svg_element_from_token(&token);
                self.insert_node(elem);
                // A self-closing <svg/> has no content and no end tag, so it
                // must not open foreign-content mode; otherwise everything
                // after it would be parsed as SVG.
                if !self_closing {
                    self.open_elements.push(elem);
                    self.svg_depth = 1;
                }
            }
            Token::StartTag { ref name, .. } if name == "iframe" => {
                // Per HTML spec, <iframe> uses RAWTEXT parsing: content between
                // <iframe> and </iframe> is raw text (fallback content), not HTML.
                let elem = self.create_element_from_token(&token);
                self.insert_node(elem);
                self.open_elements.push(elem);
                self.original_insertion_mode = Some(self.insertion_mode);
                self.insertion_mode = InsertionMode::Text;
            }
            Token::StartTag { ref name, .. } if is_void_element(name) => {
                let elem = self.create_element_from_token(&token);
                self.insert_node(elem);
                // Don't push void elements onto the stack.
            }
            Token::StartTag { .. } => {
                // Generic start tag: create element and push onto stack.
                let elem = self.create_element_from_token(&token);
                self.insert_node(elem);
                self.open_elements.push(elem);
            }
            Token::EndTag { ref name } if name == "body" => {
                if self.has_element_in_scope("body") {
                    self.insertion_mode = InsertionMode::AfterBody;
                }
            }
            Token::EndTag { ref name } if name == "html" => {
                if self.has_element_in_scope("body") {
                    self.insertion_mode = InsertionMode::AfterBody;
                    self.handle_after_body(token);
                }
            }
            Token::EndTag { ref name } if name == "p" => {
                if !self.has_element_in_button_scope("p") {
                    // No matching <p>: insert an empty one, then close it.
                    let p = self.document.create_element("p");
                    self.insert_node(p);
                    self.open_elements.push(p);
                }
                self.close_p_element();
            }

            // --- Form end tags (Phase 16) ---
            Token::EndTag { ref name } if name == "form" => {
                // Per spec: reset the form element pointer, then pop.
                self.form_element = None;
                if self.has_element_in_scope("form") {
                    self.generate_implied_end_tags(Some("form"));
                    self.pop_until("form");
                }
            }
            Token::EndTag { ref name } if name == "button" => {
                if self.has_element_in_scope("button") {
                    self.generate_implied_end_tags(None);
                    self.pop_until("button");
                }
            }
            Token::EndTag { ref name } if name == "fieldset" => {
                if self.has_element_in_scope("fieldset") {
                    self.generate_implied_end_tags(None);
                    self.pop_until("fieldset");
                }
            }
            Token::EndTag { ref name } if name == "optgroup" || name == "option" => {
                if self.has_element_in_scope(name) {
                    self.generate_implied_end_tags(Some(name));
                    self.pop_until(name);
                }
            }

            // --- End of form end tags ---
            Token::EndTag { ref name }
                if matches!(
                    name.as_str(),
                    "div" | "pre" | "blockquote" | "ul" | "ol" | "li"
                ) =>
            {
                if self.has_element_in_scope(name) {
                    self.generate_implied_end_tags(Some(name));
                    self.pop_until(name);
                }
            }
            Token::EndTag { ref name }
                if matches!(name.as_str(), "h1" | "h2" | "h3" | "h4" | "h5" | "h6") =>
            {
                if self.has_heading_in_scope() {
                    self.generate_implied_end_tags(None);
                    // Pop until we find a heading element.
                    while let Some(id) = self.open_elements.pop() {
                        if let Some(tag) = self.document.tag_name(id) {
                            if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
                                break;
                            }
                        }
                    }
                }
            }
            Token::EndTag { ref name } => {
                // Generic end tag: walk back through open elements.
                self.handle_any_other_end_tag(name);
            }
            Token::Eof => {
                // Stop parsing.
            }
        }
    }

    /// Handle tokens inside a `<select>` element (InSelect insertion mode).
    fn handle_in_select(&mut self, token: Token) {
        match token {
            Token::Character(s) => {
                self.insert_text(&s);
            }
            Token::Comment(data) => {
                self.insert_comment(&data);
            }
            Token::StartTag { ref name, .. } if name == "option" => {
                // Close any currently open <option>.
                if let Some(&top) = self.open_elements.last() {
                    if self.document.tag_name(top) == Some("option") {
                        self.open_elements.pop();
                    }
                }
                let elem = self.create_element_from_token(&token);
                self.insert_node(elem);
                self.open_elements.push(elem);
            }
            Token::StartTag { ref name, .. } if name == "optgroup" => {
                // Close any open <option>, then close any open <optgroup>.
                if let Some(&top) = self.open_elements.last() {
                    if self.document.tag_name(top) == Some("option") {
                        self.open_elements.pop();
                    }
                }
                if let Some(&top) = self.open_elements.last() {
                    if self.document.tag_name(top) == Some("optgroup") {
                        self.open_elements.pop();
                    }
                }
                let elem = self.create_element_from_token(&token);
                self.insert_node(elem);
                self.open_elements.push(elem);
            }
            Token::EndTag { ref name } if name == "option" => {
                if let Some(&top) = self.open_elements.last() {
                    if self.document.tag_name(top) == Some("option") {
                        self.open_elements.pop();
                    }
                }
            }
            Token::EndTag { ref name } if name == "optgroup" => {
                // If the top is <option>, pop it first.
if let Some(&top) = self.open_elements.last() {
                    if self.document.tag_name(top) == Some("option") {
                        self.open_elements.pop();
                    }
                }
                if let Some(&top) = self.open_elements.last() {
                    if self.document.tag_name(top) == Some("optgroup") {
                        self.open_elements.pop();
                    }
                }
            }
            Token::EndTag { ref name } if name == "select" => {
                self.pop_until("select");
                self.insertion_mode = InsertionMode::InBody;
            }
            Token::StartTag { ref name, .. } if name == "select" => {
                // Nested <select>: close the current one (parse error).
                self.pop_until("select");
                self.insertion_mode = InsertionMode::InBody;
            }
            Token::StartTag { ref name, .. } if name == "input" || name == "textarea" => {
                // Per spec: these close the <select> and reprocess in InBody.
                self.pop_until("select");
                self.insertion_mode = InsertionMode::InBody;
                self.handle_in_body(token);
            }
            Token::Eof => {
                // Stop parsing.
            }
            _ => {
                // Ignore anything else in select.
            }
        }
    }

    /// AfterBody mode: entered after `</body>`.
    ///
    /// Comments are appended to the first open element, `</html>` advances
    /// to AfterAfterBody, and any other content switches back to InBody and
    /// is reprocessed there.
    fn handle_after_body(&mut self, token: Token) {
        match token {
            Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
                // Process whitespace as in InBody.
                self.handle_in_body(token);
            }
            Token::Comment(data) => {
                // Insert as last child of the first element (html).
                let comment = self.document.create_comment(&data);
                if let Some(&html) = self.open_elements.first() {
                    self.document.append_child(html, comment);
                }
            }
            Token::Doctype { .. } => { /* ignore */ }
            Token::EndTag { ref name } if name == "html" => {
                self.insertion_mode = InsertionMode::AfterAfterBody;
            }
            Token::Eof => {
                // Stop parsing.
            }
            _ => {
                // Anything else: switch back to InBody and reprocess.
                self.insertion_mode = InsertionMode::InBody;
                self.handle_in_body(token);
            }
        }
    }

    /// AfterAfterBody mode: entered after `</html>`.
    ///
    /// Comments are appended to the document root; any other content
    /// switches back to InBody and is reprocessed there.
    fn handle_after_after_body(&mut self, token: Token) {
        match token {
            Token::Comment(data) => {
                let comment = self.document.create_comment(&data);
                let root = self.document.root();
                self.document.append_child(root, comment);
            }
            Token::Doctype { .. } => { /* ignore */ }
            Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
                self.handle_in_body(token);
            }
            Token::Eof => {
                // Stop.
            }
            _ => {
                self.insertion_mode = InsertionMode::InBody;
                self.handle_in_body(token);
            }
        }
    }

    // --- SVG foreign content ---

    /// Handle tokens while inside an SVG subtree.
    ///
    /// `process_token` routes every token here while `svg_depth > 0`.
    /// Start tags create SVG-namespaced elements; a self-closing start tag
    /// is inserted but never pushed on the stack. `svg_depth` is kept equal
    /// to the number of `<svg>` elements on the stack of open elements, so
    /// the builder leaves this mode as soon as the last one is gone.
    fn handle_svg_content(&mut self, token: Token) {
        match token {
            Token::Character(s) => {
                self.insert_text(&s);
            }
            Token::Comment(data) => {
                self.insert_comment(&data);
            }
            Token::StartTag {
                ref name,
                self_closing,
                ..
            } if name == "svg" => {
                // Nested <svg>. A self-closing one has no matching end tag,
                // so it must not be pushed or counted; otherwise the depth
                // would never return to zero.
                let elem = self.create_svg_element_from_token(&token);
                self.insert_node(elem);
                if !self_closing {
                    self.open_elements.push(elem);
                    self.svg_depth += 1;
                }
            }
            Token::StartTag { self_closing, .. } => {
                let elem = self.create_svg_element_from_token(&token);
                self.insert_node(elem);
                if !self_closing {
                    self.open_elements.push(elem);
                }
            }
            Token::EndTag { ref name } if name == "svg" => {
                self.pop_until("svg");
                // Saturating: never underflow even if the counter and the
                // stack have drifted apart.
                self.svg_depth = self.svg_depth.saturating_sub(1);
            }
            Token::EndTag { ref name } => {
                // Pop matching element from the stack.
                self.handle_any_other_end_tag(name);
                // The generic algorithm can match an ancestor outside the
                // SVG subtree (e.g. `<div><svg><rect></div>`) and pop the
                // <svg> root with it. Recount so that foreign-content mode
                // ends when no <svg> is left on the stack.
                let document = &self.document;
                self.svg_depth = self
                    .open_elements
                    .iter()
                    .filter(|&&id| document.tag_name(id) == Some("svg"))
                    .count();
            }
            Token::Eof => {}
            _ => {}
        }
    }

    // --- Helper methods ---

    /// Create a DOM element from a StartTag token with SVG namespace, setting attributes.
    ///
    /// For any other token kind an SVG-namespaced element named `unknown`
    /// is created instead.
    fn create_svg_element_from_token(&mut self, token: &Token) -> NodeId {
        if let Token::StartTag {
            name, attributes, ..
        } = token
        {
            let id = self.document.create_element_ns(name, Some(SVG_NAMESPACE));
            for (attr_name, attr_value) in attributes {
                self.document.set_attribute(id, attr_name, attr_value);
            }
            id
        } else {
            self.document
                .create_element_ns("unknown", Some(SVG_NAMESPACE))
        }
    }

    /// Create a DOM element from a StartTag token, setting attributes.
    ///
    /// For any other token kind an element named `unknown` is created.
    fn create_element_from_token(&mut self, token: &Token) -> NodeId {
        if let Token::StartTag {
            name, attributes, ..
        } = token
        {
            let id = self.document.create_element(name);
            for (attr_name, attr_value) in attributes {
                self.document.set_attribute(id, attr_name, attr_value);
            }
            id
        } else {
            // Should only be called with StartTag tokens.
self.document.create_element("unknown")
        }
    }

    /// Insert a node at the current insertion point (last open element).
    ///
    /// Falls back to the document root when the stack of open elements is
    /// empty.
    fn insert_node(&mut self, node: NodeId) {
        let parent = self
            .open_elements
            .last()
            .copied()
            .unwrap_or_else(|| self.document.root());
        self.document.append_child(parent, node);
    }

    /// Insert a text node at the current insertion point.
    /// If the last child is already a text node, append to it.
    fn insert_text(&mut self, data: &str) {
        let parent = self
            .open_elements
            .last()
            .copied()
            .unwrap_or_else(|| self.document.root());

        // Try to merge with existing text node.
        if let Some(last_child) = self.document.last_child(parent) {
            if let we_dom::NodeData::Text { data: ref existing } =
                *self.document.node_data(last_child)
            {
                let mut merged = existing.clone();
                merged.push_str(data);
                self.document.set_text_content(last_child, &merged);
                return;
            }
        }

        let text = self.document.create_text(data);
        self.document.append_child(parent, text);
    }

    /// Insert a comment node at the current insertion point.
    fn insert_comment(&mut self, data: &str) {
        let comment = self.document.create_comment(data);
        self.insert_node(comment);
    }

    /// Pop elements from the stack until we find one with the given tag name.
    /// The matching element is also popped.
    ///
    /// If no element with that tag name is on the stack, the stack is left
    /// untouched. Popping blindly in that case would empty the whole stack,
    /// `<html>` included, and later nodes would be attached to the document
    /// root.
    fn pop_until(&mut self, tag_name: &str) {
        let document = &self.document;
        let found = self
            .open_elements
            .iter()
            .rposition(|&id| document.tag_name(id) == Some(tag_name));
        if let Some(index) = found {
            // Drop the match and everything above it.
            self.open_elements.truncate(index);
        }
    }

    /// Returns true if `tag` is one of the elements that end a scope search.
    /// Shared by all of the `has_*_in_scope` checks.
    fn is_scope_barrier(tag: &str) -> bool {
        matches!(
            tag,
            "applet"
                | "caption"
                | "html"
                | "table"
                | "td"
                | "th"
                | "marquee"
                | "object"
                | "template"
        )
    }

    /// Check if the given tag name is "in scope" (simplified).
    /// In scope means there's an element with that tag on the stack,
    /// and no scope barrier element between it and the top.
    fn has_element_in_scope(&self, target: &str) -> bool {
        for &id in self.open_elements.iter().rev() {
            if let Some(tag) = self.document.tag_name(id) {
                if tag == target {
                    return true;
                }
                // Scope barrier elements.
                if Self::is_scope_barrier(tag) {
                    return false;
                }
            }
        }
        false
    }

    /// Check if the given tag name is "in button scope".
    fn has_element_in_button_scope(&self, target: &str) -> bool {
        for &id in self.open_elements.iter().rev() {
            if let Some(tag) = self.document.tag_name(id) {
                if tag == target {
                    return true;
                }
                // Button scope includes all regular scope barriers plus <button>.
                if tag == "button" || Self::is_scope_barrier(tag) {
                    return false;
                }
            }
        }
        false
    }

    /// Check if any heading element (h1-h6) is in scope.
    fn has_heading_in_scope(&self) -> bool {
        for &id in self.open_elements.iter().rev() {
            if let Some(tag) = self.document.tag_name(id) {
                if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
                    return true;
                }
                if Self::is_scope_barrier(tag) {
                    return false;
                }
            }
        }
        false
    }

    /// Close a `<p>` element: generate implied end tags (excluding p),
    /// then pop until we find the `<p>`.
    fn close_p_element(&mut self) {
        self.generate_implied_end_tags(Some("p"));
        self.pop_until("p");
    }

    /// Generate implied end tags. If `exclude` is provided, don't generate
    /// an end tag for that element.
    ///
    /// Pops from the top of the stack for as long as the top element is one
    /// of the listed tags and is not `exclude`.
    fn generate_implied_end_tags(&mut self, exclude: Option<&str>) {
        loop {
            let should_pop = self
                .open_elements
                .last()
                .and_then(|&id| self.document.tag_name(id))
                .map(|tag| {
                    if let Some(excl) = exclude {
                        if tag == excl {
                            return false;
                        }
                    }
                    matches!(
                        tag,
                        "dd" | "dt"
                            | "li"
                            | "optgroup"
                            | "option"
                            | "p"
                            | "rb"
                            | "rp"
                            | "rt"
                            | "rtc"
                    )
                })
                .unwrap_or(false);
            if should_pop {
                self.open_elements.pop();
            } else {
                break;
            }
        }
    }

    /// Handle a generic end tag by walking back through open elements
    /// using the "any other end tag" algorithm.
    fn handle_any_other_end_tag(&mut self, name: &str) {
        // Walk backwards through the stack.
        let mut i = self.open_elements.len();
        while i > 0 {
            i -= 1;
            let id = self.open_elements[i];
            if self.document.tag_name(id) == Some(name) {
                // Pop everything above and including this element.
self.open_elements.truncate(i);
                return;
            }

            // A "special" element ends the search: the end tag is dropped
            // and the stack is left as it was.
            if self
                .document
                .tag_name(id)
                .map_or(false, is_special_element)
            {
                return;
            }
        }
    }
}

impl Default for TreeBuilder {
    /// Same as [`TreeBuilder::new`].
    fn default() -> Self {
        TreeBuilder::new()
    }
}

/// Returns true if the tag is a "special" element per the HTML spec.
///
/// `handle_any_other_end_tag` stops its stack walk at the first such element.
fn is_special_element(tag: &str) -> bool {
    matches!(
        tag,
        "address" | "applet" | "area" | "article" | "aside"
            | "base" | "basefont" | "bgsound" | "blockquote" | "body" | "br" | "button"
            | "caption" | "center" | "col" | "colgroup"
            | "dd" | "details" | "dir" | "div" | "dl" | "dt"
            | "embed"
            | "fieldset" | "figcaption" | "figure" | "footer" | "form" | "frame" | "frameset"
            | "h1" | "h2" | "h3" | "h4" | "h5" | "h6"
            | "head" | "header" | "hgroup" | "hr" | "html"
            | "iframe" | "img" | "input"
            | "legend" | "li" | "link" | "listing"
            | "main" | "marquee" | "menu" | "meta"
            | "nav" | "noembed" | "noframes" | "noscript"
            | "object" | "ol"
            | "p" | "param" | "plaintext" | "pre"
            | "script" | "section" | "select" | "source" | "style" | "summary"
            | "table" | "tbody" | "td" | "template" | "textarea" | "tfoot" | "th" | "thead"
            | "title" | "tr" | "track"
            | "ul"
            | "wbr"
            | "xmp"
    )
}

/// Parse an HTML string into a DOM document.
///
/// Convenience wrapper: runs a [`Tokenizer`] over `input`, feeds every token
/// (including the final EOF) to a [`TreeBuilder`], and returns the document
/// it produced.
pub fn parse_html(input: &str) -> Document {
    let mut tokenizer = Tokenizer::new(input);
    let mut builder = TreeBuilder::new();
    loop {
        let token = tokenizer.next_token();
        // Note the EOF before the token is moved into the builder.
        let reached_end = matches!(token, Token::Eof);
        builder.process_token(token);
        if reached_end {
            return builder.finish();
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use we_dom::NodeData;

    /// Helper: collect tag names of direct children of a node.
    fn child_tags(doc: &Document, node: NodeId) -> Vec<String> {
        doc.children(node)
            .filter_map(|id| doc.tag_name(id).map(String::from))
            .collect()
    }

    /// Helper: get the text content of all text node children, concatenated.
fn text_of_children(doc: &Document, node: NodeId) -> String { let mut result = String::new(); for child in doc.children(node) { if let Some(text) = doc.text_content(child) { result.push_str(text); } } result } #[test] fn parse_full_document() { let doc = parse_html( "<!DOCTYPE html><html><head><title>Test

Hello

", ); let root = doc.root(); // Root should have one child: let html_children: Vec = doc.children(root).collect(); assert_eq!(html_children.len(), 1); let html = html_children[0]; assert_eq!(doc.tag_name(html), Some("html")); // should have and let tags = child_tags(&doc, html); assert_eq!(tags, vec!["head", "body"]); // should have let head = doc.children(html).next().unwrap(); let head_tags = child_tags(&doc, head); assert_eq!(head_tags, vec!["title"]); // <title> should contain "Test" let title = doc.children(head).next().unwrap(); assert_eq!(text_of_children(&doc, title), "Test"); // <body> should have <p> let body = doc.children(html).nth(1).unwrap(); let body_tags = child_tags(&doc, body); assert_eq!(body_tags, vec!["p"]); // <p> should contain "Hello" let p = doc.children(body).next().unwrap(); assert_eq!(text_of_children(&doc, p), "Hello"); } #[test] fn implicit_html_head_body() { // Minimal document: just <p>Hello let doc = parse_html("<p>Hello"); let root = doc.root(); let html: Vec<NodeId> = doc.children(root).collect(); assert_eq!(html.len(), 1); assert_eq!(doc.tag_name(html[0]), Some("html")); let html_tags = child_tags(&doc, html[0]); assert_eq!(html_tags, vec!["head", "body"]); let body = doc.children(html[0]).nth(1).unwrap(); let body_tags = child_tags(&doc, body); assert_eq!(body_tags, vec!["p"]); let p = doc.children(body).next().unwrap(); assert_eq!(text_of_children(&doc, p), "Hello"); } #[test] fn void_element_br() { let doc = parse_html("<p>Line 1<br>Line 2</p>"); let root = doc.root(); let html = doc.children(root).next().unwrap(); let body = doc.children(html).nth(1).unwrap(); let p = doc.children(body).next().unwrap(); // <p> should have: text("Line 1"), <br>, text("Line 2") let children: Vec<NodeId> = doc.children(p).collect(); assert_eq!(children.len(), 3); assert_eq!(doc.text_content(children[0]), Some("Line 1")); assert_eq!(doc.tag_name(children[1]), Some("br")); assert_eq!(doc.text_content(children[2]), Some("Line 2")); } #[test] fn 
p_inside_p_closes_outer() {
        let doc = parse_html("<p>First<p>Second");
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        // Should have two sibling <p> elements, not nested.
        let body_tags = child_tags(&doc, body);
        assert_eq!(body_tags, vec!["p", "p"]);

        let children: Vec<NodeId> = doc.children(body).collect();
        assert_eq!(text_of_children(&doc, children[0]), "First");
        assert_eq!(text_of_children(&doc, children[1]), "Second");
    }

    /// A `<div>` nested in a `<div>` stays nested: the outer div's first child
    /// is the inner div, which holds the text.
    #[test]
    fn nested_div_elements() {
        let doc = parse_html("<div><div>inner</div></div>");
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        let outer_div = doc.children(body).next().unwrap();
        assert_eq!(doc.tag_name(outer_div), Some("div"));

        let inner_div = doc.children(outer_div).next().unwrap();
        assert_eq!(doc.tag_name(inner_div), Some("div"));
        assert_eq!(text_of_children(&doc, inner_div), "inner");
    }

    /// `<p><span><a>` produces a chain of first children, and the `href`
    /// attribute and link text end up on the `<a>` element.
    #[test]
    fn inline_elements_nest_properly() {
        let doc = parse_html("<p><span><a href=\"#\">link</a></span></p>");
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        let p = doc.children(body).next().unwrap();
        let span = doc.children(p).next().unwrap();
        assert_eq!(doc.tag_name(span), Some("span"));

        let a = doc.children(span).next().unwrap();
        assert_eq!(doc.tag_name(a), Some("a"));
        assert_eq!(doc.get_attribute(a, "href"), Some("#"));
        assert_eq!(text_of_children(&doc, a), "link");
    }

    /// Consecutive `<h1>`, `<h2>` and `<p>` become sibling children of the body.
    #[test]
    fn headings() {
        let doc = parse_html("<h1>Title</h1><h2>Subtitle</h2><p>Body text</p>");
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        let tags = child_tags(&doc, body);
        assert_eq!(tags, vec!["h1", "h2", "p"]);
    }

    /// A comment at the start of the body becomes the body's first child,
    /// with its data preserved including the surrounding spaces.
    #[test]
    fn comment_nodes() {
        let doc = parse_html("<body><!-- a comment --><p>text</p></body>");
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        let children: Vec<NodeId> = doc.children(body).collect();
        assert!(children.len() >= 2);

        // First child should be a comment.
        match doc.node_data(children[0]) {
            NodeData::Comment { data } => assert_eq!(data, " a comment "),
            other => panic!("expected comment, got {:?}", other),
        }
    }

    /// Text inside `<pre>` keeps its embedded newline.
    #[test]
    fn pre_element() {
        let doc = parse_html("<pre>line 1\nline 2</pre>");
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        let pre = doc.children(body).next().unwrap();
        assert_eq!(doc.tag_name(pre), Some("pre"));
        assert_eq!(text_of_children(&doc, pre), "line 1\nline 2");
    }

    /// Attributes on both a parent and a nested element are readable via
    /// `get_attribute`.
    #[test]
    fn attributes_preserved() {
        let doc =
            parse_html("<div id=\"main\" class=\"container\"><a href=\"/page\">link</a></div>");
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        let div = doc.children(body).next().unwrap();
        assert_eq!(doc.get_attribute(div, "id"), Some("main"));
        assert_eq!(doc.get_attribute(div, "class"), Some("container"));

        let a = doc.children(div).next().unwrap();
        assert_eq!(doc.get_attribute(a, "href"), Some("/page"));
    }

    /// Parsing the empty string still yields a document whose root has at
    /// least one child.
    #[test]
    fn empty_document() {
        let doc = parse_html("");
        let root = doc.root();
        // Even an empty doc should get html/head/body from EOF handling.
        // The tree builder creates implicit elements.
        assert!(doc.children(root).next().is_some());
    }

    /// Bare text with no markup ends up as text inside the body.
    #[test]
    fn just_text() {
        let doc = parse_html("Hello, world!");
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        assert_eq!(text_of_children(&doc, body), "Hello, world!");
    }

    /// An `<h1>` start tag closes an open `<p>`.
    #[test]
    fn heading_closes_open_p() {
        let doc = parse_html("<p>text<h1>heading</h1>");
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        // <p> should be closed by <h1>, so they're siblings.
        let tags = child_tags(&doc, body);
        assert_eq!(tags, vec!["p", "h1"]);
    }

    /// `<br/>` sits between the two text runs as the middle of three children
    /// of the paragraph.
    #[test]
    fn self_closing_void_elements() {
        let doc = parse_html("<p>before<br/>after</p>");
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        let p = doc.children(body).next().unwrap();
        let children: Vec<NodeId> = doc.children(p).collect();
        assert_eq!(children.len(), 3);
        assert_eq!(doc.tag_name(children[1]), Some("br"));
    }

    /// With a leading DOCTYPE, the root's first child is still the `<html>`
    /// element.
    #[test]
    fn doctype_is_handled() {
        let doc = parse_html("<!DOCTYPE html><html><body></body></html>");
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        assert_eq!(doc.tag_name(html), Some("html"));
    }

    /// Drives `TreeBuilder` directly with a hand-written token sequence
    /// (bypassing `parse_html`) and checks the resulting tree.
    #[test]
    fn tree_builder_step_by_step() {
        let mut builder = TreeBuilder::new();
        builder.process_token(Token::Doctype {
            name: Some("html".into()),
            public_id: None,
            system_id: None,
            force_quirks: false,
        });
        builder.process_token(Token::StartTag {
            name: "html".into(),
            attributes: vec![],
            self_closing: false,
        });
        builder.process_token(Token::StartTag {
            name: "head".into(),
            attributes: vec![],
            self_closing: false,
        });
        builder.process_token(Token::EndTag {
            name: "head".into(),
        });
        builder.process_token(Token::StartTag {
            name: "body".into(),
            attributes: vec![],
            self_closing: false,
        });
        builder.process_token(Token::StartTag {
            name: "p".into(),
            attributes: vec![],
            self_closing: false,
        });
        builder.process_token(Token::Character("Hello".into()));
        builder.process_token(Token::EndTag { name: "p".into() });
        builder.process_token(Token::EndTag {
            name: "body".into(),
        });
        builder.process_token(Token::EndTag {
            name: "html".into(),
        });
        builder.process_token(Token::Eof);

        let doc = builder.finish();
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        assert_eq!(doc.tag_name(html), Some("html"));

        let body = doc.children(html).nth(1).unwrap();
        let p = doc.children(body).next().unwrap();
        assert_eq!(text_of_children(&doc, p), "Hello");
    }

    /// Two consecutive `Character` tokens produce one text node, not two.
    #[test]
    fn multiple_text_children_merge() {
        // When consecutive character tokens arrive, they should merge.
        let mut builder = TreeBuilder::new();
        builder.process_token(Token::StartTag {
            name: "p".into(),
            attributes: vec![],
            self_closing: false,
        });
        builder.process_token(Token::Character("Hello ".into()));
        builder.process_token(Token::Character("world".into()));
        builder.process_token(Token::EndTag { name: "p".into() });
        builder.process_token(Token::Eof);

        let doc = builder.finish();
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();
        let p = doc.children(body).next().unwrap();

        // Should be a single text node.
        let children: Vec<NodeId> = doc.children(p).collect();
        assert_eq!(children.len(), 1);
        assert_eq!(doc.text_content(children[0]), Some("Hello world"));
    }

    /// An inline `<svg>` and its `<rect>` child both carry `SVG_NAMESPACE`
    /// and keep their attributes.
    #[test]
    fn parse_inline_svg() {
        let doc = parse_html(
            "<html><body><svg width=\"100\" height=\"100\"><rect width=\"50\" height=\"50\" fill=\"red\"/></svg></body></html>",
        );
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();
        let svg = doc.children(body).next().unwrap();

        // SVG element should have SVG namespace.
        if let NodeData::Element {
            ref namespace,
            ref tag_name,
            ..
        } = *doc.node_data(svg)
        {
            assert_eq!(tag_name, "svg");
            assert_eq!(namespace.as_deref(), Some(SVG_NAMESPACE));
        } else {
            panic!("Expected Element node");
        }

        // SVG should have width/height attributes.
        assert_eq!(doc.get_attribute(svg, "width"), Some("100"));
        assert_eq!(doc.get_attribute(svg, "height"), Some("100"));

        // Rect child should also have SVG namespace.
        let rect = doc.children(svg).next().unwrap();
        if let NodeData::Element {
            ref namespace,
            ref tag_name,
            ..
        } = *doc.node_data(rect)
        {
            assert_eq!(tag_name, "rect");
            assert_eq!(namespace.as_deref(), Some(SVG_NAMESPACE));
        } else {
            panic!("Expected Element node");
        }
        assert_eq!(doc.get_attribute(rect, "fill"), Some("red"));
    }

    /// Nested SVG content (`<g>` holding `<circle>` and `<text>`) keeps its
    /// structure, and the `<text>` element holds its character data.
    #[test]
    fn parse_svg_with_nested_elements() {
        let doc = parse_html(
            "<body><svg width=\"200\" height=\"200\"><g><circle cx=\"50\" cy=\"50\" r=\"40\"/><text x=\"10\" y=\"80\">Hello</text></g></svg></body>",
        );
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        let svg = doc.children(body).next().unwrap();
        assert_eq!(doc.tag_name(svg), Some("svg"));

        let g = doc.children(svg).next().unwrap();
        assert_eq!(doc.tag_name(g), Some("g"));

        let children: Vec<String> = child_tags(&doc, g);
        assert_eq!(children, vec!["circle", "text"]);

        // Text element should contain text content.
        let text_el = doc.children(g).nth(1).unwrap();
        assert_eq!(doc.deep_text_content(text_el), "Hello");
    }

    /// After `</svg>`, a following `<p>` is a sibling of the `<svg>` and has
    /// no namespace, while the SVG's own child keeps `SVG_NAMESPACE`.
    #[test]
    fn svg_content_followed_by_html() {
        let doc = parse_html(
            "<body><svg width=\"50\" height=\"50\"><rect fill=\"blue\"/></svg><p>After SVG</p></body>",
        );
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        let children: Vec<String> = child_tags(&doc, body);
        assert_eq!(children, vec!["svg", "p"]);

        // SVG children should be in SVG namespace.
        let svg = doc.children(body).next().unwrap();
        let rect = doc.children(svg).next().unwrap();
        if let NodeData::Element { ref namespace, .. } = *doc.node_data(rect) {
            assert_eq!(namespace.as_deref(), Some(SVG_NAMESPACE));
        } else {
            // Fail loudly instead of skipping the namespace check.
            panic!("Expected Element node");
        }

        // Paragraph after SVG should be in HTML namespace (no namespace).
        let p = doc.children(body).nth(1).unwrap();
        if let NodeData::Element { ref namespace, .. } = *doc.node_data(p) {
            assert_eq!(namespace.as_deref(), None);
        } else {
            // Fail loudly instead of skipping the namespace check.
            panic!("Expected Element node");
        }
    }

    // --- Form element parsing tests (Phase 16) ---

    /// A form with two inputs and a button keeps its own attributes, the
    /// order of its children, and each control's attributes and text.
    #[test]
    fn parse_form_with_inputs() {
        let doc = parse_html(
            r#"<form action="/submit" method="post"><input type="text" name="user"><input type="password" name="pass"><button type="submit">Login</button></form>"#,
        );
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        let form = doc.children(body).next().unwrap();
        assert_eq!(doc.tag_name(form), Some("form"));
        assert_eq!(doc.get_attribute(form, "action"), Some("/submit"));
        assert_eq!(doc.get_attribute(form, "method"), Some("post"));

        let tags = child_tags(&doc, form);
        assert_eq!(tags, vec!["input", "input", "button"]);

        // Check input attributes.
        let children: Vec<NodeId> = doc.children(form).collect();
        assert_eq!(doc.get_attribute(children[0], "type"), Some("text"));
        assert_eq!(doc.get_attribute(children[0], "name"), Some("user"));
        assert_eq!(doc.get_attribute(children[1], "type"), Some("password"));
        assert_eq!(doc.get_attribute(children[1], "name"), Some("pass"));

        // Button contains text.
        assert_eq!(doc.get_attribute(children[2], "type"), Some("submit"));
        assert_eq!(text_of_children(&doc, children[2]), "Login");
    }

    /// A `<textarea>` keeps its attributes and its default text content.
    #[test]
    fn parse_textarea() {
        let doc = parse_html(
            r#"<form><textarea name="bio" rows="4" cols="50">Default text here</textarea></form>"#,
        );
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();
        let form = doc.children(body).next().unwrap();

        let textarea = doc.children(form).next().unwrap();
        assert_eq!(doc.tag_name(textarea), Some("textarea"));
        assert_eq!(doc.get_attribute(textarea, "name"), Some("bio"));
        assert_eq!(doc.get_attribute(textarea, "rows"), Some("4"));
        assert_eq!(doc.get_attribute(textarea, "cols"), Some("50"));
        assert_eq!(text_of_children(&doc, textarea), "Default text here");
    }

    /// A `<select>` gets one child per `<option>`; the valueless `selected`
    /// attribute is stored as the empty string.
    #[test]
    fn parse_select_with_options() {
        let doc = parse_html(
            r#"<form><select name="color"><option value="r">Red</option><option value="g" selected>Green</option><option value="b">Blue</option></select></form>"#,
        );
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();
        let form = doc.children(body).next().unwrap();

        let select = doc.children(form).next().unwrap();
        assert_eq!(doc.tag_name(select), Some("select"));
        assert_eq!(doc.get_attribute(select, "name"), Some("color"));

        let options: Vec<NodeId> = doc.children(select).collect();
        assert_eq!(options.len(), 3);
        assert_eq!(doc.get_attribute(options[0], "value"), Some("r"));
        assert_eq!(text_of_children(&doc, options[0]), "Red");
        assert_eq!(doc.get_attribute(options[1], "value"), Some("g"));
        assert_eq!(doc.get_attribute(options[1], "selected"), Some(""));
        assert_eq!(doc.get_attribute(options[2], "value"), Some("b"));
    }

    /// `<optgroup>` elements are direct children of the `<select>`, and each
    /// group holds its own `<option>` children.
    #[test]
    fn parse_select_with_optgroups() {
        let doc = parse_html(
            r#"<select><optgroup label="Primary"><option>Red</option><option>Blue</option></optgroup><optgroup label="Secondary"><option>Orange</option></optgroup></select>"#,
        );
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        let select = doc.children(body).next().unwrap();
        assert_eq!(doc.tag_name(select), Some("select"));

        let groups = child_tags(&doc, select);
        assert_eq!(groups, vec!["optgroup", "optgroup"]);

        let group1 = doc.children(select).next().unwrap();
        assert_eq!(doc.get_attribute(group1, "label"), Some("Primary"));
        let options: Vec<String> = child_tags(&doc, group1);
        assert_eq!(options, vec!["option", "option"]);
    }

    /// A `<fieldset>` keeps its `<legend>` and inputs as children in source
    /// order, and the legend holds its text.
    #[test]
    fn parse_fieldset_and_legend() {
        let doc = parse_html(
            r#"<form><fieldset><legend>Personal Info</legend><input type="text" name="name"><input type="email" name="email"></fieldset></form>"#,
        );
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();
        let form = doc.children(body).next().unwrap();

        let fieldset = doc.children(form).next().unwrap();
        assert_eq!(doc.tag_name(fieldset), Some("fieldset"));

        let fieldset_tags = child_tags(&doc, fieldset);
        assert_eq!(fieldset_tags, vec!["legend", "input", "input"]);

        let legend = doc.children(fieldset).next().unwrap();
        assert_eq!(text_of_children(&doc, legend), "Personal Info");
    }

    /// A `<label for="…">` resolves through `label_control` to the input
    /// whose `id` matches.
    #[test]
    fn parse_label_with_for_attribute() {
        let doc = parse_html(
            r#"<form><label for="name">Name:</label><input type="text" id="name" name="name"></form>"#,
        );
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();
        let form = doc.children(body).next().unwrap();

        let tags = child_tags(&doc, form);
        assert_eq!(tags, vec!["label", "input"]);

        let label = doc.children(form).next().unwrap();
        assert_eq!(doc.get_attribute(label, "for"), Some("name"));
        assert_eq!(text_of_children(&doc, label), "Name:");

        // Verify label_control resolves via `for` attribute.
        let input = doc.children(form).nth(1).unwrap();
        assert_eq!(doc.label_control(label), Some(input));
    }

    /// A `<label>` with no `for` attribute resolves through `label_control`
    /// to the input nested inside it.
    #[test]
    fn parse_label_implicit_association() {
        let doc = parse_html(r#"<label>Name: <input type="text" name="name"></label>"#);
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        let label = doc.children(body).next().unwrap();
        assert_eq!(doc.tag_name(label), Some("label"));

        let input = doc.label_control(label).unwrap();
        assert_eq!(doc.tag_name(input), Some("input"));
        assert_eq!(doc.get_attribute(input, "name"), Some("name"));
    }

    /// A `<form>` start tag closes an open `<p>`.
    #[test]
    fn form_closes_p_in_button_scope() {
        let doc = parse_html("<p>text<form><input></form>");
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        // <p> should be closed by <form>, so they're siblings.
        let tags = child_tags(&doc, body);
        assert_eq!(tags, vec!["p", "form"]);
    }

    /// A `<form>` start tag inside an open form is dropped; its content is
    /// attached to the outer form.
    #[test]
    fn nested_form_is_ignored() {
        let doc = parse_html("<form id=\"outer\"><form id=\"inner\"><input></form></form>");
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        // Only one <form> should exist (nested form is ignored).
        let tags = child_tags(&doc, body);
        assert_eq!(tags, vec!["form"]);

        let form = doc.children(body).next().unwrap();
        assert_eq!(doc.get_attribute(form, "id"), Some("outer"));

        // Input should be a child of the outer form.
        let form_tags = child_tags(&doc, form);
        assert_eq!(form_tags, vec!["input"]);
    }

    /// Two consecutive, explicitly closed `<button>` elements are siblings
    /// in the body.
    #[test]
    fn button_scope_handling() {
        let doc = parse_html("<button>First</button><button>Second</button>");
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        let tags = child_tags(&doc, body);
        assert_eq!(tags, vec!["button", "button"]);
    }

    /// `form_elements` returns the form's four named controls in document
    /// order, including the one nested inside a `<div>`.
    #[test]
    fn form_elements_collection() {
        let doc = parse_html(
            r#"<form><input name="a"><div><input name="b"></div><select name="c"><option>X</option></select><textarea name="d"></textarea></form>"#,
        );
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();
        let form = doc.children(body).next().unwrap();

        let elements = doc.form_elements(form);
        assert_eq!(elements.len(), 4);

        // Verify they're the right elements.
        assert_eq!(doc.get_attribute(elements[0], "name"), Some("a"));
        assert_eq!(doc.get_attribute(elements[1], "name"), Some("b"));
        assert_eq!(doc.get_attribute(elements[2], "name"), Some("c"));
        assert_eq!(doc.get_attribute(elements[3], "name"), Some("d"));
    }

    /// `form_owner` of an input nested inside a `<div>` inside a form is
    /// that form.
    #[test]
    fn form_owner_from_parsed_tree() {
        let doc = parse_html(r#"<form id="f"><div><input id="i"></div></form>"#);
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();
        let form = doc.children(body).next().unwrap();

        let input = doc.get_element_by_id("i").unwrap();
        assert_eq!(doc.form_owner(input), Some(form));
    }

    /// Twelve `<input>` elements of different types are all parsed as
    /// children of the form with their attributes intact.
    #[test]
    fn parse_input_types() {
        let doc = parse_html(
            r#"<form> <input type="text"> <input type="password"> <input type="checkbox" checked> <input type="radio" name="choice" value="a"> <input type="submit" value="Go"> <input type="reset"> <input type="hidden" name="token" value="abc"> <input type="number" min="0" max="100"> <input type="email"> <input type="url"> <input type="search"> <input type="tel"> </form>"#,
        );
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();
        let form = doc.children(body).next().unwrap();

        // Keep only the <input> children; any other children of the form
        // are skipped.
        let inputs: Vec<NodeId> = doc
            .children(form)
            .filter(|&id| doc.tag_name(id) == Some("input"))
            .collect();
        assert_eq!(inputs.len(), 12);

        assert_eq!(doc.get_attribute(inputs[0], "type"), Some("text"));
        assert_eq!(doc.get_attribute(inputs[1], "type"), Some("password"));
        assert_eq!(doc.get_attribute(inputs[2], "type"), Some("checkbox"));
        assert_eq!(doc.get_attribute(inputs[2], "checked"), Some(""));
        assert_eq!(doc.get_attribute(inputs[3], "type"), Some("radio"));
        assert_eq!(doc.get_attribute(inputs[3], "name"), Some("choice"));
        assert_eq!(doc.get_attribute(inputs[4], "type"), Some("submit"));
        assert_eq!(doc.get_attribute(inputs[4], "value"), Some("Go"));
        assert_eq!(doc.get_attribute(inputs[5], "type"), Some("reset"));
        assert_eq!(doc.get_attribute(inputs[6], "type"), Some("hidden"));
        assert_eq!(doc.get_attribute(inputs[6], "value"), Some("abc"));
        assert_eq!(doc.get_attribute(inputs[7], "type"), Some("number"));
        assert_eq!(doc.get_attribute(inputs[7], "min"), Some("0"));
        assert_eq!(doc.get_attribute(inputs[7], "max"), Some("100"));
        assert_eq!(doc.get_attribute(inputs[8], "type"), Some("email"));
        assert_eq!(doc.get_attribute(inputs[9], "type"), Some("url"));
        assert_eq!(doc.get_attribute(inputs[10], "type"), Some("search"));
        assert_eq!(doc.get_attribute(inputs[11], "type"), Some("tel"));
    }

    /// An `<input>` start tag inside an open `<select>` closes the select.
    #[test]
    fn select_closes_on_input() {
        // An <input> inside a <select> should close the select.
        let doc = parse_html("<select><option>A</option><input type=\"text\"></select>");
        let root = doc.root();
        let html = doc.children(root).next().unwrap();
        let body = doc.children(html).nth(1).unwrap();

        let tags = child_tags(&doc, body);
        // Select should be closed before input, making them siblings.
        assert_eq!(tags, vec!["select", "input"]);
    }
}