we (web engine): Experimental web browser project to understand the limits of Claude
2
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 1797 lines 67 kB view raw
1//! HTML tree builder: construct a DOM tree from tokenizer output. 2//! 3//! Implements a simplified subset of the WHATWG HTML5 tree construction 4//! algorithm for Phase 3 of the browser engine. 5 6use we_dom::{Document, NodeId}; 7 8use crate::{Token, Tokenizer}; 9 10/// Insertion modes for the tree builder state machine. 11#[derive(Debug, Clone, Copy, PartialEq)] 12enum InsertionMode { 13 Initial, 14 BeforeHtml, 15 BeforeHead, 16 InHead, 17 Text, 18 AfterHead, 19 InBody, 20 InSelect, 21 AfterBody, 22 AfterAfterBody, 23} 24 25/// Returns true if the given tag name is a void element (self-closing, no end tag). 26fn is_void_element(tag: &str) -> bool { 27 matches!( 28 tag, 29 "area" 30 | "base" 31 | "br" 32 | "col" 33 | "embed" 34 | "hr" 35 | "img" 36 | "input" 37 | "link" 38 | "meta" 39 | "param" 40 | "source" 41 | "track" 42 | "wbr" 43 ) 44} 45 46/// SVG namespace URI. 47const SVG_NAMESPACE: &str = "http://www.w3.org/2000/svg"; 48 49/// HTML tree builder that processes tokens and constructs a DOM tree. 50pub struct TreeBuilder { 51 document: Document, 52 /// Stack of open elements (the current nesting context). 53 open_elements: Vec<NodeId>, 54 head_element: Option<NodeId>, 55 body_element: Option<NodeId>, 56 /// The form element pointer (per HTML spec §13.2.4.1). 57 form_element: Option<NodeId>, 58 insertion_mode: InsertionMode, 59 /// Original insertion mode, saved when switching to Text mode. 60 original_insertion_mode: Option<InsertionMode>, 61 /// Pending text for the Text insertion mode (e.g., inside `<title>`). 62 pending_text: String, 63 /// Depth counter for SVG foreign content. >0 means we are inside an `<svg>` element. 64 svg_depth: usize, 65} 66 67impl TreeBuilder { 68 /// Create a new tree builder with an empty document. 69 pub fn new() -> Self { 70 TreeBuilder { 71 document: Document::new(), 72 open_elements: Vec::new(), 73 head_element: None, 74 body_element: None, 75 form_element: None, 76 insertion_mode: InsertionMode::Initial, 77 original_insertion_mode: None, 78 pending_text: String::new(), 79 svg_depth: 0, 80 } 81 } 82 83 /// Process a single token, updating the DOM tree. 84 pub fn process_token(&mut self, token: Token) { 85 // Handle SVG foreign content. 86 if self.svg_depth > 0 { 87 self.handle_svg_content(token); 88 return; 89 } 90 91 match self.insertion_mode { 92 InsertionMode::Initial => self.handle_initial(token), 93 InsertionMode::BeforeHtml => self.handle_before_html(token), 94 InsertionMode::BeforeHead => self.handle_before_head(token), 95 InsertionMode::InHead => self.handle_in_head(token), 96 InsertionMode::Text => self.handle_text(token), 97 InsertionMode::AfterHead => self.handle_after_head(token), 98 InsertionMode::InBody => self.handle_in_body(token), 99 InsertionMode::InSelect => self.handle_in_select(token), 100 InsertionMode::AfterBody => self.handle_after_body(token), 101 InsertionMode::AfterAfterBody => self.handle_after_after_body(token), 102 } 103 } 104 105 /// Finish building and return the constructed DOM document. 106 pub fn finish(self) -> Document { 107 self.document 108 } 109 110 // --- Insertion mode handlers --- 111 112 fn handle_initial(&mut self, token: Token) { 113 match token { 114 Token::Doctype { .. } => { 115 // For Phase 3, we just acknowledge the DOCTYPE and move on. 116 self.insertion_mode = InsertionMode::BeforeHtml; 117 } 118 Token::Comment(data) => { 119 let comment = self.document.create_comment(&data); 120 let root = self.document.root(); 121 self.document.append_child(root, comment); 122 } 123 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 124 // Ignore whitespace in Initial mode. 125 } 126 _ => { 127 // Anything else: switch to BeforeHtml and reprocess. 128 self.insertion_mode = InsertionMode::BeforeHtml; 129 self.handle_before_html(token); 130 } 131 } 132 } 133 134 fn handle_before_html(&mut self, token: Token) { 135 match token { 136 Token::Doctype { .. } => { /* ignore */ } 137 Token::Comment(data) => { 138 let comment = self.document.create_comment(&data); 139 let root = self.document.root(); 140 self.document.append_child(root, comment); 141 } 142 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 143 // Ignore whitespace. 144 } 145 Token::StartTag { ref name, .. } if name == "html" => { 146 let html = self.create_element_from_token(&token); 147 let root = self.document.root(); 148 self.document.append_child(root, html); 149 self.open_elements.push(html); 150 self.insertion_mode = InsertionMode::BeforeHead; 151 } 152 Token::EndTag { ref name } 153 if name != "head" && name != "body" && name != "html" && name != "br" => 154 { 155 // Parse error, ignore. 156 } 157 _ => { 158 // Create an implicit <html> element. 159 let html = self.document.create_element("html"); 160 let root = self.document.root(); 161 self.document.append_child(root, html); 162 self.open_elements.push(html); 163 self.insertion_mode = InsertionMode::BeforeHead; 164 self.handle_before_head(token); 165 } 166 } 167 } 168 169 fn handle_before_head(&mut self, token: Token) { 170 match token { 171 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 172 // Ignore whitespace. 173 } 174 Token::Comment(data) => { 175 self.insert_comment(&data); 176 } 177 Token::Doctype { .. } => { /* ignore */ } 178 Token::StartTag { ref name, .. } if name == "html" => { 179 // Process as if InBody. 180 self.handle_in_body(token); 181 } 182 Token::StartTag { ref name, .. } if name == "head" => { 183 let head = self.create_element_from_token(&token); 184 self.insert_node(head); 185 self.open_elements.push(head); 186 self.head_element = Some(head); 187 self.insertion_mode = InsertionMode::InHead; 188 } 189 Token::EndTag { ref name } 190 if name != "head" && name != "body" && name != "html" && name != "br" => 191 { 192 // Parse error, ignore. 193 } 194 _ => { 195 // Implied <head>. 196 let head = self.document.create_element("head"); 197 self.insert_node(head); 198 self.open_elements.push(head); 199 self.head_element = Some(head); 200 self.insertion_mode = InsertionMode::InHead; 201 self.handle_in_head(token); 202 } 203 } 204 } 205 206 fn handle_in_head(&mut self, token: Token) { 207 match token { 208 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 209 self.insert_text(s); 210 } 211 Token::Comment(data) => { 212 self.insert_comment(&data); 213 } 214 Token::Doctype { .. } => { /* ignore */ } 215 Token::StartTag { ref name, .. } if name == "title" => { 216 let elem = self.create_element_from_token(&token); 217 self.insert_node(elem); 218 self.open_elements.push(elem); 219 self.original_insertion_mode = Some(self.insertion_mode); 220 self.insertion_mode = InsertionMode::Text; 221 } 222 Token::StartTag { ref name, .. } 223 if name == "style" || name == "script" || name == "noscript" => 224 { 225 let elem = self.create_element_from_token(&token); 226 self.insert_node(elem); 227 self.open_elements.push(elem); 228 self.original_insertion_mode = Some(self.insertion_mode); 229 self.insertion_mode = InsertionMode::Text; 230 } 231 Token::StartTag { ref name, .. } if name == "meta" || name == "link" => { 232 let elem = self.create_element_from_token(&token); 233 self.insert_node(elem); 234 // Void elements: don't push onto stack. 235 } 236 Token::StartTag { ref name, .. } if name == "head" => { 237 // Ignore duplicate <head>. 238 } 239 Token::EndTag { ref name } if name == "head" => { 240 self.pop_until("head"); 241 self.insertion_mode = InsertionMode::AfterHead; 242 } 243 Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => { 244 // Parse error, ignore. 245 } 246 _ => { 247 // Pop <head> and switch to AfterHead, then reprocess. 248 self.pop_until("head"); 249 self.insertion_mode = InsertionMode::AfterHead; 250 self.handle_after_head(token); 251 } 252 } 253 } 254 255 fn handle_text(&mut self, token: Token) { 256 match token { 257 Token::Character(s) => { 258 self.pending_text.push_str(&s); 259 } 260 Token::EndTag { .. } => { 261 // Flush pending text. 262 if !self.pending_text.is_empty() { 263 let text = self.pending_text.clone(); 264 self.pending_text.clear(); 265 self.insert_text(&text); 266 } 267 // Pop the element (e.g., <title>). 268 self.open_elements.pop(); 269 self.insertion_mode = self 270 .original_insertion_mode 271 .unwrap_or(InsertionMode::InBody); 272 self.original_insertion_mode = None; 273 } 274 Token::Eof => { 275 // Flush pending text. 276 if !self.pending_text.is_empty() { 277 let text = self.pending_text.clone(); 278 self.pending_text.clear(); 279 self.insert_text(&text); 280 } 281 self.open_elements.pop(); 282 self.insertion_mode = self 283 .original_insertion_mode 284 .unwrap_or(InsertionMode::InBody); 285 self.original_insertion_mode = None; 286 self.process_token(Token::Eof); 287 } 288 _ => {} 289 } 290 } 291 292 fn handle_after_head(&mut self, token: Token) { 293 match token { 294 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 295 self.insert_text(s); 296 } 297 Token::Comment(data) => { 298 self.insert_comment(&data); 299 } 300 Token::Doctype { .. } => { /* ignore */ } 301 Token::StartTag { ref name, .. } if name == "html" => { 302 self.handle_in_body(token); 303 } 304 Token::StartTag { ref name, .. } if name == "body" => { 305 let body = self.create_element_from_token(&token); 306 self.insert_node(body); 307 self.open_elements.push(body); 308 self.body_element = Some(body); 309 self.insertion_mode = InsertionMode::InBody; 310 } 311 Token::StartTag { ref name, .. } if name == "head" => { 312 // Ignore. 313 } 314 Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => { 315 // Ignore. 316 } 317 _ => { 318 // Implied <body>. 319 let body = self.document.create_element("body"); 320 self.insert_node(body); 321 self.open_elements.push(body); 322 self.body_element = Some(body); 323 self.insertion_mode = InsertionMode::InBody; 324 self.handle_in_body(token); 325 } 326 } 327 } 328 329 fn handle_in_body(&mut self, token: Token) { 330 match token { 331 Token::Character(s) => { 332 self.insert_text(&s); 333 } 334 Token::Comment(data) => { 335 self.insert_comment(&data); 336 } 337 Token::Doctype { .. } => { /* ignore */ } 338 Token::StartTag { ref name, .. } if name == "html" => { 339 // Merge attributes onto existing <html> element. 340 if let Token::StartTag { attributes, .. } = &token { 341 if let Some(&html_id) = self.open_elements.first() { 342 for (attr_name, attr_value) in attributes { 343 if self.document.get_attribute(html_id, attr_name).is_none() { 344 self.document.set_attribute(html_id, attr_name, attr_value); 345 } 346 } 347 } 348 } 349 } 350 Token::StartTag { ref name, .. } 351 if name == "body" 352 || name == "head" 353 || name == "title" 354 || name == "style" 355 || name == "script" => 356 { 357 match name.as_str() { 358 "body" => { 359 // Ignore duplicate <body>. 360 } 361 "head" => { 362 // Ignore <head> in body. 363 } 364 _ => { 365 // title/style/script: process using InHead rules 366 self.handle_in_head(token); 367 } 368 } 369 } 370 Token::StartTag { ref name, .. } 371 if name == "p" 372 || name == "div" 373 || name == "h1" 374 || name == "h2" 375 || name == "h3" 376 || name == "h4" 377 || name == "h5" 378 || name == "h6" 379 || name == "pre" 380 || name == "blockquote" 381 || name == "ul" 382 || name == "ol" 383 || name == "li" => 384 { 385 // If there's a <p> in button scope, close it first. 386 if self.has_element_in_button_scope("p") { 387 self.close_p_element(); 388 } 389 let elem = self.create_element_from_token(&token); 390 self.insert_node(elem); 391 self.open_elements.push(elem); 392 } 393 // --- Form elements (Phase 16) --- 394 Token::StartTag { ref name, .. } if name == "form" => { 395 // Per spec: if the form element pointer is not null, ignore. 396 if self.form_element.is_some() { 397 // Parse error, ignore the token. 398 } else { 399 if self.has_element_in_button_scope("p") { 400 self.close_p_element(); 401 } 402 let elem = self.create_element_from_token(&token); 403 self.insert_node(elem); 404 self.open_elements.push(elem); 405 self.form_element = Some(elem); 406 } 407 } 408 Token::StartTag { ref name, .. } if name == "fieldset" => { 409 if self.has_element_in_button_scope("p") { 410 self.close_p_element(); 411 } 412 let elem = self.create_element_from_token(&token); 413 self.insert_node(elem); 414 self.open_elements.push(elem); 415 } 416 Token::StartTag { ref name, .. } if name == "button" => { 417 // If there's a button in scope, close it first. 418 if self.has_element_in_scope("button") { 419 self.generate_implied_end_tags(None); 420 self.pop_until("button"); 421 } 422 let elem = self.create_element_from_token(&token); 423 self.insert_node(elem); 424 self.open_elements.push(elem); 425 } 426 Token::StartTag { ref name, .. } if name == "textarea" => { 427 let elem = self.create_element_from_token(&token); 428 self.insert_node(elem); 429 self.open_elements.push(elem); 430 // Switch to Text mode to collect raw text content. 431 self.original_insertion_mode = Some(self.insertion_mode); 432 self.insertion_mode = InsertionMode::Text; 433 } 434 Token::StartTag { ref name, .. } if name == "select" => { 435 let elem = self.create_element_from_token(&token); 436 self.insert_node(elem); 437 self.open_elements.push(elem); 438 self.insertion_mode = InsertionMode::InSelect; 439 } 440 Token::StartTag { ref name, .. } if name == "optgroup" || name == "option" => { 441 // Close any currently open <option>. 442 if let Some(&top) = self.open_elements.last() { 443 if self.document.tag_name(top) == Some("option") { 444 self.open_elements.pop(); 445 } 446 } 447 // Also close open <optgroup> when a new <optgroup> starts. 448 if name == "optgroup" { 449 if let Some(&top) = self.open_elements.last() { 450 if self.document.tag_name(top) == Some("optgroup") { 451 self.open_elements.pop(); 452 } 453 } 454 } 455 let elem = self.create_element_from_token(&token); 456 self.insert_node(elem); 457 self.open_elements.push(elem); 458 } 459 // --- SVG / iframe / void / generic start tags --- 460 Token::StartTag { ref name, .. } if name == "svg" => { 461 let elem = self.create_svg_element_from_token(&token); 462 self.insert_node(elem); 463 self.open_elements.push(elem); 464 self.svg_depth = 1; 465 } 466 Token::StartTag { ref name, .. } if name == "iframe" => { 467 // Per HTML spec, <iframe> uses RAWTEXT parsing: content between 468 // <iframe> and </iframe> is raw text (fallback content), not HTML. 469 let elem = self.create_element_from_token(&token); 470 self.insert_node(elem); 471 self.open_elements.push(elem); 472 self.original_insertion_mode = Some(self.insertion_mode); 473 self.insertion_mode = InsertionMode::Text; 474 } 475 Token::StartTag { ref name, .. } if is_void_element(name) => { 476 let elem = self.create_element_from_token(&token); 477 self.insert_node(elem); 478 // Don't push void elements onto the stack. 479 } 480 Token::StartTag { .. } => { 481 // Generic start tag: create element and push onto stack. 482 let elem = self.create_element_from_token(&token); 483 self.insert_node(elem); 484 self.open_elements.push(elem); 485 } 486 Token::EndTag { ref name } if name == "body" => { 487 if self.has_element_in_scope("body") { 488 self.insertion_mode = InsertionMode::AfterBody; 489 } 490 } 491 Token::EndTag { ref name } if name == "html" => { 492 if self.has_element_in_scope("body") { 493 self.insertion_mode = InsertionMode::AfterBody; 494 self.handle_after_body(token); 495 } 496 } 497 Token::EndTag { ref name } if name == "p" => { 498 if !self.has_element_in_button_scope("p") { 499 // No matching <p>: insert an empty one, then close it. 500 let p = self.document.create_element("p"); 501 self.insert_node(p); 502 self.open_elements.push(p); 503 } 504 self.close_p_element(); 505 } 506 // --- Form end tags (Phase 16) --- 507 Token::EndTag { ref name } if name == "form" => { 508 // Per spec: reset the form element pointer, then pop. 509 self.form_element = None; 510 if self.has_element_in_scope("form") { 511 self.generate_implied_end_tags(Some("form")); 512 self.pop_until("form"); 513 } 514 } 515 Token::EndTag { ref name } if name == "button" => { 516 if self.has_element_in_scope("button") { 517 self.generate_implied_end_tags(None); 518 self.pop_until("button"); 519 } 520 } 521 Token::EndTag { ref name } if name == "fieldset" => { 522 if self.has_element_in_scope("fieldset") { 523 self.generate_implied_end_tags(None); 524 self.pop_until("fieldset"); 525 } 526 } 527 Token::EndTag { ref name } if name == "optgroup" || name == "option" => { 528 if self.has_element_in_scope(name) { 529 self.generate_implied_end_tags(Some(name)); 530 self.pop_until(name); 531 } 532 } 533 // --- End of form end tags --- 534 Token::EndTag { ref name } 535 if name == "div" 536 || name == "pre" 537 || name == "blockquote" 538 || name == "ul" 539 || name == "ol" 540 || name == "li" => 541 { 542 if self.has_element_in_scope(name) { 543 self.generate_implied_end_tags(Some(name)); 544 self.pop_until(name); 545 } 546 } 547 Token::EndTag { ref name } 548 if name == "h1" 549 || name == "h2" 550 || name == "h3" 551 || name == "h4" 552 || name == "h5" 553 || name == "h6" => 554 { 555 if self.has_heading_in_scope() { 556 self.generate_implied_end_tags(None); 557 // Pop until we find a heading element. 558 while let Some(id) = self.open_elements.pop() { 559 if let Some(tag) = self.document.tag_name(id) { 560 if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") { 561 break; 562 } 563 } 564 } 565 } 566 } 567 Token::EndTag { ref name } => { 568 // Generic end tag: walk back through open elements. 569 self.handle_any_other_end_tag(name); 570 } 571 Token::Eof => { 572 // Stop parsing. 573 } 574 } 575 } 576 577 /// Handle tokens inside a `<select>` element (InSelect insertion mode). 578 fn handle_in_select(&mut self, token: Token) { 579 match token { 580 Token::Character(s) => { 581 self.insert_text(&s); 582 } 583 Token::Comment(data) => { 584 self.insert_comment(&data); 585 } 586 Token::StartTag { ref name, .. } if name == "option" => { 587 // Close any currently open <option>. 588 if let Some(&top) = self.open_elements.last() { 589 if self.document.tag_name(top) == Some("option") { 590 self.open_elements.pop(); 591 } 592 } 593 let elem = self.create_element_from_token(&token); 594 self.insert_node(elem); 595 self.open_elements.push(elem); 596 } 597 Token::StartTag { ref name, .. } if name == "optgroup" => { 598 // Close any open <option>, then close any open <optgroup>. 599 if let Some(&top) = self.open_elements.last() { 600 if self.document.tag_name(top) == Some("option") { 601 self.open_elements.pop(); 602 } 603 } 604 if let Some(&top) = self.open_elements.last() { 605 if self.document.tag_name(top) == Some("optgroup") { 606 self.open_elements.pop(); 607 } 608 } 609 let elem = self.create_element_from_token(&token); 610 self.insert_node(elem); 611 self.open_elements.push(elem); 612 } 613 Token::EndTag { ref name } if name == "option" => { 614 if let Some(&top) = self.open_elements.last() { 615 if self.document.tag_name(top) == Some("option") { 616 self.open_elements.pop(); 617 } 618 } 619 } 620 Token::EndTag { ref name } if name == "optgroup" => { 621 // If the top is <option>, pop it first. 622 if let Some(&top) = self.open_elements.last() { 623 if self.document.tag_name(top) == Some("option") { 624 self.open_elements.pop(); 625 } 626 } 627 if let Some(&top) = self.open_elements.last() { 628 if self.document.tag_name(top) == Some("optgroup") { 629 self.open_elements.pop(); 630 } 631 } 632 } 633 Token::EndTag { ref name } if name == "select" => { 634 self.pop_until("select"); 635 self.insertion_mode = InsertionMode::InBody; 636 } 637 Token::StartTag { ref name, .. } if name == "select" => { 638 // Nested <select>: close the current one (parse error). 639 self.pop_until("select"); 640 self.insertion_mode = InsertionMode::InBody; 641 } 642 Token::StartTag { ref name, .. } if name == "input" || name == "textarea" => { 643 // Per spec: these close the <select> and reprocess in InBody. 644 self.pop_until("select"); 645 self.insertion_mode = InsertionMode::InBody; 646 self.handle_in_body(token); 647 } 648 Token::Eof => { 649 // Stop parsing. 650 } 651 _ => { 652 // Ignore anything else in select. 653 } 654 } 655 } 656 657 fn handle_after_body(&mut self, token: Token) { 658 match token { 659 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 660 // Process whitespace as in InBody. 661 self.handle_in_body(token); 662 } 663 Token::Comment(data) => { 664 // Insert as last child of the first element (html). 665 let comment = self.document.create_comment(&data); 666 if let Some(&html) = self.open_elements.first() { 667 self.document.append_child(html, comment); 668 } 669 } 670 Token::Doctype { .. } => { /* ignore */ } 671 Token::EndTag { ref name } if name == "html" => { 672 self.insertion_mode = InsertionMode::AfterAfterBody; 673 } 674 Token::Eof => { 675 // Stop parsing. 676 } 677 _ => { 678 // Anything else: switch back to InBody and reprocess. 679 self.insertion_mode = InsertionMode::InBody; 680 self.handle_in_body(token); 681 } 682 } 683 } 684 685 fn handle_after_after_body(&mut self, token: Token) { 686 match token { 687 Token::Comment(data) => { 688 let comment = self.document.create_comment(&data); 689 let root = self.document.root(); 690 self.document.append_child(root, comment); 691 } 692 Token::Doctype { .. } => { /* ignore */ } 693 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 694 self.handle_in_body(token); 695 } 696 Token::Eof => { 697 // Stop. 698 } 699 _ => { 700 self.insertion_mode = InsertionMode::InBody; 701 self.handle_in_body(token); 702 } 703 } 704 } 705 706 // --- SVG foreign content --- 707 708 /// Handle tokens while inside an SVG subtree. 709 fn handle_svg_content(&mut self, token: Token) { 710 match token { 711 Token::Character(s) => { 712 self.insert_text(&s); 713 } 714 Token::Comment(data) => { 715 self.insert_comment(&data); 716 } 717 Token::StartTag { ref name, .. } if name == "svg" => { 718 // Nested <svg>. 719 let elem = self.create_svg_element_from_token(&token); 720 self.insert_node(elem); 721 self.open_elements.push(elem); 722 self.svg_depth += 1; 723 } 724 Token::StartTag { self_closing, .. } => { 725 let elem = self.create_svg_element_from_token(&token); 726 self.insert_node(elem); 727 if !self_closing { 728 self.open_elements.push(elem); 729 } 730 } 731 Token::EndTag { ref name } if name == "svg" => { 732 self.pop_until("svg"); 733 self.svg_depth -= 1; 734 } 735 Token::EndTag { ref name } => { 736 // Pop matching element from the stack. 737 self.handle_any_other_end_tag(name); 738 } 739 Token::Eof => {} 740 _ => {} 741 } 742 } 743 744 // --- Helper methods --- 745 746 /// Create a DOM element from a StartTag token with SVG namespace, setting attributes. 747 fn create_svg_element_from_token(&mut self, token: &Token) -> NodeId { 748 if let Token::StartTag { 749 name, attributes, .. 750 } = token 751 { 752 let id = self.document.create_element_ns(name, Some(SVG_NAMESPACE)); 753 for (attr_name, attr_value) in attributes { 754 self.document.set_attribute(id, attr_name, attr_value); 755 } 756 id 757 } else { 758 self.document 759 .create_element_ns("unknown", Some(SVG_NAMESPACE)) 760 } 761 } 762 763 /// Create a DOM element from a StartTag token, setting attributes. 764 fn create_element_from_token(&mut self, token: &Token) -> NodeId { 765 if let Token::StartTag { 766 name, attributes, .. 767 } = token 768 { 769 let id = self.document.create_element(name); 770 for (attr_name, attr_value) in attributes { 771 self.document.set_attribute(id, attr_name, attr_value); 772 } 773 id 774 } else { 775 // Should only be called with StartTag tokens. 776 self.document.create_element("unknown") 777 } 778 } 779 780 /// Insert a node at the current insertion point (last open element). 781 fn insert_node(&mut self, node: NodeId) { 782 let parent = self 783 .open_elements 784 .last() 785 .copied() 786 .unwrap_or_else(|| self.document.root()); 787 self.document.append_child(parent, node); 788 } 789 790 /// Insert a text node at the current insertion point. 791 /// If the last child is already a text node, append to it. 792 fn insert_text(&mut self, data: &str) { 793 let parent = self 794 .open_elements 795 .last() 796 .copied() 797 .unwrap_or_else(|| self.document.root()); 798 799 // Try to merge with existing text node. 800 if let Some(last_child) = self.document.last_child(parent) { 801 if let we_dom::NodeData::Text { data: ref existing } = 802 *self.document.node_data(last_child) 803 { 804 let mut merged = existing.clone(); 805 merged.push_str(data); 806 self.document.set_text_content(last_child, &merged); 807 return; 808 } 809 } 810 811 let text = self.document.create_text(data); 812 self.document.append_child(parent, text); 813 } 814 815 /// Insert a comment node at the current insertion point. 816 fn insert_comment(&mut self, data: &str) { 817 let comment = self.document.create_comment(data); 818 self.insert_node(comment); 819 } 820 821 /// Pop elements from the stack until we find one with the given tag name. 822 /// The matching element is also popped. 823 fn pop_until(&mut self, tag_name: &str) { 824 while let Some(id) = self.open_elements.pop() { 825 if self.document.tag_name(id) == Some(tag_name) { 826 return; 827 } 828 } 829 } 830 831 /// Check if the given tag name is "in scope" (simplified). 832 /// In scope means there's an element with that tag on the stack, 833 /// and no scope barrier element between it and the top. 834 fn has_element_in_scope(&self, target: &str) -> bool { 835 for &id in self.open_elements.iter().rev() { 836 if let Some(tag) = self.document.tag_name(id) { 837 if tag == target { 838 return true; 839 } 840 // Scope barrier elements. 841 if matches!( 842 tag, 843 "applet" 844 | "caption" 845 | "html" 846 | "table" 847 | "td" 848 | "th" 849 | "marquee" 850 | "object" 851 | "template" 852 ) { 853 return false; 854 } 855 } 856 } 857 false 858 } 859 860 /// Check if the given tag name is "in button scope". 861 fn has_element_in_button_scope(&self, target: &str) -> bool { 862 for &id in self.open_elements.iter().rev() { 863 if let Some(tag) = self.document.tag_name(id) { 864 if tag == target { 865 return true; 866 } 867 // Button scope includes all regular scope barriers plus <button>. 868 if matches!( 869 tag, 870 "applet" 871 | "button" 872 | "caption" 873 | "html" 874 | "table" 875 | "td" 876 | "th" 877 | "marquee" 878 | "object" 879 | "template" 880 ) { 881 return false; 882 } 883 } 884 } 885 false 886 } 887 888 /// Check if any heading element (h1-h6) is in scope. 889 fn has_heading_in_scope(&self) -> bool { 890 for &id in self.open_elements.iter().rev() { 891 if let Some(tag) = self.document.tag_name(id) { 892 if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") { 893 return true; 894 } 895 if matches!( 896 tag, 897 "applet" 898 | "caption" 899 | "html" 900 | "table" 901 | "td" 902 | "th" 903 | "marquee" 904 | "object" 905 | "template" 906 ) { 907 return false; 908 } 909 } 910 } 911 false 912 } 913 914 /// Close a `<p>` element: generate implied end tags (excluding p), 915 /// then pop until we find the `<p>`. 916 fn close_p_element(&mut self) { 917 self.generate_implied_end_tags(Some("p")); 918 self.pop_until("p"); 919 } 920 921 /// Generate implied end tags. If `exclude` is provided, don't generate 922 /// an end tag for that element. 923 fn generate_implied_end_tags(&mut self, exclude: Option<&str>) { 924 loop { 925 let should_pop = self 926 .open_elements 927 .last() 928 .and_then(|&id| self.document.tag_name(id)) 929 .map(|tag| { 930 if let Some(excl) = exclude { 931 if tag == excl { 932 return false; 933 } 934 } 935 matches!( 936 tag, 937 "dd" | "dt" 938 | "li" 939 | "optgroup" 940 | "option" 941 | "p" 942 | "rb" 943 | "rp" 944 | "rt" 945 | "rtc" 946 ) 947 }) 948 .unwrap_or(false); 949 if should_pop { 950 self.open_elements.pop(); 951 } else { 952 break; 953 } 954 } 955 } 956 957 /// Handle a generic end tag by walking back through open elements 958 /// using the "any other end tag" algorithm. 959 fn handle_any_other_end_tag(&mut self, name: &str) { 960 // Walk backwards through the stack. 961 let mut i = self.open_elements.len(); 962 while i > 0 { 963 i -= 1; 964 let id = self.open_elements[i]; 965 if self.document.tag_name(id) == Some(name) { 966 // Pop everything above and including this element. 967 self.open_elements.truncate(i); 968 return; 969 } 970 // If this is a "special" element, stop. 971 if let Some(tag) = self.document.tag_name(id) { 972 if is_special_element(tag) { 973 return; 974 } 975 } 976 } 977 } 978} 979 980impl Default for TreeBuilder { 981 fn default() -> Self { 982 Self::new() 983 } 984} 985 986/// Returns true if the tag is a "special" element per the HTML spec. 987fn is_special_element(tag: &str) -> bool { 988 matches!( 989 tag, 990 "address" 991 | "applet" 992 | "area" 993 | "article" 994 | "aside" 995 | "base" 996 | "basefont" 997 | "bgsound" 998 | "blockquote" 999 | "body" 1000 | "br" 1001 | "button" 1002 | "caption" 1003 | "center" 1004 | "col" 1005 | "colgroup" 1006 | "dd" 1007 | "details" 1008 | "dir" 1009 | "div" 1010 | "dl" 1011 | "dt" 1012 | "embed" 1013 | "fieldset" 1014 | "figcaption" 1015 | "figure" 1016 | "footer" 1017 | "form" 1018 | "frame" 1019 | "frameset" 1020 | "h1" 1021 | "h2" 1022 | "h3" 1023 | "h4" 1024 | "h5" 1025 | "h6" 1026 | "head" 1027 | "header" 1028 | "hgroup" 1029 | "hr" 1030 | "html" 1031 | "iframe" 1032 | "img" 1033 | "input" 1034 | "legend" 1035 | "li" 1036 | "link" 1037 | "listing" 1038 | "main" 1039 | "marquee" 1040 | "menu" 1041 | "meta" 1042 | "nav" 1043 | "noembed" 1044 | "noframes" 1045 | "noscript" 1046 | "object" 1047 | "ol" 1048 | "p" 1049 | "param" 1050 | "plaintext" 1051 | "pre" 1052 | "script" 1053 | "section" 1054 | "select" 1055 | "source" 1056 | "style" 1057 | "summary" 1058 | "table" 1059 | "tbody" 1060 | "td" 1061 | "template" 1062 | "textarea" 1063 | "tfoot" 1064 | "th" 1065 | "thead" 1066 | "title" 1067 | "tr" 1068 | "track" 1069 | "ul" 1070 | "wbr" 1071 | "xmp" 1072 ) 1073} 1074 1075/// Parse an HTML string into a DOM document. 1076/// 1077/// This is a convenience function that tokenizes the input and builds 1078/// a DOM tree using the tree builder. 1079pub fn parse_html(input: &str) -> Document { 1080 let mut builder = TreeBuilder::new(); 1081 let mut tokenizer = Tokenizer::new(input); 1082 loop { 1083 let token = tokenizer.next_token(); 1084 let is_eof = token == Token::Eof; 1085 builder.process_token(token); 1086 if is_eof { 1087 break; 1088 } 1089 } 1090 builder.finish() 1091} 1092 1093#[cfg(test)] 1094mod tests { 1095 use super::*; 1096 use we_dom::NodeData; 1097 1098 /// Helper: collect tag names of direct children of a node. 1099 fn child_tags(doc: &Document, node: NodeId) -> Vec<String> { 1100 doc.children(node) 1101 .filter_map(|id| doc.tag_name(id).map(String::from)) 1102 .collect() 1103 } 1104 1105 /// Helper: get the text content of all text node children, concatenated. 1106 fn text_of_children(doc: &Document, node: NodeId) -> String { 1107 let mut result = String::new(); 1108 for child in doc.children(node) { 1109 if let Some(text) = doc.text_content(child) { 1110 result.push_str(text); 1111 } 1112 } 1113 result 1114 } 1115 1116 #[test] 1117 fn parse_full_document() { 1118 let doc = parse_html( 1119 "<!DOCTYPE html><html><head><title>Test</title></head><body><p>Hello</p></body></html>", 1120 ); 1121 let root = doc.root(); 1122 1123 // Root should have one child: <html> 1124 let html_children: Vec<NodeId> = doc.children(root).collect(); 1125 assert_eq!(html_children.len(), 1); 1126 let html = html_children[0]; 1127 assert_eq!(doc.tag_name(html), Some("html")); 1128 1129 // <html> should have <head> and <body> 1130 let tags = child_tags(&doc, html); 1131 assert_eq!(tags, vec!["head", "body"]); 1132 1133 // <head> should have <title> 1134 let head = doc.children(html).next().unwrap(); 1135 let head_tags = child_tags(&doc, head); 1136 assert_eq!(head_tags, vec!["title"]); 1137 1138 // <title> should contain "Test" 1139 let title = doc.children(head).next().unwrap(); 1140 assert_eq!(text_of_children(&doc, title), "Test"); 1141 1142 // <body> should have <p> 1143 let body = doc.children(html).nth(1).unwrap(); 1144 let body_tags = child_tags(&doc, body); 1145 assert_eq!(body_tags, vec!["p"]); 1146 1147 // <p> should contain "Hello" 1148 let p = doc.children(body).next().unwrap(); 1149 assert_eq!(text_of_children(&doc, p), "Hello"); 1150 } 1151 1152 #[test] 1153 fn implicit_html_head_body() { 1154 // Minimal document: just <p>Hello 1155 let doc = parse_html("<p>Hello"); 1156 let root = doc.root(); 1157 1158 let html: Vec<NodeId> = doc.children(root).collect(); 1159 assert_eq!(html.len(), 1); 1160 assert_eq!(doc.tag_name(html[0]), Some("html")); 1161 1162 let html_tags = child_tags(&doc, html[0]); 1163 assert_eq!(html_tags, vec!["head", "body"]); 1164 1165 let body = doc.children(html[0]).nth(1).unwrap(); 1166 let body_tags = child_tags(&doc, body); 1167 assert_eq!(body_tags, vec!["p"]); 1168 1169 let p = doc.children(body).next().unwrap(); 1170 assert_eq!(text_of_children(&doc, p), "Hello"); 1171 } 1172 1173 #[test] 1174 fn void_element_br() { 1175 let doc = parse_html("<p>Line 1<br>Line 2</p>"); 1176 let root = doc.root(); 1177 let html = doc.children(root).next().unwrap(); 1178 let body = doc.children(html).nth(1).unwrap(); 1179 let p = doc.children(body).next().unwrap(); 1180 1181 // <p> should have: text("Line 1"), <br>, text("Line 2") 1182 let children: Vec<NodeId> = doc.children(p).collect(); 1183 assert_eq!(children.len(), 3); 1184 assert_eq!(doc.text_content(children[0]), Some("Line 1")); 1185 assert_eq!(doc.tag_name(children[1]), Some("br")); 1186 assert_eq!(doc.text_content(children[2]), Some("Line 2")); 1187 } 1188 1189 #[test] 1190 fn p_inside_p_closes_outer() { 1191 let doc = parse_html("<p>First<p>Second"); 1192 let root = doc.root(); 1193 let html = doc.children(root).next().unwrap(); 1194 let body = doc.children(html).nth(1).unwrap(); 1195 1196 // Should have two sibling <p> elements, not nested. 1197 let body_tags = child_tags(&doc, body); 1198 assert_eq!(body_tags, vec!["p", "p"]); 1199 1200 let children: Vec<NodeId> = doc.children(body).collect(); 1201 assert_eq!(text_of_children(&doc, children[0]), "First"); 1202 assert_eq!(text_of_children(&doc, children[1]), "Second"); 1203 } 1204 1205 #[test] 1206 fn nested_div_elements() { 1207 let doc = parse_html("<div><div>inner</div></div>"); 1208 let root = doc.root(); 1209 let html = doc.children(root).next().unwrap(); 1210 let body = doc.children(html).nth(1).unwrap(); 1211 1212 let outer_div = doc.children(body).next().unwrap(); 1213 assert_eq!(doc.tag_name(outer_div), Some("div")); 1214 1215 let inner_div = doc.children(outer_div).next().unwrap(); 1216 assert_eq!(doc.tag_name(inner_div), Some("div")); 1217 assert_eq!(text_of_children(&doc, inner_div), "inner"); 1218 } 1219 1220 #[test] 1221 fn inline_elements_nest_properly() { 1222 let doc = parse_html("<p><span><a href=\"#\">link</a></span></p>"); 1223 let root = doc.root(); 1224 let html = doc.children(root).next().unwrap(); 1225 let body = doc.children(html).nth(1).unwrap(); 1226 1227 let p = doc.children(body).next().unwrap(); 1228 let span = doc.children(p).next().unwrap(); 1229 assert_eq!(doc.tag_name(span), Some("span")); 1230 1231 let a = doc.children(span).next().unwrap(); 1232 assert_eq!(doc.tag_name(a), Some("a")); 1233 assert_eq!(doc.get_attribute(a, "href"), Some("#")); 1234 assert_eq!(text_of_children(&doc, a), "link"); 1235 } 1236 1237 #[test] 1238 fn headings() { 1239 let doc = parse_html("<h1>Title</h1><h2>Subtitle</h2><p>Body text</p>"); 1240 let root = doc.root(); 1241 let html = doc.children(root).next().unwrap(); 1242 let body = doc.children(html).nth(1).unwrap(); 1243 1244 let tags = child_tags(&doc, body); 1245 assert_eq!(tags, vec!["h1", "h2", "p"]); 1246 } 1247 1248 #[test] 1249 fn comment_nodes() { 1250 let doc = parse_html("<body><!-- a comment --><p>text</p></body>"); 1251 let root = doc.root(); 1252 let html = doc.children(root).next().unwrap(); 1253 let body = doc.children(html).nth(1).unwrap(); 1254 1255 let children: Vec<NodeId> = doc.children(body).collect(); 1256 assert!(children.len() >= 2); 1257 1258 // First child should be a comment. 1259 match doc.node_data(children[0]) { 1260 NodeData::Comment { data } => assert_eq!(data, " a comment "), 1261 other => panic!("expected comment, got {:?}", other), 1262 } 1263 } 1264 1265 #[test] 1266 fn pre_element() { 1267 let doc = parse_html("<pre>line 1\nline 2</pre>"); 1268 let root = doc.root(); 1269 let html = doc.children(root).next().unwrap(); 1270 let body = doc.children(html).nth(1).unwrap(); 1271 1272 let pre = doc.children(body).next().unwrap(); 1273 assert_eq!(doc.tag_name(pre), Some("pre")); 1274 assert_eq!(text_of_children(&doc, pre), "line 1\nline 2"); 1275 } 1276 1277 #[test] 1278 fn attributes_preserved() { 1279 let doc = 1280 parse_html("<div id=\"main\" class=\"container\"><a href=\"/page\">link</a></div>"); 1281 let root = doc.root(); 1282 let html = doc.children(root).next().unwrap(); 1283 let body = doc.children(html).nth(1).unwrap(); 1284 1285 let div = doc.children(body).next().unwrap(); 1286 assert_eq!(doc.get_attribute(div, "id"), Some("main")); 1287 assert_eq!(doc.get_attribute(div, "class"), Some("container")); 1288 1289 let a = doc.children(div).next().unwrap(); 1290 assert_eq!(doc.get_attribute(a, "href"), Some("/page")); 1291 } 1292 1293 #[test] 1294 fn empty_document() { 1295 let doc = parse_html(""); 1296 let root = doc.root(); 1297 // Even an empty doc should get html/head/body from EOF handling. 1298 // The tree builder creates implicit elements. 1299 assert!(doc.children(root).next().is_some()); 1300 } 1301 1302 #[test] 1303 fn just_text() { 1304 let doc = parse_html("Hello, world!"); 1305 let root = doc.root(); 1306 let html = doc.children(root).next().unwrap(); 1307 let body = doc.children(html).nth(1).unwrap(); 1308 1309 assert_eq!(text_of_children(&doc, body), "Hello, world!"); 1310 } 1311 1312 #[test] 1313 fn heading_closes_open_p() { 1314 let doc = parse_html("<p>text<h1>heading</h1>"); 1315 let root = doc.root(); 1316 let html = doc.children(root).next().unwrap(); 1317 let body = doc.children(html).nth(1).unwrap(); 1318 1319 // <p> should be closed by <h1>, so they're siblings. 1320 let tags = child_tags(&doc, body); 1321 assert_eq!(tags, vec!["p", "h1"]); 1322 } 1323 1324 #[test] 1325 fn self_closing_void_elements() { 1326 let doc = parse_html("<p>before<br/>after</p>"); 1327 let root = doc.root(); 1328 let html = doc.children(root).next().unwrap(); 1329 let body = doc.children(html).nth(1).unwrap(); 1330 let p = doc.children(body).next().unwrap(); 1331 1332 let children: Vec<NodeId> = doc.children(p).collect(); 1333 assert_eq!(children.len(), 3); 1334 assert_eq!(doc.tag_name(children[1]), Some("br")); 1335 } 1336 1337 #[test] 1338 fn doctype_is_handled() { 1339 let doc = parse_html("<!DOCTYPE html><html><body></body></html>"); 1340 let root = doc.root(); 1341 let html = doc.children(root).next().unwrap(); 1342 assert_eq!(doc.tag_name(html), Some("html")); 1343 } 1344 1345 #[test] 1346 fn tree_builder_step_by_step() { 1347 let mut builder = TreeBuilder::new(); 1348 builder.process_token(Token::Doctype { 1349 name: Some("html".into()), 1350 public_id: None, 1351 system_id: None, 1352 force_quirks: false, 1353 }); 1354 builder.process_token(Token::StartTag { 1355 name: "html".into(), 1356 attributes: vec![], 1357 self_closing: false, 1358 }); 1359 builder.process_token(Token::StartTag { 1360 name: "head".into(), 1361 attributes: vec![], 1362 self_closing: false, 1363 }); 1364 builder.process_token(Token::EndTag { 1365 name: "head".into(), 1366 }); 1367 builder.process_token(Token::StartTag { 1368 name: "body".into(), 1369 attributes: vec![], 1370 self_closing: false, 1371 }); 1372 builder.process_token(Token::StartTag { 1373 name: "p".into(), 1374 attributes: vec![], 1375 self_closing: false, 1376 }); 1377 builder.process_token(Token::Character("Hello".into())); 1378 builder.process_token(Token::EndTag { name: "p".into() }); 1379 builder.process_token(Token::EndTag { 1380 name: "body".into(), 1381 }); 1382 builder.process_token(Token::EndTag { 1383 name: "html".into(), 1384 }); 1385 builder.process_token(Token::Eof); 1386 1387 let doc = builder.finish(); 1388 let root = doc.root(); 1389 let html = doc.children(root).next().unwrap(); 1390 assert_eq!(doc.tag_name(html), Some("html")); 1391 1392 let body = doc.children(html).nth(1).unwrap(); 1393 let p = doc.children(body).next().unwrap(); 1394 assert_eq!(text_of_children(&doc, p), "Hello"); 1395 } 1396 1397 #[test] 1398 fn multiple_text_children_merge() { 1399 // When consecutive character tokens arrive, they should merge. 1400 let mut builder = TreeBuilder::new(); 1401 builder.process_token(Token::StartTag { 1402 name: "p".into(), 1403 attributes: vec![], 1404 self_closing: false, 1405 }); 1406 builder.process_token(Token::Character("Hello ".into())); 1407 builder.process_token(Token::Character("world".into())); 1408 builder.process_token(Token::EndTag { name: "p".into() }); 1409 builder.process_token(Token::Eof); 1410 1411 let doc = builder.finish(); 1412 let root = doc.root(); 1413 let html = doc.children(root).next().unwrap(); 1414 let body = doc.children(html).nth(1).unwrap(); 1415 let p = doc.children(body).next().unwrap(); 1416 1417 // Should be a single text node. 1418 let children: Vec<NodeId> = doc.children(p).collect(); 1419 assert_eq!(children.len(), 1); 1420 assert_eq!(doc.text_content(children[0]), Some("Hello world")); 1421 } 1422 1423 #[test] 1424 fn parse_inline_svg() { 1425 let doc = parse_html( 1426 "<html><body><svg width=\"100\" height=\"100\"><rect width=\"50\" height=\"50\" fill=\"red\"/></svg></body></html>", 1427 ); 1428 let root = doc.root(); 1429 let html = doc.children(root).next().unwrap(); 1430 let body = doc.children(html).nth(1).unwrap(); 1431 let svg = doc.children(body).next().unwrap(); 1432 1433 // SVG element should have SVG namespace. 1434 if let NodeData::Element { 1435 ref namespace, 1436 ref tag_name, 1437 .. 1438 } = *doc.node_data(svg) 1439 { 1440 assert_eq!(tag_name, "svg"); 1441 assert_eq!(namespace.as_deref(), Some(SVG_NAMESPACE)); 1442 } else { 1443 panic!("Expected Element node"); 1444 } 1445 1446 // SVG should have width/height attributes. 1447 assert_eq!(doc.get_attribute(svg, "width"), Some("100")); 1448 assert_eq!(doc.get_attribute(svg, "height"), Some("100")); 1449 1450 // Rect child should also have SVG namespace. 1451 let rect = doc.children(svg).next().unwrap(); 1452 if let NodeData::Element { 1453 ref namespace, 1454 ref tag_name, 1455 .. 1456 } = *doc.node_data(rect) 1457 { 1458 assert_eq!(tag_name, "rect"); 1459 assert_eq!(namespace.as_deref(), Some(SVG_NAMESPACE)); 1460 } else { 1461 panic!("Expected Element node"); 1462 } 1463 assert_eq!(doc.get_attribute(rect, "fill"), Some("red")); 1464 } 1465 1466 #[test] 1467 fn parse_svg_with_nested_elements() { 1468 let doc = parse_html( 1469 "<body><svg width=\"200\" height=\"200\"><g><circle cx=\"50\" cy=\"50\" r=\"40\"/><text x=\"10\" y=\"80\">Hello</text></g></svg></body>", 1470 ); 1471 let root = doc.root(); 1472 let html = doc.children(root).next().unwrap(); 1473 let body = doc.children(html).nth(1).unwrap(); 1474 let svg = doc.children(body).next().unwrap(); 1475 1476 assert_eq!(doc.tag_name(svg), Some("svg")); 1477 let g = doc.children(svg).next().unwrap(); 1478 assert_eq!(doc.tag_name(g), Some("g")); 1479 1480 let children: Vec<String> = child_tags(&doc, g); 1481 assert_eq!(children, vec!["circle", "text"]); 1482 1483 // Text element should contain text content. 1484 let text_el = doc.children(g).nth(1).unwrap(); 1485 assert_eq!(doc.deep_text_content(text_el), "Hello"); 1486 } 1487 1488 #[test] 1489 fn svg_content_followed_by_html() { 1490 let doc = parse_html( 1491 "<body><svg width=\"50\" height=\"50\"><rect fill=\"blue\"/></svg><p>After SVG</p></body>", 1492 ); 1493 let root = doc.root(); 1494 let html = doc.children(root).next().unwrap(); 1495 let body = doc.children(html).nth(1).unwrap(); 1496 1497 let children: Vec<String> = child_tags(&doc, body); 1498 assert_eq!(children, vec!["svg", "p"]); 1499 1500 // SVG children should be in SVG namespace. 1501 let svg = doc.children(body).next().unwrap(); 1502 let rect = doc.children(svg).next().unwrap(); 1503 if let NodeData::Element { ref namespace, .. } = *doc.node_data(rect) { 1504 assert_eq!(namespace.as_deref(), Some(SVG_NAMESPACE)); 1505 } 1506 1507 // Paragraph after SVG should be in HTML namespace (no namespace). 1508 let p = doc.children(body).nth(1).unwrap(); 1509 if let NodeData::Element { ref namespace, .. } = *doc.node_data(p) { 1510 assert_eq!(namespace.as_deref(), None); 1511 } 1512 } 1513 1514 // --- Form element parsing tests (Phase 16) --- 1515 1516 #[test] 1517 fn parse_form_with_inputs() { 1518 let doc = parse_html( 1519 r#"<form action="/submit" method="post"><input type="text" name="user"><input type="password" name="pass"><button type="submit">Login</button></form>"#, 1520 ); 1521 let root = doc.root(); 1522 let html = doc.children(root).next().unwrap(); 1523 let body = doc.children(html).nth(1).unwrap(); 1524 let form = doc.children(body).next().unwrap(); 1525 1526 assert_eq!(doc.tag_name(form), Some("form")); 1527 assert_eq!(doc.get_attribute(form, "action"), Some("/submit")); 1528 assert_eq!(doc.get_attribute(form, "method"), Some("post")); 1529 1530 let tags = child_tags(&doc, form); 1531 assert_eq!(tags, vec!["input", "input", "button"]); 1532 1533 // Check input attributes. 1534 let children: Vec<NodeId> = doc.children(form).collect(); 1535 assert_eq!(doc.get_attribute(children[0], "type"), Some("text")); 1536 assert_eq!(doc.get_attribute(children[0], "name"), Some("user")); 1537 assert_eq!(doc.get_attribute(children[1], "type"), Some("password")); 1538 assert_eq!(doc.get_attribute(children[1], "name"), Some("pass")); 1539 1540 // Button contains text. 1541 assert_eq!(doc.get_attribute(children[2], "type"), Some("submit")); 1542 assert_eq!(text_of_children(&doc, children[2]), "Login"); 1543 } 1544 1545 #[test] 1546 fn parse_textarea() { 1547 let doc = parse_html( 1548 r#"<form><textarea name="bio" rows="4" cols="50">Default text here</textarea></form>"#, 1549 ); 1550 let root = doc.root(); 1551 let html = doc.children(root).next().unwrap(); 1552 let body = doc.children(html).nth(1).unwrap(); 1553 let form = doc.children(body).next().unwrap(); 1554 let textarea = doc.children(form).next().unwrap(); 1555 1556 assert_eq!(doc.tag_name(textarea), Some("textarea")); 1557 assert_eq!(doc.get_attribute(textarea, "name"), Some("bio")); 1558 assert_eq!(doc.get_attribute(textarea, "rows"), Some("4")); 1559 assert_eq!(doc.get_attribute(textarea, "cols"), Some("50")); 1560 assert_eq!(text_of_children(&doc, textarea), "Default text here"); 1561 } 1562 1563 #[test] 1564 fn parse_select_with_options() { 1565 let doc = parse_html( 1566 r#"<form><select name="color"><option value="r">Red</option><option value="g" selected>Green</option><option value="b">Blue</option></select></form>"#, 1567 ); 1568 let root = doc.root(); 1569 let html = doc.children(root).next().unwrap(); 1570 let body = doc.children(html).nth(1).unwrap(); 1571 let form = doc.children(body).next().unwrap(); 1572 let select = doc.children(form).next().unwrap(); 1573 1574 assert_eq!(doc.tag_name(select), Some("select")); 1575 assert_eq!(doc.get_attribute(select, "name"), Some("color")); 1576 1577 let options: Vec<NodeId> = doc.children(select).collect(); 1578 assert_eq!(options.len(), 3); 1579 assert_eq!(doc.get_attribute(options[0], "value"), Some("r")); 1580 assert_eq!(text_of_children(&doc, options[0]), "Red"); 1581 assert_eq!(doc.get_attribute(options[1], "value"), Some("g")); 1582 assert_eq!(doc.get_attribute(options[1], "selected"), Some("")); 1583 assert_eq!(doc.get_attribute(options[2], "value"), Some("b")); 1584 } 1585 1586 #[test] 1587 fn parse_select_with_optgroups() { 1588 let doc = parse_html( 1589 r#"<select><optgroup label="Primary"><option>Red</option><option>Blue</option></optgroup><optgroup label="Secondary"><option>Orange</option></optgroup></select>"#, 1590 ); 1591 let root = doc.root(); 1592 let html = doc.children(root).next().unwrap(); 1593 let body = doc.children(html).nth(1).unwrap(); 1594 let select = doc.children(body).next().unwrap(); 1595 1596 assert_eq!(doc.tag_name(select), Some("select")); 1597 let groups = child_tags(&doc, select); 1598 assert_eq!(groups, vec!["optgroup", "optgroup"]); 1599 1600 let group1 = doc.children(select).next().unwrap(); 1601 assert_eq!(doc.get_attribute(group1, "label"), Some("Primary")); 1602 let options: Vec<String> = child_tags(&doc, group1); 1603 assert_eq!(options, vec!["option", "option"]); 1604 } 1605 1606 #[test] 1607 fn parse_fieldset_and_legend() { 1608 let doc = parse_html( 1609 r#"<form><fieldset><legend>Personal Info</legend><input type="text" name="name"><input type="email" name="email"></fieldset></form>"#, 1610 ); 1611 let root = doc.root(); 1612 let html = doc.children(root).next().unwrap(); 1613 let body = doc.children(html).nth(1).unwrap(); 1614 let form = doc.children(body).next().unwrap(); 1615 let fieldset = doc.children(form).next().unwrap(); 1616 1617 assert_eq!(doc.tag_name(fieldset), Some("fieldset")); 1618 1619 let fieldset_tags = child_tags(&doc, fieldset); 1620 assert_eq!(fieldset_tags, vec!["legend", "input", "input"]); 1621 1622 let legend = doc.children(fieldset).next().unwrap(); 1623 assert_eq!(text_of_children(&doc, legend), "Personal Info"); 1624 } 1625 1626 #[test] 1627 fn parse_label_with_for_attribute() { 1628 let doc = parse_html( 1629 r#"<form><label for="name">Name:</label><input type="text" id="name" name="name"></form>"#, 1630 ); 1631 let root = doc.root(); 1632 let html = doc.children(root).next().unwrap(); 1633 let body = doc.children(html).nth(1).unwrap(); 1634 let form = doc.children(body).next().unwrap(); 1635 1636 let tags = child_tags(&doc, form); 1637 assert_eq!(tags, vec!["label", "input"]); 1638 1639 let label = doc.children(form).next().unwrap(); 1640 assert_eq!(doc.get_attribute(label, "for"), Some("name")); 1641 assert_eq!(text_of_children(&doc, label), "Name:"); 1642 1643 // Verify label_control resolves via `for` attribute. 1644 let input = doc.children(form).nth(1).unwrap(); 1645 assert_eq!(doc.label_control(label), Some(input)); 1646 } 1647 1648 #[test] 1649 fn parse_label_implicit_association() { 1650 let doc = parse_html(r#"<label>Name: <input type="text" name="name"></label>"#); 1651 let root = doc.root(); 1652 let html = doc.children(root).next().unwrap(); 1653 let body = doc.children(html).nth(1).unwrap(); 1654 let label = doc.children(body).next().unwrap(); 1655 1656 assert_eq!(doc.tag_name(label), Some("label")); 1657 let input = doc.label_control(label).unwrap(); 1658 assert_eq!(doc.tag_name(input), Some("input")); 1659 assert_eq!(doc.get_attribute(input, "name"), Some("name")); 1660 } 1661 1662 #[test] 1663 fn form_closes_p_in_button_scope() { 1664 let doc = parse_html("<p>text<form><input></form>"); 1665 let root = doc.root(); 1666 let html = doc.children(root).next().unwrap(); 1667 let body = doc.children(html).nth(1).unwrap(); 1668 1669 // <p> should be closed by <form>, so they're siblings. 1670 let tags = child_tags(&doc, body); 1671 assert_eq!(tags, vec!["p", "form"]); 1672 } 1673 1674 #[test] 1675 fn nested_form_is_ignored() { 1676 let doc = parse_html("<form id=\"outer\"><form id=\"inner\"><input></form></form>"); 1677 let root = doc.root(); 1678 let html = doc.children(root).next().unwrap(); 1679 let body = doc.children(html).nth(1).unwrap(); 1680 1681 // Only one <form> should exist (nested form is ignored). 1682 let tags = child_tags(&doc, body); 1683 assert_eq!(tags, vec!["form"]); 1684 1685 let form = doc.children(body).next().unwrap(); 1686 assert_eq!(doc.get_attribute(form, "id"), Some("outer")); 1687 1688 // Input should be a child of the outer form. 1689 let form_tags = child_tags(&doc, form); 1690 assert_eq!(form_tags, vec!["input"]); 1691 } 1692 1693 #[test] 1694 fn button_scope_handling() { 1695 let doc = parse_html("<button>First</button><button>Second</button>"); 1696 let root = doc.root(); 1697 let html = doc.children(root).next().unwrap(); 1698 let body = doc.children(html).nth(1).unwrap(); 1699 1700 let tags = child_tags(&doc, body); 1701 assert_eq!(tags, vec!["button", "button"]); 1702 } 1703 1704 #[test] 1705 fn form_elements_collection() { 1706 let doc = parse_html( 1707 r#"<form><input name="a"><div><input name="b"></div><select name="c"><option>X</option></select><textarea name="d"></textarea></form>"#, 1708 ); 1709 let root = doc.root(); 1710 let html = doc.children(root).next().unwrap(); 1711 let body = doc.children(html).nth(1).unwrap(); 1712 let form = doc.children(body).next().unwrap(); 1713 1714 let elements = doc.form_elements(form); 1715 assert_eq!(elements.len(), 4); 1716 1717 // Verify they're the right elements. 1718 assert_eq!(doc.get_attribute(elements[0], "name"), Some("a")); 1719 assert_eq!(doc.get_attribute(elements[1], "name"), Some("b")); 1720 assert_eq!(doc.get_attribute(elements[2], "name"), Some("c")); 1721 assert_eq!(doc.get_attribute(elements[3], "name"), Some("d")); 1722 } 1723 1724 #[test] 1725 fn form_owner_from_parsed_tree() { 1726 let doc = parse_html(r#"<form id="f"><div><input id="i"></div></form>"#); 1727 let root = doc.root(); 1728 let html = doc.children(root).next().unwrap(); 1729 let body = doc.children(html).nth(1).unwrap(); 1730 let form = doc.children(body).next().unwrap(); 1731 1732 let input = doc.get_element_by_id("i").unwrap(); 1733 assert_eq!(doc.form_owner(input), Some(form)); 1734 } 1735 1736 #[test] 1737 fn parse_input_types() { 1738 let doc = parse_html( 1739 r#"<form> 1740 <input type="text"> 1741 <input type="password"> 1742 <input type="checkbox" checked> 1743 <input type="radio" name="choice" value="a"> 1744 <input type="submit" value="Go"> 1745 <input type="reset"> 1746 <input type="hidden" name="token" value="abc"> 1747 <input type="number" min="0" max="100"> 1748 <input type="email"> 1749 <input type="url"> 1750 <input type="search"> 1751 <input type="tel"> 1752 </form>"#, 1753 ); 1754 let root = doc.root(); 1755 let html = doc.children(root).next().unwrap(); 1756 let body = doc.children(html).nth(1).unwrap(); 1757 let form = doc.children(body).next().unwrap(); 1758 1759 let inputs: Vec<NodeId> = doc 1760 .children(form) 1761 .filter(|&id| doc.tag_name(id) == Some("input")) 1762 .collect(); 1763 assert_eq!(inputs.len(), 12); 1764 1765 assert_eq!(doc.get_attribute(inputs[0], "type"), Some("text")); 1766 assert_eq!(doc.get_attribute(inputs[1], "type"), Some("password")); 1767 assert_eq!(doc.get_attribute(inputs[2], "type"), Some("checkbox")); 1768 assert_eq!(doc.get_attribute(inputs[2], "checked"), Some("")); 1769 assert_eq!(doc.get_attribute(inputs[3], "type"), Some("radio")); 1770 assert_eq!(doc.get_attribute(inputs[3], "name"), Some("choice")); 1771 assert_eq!(doc.get_attribute(inputs[4], "type"), Some("submit")); 1772 assert_eq!(doc.get_attribute(inputs[4], "value"), Some("Go")); 1773 assert_eq!(doc.get_attribute(inputs[5], "type"), Some("reset")); 1774 assert_eq!(doc.get_attribute(inputs[6], "type"), Some("hidden")); 1775 assert_eq!(doc.get_attribute(inputs[6], "value"), Some("abc")); 1776 assert_eq!(doc.get_attribute(inputs[7], "type"), Some("number")); 1777 assert_eq!(doc.get_attribute(inputs[7], "min"), Some("0")); 1778 assert_eq!(doc.get_attribute(inputs[7], "max"), Some("100")); 1779 assert_eq!(doc.get_attribute(inputs[8], "type"), Some("email")); 1780 assert_eq!(doc.get_attribute(inputs[9], "type"), Some("url")); 1781 assert_eq!(doc.get_attribute(inputs[10], "type"), Some("search")); 1782 assert_eq!(doc.get_attribute(inputs[11], "type"), Some("tel")); 1783 } 1784 1785 #[test] 1786 fn select_closes_on_input() { 1787 // An <input> inside a <select> should close the select. 1788 let doc = parse_html("<select><option>A</option><input type=\"text\"></select>"); 1789 let root = doc.root(); 1790 let html = doc.children(root).next().unwrap(); 1791 let body = doc.children(html).nth(1).unwrap(); 1792 1793 let tags = child_tags(&doc, body); 1794 // Select should be closed before input, making them siblings. 1795 assert_eq!(tags, vec!["select", "input"]); 1796 } 1797}