we (web engine): Experimental web browser project to understand the limits of Claude
1//! HTML5 tokenizer and tree builder.
2//!
3//! Implements the WHATWG HTML5 tokenizer state machine (§13.2.5)
4//! and a simplified tree builder for constructing DOM trees from tokens.
5
6mod entities;
7pub mod speculative;
8mod tokenizer;
9mod tree_builder;
10
11pub use tokenizer::Tokenizer;
12pub use tree_builder::{parse_html, TreeBuilder};
13
/// A token emitted by the HTML tokenizer.
///
/// Every payload is plain owned data (`String`, `Option<String>`, `bool`,
/// `Vec`), so equality is total and the type derives `Eq` alongside
/// `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// `<!DOCTYPE name public_id system_id>`
    Doctype {
        /// DOCTYPE name, or `None` when no name was recorded.
        name: Option<String>,
        /// Public identifier, or `None` when absent.
        public_id: Option<String>,
        /// System identifier, or `None` when absent.
        system_id: Option<String>,
        /// Force-quirks flag carried by the DOCTYPE token.
        force_quirks: bool,
    },
    /// `<tag attr="val">`
    StartTag {
        /// Tag name.
        name: String,
        /// Attribute pairs, read as `(name, value)`.
        attributes: Vec<(String, String)>,
        /// Whether the tag was flagged as self-closing.
        self_closing: bool,
    },
    /// `</tag>`
    EndTag {
        /// Tag name.
        name: String,
    },
    /// Character data (may be coalesced).
    Character(String),
    /// `<!-- comment -->`
    Comment(String),
    /// End of file.
    Eof,
}
39
40/// Tokenize an HTML input string into a sequence of tokens.
41///
42/// Runs the HTML5 tokenizer state machine and returns all emitted tokens
43/// (excluding Eof). Adjacent Character tokens are coalesced.
44pub fn tokenize(input: &str) -> Vec<Token> {
45 let mut tok = Tokenizer::new(input);
46 let mut tokens = Vec::new();
47 loop {
48 let token = tok.next_token();
49 match token {
50 Token::Eof => break,
51 Token::Character(ref s) => {
52 // Coalesce adjacent character tokens.
53 if let Some(Token::Character(ref mut prev)) = tokens.last_mut() {
54 prev.push_str(s);
55 } else {
56 tokens.push(token);
57 }
58 }
59 _ => tokens.push(token),
60 }
61 }
62 tokens
63}