we (web engine): Experimental web browser project to understand the limits of Claude
2
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 63 lines 1.8 kB view raw
1//! HTML5 tokenizer and tree builder. 2//! 3//! Implements the WHATWG HTML5 tokenizer state machine (§13.2.5) 4//! and a simplified tree builder for constructing DOM trees from tokens. 5 6mod entities; 7pub mod speculative; 8mod tokenizer; 9mod tree_builder; 10 11pub use tokenizer::Tokenizer; 12pub use tree_builder::{parse_html, TreeBuilder}; 13 14/// A token emitted by the HTML tokenizer. 15#[derive(Debug, Clone, PartialEq)] 16pub enum Token { 17 /// `<!DOCTYPE name public_id system_id>` 18 Doctype { 19 name: Option<String>, 20 public_id: Option<String>, 21 system_id: Option<String>, 22 force_quirks: bool, 23 }, 24 /// `<tag attr="val">` 25 StartTag { 26 name: String, 27 attributes: Vec<(String, String)>, 28 self_closing: bool, 29 }, 30 /// `</tag>` 31 EndTag { name: String }, 32 /// Character data (may be coalesced). 33 Character(String), 34 /// `<!-- comment -->` 35 Comment(String), 36 /// End of file. 37 Eof, 38} 39 40/// Tokenize an HTML input string into a sequence of tokens. 41/// 42/// Runs the HTML5 tokenizer state machine and returns all emitted tokens 43/// (excluding Eof). Adjacent Character tokens are coalesced. 44pub fn tokenize(input: &str) -> Vec<Token> { 45 let mut tok = Tokenizer::new(input); 46 let mut tokens = Vec::new(); 47 loop { 48 let token = tok.next_token(); 49 match token { 50 Token::Eof => break, 51 Token::Character(ref s) => { 52 // Coalesce adjacent character tokens. 53 if let Some(Token::Character(ref mut prev)) = tokens.last_mut() { 54 prev.push_str(s); 55 } else { 56 tokens.push(token); 57 } 58 } 59 _ => tokens.push(token), 60 } 61 } 62 tokens 63}