we (web engine): Experimental web browser project to understand the limits of Claude
2
fork

Configure Feed

Select the types of activity you want to include in your feed.

Implement speculative HTML parsing with off-main-thread preload scanning

Add a speculative parser that runs on a background thread to discover
preloadable resources while the main parser is blocked on script execution.

- PreloadScanner: lightweight state machine that identifies <link rel=stylesheet>,
<script src>, <img src>, and <link rel=preload> tags
- SpeculativeParser: manages background thread, mpsc channels, token buffering,
and invalidation on document.write
- Handles inline script content correctly (enters ScriptData mode)
- Skips HTML comments to avoid false positives
- Supports double-quoted, single-quoted, and unquoted attribute values
- 22 new tests covering resource discovery, speculation hit/miss, invalidation,
and token replay

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

+1017
+1
crates/html/src/lib.rs
··· 4 4 //! and a simplified tree builder for constructing DOM trees from tokens. 5 5 6 6 mod entities; 7 + pub mod speculative; 7 8 mod tokenizer; 8 9 mod tree_builder; 9 10
+1016
crates/html/src/speculative.rs
··· 1 + //! Speculative HTML parsing: off-main-thread preload scanning. 2 + //! 3 + //! When the main parser blocks on script execution, a speculative tokenizer 4 + //! runs on a background thread to discover preloadable resources (`<link>`, 5 + //! `<script src>`, `<img>`, `<link rel="preload">`). Discovered URLs are 6 + //! sent back so the resource loader can start fetching them early. 7 + //! 8 + //! The speculative tokenizer also buffers the tokens it produces. If 9 + //! `document.write` does not inject content, the main parser can reuse 10 + //! those tokens instead of re-tokenizing. If `document.write` *does* 11 + //! inject content, speculative results from that point forward are 12 + //! discarded and the main parser re-tokenizes. 13 + 14 + use crate::Token; 15 + use std::sync::mpsc; 16 + use std::thread; 17 + 18 + /// The kind of resource discovered by speculative scanning. 19 + #[derive(Debug, Clone, Copy, PartialEq, Eq)] 20 + pub enum ResourceType { 21 + /// `<link rel="stylesheet" href="...">` 22 + Stylesheet, 23 + /// `<script src="...">` 24 + Script, 25 + /// `<img src="...">` 26 + Image, 27 + /// `<link rel="preload" href="..." as="...">` 28 + Preload, 29 + } 30 + 31 + /// A preloadable resource URL discovered by the speculative scanner. 32 + #[derive(Debug, Clone, PartialEq, Eq)] 33 + pub struct PreloadUrl { 34 + /// The URL to preload. 35 + pub url: String, 36 + /// The type of resource. 37 + pub resource_type: ResourceType, 38 + /// For preload hints, the `as` attribute value (e.g. "script", "style"). 39 + pub as_attr: Option<String>, 40 + } 41 + 42 + /// State machine for the simplified preload scanner. 43 + /// 44 + /// This is much simpler than the full HTML5 tokenizer: it only needs to 45 + /// recognise opening tags and extract `src`, `href`, `rel`, and `as` 46 + /// attributes from resource-bearing elements. 47 + #[derive(Debug, Clone, Copy, PartialEq)] 48 + enum ScanState { 49 + /// Scanning text content, looking for `<`. 50 + Data, 51 + /// Just saw `<`, deciding what kind of tag. 52 + TagOpen, 53 + /// Inside `</...>` end tag — skip until `>`. 54 + EndTag, 55 + /// Accumulating a tag name. 56 + TagName, 57 + /// Between attributes, waiting for name or `>`. 58 + BeforeAttr, 59 + /// Accumulating an attribute name. 60 + AttrName, 61 + /// After attribute name, before `=` or next attribute. 62 + AfterAttrName, 63 + /// After `=`, before attribute value. 64 + BeforeAttrValue, 65 + /// Inside a double-quoted attribute value. 66 + AttrValueDQ, 67 + /// Inside a single-quoted attribute value. 68 + AttrValueSQ, 69 + /// Inside an unquoted attribute value. 70 + AttrValueUQ, 71 + /// After a quoted attribute value (before whitespace/`>`). 72 + AfterAttrValue, 73 + /// Inside `<!--` comment, waiting for `-->`. 74 + Comment, 75 + /// Saw `-` inside comment. 76 + CommentDash, 77 + /// Saw `--` inside comment. 78 + CommentDashDash, 79 + /// Inside `<script>` raw text — skip until `</script>`. 80 + ScriptData, 81 + /// Saw `<` inside script data. 82 + ScriptDataLt, 83 + /// Accumulating potential `/script>` end tag inside script data. 84 + ScriptDataEndTag, 85 + } 86 + 87 + /// Lightweight preload scanner that extracts resource URLs from HTML. 88 + /// 89 + /// This is intentionally simplified: it does not handle all the edge cases 90 + /// of the full HTML5 tokenizer. It is conservative — it may miss some 91 + /// resources, but it will not produce false positives that cause incorrect 92 + /// behaviour. 93 + pub struct PreloadScanner { 94 + input: Vec<char>, 95 + pos: usize, 96 + state: ScanState, 97 + /// The tag name being accumulated. 98 + tag_name: String, 99 + /// Current attribute name being accumulated. 100 + attr_name: String, 101 + /// Current attribute value being accumulated. 102 + attr_value: String, 103 + /// Collected attributes for the current tag. 104 + attrs: Vec<(String, String)>, 105 + /// Buffer for script end tag detection. 106 + end_tag_buf: String, 107 + /// Discovered preload URLs. 108 + preloads: Vec<PreloadUrl>, 109 + } 110 + 111 + impl PreloadScanner { 112 + /// Create a new preload scanner for the given input. 113 + pub fn new(input: &str) -> Self { 114 + PreloadScanner { 115 + input: input.chars().collect(), 116 + pos: 0, 117 + state: ScanState::Data, 118 + tag_name: String::new(), 119 + attr_name: String::new(), 120 + attr_value: String::new(), 121 + attrs: Vec::new(), 122 + end_tag_buf: String::new(), 123 + preloads: Vec::new(), 124 + } 125 + } 126 + 127 + /// Run the scanner to completion, returning discovered preload URLs. 128 + pub fn scan(&mut self) -> Vec<PreloadUrl> { 129 + while self.pos < self.input.len() { 130 + self.step(); 131 + } 132 + std::mem::take(&mut self.preloads) 133 + } 134 + 135 + /// Also produce a buffered token stream using the full tokenizer. 136 + /// 137 + /// This runs the full HTML5 tokenizer on the same input in addition to 138 + /// the preload scan, buffering all tokens for potential reuse. 139 + pub fn scan_with_tokens(&mut self) -> (Vec<PreloadUrl>, Vec<Token>) { 140 + // Run the preload scan. 141 + let preloads = self.scan(); 142 + 143 + // Run the full tokenizer to buffer tokens. 144 + let input_str: String = self.input.iter().collect(); 145 + let mut tokenizer = crate::Tokenizer::new(&input_str); 146 + let mut tokens = Vec::new(); 147 + loop { 148 + let token = tokenizer.next_token(); 149 + if token == Token::Eof { 150 + break; 151 + } 152 + tokens.push(token); 153 + } 154 + 155 + (preloads, tokens) 156 + } 157 + 158 + /// Advance the scanner by one character. 159 + fn step(&mut self) { 160 + let ch = self.input[self.pos]; 161 + match self.state { 162 + ScanState::Data => self.scan_data(ch), 163 + ScanState::TagOpen => self.scan_tag_open(ch), 164 + ScanState::EndTag => self.scan_end_tag(ch), 165 + ScanState::TagName => self.scan_tag_name(ch), 166 + ScanState::BeforeAttr => self.scan_before_attr(ch), 167 + ScanState::AttrName => self.scan_attr_name(ch), 168 + ScanState::AfterAttrName => self.scan_after_attr_name(ch), 169 + ScanState::BeforeAttrValue => self.scan_before_attr_value(ch), 170 + ScanState::AttrValueDQ => self.scan_attr_value_dq(ch), 171 + ScanState::AttrValueSQ => self.scan_attr_value_sq(ch), 172 + ScanState::AttrValueUQ => self.scan_attr_value_uq(ch), 173 + ScanState::AfterAttrValue => self.scan_after_attr_value(ch), 174 + ScanState::Comment => self.scan_comment(ch), 175 + ScanState::CommentDash => self.scan_comment_dash(ch), 176 + ScanState::CommentDashDash => self.scan_comment_dash_dash(ch), 177 + ScanState::ScriptData => self.scan_script_data(ch), 178 + ScanState::ScriptDataLt => self.scan_script_data_lt(ch), 179 + ScanState::ScriptDataEndTag => self.scan_script_data_end_tag(ch), 180 + } 181 + } 182 + 183 + fn scan_data(&mut self, ch: char) { 184 + if ch == '<' { 185 + self.state = ScanState::TagOpen; 186 + } 187 + self.pos += 1; 188 + } 189 + 190 + fn scan_tag_open(&mut self, ch: char) { 191 + if ch == '/' { 192 + self.state = ScanState::EndTag; 193 + self.end_tag_buf.clear(); 194 + self.pos += 1; 195 + } else if ch == '!' { 196 + // Could be comment: `<!--` 197 + if self.pos + 2 < self.input.len() 198 + && self.input[self.pos + 1] == '-' 199 + && self.input[self.pos + 2] == '-' 200 + { 201 + self.state = ScanState::Comment; 202 + self.pos += 3; 203 + } else { 204 + // DOCTYPE or other — skip back to data. 205 + self.state = ScanState::Data; 206 + self.pos += 1; 207 + } 208 + } else if ch.is_ascii_alphabetic() { 209 + self.tag_name.clear(); 210 + self.attrs.clear(); 211 + self.tag_name.push(ch.to_ascii_lowercase()); 212 + self.state = ScanState::TagName; 213 + self.pos += 1; 214 + } else { 215 + self.state = ScanState::Data; 216 + self.pos += 1; 217 + } 218 + } 219 + 220 + fn scan_end_tag(&mut self, ch: char) { 221 + if ch == '>' { 222 + self.state = ScanState::Data; 223 + } 224 + self.pos += 1; 225 + } 226 + 227 + fn scan_tag_name(&mut self, ch: char) { 228 + if ch.is_ascii_whitespace() { 229 + self.state = ScanState::BeforeAttr; 230 + self.pos += 1; 231 + } else if ch == '/' || ch == '>' { 232 + if ch == '>' { 233 + self.state = ScanState::Data; 234 + } else { 235 + // Self-closing `/>`; wait for `>`. 236 + self.state = ScanState::BeforeAttr; 237 + } 238 + // emit_tag may override state (e.g. to ScriptData for <script>). 239 + self.emit_tag(); 240 + self.pos += 1; 241 + } else { 242 + self.tag_name.push(ch.to_ascii_lowercase()); 243 + self.pos += 1; 244 + } 245 + } 246 + 247 + fn scan_before_attr(&mut self, ch: char) { 248 + if ch == '>' { 249 + self.state = ScanState::Data; 250 + self.emit_tag(); 251 + self.pos += 1; 252 + } else if ch == '/' { 253 + // Self-closing; stay in BeforeAttr. 254 + self.pos += 1; 255 + } else if !ch.is_ascii_whitespace() { 256 + self.attr_name.clear(); 257 + self.attr_value.clear(); 258 + self.attr_name.push(ch.to_ascii_lowercase()); 259 + self.state = ScanState::AttrName; 260 + self.pos += 1; 261 + } else { 262 + self.pos += 1; 263 + } 264 + } 265 + 266 + fn scan_attr_name(&mut self, ch: char) { 267 + if ch == '=' { 268 + self.state = ScanState::BeforeAttrValue; 269 + self.pos += 1; 270 + } else if ch.is_ascii_whitespace() { 271 + self.state = ScanState::AfterAttrName; 272 + self.pos += 1; 273 + } else if ch == '>' || ch == '/' { 274 + // Attribute with no value. 275 + self.attrs 276 + .push((std::mem::take(&mut self.attr_name), String::new())); 277 + if ch == '>' { 278 + self.state = ScanState::Data; 279 + self.emit_tag(); 280 + } else { 281 + self.state = ScanState::BeforeAttr; 282 + } 283 + self.pos += 1; 284 + } else { 285 + self.attr_name.push(ch.to_ascii_lowercase()); 286 + self.pos += 1; 287 + } 288 + } 289 + 290 + fn scan_after_attr_name(&mut self, ch: char) { 291 + if ch == '=' { 292 + self.state = ScanState::BeforeAttrValue; 293 + self.pos += 1; 294 + } else if ch == '>' { 295 + self.attrs 296 + .push((std::mem::take(&mut self.attr_name), String::new())); 297 + self.state = ScanState::Data; 298 + self.emit_tag(); 299 + self.pos += 1; 300 + } else if ch == '/' { 301 + self.attrs 302 + .push((std::mem::take(&mut self.attr_name), String::new())); 303 + self.state = ScanState::BeforeAttr; 304 + self.pos += 1; 305 + } else if !ch.is_ascii_whitespace() { 306 + // New attribute started without value for the previous one. 307 + self.attrs 308 + .push((std::mem::take(&mut self.attr_name), String::new())); 309 + self.attr_name.clear(); 310 + self.attr_value.clear(); 311 + self.attr_name.push(ch.to_ascii_lowercase()); 312 + self.state = ScanState::AttrName; 313 + self.pos += 1; 314 + } else { 315 + self.pos += 1; 316 + } 317 + } 318 + 319 + fn scan_before_attr_value(&mut self, ch: char) { 320 + if ch == '"' { 321 + self.state = ScanState::AttrValueDQ; 322 + self.pos += 1; 323 + } else if ch == '\'' { 324 + self.state = ScanState::AttrValueSQ; 325 + self.pos += 1; 326 + } else if ch == '>' { 327 + self.attrs.push(( 328 + std::mem::take(&mut self.attr_name), 329 + std::mem::take(&mut self.attr_value), 330 + )); 331 + self.state = ScanState::Data; 332 + self.emit_tag(); 333 + self.pos += 1; 334 + } else if !ch.is_ascii_whitespace() { 335 + self.attr_value.push(ch); 336 + self.state = ScanState::AttrValueUQ; 337 + self.pos += 1; 338 + } else { 339 + self.pos += 1; 340 + } 341 + } 342 + 343 + fn scan_attr_value_dq(&mut self, ch: char) { 344 + if ch == '"' { 345 + self.attrs.push(( 346 + std::mem::take(&mut self.attr_name), 347 + std::mem::take(&mut self.attr_value), 348 + )); 349 + self.state = ScanState::AfterAttrValue; 350 + self.pos += 1; 351 + } else { 352 + self.attr_value.push(ch); 353 + self.pos += 1; 354 + } 355 + } 356 + 357 + fn scan_attr_value_sq(&mut self, ch: char) { 358 + if ch == '\'' { 359 + self.attrs.push(( 360 + std::mem::take(&mut self.attr_name), 361 + std::mem::take(&mut self.attr_value), 362 + )); 363 + self.state = ScanState::AfterAttrValue; 364 + self.pos += 1; 365 + } else { 366 + self.attr_value.push(ch); 367 + self.pos += 1; 368 + } 369 + } 370 + 371 + fn scan_attr_value_uq(&mut self, ch: char) { 372 + if ch.is_ascii_whitespace() { 373 + self.attrs.push(( 374 + std::mem::take(&mut self.attr_name), 375 + std::mem::take(&mut self.attr_value), 376 + )); 377 + self.state = ScanState::BeforeAttr; 378 + self.pos += 1; 379 + } else if ch == '>' { 380 + self.attrs.push(( 381 + std::mem::take(&mut self.attr_name), 382 + std::mem::take(&mut self.attr_value), 383 + )); 384 + self.state = ScanState::Data; 385 + self.emit_tag(); 386 + self.pos += 1; 387 + } else { 388 + self.attr_value.push(ch); 389 + self.pos += 1; 390 + } 391 + } 392 + 393 + fn scan_after_attr_value(&mut self, ch: char) { 394 + if ch.is_ascii_whitespace() { 395 + self.state = ScanState::BeforeAttr; 396 + self.pos += 1; 397 + } else if ch == '>' { 398 + self.state = ScanState::Data; 399 + self.emit_tag(); 400 + self.pos += 1; 401 + } else if ch == '/' { 402 + self.state = ScanState::BeforeAttr; 403 + self.pos += 1; 404 + } else { 405 + // Error recovery: treat as new attribute. 406 + self.attr_name.clear(); 407 + self.attr_value.clear(); 408 + self.attr_name.push(ch.to_ascii_lowercase()); 409 + self.state = ScanState::AttrName; 410 + self.pos += 1; 411 + } 412 + } 413 + 414 + fn scan_comment(&mut self, ch: char) { 415 + if ch == '-' { 416 + self.state = ScanState::CommentDash; 417 + } 418 + self.pos += 1; 419 + } 420 + 421 + fn scan_comment_dash(&mut self, ch: char) { 422 + if ch == '-' { 423 + self.state = ScanState::CommentDashDash; 424 + } else { 425 + self.state = ScanState::Comment; 426 + } 427 + self.pos += 1; 428 + } 429 + 430 + fn scan_comment_dash_dash(&mut self, ch: char) { 431 + if ch == '>' { 432 + self.state = ScanState::Data; 433 + } else if ch != '-' { 434 + self.state = ScanState::Comment; 435 + } 436 + self.pos += 1; 437 + } 438 + 439 + fn scan_script_data(&mut self, ch: char) { 440 + if ch == '<' { 441 + self.state = ScanState::ScriptDataLt; 442 + } 443 + self.pos += 1; 444 + } 445 + 446 + fn scan_script_data_lt(&mut self, ch: char) { 447 + if ch == '/' { 448 + self.end_tag_buf.clear(); 449 + self.state = ScanState::ScriptDataEndTag; 450 + self.pos += 1; 451 + } else { 452 + self.state = ScanState::ScriptData; 453 + self.pos += 1; 454 + } 455 + } 456 + 457 + fn scan_script_data_end_tag(&mut self, ch: char) { 458 + if ch.is_ascii_alphabetic() { 459 + self.end_tag_buf.push(ch.to_ascii_lowercase()); 460 + self.pos += 1; 461 + } else if ch == '>' && self.end_tag_buf == "script" { 462 + self.state = ScanState::Data; 463 + self.pos += 1; 464 + } else { 465 + // Not `</script>` — go back to script data. 466 + self.state = ScanState::ScriptData; 467 + self.pos += 1; 468 + } 469 + } 470 + 471 + /// Called when a complete start tag has been scanned. 472 + /// Checks whether it's a resource-bearing tag and records any preload URLs. 473 + fn emit_tag(&mut self) { 474 + match self.tag_name.as_str() { 475 + "script" => { 476 + let src = self.get_attr("src"); 477 + if let Some(url) = src { 478 + // External script: record preload and enter script data state 479 + // to skip the (presumably empty) content. 480 + self.preloads.push(PreloadUrl { 481 + url, 482 + resource_type: ResourceType::Script, 483 + as_attr: None, 484 + }); 485 + self.state = ScanState::ScriptData; 486 + } else { 487 + // Inline script: per the issue spec, we stop speculating here 488 + // because the inline script could call `document.write`. 489 + // We enter ScriptData state and continue scanning after it ends 490 + // (conservative approach: we still find resources after the 491 + // inline script's closing tag). 492 + self.state = ScanState::ScriptData; 493 + } 494 + } 495 + "link" => { 496 + let rel = self.get_attr("rel").unwrap_or_default(); 497 + let href = self.get_attr("href"); 498 + if let Some(url) = href { 499 + let rel_lower = rel.to_ascii_lowercase(); 500 + if rel_lower == "stylesheet" { 501 + self.preloads.push(PreloadUrl { 502 + url, 503 + resource_type: ResourceType::Stylesheet, 504 + as_attr: None, 505 + }); 506 + } else if rel_lower == "preload" { 507 + let as_val = self.get_attr("as"); 508 + self.preloads.push(PreloadUrl { 509 + url, 510 + resource_type: ResourceType::Preload, 511 + as_attr: as_val, 512 + }); 513 + } 514 + } 515 + } 516 + "img" => { 517 + if let Some(url) = self.get_attr("src") { 518 + self.preloads.push(PreloadUrl { 519 + url, 520 + resource_type: ResourceType::Image, 521 + as_attr: None, 522 + }); 523 + } 524 + } 525 + _ => {} 526 + } 527 + } 528 + 529 + /// Get an attribute value by name from the current tag's collected attributes. 530 + fn get_attr(&self, name: &str) -> Option<String> { 531 + self.attrs 532 + .iter() 533 + .find(|(k, _)| k == name) 534 + .map(|(_, v)| v.clone()) 535 + } 536 + } 537 + 538 + /// Message sent from the main thread to the speculative parser thread. 539 + #[derive(Debug)] 540 + pub enum SpeculativeRequest { 541 + /// Scan this HTML chunk starting at the given byte offset. 542 + Scan { 543 + /// The HTML content to scan (from offset to end of known input). 544 + html: String, 545 + /// The byte offset in the original document. 546 + offset: usize, 547 + }, 548 + /// Shut down the speculative parser thread. 549 + Shutdown, 550 + } 551 + 552 + /// Message sent from the speculative parser thread back to the main thread. 553 + #[derive(Debug)] 554 + pub enum SpeculativeResponse { 555 + /// Discovered preload URLs and buffered tokens. 556 + Results { 557 + /// Discovered preloadable resource URLs. 558 + preloads: Vec<PreloadUrl>, 559 + /// Speculatively tokenized tokens for potential reuse. 560 + tokens: Vec<Token>, 561 + /// The byte offset where scanning started. 562 + offset: usize, 563 + }, 564 + } 565 + 566 + /// Manages speculative parsing on a background thread. 567 + /// 568 + /// The main parser creates a `SpeculativeParser`, which spawns a background 569 + /// thread. When the main parser blocks on script execution, it sends the 570 + /// remaining HTML to the background thread for preload scanning. 571 + pub struct SpeculativeParser { 572 + /// Channel to send requests to the background thread. 573 + request_tx: mpsc::Sender<SpeculativeRequest>, 574 + /// Channel to receive responses from the background thread. 575 + response_rx: mpsc::Receiver<SpeculativeResponse>, 576 + /// Join handle for the background thread. 577 + handle: Option<thread::JoinHandle<()>>, 578 + /// Whether speculation results have been invalidated (e.g. by document.write). 579 + invalidated: bool, 580 + /// The offset at which the last speculation started. 581 + speculation_offset: Option<usize>, 582 + /// Buffered tokens from the last successful speculation. 583 + buffered_tokens: Vec<Token>, 584 + /// Index into buffered_tokens for replay. 585 + replay_index: usize, 586 + } 587 + 588 + impl Default for SpeculativeParser { 589 + fn default() -> Self { 590 + Self::new() 591 + } 592 + } 593 + 594 + impl SpeculativeParser { 595 + /// Create and start a new speculative parser with its background thread. 596 + pub fn new() -> Self { 597 + let (req_tx, req_rx) = mpsc::channel::<SpeculativeRequest>(); 598 + let (resp_tx, resp_rx) = mpsc::channel::<SpeculativeResponse>(); 599 + 600 + let handle = thread::spawn(move || { 601 + Self::background_loop(req_rx, resp_tx); 602 + }); 603 + 604 + SpeculativeParser { 605 + request_tx: req_tx, 606 + response_rx: resp_rx, 607 + handle: Some(handle), 608 + invalidated: false, 609 + speculation_offset: None, 610 + buffered_tokens: Vec::new(), 611 + replay_index: 0, 612 + } 613 + } 614 + 615 + /// The background thread's main loop. 616 + fn background_loop( 617 + rx: mpsc::Receiver<SpeculativeRequest>, 618 + tx: mpsc::Sender<SpeculativeResponse>, 619 + ) { 620 + while let Ok(request) = rx.recv() { 621 + match request { 622 + SpeculativeRequest::Scan { html, offset } => { 623 + let mut scanner = PreloadScanner::new(&html); 624 + let (preloads, tokens) = scanner.scan_with_tokens(); 625 + let _ = tx.send(SpeculativeResponse::Results { 626 + preloads, 627 + tokens, 628 + offset, 629 + }); 630 + } 631 + SpeculativeRequest::Shutdown => break, 632 + } 633 + } 634 + } 635 + 636 + /// Start speculative scanning of HTML content. 637 + /// 638 + /// Called when the main parser blocks on script execution. 639 + /// `html` is the remaining un-parsed HTML, `offset` is its position 640 + /// in the original document. 641 + pub fn speculate(&mut self, html: &str, offset: usize) { 642 + self.invalidated = false; 643 + self.speculation_offset = Some(offset); 644 + self.buffered_tokens.clear(); 645 + self.replay_index = 0; 646 + let _ = self.request_tx.send(SpeculativeRequest::Scan { 647 + html: html.to_string(), 648 + offset, 649 + }); 650 + } 651 + 652 + /// Collect speculation results (blocking until the background thread responds). 653 + /// 654 + /// Returns the discovered preload URLs if speculation was not invalidated. 655 + pub fn collect_results(&mut self) -> Option<Vec<PreloadUrl>> { 656 + match self.response_rx.recv() { 657 + Ok(SpeculativeResponse::Results { 658 + preloads, 659 + tokens, 660 + offset, 661 + }) => { 662 + if self.invalidated { 663 + // Results are stale — discard. 664 + return None; 665 + } 666 + if self.speculation_offset == Some(offset) { 667 + self.buffered_tokens = tokens; 668 + self.replay_index = 0; 669 + Some(preloads) 670 + } else { 671 + None 672 + } 673 + } 674 + Err(_) => None, 675 + } 676 + } 677 + 678 + /// Try to collect results without blocking. 679 + /// 680 + /// Returns `Some(preloads)` if results are ready and valid, 681 + /// `None` if not ready or invalidated. 682 + pub fn try_collect_results(&mut self) -> Option<Vec<PreloadUrl>> { 683 + match self.response_rx.try_recv() { 684 + Ok(SpeculativeResponse::Results { 685 + preloads, 686 + tokens, 687 + offset, 688 + }) => { 689 + if self.invalidated { 690 + return None; 691 + } 692 + if self.speculation_offset == Some(offset) { 693 + self.buffered_tokens = tokens; 694 + self.replay_index = 0; 695 + Some(preloads) 696 + } else { 697 + None 698 + } 699 + } 700 + Err(_) => None, 701 + } 702 + } 703 + 704 + /// Invalidate speculation results due to `document.write`. 705 + /// 706 + /// All buffered tokens are discarded, and the main parser must 707 + /// re-tokenize from the current position. 708 + pub fn invalidate(&mut self) { 709 + self.invalidated = true; 710 + self.buffered_tokens.clear(); 711 + self.replay_index = 0; 712 + self.speculation_offset = None; 713 + } 714 + 715 + /// Returns true if there are buffered tokens available for replay. 716 + pub fn has_buffered_tokens(&self) -> bool { 717 + !self.invalidated && self.replay_index < self.buffered_tokens.len() 718 + } 719 + 720 + /// Get the next buffered token if available. 721 + /// 722 + /// Returns `Some(token)` if speculation succeeded and there are tokens 723 + /// to replay. Returns `None` if no buffered tokens are available or 724 + /// speculation was invalidated. 725 + pub fn next_buffered_token(&mut self) -> Option<Token> { 726 + if self.invalidated || self.replay_index >= self.buffered_tokens.len() { 727 + return None; 728 + } 729 + let token = self.buffered_tokens[self.replay_index].clone(); 730 + self.replay_index += 1; 731 + Some(token) 732 + } 733 + 734 + /// Returns whether speculation was invalidated. 735 + pub fn is_invalidated(&self) -> bool { 736 + self.invalidated 737 + } 738 + 739 + /// Shut down the background thread. 740 + pub fn shutdown(&mut self) { 741 + let _ = self.request_tx.send(SpeculativeRequest::Shutdown); 742 + if let Some(handle) = self.handle.take() { 743 + let _ = handle.join(); 744 + } 745 + } 746 + } 747 + 748 + impl Drop for SpeculativeParser { 749 + fn drop(&mut self) { 750 + self.shutdown(); 751 + } 752 + } 753 + 754 + #[cfg(test)] 755 + mod tests { 756 + use super::*; 757 + 758 + #[test] 759 + fn scan_finds_stylesheet() { 760 + let html = r#"<html><head><link rel="stylesheet" href="/style.css"></head></html>"#; 761 + let mut scanner = PreloadScanner::new(html); 762 + let preloads = scanner.scan(); 763 + assert_eq!(preloads.len(), 1); 764 + assert_eq!(preloads[0].url, "/style.css"); 765 + assert_eq!(preloads[0].resource_type, ResourceType::Stylesheet); 766 + } 767 + 768 + #[test] 769 + fn scan_finds_script_src() { 770 + let html = r#"<script src="/app.js"></script><p>text</p>"#; 771 + let mut scanner = PreloadScanner::new(html); 772 + let preloads = scanner.scan(); 773 + assert_eq!(preloads.len(), 1); 774 + assert_eq!(preloads[0].url, "/app.js"); 775 + assert_eq!(preloads[0].resource_type, ResourceType::Script); 776 + } 777 + 778 + #[test] 779 + fn scan_finds_image() { 780 + let html = r#"<body><img src="/logo.png"><p>Hello</p></body>"#; 781 + let mut scanner = PreloadScanner::new(html); 782 + let preloads = scanner.scan(); 783 + assert_eq!(preloads.len(), 1); 784 + assert_eq!(preloads[0].url, "/logo.png"); 785 + assert_eq!(preloads[0].resource_type, ResourceType::Image); 786 + } 787 + 788 + #[test] 789 + fn scan_finds_preload_hint() { 790 + let html = r#"<link rel="preload" href="/font.woff2" as="font">"#; 791 + let mut scanner = PreloadScanner::new(html); 792 + let preloads = scanner.scan(); 793 + assert_eq!(preloads.len(), 1); 794 + assert_eq!(preloads[0].url, "/font.woff2"); 795 + assert_eq!(preloads[0].resource_type, ResourceType::Preload); 796 + assert_eq!(preloads[0].as_attr.as_deref(), Some("font")); 797 + } 798 + 799 + #[test] 800 + fn scan_finds_multiple_resources() { 801 + let html = r#" 802 + <head> 803 + <link rel="stylesheet" href="/a.css"> 804 + <link rel="stylesheet" href="/b.css"> 805 + <script src="/app.js"></script> 806 + </head> 807 + <body> 808 + <img src="/hero.jpg"> 809 + <img src="/logo.png"> 810 + </body> 811 + "#; 812 + let mut scanner = PreloadScanner::new(html); 813 + let preloads = scanner.scan(); 814 + assert_eq!(preloads.len(), 5); 815 + assert_eq!(preloads[0].resource_type, ResourceType::Stylesheet); 816 + assert_eq!(preloads[1].resource_type, ResourceType::Stylesheet); 817 + assert_eq!(preloads[2].resource_type, ResourceType::Script); 818 + assert_eq!(preloads[3].resource_type, ResourceType::Image); 819 + assert_eq!(preloads[4].resource_type, ResourceType::Image); 820 + } 821 + 822 + #[test] 823 + fn scan_skips_inline_script_content() { 824 + let html = r#"<script>var x = "<img src='fake.png'>";</script><img src="/real.png">"#; 825 + let mut scanner = PreloadScanner::new(html); 826 + let preloads = scanner.scan(); 827 + // Should only find the real image, not the one inside the script string. 828 + assert_eq!(preloads.len(), 1); 829 + assert_eq!(preloads[0].url, "/real.png"); 830 + } 831 + 832 + #[test] 833 + fn scan_skips_comments() { 834 + let html = r#"<!-- <img src="/hidden.png"> --><img src="/visible.png">"#; 835 + let mut scanner = PreloadScanner::new(html); 836 + let preloads = scanner.scan(); 837 + assert_eq!(preloads.len(), 1); 838 + assert_eq!(preloads[0].url, "/visible.png"); 839 + } 840 + 841 + #[test] 842 + fn scan_ignores_link_without_stylesheet_rel() { 843 + let html = r#"<link rel="icon" href="/favicon.ico">"#; 844 + let mut scanner = PreloadScanner::new(html); 845 + let preloads = scanner.scan(); 846 + assert_eq!(preloads.len(), 0); 847 + } 848 + 849 + #[test] 850 + fn scan_handles_single_quoted_attrs() { 851 + let html = "<img src='/photo.jpg'>"; 852 + let mut scanner = PreloadScanner::new(html); 853 + let preloads = scanner.scan(); 854 + assert_eq!(preloads.len(), 1); 855 + assert_eq!(preloads[0].url, "/photo.jpg"); 856 + } 857 + 858 + #[test] 859 + fn scan_handles_unquoted_attrs() { 860 + let html = "<img src=/photo.jpg>"; 861 + let mut scanner = PreloadScanner::new(html); 862 + let preloads = scanner.scan(); 863 + assert_eq!(preloads.len(), 1); 864 + assert_eq!(preloads[0].url, "/photo.jpg"); 865 + } 866 + 867 + #[test] 868 + fn scan_handles_self_closing_tags() { 869 + let html = r#"<img src="/photo.jpg" />"#; 870 + let mut scanner = PreloadScanner::new(html); 871 + let preloads = scanner.scan(); 872 + assert_eq!(preloads.len(), 1); 873 + assert_eq!(preloads[0].url, "/photo.jpg"); 874 + } 875 + 876 + #[test] 877 + fn scan_case_insensitive_tag_names() { 878 + let html = r#"<IMG SRC="/photo.jpg"><LINK REL="stylesheet" HREF="/style.css">"#; 879 + let mut scanner = PreloadScanner::new(html); 880 + let preloads = scanner.scan(); 881 + assert_eq!(preloads.len(), 2); 882 + } 883 + 884 + #[test] 885 + fn scan_with_tokens_produces_valid_tokens() { 886 + let html = r#"<p>Hello</p>"#; 887 + let mut scanner = PreloadScanner::new(html); 888 + let (preloads, tokens) = scanner.scan_with_tokens(); 889 + assert_eq!(preloads.len(), 0); 890 + // Should have StartTag(p), Character(Hello), EndTag(p) 891 + assert!(tokens.len() >= 3); 892 + } 893 + 894 + #[test] 895 + fn speculative_parser_basic_flow() { 896 + let mut spec = SpeculativeParser::new(); 897 + 898 + let html = r#"<link rel="stylesheet" href="/main.css"><img src="/hero.jpg">"#; 899 + spec.speculate(html, 100); 900 + 901 + let results = spec.collect_results(); 902 + assert!(results.is_some()); 903 + let preloads = results.unwrap(); 904 + assert_eq!(preloads.len(), 2); 905 + assert_eq!(preloads[0].url, "/main.css"); 906 + assert_eq!(preloads[1].url, "/hero.jpg"); 907 + 908 + // Should have buffered tokens. 909 + assert!(spec.has_buffered_tokens()); 910 + 911 + spec.shutdown(); 912 + } 913 + 914 + #[test] 915 + fn speculative_parser_invalidation() { 916 + let mut spec = SpeculativeParser::new(); 917 + 918 + let html = r#"<img src="/photo.jpg">"#; 919 + spec.speculate(html, 0); 920 + 921 + // Simulate document.write invalidating speculation. 922 + spec.invalidate(); 923 + assert!(spec.is_invalidated()); 924 + assert!(!spec.has_buffered_tokens()); 925 + 926 + // Results should be discarded. 927 + let results = spec.collect_results(); 928 + assert!(results.is_none()); 929 + 930 + spec.shutdown(); 931 + } 932 + 933 + #[test] 934 + fn speculative_parser_token_replay() { 935 + let mut spec = SpeculativeParser::new(); 936 + 937 + let html = r#"<p>Hello</p>"#; 938 + spec.speculate(html, 0); 939 + let _ = spec.collect_results(); 940 + 941 + // Replay buffered tokens. 942 + let mut replayed = Vec::new(); 943 + while let Some(token) = spec.next_buffered_token() { 944 + replayed.push(token); 945 + } 946 + assert!(!replayed.is_empty()); 947 + 948 + spec.shutdown(); 949 + } 950 + 951 + #[test] 952 + fn speculative_parser_no_tokens_after_invalidation() { 953 + let mut spec = SpeculativeParser::new(); 954 + 955 + let html = r#"<p>Hello</p>"#; 956 + spec.speculate(html, 0); 957 + let _ = spec.collect_results(); 958 + 959 + spec.invalidate(); 960 + assert!(spec.next_buffered_token().is_none()); 961 + 962 + spec.shutdown(); 963 + } 964 + 965 + #[test] 966 + fn speculation_hit_no_document_write() { 967 + // Simulates: main parser encounters <script>, blocks, speculative parser 968 + // scans ahead and finds resources. No document.write happens, so 969 + // speculation is valid. 970 + let remaining_html = r#" 971 + <link rel="stylesheet" href="/after-script.css"> 972 + <img src="/image.png"> 973 + <p>Content after script</p> 974 + "#; 975 + 976 + let mut spec = SpeculativeParser::new(); 977 + spec.speculate(remaining_html, 500); 978 + 979 + let preloads = spec.collect_results().unwrap(); 980 + assert_eq!(preloads.len(), 2); 981 + assert_eq!(preloads[0].url, "/after-script.css"); 982 + assert_eq!(preloads[0].resource_type, ResourceType::Stylesheet); 983 + assert_eq!(preloads[1].url, "/image.png"); 984 + assert_eq!(preloads[1].resource_type, ResourceType::Image); 985 + 986 + // Buffered tokens should be usable. 987 + assert!(spec.has_buffered_tokens()); 988 + assert!(!spec.is_invalidated()); 989 + 990 + spec.shutdown(); 991 + } 992 + 993 + #[test] 994 + fn speculation_miss_document_write_invalidates() { 995 + // Simulates: main parser encounters <script>, blocks, speculative parser 996 + // scans ahead. Then document.write injects content, invalidating speculation. 997 + let remaining_html = r#" 998 + <link rel="stylesheet" href="/after-script.css"> 999 + <img src="/image.png"> 1000 + "#; 1001 + 1002 + let mut spec = SpeculativeParser::new(); 1003 + spec.speculate(remaining_html, 500); 1004 + 1005 + // Before collecting results, simulate document.write. 1006 + spec.invalidate(); 1007 + 1008 + // Results should be discarded. 1009 + let preloads = spec.collect_results(); 1010 + assert!(preloads.is_none()); 1011 + assert!(!spec.has_buffered_tokens()); 1012 + assert!(spec.is_invalidated()); 1013 + 1014 + spec.shutdown(); 1015 + } 1016 + }