OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Fix parser/serializer bugs found by AFL fuzzing

Serializer fixes:
- Add leading newline preservation for pre, textarea, and listing elements.
Per HTML5 spec, the parser strips a single leading newline after these
element tags, so the serializer must emit an extra newline to preserve
content that starts with a newline.
- Escape < in attribute values to prevent tag injection during reparsing

Parser fix:
- Reset ignore_lf flag when any element is inserted, not just on character
tokens. Per HTML5 spec, only the immediately next token after
pre/textarea/listing should be checked for leading LF, but we were
persisting the flag across element insertions.

Tokenizer fix:
- Handle < in tag names per HTML5 spec: emit parse error and reconsume
in tag open state. Previously < was incorrectly added to tag names.

These fixes improve roundtrip stability for edge cases with:
- Pre-formatted elements with nested elements and leading newlines
- Attribute values containing < characters
- Malformed HTML with < inside tag names

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

+50 -3
+35 -2
lib/html5rw/dom/dom_serialize.ml
··· 47 47 48 48 let is_escapable_raw_text_element name = Hashtbl.mem escapable_raw_text_elements_tbl name 49 49 50 + (* Elements where a leading newline in content must be doubled during serialization. 51 + Per HTML5 spec, the parser strips a single leading newline after opening tags 52 + for pre, textarea, and listing elements. To preserve content, we must emit 53 + an extra newline if the content starts with one. *) 54 + let needs_leading_newline_preserved name = 55 + name = "pre" || name = "textarea" || name = "listing" 56 + 57 + (* Check if text content starts with a newline (LF) *) 58 + let starts_with_newline text = 59 + String.length text > 0 && text.[0] = '\n' 60 + 61 + (* Get the first text content from children, if any *) 62 + let first_text_content children = 63 + match children with 64 + | [] -> "" 65 + | first :: _ when first.name = "#text" -> first.data 66 + | _ -> "" 67 + 50 68 (* Foreign attribute adjustments for test output *) 51 69 let foreign_attr_adjustments = [ 52 70 "xlink:actuate"; "xlink:arcrole"; "xlink:href"; "xlink:role"; ··· 81 99 if String.contains value '"' && not (String.contains value '\'') then '\'' 82 100 else '"' 83 101 84 - (* Escape attribute value *) 102 + (* Escape attribute value - must escape &, quotes, and < for safe reparsing *) 85 103 let escape_attr_value value quote_char = 86 104 let buf = Buffer.create (String.length value) in 87 105 String.iter (fun c -> 88 106 match c with 89 107 | '&' -> Buffer.add_string buf "&amp;" 108 + | '<' -> Buffer.add_string buf "&lt;" 90 109 | '"' when quote_char = '"' -> Buffer.add_string buf "&quot;" 91 110 | '\'' when quote_char = '\'' -> Buffer.add_string buf "&#39;" 92 111 | c -> Buffer.add_char buf c ··· 218 237 in 219 238 (* Check if all children are text *) 220 239 let all_text = List.for_all is_text node.children in 240 + (* Per HTML5 spec, pre/textarea/listing need leading newline doubled *) 241 + let leading_newline = 242 + if needs_leading_newline_preserved name && 243 + starts_with_newline (first_text_content node.children) 244 + then "\n" else "" 245 + in 221 246 if all_text then begin 222 247 let text = String.concat "" (List.map (fun c -> c.data) node.children) in 223 248 let escaped = match child_text_mode with ··· 225 250 | Raw -> text 226 251 | EscapableRaw -> escape_escapable_raw_text text 227 252 in 228 - (prefix ^ open_tag ^ escaped ^ serialize_end_tag name, false) 253 + (prefix ^ open_tag ^ leading_newline ^ escaped ^ serialize_end_tag name, false) 229 254 end else begin 230 255 let buf = Buffer.create 256 in 231 256 Buffer.add_string buf (prefix ^ open_tag); 257 + Buffer.add_string buf leading_newline; 232 258 let plaintext_found = ref false in 233 259 List.iter (fun child -> 234 260 if not !plaintext_found then begin ··· 425 451 in 426 452 (* Check if all children are text *) 427 453 let all_text = List.for_all is_text node.children in 454 + (* Per HTML5 spec, pre/textarea/listing need leading newline doubled *) 455 + let needs_leading_nl = 456 + needs_leading_newline_preserved name && 457 + starts_with_newline (first_text_content node.children) 458 + in 428 459 if all_text then begin 429 460 let text = String.concat "" (List.map (fun c -> c.data) node.children) in 430 461 let escaped = match child_text_mode with ··· 432 463 | Raw -> text 433 464 | EscapableRaw -> escape_escapable_raw_text text 434 465 in 466 + if needs_leading_nl then write "\n"; 435 467 write escaped; 436 468 write (serialize_end_tag name); 437 469 false 438 470 end else begin 471 + if needs_leading_nl then write "\n"; 439 472 let plaintext_found = ref false in 440 473 List.iter (fun child -> 441 474 if not !plaintext_found then begin
+3
lib/html5rw/parser/parser_tree_builder.ml
··· 208 208 end 209 209 210 210 let insert_element t name ?(namespace=None) ?(push=false) attrs = 211 + (* Reset ignore_lf flag - per HTML5 spec, only the immediately next token 212 + after pre/textarea/listing should be checked for leading LF *) 213 + t.ignore_lf <- false; 211 214 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in 212 215 let node = Dom.create_element name ~namespace ~attrs ~location () in 213 216 let (parent, before) = appropriate_insertion_place t in
+12 -1
lib/html5rw/tokenizer/tokenizer_impl.ml
··· 711 711 t.state <- Tokenizer_state.Bogus_comment 712 712 713 713 and state_tag_name () = 714 - match Tokenizer_stream.consume t.stream with 714 + match Tokenizer_stream.peek t.stream with 715 715 | Some ('\t' | '\n' | '\x0C' | ' ') -> 716 + Tokenizer_stream.advance t.stream; 716 717 t.state <- Tokenizer_state.Before_attribute_name 717 718 | Some '/' -> 719 + Tokenizer_stream.advance t.stream; 718 720 t.state <- Tokenizer_state.Self_closing_start_tag 719 721 | Some '>' -> 722 + Tokenizer_stream.advance t.stream; 720 723 t.state <- Tokenizer_state.Data; 721 724 emit_current_tag () 722 725 | Some '\x00' -> 726 + Tokenizer_stream.advance t.stream; 723 727 error t "unexpected-null-character"; 724 728 Buffer.add_string t.current_tag_name "\xEF\xBF\xBD" 729 + | Some '<' -> 730 + (* Per HTML5 spec: emit error and reconsume in tag open state *) 731 + error t "unexpected-character-in-tag-name"; 732 + (* Emit current tag as-is before starting new tag *) 733 + emit_current_tag (); 734 + t.state <- Tokenizer_state.Tag_open 725 735 | Some c -> 736 + Tokenizer_stream.advance t.stream; 726 737 check_control_char c; 727 738 Buffer.add_char t.current_tag_name (ascii_lower c) 728 739 | None -> ()