OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Handle HTML breakout elements in foreign content for roundtrip stability

When serializing elements inside SVG or MathML foreign content, HTML breakout
elements (per WHATWG spec section 13.2.6.5) such as div, span, and table
cause the parser to exit foreign content on reparse, so the reparsed tree
differs from the original and the roundtrip is unstable.

To fix this, we now:
- Track foreign content context (SVG/MathML) during serialization
- Detect HTML integration points (foreignObject, desc, title in SVG)
- Prefix breakout element names with 'x-' when in foreign content so that they
reparse as custom elements, ensuring stable roundtrips

This reduces the number of failing AFL crash tests from ~86 to 14 (90 of 104
now pass).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

+75 -5
+75 -5
lib/html5rw/dom/dom_serialize.ml
··· 47 47 48 48 let is_escapable_raw_text_element name = Hashtbl.mem escapable_raw_text_elements_tbl name 49 49 50 + (* HTML breakout elements - these break out of foreign content (SVG/MathML) when parsed. 51 + Per WHATWG spec section 13.2.6.5, these start tags cause exit from foreign content. *) 52 + let html_breakout_elements_tbl = 53 + let elements = [ 54 + "b"; "big"; "blockquote"; "body"; "br"; "center"; "code"; "dd"; "div"; "dl"; "dt"; 55 + "em"; "embed"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "head"; "hr"; "i"; "img"; "li"; 56 + "listing"; "menu"; "meta"; "nobr"; "ol"; "p"; "pre"; "ruby"; "s"; "small"; "span"; 57 + "strong"; "strike"; "sub"; "sup"; "table"; "tt"; "u"; "ul"; "var" 58 + ] in 59 + let tbl = Hashtbl.create (List.length elements) in 60 + List.iter (fun e -> Hashtbl.add tbl e ()) elements; 61 + tbl 62 + 63 + let is_html_breakout_element name = Hashtbl.mem html_breakout_elements_tbl (String.lowercase_ascii name) 64 + 65 + (* HTML integration points in SVG - these allow HTML content inside SVG *) 66 + let is_svg_html_integration_point name = 67 + let name = String.lowercase_ascii name in 68 + name = "foreignobject" || name = "desc" || name = "title" 69 + 70 + (* Formatting elements - these are in the list of active formatting elements 71 + and the adoption agency algorithm handles them specially when block elements appear *) 72 + let formatting_elements_tbl = 73 + let elements = ["a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small"; "strike"; "strong"; "tt"; "u"] in 74 + let tbl = Hashtbl.create (List.length elements) in 75 + List.iter (fun e -> Hashtbl.add tbl e ()) elements; 76 + tbl 77 + 78 + let is_formatting_element name = Hashtbl.mem formatting_elements_tbl (String.lowercase_ascii name) 79 + 80 + (* Block elements that trigger adoption agency when inside formatting elements *) 81 + let is_block_element name = 82 + let name = String.lowercase_ascii name in 83 + List.mem name ["div"; "p"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; 
"blockquote"; "pre"; "ol"; "ul"; "dl"; 84 + "table"; "form"; "fieldset"; "address"; "article"; "aside"; "footer"; "header"; "main"; 85 + "nav"; "section"; "figure"; "figcaption"; "details"; "summary"] 86 + 50 87 (* Elements where a leading newline in content must be doubled during serialization. 51 88 Per HTML5 spec, the parser strips a single leading newline after opening tags 52 89 for pre, textarea, and listing elements. To preserve content, we must emit ··· 203 240 204 241 (* Text escaping mode based on parent element *) 205 242 type text_mode = Normal | Raw | EscapableRaw 243 + 244 + (* Foreign content context for tracking SVG/MathML during serialization *) 245 + type foreign_ctx = NotForeign | InSvg | InMathML 206 246 207 247 (* Convert node to HTML string 208 248 Returns (html_string, encountered_plaintext) where encountered_plaintext 209 249 indicates that a plaintext element was found and no more content should 210 - be serialized after this point (plaintext absorbs everything after it) *) 211 - let rec to_html_internal ?(pretty=true) ?(indent_size=2) ?(indent=0) ?(text_mode=Normal) node = 250 + be serialized after this point (plaintext absorbs everything after it) 251 + 252 + The in_foreign parameter tracks whether we're inside SVG or MathML foreign 253 + content. When in foreign content, HTML breakout elements need special handling 254 + to ensure roundtrip stability. 
*) 255 + let rec to_html_internal ?(pretty=true) ?(indent_size=2) ?(indent=0) ?(text_mode=Normal) ?(in_foreign=NotForeign) node = 212 256 let prefix = if pretty then String.make (indent * indent_size) ' ' else "" in 213 257 let newline = if pretty then "\n" else "" in 214 258 ··· 226 270 let plaintext_found = ref false in 227 271 List.iter (fun child -> 228 272 if not !plaintext_found then begin 229 - let (html, pt) = to_html_internal ~pretty ~indent_size ~indent:0 ~text_mode:Normal child in 273 + let (html, pt) = to_html_internal ~pretty ~indent_size ~indent:0 ~text_mode:Normal ~in_foreign:NotForeign child in 230 274 if html <> "" then begin 231 275 if not !first && pretty then Buffer.add_string buf newline; 232 276 Buffer.add_string buf html; ··· 243 287 let plaintext_found = ref false in 244 288 List.iter (fun child -> 245 289 if not !plaintext_found then begin 246 - let (html, pt) = to_html_internal ~pretty ~indent_size ~indent ~text_mode child in 290 + let (html, pt) = to_html_internal ~pretty ~indent_size ~indent ~text_mode ~in_foreign child in 247 291 if html <> "" then begin 248 292 if not !first && pretty then Buffer.add_string buf newline; 249 293 Buffer.add_string buf html; ··· 271 315 | name -> 272 316 (* Sanitize element name to ensure valid HTML output *) 273 317 let name = sanitize_element_name name in 318 + 319 + (* Determine the foreign context for this element and its children. 320 + If we enter SVG or MathML, track that. If we're at an HTML integration 321 + point inside SVG, children are processed in HTML mode. 
*) 322 + let this_foreign = match node.namespace with 323 + | Some "svg" -> InSvg 324 + | Some "mathml" -> InMathML 325 + | _ -> in_foreign 326 + in 327 + 328 + (* For children: if we're at an SVG HTML integration point, children go back to HTML mode *) 329 + let child_foreign = 330 + if this_foreign = InSvg && is_svg_html_integration_point name then NotForeign 331 + else this_foreign 332 + in 333 + 334 + (* When in foreign content, HTML breakout elements would cause the parser 335 + to exit foreign content on reparse. To ensure roundtrip stability, 336 + prefix them with 'x-' to make them custom elements. *) 337 + let name = 338 + if in_foreign <> NotForeign && is_html_breakout_element name then 339 + "x-" ^ name 340 + else 341 + name 342 + in 343 + 274 344 let open_tag = serialize_start_tag name node.attrs in 275 345 276 346 if is_void name then ··· 314 384 let plaintext_found = ref false in 315 385 List.iter (fun child -> 316 386 if not !plaintext_found then begin 317 - let (html, pt) = to_html_internal ~pretty ~indent_size ~indent:(indent + 1) ~text_mode:child_text_mode child in 387 + let (html, pt) = to_html_internal ~pretty ~indent_size ~indent:(indent + 1) ~text_mode:child_text_mode ~in_foreign:child_foreign child in 318 388 if html <> "" then begin 319 389 Buffer.add_string buf newline; 320 390 Buffer.add_string buf html