OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Fix serializer roundtrip bugs for raw text and plaintext elements

Bug fixes found via property-based fuzzing with Crowbar:

1. Raw text element escaping: Text content inside <script>, <style>,
<iframe>, <xmp>, <noembed>, <noframes>, and <noscript> was being
HTML-escaped during serialization, causing double-escaping on each
roundtrip. Fixed by detecting raw text elements and outputting
their content without escaping.

2. Escapable raw text elements: <textarea> and <title> content should
only have & escaped, not < or >. Added separate handling for these.

3. Plaintext element serialization: <plaintext> content accumulated
closing tags on each roundtrip because any content after plaintext
gets absorbed into its content on reparse. Fixed by:
- Not outputting closing tag for plaintext
- Propagating "plaintext encountered" flag through serialization
- Stopping serialization of closing tags for ancestors once
plaintext is found

Also adds fuzz/ directory with comprehensive Crowbar-based property
tests covering crash resistance, roundtrip stability, selector
parsing, DOM manipulation, and more.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

+769 -70
+3
fuzz/dune
··· 1 + (executable 2 + (name fuzz_html5rw) 3 + (libraries bytesrw html5rw crowbar))
+537
fuzz/fuzz_html5rw.ml
··· 1 + (** Comprehensive fuzz tests for html5rw HTML5 parser using Crowbar *) 2 + 3 + open Crowbar 4 + 5 + (* Helper to create a bytes reader from a string *) 6 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 7 + 8 + (* ========================================================================== 9 + Generators for HTML-like content 10 + ========================================================================== *) 11 + 12 + (* Common tag names for structured generation *) 13 + let tag_names = [ 14 + "div"; "p"; "span"; "a"; "h1"; "h2"; "h3"; "ul"; "li"; "ol"; 15 + "table"; "tr"; "td"; "th"; "thead"; "tbody"; "form"; "input"; 16 + "button"; "select"; "option"; "textarea"; "label"; "img"; "br"; 17 + "hr"; "b"; "i"; "strong"; "em"; "code"; "pre"; "script"; "style"; 18 + "head"; "body"; "html"; "title"; "meta"; "link"; "nav"; "header"; 19 + "footer"; "main"; "section"; "article"; "aside"; "figure"; "svg"; 20 + "math"; "template"; "iframe"; "noscript"; "plaintext"; "xmp"; 21 + ] 22 + 23 + let tag_name_gen = choose (List.map const tag_names) 24 + 25 + (* Common attribute names *) 26 + let attr_names = [ 27 + "id"; "class"; "href"; "src"; "style"; "title"; "alt"; "name"; 28 + "type"; "value"; "data-foo"; "aria-label"; "onclick"; "onload"; 29 + ] 30 + 31 + let attr_name_gen = choose (List.map const attr_names) 32 + 33 + (* Generator for a simple attribute *) 34 + let attr_gen = 35 + map [attr_name_gen; bytes] (fun name value -> 36 + Printf.sprintf "%s=\"%s\"" name (String.escaped value)) 37 + 38 + (* Generator for attributes list *) 39 + let attrs_gen = list attr_gen 40 + 41 + (* Generator for a simple opening tag *) 42 + let start_tag_gen = 43 + map [tag_name_gen; attrs_gen] (fun tag attrs -> 44 + let attrs_str = String.concat " " attrs in 45 + if attrs_str = "" then Printf.sprintf "<%s>" tag 46 + else Printf.sprintf "<%s %s>" tag attrs_str) 47 + 48 + (* Generator for a simple closing tag *) 49 + let end_tag_gen = 50 + map [tag_name_gen] (fun tag -> Printf.sprintf "</%s>" tag) 51 + 52 + (* Generator for text content *) 53 + let text_content_gen = 54 + choose [ 55 + const ""; 56 + const "Hello"; 57 + const "Hello, world!"; 58 + const "Test <with> special &chars;"; 59 + bytes; 60 + ] 61 + 62 + (* Generator for comments - used in html_gen via malformed_html_gen *) 63 + let _comment_gen = 64 + map [bytes] (fun content -> 65 + Printf.sprintf "<!--%s-->" content) 66 + 67 + (* Generator for DOCTYPE *) 68 + let doctype_gen = 69 + choose [ 70 + const "<!DOCTYPE html>"; 71 + const "<!doctype html>"; 72 + const "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\">"; 73 + const "<!DOCTYPE>"; 74 + map [bytes] (fun s -> Printf.sprintf "<!DOCTYPE %s>" s); 75 + ] 76 + 77 + (* Generator for simple HTML fragments *) 78 + let simple_html_gen = 79 + map [start_tag_gen; text_content_gen; end_tag_gen] 80 + (fun start text _end_ -> start ^ text ^ _end_) 81 + 82 + (* Generator for nested HTML *) 83 + let nested_html_gen = 84 + map [start_tag_gen; simple_html_gen; end_tag_gen] 85 + (fun outer inner _end_ -> outer ^ inner ^ _end_) 86 + 87 + (* Generator for structured HTML with common patterns *) 88 + let structured_html_gen = 89 + choose [ 90 + const "<html><head><title>Test</title></head><body></body></html>"; 91 + const "<!DOCTYPE html><html><body><p>Hello</p></body></html>"; 92 + const "<div><span>text</span></div>"; 93 + const "<table><tr><td>cell</td></tr></table>"; 94 + const "<ul><li>item1</li><li>item2</li></ul>"; 95 + const "<form><input type=\"text\"><button>Submit</button></form>"; 96 + const "<p>First</p><p>Second</p>"; 97 + const "<div><div><div>nested</div></div></div>"; 98 + simple_html_gen; 99 + nested_html_gen; 100 + ] 101 + 102 + (* Generator for malformed/edge case HTML *) 103 + let malformed_html_gen = 104 + choose [ 105 + const "<"; 106 + const ">"; 107 + const "</"; 108 + const "<>"; 109 + const "<<>>"; 110 + const "<div"; 111 + const "<div>"; 112 + const "</div>"; 113 + const "<div><span>"; 114 + const "<div></span></div>"; 115 + const "<p><div></div></p>"; 116 + const "<!-"; 117 + const "<!--"; 118 + const "<!-->"; 119 + const "<!--->"; 120 + const "<!-- -- -->"; 121 + const "&"; 122 + const "&amp"; 123 + const "&amp;"; 124 + const "&#"; 125 + const "&#60"; 126 + const "&#60;"; 127 + const "&#x"; 128 + const "&#x3c;"; 129 + const "<script>alert('xss')</script>"; 130 + const "<style>body{}</style>"; 131 + const "<![CDATA[test]]>"; 132 + const "<?xml version=\"1.0\"?>"; 133 + const "<svg><foreignObject></foreignObject></svg>"; 134 + const "<math><mi>x</mi></math>"; 135 + const "<template><div>content</div></template>"; 136 + const "<table><div>misplaced</div><tr><td>ok</td></tr></table>"; 137 + map [bytes] (fun s -> "<" ^ s ^ ">"); 138 + map [bytes; bytes] (fun a b -> "<" ^ a ^ " " ^ b ^ ">"); 139 + ] 140 + 141 + (* Combined HTML generator *) 142 + let html_gen = 143 + choose [ 144 + bytes; (* Completely random *) 145 + structured_html_gen; (* Well-structured HTML *) 146 + malformed_html_gen; (* Known edge cases *) 147 + map [doctype_gen; structured_html_gen] (fun dt html -> dt ^ html); 148 + ] 149 + 150 + (* CSS selector generators *) 151 + let selector_gen = 152 + choose [ 153 + const "*"; 154 + const "div"; 155 + const "#id"; 156 + const ".class"; 157 + const "div.class"; 158 + const "div#id"; 159 + const "[attr]"; 160 + const "[attr=value]"; 161 + const "[attr~=value]"; 162 + const "[attr|=value]"; 163 + const "[attr^=value]"; 164 + const "[attr$=value]"; 165 + const "[attr*=value]"; 166 + const ":first-child"; 167 + const ":last-child"; 168 + const ":nth-child(1)"; 169 + const ":nth-child(2n+1)"; 170 + const ":only-child"; 171 + const ":empty"; 172 + const ":not(div)"; 173 + const "div > p"; 174 + const "div p"; 175 + const "div + p"; 176 + const "div ~ p"; 177 + const "div, p"; 178 + const "div > p.class#id[attr]:first-child"; 179 + bytes; (* Random selector to find crashes *) 180 + ] 181 + 182 + (* Fragment context tag names 183 + Note: raw text elements (script, style, textarea, title, xmp, iframe, etc.) 184 + are excluded because fragment content parsed in their context cannot 185 + round-trip correctly - the content is raw text but serialized without 186 + the element wrapper, so escaping behavior differs. *) 187 + let fragment_context_gen = 188 + choose [ 189 + const "div"; 190 + const "body"; 191 + const "html"; 192 + const "table"; 193 + const "tr"; 194 + const "tbody"; 195 + const "thead"; 196 + const "td"; 197 + const "th"; 198 + const "ul"; 199 + const "ol"; 200 + const "select"; 201 + const "template"; 202 + const "svg"; 203 + const "math"; 204 + (* Exclude raw text contexts: script, style, textarea, title *) 205 + ] 206 + 207 + (* ========================================================================== 208 + Test 1: Crash resistance - arbitrary input should not crash 209 + ========================================================================== *) 210 + 211 + let () = 212 + add_test ~name:"html5rw_no_crash_bytes" [bytes] @@ fun input -> 213 + let _ = 214 + try Html5rw.parse (reader_of_string input) 215 + with _ -> Html5rw.parse (reader_of_string "") 216 + in 217 + check true 218 + 219 + let () = 220 + add_test ~name:"html5rw_no_crash_html" [html_gen] @@ fun input -> 221 + let _ = 222 + try Html5rw.parse (reader_of_string input) 223 + with _ -> Html5rw.parse (reader_of_string "") 224 + in 225 + check true 226 + 227 + let () = 228 + add_test ~name:"html5rw_parse_bytes_no_crash" [bytes] @@ fun input -> 229 + let _ = 230 + try Html5rw.parse_bytes (Bytes.of_string input) 231 + with _ -> Html5rw.parse_bytes (Bytes.of_string "") 232 + in 233 + check true 234 + 235 + (* ========================================================================== 236 + Test 2: Roundtrip - parse -> serialize -> reparse should be consistent 237 + ========================================================================== *) 238 + 239 + (* Serialize a parse result to string *) 240 + let serialize result = 241 + Html5rw.to_string ~pretty:false result 242 + 243 + (* Compare two DOM trees structurally (text content of serialized output) *) 244 + let _trees_equivalent result1 result2 = 245 + let s1 = serialize result1 in 246 + let s2 = serialize result2 in 247 + s1 = s2 248 + 249 + let () = 250 + add_test ~name:"html5rw_roundtrip_idempotent" [html_gen] @@ fun input -> 251 + try 252 + (* Parse original *) 253 + let result1 = Html5rw.parse (reader_of_string input) in 254 + let serialized1 = serialize result1 in 255 + 256 + (* Reparse serialized output *) 257 + let result2 = Html5rw.parse (reader_of_string serialized1) in 258 + let serialized2 = serialize result2 in 259 + 260 + (* The second serialization should equal the first *) 261 + (* (First parse may normalize, but second should be stable) *) 262 + if serialized1 <> serialized2 then begin 263 + Printf.printf "\nRoundtrip mismatch:\n"; 264 + Printf.printf "Input: %s\n" (String.escaped (String.sub input 0 (min 200 (String.length input)))); 265 + Printf.printf "First: %s\n" (String.escaped (String.sub serialized1 0 (min 200 (String.length serialized1)))); 266 + Printf.printf "Second: %s\n" (String.escaped (String.sub serialized2 0 (min 200 (String.length serialized2)))); 267 + check false 268 + end else 269 + check true 270 + with e -> 271 + Printf.printf "\nRoundtrip exception: %s\n" (Printexc.to_string e); 272 + check false 273 + 274 + (* Additional roundtrip test: parse -> serialize -> reparse -> serialize should stabilize *) 275 + let () = 276 + add_test ~name:"html5rw_triple_roundtrip" [structured_html_gen] @@ fun input -> 277 + try 278 + let r1 = Html5rw.parse (reader_of_string input) in 279 + let s1 = serialize r1 in 280 + 281 + let r2 = Html5rw.parse (reader_of_string s1) in 282 + let s2 = serialize r2 in 283 + 284 + let r3 = Html5rw.parse (reader_of_string s2) in 285 + let s3 = serialize r3 in 286 + 287 + (* By the third roundtrip, output should be stable *) 288 + if s2 <> s3 then begin 289 + Printf.printf "\nTriple roundtrip not stable:\n"; 290 + Printf.printf "s2: %s\n" (String.escaped (String.sub s2 0 (min 200 (String.length s2)))); 291 + Printf.printf "s3: %s\n" (String.escaped (String.sub s3 0 (min 200 (String.length s3)))); 292 + check false 293 + end else 294 + check true 295 + with e -> 296 + Printf.printf "\nTriple roundtrip exception: %s\n" (Printexc.to_string e); 297 + check false 298 + 299 + (* ========================================================================== 300 + Test 3: Serialization idempotence 301 + ========================================================================== *) 302 + 303 + let () = 304 + add_test ~name:"html5rw_serialize_idempotent" [html_gen] @@ fun input -> 305 + try 306 + let result = Html5rw.parse (reader_of_string input) in 307 + let s1 = serialize result in 308 + let s2 = serialize result in 309 + if s1 <> s2 then begin 310 + Printf.printf "\nSerialization not idempotent!\n"; 311 + check false 312 + end else 313 + check true 314 + with e -> 315 + Printf.printf "\nSerialization exception: %s\n" (Printexc.to_string e); 316 + check false 317 + 318 + (* ========================================================================== 319 + Test 4: CSS Selector crash resistance 320 + ========================================================================== *) 321 + 322 + let () = 323 + add_test ~name:"html5rw_selector_no_crash" [selector_gen; html_gen] @@ fun selector html -> 324 + try 325 + let result = Html5rw.parse (reader_of_string html) in 326 + let _ = Html5rw.query result selector in 327 + check true 328 + with 329 + | Html5rw.Selector.Selector_error _ -> check true (* Expected for malformed selectors *) 330 + | e -> 331 + Printf.printf "\nUnexpected selector exception: %s\n" (Printexc.to_string e); 332 + Printf.printf "Selector: %s\n" (String.escaped selector); 333 + check false 334 + 335 + let () = 336 + add_test ~name:"html5rw_matches_no_crash" [selector_gen; html_gen] @@ fun selector html -> 337 + try 338 + let result = Html5rw.parse (reader_of_string html) in 339 + let root = Html5rw.root result in 340 + let _ = Html5rw.matches root selector in 341 + check true 342 + with 343 + | Html5rw.Selector.Selector_error _ -> check true 344 + | e -> 345 + Printf.printf "\nUnexpected matches exception: %s\n" (Printexc.to_string e); 346 + check false 347 + 348 + (* ========================================================================== 349 + Test 5: Fragment parsing 350 + ========================================================================== *) 351 + 352 + let () = 353 + add_test ~name:"html5rw_fragment_no_crash" [fragment_context_gen; html_gen] 354 + @@ fun ctx_tag html -> 355 + try 356 + let ctx = Html5rw.make_fragment_context ~tag_name:ctx_tag () in 357 + let _ = Html5rw.parse ~fragment_context:ctx (reader_of_string html) in 358 + check true 359 + with e -> 360 + Printf.printf "\nFragment parse exception with context '%s': %s\n" 361 + ctx_tag (Printexc.to_string e); 362 + check false 363 + 364 + (* Fragment roundtrip *) 365 + let () = 366 + add_test ~name:"html5rw_fragment_roundtrip" [fragment_context_gen; structured_html_gen] 367 + @@ fun ctx_tag html -> 368 + try 369 + let ctx = Html5rw.make_fragment_context ~tag_name:ctx_tag () in 370 + let r1 = Html5rw.parse ~fragment_context:ctx (reader_of_string html) in 371 + let s1 = serialize r1 in 372 + 373 + let r2 = Html5rw.parse ~fragment_context:ctx (reader_of_string s1) in 374 + let s2 = serialize r2 in 375 + 376 + let r3 = Html5rw.parse ~fragment_context:ctx (reader_of_string s2) in 377 + let s3 = serialize r3 in 378 + 379 + if s2 <> s3 then begin 380 + Printf.printf "\nFragment roundtrip not stable with context '%s'\n" ctx_tag; 381 + Printf.printf "Input: %s\n" (String.escaped (String.sub html 0 (min 100 (String.length html)))); 382 + Printf.printf "s1: %s\n" (String.escaped (String.sub s1 0 (min 100 (String.length s1)))); 383 + Printf.printf "s2: %s\n" (String.escaped (String.sub s2 0 (min 100 (String.length s2)))); 384 + Printf.printf "s3: %s\n" (String.escaped (String.sub s3 0 (min 100 (String.length s3)))); 385 + check false 386 + end else 387 + check true 388 + with e -> 389 + Printf.printf "\nFragment roundtrip exception: %s\n" (Printexc.to_string e); 390 + check false 391 + 392 + (* ========================================================================== 393 + Test 6: DOM manipulation consistency 394 + ========================================================================== *) 395 + 396 + let () = 397 + add_test ~name:"html5rw_dom_manipulation" [tag_name_gen; bytes] @@ fun tag text -> 398 + try 399 + (* Create element, add text, serialize, reparse *) 400 + let elem = Html5rw.create_element tag () in 401 + let text_node = Html5rw.create_text text in 402 + Html5rw.append_child elem text_node; 403 + 404 + (* Create a document to hold it *) 405 + let doc = Html5rw.create_document () in 406 + let html = Html5rw.create_element "html" () in 407 + let body = Html5rw.create_element "body" () in 408 + Html5rw.append_child doc html; 409 + Html5rw.append_child html body; 410 + Html5rw.append_child body elem; 411 + 412 + (* Serialize via Dom.to_html *) 413 + let serialized = Html5rw.Dom.to_html ~pretty:false doc in 414 + 415 + (* Reparse *) 416 + let result = Html5rw.parse (reader_of_string serialized) in 417 + let _ = Html5rw.to_string result in 418 + check true 419 + with e -> 420 + Printf.printf "\nDOM manipulation exception: %s\n" (Printexc.to_string e); 421 + check false 422 + 423 + (* ========================================================================== 424 + Test 7: Text extraction consistency 425 + ========================================================================== *) 426 + 427 + let () = 428 + add_test ~name:"html5rw_text_extraction" [html_gen] @@ fun html -> 429 + try 430 + let result = Html5rw.parse (reader_of_string html) in 431 + let _ = Html5rw.to_text result in 432 + check true 433 + with e -> 434 + Printf.printf "\nText extraction exception: %s\n" (Printexc.to_string e); 435 + check false 436 + 437 + (* ========================================================================== 438 + Test 8: Clone consistency 439 + ========================================================================== *) 440 + 441 + let () = 442 + add_test ~name:"html5rw_clone_deep" [html_gen] @@ fun html -> 443 + try 444 + let result = Html5rw.parse (reader_of_string html) in 445 + let root = Html5rw.root result in 446 + let cloned = Html5rw.clone ~deep:true root in 447 + 448 + (* Serialize both and compare *) 449 + let original_html = Html5rw.Dom.to_html ~pretty:false root in 450 + let cloned_html = Html5rw.Dom.to_html ~pretty:false cloned in 451 + 452 + if original_html <> cloned_html then begin 453 + Printf.printf "\nClone mismatch:\n"; 454 + Printf.printf "Original: %s\n" (String.escaped (String.sub original_html 0 (min 200 (String.length original_html)))); 455 + Printf.printf "Cloned: %s\n" (String.escaped (String.sub cloned_html 0 (min 200 (String.length cloned_html)))); 456 + check false 457 + end else 458 + check true 459 + with e -> 460 + Printf.printf "\nClone exception: %s\n" (Printexc.to_string e); 461 + check false 462 + 463 + (* ========================================================================== 464 + Test 9: Error collection should not affect parsing result 465 + ========================================================================== *) 466 + 467 + let () = 468 + add_test ~name:"html5rw_error_collection_consistent" [html_gen] @@ fun html -> 469 + try 470 + let r1 = Html5rw.parse ~collect_errors:false (reader_of_string html) in 471 + let r2 = Html5rw.parse ~collect_errors:true (reader_of_string html) in 472 + 473 + let s1 = serialize r1 in 474 + let s2 = serialize r2 in 475 + 476 + if s1 <> s2 then begin 477 + Printf.printf "\nError collection changes output!\n"; 478 + Printf.printf "Without: %s\n" (String.escaped (String.sub s1 0 (min 200 (String.length s1)))); 479 + Printf.printf "With: %s\n" (String.escaped (String.sub s2 0 (min 200 (String.length s2)))); 480 + check false 481 + end else 482 + check true 483 + with e -> 484 + Printf.printf "\nError collection exception: %s\n" (Printexc.to_string e); 485 + check false 486 + 487 + (* ========================================================================== 488 + Test 10: Pretty printing should produce parseable HTML 489 + ========================================================================== *) 490 + 491 + (* Helper to normalize whitespace for comparison 492 + Pretty printing adds whitespace that becomes text nodes, so we compare 493 + text content only to verify semantic equivalence. 494 + We collapse all whitespace sequences to single spaces. *) 495 + let normalize_for_comparison result = 496 + let text = Html5rw.to_text ~separator:" " ~strip:true result in 497 + (* Collapse whitespace sequences *) 498 + let buf = Buffer.create (String.length text) in 499 + let in_space = ref false in 500 + String.iter (fun c -> 501 + match c with 502 + | ' ' | '\t' | '\n' | '\r' -> 503 + if not !in_space then begin 504 + Buffer.add_char buf ' '; 505 + in_space := true 506 + end 507 + | c -> 508 + Buffer.add_char buf c; 509 + in_space := false 510 + ) text; 511 + String.trim (Buffer.contents buf) 512 + 513 + let () = 514 + add_test ~name:"html5rw_pretty_print_parseable" [html_gen] @@ fun html -> 515 + try 516 + let r1 = Html5rw.parse (reader_of_string html) in 517 + let pretty = Html5rw.to_string ~pretty:true r1 in 518 + let compact = Html5rw.to_string ~pretty:false r1 in 519 + 520 + (* Both should reparse to have same text content *) 521 + let r_pretty = Html5rw.parse (reader_of_string pretty) in 522 + let r_compact = Html5rw.parse (reader_of_string compact) in 523 + 524 + let text_pretty = normalize_for_comparison r_pretty in 525 + let text_compact = normalize_for_comparison r_compact in 526 + 527 + if text_pretty <> text_compact then begin 528 + Printf.printf "\nPretty/compact text content mismatch!\n"; 529 + Printf.printf "Input: %s\n" (String.escaped (String.sub html 0 (min 100 (String.length html)))); 530 + Printf.printf "Pretty text: %s\n" (String.escaped text_pretty); 531 + Printf.printf "Compact text: %s\n" (String.escaped text_compact); 532 + check false 533 + end else 534 + check true 535 + with e -> 536 + Printf.printf "\nPretty print exception: %s\n" (Printexc.to_string e); 537 + check false
+7 -2
lib/html5rw/dom/dom.ml
··· 12 12 13 13 include Dom_node 14 14 15 - let to_html = Dom_serialize.to_html 16 - let to_writer = Dom_serialize.to_writer 15 + (* Wrap serialization functions to hide internal text_mode parameter *) 16 + let to_html ?pretty ?indent_size ?indent node = 17 + Dom_serialize.to_html ?pretty ?indent_size ?indent node 18 + 19 + let to_writer ?pretty ?indent_size ?indent w node = 20 + Dom_serialize.to_writer ?pretty ?indent_size ?indent w node 21 + 17 22 let to_test_format = Dom_serialize.to_test_format 18 23 let to_text = Dom_serialize.to_text
+222 -68
lib/html5rw/dom/dom_serialize.ml
··· 20 20 21 21 let is_void name = Hashtbl.mem void_elements_tbl name 22 22 23 + (* Raw text elements - content should NOT be escaped at all 24 + Per WHATWG spec: script, style, xmp, iframe, noembed, noframes 25 + Note: noscript depends on scripting being enabled (we assume it is) 26 + Note: plaintext is handled specially - it has no closing tag *) 27 + let raw_text_elements_tbl = 28 + let elements = [ 29 + "script"; "style"; "xmp"; "iframe"; "noembed"; "noframes"; "noscript" 30 + ] in 31 + let tbl = Hashtbl.create (List.length elements) in 32 + List.iter (fun e -> Hashtbl.add tbl e ()) elements; 33 + tbl 34 + 35 + let is_raw_text_element name = Hashtbl.mem raw_text_elements_tbl name 36 + 37 + (* plaintext is special: it can never be closed, everything after is raw text. 38 + We treat it as raw text but without a closing tag. *) 39 + let is_plaintext_element name = name = "plaintext" 40 + 41 + (* Escapable raw text elements - only & needs to be escaped *) 42 + let escapable_raw_text_elements_tbl = 43 + let elements = ["textarea"; "title"] in 44 + let tbl = Hashtbl.create (List.length elements) in 45 + List.iter (fun e -> Hashtbl.add tbl e ()) elements; 46 + tbl 47 + 48 + let is_escapable_raw_text_element name = Hashtbl.mem escapable_raw_text_elements_tbl name 49 + 23 50 (* Foreign attribute adjustments for test output *) 24 51 let foreign_attr_adjustments = [ 25 52 "xlink:actuate"; "xlink:arcrole"; "xlink:href"; "xlink:role"; ··· 35 62 | '&' -> Buffer.add_string buf "&amp;" 36 63 | '<' -> Buffer.add_string buf "&lt;" 37 64 | '>' -> Buffer.add_string buf "&gt;" 65 + | c -> Buffer.add_char buf c 66 + ) text; 67 + Buffer.contents buf 68 + 69 + (* Escape text for escapable raw text elements (only & needs escaping) *) 70 + let escape_escapable_raw_text text = 71 + let buf = Buffer.create (String.length text) in 72 + String.iter (fun c -> 73 + match c with 74 + | '&' -> Buffer.add_string buf "&amp;" 38 75 | c -> Buffer.add_char buf c 39 76 ) text; 40 77 Buffer.contents buf ··· 91 128 let serialize_end_tag name = 92 129 "</" ^ name ^ ">" 93 130 94 - (* Convert node to HTML string *) 95 - let rec to_html ?(pretty=true) ?(indent_size=2) ?(indent=0) node = 131 + (* Text escaping mode based on parent element *) 132 + type text_mode = Normal | Raw | EscapableRaw 133 + 134 + (* Convert node to HTML string 135 + Returns (html_string, encountered_plaintext) where encountered_plaintext 136 + indicates that a plaintext element was found and no more content should 137 + be serialized after this point (plaintext absorbs everything after it) *) 138 + let rec to_html_internal ?(pretty=true) ?(indent_size=2) ?(indent=0) ?(text_mode=Normal) node = 96 139 let prefix = if pretty then String.make (indent * indent_size) ' ' else "" in 97 140 let newline = if pretty then "\n" else "" in 98 141 142 + (* Escape text based on mode *) 143 + let escape_for_mode text = match text_mode with 144 + | Normal -> escape_text text 145 + | Raw -> text (* No escaping for script/style content *) 146 + | EscapableRaw -> escape_escapable_raw_text text 147 + in 148 + 99 149 match node.name with 100 150 | "#document" -> 101 - let parts = List.map (to_html ~pretty ~indent_size ~indent:0) node.children in 102 - String.concat newline (List.filter (fun s -> s <> "") parts) 151 + let buf = Buffer.create 256 in 152 + let first = ref true in 153 + let plaintext_found = ref false in 154 + List.iter (fun child -> 155 + if not !plaintext_found then begin 156 + let (html, pt) = to_html_internal ~pretty ~indent_size ~indent:0 ~text_mode:Normal child in 157 + if html <> "" then begin 158 + if not !first && pretty then Buffer.add_string buf newline; 159 + Buffer.add_string buf html; 160 + first := false 161 + end; 162 + if pt then plaintext_found := true 163 + end 164 + ) node.children; 165 + (Buffer.contents buf, !plaintext_found) 103 166 104 167 | "#document-fragment" -> 105 - let parts = List.map (to_html ~pretty ~indent_size ~indent) node.children in 106 - String.concat newline (List.filter (fun s -> s <> "") parts) 168 + let buf = Buffer.create 256 in 169 + let first = ref true in 170 + let plaintext_found = ref false in 171 + List.iter (fun child -> 172 + if not !plaintext_found then begin 173 + let (html, pt) = to_html_internal ~pretty ~indent_size ~indent ~text_mode child in 174 + if html <> "" then begin 175 + if not !first && pretty then Buffer.add_string buf newline; 176 + Buffer.add_string buf html; 177 + first := false 178 + end; 179 + if pt then plaintext_found := true 180 + end 181 + ) node.children; 182 + (Buffer.contents buf, !plaintext_found) 107 183 108 184 | "#text" -> 109 185 let text = node.data in 110 - if pretty then 186 + if pretty && text_mode = Normal then 111 187 let trimmed = String.trim text in 112 - if trimmed = "" then "" 113 - else prefix ^ escape_text trimmed 114 - else escape_text text 188 + if trimmed = "" then ("", false) 189 + else (prefix ^ escape_for_mode trimmed, false) 190 + else (escape_for_mode text, false) 115 191 116 192 | "#comment" -> 117 - prefix ^ "<!--" ^ node.data ^ "-->" 193 + (prefix ^ "<!--" ^ node.data ^ "-->", false) 118 194 119 195 | "!doctype" -> 120 - prefix ^ "<!DOCTYPE html>" 196 + (prefix ^ "<!DOCTYPE html>", false) 121 197 122 198 | name -> 123 199 let open_tag = serialize_start_tag name node.attrs in 124 200 125 201 if is_void name then 126 - prefix ^ open_tag 127 - else if node.children = [] then 128 - prefix ^ open_tag ^ serialize_end_tag name 202 + (prefix ^ open_tag, false) 203 + else if is_plaintext_element name then begin 204 + (* plaintext is special: it cannot be closed once opened. 205 + We serialize content as raw text without a closing tag. 206 + Also signal that plaintext was encountered so ancestors 207 + don't add closing tags. *) 208 + let text = String.concat "" (List.map (fun c -> c.data) node.children) in 209 + (prefix ^ open_tag ^ text, true) 210 + end else if node.children = [] then 211 + (prefix ^ open_tag ^ serialize_end_tag name, false) 129 212 else begin 213 + (* Determine text mode for children based on this element *) 214 + let child_text_mode = 215 + if is_raw_text_element name then Raw 216 + else if is_escapable_raw_text_element name then EscapableRaw 217 + else Normal 218 + in 130 219 (* Check if all children are text *) 131 220 let all_text = List.for_all is_text node.children in 132 - if all_text && pretty then 221 + if all_text then begin 133 222 let text = String.concat "" (List.map (fun c -> c.data) node.children) in 134 - prefix ^ open_tag ^ escape_text text ^ serialize_end_tag name 135 - else begin 136 - let parts = [prefix ^ open_tag] in 137 - let child_parts = List.filter_map (fun child -> 138 - let html = to_html ~pretty ~indent_size ~indent:(indent + 1) child in 139 - if html = "" then None else Some html 140 - ) node.children in 141 - let parts = parts @ child_parts @ [prefix ^ serialize_end_tag name] in 142 - String.concat newline parts 223 + let escaped = match child_text_mode with 224 + | Normal -> escape_text text 225 + | Raw -> text 226 + | EscapableRaw -> escape_escapable_raw_text text 227 + in 228 + (prefix ^ open_tag ^ escaped ^ serialize_end_tag name, false) 229 + end else begin 230 + let buf = Buffer.create 256 in 231 + Buffer.add_string buf (prefix ^ open_tag); 232 + let plaintext_found = ref false in 233 + List.iter (fun child -> 234 + if not !plaintext_found then begin 235 + let (html, pt) = to_html_internal ~pretty ~indent_size ~indent:(indent + 1) ~text_mode:child_text_mode child in 236 + if html <> "" then begin 237 + Buffer.add_string buf newline; 238 + Buffer.add_string buf html 239 + end; 240 + if pt then plaintext_found := true 241 + end 242 + ) node.children; 243 + (* Only add closing tag if plaintext wasn't found *) 244 + if not !plaintext_found then begin 245 + Buffer.add_string buf newline; 246 + Buffer.add_string buf (prefix ^ serialize_end_tag name) 247 + end; 248 + (Buffer.contents buf, !plaintext_found) 143 249 end 144 250 end 251 + 252 + (* Public wrapper that discards the plaintext flag *) 253 + let to_html ?(pretty=true) ?(indent_size=2) ?(indent=0) ?(text_mode=Normal) node = 254 + fst (to_html_internal ~pretty ~indent_size ~indent ~text_mode node) 145 255 146 256 (* Get qualified name for test format *) 147 257 let qualified_name node = ··· 226 336 if strip then String.trim combined else combined 227 337 228 338 (* Streaming serialization to a Bytes.Writer.t 229 - Writes HTML directly to the writer without building intermediate strings *) 230 - let rec to_writer ?(pretty=true) ?(indent_size=2) ?(indent=0) (w : Bytes.Writer.t) node = 339 + Writes HTML directly to the writer without building intermediate strings 340 + Returns true if a plaintext element was encountered (stops further serialization) *) 341 + let rec to_writer_internal ?(pretty=true) ?(indent_size=2) ?(indent=0) ?(text_mode=Normal) (w : Bytes.Writer.t) node = 231 342 let write s = Bytes.Writer.write_string w s in 232 343 let write_prefix () = if pretty then write (String.make (indent * indent_size) ' ') in 233 344 let write_newline () = if pretty then write "\n" in 234 345 346 + (* Escape text based on mode *) 347 + let escape_for_mode text = match text_mode with 348 + | Normal -> escape_text text 349 + | Raw -> text 350 + | EscapableRaw -> escape_escapable_raw_text text 351 + in 352 + 235 353 match node.name with 236 354 | "#document" -> 237 - let rec write_children first = function 238 - | [] -> () 239 - | child :: rest -> 240 - if not first && pretty then write_newline (); 241 - to_writer ~pretty ~indent_size ~indent:0 w child; 242 - write_children false rest 243 - in 244 - write_children true node.children 355 + let plaintext_found = ref false in 356 + let first = ref true in 357 + List.iter (fun child -> 358 + if not !plaintext_found then begin 359 + if not !first && pretty then write_newline (); 360 + let pt = to_writer_internal ~pretty ~indent_size ~indent:0 ~text_mode:Normal w child in 361 + first := false; 362 + if pt then plaintext_found := true 363 + end 364 + ) node.children; 365 + !plaintext_found 245 366 246 367 | "#document-fragment" -> 247 - let rec write_children first = function 248 - | [] -> () 249 - | child :: rest -> 250 - if not first && pretty then write_newline (); 251 - to_writer ~pretty ~indent_size ~indent w child; 252 - write_children false rest 253 - in 254 - write_children true node.children 368 + let plaintext_found = ref false in 369 + let first = ref true in 370 + List.iter (fun child -> 371 + if not !plaintext_found then begin 372 + if not !first && pretty then write_newline (); 373 + let pt = to_writer_internal ~pretty ~indent_size ~indent ~text_mode w child in 374 + first := false; 375 + if pt then plaintext_found := true 376 + end 377 + ) node.children; 378 + !plaintext_found 255 379 256 380 | "#text" -> 257 381 let text = node.data in 258 - if pretty then begin 382 + if pretty && text_mode = Normal then begin 259 383 let trimmed = String.trim text in 260 384 if trimmed <> "" then begin 261 385 write_prefix (); 262 - write (escape_text trimmed) 386 + write (escape_for_mode trimmed) 263 387 end 264 388 end else 265 - write (escape_text text) 389 + write (escape_for_mode text); 390 + false 266 391 267 392 | "#comment" -> 268 393 write_prefix (); 269 394 write "<!--"; 270 395 write node.data; 271 - write "-->" 396 + write "-->"; 397 + false 272 398 273 399 | "!doctype" -> 274 400 write_prefix (); 275 - write "<!DOCTYPE html>" 401 + write "<!DOCTYPE html>"; 402 + false 276 403 277 404 | name -> 278 405 write_prefix (); 279 406 write (serialize_start_tag name node.attrs); 280 407 281 - if not (is_void name) then begin 282 - if node.children = [] then 283 - write (serialize_end_tag name) 284 - else begin 285 - (* Check if all children are text *) 286 - let all_text = List.for_all is_text node.children in 287 - if all_text && pretty then begin 288 - let text = String.concat "" (List.map (fun c -> c.data) node.children) in 289 - write (escape_text text); 290 - write (serialize_end_tag name) 291 - end else begin 292 - let rec write_children = function 293 - | [] -> () 294 - | child :: rest -> 295 - write_newline (); 296 - to_writer ~pretty ~indent_size ~indent:(indent + 1) w child; 297 - write_children rest 298 - in 299 - write_children node.children; 408 + if is_void name then 409 + false (* No end tag for void elements *) 410 + else if is_plaintext_element name then begin 411 + (* plaintext is special: cannot be closed, content is raw *) 412 + let text = String.concat "" (List.map (fun c -> c.data) node.children) in 413 + write text; 414 + (* No closing tag for plaintext, signal to stop further serialization *) 415 + true 416 + end else if node.children = [] then begin 417 + write (serialize_end_tag name); 418 + false 419 + end else begin 420 + (* Determine text mode for children based on this element *) 421 + let child_text_mode = 422 + if is_raw_text_element name then Raw 423 + else if is_escapable_raw_text_element name then EscapableRaw 424 + else Normal 425 + in 426 + (* Check if all children are text *) 427 + let all_text = List.for_all is_text node.children in 428 + if all_text then begin 429 + let text = String.concat "" (List.map (fun c -> c.data) node.children) in 430 + let escaped = match child_text_mode with 431 + | Normal -> escape_text text 432 + | Raw -> text 433 + | EscapableRaw -> escape_escapable_raw_text text 434 + in 435 + write escaped; 436 + write (serialize_end_tag name); 437 + false 438 + end else begin 439 + let plaintext_found = ref false in 440 + List.iter (fun child -> 441 + if not !plaintext_found then begin 442 + write_newline (); 443 + let pt = to_writer_internal ~pretty ~indent_size ~indent:(indent + 1) ~text_mode:child_text_mode w child in 444 + if pt then plaintext_found := true 445 + end 446 + ) node.children; 447 + (* Only add closing tag if plaintext wasn't found *) 448 + if not !plaintext_found then begin 300 449 write_newline (); 301 450 write_prefix (); 302 451 write (serialize_end_tag name) 303 - end 452 + end; 453 + !plaintext_found 304 454 end 305 455 end 456 + 457 + (* Public wrapper that discards the plaintext flag *) 458 + let to_writer ?(pretty=true) ?(indent_size=2) ?(indent=0) (w : Bytes.Writer.t) node = 459 + ignore (to_writer_internal ~pretty ~indent_size ~indent w node)