(*--------------------------------------------------------------------------- Copyright (c) 2025 Anil Madhavapeddy . All rights reserved. SPDX-License-Identifier: MIT ---------------------------------------------------------------------------*) (* HTML5 DOM serialization *) open Bytesrw open Dom_node (* Void elements that don't have end tags - O(1) hashtable lookup *) let void_elements_tbl = let elements = [ "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input"; "link"; "meta"; "source"; "track"; "wbr" ] in let tbl = Hashtbl.create (List.length elements) in List.iter (fun e -> Hashtbl.add tbl e ()) elements; tbl let is_void name = Hashtbl.mem void_elements_tbl name (* Foreign attribute adjustments for test output *) let foreign_attr_adjustments = [ "xlink:actuate"; "xlink:arcrole"; "xlink:href"; "xlink:role"; "xlink:show"; "xlink:title"; "xlink:type"; "xml:lang"; "xml:space"; "xmlns:xlink" ] (* Escape text content *) let escape_text text = let buf = Buffer.create (String.length text) in String.iter (fun c -> match c with | '&' -> Buffer.add_string buf "&" | '<' -> Buffer.add_string buf "<" | '>' -> Buffer.add_string buf ">" | c -> Buffer.add_char buf c ) text; Buffer.contents buf (* Choose quote character for attribute value *) let choose_attr_quote value = if String.contains value '"' && not (String.contains value '\'') then '\'' else '"' (* Escape attribute value *) let escape_attr_value value quote_char = let buf = Buffer.create (String.length value) in String.iter (fun c -> match c with | '&' -> Buffer.add_string buf "&" | '"' when quote_char = '"' -> Buffer.add_string buf """ | '\'' when quote_char = '\'' -> Buffer.add_string buf "'" | c -> Buffer.add_char buf c ) value; Buffer.contents buf (* Check if attribute value can be unquoted *) let can_unquote_attr_value value = if String.length value = 0 then false else let invalid = ref false in String.iter (fun c -> if c = '>' || c = '"' || c = '\'' || c = '=' || c = '`' || c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' then invalid := true ) value; not !invalid (* Serialize start tag - per WHATWG spec, attribute values must be quoted *) let serialize_start_tag name attrs = let buf = Buffer.create 64 in Buffer.add_char buf '<'; Buffer.add_string buf name; List.iter (fun (key, value) -> Buffer.add_char buf ' '; Buffer.add_string buf key; if value <> "" then begin (* WHATWG serialization algorithm requires double quotes around values *) Buffer.add_char buf '='; Buffer.add_char buf '"'; Buffer.add_string buf (escape_attr_value value '"'); Buffer.add_char buf '"' end ) attrs; Buffer.add_char buf '>'; Buffer.contents buf (* Serialize end tag *) let serialize_end_tag name = "" (* Convert node to HTML string *) let rec to_html ?(pretty=true) ?(indent_size=2) ?(indent=0) node = let prefix = if pretty then String.make (indent * indent_size) ' ' else "" in let newline = if pretty then "\n" else "" in match node.name with | "#document" -> let parts = List.map (to_html ~pretty ~indent_size ~indent:0) node.children in String.concat newline (List.filter (fun s -> s <> "") parts) | "#document-fragment" -> let parts = List.map (to_html ~pretty ~indent_size ~indent) node.children in String.concat newline (List.filter (fun s -> s <> "") parts) | "#text" -> let text = node.data in if pretty then let trimmed = String.trim text in if trimmed = "" then "" else prefix ^ escape_text trimmed else escape_text text | "#comment" -> prefix ^ "" | "!doctype" -> prefix ^ "" | name -> let open_tag = serialize_start_tag name node.attrs in if is_void name then prefix ^ open_tag else if node.children = [] then prefix ^ open_tag ^ serialize_end_tag name else begin (* Check if all children are text *) let all_text = List.for_all is_text node.children in if all_text && pretty then let text = String.concat "" (List.map (fun c -> c.data) node.children) in prefix ^ open_tag ^ escape_text text ^ serialize_end_tag name else begin let parts = [prefix ^ open_tag] in let child_parts = List.filter_map (fun child -> let html = to_html ~pretty ~indent_size ~indent:(indent + 1) child in if html = "" then None else Some html ) node.children in let parts = parts @ child_parts @ [prefix ^ serialize_end_tag name] in String.concat newline parts end end (* Get qualified name for test format *) let qualified_name node = match node.namespace with | Some "svg" -> "svg " ^ node.name | Some "mathml" -> "math " ^ node.name | Some ns when ns <> "html" -> ns ^ " " ^ node.name | _ -> node.name (* Format attributes for test output *) let attrs_to_test_format node indent = if node.attrs = [] then [] else begin let padding = String.make (indent + 2) ' ' in (* Compute display names first, then sort by display name for canonical output *) let with_display_names = List.map (fun (name, value) -> let display_name = match node.namespace with | Some ns when ns <> "html" && List.mem (String.lowercase_ascii name) foreign_attr_adjustments -> String.map (fun c -> if c = ':' then ' ' else c) name | _ -> name in (display_name, value) ) node.attrs in let sorted = List.sort (fun (a, _) (b, _) -> String.compare a b) with_display_names in List.map (fun (display_name, value) -> Printf.sprintf "| %s%s=\"%s\"" padding display_name value ) sorted end (* Convert node to html5lib test format *) let rec to_test_format ?(indent=0) node = match node.name with | "#document" | "#document-fragment" -> let parts = List.map (to_test_format ~indent:0) node.children in String.concat "\n" parts | "#comment" -> Printf.sprintf "| %s" (String.make indent ' ') node.data | "!doctype" -> let dt = match node.doctype with Some d -> d | None -> { name = None; public_id = None; system_id = None } in let name_str = match dt.name with Some n -> " " ^ n | None -> " " in let ids_str = match dt.public_id, dt.system_id with | None, None -> "" | pub, sys -> let pub_str = match pub with Some p -> p | None -> "" in let sys_str = match sys with Some s -> s | None -> "" in Printf.sprintf " \"%s\" \"%s\"" pub_str sys_str in Printf.sprintf "| " name_str ids_str | "#text" -> Printf.sprintf "| %s\"%s\"" (String.make indent ' ') node.data | "template" when node.namespace = None || node.namespace = Some "html" -> let line = Printf.sprintf "| %s<%s>" (String.make indent ' ') (qualified_name node) in let attr_lines = attrs_to_test_format node indent in let content_line = Printf.sprintf "| %scontent" (String.make (indent + 2) ' ') in let content_children = match node.template_content with | Some tc -> List.map (to_test_format ~indent:(indent + 4)) tc.children | None -> [] in String.concat "\n" ([line] @ attr_lines @ [content_line] @ content_children) | _ -> let line = Printf.sprintf "| %s<%s>" (String.make indent ' ') (qualified_name node) in let attr_lines = attrs_to_test_format node indent in let child_lines = List.map (to_test_format ~indent:(indent + 2)) node.children in String.concat "\n" ([line] @ attr_lines @ child_lines) (* Extract text content *) let to_text ?(separator=" ") ?(strip=true) node = let rec collect_text n = if is_text n then [n.data] else List.concat_map collect_text n.children in let texts = collect_text node in let combined = String.concat separator texts in if strip then String.trim combined else combined (* Streaming serialization to a Bytes.Writer.t Writes HTML directly to the writer without building intermediate strings *) let rec to_writer ?(pretty=true) ?(indent_size=2) ?(indent=0) (w : Bytes.Writer.t) node = let write s = Bytes.Writer.write_string w s in let write_prefix () = if pretty then write (String.make (indent * indent_size) ' ') in let write_newline () = if pretty then write "\n" in match node.name with | "#document" -> let rec write_children first = function | [] -> () | child :: rest -> if not first && pretty then write_newline (); to_writer ~pretty ~indent_size ~indent:0 w child; write_children false rest in write_children true node.children | "#document-fragment" -> let rec write_children first = function | [] -> () | child :: rest -> if not first && pretty then write_newline (); to_writer ~pretty ~indent_size ~indent w child; write_children false rest in write_children true node.children | "#text" -> let text = node.data in if pretty then begin let trimmed = String.trim text in if trimmed <> "" then begin write_prefix (); write (escape_text trimmed) end end else write (escape_text text) | "#comment" -> write_prefix (); write "" | "!doctype" -> write_prefix (); write "" | name -> write_prefix (); write (serialize_start_tag name node.attrs); if not (is_void name) then begin if node.children = [] then write (serialize_end_tag name) else begin (* Check if all children are text *) let all_text = List.for_all is_text node.children in if all_text && pretty then begin let text = String.concat "" (List.map (fun c -> c.data) node.children) in write (escape_text text); write (serialize_end_tag name) end else begin let rec write_children = function | [] -> () | child :: rest -> write_newline (); to_writer ~pretty ~indent_size ~indent:(indent + 1) w child; write_children rest in write_children node.children; write_newline (); write_prefix (); write (serialize_end_tag name) end end end