OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at f7c69be4eae5476a0985d55de71f2cc34c8d5361 305 lines 10 kB view raw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: MIT 4 ---------------------------------------------------------------------------*) 5 6(* HTML5 DOM serialization *) 7 8open Bytesrw 9open Dom_node 10 11(* Void elements that don't have end tags - O(1) hashtable lookup *) 12let void_elements_tbl = 13 let elements = [ 14 "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input"; 15 "link"; "meta"; "source"; "track"; "wbr" 16 ] in 17 let tbl = Hashtbl.create (List.length elements) in 18 List.iter (fun e -> Hashtbl.add tbl e ()) elements; 19 tbl 20 21let is_void name = Hashtbl.mem void_elements_tbl name 22 23(* Foreign attribute adjustments for test output *) 24let foreign_attr_adjustments = [ 25 "xlink:actuate"; "xlink:arcrole"; "xlink:href"; "xlink:role"; 26 "xlink:show"; "xlink:title"; "xlink:type"; "xml:lang"; "xml:space"; 27 "xmlns:xlink" 28] 29 30(* Escape text content *) 31let escape_text text = 32 let buf = Buffer.create (String.length text) in 33 String.iter (fun c -> 34 match c with 35 | '&' -> Buffer.add_string buf "&amp;" 36 | '<' -> Buffer.add_string buf "&lt;" 37 | '>' -> Buffer.add_string buf "&gt;" 38 | c -> Buffer.add_char buf c 39 ) text; 40 Buffer.contents buf 41 42(* Choose quote character for attribute value *) 43let choose_attr_quote value = 44 if String.contains value '"' && not (String.contains value '\'') then '\'' 45 else '"' 46 47(* Escape attribute value *) 48let escape_attr_value value quote_char = 49 let buf = Buffer.create (String.length value) in 50 String.iter (fun c -> 51 match c with 52 | '&' -> Buffer.add_string buf "&amp;" 53 | '"' when quote_char = '"' -> Buffer.add_string buf "&quot;" 54 | '\'' when quote_char = '\'' -> Buffer.add_string buf "&#39;" 55 | c -> Buffer.add_char buf c 56 ) value; 57 Buffer.contents buf 58 59(* Check if attribute value can be unquoted *) 60let can_unquote_attr_value value = 61 if String.length value = 0 then false 62 else 63 let invalid = ref false in 64 String.iter (fun c -> 65 if c = '>' || c = '"' || c = '\'' || c = '=' || c = '`' || 66 c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' then 67 invalid := true 68 ) value; 69 not !invalid 70 71(* Serialize start tag - per WHATWG spec, attribute values must be quoted *) 72let serialize_start_tag name attrs = 73 let buf = Buffer.create 64 in 74 Buffer.add_char buf '<'; 75 Buffer.add_string buf name; 76 List.iter (fun (key, value) -> 77 Buffer.add_char buf ' '; 78 Buffer.add_string buf key; 79 if value <> "" then begin 80 (* WHATWG serialization algorithm requires double quotes around values *) 81 Buffer.add_char buf '='; 82 Buffer.add_char buf '"'; 83 Buffer.add_string buf (escape_attr_value value '"'); 84 Buffer.add_char buf '"' 85 end 86 ) attrs; 87 Buffer.add_char buf '>'; 88 Buffer.contents buf 89 90(* Serialize end tag *) 91let serialize_end_tag name = 92 "</" ^ name ^ ">" 93 94(* Convert node to HTML string *) 95let rec to_html ?(pretty=true) ?(indent_size=2) ?(indent=0) node = 96 let prefix = if pretty then String.make (indent * indent_size) ' ' else "" in 97 let newline = if pretty then "\n" else "" in 98 99 match node.name with 100 | "#document" -> 101 let parts = List.map (to_html ~pretty ~indent_size ~indent:0) node.children in 102 String.concat newline (List.filter (fun s -> s <> "") parts) 103 104 | "#document-fragment" -> 105 let parts = List.map (to_html ~pretty ~indent_size ~indent) node.children in 106 String.concat newline (List.filter (fun s -> s <> "") parts) 107 108 | "#text" -> 109 let text = node.data in 110 if pretty then 111 let trimmed = String.trim text in 112 if trimmed = "" then "" 113 else prefix ^ escape_text trimmed 114 else escape_text text 115 116 | "#comment" -> 117 prefix ^ "<!--" ^ node.data ^ "-->" 118 119 | "!doctype" -> 120 prefix ^ "<!DOCTYPE html>" 121 122 | name -> 123 let open_tag = serialize_start_tag name node.attrs in 124 125 if is_void name then 126 prefix ^ open_tag 127 else if node.children = [] then 128 prefix ^ open_tag ^ serialize_end_tag name 129 else begin 130 (* Check if all children are text *) 131 let all_text = List.for_all is_text node.children in 132 if all_text && pretty then 133 let text = String.concat "" (List.map (fun c -> c.data) node.children) in 134 prefix ^ open_tag ^ escape_text text ^ serialize_end_tag name 135 else begin 136 let parts = [prefix ^ open_tag] in 137 let child_parts = List.filter_map (fun child -> 138 let html = to_html ~pretty ~indent_size ~indent:(indent + 1) child in 139 if html = "" then None else Some html 140 ) node.children in 141 let parts = parts @ child_parts @ [prefix ^ serialize_end_tag name] in 142 String.concat newline parts 143 end 144 end 145 146(* Get qualified name for test format *) 147let qualified_name node = 148 match node.namespace with 149 | Some "svg" -> "svg " ^ node.name 150 | Some "mathml" -> "math " ^ node.name 151 | Some ns when ns <> "html" -> ns ^ " " ^ node.name 152 | _ -> node.name 153 154(* Format attributes for test output *) 155let attrs_to_test_format node indent = 156 if node.attrs = [] then [] 157 else begin 158 let padding = String.make (indent + 2) ' ' in 159 (* Compute display names first, then sort by display name for canonical output *) 160 let with_display_names = List.map (fun (name, value) -> 161 let display_name = 162 match node.namespace with 163 | Some ns when ns <> "html" && List.mem (String.lowercase_ascii name) foreign_attr_adjustments -> 164 String.map (fun c -> if c = ':' then ' ' else c) name 165 | _ -> name 166 in 167 (display_name, value) 168 ) node.attrs in 169 let sorted = List.sort (fun (a, _) (b, _) -> String.compare a b) with_display_names in 170 List.map (fun (display_name, value) -> 171 Printf.sprintf "| %s%s=\"%s\"" padding display_name value 172 ) sorted 173 end 174 175(* Convert node to html5lib test format *) 176let rec to_test_format ?(indent=0) node = 177 match node.name with 178 | "#document" | "#document-fragment" -> 179 let parts = List.map (to_test_format ~indent:0) node.children in 180 String.concat "\n" parts 181 182 | "#comment" -> 183 Printf.sprintf "| %s<!-- %s -->" (String.make indent ' ') node.data 184 185 | "!doctype" -> 186 let dt = match node.doctype with Some d -> d | None -> { name = None; public_id = None; system_id = None } in 187 let name_str = match dt.name with Some n -> " " ^ n | None -> " " in 188 let ids_str = 189 match dt.public_id, dt.system_id with 190 | None, None -> "" 191 | pub, sys -> 192 let pub_str = match pub with Some p -> p | None -> "" in 193 let sys_str = match sys with Some s -> s | None -> "" in 194 Printf.sprintf " \"%s\" \"%s\"" pub_str sys_str 195 in 196 Printf.sprintf "| <!DOCTYPE%s%s>" name_str ids_str 197 198 | "#text" -> 199 Printf.sprintf "| %s\"%s\"" (String.make indent ' ') node.data 200 201 | "template" when node.namespace = None || node.namespace = Some "html" -> 202 let line = Printf.sprintf "| %s<%s>" (String.make indent ' ') (qualified_name node) in 203 let attr_lines = attrs_to_test_format node indent in 204 let content_line = Printf.sprintf "| %scontent" (String.make (indent + 2) ' ') in 205 let content_children = 206 match node.template_content with 207 | Some tc -> List.map (to_test_format ~indent:(indent + 4)) tc.children 208 | None -> [] 209 in 210 String.concat "\n" ([line] @ attr_lines @ [content_line] @ content_children) 211 212 | _ -> 213 let line = Printf.sprintf "| %s<%s>" (String.make indent ' ') (qualified_name node) in 214 let attr_lines = attrs_to_test_format node indent in 215 let child_lines = List.map (to_test_format ~indent:(indent + 2)) node.children in 216 String.concat "\n" ([line] @ attr_lines @ child_lines) 217 218(* Extract text content *) 219let to_text ?(separator=" ") ?(strip=true) node = 220 let rec collect_text n = 221 if is_text n then [n.data] 222 else List.concat_map collect_text n.children 223 in 224 let texts = collect_text node in 225 let combined = String.concat separator texts in 226 if strip then String.trim combined else combined 227 228(* Streaming serialization to a Bytes.Writer.t 229 Writes HTML directly to the writer without building intermediate strings *) 230let rec to_writer ?(pretty=true) ?(indent_size=2) ?(indent=0) (w : Bytes.Writer.t) node = 231 let write s = Bytes.Writer.write_string w s in 232 let write_prefix () = if pretty then write (String.make (indent * indent_size) ' ') in 233 let write_newline () = if pretty then write "\n" in 234 235 match node.name with 236 | "#document" -> 237 let rec write_children first = function 238 | [] -> () 239 | child :: rest -> 240 if not first && pretty then write_newline (); 241 to_writer ~pretty ~indent_size ~indent:0 w child; 242 write_children false rest 243 in 244 write_children true node.children 245 246 | "#document-fragment" -> 247 let rec write_children first = function 248 | [] -> () 249 | child :: rest -> 250 if not first && pretty then write_newline (); 251 to_writer ~pretty ~indent_size ~indent w child; 252 write_children false rest 253 in 254 write_children true node.children 255 256 | "#text" -> 257 let text = node.data in 258 if pretty then begin 259 let trimmed = String.trim text in 260 if trimmed <> "" then begin 261 write_prefix (); 262 write (escape_text trimmed) 263 end 264 end else 265 write (escape_text text) 266 267 | "#comment" -> 268 write_prefix (); 269 write "<!--"; 270 write node.data; 271 write "-->" 272 273 | "!doctype" -> 274 write_prefix (); 275 write "<!DOCTYPE html>" 276 277 | name -> 278 write_prefix (); 279 write (serialize_start_tag name node.attrs); 280 281 if not (is_void name) then begin 282 if node.children = [] then 283 write (serialize_end_tag name) 284 else begin 285 (* Check if all children are text *) 286 let all_text = List.for_all is_text node.children in 287 if all_text && pretty then begin 288 let text = String.concat "" (List.map (fun c -> c.data) node.children) in 289 write (escape_text text); 290 write (serialize_end_tag name) 291 end else begin 292 let rec write_children = function 293 | [] -> () 294 | child :: rest -> 295 write_newline (); 296 to_writer ~pretty ~indent_size ~indent:(indent + 1) w child; 297 write_children rest 298 in 299 write_children node.children; 300 write_newline (); 301 write_prefix (); 302 write (serialize_end_tag name) 303 end 304 end 305 end