OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(* HTML5 DOM serialization *)
7
8open Bytesrw
9open Dom_node
10
11(* Void elements that don't have end tags - O(1) hashtable lookup *)
12let void_elements_tbl =
13 let elements = [
14 "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
15 "link"; "meta"; "source"; "track"; "wbr"
16 ] in
17 let tbl = Hashtbl.create (List.length elements) in
18 List.iter (fun e -> Hashtbl.add tbl e ()) elements;
19 tbl
20
21let is_void name = Hashtbl.mem void_elements_tbl name
22
23(* Foreign attribute adjustments for test output *)
24let foreign_attr_adjustments = [
25 "xlink:actuate"; "xlink:arcrole"; "xlink:href"; "xlink:role";
26 "xlink:show"; "xlink:title"; "xlink:type"; "xml:lang"; "xml:space";
27 "xmlns:xlink"
28]
29
30(* Escape text content *)
31let escape_text text =
32 let buf = Buffer.create (String.length text) in
33 String.iter (fun c ->
34 match c with
35 | '&' -> Buffer.add_string buf "&"
36 | '<' -> Buffer.add_string buf "<"
37 | '>' -> Buffer.add_string buf ">"
38 | c -> Buffer.add_char buf c
39 ) text;
40 Buffer.contents buf
41
42(* Choose quote character for attribute value *)
43let choose_attr_quote value =
44 if String.contains value '"' && not (String.contains value '\'') then '\''
45 else '"'
46
47(* Escape attribute value *)
48let escape_attr_value value quote_char =
49 let buf = Buffer.create (String.length value) in
50 String.iter (fun c ->
51 match c with
52 | '&' -> Buffer.add_string buf "&"
53 | '"' when quote_char = '"' -> Buffer.add_string buf """
54 | '\'' when quote_char = '\'' -> Buffer.add_string buf "'"
55 | c -> Buffer.add_char buf c
56 ) value;
57 Buffer.contents buf
58
59(* Check if attribute value can be unquoted *)
60let can_unquote_attr_value value =
61 if String.length value = 0 then false
62 else
63 let invalid = ref false in
64 String.iter (fun c ->
65 if c = '>' || c = '"' || c = '\'' || c = '=' || c = '`' ||
66 c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' then
67 invalid := true
68 ) value;
69 not !invalid
70
71(* Serialize start tag - per WHATWG spec, attribute values must be quoted *)
72let serialize_start_tag name attrs =
73 let buf = Buffer.create 64 in
74 Buffer.add_char buf '<';
75 Buffer.add_string buf name;
76 List.iter (fun (key, value) ->
77 Buffer.add_char buf ' ';
78 Buffer.add_string buf key;
79 if value <> "" then begin
80 (* WHATWG serialization algorithm requires double quotes around values *)
81 Buffer.add_char buf '=';
82 Buffer.add_char buf '"';
83 Buffer.add_string buf (escape_attr_value value '"');
84 Buffer.add_char buf '"'
85 end
86 ) attrs;
87 Buffer.add_char buf '>';
88 Buffer.contents buf
89
90(* Serialize end tag *)
91let serialize_end_tag name =
92 "</" ^ name ^ ">"
93
94(* Convert node to HTML string *)
95let rec to_html ?(pretty=true) ?(indent_size=2) ?(indent=0) node =
96 let prefix = if pretty then String.make (indent * indent_size) ' ' else "" in
97 let newline = if pretty then "\n" else "" in
98
99 match node.name with
100 | "#document" ->
101 let parts = List.map (to_html ~pretty ~indent_size ~indent:0) node.children in
102 String.concat newline (List.filter (fun s -> s <> "") parts)
103
104 | "#document-fragment" ->
105 let parts = List.map (to_html ~pretty ~indent_size ~indent) node.children in
106 String.concat newline (List.filter (fun s -> s <> "") parts)
107
108 | "#text" ->
109 let text = node.data in
110 if pretty then
111 let trimmed = String.trim text in
112 if trimmed = "" then ""
113 else prefix ^ escape_text trimmed
114 else escape_text text
115
116 | "#comment" ->
117 prefix ^ "<!--" ^ node.data ^ "-->"
118
119 | "!doctype" ->
120 prefix ^ "<!DOCTYPE html>"
121
122 | name ->
123 let open_tag = serialize_start_tag name node.attrs in
124
125 if is_void name then
126 prefix ^ open_tag
127 else if node.children = [] then
128 prefix ^ open_tag ^ serialize_end_tag name
129 else begin
130 (* Check if all children are text *)
131 let all_text = List.for_all is_text node.children in
132 if all_text && pretty then
133 let text = String.concat "" (List.map (fun c -> c.data) node.children) in
134 prefix ^ open_tag ^ escape_text text ^ serialize_end_tag name
135 else begin
136 let parts = [prefix ^ open_tag] in
137 let child_parts = List.filter_map (fun child ->
138 let html = to_html ~pretty ~indent_size ~indent:(indent + 1) child in
139 if html = "" then None else Some html
140 ) node.children in
141 let parts = parts @ child_parts @ [prefix ^ serialize_end_tag name] in
142 String.concat newline parts
143 end
144 end
145
146(* Get qualified name for test format *)
147let qualified_name node =
148 match node.namespace with
149 | Some "svg" -> "svg " ^ node.name
150 | Some "mathml" -> "math " ^ node.name
151 | Some ns when ns <> "html" -> ns ^ " " ^ node.name
152 | _ -> node.name
153
154(* Format attributes for test output *)
155let attrs_to_test_format node indent =
156 if node.attrs = [] then []
157 else begin
158 let padding = String.make (indent + 2) ' ' in
159 (* Compute display names first, then sort by display name for canonical output *)
160 let with_display_names = List.map (fun (name, value) ->
161 let display_name =
162 match node.namespace with
163 | Some ns when ns <> "html" && List.mem (String.lowercase_ascii name) foreign_attr_adjustments ->
164 String.map (fun c -> if c = ':' then ' ' else c) name
165 | _ -> name
166 in
167 (display_name, value)
168 ) node.attrs in
169 let sorted = List.sort (fun (a, _) (b, _) -> String.compare a b) with_display_names in
170 List.map (fun (display_name, value) ->
171 Printf.sprintf "| %s%s=\"%s\"" padding display_name value
172 ) sorted
173 end
174
175(* Convert node to html5lib test format *)
176let rec to_test_format ?(indent=0) node =
177 match node.name with
178 | "#document" | "#document-fragment" ->
179 let parts = List.map (to_test_format ~indent:0) node.children in
180 String.concat "\n" parts
181
182 | "#comment" ->
183 Printf.sprintf "| %s<!-- %s -->" (String.make indent ' ') node.data
184
185 | "!doctype" ->
186 let dt = match node.doctype with Some d -> d | None -> { name = None; public_id = None; system_id = None } in
187 let name_str = match dt.name with Some n -> " " ^ n | None -> " " in
188 let ids_str =
189 match dt.public_id, dt.system_id with
190 | None, None -> ""
191 | pub, sys ->
192 let pub_str = match pub with Some p -> p | None -> "" in
193 let sys_str = match sys with Some s -> s | None -> "" in
194 Printf.sprintf " \"%s\" \"%s\"" pub_str sys_str
195 in
196 Printf.sprintf "| <!DOCTYPE%s%s>" name_str ids_str
197
198 | "#text" ->
199 Printf.sprintf "| %s\"%s\"" (String.make indent ' ') node.data
200
201 | "template" when node.namespace = None || node.namespace = Some "html" ->
202 let line = Printf.sprintf "| %s<%s>" (String.make indent ' ') (qualified_name node) in
203 let attr_lines = attrs_to_test_format node indent in
204 let content_line = Printf.sprintf "| %scontent" (String.make (indent + 2) ' ') in
205 let content_children =
206 match node.template_content with
207 | Some tc -> List.map (to_test_format ~indent:(indent + 4)) tc.children
208 | None -> []
209 in
210 String.concat "\n" ([line] @ attr_lines @ [content_line] @ content_children)
211
212 | _ ->
213 let line = Printf.sprintf "| %s<%s>" (String.make indent ' ') (qualified_name node) in
214 let attr_lines = attrs_to_test_format node indent in
215 let child_lines = List.map (to_test_format ~indent:(indent + 2)) node.children in
216 String.concat "\n" ([line] @ attr_lines @ child_lines)
217
218(* Extract text content *)
219let to_text ?(separator=" ") ?(strip=true) node =
220 let rec collect_text n =
221 if is_text n then [n.data]
222 else List.concat_map collect_text n.children
223 in
224 let texts = collect_text node in
225 let combined = String.concat separator texts in
226 if strip then String.trim combined else combined
227
228(* Streaming serialization to a Bytes.Writer.t
229 Writes HTML directly to the writer without building intermediate strings *)
230let rec to_writer ?(pretty=true) ?(indent_size=2) ?(indent=0) (w : Bytes.Writer.t) node =
231 let write s = Bytes.Writer.write_string w s in
232 let write_prefix () = if pretty then write (String.make (indent * indent_size) ' ') in
233 let write_newline () = if pretty then write "\n" in
234
235 match node.name with
236 | "#document" ->
237 let rec write_children first = function
238 | [] -> ()
239 | child :: rest ->
240 if not first && pretty then write_newline ();
241 to_writer ~pretty ~indent_size ~indent:0 w child;
242 write_children false rest
243 in
244 write_children true node.children
245
246 | "#document-fragment" ->
247 let rec write_children first = function
248 | [] -> ()
249 | child :: rest ->
250 if not first && pretty then write_newline ();
251 to_writer ~pretty ~indent_size ~indent w child;
252 write_children false rest
253 in
254 write_children true node.children
255
256 | "#text" ->
257 let text = node.data in
258 if pretty then begin
259 let trimmed = String.trim text in
260 if trimmed <> "" then begin
261 write_prefix ();
262 write (escape_text trimmed)
263 end
264 end else
265 write (escape_text text)
266
267 | "#comment" ->
268 write_prefix ();
269 write "<!--";
270 write node.data;
271 write "-->"
272
273 | "!doctype" ->
274 write_prefix ();
275 write "<!DOCTYPE html>"
276
277 | name ->
278 write_prefix ();
279 write (serialize_start_tag name node.attrs);
280
281 if not (is_void name) then begin
282 if node.children = [] then
283 write (serialize_end_tag name)
284 else begin
285 (* Check if all children are text *)
286 let all_text = List.for_all is_text node.children in
287 if all_text && pretty then begin
288 let text = String.concat "" (List.map (fun c -> c.data) node.children) in
289 write (escape_text text);
290 write (serialize_end_tag name)
291 end else begin
292 let rec write_children = function
293 | [] -> ()
294 | child :: rest ->
295 write_newline ();
296 to_writer ~pretty ~indent_size ~indent:(indent + 1) w child;
297 write_children rest
298 in
299 write_children node.children;
300 write_newline ();
301 write_prefix ();
302 write (serialize_end_tag name)
303 end
304 end
305 end