OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* HTML5 spec constants *)
2
3(* Use Astring for string operations *)
4let lowercase = Astring.String.Ascii.lowercase
5
6(* Helper to create a hashtable set from a list for O(1) membership *)
7let make_set elements =
8 let tbl = Hashtbl.create (List.length elements) in
9 List.iter (fun e -> Hashtbl.add tbl e ()) elements;
10 tbl
11
12(* Void elements - no end tag allowed *)
13let void_elements = [
14 "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
15 "link"; "meta"; "source"; "track"; "wbr"
16]
17let void_elements_tbl = make_set void_elements
18
19(* Raw text elements - content is raw text *)
20let raw_text_elements = ["script"; "style"]
21
22(* Escapable raw text elements *)
23let escapable_raw_text_elements = ["textarea"; "title"]
24
25(* Formatting elements for adoption agency *)
26let formatting_elements = [
27 "a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small";
28 "strike"; "strong"; "tt"; "u"
29]
30let formatting_elements_tbl = make_set formatting_elements
31
32(* Special elements *)
33let special_elements = [
34 "address"; "applet"; "area"; "article"; "aside"; "base"; "basefont";
35 "bgsound"; "blockquote"; "body"; "br"; "button"; "caption"; "center";
36 "col"; "colgroup"; "dd"; "details"; "dir"; "div"; "dl"; "dt"; "embed";
37 "fieldset"; "figcaption"; "figure"; "footer"; "form"; "frame"; "frameset";
38 "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "head"; "header"; "hgroup"; "hr";
39 "html"; "iframe"; "img"; "input"; "keygen"; "li"; "link"; "listing";
40 "main"; "marquee"; "menu"; "meta"; "nav"; "noembed"; "noframes";
41 "noscript"; "object"; "ol"; "p"; "param"; "plaintext"; "pre"; "script";
42 "search"; "section"; "select"; "source"; "style"; "summary"; "table";
43 "tbody"; "td"; "template"; "textarea"; "tfoot"; "th"; "thead"; "title";
44 "tr"; "track"; "ul"; "wbr"; "xmp"
45]
46let special_elements_tbl = make_set special_elements
47
48(* Heading elements *)
49let heading_elements = ["h1"; "h2"; "h3"; "h4"; "h5"; "h6"]
50let heading_elements_tbl = make_set heading_elements
51
52(* Implied end tag elements *)
53let implied_end_tags = [
54 "dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"
55]
56let implied_end_tags_tbl = make_set implied_end_tags
57
58(* Thoroughly implied end tags *)
59let thoroughly_implied_end_tags = [
60 "caption"; "colgroup"; "dd"; "dt"; "li"; "optgroup"; "option"; "p";
61 "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"
62]
63let thoroughly_implied_end_tags_tbl = make_set thoroughly_implied_end_tags
64
65(* Scope elements for various scope checks *)
66let default_scope = [
67 "applet"; "caption"; "html"; "table"; "td"; "th"; "marquee"; "object"; "template"
68]
69
70let list_item_scope = default_scope @ ["ol"; "ul"]
71
72let button_scope = default_scope @ ["button"]
73
74let table_scope = ["html"; "table"; "template"]
75
76let select_scope_exclude = ["optgroup"; "option"]
77let select_scope_exclude_tbl = make_set select_scope_exclude
78
79(* MathML text integration points *)
80let mathml_text_integration = ["mi"; "mo"; "mn"; "ms"; "mtext"]
81let mathml_text_integration_tbl = make_set mathml_text_integration
82
83(* MathML attribute adjustments - O(1) hashtable lookup *)
84let mathml_attr_adjustments_tbl =
85 let adjustments = [("definitionurl", "definitionURL")] in
86 let tbl = Hashtbl.create 4 in
87 List.iter (fun (k, v) -> Hashtbl.add tbl k v) adjustments;
88 tbl
89
90let adjust_mathml_attrs attrs =
91 List.map (fun (k, v) ->
92 match Hashtbl.find_opt mathml_attr_adjustments_tbl (lowercase k) with
93 | Some adjusted_k -> (adjusted_k, v)
94 | None -> (k, v)
95 ) attrs
96
97(* SVG HTML integration points *)
98let svg_html_integration = ["foreignObject"; "desc"; "title"]
99let svg_html_integration_tbl = make_set (List.map lowercase svg_html_integration)
100
101(* Helper to create hashtable from association list for O(1) lookup *)
102let make_assoc_tbl pairs =
103 let tbl = Hashtbl.create (List.length pairs) in
104 List.iter (fun (k, v) -> Hashtbl.add tbl k v) pairs;
105 tbl
106
107(* SVG tag name adjustments *)
108let svg_tag_adjustments = [
109 ("altglyph", "altGlyph");
110 ("altglyphdef", "altGlyphDef");
111 ("altglyphitem", "altGlyphItem");
112 ("animatecolor", "animateColor");
113 ("animatemotion", "animateMotion");
114 ("animatetransform", "animateTransform");
115 ("clippath", "clipPath");
116 ("feblend", "feBlend");
117 ("fecolormatrix", "feColorMatrix");
118 ("fecomponenttransfer", "feComponentTransfer");
119 ("fecomposite", "feComposite");
120 ("feconvolvematrix", "feConvolveMatrix");
121 ("fediffuselighting", "feDiffuseLighting");
122 ("fedisplacementmap", "feDisplacementMap");
123 ("fedistantlight", "feDistantLight");
124 ("fedropshadow", "feDropShadow");
125 ("feflood", "feFlood");
126 ("fefunca", "feFuncA");
127 ("fefuncb", "feFuncB");
128 ("fefuncg", "feFuncG");
129 ("fefuncr", "feFuncR");
130 ("fegaussianblur", "feGaussianBlur");
131 ("feimage", "feImage");
132 ("femerge", "feMerge");
133 ("femergenode", "feMergeNode");
134 ("femorphology", "feMorphology");
135 ("feoffset", "feOffset");
136 ("fepointlight", "fePointLight");
137 ("fespecularlighting", "feSpecularLighting");
138 ("fespotlight", "feSpotLight");
139 ("fetile", "feTile");
140 ("feturbulence", "feTurbulence");
141 ("foreignobject", "foreignObject");
142 ("glyphref", "glyphRef");
143 ("lineargradient", "linearGradient");
144 ("radialgradient", "radialGradient");
145 ("textpath", "textPath");
146]
147let svg_tag_adjustments_tbl = make_assoc_tbl svg_tag_adjustments
148
149(* SVG attribute adjustments *)
150let svg_attr_adjustments = [
151 ("attributename", "attributeName");
152 ("attributetype", "attributeType");
153 ("basefrequency", "baseFrequency");
154 ("baseprofile", "baseProfile");
155 ("calcmode", "calcMode");
156 ("clippathunits", "clipPathUnits");
157 ("diffuseconstant", "diffuseConstant");
158 ("edgemode", "edgeMode");
159 ("filterunits", "filterUnits");
160 ("glyphref", "glyphRef");
161 ("gradienttransform", "gradientTransform");
162 ("gradientunits", "gradientUnits");
163 ("kernelmatrix", "kernelMatrix");
164 ("kernelunitlength", "kernelUnitLength");
165 ("keypoints", "keyPoints");
166 ("keysplines", "keySplines");
167 ("keytimes", "keyTimes");
168 ("lengthadjust", "lengthAdjust");
169 ("limitingconeangle", "limitingConeAngle");
170 ("markerheight", "markerHeight");
171 ("markerunits", "markerUnits");
172 ("markerwidth", "markerWidth");
173 ("maskcontentunits", "maskContentUnits");
174 ("maskunits", "maskUnits");
175 ("numoctaves", "numOctaves");
176 ("pathlength", "pathLength");
177 ("patterncontentunits", "patternContentUnits");
178 ("patterntransform", "patternTransform");
179 ("patternunits", "patternUnits");
180 ("pointsatx", "pointsAtX");
181 ("pointsaty", "pointsAtY");
182 ("pointsatz", "pointsAtZ");
183 ("preservealpha", "preserveAlpha");
184 ("preserveaspectratio", "preserveAspectRatio");
185 ("primitiveunits", "primitiveUnits");
186 ("refx", "refX");
187 ("refy", "refY");
188 ("repeatcount", "repeatCount");
189 ("repeatdur", "repeatDur");
190 ("requiredextensions", "requiredExtensions");
191 ("requiredfeatures", "requiredFeatures");
192 ("specularconstant", "specularConstant");
193 ("specularexponent", "specularExponent");
194 ("spreadmethod", "spreadMethod");
195 ("startoffset", "startOffset");
196 ("stddeviation", "stdDeviation");
197 ("stitchtiles", "stitchTiles");
198 ("surfacescale", "surfaceScale");
199 ("systemlanguage", "systemLanguage");
200 ("tablevalues", "tableValues");
201 ("targetx", "targetX");
202 ("targety", "targetY");
203 ("textlength", "textLength");
204 ("viewbox", "viewBox");
205 ("viewtarget", "viewTarget");
206 ("xchannelselector", "xChannelSelector");
207 ("ychannelselector", "yChannelSelector");
208 ("zoomandpan", "zoomAndPan");
209]
210let svg_attr_adjustments_tbl = make_assoc_tbl svg_attr_adjustments
211
212(* Foreign attribute adjustments *)
213let foreign_attr_adjustments = [
214 ("xlink:actuate", ("xlink", "actuate", "http://www.w3.org/1999/xlink"));
215 ("xlink:arcrole", ("xlink", "arcrole", "http://www.w3.org/1999/xlink"));
216 ("xlink:href", ("xlink", "href", "http://www.w3.org/1999/xlink"));
217 ("xlink:role", ("xlink", "role", "http://www.w3.org/1999/xlink"));
218 ("xlink:show", ("xlink", "show", "http://www.w3.org/1999/xlink"));
219 ("xlink:title", ("xlink", "title", "http://www.w3.org/1999/xlink"));
220 ("xlink:type", ("xlink", "type", "http://www.w3.org/1999/xlink"));
221 ("xml:lang", ("xml", "lang", "http://www.w3.org/XML/1998/namespace"));
222 ("xml:space", ("xml", "space", "http://www.w3.org/XML/1998/namespace"));
223 ("xmlns", ("", "xmlns", "http://www.w3.org/2000/xmlns/"));
224 ("xmlns:xlink", ("xmlns", "xlink", "http://www.w3.org/2000/xmlns/"));
225]
226let foreign_attr_adjustments_tbl = make_assoc_tbl foreign_attr_adjustments
227
228(* Quirks mode detection *)
229let quirky_public_matches = [
230 "-//w3o//dtd w3 html strict 3.0//en//";
231 "-/w3c/dtd html 4.0 transitional/en";
232 "html"
233]
234
235let quirky_public_prefixes = [
236 "+//silmaril//dtd html pro v0r11 19970101//";
237 "-//as//dtd html 3.0 aswedit + extensions//";
238 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//";
239 "-//ietf//dtd html 2.0 level 1//";
240 "-//ietf//dtd html 2.0 level 2//";
241 "-//ietf//dtd html 2.0 strict level 1//";
242 "-//ietf//dtd html 2.0 strict level 2//";
243 "-//ietf//dtd html 2.0 strict//";
244 "-//ietf//dtd html 2.0//";
245 "-//ietf//dtd html 2.1e//";
246 "-//ietf//dtd html 3.0//";
247 "-//ietf//dtd html 3.2 final//";
248 "-//ietf//dtd html 3.2//";
249 "-//ietf//dtd html 3//";
250 "-//ietf//dtd html level 0//";
251 "-//ietf//dtd html level 1//";
252 "-//ietf//dtd html level 2//";
253 "-//ietf//dtd html level 3//";
254 "-//ietf//dtd html strict level 0//";
255 "-//ietf//dtd html strict level 1//";
256 "-//ietf//dtd html strict level 2//";
257 "-//ietf//dtd html strict level 3//";
258 "-//ietf//dtd html strict//";
259 "-//ietf//dtd html//";
260 "-//metrius//dtd metrius presentational//";
261 "-//microsoft//dtd internet explorer 2.0 html strict//";
262 "-//microsoft//dtd internet explorer 2.0 html//";
263 "-//microsoft//dtd internet explorer 2.0 tables//";
264 "-//microsoft//dtd internet explorer 3.0 html strict//";
265 "-//microsoft//dtd internet explorer 3.0 html//";
266 "-//microsoft//dtd internet explorer 3.0 tables//";
267 "-//netscape comm. corp.//dtd html//";
268 "-//netscape comm. corp.//dtd strict html//";
269 "-//o'reilly and associates//dtd html 2.0//";
270 "-//o'reilly and associates//dtd html extended 1.0//";
271 "-//o'reilly and associates//dtd html extended relaxed 1.0//";
272 "-//sq//dtd html 2.0 hotmetal + extensions//";
273 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//";
274 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//";
275 "-//spyglass//dtd html 2.0 extended//";
276 "-//sun microsystems corp.//dtd hotjava html//";
277 "-//sun microsystems corp.//dtd hotjava strict html//";
278 "-//w3c//dtd html 3 1995-03-24//";
279 "-//w3c//dtd html 3.2 draft//";
280 "-//w3c//dtd html 3.2 final//";
281 "-//w3c//dtd html 3.2//";
282 "-//w3c//dtd html 3.2s draft//";
283 "-//w3c//dtd html 4.0 frameset//";
284 "-//w3c//dtd html 4.0 transitional//";
285 "-//w3c//dtd html experimental 19960712//";
286 "-//w3c//dtd html experimental 970421//";
287 "-//w3c//dtd w3 html//";
288 "-//w3o//dtd w3 html 3.0//";
289 "-//webtechs//dtd mozilla html 2.0//";
290 "-//webtechs//dtd mozilla html//";
291]
292
293let limited_quirky_public_prefixes = [
294 "-//w3c//dtd xhtml 1.0 frameset//";
295 "-//w3c//dtd xhtml 1.0 transitional//";
296]
297
298let html4_public_prefixes = [
299 "-//w3c//dtd html 4.01 frameset//";
300 "-//w3c//dtd html 4.01 transitional//";
301]
302
303let quirky_system_matches = [
304 "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"
305]
306
307(* Table-related element sets for tree builder O(1) lookups *)
308let table_section_elements = ["tbody"; "thead"; "tfoot"]
309let table_section_elements_tbl = make_set table_section_elements
310
311let table_cell_elements = ["td"; "th"]
312let table_cell_elements_tbl = make_set table_cell_elements
313
314let foster_parenting_elements = ["table"; "tbody"; "tfoot"; "thead"; "tr"]
315let foster_parenting_elements_tbl = make_set foster_parenting_elements
316
317(* Helper functions - O(1) hashtable lookups *)
318let is_void_element name = Hashtbl.mem void_elements_tbl name
319let is_formatting_element name = Hashtbl.mem formatting_elements_tbl name
320let is_special_element name = Hashtbl.mem special_elements_tbl name
321let is_heading_element name = Hashtbl.mem heading_elements_tbl name
322let is_implied_end_tag name = Hashtbl.mem implied_end_tags_tbl name
323let is_thoroughly_implied_end_tag name = Hashtbl.mem thoroughly_implied_end_tags_tbl name
324let is_mathml_text_integration name = Hashtbl.mem mathml_text_integration_tbl name
325let is_svg_html_integration name = Hashtbl.mem svg_html_integration_tbl (lowercase name)
326let is_select_scope_exclude name = Hashtbl.mem select_scope_exclude_tbl name
327let is_table_section_element name = Hashtbl.mem table_section_elements_tbl name
328let is_table_cell_element name = Hashtbl.mem table_cell_elements_tbl name
329let is_foster_parenting_element name = Hashtbl.mem foster_parenting_elements_tbl name
330
331(* Backwards compatibility aliases *)
332let is_void = List.mem
333let is_formatting = List.mem
334let is_special name = List.mem name special_elements
335let is_heading = List.mem
336
337let adjust_svg_tag_name name =
338 match Hashtbl.find_opt svg_tag_adjustments_tbl (lowercase name) with
339 | Some adjusted -> adjusted
340 | None -> name
341
342let adjust_svg_attrs attrs =
343 List.map (fun (name, value) ->
344 let adjusted_name =
345 match Hashtbl.find_opt svg_attr_adjustments_tbl (lowercase name) with
346 | Some n -> n
347 | None -> name
348 in
349 (adjusted_name, value)
350 ) attrs
351
352let adjust_foreign_attrs attrs =
353 List.map (fun (name, value) ->
354 match Hashtbl.find_opt foreign_attr_adjustments_tbl (lowercase name) with
355 | Some (prefix, local, _ns) ->
356 if prefix = "" then (local, value)
357 else (prefix ^ ":" ^ local, value)
358 | None -> (name, value)
359 ) attrs