OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 359 lines 14 kB view raw
1(* HTML5 spec constants *) 2 3(* Use Astring for string operations *) 4let lowercase = Astring.String.Ascii.lowercase 5 6(* Helper to create a hashtable set from a list for O(1) membership *) 7let make_set elements = 8 let tbl = Hashtbl.create (List.length elements) in 9 List.iter (fun e -> Hashtbl.add tbl e ()) elements; 10 tbl 11 12(* Void elements - no end tag allowed *) 13let void_elements = [ 14 "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input"; 15 "link"; "meta"; "source"; "track"; "wbr" 16] 17let void_elements_tbl = make_set void_elements 18 19(* Raw text elements - content is raw text *) 20let raw_text_elements = ["script"; "style"] 21 22(* Escapable raw text elements *) 23let escapable_raw_text_elements = ["textarea"; "title"] 24 25(* Formatting elements for adoption agency *) 26let formatting_elements = [ 27 "a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small"; 28 "strike"; "strong"; "tt"; "u" 29] 30let formatting_elements_tbl = make_set formatting_elements 31 32(* Special elements *) 33let special_elements = [ 34 "address"; "applet"; "area"; "article"; "aside"; "base"; "basefont"; 35 "bgsound"; "blockquote"; "body"; "br"; "button"; "caption"; "center"; 36 "col"; "colgroup"; "dd"; "details"; "dir"; "div"; "dl"; "dt"; "embed"; 37 "fieldset"; "figcaption"; "figure"; "footer"; "form"; "frame"; "frameset"; 38 "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "head"; "header"; "hgroup"; "hr"; 39 "html"; "iframe"; "img"; "input"; "keygen"; "li"; "link"; "listing"; 40 "main"; "marquee"; "menu"; "meta"; "nav"; "noembed"; "noframes"; 41 "noscript"; "object"; "ol"; "p"; "param"; "plaintext"; "pre"; "script"; 42 "search"; "section"; "select"; "source"; "style"; "summary"; "table"; 43 "tbody"; "td"; "template"; "textarea"; "tfoot"; "th"; "thead"; "title"; 44 "tr"; "track"; "ul"; "wbr"; "xmp" 45] 46let special_elements_tbl = make_set special_elements 47 48(* Heading elements *) 49let heading_elements = ["h1"; "h2"; "h3"; "h4"; "h5"; "h6"] 50let heading_elements_tbl = make_set heading_elements 51 52(* Implied end tag elements *) 53let implied_end_tags = [ 54 "dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc" 55] 56let implied_end_tags_tbl = make_set implied_end_tags 57 58(* Thoroughly implied end tags *) 59let thoroughly_implied_end_tags = [ 60 "caption"; "colgroup"; "dd"; "dt"; "li"; "optgroup"; "option"; "p"; 61 "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr" 62] 63let thoroughly_implied_end_tags_tbl = make_set thoroughly_implied_end_tags 64 65(* Scope elements for various scope checks *) 66let default_scope = [ 67 "applet"; "caption"; "html"; "table"; "td"; "th"; "marquee"; "object"; "template" 68] 69 70let list_item_scope = default_scope @ ["ol"; "ul"] 71 72let button_scope = default_scope @ ["button"] 73 74let table_scope = ["html"; "table"; "template"] 75 76let select_scope_exclude = ["optgroup"; "option"] 77let select_scope_exclude_tbl = make_set select_scope_exclude 78 79(* MathML text integration points *) 80let mathml_text_integration = ["mi"; "mo"; "mn"; "ms"; "mtext"] 81let mathml_text_integration_tbl = make_set mathml_text_integration 82 83(* MathML attribute adjustments - O(1) hashtable lookup *) 84let mathml_attr_adjustments_tbl = 85 let adjustments = [("definitionurl", "definitionURL")] in 86 let tbl = Hashtbl.create 4 in 87 List.iter (fun (k, v) -> Hashtbl.add tbl k v) adjustments; 88 tbl 89 90let adjust_mathml_attrs attrs = 91 List.map (fun (k, v) -> 92 match Hashtbl.find_opt mathml_attr_adjustments_tbl (lowercase k) with 93 | Some adjusted_k -> (adjusted_k, v) 94 | None -> (k, v) 95 ) attrs 96 97(* SVG HTML integration points *) 98let svg_html_integration = ["foreignObject"; "desc"; "title"] 99let svg_html_integration_tbl = make_set (List.map lowercase svg_html_integration) 100 101(* Helper to create hashtable from association list for O(1) lookup *) 102let make_assoc_tbl pairs = 103 let tbl = Hashtbl.create (List.length pairs) in 104 List.iter (fun (k, v) -> Hashtbl.add tbl k v) pairs; 105 tbl 106 107(* SVG tag name adjustments *) 108let svg_tag_adjustments = [ 109 ("altglyph", "altGlyph"); 110 ("altglyphdef", "altGlyphDef"); 111 ("altglyphitem", "altGlyphItem"); 112 ("animatecolor", "animateColor"); 113 ("animatemotion", "animateMotion"); 114 ("animatetransform", "animateTransform"); 115 ("clippath", "clipPath"); 116 ("feblend", "feBlend"); 117 ("fecolormatrix", "feColorMatrix"); 118 ("fecomponenttransfer", "feComponentTransfer"); 119 ("fecomposite", "feComposite"); 120 ("feconvolvematrix", "feConvolveMatrix"); 121 ("fediffuselighting", "feDiffuseLighting"); 122 ("fedisplacementmap", "feDisplacementMap"); 123 ("fedistantlight", "feDistantLight"); 124 ("fedropshadow", "feDropShadow"); 125 ("feflood", "feFlood"); 126 ("fefunca", "feFuncA"); 127 ("fefuncb", "feFuncB"); 128 ("fefuncg", "feFuncG"); 129 ("fefuncr", "feFuncR"); 130 ("fegaussianblur", "feGaussianBlur"); 131 ("feimage", "feImage"); 132 ("femerge", "feMerge"); 133 ("femergenode", "feMergeNode"); 134 ("femorphology", "feMorphology"); 135 ("feoffset", "feOffset"); 136 ("fepointlight", "fePointLight"); 137 ("fespecularlighting", "feSpecularLighting"); 138 ("fespotlight", "feSpotLight"); 139 ("fetile", "feTile"); 140 ("feturbulence", "feTurbulence"); 141 ("foreignobject", "foreignObject"); 142 ("glyphref", "glyphRef"); 143 ("lineargradient", "linearGradient"); 144 ("radialgradient", "radialGradient"); 145 ("textpath", "textPath"); 146] 147let svg_tag_adjustments_tbl = make_assoc_tbl svg_tag_adjustments 148 149(* SVG attribute adjustments *) 150let svg_attr_adjustments = [ 151 ("attributename", "attributeName"); 152 ("attributetype", "attributeType"); 153 ("basefrequency", "baseFrequency"); 154 ("baseprofile", "baseProfile"); 155 ("calcmode", "calcMode"); 156 ("clippathunits", "clipPathUnits"); 157 ("diffuseconstant", "diffuseConstant"); 158 ("edgemode", "edgeMode"); 159 ("filterunits", "filterUnits"); 160 ("glyphref", "glyphRef"); 161 ("gradienttransform", "gradientTransform"); 162 ("gradientunits", "gradientUnits"); 163 ("kernelmatrix", "kernelMatrix"); 164 ("kernelunitlength", "kernelUnitLength"); 165 ("keypoints", "keyPoints"); 166 ("keysplines", "keySplines"); 167 ("keytimes", "keyTimes"); 168 ("lengthadjust", "lengthAdjust"); 169 ("limitingconeangle", "limitingConeAngle"); 170 ("markerheight", "markerHeight"); 171 ("markerunits", "markerUnits"); 172 ("markerwidth", "markerWidth"); 173 ("maskcontentunits", "maskContentUnits"); 174 ("maskunits", "maskUnits"); 175 ("numoctaves", "numOctaves"); 176 ("pathlength", "pathLength"); 177 ("patterncontentunits", "patternContentUnits"); 178 ("patterntransform", "patternTransform"); 179 ("patternunits", "patternUnits"); 180 ("pointsatx", "pointsAtX"); 181 ("pointsaty", "pointsAtY"); 182 ("pointsatz", "pointsAtZ"); 183 ("preservealpha", "preserveAlpha"); 184 ("preserveaspectratio", "preserveAspectRatio"); 185 ("primitiveunits", "primitiveUnits"); 186 ("refx", "refX"); 187 ("refy", "refY"); 188 ("repeatcount", "repeatCount"); 189 ("repeatdur", "repeatDur"); 190 ("requiredextensions", "requiredExtensions"); 191 ("requiredfeatures", "requiredFeatures"); 192 ("specularconstant", "specularConstant"); 193 ("specularexponent", "specularExponent"); 194 ("spreadmethod", "spreadMethod"); 195 ("startoffset", "startOffset"); 196 ("stddeviation", "stdDeviation"); 197 ("stitchtiles", "stitchTiles"); 198 ("surfacescale", "surfaceScale"); 199 ("systemlanguage", "systemLanguage"); 200 ("tablevalues", "tableValues"); 201 ("targetx", "targetX"); 202 ("targety", "targetY"); 203 ("textlength", "textLength"); 204 ("viewbox", "viewBox"); 205 ("viewtarget", "viewTarget"); 206 ("xchannelselector", "xChannelSelector"); 207 ("ychannelselector", "yChannelSelector"); 208 ("zoomandpan", "zoomAndPan"); 209] 210let svg_attr_adjustments_tbl = make_assoc_tbl svg_attr_adjustments 211 212(* Foreign attribute adjustments *) 213let foreign_attr_adjustments = [ 214 ("xlink:actuate", ("xlink", "actuate", "http://www.w3.org/1999/xlink")); 215 ("xlink:arcrole", ("xlink", "arcrole", "http://www.w3.org/1999/xlink")); 216 ("xlink:href", ("xlink", "href", "http://www.w3.org/1999/xlink")); 217 ("xlink:role", ("xlink", "role", "http://www.w3.org/1999/xlink")); 218 ("xlink:show", ("xlink", "show", "http://www.w3.org/1999/xlink")); 219 ("xlink:title", ("xlink", "title", "http://www.w3.org/1999/xlink")); 220 ("xlink:type", ("xlink", "type", "http://www.w3.org/1999/xlink")); 221 ("xml:lang", ("xml", "lang", "http://www.w3.org/XML/1998/namespace")); 222 ("xml:space", ("xml", "space", "http://www.w3.org/XML/1998/namespace")); 223 ("xmlns", ("", "xmlns", "http://www.w3.org/2000/xmlns/")); 224 ("xmlns:xlink", ("xmlns", "xlink", "http://www.w3.org/2000/xmlns/")); 225] 226let foreign_attr_adjustments_tbl = make_assoc_tbl foreign_attr_adjustments 227 228(* Quirks mode detection *) 229let quirky_public_matches = [ 230 "-//w3o//dtd w3 html strict 3.0//en//"; 231 "-/w3c/dtd html 4.0 transitional/en"; 232 "html" 233] 234 235let quirky_public_prefixes = [ 236 "+//silmaril//dtd html pro v0r11 19970101//"; 237 "-//as//dtd html 3.0 aswedit + extensions//"; 238 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"; 239 "-//ietf//dtd html 2.0 level 1//"; 240 "-//ietf//dtd html 2.0 level 2//"; 241 "-//ietf//dtd html 2.0 strict level 1//"; 242 "-//ietf//dtd html 2.0 strict level 2//"; 243 "-//ietf//dtd html 2.0 strict//"; 244 "-//ietf//dtd html 2.0//"; 245 "-//ietf//dtd html 2.1e//"; 246 "-//ietf//dtd html 3.0//"; 247 "-//ietf//dtd html 3.2 final//"; 248 "-//ietf//dtd html 3.2//"; 249 "-//ietf//dtd html 3//"; 250 "-//ietf//dtd html level 0//"; 251 "-//ietf//dtd html level 1//"; 252 "-//ietf//dtd html level 2//"; 253 "-//ietf//dtd html level 3//"; 254 "-//ietf//dtd html strict level 0//"; 255 "-//ietf//dtd html strict level 1//"; 256 "-//ietf//dtd html strict level 2//"; 257 "-//ietf//dtd html strict level 3//"; 258 "-//ietf//dtd html strict//"; 259 "-//ietf//dtd html//"; 260 "-//metrius//dtd metrius presentational//"; 261 "-//microsoft//dtd internet explorer 2.0 html strict//"; 262 "-//microsoft//dtd internet explorer 2.0 html//"; 263 "-//microsoft//dtd internet explorer 2.0 tables//"; 264 "-//microsoft//dtd internet explorer 3.0 html strict//"; 265 "-//microsoft//dtd internet explorer 3.0 html//"; 266 "-//microsoft//dtd internet explorer 3.0 tables//"; 267 "-//netscape comm. corp.//dtd html//"; 268 "-//netscape comm. corp.//dtd strict html//"; 269 "-//o'reilly and associates//dtd html 2.0//"; 270 "-//o'reilly and associates//dtd html extended 1.0//"; 271 "-//o'reilly and associates//dtd html extended relaxed 1.0//"; 272 "-//sq//dtd html 2.0 hotmetal + extensions//"; 273 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"; 274 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"; 275 "-//spyglass//dtd html 2.0 extended//"; 276 "-//sun microsystems corp.//dtd hotjava html//"; 277 "-//sun microsystems corp.//dtd hotjava strict html//"; 278 "-//w3c//dtd html 3 1995-03-24//"; 279 "-//w3c//dtd html 3.2 draft//"; 280 "-//w3c//dtd html 3.2 final//"; 281 "-//w3c//dtd html 3.2//"; 282 "-//w3c//dtd html 3.2s draft//"; 283 "-//w3c//dtd html 4.0 frameset//"; 284 "-//w3c//dtd html 4.0 transitional//"; 285 "-//w3c//dtd html experimental 19960712//"; 286 "-//w3c//dtd html experimental 970421//"; 287 "-//w3c//dtd w3 html//"; 288 "-//w3o//dtd w3 html 3.0//"; 289 "-//webtechs//dtd mozilla html 2.0//"; 290 "-//webtechs//dtd mozilla html//"; 291] 292 293let limited_quirky_public_prefixes = [ 294 "-//w3c//dtd xhtml 1.0 frameset//"; 295 "-//w3c//dtd xhtml 1.0 transitional//"; 296] 297 298let html4_public_prefixes = [ 299 "-//w3c//dtd html 4.01 frameset//"; 300 "-//w3c//dtd html 4.01 transitional//"; 301] 302 303let quirky_system_matches = [ 304 "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd" 305] 306 307(* Table-related element sets for tree builder O(1) lookups *) 308let table_section_elements = ["tbody"; "thead"; "tfoot"] 309let table_section_elements_tbl = make_set table_section_elements 310 311let table_cell_elements = ["td"; "th"] 312let table_cell_elements_tbl = make_set table_cell_elements 313 314let foster_parenting_elements = ["table"; "tbody"; "tfoot"; "thead"; "tr"] 315let foster_parenting_elements_tbl = make_set foster_parenting_elements 316 317(* Helper functions - O(1) hashtable lookups *) 318let is_void_element name = Hashtbl.mem void_elements_tbl name 319let is_formatting_element name = Hashtbl.mem formatting_elements_tbl name 320let is_special_element name = Hashtbl.mem special_elements_tbl name 321let is_heading_element name = Hashtbl.mem heading_elements_tbl name 322let is_implied_end_tag name = Hashtbl.mem implied_end_tags_tbl name 323let is_thoroughly_implied_end_tag name = Hashtbl.mem thoroughly_implied_end_tags_tbl name 324let is_mathml_text_integration name = Hashtbl.mem mathml_text_integration_tbl name 325let is_svg_html_integration name = Hashtbl.mem svg_html_integration_tbl (lowercase name) 326let is_select_scope_exclude name = Hashtbl.mem select_scope_exclude_tbl name 327let is_table_section_element name = Hashtbl.mem table_section_elements_tbl name 328let is_table_cell_element name = Hashtbl.mem table_cell_elements_tbl name 329let is_foster_parenting_element name = Hashtbl.mem foster_parenting_elements_tbl name 330 331(* Backwards compatibility aliases *) 332let is_void = List.mem 333let is_formatting = List.mem 334let is_special name = List.mem name special_elements 335let is_heading = List.mem 336 337let adjust_svg_tag_name name = 338 match Hashtbl.find_opt svg_tag_adjustments_tbl (lowercase name) with 339 | Some adjusted -> adjusted 340 | None -> name 341 342let adjust_svg_attrs attrs = 343 List.map (fun (name, value) -> 344 let adjusted_name = 345 match Hashtbl.find_opt svg_attr_adjustments_tbl (lowercase name) with 346 | Some n -> n 347 | None -> name 348 in 349 (adjusted_name, value) 350 ) attrs 351 352let adjust_foreign_attrs attrs = 353 List.map (fun (name, value) -> 354 match Hashtbl.find_opt foreign_attr_adjustments_tbl (lowercase name) with 355 | Some (prefix, local, _ns) -> 356 if prefix = "" then (local, value) 357 else (prefix ^ ":" ^ local, value) 358 | None -> (name, value) 359 ) attrs