OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at f7c69be4eae5476a0985d55de71f2cc34c8d5361 201 lines 8.9 kB view raw
1type element_context = { 2 name : string; 3 spec : Element_spec.t; 4 children_count : int; 5 is_foreign : bool; (* SVG or MathML element *) 6} 7 8type state = { 9 registry : Element_registry.t; 10 mutable ancestor_stack : element_context list; 11} 12 13let create_with_registry ?(registry = Element_registry.default ()) _collector = 14 { registry; ancestor_stack = [] } 15 16let create () = create_with_registry (Message_collector.create ()) 17 18let reset state = 19 state.ancestor_stack <- [] 20 21(* Check if an element name matches a content model *) 22let rec matches_content_model registry element_name content_model = 23 match content_model with 24 | Content_model.Nothing -> false 25 | Content_model.Text -> false (* Text, not element *) 26 | Content_model.Transparent -> true (* Inherits parent, allow for now *) 27 | Content_model.Categories cats -> ( 28 match Element_registry.get registry element_name with 29 | None -> false 30 | Some spec -> 31 List.exists (fun cat -> Element_spec.has_category spec cat) cats) 32 | Content_model.Elements names -> 33 List.mem (String.lowercase_ascii element_name) 34 (List.map String.lowercase_ascii names) 35 | Content_model.Mixed cats -> ( 36 match Element_registry.get registry element_name with 37 | None -> false 38 | Some spec -> 39 List.exists (fun cat -> Element_spec.has_category spec cat) cats) 40 | Content_model.One_or_more model -> matches_content_model registry element_name model 41 | Content_model.Zero_or_more model -> matches_content_model registry element_name model 42 | Content_model.Optional model -> matches_content_model registry element_name model 43 | Content_model.Sequence models -> 44 (* For sequences, allow any of the models for now (simplified) *) 45 List.exists (matches_content_model registry element_name) models 46 | Content_model.Choice models -> 47 List.exists (matches_content_model registry element_name) models 48 | Content_model.Except (model, excluded_cats) -> ( 49 match Element_registry.get registry element_name with 50 | None -> matches_content_model registry element_name model 51 | Some spec -> 52 matches_content_model registry element_name model 53 && not (List.exists (fun cat -> Element_spec.has_category spec cat) excluded_cats)) 54 55(* Check if text is allowed in a content model *) 56let rec allows_text content_model = 57 match content_model with 58 | Content_model.Nothing -> false 59 | Content_model.Text -> true 60 | Content_model.Transparent -> true (* Inherits parent *) 61 | Content_model.Categories cats -> 62 (* Phrasing and Flow content include text *) 63 List.mem Content_category.Phrasing cats || List.mem Content_category.Flow cats 64 | Content_model.Elements _ -> false (* Specific elements only *) 65 | Content_model.Mixed _ -> true (* Text + elements *) 66 | Content_model.One_or_more model -> allows_text model 67 | Content_model.Zero_or_more model -> allows_text model 68 | Content_model.Optional model -> allows_text model 69 | Content_model.Sequence models -> List.exists allows_text models 70 | Content_model.Choice models -> List.exists allows_text models 71 | Content_model.Except (model, _) -> allows_text model 72 73(* Check for prohibited ancestors *) 74let check_prohibited_ancestors state name spec collector = 75 List.iter 76 (fun prohibited -> 77 if List.exists (fun ctx -> String.equal ctx.name prohibited) state.ancestor_stack then 78 Message_collector.add_typed collector 79 (`Element (`Not_allowed_as_child (`Child name, `Parent prohibited)))) 80 spec.Element_spec.prohibited_ancestors 81 82(* Validate that a child element is allowed *) 83let validate_child_element state child_name collector = 84 match state.ancestor_stack with 85 | [] -> 86 (* Root level - only html allowed *) 87 if not (String.equal (String.lowercase_ascii child_name) "html") then 88 Message_collector.add_typed collector 89 (`Generic (Printf.sprintf "Element '%s' not allowed at document root (only 'html' allowed)" child_name)) 90 | parent :: _ -> 91 let content_model = parent.spec.Element_spec.content_model in 92 if not (matches_content_model state.registry child_name content_model) then 93 Message_collector.add_typed collector 94 (`Element (`Not_allowed_as_child (`Child child_name, `Parent parent.name))) 95 96let start_element state ~element collector = 97 let name = Tag.tag_to_string element.Element.tag in 98 99 (* Check if we're inside a foreign (SVG/MathML) context *) 100 let in_foreign_context = match state.ancestor_stack with 101 | ctx :: _ -> ctx.is_foreign 102 | [] -> false 103 in 104 105 (* Determine if this element is foreign content *) 106 let is_foreign = match element.Element.tag with 107 | Tag.Svg _ | Tag.MathML _ -> true 108 | _ -> in_foreign_context (* Inherit from parent if inside foreign content *) 109 in 110 111 (* If entering foreign content from HTML, SVG/MathML are valid embedded content *) 112 (* If already in foreign content, skip HTML content model checks *) 113 if is_foreign && not in_foreign_context then begin 114 (* Entering SVG/MathML from HTML - just track it, it's valid embedded content *) 115 let spec = Element_spec.make ~name ~content_model:(Content_model.Categories [Content_category.Flow]) () in 116 let context = { name; spec; children_count = 0; is_foreign = true } in 117 state.ancestor_stack <- context :: state.ancestor_stack 118 end else if is_foreign then begin 119 (* Inside SVG/MathML - just track nesting, don't validate against HTML *) 120 let spec = Element_spec.make ~name ~content_model:(Content_model.Categories [Content_category.Flow]) () in 121 let context = { name; spec; children_count = 0; is_foreign = true } in 122 state.ancestor_stack <- context :: state.ancestor_stack 123 end else begin 124 (* HTML element - do normal validation *) 125 let spec_opt = Element_registry.get state.registry name in 126 127 match spec_opt with 128 | None -> 129 (* Unknown element - first check if it's allowed in current context *) 130 validate_child_element state name collector 131 | Some spec -> 132 (* Check prohibited ancestors *) 133 check_prohibited_ancestors state name spec collector; 134 135 (* Validate this element is allowed as child of parent *) 136 validate_child_element state name collector; 137 138 (* Push element context onto stack *) 139 let context = { name; spec; children_count = 0; is_foreign = false } in 140 state.ancestor_stack <- context :: state.ancestor_stack 141 end 142 143let end_element state ~tag collector = 144 let name = Tag.tag_to_string tag in 145 match state.ancestor_stack with 146 | [] -> 147 (* Unmatched closing tag *) 148 Message_collector.add_typed collector 149 (`Generic (Printf.sprintf "Unmatched closing tag '%s'" name)) 150 | context :: rest -> 151 if not (String.equal context.name name) then 152 (* Mismatched tag *) 153 Message_collector.add_typed collector 154 (`Generic (Printf.sprintf "Expected closing tag '%s' but got '%s'" context.name name)) 155 else ( 156 (* Check if void element has children *) 157 if Element_spec.is_void context.spec && context.children_count > 0 then 158 Message_collector.add_typed collector 159 (`Generic (Printf.sprintf "Void element '%s' must not have children" name)); 160 161 (* Pop stack *) 162 state.ancestor_stack <- rest; 163 164 (* If there's a parent, increment its child count *) 165 match rest with 166 | [] -> () 167 | parent :: rest_tail -> 168 let updated_parent = { parent with children_count = parent.children_count + 1 } in 169 state.ancestor_stack <- updated_parent :: rest_tail) 170 171let characters state text collector = 172 (* Check if text is allowed in current context *) 173 match state.ancestor_stack with 174 | [] -> 175 (* Text at root level - only whitespace allowed *) 176 if not (String.trim text = "") then 177 Message_collector.add_typed collector 178 (`Generic "Text content not allowed at document root") 179 | parent :: rest -> 180 let content_model = parent.spec.Element_spec.content_model in 181 if not (allows_text content_model) then 182 (* Only report if non-whitespace text *) 183 if not (String.trim text = "") then 184 Message_collector.add_typed collector 185 (`Element (`Text_not_allowed (`Parent parent.name))) 186 else ( 187 (* Text is allowed, increment child count *) 188 let updated_parent = { parent with children_count = parent.children_count + 1 } in 189 state.ancestor_stack <- updated_parent :: rest) 190 191let end_document state collector = 192 (* Check for unclosed elements *) 193 List.iter 194 (fun context -> 195 Message_collector.add_typed collector 196 (`Generic (Printf.sprintf "Unclosed element '%s'" context.name))) 197 state.ancestor_stack 198 199(* Package as first-class module *) 200let checker = Checker.make ~create ~reset ~start_element ~end_element 201 ~characters ~end_document ()