OCaml HTML5 parser/serialiser based on Python's JustHTML
1type element_context = {
2 name : string;
3 spec : Element_spec.t;
4 children_count : int;
5 is_foreign : bool; (* SVG or MathML element *)
6}
7
8type state = {
9 registry : Element_registry.t;
10 mutable ancestor_stack : element_context list;
11}
12
13let create_with_registry ?(registry = Element_registry.default ()) _collector =
14 { registry; ancestor_stack = [] }
15
16let create () = create_with_registry (Message_collector.create ())
17
18let reset state =
19 state.ancestor_stack <- []
20
21(* Check if an element name matches a content model *)
22let rec matches_content_model registry element_name content_model =
23 match content_model with
24 | Content_model.Nothing -> false
25 | Content_model.Text -> false (* Text, not element *)
26 | Content_model.Transparent -> true (* Inherits parent, allow for now *)
27 | Content_model.Categories cats -> (
28 match Element_registry.get registry element_name with
29 | None -> false
30 | Some spec ->
31 List.exists (fun cat -> Element_spec.has_category spec cat) cats)
32 | Content_model.Elements names ->
33 List.mem (String.lowercase_ascii element_name)
34 (List.map String.lowercase_ascii names)
35 | Content_model.Mixed cats -> (
36 match Element_registry.get registry element_name with
37 | None -> false
38 | Some spec ->
39 List.exists (fun cat -> Element_spec.has_category spec cat) cats)
40 | Content_model.One_or_more model -> matches_content_model registry element_name model
41 | Content_model.Zero_or_more model -> matches_content_model registry element_name model
42 | Content_model.Optional model -> matches_content_model registry element_name model
43 | Content_model.Sequence models ->
44 (* For sequences, allow any of the models for now (simplified) *)
45 List.exists (matches_content_model registry element_name) models
46 | Content_model.Choice models ->
47 List.exists (matches_content_model registry element_name) models
48 | Content_model.Except (model, excluded_cats) -> (
49 match Element_registry.get registry element_name with
50 | None -> matches_content_model registry element_name model
51 | Some spec ->
52 matches_content_model registry element_name model
53 && not (List.exists (fun cat -> Element_spec.has_category spec cat) excluded_cats))
54
55(* Check if text is allowed in a content model *)
56let rec allows_text content_model =
57 match content_model with
58 | Content_model.Nothing -> false
59 | Content_model.Text -> true
60 | Content_model.Transparent -> true (* Inherits parent *)
61 | Content_model.Categories cats ->
62 (* Phrasing and Flow content include text *)
63 List.mem Content_category.Phrasing cats || List.mem Content_category.Flow cats
64 | Content_model.Elements _ -> false (* Specific elements only *)
65 | Content_model.Mixed _ -> true (* Text + elements *)
66 | Content_model.One_or_more model -> allows_text model
67 | Content_model.Zero_or_more model -> allows_text model
68 | Content_model.Optional model -> allows_text model
69 | Content_model.Sequence models -> List.exists allows_text models
70 | Content_model.Choice models -> List.exists allows_text models
71 | Content_model.Except (model, _) -> allows_text model
72
73(* Check for prohibited ancestors *)
74let check_prohibited_ancestors state name spec collector =
75 List.iter
76 (fun prohibited ->
77 if List.exists (fun ctx -> String.equal ctx.name prohibited) state.ancestor_stack then
78 Message_collector.add_typed collector
79 (`Element (`Not_allowed_as_child (`Child name, `Parent prohibited))))
80 spec.Element_spec.prohibited_ancestors
81
82(* Validate that a child element is allowed *)
83let validate_child_element state child_name collector =
84 match state.ancestor_stack with
85 | [] ->
86 (* Root level - only html allowed *)
87 if not (String.equal (String.lowercase_ascii child_name) "html") then
88 Message_collector.add_typed collector
89 (`Generic (Printf.sprintf "Element '%s' not allowed at document root (only 'html' allowed)" child_name))
90 | parent :: _ ->
91 let content_model = parent.spec.Element_spec.content_model in
92 if not (matches_content_model state.registry child_name content_model) then
93 Message_collector.add_typed collector
94 (`Element (`Not_allowed_as_child (`Child child_name, `Parent parent.name)))
95
96let start_element state ~element collector =
97 let name = Tag.tag_to_string element.Element.tag in
98
99 (* Check if we're inside a foreign (SVG/MathML) context *)
100 let in_foreign_context = match state.ancestor_stack with
101 | ctx :: _ -> ctx.is_foreign
102 | [] -> false
103 in
104
105 (* Determine if this element is foreign content *)
106 let is_foreign = match element.Element.tag with
107 | Tag.Svg _ | Tag.MathML _ -> true
108 | _ -> in_foreign_context (* Inherit from parent if inside foreign content *)
109 in
110
111 (* If entering foreign content from HTML, SVG/MathML are valid embedded content *)
112 (* If already in foreign content, skip HTML content model checks *)
113 if is_foreign && not in_foreign_context then begin
114 (* Entering SVG/MathML from HTML - just track it, it's valid embedded content *)
115 let spec = Element_spec.make ~name ~content_model:(Content_model.Categories [Content_category.Flow]) () in
116 let context = { name; spec; children_count = 0; is_foreign = true } in
117 state.ancestor_stack <- context :: state.ancestor_stack
118 end else if is_foreign then begin
119 (* Inside SVG/MathML - just track nesting, don't validate against HTML *)
120 let spec = Element_spec.make ~name ~content_model:(Content_model.Categories [Content_category.Flow]) () in
121 let context = { name; spec; children_count = 0; is_foreign = true } in
122 state.ancestor_stack <- context :: state.ancestor_stack
123 end else begin
124 (* HTML element - do normal validation *)
125 let spec_opt = Element_registry.get state.registry name in
126
127 match spec_opt with
128 | None ->
129 (* Unknown element - first check if it's allowed in current context *)
130 validate_child_element state name collector
131 | Some spec ->
132 (* Check prohibited ancestors *)
133 check_prohibited_ancestors state name spec collector;
134
135 (* Validate this element is allowed as child of parent *)
136 validate_child_element state name collector;
137
138 (* Push element context onto stack *)
139 let context = { name; spec; children_count = 0; is_foreign = false } in
140 state.ancestor_stack <- context :: state.ancestor_stack
141 end
142
143let end_element state ~tag collector =
144 let name = Tag.tag_to_string tag in
145 match state.ancestor_stack with
146 | [] ->
147 (* Unmatched closing tag *)
148 Message_collector.add_typed collector
149 (`Generic (Printf.sprintf "Unmatched closing tag '%s'" name))
150 | context :: rest ->
151 if not (String.equal context.name name) then
152 (* Mismatched tag *)
153 Message_collector.add_typed collector
154 (`Generic (Printf.sprintf "Expected closing tag '%s' but got '%s'" context.name name))
155 else (
156 (* Check if void element has children *)
157 if Element_spec.is_void context.spec && context.children_count > 0 then
158 Message_collector.add_typed collector
159 (`Generic (Printf.sprintf "Void element '%s' must not have children" name));
160
161 (* Pop stack *)
162 state.ancestor_stack <- rest;
163
164 (* If there's a parent, increment its child count *)
165 match rest with
166 | [] -> ()
167 | parent :: rest_tail ->
168 let updated_parent = { parent with children_count = parent.children_count + 1 } in
169 state.ancestor_stack <- updated_parent :: rest_tail)
170
171let characters state text collector =
172 (* Check if text is allowed in current context *)
173 match state.ancestor_stack with
174 | [] ->
175 (* Text at root level - only whitespace allowed *)
176 if not (String.trim text = "") then
177 Message_collector.add_typed collector
178 (`Generic "Text content not allowed at document root")
179 | parent :: rest ->
180 let content_model = parent.spec.Element_spec.content_model in
181 if not (allows_text content_model) then
182 (* Only report if non-whitespace text *)
183 if not (String.trim text = "") then
184 Message_collector.add_typed collector
185 (`Element (`Text_not_allowed (`Parent parent.name)))
186 else (
187 (* Text is allowed, increment child count *)
188 let updated_parent = { parent with children_count = parent.children_count + 1 } in
189 state.ancestor_stack <- updated_parent :: rest)
190
191let end_document state collector =
192 (* Check for unclosed elements *)
193 List.iter
194 (fun context ->
195 Message_collector.add_typed collector
196 (`Generic (Printf.sprintf "Unclosed element '%s'" context.name)))
197 state.ancestor_stack
198
199(* Package as first-class module *)
200let checker = Checker.make ~create ~reset ~start_element ~end_element
201 ~characters ~end_document ()