OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** Microdata validation checker.
2
3 Validates HTML5 microdata attributes. *)
4
5(** Quote helper for consistent message formatting. *)
6let q = Error_code.q
7
8(** Information about an itemscope. *)
9type item_scope = {
10 element : string;
11 location : Message.location option; [@warning "-69"]
12 itemtype : string option; [@warning "-69"]
13 itemid : string option; [@warning "-69"]
14 itemref : string list; [@warning "-69"]
15}
16
17(** Information about an itemref reference. *)
18type itemref_reference = {
19 referring_element : string;
20 referenced_ids : string list;
21 location : Message.location option; [@warning "-69"]
22}
23
24(** Checker state tracking microdata. *)
25type state = {
26 mutable scope_stack : item_scope list;
27 mutable itemref_references : itemref_reference list;
28 mutable all_ids : (string, unit) Hashtbl.t; [@warning "-69"]
29 mutable html_element_seen : bool; [@warning "-69"]
30}
31
32let create () =
33 {
34 scope_stack = [];
35 itemref_references = [];
36 all_ids = Hashtbl.create 64;
37 html_element_seen = false;
38 }
39
40let reset state =
41 state.scope_stack <- [];
42 state.itemref_references <- [];
43 Hashtbl.clear state.all_ids;
44 state.html_element_seen <- false
45
46(** Split whitespace-separated values - uses shared utility. *)
47let split_whitespace = Datatype.split_on_whitespace
48
49(** Check if a string is a valid URL (contains a colon). *)
50let is_url s =
51 String.contains s ':'
52
53(** Validate that a URL is a valid absolute URL for itemtype/itemid.
54 Uses the comprehensive URL validation from Url_checker.
55 original_value is the full attribute value (for error messages when split by whitespace) *)
56let validate_microdata_url url element attr_name original_value =
57 let url_trimmed = String.trim url in
58 if String.length url_trimmed = 0 then
59 Some (Printf.sprintf
60 "Bad value %s for attribute %s on element %s: Bad absolute URL: Must be non-empty."
61 (q original_value) (q attr_name) (q element))
62 else
63 (* First check if it has a scheme (required for absolute URL) *)
64 match Url_checker.extract_scheme url_trimmed with
65 | None ->
66 Some (Printf.sprintf
67 "Bad value %s for attribute %s on element %s: Bad absolute URL: The string %s is not an absolute URL."
68 (q original_value) (q attr_name) (q element) (q url))
69 | Some _ ->
70 (* Has a scheme - do comprehensive URL validation *)
71 match Url_checker.validate_url url element attr_name with
72 | None -> None
73 | Some error_msg ->
74 (* Replace "Bad URL:" with "Bad absolute URL:" for microdata *)
75 let error_msg = Str.global_replace (Str.regexp "Bad URL:") "Bad absolute URL:" error_msg in
76 (* Also replace the URL value with the original value in case they differ *)
77 (* Escape backslashes in replacement string for Str.global_replace *)
78 let escaped_original = Str.global_replace (Str.regexp "\\\\") "\\\\\\\\" original_value in
79 let error_msg = Str.global_replace
80 (Str.regexp_string (Printf.sprintf "%s for attribute" (q url)))
81 (Printf.sprintf "%s for attribute" (q escaped_original))
82 error_msg in
83 Some error_msg
84
85(** Check if itemprop value is valid. *)
86let validate_itemprop_value value =
87 if String.length value = 0 then
88 Error "itemprop value must not be empty"
89 else if not (is_url value) && String.contains value ':' then
90 Error (Printf.sprintf
91 "itemprop value '%s' contains a colon but is not a URL" value)
92 else
93 Ok ()
94
95(** Check if element is inside an itemscope or referenced by itemref. *)
96let is_property_element state =
97 state.scope_stack <> []
98
99(** Get attributes from attribute list. *)
100let get_attr attrs name =
101 try Some (List.assoc name attrs)
102 with Not_found -> None
103
104(** Process microdata attributes. *)
105let process_microdata_attrs state ~element ~attrs ~location collector =
106 let has_itemscope = List.mem_assoc "itemscope" attrs in
107 let itemtype_opt = get_attr attrs "itemtype" in
108 let itemid_opt = get_attr attrs "itemid" in
109 let itemref_opt = get_attr attrs "itemref" in
110 let itemprop_opt = get_attr attrs "itemprop" in
111
112 begin match itemid_opt with
113 | Some itemid ->
114 if not has_itemscope then
115 Message_collector.add_typed collector
116 (`Generic "itemid attribute requires itemscope attribute");
117 if itemtype_opt = None then
118 Message_collector.add_typed collector
119 (`Generic "itemid attribute requires itemtype attribute");
120 (match Url_checker.validate_url itemid element "itemid" with
121 | None -> ()
122 | Some error_msg ->
123 Message_collector.add_typed collector (`Attr (`Bad_value_generic (`Message error_msg))))
124 | None -> ()
125 end;
126
127 begin match itemref_opt with
128 | Some itemref_value ->
129 if not has_itemscope then
130 Message_collector.add_typed collector
131 (`Generic "itemref attribute requires itemscope attribute")
132 else begin
133 let ids = split_whitespace itemref_value in
134 state.itemref_references <- {
135 referring_element = element;
136 referenced_ids = ids;
137 location;
138 } :: state.itemref_references
139 end
140 | None -> ()
141 end;
142
143 begin match itemtype_opt with
144 | Some itemtype ->
145 if not has_itemscope then
146 Message_collector.add_typed collector
147 (`Generic "itemtype attribute requires itemscope attribute")
148 else begin
149 let types = split_whitespace itemtype in
150 if types = [] then
151 Message_collector.add_typed collector
152 (`Attr (`Bad_value (`Elem element, `Attr "itemtype", `Value itemtype, `Reason "")))
153 else
154 List.iter (fun url ->
155 match validate_microdata_url url element "itemtype" itemtype with
156 | None -> ()
157 | Some error_msg ->
158 Message_collector.add_typed collector (`Attr (`Bad_value_generic (`Message error_msg)))
159 ) types
160 end
161 | None -> ()
162 end;
163
164 (* Check itemprop value validity *)
165 begin match itemprop_opt with
166 | Some itemprop_value ->
167 let props = split_whitespace itemprop_value in
168 List.iter (fun prop ->
169 match validate_itemprop_value prop with
170 | Ok () -> ()
171 | Error msg ->
172 Message_collector.add_typed collector
173 (`Generic msg)
174 ) props;
175
176 (* Check itemprop can only appear on property elements *)
177 if not (is_property_element state) then
178 Message_collector.add_typed collector
179 (`Generic "itemprop attribute can only appear on elements that are \
180 properties of an item (descendant of itemscope or referenced by itemref)")
181 | None -> ()
182 end;
183
184 (* If this element has itemscope, push it onto the stack *)
185 if has_itemscope then begin
186 let itemref = match itemref_opt with
187 | Some v -> split_whitespace v
188 | None -> []
189 in
190 let scope = {
191 element;
192 location;
193 itemtype = itemtype_opt;
194 itemid = itemid_opt;
195 itemref;
196 } in
197 state.scope_stack <- scope :: state.scope_stack
198 end
199
200(** Track IDs for itemref validation. *)
201let track_id state attrs =
202 match get_attr attrs "id" with
203 | Some id ->
204 if String.length id > 0 then
205 Hashtbl.replace state.all_ids id ()
206 | None -> ()
207
208(** Detect itemref cycles using depth-first search. *)
209let detect_itemref_cycles state collector =
210 (* Build adjacency list from itemref references *)
211 let graph = Hashtbl.create 32 in
212 List.iter (fun ref ->
213 Hashtbl.replace graph ref.referring_element ref.referenced_ids
214 ) state.itemref_references;
215
216 (* DFS to detect cycles *)
217 let rec visit visited stack node =
218 if List.mem node stack then
219 (* Found a cycle *)
220 Some (node :: stack)
221 else if List.mem node visited then
222 None
223 else
224 match Hashtbl.find_opt graph node with
225 | None -> None
226 | Some neighbors ->
227 let stack' = node :: stack in
228 let rec check_neighbors = function
229 | [] -> None
230 | neighbor :: rest ->
231 match visit visited stack' neighbor with
232 | Some cycle -> Some cycle
233 | None -> check_neighbors rest
234 in
235 check_neighbors neighbors
236 in
237
238 (* Check all nodes *)
239 let rec check_all_nodes visited nodes =
240 match nodes with
241 | [] -> ()
242 | node :: rest ->
243 begin match visit visited [] node with
244 | Some cycle ->
245 let cycle_str = String.concat " -> " (List.rev cycle) in
246 Message_collector.add_typed collector
247 (`Generic (Printf.sprintf "itemref cycle detected: %s" cycle_str))
248 | None -> ()
249 end;
250 check_all_nodes (node :: visited) rest
251 in
252
253 let all_nodes = Hashtbl.to_seq_keys graph |> List.of_seq in
254 check_all_nodes [] all_nodes
255
256let start_element state ~element collector =
257 let name = Tag.tag_to_string element.Element.tag in
258 let attrs = element.raw_attrs in
259 let location = None in
260 track_id state attrs;
261 process_microdata_attrs state ~element:name ~attrs ~location collector
262
263let end_element state ~tag _collector =
264 let name = Tag.tag_to_string tag in
265 (* Pop itemscope from stack if this element had one *)
266 match state.scope_stack with
267 | scope :: rest when scope.element = name ->
268 state.scope_stack <- rest
269 | _ -> ()
270
271let end_document state collector =
272 (* Check all itemref references point to existing IDs *)
273 List.iter (fun ref ->
274 List.iter (fun id ->
275 if not (Hashtbl.mem state.all_ids id) then
276 Message_collector.add_typed collector
277 (`Generic (Printf.sprintf
278 "itemref on <%s> refers to ID '%s' which does not exist"
279 ref.referring_element id))
280 ) ref.referenced_ids
281 ) state.itemref_references;
282
283 (* Detect itemref cycles *)
284 detect_itemref_cycles state collector
285
286let checker = Checker.make ~create ~reset ~start_element ~end_element
287 ~end_document ()