(** Microdata validation checker. Validates HTML5 microdata attributes. *) (** Quote helper for consistent message formatting. *) let q = Error_code.q (** Information about an itemscope. *) type item_scope = { element : string; location : Message.location option; [@warning "-69"] itemtype : string option; [@warning "-69"] itemid : string option; [@warning "-69"] itemref : string list; [@warning "-69"] } (** Information about an itemref reference. *) type itemref_reference = { referring_element : string; referenced_ids : string list; location : Message.location option; [@warning "-69"] } (** Checker state tracking microdata. *) type state = { mutable scope_stack : item_scope list; mutable itemref_references : itemref_reference list; mutable all_ids : (string, unit) Hashtbl.t; [@warning "-69"] mutable html_element_seen : bool; [@warning "-69"] } let create () = { scope_stack = []; itemref_references = []; all_ids = Hashtbl.create 64; html_element_seen = false; } let reset state = state.scope_stack <- []; state.itemref_references <- []; Hashtbl.clear state.all_ids; state.html_element_seen <- false (** Split whitespace-separated values - uses shared utility. *) let split_whitespace = Datatype.split_on_whitespace (** Check if a string is a valid URL (contains a colon). *) let is_url s = String.contains s ':' (** Validate that a URL is a valid absolute URL for itemtype/itemid. Uses the comprehensive URL validation from Url_checker. original_value is the full attribute value (for error messages when split by whitespace) *) let validate_microdata_url url element attr_name original_value = let url_trimmed = String.trim url in if String.length url_trimmed = 0 then Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad absolute URL: Must be non-empty." (q original_value) (q attr_name) (q element)) else (* First check if it has a scheme (required for absolute URL) *) match Url_checker.extract_scheme url_trimmed with | None -> Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad absolute URL: The string %s is not an absolute URL." (q original_value) (q attr_name) (q element) (q url)) | Some _ -> (* Has a scheme - do comprehensive URL validation *) match Url_checker.validate_url url element attr_name with | None -> None | Some error_msg -> (* Replace "Bad URL:" with "Bad absolute URL:" for microdata *) let error_msg = Str.global_replace (Str.regexp "Bad URL:") "Bad absolute URL:" error_msg in (* Also replace the URL value with the original value in case they differ *) (* Escape backslashes in replacement string for Str.global_replace *) let escaped_original = Str.global_replace (Str.regexp "\\\\") "\\\\\\\\" original_value in let error_msg = Str.global_replace (Str.regexp_string (Printf.sprintf "%s for attribute" (q url))) (Printf.sprintf "%s for attribute" (q escaped_original)) error_msg in Some error_msg (** Check if itemprop value is valid. *) let validate_itemprop_value value = if String.length value = 0 then Error "itemprop value must not be empty" else if not (is_url value) && String.contains value ':' then Error (Printf.sprintf "itemprop value '%s' contains a colon but is not a URL" value) else Ok () (** Check if element is inside an itemscope or referenced by itemref. *) let is_property_element state = state.scope_stack <> [] (** Get attributes from attribute list. *) let get_attr attrs name = try Some (List.assoc name attrs) with Not_found -> None (** Process microdata attributes. *) let process_microdata_attrs state ~element ~attrs ~location collector = let has_itemscope = List.mem_assoc "itemscope" attrs in let itemtype_opt = get_attr attrs "itemtype" in let itemid_opt = get_attr attrs "itemid" in let itemref_opt = get_attr attrs "itemref" in let itemprop_opt = get_attr attrs "itemprop" in begin match itemid_opt with | Some itemid -> if not has_itemscope then Message_collector.add_typed collector (`Generic "itemid attribute requires itemscope attribute"); if itemtype_opt = None then Message_collector.add_typed collector (`Generic "itemid attribute requires itemtype attribute"); (match Url_checker.validate_url itemid element "itemid" with | None -> () | Some error_msg -> Message_collector.add_typed collector (`Attr (`Bad_value_generic (`Message error_msg)))) | None -> () end; begin match itemref_opt with | Some itemref_value -> if not has_itemscope then Message_collector.add_typed collector (`Generic "itemref attribute requires itemscope attribute") else begin let ids = split_whitespace itemref_value in state.itemref_references <- { referring_element = element; referenced_ids = ids; location; } :: state.itemref_references end | None -> () end; begin match itemtype_opt with | Some itemtype -> if not has_itemscope then Message_collector.add_typed collector (`Generic "itemtype attribute requires itemscope attribute") else begin let types = split_whitespace itemtype in if types = [] then Message_collector.add_typed collector (`Attr (`Bad_value (`Elem element, `Attr "itemtype", `Value itemtype, `Reason ""))) else List.iter (fun url -> match validate_microdata_url url element "itemtype" itemtype with | None -> () | Some error_msg -> Message_collector.add_typed collector (`Attr (`Bad_value_generic (`Message error_msg))) ) types end | None -> () end; (* Check itemprop value validity *) begin match itemprop_opt with | Some itemprop_value -> let props = split_whitespace itemprop_value in List.iter (fun prop -> match validate_itemprop_value prop with | Ok () -> () | Error msg -> Message_collector.add_typed collector (`Generic msg) ) props; (* Check itemprop can only appear on property elements *) if not (is_property_element state) then Message_collector.add_typed collector (`Generic "itemprop attribute can only appear on elements that are \ properties of an item (descendant of itemscope or referenced by itemref)") | None -> () end; (* If this element has itemscope, push it onto the stack *) if has_itemscope then begin let itemref = match itemref_opt with | Some v -> split_whitespace v | None -> [] in let scope = { element; location; itemtype = itemtype_opt; itemid = itemid_opt; itemref; } in state.scope_stack <- scope :: state.scope_stack end (** Track IDs for itemref validation. *) let track_id state attrs = match get_attr attrs "id" with | Some id -> if String.length id > 0 then Hashtbl.replace state.all_ids id () | None -> () (** Detect itemref cycles using depth-first search. *) let detect_itemref_cycles state collector = (* Build adjacency list from itemref references *) let graph = Hashtbl.create 32 in List.iter (fun ref -> Hashtbl.replace graph ref.referring_element ref.referenced_ids ) state.itemref_references; (* DFS to detect cycles *) let rec visit visited stack node = if List.mem node stack then (* Found a cycle *) Some (node :: stack) else if List.mem node visited then None else match Hashtbl.find_opt graph node with | None -> None | Some neighbors -> let stack' = node :: stack in let rec check_neighbors = function | [] -> None | neighbor :: rest -> match visit visited stack' neighbor with | Some cycle -> Some cycle | None -> check_neighbors rest in check_neighbors neighbors in (* Check all nodes *) let rec check_all_nodes visited nodes = match nodes with | [] -> () | node :: rest -> begin match visit visited [] node with | Some cycle -> let cycle_str = String.concat " -> " (List.rev cycle) in Message_collector.add_typed collector (`Generic (Printf.sprintf "itemref cycle detected: %s" cycle_str)) | None -> () end; check_all_nodes (node :: visited) rest in let all_nodes = Hashtbl.to_seq_keys graph |> List.of_seq in check_all_nodes [] all_nodes let start_element state ~element collector = let name = Tag.tag_to_string element.Element.tag in let attrs = element.raw_attrs in let location = None in track_id state attrs; process_microdata_attrs state ~element:name ~attrs ~location collector let end_element state ~tag _collector = let name = Tag.tag_to_string tag in (* Pop itemscope from stack if this element had one *) match state.scope_stack with | scope :: rest when scope.element = name -> state.scope_stack <- rest | _ -> () let end_document state collector = (* Check all itemref references point to existing IDs *) List.iter (fun ref -> List.iter (fun id -> if not (Hashtbl.mem state.all_ids id) then Message_collector.add_typed collector (`Generic (Printf.sprintf "itemref on <%s> refers to ID '%s' which does not exist" ref.referring_element id)) ) ref.referenced_ids ) state.itemref_references; (* Detect itemref cycles *) detect_itemref_cycles state collector let checker = Checker.make ~create ~reset ~start_element ~end_element ~end_document ()