OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 287 lines 9.9 kB view raw
1(** Microdata validation checker. 2 3 Validates HTML5 microdata attributes. *) 4 5(** Quote helper for consistent message formatting. *) 6let q = Error_code.q 7 8(** Information about an itemscope. *) 9type item_scope = { 10 element : string; 11 location : Message.location option; [@warning "-69"] 12 itemtype : string option; [@warning "-69"] 13 itemid : string option; [@warning "-69"] 14 itemref : string list; [@warning "-69"] 15} 16 17(** Information about an itemref reference. *) 18type itemref_reference = { 19 referring_element : string; 20 referenced_ids : string list; 21 location : Message.location option; [@warning "-69"] 22} 23 24(** Checker state tracking microdata. *) 25type state = { 26 mutable scope_stack : item_scope list; 27 mutable itemref_references : itemref_reference list; 28 mutable all_ids : (string, unit) Hashtbl.t; [@warning "-69"] 29 mutable html_element_seen : bool; [@warning "-69"] 30} 31 32let create () = 33 { 34 scope_stack = []; 35 itemref_references = []; 36 all_ids = Hashtbl.create 64; 37 html_element_seen = false; 38 } 39 40let reset state = 41 state.scope_stack <- []; 42 state.itemref_references <- []; 43 Hashtbl.clear state.all_ids; 44 state.html_element_seen <- false 45 46(** Split whitespace-separated values - uses shared utility. *) 47let split_whitespace = Datatype.split_on_whitespace 48 49(** Check if a string is a valid URL (contains a colon). *) 50let is_url s = 51 String.contains s ':' 52 53(** Validate that a URL is a valid absolute URL for itemtype/itemid. 54 Uses the comprehensive URL validation from Url_checker. 55 original_value is the full attribute value (for error messages when split by whitespace) *) 56let validate_microdata_url url element attr_name original_value = 57 let url_trimmed = String.trim url in 58 if String.length url_trimmed = 0 then 59 Some (Printf.sprintf 60 "Bad value %s for attribute %s on element %s: Bad absolute URL: Must be non-empty." 61 (q original_value) (q attr_name) (q element)) 62 else 63 (* First check if it has a scheme (required for absolute URL) *) 64 match Url_checker.extract_scheme url_trimmed with 65 | None -> 66 Some (Printf.sprintf 67 "Bad value %s for attribute %s on element %s: Bad absolute URL: The string %s is not an absolute URL." 68 (q original_value) (q attr_name) (q element) (q url)) 69 | Some _ -> 70 (* Has a scheme - do comprehensive URL validation *) 71 match Url_checker.validate_url url element attr_name with 72 | None -> None 73 | Some error_msg -> 74 (* Replace "Bad URL:" with "Bad absolute URL:" for microdata *) 75 let error_msg = Str.global_replace (Str.regexp "Bad URL:") "Bad absolute URL:" error_msg in 76 (* Also replace the URL value with the original value in case they differ *) 77 (* Escape backslashes in replacement string for Str.global_replace *) 78 let escaped_original = Str.global_replace (Str.regexp "\\\\") "\\\\\\\\" original_value in 79 let error_msg = Str.global_replace 80 (Str.regexp_string (Printf.sprintf "%s for attribute" (q url))) 81 (Printf.sprintf "%s for attribute" (q escaped_original)) 82 error_msg in 83 Some error_msg 84 85(** Check if itemprop value is valid. *) 86let validate_itemprop_value value = 87 if String.length value = 0 then 88 Error "itemprop value must not be empty" 89 else if not (is_url value) && String.contains value ':' then 90 Error (Printf.sprintf 91 "itemprop value '%s' contains a colon but is not a URL" value) 92 else 93 Ok () 94 95(** Check if element is inside an itemscope or referenced by itemref. *) 96let is_property_element state = 97 state.scope_stack <> [] 98 99(** Get attributes from attribute list. *) 100let get_attr attrs name = 101 try Some (List.assoc name attrs) 102 with Not_found -> None 103 104(** Process microdata attributes. *) 105let process_microdata_attrs state ~element ~attrs ~location collector = 106 let has_itemscope = List.mem_assoc "itemscope" attrs in 107 let itemtype_opt = get_attr attrs "itemtype" in 108 let itemid_opt = get_attr attrs "itemid" in 109 let itemref_opt = get_attr attrs "itemref" in 110 let itemprop_opt = get_attr attrs "itemprop" in 111 112 begin match itemid_opt with 113 | Some itemid -> 114 if not has_itemscope then 115 Message_collector.add_typed collector 116 (`Generic "itemid attribute requires itemscope attribute"); 117 if itemtype_opt = None then 118 Message_collector.add_typed collector 119 (`Generic "itemid attribute requires itemtype attribute"); 120 (match Url_checker.validate_url itemid element "itemid" with 121 | None -> () 122 | Some error_msg -> 123 Message_collector.add_typed collector (`Attr (`Bad_value_generic (`Message error_msg)))) 124 | None -> () 125 end; 126 127 begin match itemref_opt with 128 | Some itemref_value -> 129 if not has_itemscope then 130 Message_collector.add_typed collector 131 (`Generic "itemref attribute requires itemscope attribute") 132 else begin 133 let ids = split_whitespace itemref_value in 134 state.itemref_references <- { 135 referring_element = element; 136 referenced_ids = ids; 137 location; 138 } :: state.itemref_references 139 end 140 | None -> () 141 end; 142 143 begin match itemtype_opt with 144 | Some itemtype -> 145 if not has_itemscope then 146 Message_collector.add_typed collector 147 (`Generic "itemtype attribute requires itemscope attribute") 148 else begin 149 let types = split_whitespace itemtype in 150 if types = [] then 151 Message_collector.add_typed collector 152 (`Attr (`Bad_value (`Elem element, `Attr "itemtype", `Value itemtype, `Reason ""))) 153 else 154 List.iter (fun url -> 155 match validate_microdata_url url element "itemtype" itemtype with 156 | None -> () 157 | Some error_msg -> 158 Message_collector.add_typed collector (`Attr (`Bad_value_generic (`Message error_msg))) 159 ) types 160 end 161 | None -> () 162 end; 163 164 (* Check itemprop value validity *) 165 begin match itemprop_opt with 166 | Some itemprop_value -> 167 let props = split_whitespace itemprop_value in 168 List.iter (fun prop -> 169 match validate_itemprop_value prop with 170 | Ok () -> () 171 | Error msg -> 172 Message_collector.add_typed collector 173 (`Generic msg) 174 ) props; 175 176 (* Check itemprop can only appear on property elements *) 177 if not (is_property_element state) then 178 Message_collector.add_typed collector 179 (`Generic "itemprop attribute can only appear on elements that are \ 180 properties of an item (descendant of itemscope or referenced by itemref)") 181 | None -> () 182 end; 183 184 (* If this element has itemscope, push it onto the stack *) 185 if has_itemscope then begin 186 let itemref = match itemref_opt with 187 | Some v -> split_whitespace v 188 | None -> [] 189 in 190 let scope = { 191 element; 192 location; 193 itemtype = itemtype_opt; 194 itemid = itemid_opt; 195 itemref; 196 } in 197 state.scope_stack <- scope :: state.scope_stack 198 end 199 200(** Track IDs for itemref validation. *) 201let track_id state attrs = 202 match get_attr attrs "id" with 203 | Some id -> 204 if String.length id > 0 then 205 Hashtbl.replace state.all_ids id () 206 | None -> () 207 208(** Detect itemref cycles using depth-first search. *) 209let detect_itemref_cycles state collector = 210 (* Build adjacency list from itemref references *) 211 let graph = Hashtbl.create 32 in 212 List.iter (fun ref -> 213 Hashtbl.replace graph ref.referring_element ref.referenced_ids 214 ) state.itemref_references; 215 216 (* DFS to detect cycles *) 217 let rec visit visited stack node = 218 if List.mem node stack then 219 (* Found a cycle *) 220 Some (node :: stack) 221 else if List.mem node visited then 222 None 223 else 224 match Hashtbl.find_opt graph node with 225 | None -> None 226 | Some neighbors -> 227 let stack' = node :: stack in 228 let rec check_neighbors = function 229 | [] -> None 230 | neighbor :: rest -> 231 match visit visited stack' neighbor with 232 | Some cycle -> Some cycle 233 | None -> check_neighbors rest 234 in 235 check_neighbors neighbors 236 in 237 238 (* Check all nodes *) 239 let rec check_all_nodes visited nodes = 240 match nodes with 241 | [] -> () 242 | node :: rest -> 243 begin match visit visited [] node with 244 | Some cycle -> 245 let cycle_str = String.concat " -> " (List.rev cycle) in 246 Message_collector.add_typed collector 247 (`Generic (Printf.sprintf "itemref cycle detected: %s" cycle_str)) 248 | None -> () 249 end; 250 check_all_nodes (node :: visited) rest 251 in 252 253 let all_nodes = Hashtbl.to_seq_keys graph |> List.of_seq in 254 check_all_nodes [] all_nodes 255 256let start_element state ~element collector = 257 let name = Tag.tag_to_string element.Element.tag in 258 let attrs = element.raw_attrs in 259 let location = None in 260 track_id state attrs; 261 process_microdata_attrs state ~element:name ~attrs ~location collector 262 263let end_element state ~tag _collector = 264 let name = Tag.tag_to_string tag in 265 (* Pop itemscope from stack if this element had one *) 266 match state.scope_stack with 267 | scope :: rest when scope.element = name -> 268 state.scope_stack <- rest 269 | _ -> () 270 271let end_document state collector = 272 (* Check all itemref references point to existing IDs *) 273 List.iter (fun ref -> 274 List.iter (fun id -> 275 if not (Hashtbl.mem state.all_ids id) then 276 Message_collector.add_typed collector 277 (`Generic (Printf.sprintf 278 "itemref on <%s> refers to ID '%s' which does not exist" 279 ref.referring_element id)) 280 ) ref.referenced_ids 281 ) state.itemref_references; 282 283 (* Detect itemref cycles *) 284 detect_itemref_cycles state collector 285 286let checker = Checker.make ~create ~reset ~start_element ~end_element 287 ~end_document ()