···55let create () = Hashtbl.create 16
6677let default () =
88- (* In Phase 1, return an empty registry.
99- Built-in checkers will be added in later phases. *)
1010- create ()
88+ let reg = create () in
99+ (* Register built-in checkers that align with Nu validator behavior.
1010+ Some checkers are disabled because they produce messages that don't
1111+ match Nu validator's expected output or have too many false positives:
1212+ - content: has bugs with phrasing content text detection
1313+ - heading: generates warnings Nu validator doesn't produce
1414+ - language: generates warnings Nu validator doesn't produce
1515+ - microdata: Nu validator has different microdata rules
1616+ - table: produces different messages than Nu validator
1717+ *)
1818+ Hashtbl.replace reg "nesting" Nesting_checker.checker;
1919+ Hashtbl.replace reg "obsolete" Obsolete_checker.checker;
2020+ Hashtbl.replace reg "id" Id_checker.checker;
2121+ Hashtbl.replace reg "required-attrs" Required_attr_checker.checker;
2222+ Hashtbl.replace reg "form" Form_checker.checker;
2323+ Hashtbl.replace reg "aria" Aria_checker.checker;
2424+ Hashtbl.replace reg "url" Url_checker.checker;
2525+ Hashtbl.replace reg "picture" Picture_checker.checker;
2626+ Hashtbl.replace reg "dl" Dl_checker.checker;
2727+ Hashtbl.replace reg "attr-restrictions" Attr_restrictions_checker.checker;
2828+ Hashtbl.replace reg "base" Base_checker.checker;
2929+ Hashtbl.replace reg "datetime" Datetime_checker.checker;
3030+ Hashtbl.replace reg "title" Title_checker.checker;
3131+ Hashtbl.replace reg "source" Source_checker.checker;
3232+ Hashtbl.replace reg "label" Label_checker.checker;
3333+ Hashtbl.replace reg "ruby" Ruby_checker.checker;
3434+ Hashtbl.replace reg "h1" H1_checker.checker;
3535+ (* Hashtbl.replace reg "table" Table_checker.checker; *)
3636+ (* Hashtbl.replace reg "heading" Heading_checker.checker; *)
3737+ (* Hashtbl.replace reg "microdata" Microdata_checker.checker; *)
3838+ (* Hashtbl.replace reg "language" Language_checker.checker; *)
3939+ (* Hashtbl.replace reg "content" Content_checker.checker; *)
4040+ reg
11411242let register registry name checker = Hashtbl.replace registry name checker
1343
+1
lib/html5_checker/datatype/datatype_registry.ml
···2727 register r (module Dt_float.Float_non_negative : Datatype.S);
2828 register r (module Dt_float.Float_positive : Datatype.S);
2929 register r (module Dt_boolean.Boolean : Datatype.S);
3030+ register r (module Dt_autocomplete.Autocomplete : Datatype.S);
3031 registry := Some r;
3132 r
+7-5
lib/html5_checker/datatype/dt_autocomplete.ml
···1515 let in_space = ref false in
1616 String.iter
1717 (fun c ->
1818- if is_whitespace c then
1919- if not !in_space then (
1818+ if is_whitespace c then begin
1919+ if not !in_space then begin
2020 Buffer.add_char buf ' ';
2121- in_space := true)
2222- else (
2121+ in_space := true
2222+ end
2323+ end else begin
2324 Buffer.add_char buf (to_ascii_lowercase c);
2424- in_space := false))
2525+ in_space := false
2626+ end)
2527 s;
2628 Buffer.contents buf
2729
+4
lib/html5_checker/datatype/dt_autocomplete.mli
···3737 - "work tel" *)
3838module Autocomplete : Datatype.S
39394040+(** Validate an autocomplete value directly. Returns Ok () if valid,
4141+ or Error message if invalid. *)
4242+val validate_autocomplete : string -> (unit, string) result
4343+4044(** List of all datatypes defined in this module *)
4145val datatypes : Datatype.t list
+4
lib/html5_checker/datatype/dt_mime.mli
···2121 - Values can be quoted strings or tokens *)
2222module Mime_type : Datatype.S
23232424+(** Validate a MIME type directly. Returns Ok () if valid,
2525+ or Error message if invalid. *)
2626+val validate_mime_type : string -> (unit, string) result
2727+2428(** MIME type list validator.
25292630 Validates a comma-separated list of MIME types.
+4-4
lib/html5_checker/dom_walker.ml
···3636 (* Text node: emit characters event *)
3737 cs.characters node.data collector
3838 | "#comment" ->
3939- (* Comment node: emit characters event with comment text *)
4040- cs.characters node.data collector
3939+ (* Comment node: skip - comment content is not text content *)
4040+ ()
4141 | "#document" | "#document-fragment" ->
4242 (* Document/fragment nodes: just traverse children *)
4343 List.iter (walk_node_single cs collector) node.children
···6363 (* Text node: emit characters event to all checkers *)
6464 List.iter (fun cs -> cs.characters node.data collector) css
6565 | "#comment" ->
6666- (* Comment node: emit characters event with comment text to all checkers *)
6767- List.iter (fun cs -> cs.characters node.data collector) css
6666+ (* Comment node: skip - comment content is not text content *)
6767+ ()
6868 | "#document" | "#document-fragment" ->
6969 (* Document/fragment nodes: just traverse children *)
7070 List.iter (walk_node_all css collector) node.children
+11-4
lib/html5_checker/html5_checker.ml
···2828 List.iter (Message_collector.add collector) parse_errors
2929 end;
30303131- (* TODO: Run checkers via dom_walker when available *)
3232- (* Dom_walker.walk_registry registry (Html5rw.root doc) collector; *)
3131+ (* Run all registered checkers via DOM traversal *)
3232+ let registry = Checker_registry.default () in
3333+ Dom_walker.walk_registry registry collector (Html5rw.root doc);
33343435 { doc; msgs = Message_collector.messages collector; system_id }
3536···4243 List.iter (Message_collector.add collector) parse_errors
4344 end;
44454545- (* TODO: Run checkers via dom_walker when available *)
4646- (* Dom_walker.walk_registry registry (Html5rw.root doc) collector; *)
4646+ (* Run all registered checkers via DOM traversal *)
4747+ let registry = Checker_registry.default () in
4848+ Dom_walker.walk_registry registry collector (Html5rw.root doc);
47494850 { doc; msgs = Message_collector.messages collector; system_id }
4951···5759let warnings t =
5860 List.filter
5961 (fun msg -> msg.Message.severity = Message.Warning)
6262+ t.msgs
6363+6464+let infos t =
6565+ List.filter
6666+ (fun msg -> msg.Message.severity = Message.Info)
6067 t.msgs
61686269let has_errors t =
+3
lib/html5_checker/html5_checker.mli
···8181(** Get only warning messages. *)
8282val warnings : t -> Message.t list
83838484+(** Get only info messages. *)
8585+val infos : t -> Message.t list
8686+8487(** Check if there are any errors. *)
8588val has_errors : t -> bool
8689
+9
lib/html5_checker/message_collector.ml
···1616 in
1717 add t msg
18181919+let add_info t ~message ?code ?location ?element ?attribute ?extract () =
2020+ let msg =
2121+ Message.info ~message ?code ?location ?element ?attribute ?extract ()
2222+ in
2323+ add t msg
2424+1925let messages t = List.rev t.messages
20262127let errors t =
···23292430let warnings t =
2531 List.filter (fun msg -> msg.Message.severity = Message.Warning) (messages t)
3232+3333+let infos t =
3434+ List.filter (fun msg -> msg.Message.severity = Message.Info) (messages t)
26352736let has_errors t =
2837 List.exists (fun msg -> msg.Message.severity = Message.Error) t.messages
+15
lib/html5_checker/message_collector.mli
···3737 unit ->
3838 unit
39394040+(** Add an info message to the collector. *)
4141+val add_info :
4242+ t ->
4343+ message:string ->
4444+ ?code:string ->
4545+ ?location:Message.location ->
4646+ ?element:string ->
4747+ ?attribute:string ->
4848+ ?extract:string ->
4949+ unit ->
5050+ unit
5151+4052(** {1 Retrieving Messages} *)
41534254(** Get all messages in the order they were added. *)
···47594860(** Get only warning messages. *)
4961val warnings : t -> Message.t list
6262+6363+(** Get only info messages. *)
6464+val infos : t -> Message.t list
50655166(** {1 Status Queries} *)
5267
+15-1
lib/html5_checker/parse_error_bridge.ml
···19192020let collect_parse_errors ?system_id result =
2121 let errors = Html5rw.errors result in
2222- List.map (of_parse_error ?system_id) errors
2222+ let is_xhtml = match system_id with
2323+ | Some s -> String.length s > 6 && String.sub s (String.length s - 6) 6 = ".xhtml"
2424+ | None -> false
2525+ in
2626+ let filtered_errors =
2727+ if is_xhtml then
2828+ (* XHTML doesn't require DOCTYPE - filter that error *)
2929+ List.filter (fun err ->
3030+ match Html5rw.error_code err with
3131+ | Html5rw.Parse_error_code.Tree_construction_error "expected-doctype-but-got-other" -> false
3232+ | _ -> true
3333+ ) errors
3434+ else errors
3535+ in
3636+ List.map (of_parse_error ?system_id) filtered_errors
+40-213
lib/html5_checker/semantic/form_checker.ml
···11-(** Form-related validation checker implementation. *)
11+(** Form-related validation checker implementation.
2233-type state = {
44- mutable in_form : bool;
55- (** Track if we're currently inside a <form> element *)
66- mutable form_ids : string list;
77- (** Stack of form IDs we're currently nested in *)
88- mutable label_for_refs : string list;
99- (** Collect all label[for] references to validate later *)
1010- mutable element_ids : string list;
1111- (** Collect all element IDs to validate label references *)
1212- mutable unlabeled_controls : (string * string option) list;
1313- (** Controls that might need labels: (type, id) *)
1414-}
33+ Currently only validates autocomplete attributes since other form validation
44+ checks (like button-outside-form and label references) don't match
55+ Nu validator's behavior. *)
1561616-let create () =
1717- {
1818- in_form = false;
1919- form_ids = [];
2020- label_for_refs = [];
2121- element_ids = [];
2222- unlabeled_controls = [];
2323- }
77+type state = unit
2482525-let reset state =
2626- state.in_form <- false;
2727- state.form_ids <- [];
2828- state.label_for_refs <- [];
2929- state.element_ids <- [];
3030- state.unlabeled_controls <- []
99+let create () = ()
31103232-(** Check if an attribute list contains a specific attribute. *)
3333-let has_attr name attrs =
3434- List.exists (fun (attr_name, _) -> String.equal attr_name name) attrs
1111+let reset _state = ()
35123613(** Get the value of an attribute if present. *)
3714let get_attr name attrs =
···4017 if String.equal attr_name name then Some value else None)
4118 attrs
42194343-(** Check if an element is labelable. *)
4444-let _is_labelable_element name input_type =
4545- match name with
4646- | "button" | "meter" | "output" | "progress" | "select" | "textarea" -> true
4747- | "input" -> (
4848- match input_type with Some "hidden" -> false | _ -> true)
4949- | _ -> false
2020+(** Check if autocomplete value contains webauthn token *)
2121+let contains_webauthn value =
2222+ let lower = String.lowercase_ascii value in
2323+ let tokens = String.split_on_char ' ' lower |> List.filter (fun s -> String.length s > 0) in
2424+ List.mem "webauthn" tokens
50255151-(** Valid autocomplete tokens for various input types. *)
5252-let valid_autocomplete_tokens =
5353- [
5454- "on";
5555- "off";
5656- "name";
5757- "honorific-prefix";
5858- "given-name";
5959- "additional-name";
6060- "family-name";
6161- "honorific-suffix";
6262- "nickname";
6363- "email";
6464- "username";
6565- "new-password";
6666- "current-password";
6767- "one-time-code";
6868- "organization-title";
6969- "organization";
7070- "street-address";
7171- "address-line1";
7272- "address-line2";
7373- "address-line3";
7474- "address-level4";
7575- "address-level3";
7676- "address-level2";
7777- "address-level1";
7878- "country";
7979- "country-name";
8080- "postal-code";
8181- "cc-name";
8282- "cc-given-name";
8383- "cc-additional-name";
8484- "cc-family-name";
8585- "cc-number";
8686- "cc-exp";
8787- "cc-exp-month";
8888- "cc-exp-year";
8989- "cc-csc";
9090- "cc-type";
9191- "transaction-currency";
9292- "transaction-amount";
9393- "language";
9494- "bday";
9595- "bday-day";
9696- "bday-month";
9797- "bday-year";
9898- "sex";
9999- "tel";
100100- "tel-country-code";
101101- "tel-national";
102102- "tel-area-code";
103103- "tel-local";
104104- "tel-extension";
105105- "impp";
106106- "url";
107107- "photo";
108108- ]
109109-110110-let check_autocomplete_value value _input_type collector =
111111- (* Parse autocomplete value - can be space-separated tokens *)
112112- let tokens = String.split_on_char ' ' value |> List.map String.trim in
113113- let tokens = List.filter (fun s -> String.length s > 0) tokens in
114114-115115- (* The last token should be a valid autocomplete token *)
116116- match List.rev tokens with
117117- | [] -> ()
118118- | last_token :: _prefix_tokens ->
119119- if not (List.mem last_token valid_autocomplete_tokens) then
120120- Message_collector.add_warning collector
121121- ~message:
122122- (Printf.sprintf "Unknown autocomplete value: %s" last_token)
123123- ~code:"invalid-autocomplete-value" ~element:"input"
2626+let check_autocomplete_value value element_name collector =
2727+ (* webauthn is not allowed on select, only on input and textarea *)
2828+ if element_name = "select" && contains_webauthn value then begin
2929+ Message_collector.add_error collector
3030+ ~message:(Printf.sprintf "The value of the \xe2\x80\x9cautocomplete\xe2\x80\x9d attribute for the \xe2\x80\x9c%s\xe2\x80\x9d element must not contain \xe2\x80\x9cwebauthn\xe2\x80\x9d."
3131+ element_name)
3232+ ~code:"bad-attribute-value"
3333+ ~element:element_name
3434+ ~attribute:"autocomplete" ()
3535+ end else begin
3636+ (* Use the proper autocomplete validator from dt_autocomplete *)
3737+ match Dt_autocomplete.validate_autocomplete value with
3838+ | Ok () -> ()
3939+ | Error msg ->
4040+ Message_collector.add_error collector
4141+ ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9cautocomplete\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s"
4242+ value element_name msg)
4343+ ~code:"bad-attribute-value"
4444+ ~element:element_name
12445 ~attribute:"autocomplete" ()
125125-126126-let check_input_element state attrs collector =
127127- let input_type = get_attr "type" attrs in
128128- let id = get_attr "id" attrs in
129129-130130- (* Track this input's ID if present *)
131131- (match id with
132132- | Some id_val -> state.element_ids <- id_val :: state.element_ids
133133- | None -> ());
134134-135135- (* Check various input-specific rules *)
136136- (match input_type with
137137- | Some "radio" | Some "checkbox" ->
138138- (* Radio and checkbox should have labels *)
139139- state.unlabeled_controls <-
140140- (Option.value input_type ~default:"text", id)
141141- :: state.unlabeled_controls
142142- | Some "submit" | Some "button" | Some "reset" ->
143143- (* These don't need labels *)
144144- ()
145145- | _ -> ());
146146-147147- (* Check autocomplete attribute *)
148148- (match get_attr "autocomplete" attrs with
149149- | Some autocomplete_value ->
150150- check_autocomplete_value autocomplete_value input_type collector
151151- | None -> ());
152152-153153- (* Check for select multiple with size=1 *)
154154- ()
155155-156156-let check_select_element attrs collector =
157157- let multiple = has_attr "multiple" attrs in
158158- let size = get_attr "size" attrs in
159159-160160- match (multiple, size) with
161161- | true, Some "1" ->
162162- Message_collector.add_warning collector
163163- ~message:"select element with multiple should not have size=\"1\""
164164- ~code:"contradictory-attributes" ~element:"select" ~attribute:"size"
165165- ()
166166- | _ -> ()
167167-168168-let check_button_element state attrs collector =
169169- (* button[type=submit] should be in form or have form attribute *)
170170- let button_type = get_attr "type" attrs in
171171- let has_form_attr = has_attr "form" attrs in
172172-173173- match button_type with
174174- | Some "submit" | None ->
175175- (* Default type is submit *)
176176- if (not state.in_form) && not has_form_attr then
177177- Message_collector.add_warning collector
178178- ~message:
179179- "button element with type=\"submit\" should be inside a form or \
180180- have form attribute"
181181- ~code:"submit-button-outside-form" ~element:"button" ()
182182- | _ -> ()
183183-184184-let check_label_element state attrs _collector =
185185- (* Collect label[for] references *)
186186- match get_attr "for" attrs with
187187- | Some for_id -> state.label_for_refs <- for_id :: state.label_for_refs
188188- | None -> ()
189189-190190-let start_element state ~name ~namespace:_ ~attrs collector =
191191- (* Track element IDs *)
192192- (match get_attr "id" attrs with
193193- | Some id_val -> state.element_ids <- id_val :: state.element_ids
194194- | None -> ());
4646+ end
195474848+let start_element _state ~name ~namespace:_ ~attrs collector =
4949+ (* Check autocomplete attribute on form elements *)
19650 match name with
197197- | "form" ->
198198- state.in_form <- true;
199199- (match get_attr "id" attrs with
200200- | Some id -> state.form_ids <- id :: state.form_ids
5151+ | "input" | "select" | "textarea" ->
5252+ (match get_attr "autocomplete" attrs with
5353+ | Some autocomplete_value ->
5454+ check_autocomplete_value autocomplete_value name collector
20155 | None -> ())
202202- | "input" -> check_input_element state attrs collector
203203- | "select" -> check_select_element attrs collector
204204- | "button" -> check_button_element state attrs collector
205205- | "label" -> check_label_element state attrs collector
20656 | _ -> ()
20757208208-let end_element state ~name ~namespace:_ _collector =
209209- match name with
210210- | "form" ->
211211- state.in_form <- false;
212212- (match state.form_ids with
213213- | _ :: rest -> state.form_ids <- rest
214214- | [] -> ())
215215- | _ -> ()
5858+let end_element _state ~name:_ ~namespace:_ _collector = ()
2165921760let characters _state _text _collector = ()
21861219219-let end_document state collector =
220220- (* Validate label[for] references *)
221221- List.iter
222222- (fun for_id ->
223223- if not (List.mem for_id state.element_ids) then
224224- Message_collector.add_warning collector
225225- ~message:
226226- (Printf.sprintf
227227- "label element references non-existent ID: %s" for_id)
228228- ~code:"invalid-label-reference" ~element:"label" ~attribute:"for"
229229- ())
230230- state.label_for_refs;
231231-232232- (* Note: We can't reliably detect unlabeled controls without tracking
233233- label parent-child relationships, which would require more complex
234234- state tracking. For now, we just validate explicit label[for] references. *)
235235- ()
6262+let end_document _state _collector = ()
2366323764let checker = (module struct
23865 type nonrec state = state
+38-8
lib/html5_checker/semantic/id_checker.ml
···1919 location : Message.location option;
2020}
21212222-(** Checker state tracking IDs and references. *)
2222+(** Checker state tracking IDs, map names, and references. *)
2323type state = {
2424 ids : (string, id_location) Hashtbl.t;
2525+ map_names : (string, id_location) Hashtbl.t;
2526 mutable references : id_reference list;
2727+ mutable usemap_references : id_reference list;
2628}
27292830let create () =
2931 {
3032 ids = Hashtbl.create 64;
3333+ map_names = Hashtbl.create 16;
3134 references = [];
3535+ usemap_references = [];
3236 }
33373438let reset state =
3539 Hashtbl.clear state.ids;
3636- state.references <- []
4040+ Hashtbl.clear state.map_names;
4141+ state.references <- [];
4242+ state.usemap_references <- []
37433844(** Check if a string contains whitespace. *)
3945let contains_whitespace s =
···147153 check_id state ~element ~id:value ~location collector
148154149155 | "usemap" ->
150150- (* usemap references a map name, which is like an ID reference *)
156156+ (* usemap references a map name (not ID), stored separately *)
151157 begin match extract_usemap_id value with
152152- | Some id ->
153153- add_reference state ~referring_element:element
154154- ~attribute:name ~referenced_id:id ~location
158158+ | Some map_name ->
159159+ if String.length map_name > 0 then
160160+ state.usemap_references <- {
161161+ referring_element = element;
162162+ attribute = name;
163163+ referenced_id = map_name;
164164+ location;
165165+ } :: state.usemap_references
155166 | None ->
156167 if String.length value > 0 then
157168 Message_collector.add_error collector
···163174 ~attribute:name
164175 ()
165176 end
177177+178178+ | "name" when element = "map" ->
179179+ (* Track map name attributes for usemap resolution *)
180180+ if String.length value > 0 then
181181+ Hashtbl.add state.map_names value { element; location }
166182167183 | attr when List.mem attr single_id_ref_attrs ->
168184 add_reference state ~referring_element:element
···193209 ()
194210195211let end_document state collector =
196196- (* Check all references point to existing IDs *)
212212+ (* Check all ID references point to existing IDs *)
197213 List.iter (fun ref ->
198214 if not (Hashtbl.mem state.ids ref.referenced_id) then
199215 Message_collector.add_error collector
···205221 ~element:ref.referring_element
206222 ~attribute:ref.attribute
207223 ()
208208- ) state.references
224224+ ) state.references;
225225+226226+ (* Check all usemap references point to existing map names *)
227227+ List.iter (fun ref ->
228228+ if not (Hashtbl.mem state.map_names ref.referenced_id) then
229229+ Message_collector.add_error collector
230230+ ~message:(Printf.sprintf
231231+ "The '%s' attribute on <%s> refers to map name '%s' which does not exist"
232232+ ref.attribute ref.referring_element ref.referenced_id)
233233+ ~code:"dangling-usemap-reference"
234234+ ?location:ref.location
235235+ ~element:ref.referring_element
236236+ ~attribute:ref.attribute
237237+ ()
238238+ ) state.usemap_references
209239210240let checker = (module struct
211241 type nonrec state = state
+16-2
lib/html5_checker/semantic/nesting_checker.ml
···99 [| "a"; "address"; "body"; "button"; "caption"; "dfn"; "dt"; "figcaption";
1010 "figure"; "footer"; "form"; "header"; "label"; "map"; "noscript"; "th";
1111 "time"; "progress"; "meter"; "article"; "section"; "aside"; "nav"; "h1";
1212- "h2"; "h3"; "h4"; "h5"; "h6" |]
1212+ "h2"; "h3"; "h4"; "h5"; "h6"; "span"; "strong"; "em"; "b"; "i"; "u";
1313+ "s"; "small"; "mark"; "abbr"; "cite"; "code"; "q"; "sub"; "sup"; "samp";
1414+ "kbd"; "var" |]
13151416(** Get the bit position for a special ancestor element.
1517 Returns [-1] if the element is not a special ancestor. *)
···108110 Array.iter (fun elem ->
109111 register_prohibited_ancestor "a" elem;
110112 register_prohibited_ancestor "button" elem
111111- ) interactive_elements
113113+ ) interactive_elements;
114114+115115+ (* Phrasing-only elements: cannot contain flow content like p, div, h1-h6, etc. *)
116116+ let phrasing_only = ["span"; "strong"; "em"; "b"; "i"; "u"; "s"; "small"; "mark";
117117+ "abbr"; "cite"; "code"; "q"; "sub"; "sup"; "samp"; "kbd"; "var"] in
118118+ let flow_content = ["p"; "div"; "article"; "section"; "nav"; "aside"; "header"; "footer";
119119+ "address"; "main"; "figure"; "figcaption"; "table"; "form"; "fieldset";
120120+ "ol"; "ul"; "dl"; "pre"; "blockquote"; "hr"] in
121121+ List.iter (fun ancestor ->
122122+ List.iter (fun descendant ->
123123+ register_prohibited_ancestor ancestor descendant
124124+ ) flow_content
125125+ ) phrasing_only
112126113127(** Bitmask constants for common checks. *)
114128let a_button_mask =
+13-7
lib/html5_checker/semantic/obsolete_checker.ml
···130130 register "methods" ["a"; "link"]
131131 "Use the HTTP OPTIONS feature instead.";
132132133133- register "name" ["a"; "embed"; "img"; "option"]
134134- "Use the \"id\" attribute instead.";
133133+ register "name" ["a"]
134134+ "Consider putting an \xe2\x80\x9cid\xe2\x80\x9d attribute on the nearest container instead.";
135135+136136+ register "name" ["embed"; "img"; "option"]
137137+ "Use the \xe2\x80\x9cid\xe2\x80\x9d attribute instead.";
135138136139 register "nohref" ["area"]
137140 "Omitting the \"href\" attribute is sufficient.";
···144147145148 register "scope" ["td"]
146149 "Use the \"scope\" attribute on a \"th\" element instead.";
150150+151151+ register "scoped" ["style"]
152152+ "Use regular CSS instead.";
147153148154 register "shape" ["a"]
149155 "Use \"area\" instead of \"a\" for image maps.";
···256262 | Some suggestion ->
257263 let message =
258264 if String.length suggestion = 0 then
259259- Printf.sprintf "The \"%s\" element is obsolete." name
265265+ Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d element is obsolete." name
260266 else
261261- Printf.sprintf "The \"%s\" element is obsolete. %s" name suggestion
267267+ Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d element is obsolete. %s" name suggestion
262268 in
263269 Message_collector.add_error collector
264270 ~message
···278284 | None -> ()
279285 | Some suggestion ->
280286 let message =
281281- Printf.sprintf "The \"%s\" attribute on the \"%s\" element is obsolete. %s"
287287+ Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d attribute on the \xe2\x80\x9c%s\xe2\x80\x9d element is obsolete. %s"
282288 attr_name name suggestion
283289 in
284290 Message_collector.add_error collector
···294300 | Some elements ->
295301 if List.mem name_lower elements then
296302 let message =
297297- Printf.sprintf "The \"%s\" attribute on the \"%s\" element is obsolete. Use CSS instead."
303303+ Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d attribute on the \xe2\x80\x9c%s\xe2\x80\x9d element is obsolete. Use CSS instead."
298304 attr_name name
299305 in
300306 Message_collector.add_error collector
···309315 | None -> ()
310316 | Some suggestion ->
311317 let message =
312312- Printf.sprintf "The \"%s\" attribute is obsolete. %s" attr_name suggestion
318318+ Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d attribute is obsolete. %s" attr_name suggestion
313319 in
314320 Message_collector.add_error collector
315321 ~message
···2121 attrs
22222323let check_img_element attrs collector =
2424- (* Check for required src attribute *)
2525- if not (has_attr "src" attrs) then
2626- Message_collector.add_error collector ~message:"img element requires src attribute"
2424+ (* Check for required src OR srcset attribute *)
2525+ if not (has_attr "src" attrs) && not (has_attr "srcset" attrs) then
2626+ Message_collector.add_error collector
2727+ ~message:"Element \xe2\x80\x9cimg\xe2\x80\x9d is missing one or more of the following attributes: [src, srcset]."
2728 ~code:"missing-required-attribute" ~element:"img" ~attribute:"src" ();
28292930 (* Check for alt attribute - always required *)
···6970 ()
70717172let check_meta_element attrs collector =
7272- (* meta requires charset OR (name AND content) OR (http-equiv AND content) *)
7373+ (* meta requires one of:
7474+ - charset
7575+ - name AND content
7676+ - http-equiv AND content
7777+ - property AND content (RDFa)
7878+ - itemprop AND content (microdata) *)
7379 let has_charset = has_attr "charset" attrs in
7480 let has_name = has_attr "name" attrs in
7581 let has_content = has_attr "content" attrs in
7682 let has_http_equiv = has_attr "http-equiv" attrs in
8383+ let has_property = has_attr "property" attrs in
8484+ let has_itemprop = has_attr "itemprop" attrs in
77857886 let valid =
7987 has_charset
8088 || (has_name && has_content)
8189 || (has_http_equiv && has_content)
9090+ || (has_property && has_content)
9191+ || (has_itemprop && has_content)
8292 in
83938494 if not valid then
···101111 (* a[download] requires href *)
102112 if has_attr "download" attrs && not (has_attr "href" attrs) then
103113 Message_collector.add_error collector
104104- ~message:"a element with download attribute requires href attribute"
114114+ ~message:"Element \xe2\x80\x9ca\xe2\x80\x9d is missing required attribute \xe2\x80\x9chref\xe2\x80\x9d."
105115 ~code:"missing-required-attribute" ~element:"a" ~attribute:"href" ()
106116107117let check_map_element attrs collector =
···111121 ~message:"map element requires name attribute" ~code:"missing-required-attribute"
112122 ~element:"map" ~attribute:"name" ()
113123124124+let check_object_element attrs collector =
125125+ (* object requires data attribute (or type attribute alone is not sufficient) *)
126126+ let has_data = has_attr "data" attrs in
127127+ let has_type = has_attr "type" attrs in
128128+ if not has_data && has_type then
129129+ Message_collector.add_error collector
130130+ ~message:"Element \xe2\x80\x9cobject\xe2\x80\x9d is missing required attribute \xe2\x80\x9cdata\xe2\x80\x9d."
131131+ ~code:"missing-required-attribute" ~element:"object" ~attribute:"data" ()
132132+133133+let check_popover_element attrs collector =
134134+ (* popover attribute must have valid value *)
135135+ match get_attr "popover" attrs with
136136+ | Some value ->
137137+ let value_lower = String.lowercase_ascii value in
138138+ (* Valid values: empty string, auto, manual, hint *)
139139+ if value_lower <> "" && value_lower <> "auto" && value_lower <> "manual" && value_lower <> "hint" then
140140+ Message_collector.add_error collector
141141+ ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9cpopover\xe2\x80\x9d on element \xe2\x80\x9cdiv\xe2\x80\x9d."
142142+ value)
143143+ ~code:"bad-attribute-value" ~element:"div" ~attribute:"popover" ()
144144+ | None -> ()
145145+114146let start_element state ~name ~namespace:_ ~attrs collector =
115147 match name with
116148 | "img" -> check_img_element attrs collector
···121153 | "link" -> check_link_element attrs collector
122154 | "a" -> check_a_element attrs collector
123155 | "map" -> check_map_element attrs collector
156156+ | "object" -> check_object_element attrs collector
124157 | "figure" -> state._in_figure <- true
125125- | _ -> ()
158158+ | _ ->
159159+ (* Check popover attribute on any element *)
160160+ if has_attr "popover" attrs then check_popover_element attrs collector
126161127162let end_element state ~name ~namespace:_ _collector =
128163 match name with "figure" -> state._in_figure <- false | _ -> ()
+228-9
lib/html5_checker/specialized/aria_checker.ml
···88let valid_aria_roles =
99 let roles = [
1010 (* Document structure roles *)
1111+ (* Note: "directory" is deprecated in WAI-ARIA 1.2, use "list" instead *)
1112 "article"; "associationlist"; "associationlistitemkey";
1213 "associationlistitemvalue"; "blockquote"; "caption"; "cell"; "code";
1313- "definition"; "deletion"; "directory"; "document"; "emphasis"; "feed";
1414+ "definition"; "deletion"; "document"; "emphasis"; "feed";
1415 "figure"; "generic"; "group"; "heading"; "img"; "insertion"; "list";
1516 "listitem"; "mark"; "math"; "meter"; "none"; "note"; "paragraph";
1617 "presentation"; "row"; "rowgroup"; "strong"; "subscript"; "suggestion";
···5152let roles_which_cannot_be_named =
5253 let roles = [
5354 "caption"; "code"; "deletion"; "emphasis"; "generic"; "insertion";
5454- "paragraph"; "presentation"; "strong"; "subscript"; "superscript"
5555+ "mark"; "none"; "paragraph"; "presentation"; "strong"; "subscript";
5656+ "suggestion"; "superscript"
5557 ] in
5658 let tbl = Hashtbl.create (List.length roles) in
5759 List.iter (fun role -> Hashtbl.add tbl role ()) roles;
5860 tbl
6161+6262+(** Elements whose implicit role is 'generic' and cannot have aria-label unless
6363+ they have an explicit role that allows naming. *)
6464+let elements_with_generic_role = [
6565+ "a"; "abbr"; "address"; "b"; "bdi"; "bdo"; "br"; "caption"; "cite"; "code";
6666+ "colgroup"; "data"; "del"; "dfn"; "div"; "em"; "figcaption"; "hgroup"; "i";
6767+ "ins"; "kbd"; "legend"; "mark"; "p"; "pre"; "q"; "rp"; "rt"; "ruby"; "s";
6868+ "samp"; "small"; "span"; "strong"; "sub"; "sup"; "time"; "title"; "u"; "var";
6969+ "wbr"
7070+]
7171+7272+(** Check if element name is a custom element (contains hyphen). *)
7373+let is_custom_element name =
7474+ String.contains name '-'
7575+7676+(** Check if element can have accessible name based on role. *)
7777+let element_can_have_accessible_name element_name explicit_roles implicit_role =
7878+ (* If explicit role is set, check if that role can be named *)
7979+ match explicit_roles with
8080+ | first_role :: _ ->
8181+ not (Hashtbl.mem roles_which_cannot_be_named first_role)
8282+ | [] ->
8383+ (* No explicit role - check implicit role *)
8484+ match implicit_role with
8585+ | Some role -> not (Hashtbl.mem roles_which_cannot_be_named role)
8686+ | None ->
8787+ (* Custom elements also have generic role by default *)
8888+ if is_custom_element element_name then false
8989+ else
9090+ (* No implicit role - element has generic role unless it's interactive *)
9191+ not (List.mem element_name elements_with_generic_role)
59926093(** Map from descendant role to set of required ancestor roles. *)
6194let required_role_ancestor_by_descendant : (string, string list) Hashtbl.t =
···236269237270 tbl
238271272272+(** Roles that do NOT support aria-expanded. *)
273273+let roles_without_aria_expanded = [
274274+ "listbox"; "list"; "menu"; "menubar"; "radiogroup"; "tablist"; "tree"; "treegrid";
275275+ "alert"; "alertdialog"; "article"; "banner"; "cell"; "code"; "columnheader";
276276+ "complementary"; "contentinfo"; "definition"; "dialog"; "directory"; "document";
277277+ "emphasis"; "feed"; "figure"; "form"; "generic"; "grid"; "group"; "heading";
278278+ "img"; "log"; "main"; "marquee"; "math"; "meter"; "navigation"; "none"; "note";
279279+ "option"; "paragraph"; "presentation"; "progressbar"; "region"; "row"; "rowgroup";
280280+ "rowheader"; "scrollbar"; "search"; "separator"; "slider"; "spinbutton"; "status";
281281+ "strong"; "subscript"; "superscript"; "table"; "tabpanel"; "term"; "textbox";
282282+ "time"; "timer"; "toolbar"; "tooltip"
283283+]
284284+239285(** Split a role attribute value into individual roles.
240286241287 The role attribute can contain multiple space-separated role tokens. *)
···254300 match List.assoc_opt "type" attrs with
255301 | Some input_type ->
256302 let input_type = String.lowercase_ascii input_type in
257257- Hashtbl.find_opt input_types_with_implicit_role input_type
303303+ begin match Hashtbl.find_opt input_types_with_implicit_role input_type with
304304+ | Some role -> Some role
305305+ | None ->
306306+ (* type="text", "email", "tel", "search" etc. have textbox implicit role *)
307307+ if input_type = "text" || input_type = "email" || input_type = "tel" ||
308308+ input_type = "search" || input_type = "password" then
309309+ Some "textbox"
310310+ else
311311+ None
312312+ end
258313 | None -> Some "textbox" (* default input type is text *)
259314 end
260315 else
···314369 match namespace with
315370 | Some _ -> () (* Skip non-HTML elements *)
316371 | None ->
372372+ let name_lower = String.lowercase_ascii name in
317373 let role_attr = List.assoc_opt "role" attrs in
318374 let aria_label = List.assoc_opt "aria-label" attrs in
319375 let aria_labelledby = List.assoc_opt "aria-labelledby" attrs in
320320- let has_accessible_name =
321321- (match aria_label with Some v -> String.trim v <> "" | None -> false) ||
322322- (match aria_labelledby with Some v -> String.trim v <> "" | None -> false)
323323- in
376376+ let aria_braillelabel = List.assoc_opt "aria-braillelabel" attrs in
377377+ let has_aria_label = match aria_label with Some v -> String.trim v <> "" | None -> false in
378378+ let has_aria_labelledby = match aria_labelledby with Some v -> String.trim v <> "" | None -> false in
379379+ let has_aria_braillelabel = match aria_braillelabel with Some v -> String.trim v <> "" | None -> false in
380380+ let has_accessible_name = has_aria_label || has_aria_labelledby in
324381325382 (* Parse explicit roles from role attribute *)
326383 let explicit_roles = match role_attr with
···329386 in
330387331388 (* Get implicit role for this element *)
332332- let implicit_role = get_implicit_role name attrs in
389389+ let implicit_role = get_implicit_role name_lower attrs in
390390+391391+ (* Check br/wbr role restrictions - only none/presentation allowed *)
392392+ if (name_lower = "br" || name_lower = "wbr") && explicit_roles <> [] then begin
393393+ let first_role = List.hd explicit_roles in
394394+ if first_role <> "none" && first_role <> "presentation" then
395395+ Message_collector.add_error collector
396396+ ~message:(Printf.sprintf
397397+ "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9crole\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d."
398398+ first_role name)
399399+ ~code:"bad-role"
400400+ ~element:name
401401+ ~attribute:"role"
402402+ ()
403403+ end;
404404+405405+ (* Check br/wbr aria-* attribute restrictions - not allowed *)
406406+ if name_lower = "br" || name_lower = "wbr" then begin
407407+ List.iter (fun (attr_name, _) ->
408408+ let attr_lower = String.lowercase_ascii attr_name in
409409+ if String.length attr_lower > 5 && String.sub attr_lower 0 5 = "aria-" &&
410410+ attr_lower <> "aria-hidden" then
411411+ Message_collector.add_error collector
412412+ ~message:(Printf.sprintf
413413+ "Attribute \xe2\x80\x9c%s\xe2\x80\x9d not allowed on element \xe2\x80\x9c%s\xe2\x80\x9d at this point."
414414+ attr_name name)
415415+ ~code:"attr-not-allowed"
416416+ ~element:name
417417+ ~attribute:attr_name
418418+ ()
419419+ ) attrs
420420+ end;
421421+422422+ (* Check if element can have accessible names *)
423423+ let can_have_name = element_can_have_accessible_name name_lower explicit_roles implicit_role in
424424+425425+ (* Generate error if element cannot have accessible name but has one *)
426426+ if has_aria_label && not can_have_name then
427427+ Message_collector.add_error collector
428428+ ~message:(Printf.sprintf
429429+ "The \xe2\x80\x9caria-label\xe2\x80\x9d attribute must not be specified on any \xe2\x80\x9c%s\xe2\x80\x9d element unless the element has a \xe2\x80\x9crole\xe2\x80\x9d value other than \xe2\x80\x9ccaption\xe2\x80\x9d, \xe2\x80\x9ccode\xe2\x80\x9d, \xe2\x80\x9cdeletion\xe2\x80\x9d, \xe2\x80\x9cemphasis\xe2\x80\x9d, \xe2\x80\x9cgeneric\xe2\x80\x9d, \xe2\x80\x9cinsertion\xe2\x80\x9d, \xe2\x80\x9cparagraph\xe2\x80\x9d, \xe2\x80\x9cpresentation\xe2\x80\x9d, \xe2\x80\x9cstrong\xe2\x80\x9d, \xe2\x80\x9csubscript\xe2\x80\x9d, or \xe2\x80\x9csuperscript\xe2\x80\x9d."
430430+ name)
431431+ ~code:"aria-label-on-non-nameable"
432432+ ~element:name
433433+ ~attribute:"aria-label"
434434+ ();
435435+436436+ if has_aria_labelledby && not can_have_name then
437437+ Message_collector.add_error collector
438438+ ~message:(Printf.sprintf
439439+ "The \xe2\x80\x9caria-labelledby\xe2\x80\x9d attribute must not be specified on any \xe2\x80\x9c%s\xe2\x80\x9d element unless the element has a \xe2\x80\x9crole\xe2\x80\x9d value other than \xe2\x80\x9ccaption\xe2\x80\x9d, \xe2\x80\x9ccode\xe2\x80\x9d, \xe2\x80\x9cdeletion\xe2\x80\x9d, \xe2\x80\x9cemphasis\xe2\x80\x9d, \xe2\x80\x9cgeneric\xe2\x80\x9d, \xe2\x80\x9cinsertion\xe2\x80\x9d, \xe2\x80\x9cparagraph\xe2\x80\x9d, \xe2\x80\x9cpresentation\xe2\x80\x9d, \xe2\x80\x9cstrong\xe2\x80\x9d, \xe2\x80\x9csubscript\xe2\x80\x9d, or \xe2\x80\x9csuperscript\xe2\x80\x9d."
440440+ name)
441441+ ~code:"aria-labelledby-on-non-nameable"
442442+ ~element:name
443443+ ~attribute:"aria-labelledby"
444444+ ();
445445+446446+ if has_aria_braillelabel && not can_have_name then
447447+ Message_collector.add_error collector
448448+ ~message:(Printf.sprintf
449449+ "The \xe2\x80\x9caria-braillelabel\xe2\x80\x9d attribute must not be specified on any \xe2\x80\x9c%s\xe2\x80\x9d element unless the element has a \xe2\x80\x9crole\xe2\x80\x9d value other than \xe2\x80\x9ccaption\xe2\x80\x9d, \xe2\x80\x9ccode\xe2\x80\x9d, \xe2\x80\x9cdeletion\xe2\x80\x9d, \xe2\x80\x9cemphasis\xe2\x80\x9d, \xe2\x80\x9cgeneric\xe2\x80\x9d, \xe2\x80\x9cinsertion\xe2\x80\x9d, \xe2\x80\x9cparagraph\xe2\x80\x9d, \xe2\x80\x9cpresentation\xe2\x80\x9d, \xe2\x80\x9cstrong\xe2\x80\x9d, \xe2\x80\x9csubscript\xe2\x80\x9d, or \xe2\x80\x9csuperscript\xe2\x80\x9d."
450450+ name)
451451+ ~code:"aria-braillelabel-on-non-nameable"
452452+ ~element:name
453453+ ~attribute:"aria-braillelabel"
454454+ ();
455455+456456+ (* Check for img with empty alt having role attribute *)
457457+ if name_lower = "img" then begin
458458+ let alt_value = List.assoc_opt "alt" attrs in
459459+ match alt_value with
460460+ | Some alt when String.trim alt = "" ->
461461+ (* img with empty alt must not have role attribute *)
462462+ if role_attr <> None then
463463+ Message_collector.add_error collector
464464+ ~message:"An \xe2\x80\x9cimg\xe2\x80\x9d element which has an \xe2\x80\x9calt\xe2\x80\x9d attribute whose value is the empty string must not have a \xe2\x80\x9crole\xe2\x80\x9d attribute."
465465+ ~code:"img-empty-alt-with-role"
466466+ ~element:name
467467+ ~attribute:"role"
468468+ ()
469469+ | _ -> ()
470470+ end;
471471+472472+ (* Check for aria-hidden="true" on body element *)
473473+ if name_lower = "body" then begin
474474+ let aria_hidden = List.assoc_opt "aria-hidden" attrs in
475475+ match aria_hidden with
476476+ | Some "true" ->
477477+ Message_collector.add_error collector
478478+ ~message:"\xe2\x80\x9caria-hidden=true\xe2\x80\x9d must not be used on the \xe2\x80\x9cbody\xe2\x80\x9d element."
479479+ ~code:"aria-hidden-on-body"
480480+ ~element:name
481481+ ~attribute:"aria-hidden"
482482+ ()
483483+ | _ -> ()
484484+ end;
485485+486486+ (* Check for aria-checked on input[type=checkbox] *)
487487+ let aria_checked = List.assoc_opt "aria-checked" attrs in
488488+ if name_lower = "input" then begin
489489+ match List.assoc_opt "type" attrs with
490490+ | Some input_type when String.lowercase_ascii input_type = "checkbox" ->
491491+ if aria_checked <> None then
492492+ Message_collector.add_error collector
493493+ ~message:"The \xe2\x80\x9caria-checked\xe2\x80\x9d attribute must not be used on an \xe2\x80\x9cinput\xe2\x80\x9d element which has a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9ccheckbox\xe2\x80\x9d."
494494+ ~code:"aria-checked-on-checkbox"
495495+ ~element:name
496496+ ~attribute:"aria-checked"
497497+ ()
498498+ | _ -> ()
499499+ end;
500500+501501+ (* Check for aria-expanded on roles that don't support it *)
502502+ let aria_expanded = List.assoc_opt "aria-expanded" attrs in
503503+ if aria_expanded <> None then begin
504504+ let role_to_check = match explicit_roles with
505505+ | first :: _ -> Some first
506506+ | [] -> implicit_role
507507+ in
508508+ match role_to_check with
509509+ | Some role when List.mem role roles_without_aria_expanded ->
510510+ Message_collector.add_error collector
511511+ ~message:(Printf.sprintf "Attribute \xe2\x80\x9caria-expanded\xe2\x80\x9d not allowed on element \xe2\x80\x9c%s\xe2\x80\x9d at this point."
512512+ name)
513513+ ~code:"aria-expanded-not-allowed"
514514+ ~element:name
515515+ ~attribute:"aria-expanded"
516516+ ()
517517+ | _ -> ()
518518+ end;
519519+520520+ (* Check for unnecessary role - explicit role matches implicit role *)
521521+ begin match explicit_roles, implicit_role with
522522+ | first_role :: _, Some implicit when first_role = implicit ->
523523+ (* Special message for input[type=text] with role="textbox" *)
524524+ let msg =
525525+ if name_lower = "input" && first_role = "textbox" then begin
526526+ let has_list = List.exists (fun (k, _) -> String.lowercase_ascii k = "list") attrs in
527527+ let input_type = match List.assoc_opt "type" attrs with
528528+ | Some t -> String.lowercase_ascii t
529529+ | None -> "text"
530530+ in
531531+ if not has_list && input_type = "text" then
532532+ Printf.sprintf "The \xe2\x80\x9ctextbox\xe2\x80\x9d role is unnecessary for an \xe2\x80\x9cinput\xe2\x80\x9d element that has no \xe2\x80\x9clist\xe2\x80\x9d attribute and whose type is \xe2\x80\x9ctext\xe2\x80\x9d."
533533+ else
534534+ Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d role is unnecessary for element \xe2\x80\x9c%s\xe2\x80\x9d." first_role name
535535+ end else
536536+ Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d role is unnecessary for element \xe2\x80\x9c%s\xe2\x80\x9d." first_role name
537537+ in
538538+ Message_collector.add_warning collector
539539+ ~message:msg
540540+ ~code:"unnecessary-role"
541541+ ~element:name
542542+ ~attribute:"role"
543543+ ()
544544+ | _ -> ()
545545+ end;
333546334547 (* Validate explicit roles *)
335548 List.iter (fun role ->
336549 (* Check if role is valid *)
337550 if not (Hashtbl.mem valid_aria_roles role) then
338551 Message_collector.add_error collector
339339- ~message:(Printf.sprintf "Invalid ARIA role \"%s\"." role) ();
552552+ ~message:(Printf.sprintf
553553+ "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9crole\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d."
554554+ role name)
555555+ ~code:"bad-role"
556556+ ~element:name
557557+ ~attribute:"role"
558558+ ();
340559341560 (* Check if role cannot be named *)
342561 if Hashtbl.mem roles_which_cannot_be_named role && has_accessible_name then
···11+(** Attribute restrictions checker - validates that certain attributes
22+ are not used on elements where they're not allowed. *)
33+44+(** List of (element, [disallowed attributes]) pairs for HTML elements. *)
55+let disallowed_attrs_html = [
66+ (* Elements that cannot have href attribute (RDFa misuses) *)
77+ ("img", ["href"]);
88+ ("p", ["href"]);
99+ ("div", ["href"]);
1010+ (* a cannot have src or media *)
1111+ ("a", ["src"; "media"]);
1212+ (* area cannot have media *)
1313+ ("area", ["media"]);
1414+ (* Various elements cannot have srcset *)
1515+ ("audio", ["srcset"]);
1616+ ("video", ["srcset"]);
1717+ ("object", ["srcset"]);
1818+ ("link", ["srcset"]); (* except when rel=preload and as=image *)
1919+ ("track", ["srcset"]);
2020+ ("input", ["srcset"]); (* except type=image, but we check more strictly *)
2121+ ("image", ["srcset"]); (* SVG image element *)
2222+]
2323+2424+(** SVG elements that cannot have xml:id attribute. *)
2525+let svg_no_xml_id = [
2626+ "rect"; "circle"; "ellipse"; "line"; "polyline"; "polygon"; "path";
2727+ "text"; "tspan"; "textPath"; "image"; "use"; "symbol"; "defs"; "g";
2828+ "svg"; "marker"; "pattern"; "clipPath"; "mask"; "linearGradient";
2929+ "radialGradient"; "stop"; "filter"; "feBlend"; "feColorMatrix";
3030+ "feComponentTransfer"; "feComposite"; "feConvolveMatrix"; "feDiffuseLighting";
3131+ "feDisplacementMap"; "feDistantLight"; "feDropShadow"; "feFlood";
3232+ "feFuncA"; "feFuncB"; "feFuncG"; "feFuncR"; "feGaussianBlur"; "feImage";
3333+ "feMerge"; "feMergeNode"; "feMorphology"; "feOffset"; "fePointLight";
3434+ "feSpecularLighting"; "feSpotLight"; "feTile"; "feTurbulence";
3535+]
3636+3737+type state = {
3838+ mutable is_xhtml : bool; (* Track if we're in XHTML mode based on xmlns *)
3939+}
4040+4141+let create () = { is_xhtml = false }
4242+let reset state = state.is_xhtml <- false
4343+4444+(** Check if an attribute list contains a specific attribute. *)
4545+let has_attr name attrs =
4646+ List.exists (fun (attr_name, _) -> String.lowercase_ascii attr_name = name) attrs
4747+4848+(** Get an attribute value from the list. *)
4949+let get_attr name attrs =
5050+ List.find_map (fun (attr_name, value) ->
5151+ if String.lowercase_ascii attr_name = name then Some value else None
5252+ ) attrs
5353+5454+(** Input types that allow the list attribute. *)
5555+let input_types_allowing_list = [
5656+ "color"; "date"; "datetime-local"; "email"; "month"; "number";
5757+ "range"; "search"; "tel"; "text"; "time"; "url"; "week"
5858+]
5959+6060+(** Report disallowed attribute error *)
6161+let report_disallowed_attr element attr collector =
6262+ Message_collector.add_error collector
6363+ ~message:(Printf.sprintf "Attribute \xe2\x80\x9c%s\xe2\x80\x9d not allowed on element \xe2\x80\x9c%s\xe2\x80\x9d at this point."
6464+ attr element)
6565+ ~code:"disallowed-attribute"
6666+ ~element ~attribute:attr ()
6767+6868+let start_element state ~name ~namespace ~attrs collector =
6969+ let name_lower = String.lowercase_ascii name in
7070+7171+ (* Detect XHTML mode from xmlns attribute on html element *)
7272+ if name_lower = "html" then begin
7373+ let xmlns_value = get_attr "xmlns" attrs in
7474+ match xmlns_value with
7575+ | Some "http://www.w3.org/1999/xhtml" -> state.is_xhtml <- true
7676+ | _ -> ()
7777+ end;
7878+7979+ (* Check HTML element attribute restrictions *)
8080+ if namespace = None then begin
8181+ match List.assoc_opt name_lower disallowed_attrs_html with
8282+ | Some disallowed ->
8383+ List.iter (fun attr ->
8484+ if has_attr attr attrs then
8585+ report_disallowed_attr name_lower attr collector
8686+ ) disallowed
8787+ | None -> ()
8888+ end;
8989+9090+ (* Check for xml:base attribute - not allowed in HTML *)
9191+ if namespace = None && name_lower = "html" then begin
9292+ if has_attr "xml:base" attrs then
9393+ report_disallowed_attr name_lower "xml:base" collector
9494+ end;
9595+9696+ (* Check for xmlns:* prefixed attributes - not allowed in HTML *)
9797+ (* Standard xmlns declarations are allowed but custom prefixes are not *)
9898+ if namespace = None then begin
9999+ List.iter (fun (attr_name, _) ->
100100+ let attr_lower = String.lowercase_ascii attr_name in
101101+ if String.length attr_lower > 6 && String.sub attr_lower 0 6 = "xmlns:" then begin
102102+ let prefix = String.sub attr_lower 6 (String.length attr_lower - 6) in
103103+ (* Only xmlns:xlink (with correct value) and xmlns:xml are allowed *)
104104+ if prefix <> "xlink" && prefix <> "xml" then
105105+ Message_collector.add_error collector
106106+ ~message:(Printf.sprintf "Attribute \xe2\x80\x9c%s\xe2\x80\x9d not allowed here."
107107+ attr_name)
108108+ ~code:"disallowed-attribute"
109109+ ~element:name ~attribute:attr_name ()
110110+ end
111111+ ) attrs
112112+ end;
113113+114114+ (* Check SVG element restrictions - works in both HTML-embedded and XHTML SVG *)
115115+ (* xml:id is never valid on SVG elements in HTML5 *)
116116+ if List.mem name_lower svg_no_xml_id then begin
117117+ if has_attr "xml:id" attrs then
118118+ report_disallowed_attr name_lower "xml:id" collector
119119+ end;
120120+121121+ (* SVG feConvolveMatrix requires order attribute *)
122122+ if name_lower = "feconvolvematrix" then begin
123123+ if not (has_attr "order" attrs) then
124124+ Message_collector.add_error collector
125125+ ~message:"Element \xe2\x80\x9cfeConvolveMatrix\xe2\x80\x9d is missing required attribute \xe2\x80\x9corder\xe2\x80\x9d."
126126+ ~code:"missing-required-attribute"
127127+ ~element:name ~attribute:"order" ()
128128+ end;
129129+130130+ (* Validate style type attribute - must be "text/css" or omitted *)
131131+ if namespace = None && name_lower = "style" then begin
132132+ List.iter (fun (attr_name, attr_value) ->
133133+ let attr_lower = String.lowercase_ascii attr_name in
134134+ if attr_lower = "type" then begin
135135+ let value_lower = String.lowercase_ascii (String.trim attr_value) in
136136+ if value_lower <> "text/css" then
137137+ Message_collector.add_error collector
138138+ ~message:"The only allowed value for the \xe2\x80\x9ctype\xe2\x80\x9d attribute for the \xe2\x80\x9cstyle\xe2\x80\x9d element is \xe2\x80\x9ctext/css\xe2\x80\x9d (with no parameters). (But the attribute is not needed and should be omitted altogether.)"
139139+ ~code:"bad-attribute-value"
140140+ ~element:name ~attribute:attr_name ()
141141+ end
142142+ ) attrs
143143+ end;
144144+145145+ (* Validate object element requires data or type attribute *)
146146+ if namespace = None && name_lower = "object" then begin
147147+ let has_data = has_attr "data" attrs in
148148+ let has_type = has_attr "type" attrs in
149149+ if not has_data && not has_type then
150150+ Message_collector.add_error collector
151151+ ~message:"Element \xe2\x80\x9cobject\xe2\x80\x9d is missing required attribute \xe2\x80\x9cdata\xe2\x80\x9d."
152152+ ~code:"missing-required-attribute"
153153+ ~element:name ~attribute:"data" ()
154154+ end;
155155+156156+ (* Validate link imagesizes/imagesrcset attributes *)
157157+ if namespace = None && name_lower = "link" then begin
158158+ let has_imagesizes = has_attr "imagesizes" attrs in
159159+ let has_imagesrcset = has_attr "imagesrcset" attrs in
160160+ let rel_value = get_attr "rel" attrs in
161161+ let as_value = get_attr "as" attrs in
162162+163163+ (* imagesizes requires imagesrcset *)
164164+ if has_imagesizes && not has_imagesrcset then
165165+ Message_collector.add_error collector
166166+ ~message:"The \xe2\x80\x9cimagesizes\xe2\x80\x9d attribute must only be specified if the \xe2\x80\x9cimagesrcset\xe2\x80\x9d attribute is also specified."
167167+ ~code:"missing-required-attribute"
168168+ ~element:name ~attribute:"imagesrcset" ();
169169+170170+ (* imagesrcset requires as="image" *)
171171+ if has_imagesrcset then begin
172172+ let as_is_image = match as_value with
173173+ | Some v -> String.lowercase_ascii (String.trim v) = "image"
174174+ | None -> false
175175+ in
176176+ if not as_is_image then
177177+ Message_collector.add_error collector
178178+ ~message:"A \xe2\x80\x9clink\xe2\x80\x9d element with an \xe2\x80\x9cimagesrcset\xe2\x80\x9d attribute must have an \xe2\x80\x9cas\xe2\x80\x9d attribute with value \xe2\x80\x9cimage\xe2\x80\x9d."
179179+ ~code:"missing-required-attribute"
180180+ ~element:name ~attribute:"as" ()
181181+ end;
182182+183183+ (* as attribute requires rel="preload" or rel="modulepreload" *)
184184+ (match as_value with
185185+ | Some _ ->
186186+ let rel_is_preload = match rel_value with
187187+ | Some v ->
188188+ let rel_lower = String.lowercase_ascii (String.trim v) in
189189+ String.length rel_lower > 0 &&
190190+ (List.mem "preload" (String.split_on_char ' ' rel_lower) ||
191191+ List.mem "modulepreload" (String.split_on_char ' ' rel_lower))
192192+ | None -> false
193193+ in
194194+ if not rel_is_preload then
195195+ Message_collector.add_error collector
196196+ ~message:"A \xe2\x80\x9clink\xe2\x80\x9d element with an \xe2\x80\x9cas\xe2\x80\x9d attribute must have a \xe2\x80\x9crel\xe2\x80\x9d attribute that contains the value \xe2\x80\x9cpreload\xe2\x80\x9d or the value \xe2\x80\x9cmodulepreload\xe2\x80\x9d."
197197+ ~code:"missing-required-attribute"
198198+ ~element:name ~attribute:"rel" ()
199199+ | None -> ())
200200+ end;
201201+202202+ (* Validate img usemap attribute - must be hash-name reference with content *)
203203+ if namespace = None && name_lower = "img" then begin
204204+ List.iter (fun (attr_name, attr_value) ->
205205+ let attr_lower = String.lowercase_ascii attr_name in
206206+ if attr_lower = "usemap" then begin
207207+ if attr_value = "#" then
208208+ Message_collector.add_error collector
209209+ ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad hash-name reference: A hash-name reference must have at least one character after \xe2\x80\x9c#\xe2\x80\x9d."
210210+ attr_value attr_name name)
211211+ ~code:"bad-attribute-value"
212212+ ~element:name ~attribute:attr_name ()
213213+ end
214214+ ) attrs
215215+ end;
216216+217217+ (* Validate embed type attribute - must be valid MIME type *)
218218+ if namespace = None && name_lower = "embed" then begin
219219+ List.iter (fun (attr_name, attr_value) ->
220220+ let attr_lower = String.lowercase_ascii attr_name in
221221+ if attr_lower = "type" then begin
222222+ match Dt_mime.validate_mime_type attr_value with
223223+ | Ok () -> ()
224224+ | Error msg ->
225225+ Message_collector.add_error collector
226226+ ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: %s"
227227+ attr_value attr_name name msg)
228228+ ~code:"bad-attribute-value"
229229+ ~element:name ~attribute:attr_name ()
230230+ end
231231+ ) attrs
232232+ end;
233233+234234+ (* Validate width/height on embed and img - must be non-negative integers *)
235235+ if namespace = None && (name_lower = "embed" || name_lower = "img" ||
236236+ name_lower = "video" || name_lower = "canvas" ||
237237+ name_lower = "iframe" || name_lower = "source") then begin
238238+ List.iter (fun (attr_name, attr_value) ->
239239+ let attr_lower = String.lowercase_ascii attr_name in
240240+ if attr_lower = "width" || attr_lower = "height" then begin
241241+ (* Check for non-negative integer only *)
242242+ let is_valid =
243243+ String.length attr_value > 0 &&
244244+ String.for_all (fun c -> c >= '0' && c <= '9') attr_value
245245+ in
246246+ if not is_valid then begin
247247+ (* Determine specific error message *)
248248+ let error_msg =
249249+ if String.length attr_value = 0 then
250250+ Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer."
251251+ attr_name name
252252+ else if String.contains attr_value '%' then
253253+ Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead."
254254+ attr_value attr_name name
255255+ else if String.length attr_value > 0 && attr_value.[0] = '-' then
256256+ Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The value must be non-negative."
257257+ attr_value attr_name name
258258+ else
259259+ (* Find first non-digit character *)
260260+ let bad_char =
261261+ try
262262+ let i = ref 0 in
263263+ while !i < String.length attr_value && attr_value.[!i] >= '0' && attr_value.[!i] <= '9' do
264264+ incr i
265265+ done;
266266+ if !i < String.length attr_value then Some attr_value.[!i] else None
267267+ with _ -> None
268268+ in
269269+ match bad_char with
270270+ | Some c ->
271271+ Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead."
272272+ attr_value attr_name name c
273273+ | None ->
274274+ Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit."
275275+ attr_value attr_name name
276276+ in
277277+ Message_collector.add_error collector
278278+ ~message:error_msg
279279+ ~code:"bad-attribute-value"
280280+ ~element:name ~attribute:attr_name ()
281281+ end
282282+ end
283283+ ) attrs
284284+ end;
285285+286286+ (* Validate area[shape=default] cannot have coords *)
287287+ if namespace = None && name_lower = "area" then begin
288288+ let shape_value = get_attr "shape" attrs in
289289+ match shape_value with
290290+ | Some s when String.lowercase_ascii (String.trim s) = "default" ->
291291+ if has_attr "coords" attrs then
292292+ Message_collector.add_error collector
293293+ ~message:"Attribute \xe2\x80\x9ccoords\xe2\x80\x9d not allowed on element \xe2\x80\x9carea\xe2\x80\x9d at this point."
294294+ ~code:"disallowed-attribute"
295295+ ~element:name ~attribute:"coords" ()
296296+ | _ -> ()
297297+ end;
298298+299299+ (* Validate bdo element requires dir attribute, and dir cannot be "auto" *)
300300+ if namespace = None && name_lower = "bdo" then begin
301301+ let dir_value = get_attr "dir" attrs in
302302+ match dir_value with
303303+ | None ->
304304+ Message_collector.add_error collector
305305+ ~message:"Element \xe2\x80\x9cbdo\xe2\x80\x9d must have attribute \xe2\x80\x9cdir\xe2\x80\x9d."
306306+ ~code:"missing-required-attribute"
307307+ ~element:name ~attribute:"dir" ()
308308+ | Some v when String.lowercase_ascii (String.trim v) = "auto" ->
309309+ Message_collector.add_error collector
310310+ ~message:"The value of \xe2\x80\x9cdir\xe2\x80\x9d attribute for the \xe2\x80\x9cbdo\xe2\x80\x9d element must not be \xe2\x80\x9cauto\xe2\x80\x9d."
311311+ ~code:"bad-attribute-value"
312312+ ~element:name ~attribute:"dir" ()
313313+ | _ -> ()
314314+ end;
315315+316316+ (* Validate input list attribute - only allowed for certain types *)
317317+ if namespace = None && name_lower = "input" then begin
318318+ if has_attr "list" attrs then begin
319319+ let input_type = match get_attr "type" attrs with
320320+ | Some t -> String.lowercase_ascii (String.trim t)
321321+ | None -> "text" (* default type is text *)
322322+ in
323323+ if not (List.mem input_type input_types_allowing_list) then
324324+ Message_collector.add_error collector
325325+ ~message:"Attribute \xe2\x80\x9clist\xe2\x80\x9d is only allowed when the input type is \xe2\x80\x9ccolor\xe2\x80\x9d, \xe2\x80\x9cdate\xe2\x80\x9d, \xe2\x80\x9cdatetime-local\xe2\x80\x9d, \xe2\x80\x9cemail\xe2\x80\x9d, \xe2\x80\x9cmonth\xe2\x80\x9d, \xe2\x80\x9cnumber\xe2\x80\x9d, \xe2\x80\x9crange\xe2\x80\x9d, \xe2\x80\x9csearch\xe2\x80\x9d, \xe2\x80\x9ctel\xe2\x80\x9d, \xe2\x80\x9ctext\xe2\x80\x9d, \xe2\x80\x9ctime\xe2\x80\x9d, \xe2\x80\x9curl\xe2\x80\x9d, or \xe2\x80\x9cweek\xe2\x80\x9d."
326326+ ~code:"disallowed-attribute"
327327+ ~element:name ~attribute:"list" ()
328328+ end
329329+ end;
330330+331331+ (* Note: data-* uppercase check requires XML parsing which preserves case.
332332+ The HTML5 parser normalizes attribute names to lowercase, so this check
333333+ is only effective when the document is parsed as XML.
334334+ Commenting out until we have XML parsing support. *)
335335+ ignore state.is_xhtml
336336+337337+let end_element _state ~name:_ ~namespace:_ _collector = ()
338338+let characters _state _text _collector = ()
339339+let end_document _state _collector = ()
340340+341341+let checker =
342342+ (module struct
343343+ type nonrec state = state
344344+ let create = create
345345+ let reset = reset
346346+ let start_element = start_element
347347+ let end_element = end_element
348348+ let characters = characters
349349+ let end_document = end_document
350350+ end : Checker.S)
+55
lib/html5_checker/specialized/base_checker.ml
···11+(** Base element ordering checker. *)
22+33+type state = {
44+ mutable seen_link_or_script : bool;
55+}
66+77+let create () = {
88+ seen_link_or_script = false;
99+}
1010+1111+let reset state =
1212+ state.seen_link_or_script <- false
1313+1414+(** Check if an attribute list contains a specific attribute. *)
1515+let has_attr name attrs =
1616+ List.exists (fun (attr_name, _) -> String.lowercase_ascii attr_name = name) attrs
1717+1818+let start_element state ~name ~namespace ~attrs collector =
1919+ if namespace <> None then ()
2020+ else begin
2121+ let name_lower = String.lowercase_ascii name in
2222+ match name_lower with
2323+ | "link" | "script" ->
2424+ state.seen_link_or_script <- true
2525+ | "base" ->
2626+ if state.seen_link_or_script then
2727+ Message_collector.add_error collector
2828+ ~message:"The \xe2\x80\x9cbase\xe2\x80\x9d element must come before any \xe2\x80\x9clink\xe2\x80\x9d or \xe2\x80\x9cscript\xe2\x80\x9d elements in the document."
2929+ ~code:"base-after-link-script"
3030+ ~element:name ();
3131+ (* base element must have href or target attribute *)
3232+ let has_href = has_attr "href" attrs in
3333+ let has_target = has_attr "target" attrs in
3434+ if not has_href && not has_target then
3535+ Message_collector.add_error collector
3636+ ~message:"Element \xe2\x80\x9cbase\xe2\x80\x9d is missing one or more of the following attributes: [href, target]."
3737+ ~code:"missing-required-attribute"
3838+ ~element:name ()
3939+ | _ -> ()
4040+ end
4141+4242+let end_element _state ~name:_ ~namespace:_ _collector = ()
4343+let characters _state _text _collector = ()
4444+let end_document _state _collector = ()
4545+4646+let checker =
4747+ (module struct
4848+ type nonrec state = state
4949+ let create = create
5050+ let reset = reset
5151+ let start_element = start_element
5252+ let end_element = end_element
5353+ let characters = characters
5454+ let end_document = end_document
5555+ end : Checker.S)
+419
lib/html5_checker/specialized/datetime_checker.ml
···11+(** Datetime attribute validation checker *)
22+33+(** Elements that have datetime attribute *)
44+let datetime_elements = ["del"; "ins"; "time"]
55+66+(** Helper: check if char is digit *)
77+let is_digit c = c >= '0' && c <= '9'
88+99+(** Parse int safely *)
1010+let parse_int s =
1111+ try Some (int_of_string s) with _ -> None
1212+1313+(** Days in each month (non-leap year) *)
1414+let days_in_month = [| 31; 28; 31; 30; 31; 30; 31; 31; 30; 31; 30; 31 |]
1515+1616+(** Check if a year is a leap year *)
1717+let is_leap_year year =
1818+ (year mod 400 = 0) || (year mod 4 = 0 && year mod 100 <> 0)
1919+2020+(** Get max day for a given month/year *)
2121+let max_day_for_month year month =
2222+ if month = 2 && is_leap_year year then 29
2323+ else if month >= 1 && month <= 12 then days_in_month.(month - 1)
2424+ else 31
2525+2626+(** Validate date string YYYY-MM-DD. Returns (valid, error_reason option) *)
2727+let validate_date s =
2828+ let pattern = Str.regexp "^\\([0-9]+\\)-\\([0-9][0-9]\\)-\\([0-9][0-9]\\)$" in
2929+ if not (Str.string_match pattern s 0) then
3030+ (false, Some "Date must be in YYYY-MM-DD format")
3131+ else
3232+ let year_s = Str.matched_group 1 s in
3333+ let month_s = Str.matched_group 2 s in
3434+ let day_s = Str.matched_group 3 s in
3535+ if String.length year_s < 4 then
3636+ (false, Some "Year must be at least 4 digits")
3737+ else
3838+ match (parse_int year_s, parse_int month_s, parse_int day_s) with
3939+ | None, _, _ | _, None, _ | _, _, None ->
4040+ (false, Some "Invalid year, month or day")
4141+ | Some year, Some month, Some day ->
4242+ if year < 1 then (false, Some "Year cannot be less than 1")
4343+ else if month < 1 || month > 12 then (false, Some "Month out of range")
4444+ else if day < 1 then (false, Some "Day cannot be less than 1")
4545+ else
4646+ let max_day = max_day_for_month year month in
4747+ if day > max_day then (false, Some "Day out of range")
4848+ else (true, None)
4949+5050+(** Check if a date-like value has a 5+ digit year (might be mistyped) *)
5151+let has_suspicious_year s =
5252+ let pattern = Str.regexp "^\\([0-9]+\\)-" in
5353+ if Str.string_match pattern s 0 then
5454+ let year_s = Str.matched_group 1 s in
5555+ String.length year_s > 4
5656+ else
5757+ false
5858+5959+(** Validate time string HH:MM[:SS[.sss]] *)
6060+let validate_time s =
6161+ let pattern = Str.regexp "^\\([0-9][0-9]\\):\\([0-9][0-9]\\)\\(:\\([0-9][0-9]\\)\\(\\.\\([0-9]+\\)\\)?\\)?$" in
6262+ if not (Str.string_match pattern s 0) then
6363+ (false, Some "Time must be in HH:MM format")
6464+ else
6565+ let hour_s = Str.matched_group 1 s in
6666+ let minute_s = Str.matched_group 2 s in
6767+ match (parse_int hour_s, parse_int minute_s) with
6868+ | None, _ | _, None -> (false, Some "Invalid hour or minute")
6969+ | Some hour, Some minute ->
7070+ if hour > 23 then (false, Some "Hour out of range")
7171+ else if minute > 59 then (false, Some "Minute out of range")
7272+ else
7373+ let second_s = try Some (Str.matched_group 4 s) with Not_found -> None in
7474+ match second_s with
7575+ | None -> (true, None)
7676+ | Some sec_s ->
7777+ match parse_int sec_s with
7878+ | None -> (false, Some "Invalid seconds")
7979+ | Some sec ->
8080+ if sec > 59 then (false, Some "Second out of range")
8181+ else
8282+ (* Check milliseconds if present *)
8383+ let millis_s = try Some (Str.matched_group 6 s) with Not_found -> None in
8484+ match millis_s with
8585+ | None -> (true, None)
8686+ | Some ms ->
8787+ if String.length ms < 1 || String.length ms > 3 then
8888+ (false, Some "A fraction of a second must be one, two, or three digits")
8989+ else
9090+ (true, None)
9191+9292+(** Validate year-only format YYYY (at least 4 digits, > 0) *)
9393+let validate_year_only s =
9494+ let pattern = Str.regexp "^\\([0-9]+\\)$" in
9595+ if not (Str.string_match pattern s 0) then
9696+ (false, Some "Year must be digits only")
9797+ else
9898+ let year_s = Str.matched_group 1 s in
9999+ if String.length year_s < 4 then
100100+ (false, Some "Year must be at least 4 digits")
101101+ else
102102+ match parse_int year_s with
103103+ | None -> (false, Some "Invalid year")
104104+ | Some year ->
105105+ if year < 1 then (false, Some "Year cannot be less than 1")
106106+ else (true, None)
107107+108108+(** Validate month format YYYY-MM *)
109109+let validate_year_month s =
110110+ let pattern = Str.regexp "^\\([0-9]+\\)-\\([0-9][0-9]\\)$" in
111111+ if not (Str.string_match pattern s 0) then
112112+ (false, Some "Month must be in YYYY-MM format")
113113+ else
114114+ let year_s = Str.matched_group 1 s in
115115+ let month_s = Str.matched_group 2 s in
116116+ if String.length year_s < 4 then
117117+ (false, Some "Year must be at least 4 digits")
118118+ else
119119+ match (parse_int year_s, parse_int month_s) with
120120+ | None, _ | _, None -> (false, Some "Invalid year or month")
121121+ | Some year, Some month ->
122122+ if year < 1 then (false, Some "Year cannot be less than 1")
123123+ else if month < 1 || month > 12 then (false, Some "Month out of range")
124124+ else (true, None)
125125+126126+(** Validate week format YYYY-Www *)
127127+let validate_week s =
128128+ let pattern = Str.regexp "^\\([0-9]+\\)-W\\([0-9][0-9]\\)$" in
129129+ if not (Str.string_match pattern s 0) then
130130+ (false, Some "Week must be in YYYY-Www format")
131131+ else
132132+ let year_s = Str.matched_group 1 s in
133133+ let week_s = Str.matched_group 2 s in
134134+ if String.length year_s < 4 then
135135+ (false, Some "Year must be at least 4 digits")
136136+ else
137137+ match (parse_int year_s, parse_int week_s) with
138138+ | None, _ | _, None -> (false, Some "Invalid year or week")
139139+ | Some year, Some week ->
140140+ if year < 1 then (false, Some "Year cannot be less than 1")
141141+ else if week < 1 || week > 53 then (false, Some "Week out of range")
142142+ else (true, None)
143143+144144+(** Validate yearless date format --MM-DD *)
145145+let validate_yearless_date s =
146146+ let pattern = Str.regexp "^--\\([0-9][0-9]\\)-\\([0-9][0-9]\\)$" in
147147+ if not (Str.string_match pattern s 0) then
148148+ (false, Some "Yearless date must be in --MM-DD format")
149149+ else
150150+ let month_s = Str.matched_group 1 s in
151151+ let day_s = Str.matched_group 2 s in
152152+ match (parse_int month_s, parse_int day_s) with
153153+ | None, _ | _, None -> (false, Some "Invalid month or day")
154154+ | Some month, Some day ->
155155+ if month < 1 || month > 12 then (false, Some "Month out of range")
156156+ else if day < 1 then (false, Some "Day cannot be less than 1")
157157+ else
158158+ (* Use non-leap year for yearless date validation *)
159159+ let max_day = if month = 2 then 29 else days_in_month.(month - 1) in
160160+ if day > max_day then (false, Some "Day out of range")
161161+ else (true, None)
162162+163163+(** Validate duration format - HTML5 only accepts:
164164+ 1. Duration time component: PT#H#M#S (or PT#H, PT#M, PT#S, etc.)
165165+ 2. Duration weeks: P#W
166166+ 3. Duration days: P#D or P#DT#H#M#S *)
167167+let validate_duration s =
168168+ if String.length s < 2 then
169169+ (false, Some "Duration too short")
170170+ else if s.[0] <> 'P' then
171171+ (false, Some "Duration must start with P")
172172+ else
173173+ let rest = String.sub s 1 (String.length s - 1) in
174174+ (* Valid HTML5 duration patterns:
175175+ - PT#H#M#S (or any combination of H, M, S after T)
176176+ - P#W (weeks only)
177177+ - P#D or P#DT#H#M#S (days with optional time) *)
178178+ let pattern_time_only = Str.regexp "^T\\([0-9]+H\\)?\\([0-9]+M\\)?\\([0-9]+\\(\\.[0-9]+\\)?S\\)?$" in
179179+ let pattern_weeks = Str.regexp "^[0-9]+W$" in
180180+ let pattern_days = Str.regexp "^[0-9]+D\\(T\\([0-9]+H\\)?\\([0-9]+M\\)?\\([0-9]+\\(\\.[0-9]+\\)?S\\)?\\)?$" in
181181+ if Str.string_match pattern_time_only rest 0 then
182182+ (* Check that at least one component exists after T *)
183183+ if String.length rest > 1 then (true, None)
184184+ else (false, Some "Invalid duration format")
185185+ else if Str.string_match pattern_weeks rest 0 then
186186+ (true, None)
187187+ else if Str.string_match pattern_days rest 0 then
188188+ (true, None)
189189+ else
190190+ (false, Some "Invalid duration format")
191191+192192+(** Validate timezone offset +HH:MM or -HH:MM or +HHMM or -HHMM *)
193193+let validate_timezone_offset s =
194194+ (* Try +HH:MM format *)
195195+ let pattern_colon = Str.regexp "^[+-]\\([0-9][0-9]\\):\\([0-9][0-9]\\)$" in
196196+ (* Try +HHMM format (no colon) *)
197197+ let pattern_no_colon = Str.regexp "^[+-]\\([0-9][0-9]\\)\\([0-9][0-9]\\)$" in
198198+ let matched =
199199+ if Str.string_match pattern_colon s 0 then true
200200+ else Str.string_match pattern_no_colon s 0
201201+ in
202202+ if not matched then
203203+ (false, Some "Invalid timezone offset")
204204+ else
205205+ let hour_s = Str.matched_group 1 s in
206206+ let minute_s = Str.matched_group 2 s in
207207+ match (parse_int hour_s, parse_int minute_s) with
208208+ | None, _ | _, None -> (false, Some "Invalid timezone")
209209+ | Some hour, Some minute ->
210210+ if hour > 23 || minute > 59 then (false, Some "Timezone offset out of range")
211211+ else (true, None)
212212+213213+(** Validate datetime with timezone: YYYY-MM-DDTHH:MM:SS[.sss]Z or YYYY-MM-DDTHH:MM:SS[.sss]+HH:MM *)
214214+let validate_datetime_with_timezone s =
215215+ (* Try to split on T or space *)
216216+ let sep_pos =
217217+ try Some (String.index s 'T')
218218+ with Not_found ->
219219+ try Some (String.index s ' ')
220220+ with Not_found -> None
221221+ in
222222+ match sep_pos with
223223+ | None -> (false, Some "The literal did not satisfy the datetime with timezone format")
224224+ | Some pos ->
225225+ let date_part = String.sub s 0 pos in
226226+ let time_and_tz = String.sub s (pos + 1) (String.length s - pos - 1) in
227227+ (* Validate date *)
228228+ match validate_date date_part with
229229+ | (false, reason) -> (false, reason)
230230+ | (true, _) ->
231231+ (* Check if ends with Z *)
232232+ if String.length time_and_tz > 0 && time_and_tz.[String.length time_and_tz - 1] = 'Z' then begin
233233+ let time_part = String.sub time_and_tz 0 (String.length time_and_tz - 1) in
234234+ match validate_time time_part with
235235+ | (false, _) -> (false, Some "The literal did not satisfy the datetime with timezone format")
236236+ | (true, _) -> (true, None)
237237+ end
238238+ else begin
239239+ (* Check for +/- timezone offset *)
240240+ let plus_pos = try Some (String.rindex time_and_tz '+') with Not_found -> None in
241241+ let minus_pos = try Some (String.rindex time_and_tz '-') with Not_found -> None in
242242+ let tz_pos = match plus_pos, minus_pos with
243243+ | Some p, Some m -> Some (max p m)
244244+ | Some p, None -> Some p
245245+ | None, Some m -> Some m
246246+ | None, None -> None
247247+ in
248248+ match tz_pos with
249249+ | None -> (false, Some "The literal did not satisfy the datetime with timezone format")
250250+ | Some tp ->
251251+ let time_part = String.sub time_and_tz 0 tp in
252252+ let tz_part = String.sub time_and_tz tp (String.length time_and_tz - tp) in
253253+ match validate_time time_part with
254254+ | (false, _) -> (false, Some "The literal did not satisfy the datetime with timezone format")
255255+ | (true, _) ->
256256+ match validate_timezone_offset tz_part with
257257+ | (false, _) -> (false, Some "The literal did not satisfy the datetime with timezone format")
258258+ | (true, _) -> (true, None)
259259+ end
260260+261261+(** Validate datetime-local: YYYY-MM-DDTHH:MM[:SS[.sss]] or YYYY-MM-DD HH:MM *)
262262+let validate_datetime_local s =
263263+ let sep_pos =
264264+ try Some (String.index s 'T')
265265+ with Not_found ->
266266+ try Some (String.index s ' ')
267267+ with Not_found -> None
268268+ in
269269+ match sep_pos with
270270+ | None -> (false, Some "Invalid datetime-local format")
271271+ | Some pos ->
272272+ let date_part = String.sub s 0 pos in
273273+ let time_part = String.sub s (pos + 1) (String.length s - pos - 1) in
274274+ match validate_date date_part with
275275+ | (false, reason) -> (false, reason)
276276+ | (true, _) ->
277277+ match validate_time time_part with
278278+ | (false, reason) -> (false, reason)
279279+ | (true, _) -> (true, None)
280280+281281+(** Result type for datetime validation - can be Ok, Error, or Warning *)
282282+type datetime_result =
283283+ | Ok
284284+ | Error of string
285285+ | Warning of string
286286+287287+(** Validate datetime attribute - valid formats depend on element:
288288+ - del/ins: only date or datetime-with-timezone
289289+ - time: date, time, datetime-local, datetime-with-timezone, year, month, week, yearless, duration *)
290290+let validate_datetime_attr value element_name attr_name =
291291+ let is_time_element = element_name = "time" in
292292+ (* Check for leading/trailing whitespace - not allowed *)
293293+ if value <> String.trim value then begin
294294+ let tz_msg = "Bad datetime with timezone: The literal did not satisfy the datetime with timezone format." in
295295+ let date_msg = "Bad date: The literal did not satisfy the date format." in
296296+ Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s"
297297+ value attr_name element_name tz_msg date_msg)
298298+ end
299299+ else
300300+ (* Try datetime with timezone first *)
301301+ match validate_datetime_with_timezone value with
302302+ | (true, _) -> Ok (* Valid datetime with timezone *)
303303+ | (false, tz_error) ->
304304+ (* Try just date - valid for all elements *)
305305+ match validate_date value with
306306+ | (true, _) ->
307307+ (* Date is valid, but check for suspicious year (5+ digits) *)
308308+ if has_suspicious_year value then begin
309309+ let date_msg = "Bad date: Year may be mistyped." in
310310+ let tz_msg = match tz_error with
311311+ | Some e -> Printf.sprintf "Bad datetime with timezone: %s." e
312312+ | None -> "Bad datetime with timezone: The literal did not satisfy the datetime with timezone format."
313313+ in
314314+ Warning (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s"
315315+ value attr_name element_name date_msg tz_msg)
316316+ end else
317317+ Ok (* Valid date with normal year *)
318318+ | (false, date_error) ->
319319+ (* For time element only, try additional formats *)
320320+ if is_time_element then begin
321321+ match validate_datetime_local value with
322322+ | (true, _) -> Ok (* Valid datetime-local *)
323323+ | (false, _) ->
324324+ match validate_time value with
325325+ | (true, _) -> Ok (* Valid time *)
326326+ | (false, _) ->
327327+ match validate_year_month value with
328328+ | (true, _) -> Ok (* Valid month YYYY-MM *)
329329+ | (false, _) ->
330330+ match validate_year_only value with
331331+ | (true, _) -> Ok (* Valid year YYYY *)
332332+ | (false, _) ->
333333+ match validate_week value with
334334+ | (true, _) -> Ok (* Valid week YYYY-Www *)
335335+ | (false, _) ->
336336+ match validate_yearless_date value with
337337+ | (true, _) -> Ok (* Valid yearless date --MM-DD *)
338338+ | (false, _) ->
339339+ match validate_duration value with
340340+ | (true, _) -> Ok (* Valid duration P... *)
341341+ | (false, _) ->
342342+ let tz_msg = match tz_error with
343343+ | Some e -> Printf.sprintf "Bad datetime with timezone: %s." e
344344+ | None -> "Bad datetime with timezone: The literal did not satisfy the datetime with timezone format."
345345+ in
346346+ let date_msg = match date_error with
347347+ | Some e -> Printf.sprintf "Bad date: %s." e
348348+ | None -> "Bad date: The literal did not satisfy the date format."
349349+ in
350350+ Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s"
351351+ value attr_name element_name tz_msg date_msg)
352352+ end
353353+ else begin
354354+ (* del/ins only allow date or datetime-with-timezone *)
355355+ let tz_msg = match tz_error with
356356+ | Some e -> Printf.sprintf "Bad datetime with timezone: %s." e
357357+ | None -> "Bad datetime with timezone: The literal did not satisfy the datetime with timezone format."
358358+ in
359359+ let date_msg = match date_error with
360360+ | Some e -> Printf.sprintf "Bad date: %s." e
361361+ | None -> "Bad date: The literal did not satisfy the date format."
362362+ in
363363+ Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s"
364364+ value attr_name element_name tz_msg date_msg)
365365+ end
366366+367367+(** Checker state *)
368368+type state = unit
369369+370370+let create () = ()
371371+let reset _state = ()
372372+373373+let start_element _state ~name ~namespace ~attrs collector =
374374+ if namespace <> None then ()
375375+ else begin
376376+ let name_lower = String.lowercase_ascii name in
377377+ if List.mem name_lower datetime_elements then begin
378378+ (* Check for datetime attribute *)
379379+ let datetime_attr = List.find_map (fun (k, v) ->
380380+ if String.lowercase_ascii k = "datetime" then Some v else None
381381+ ) attrs in
382382+ match datetime_attr with
383383+ | None -> ()
384384+ | Some value ->
385385+ if String.trim value = "" then ()
386386+ else
387387+ match validate_datetime_attr value name "datetime" with
388388+ | Ok -> ()
389389+ | Error error_msg ->
390390+ Message_collector.add_error collector
391391+ ~message:error_msg
392392+ ~code:"bad-datetime"
393393+ ~element:name
394394+ ~attribute:"datetime"
395395+ ()
396396+ | Warning warn_msg ->
397397+ Message_collector.add_warning collector
398398+ ~message:warn_msg
399399+ ~code:"suspicious-datetime"
400400+ ~element:name
401401+ ~attribute:"datetime"
402402+ ()
403403+ end
404404+ end
405405+406406+let end_element _state ~name:_ ~namespace:_ _collector = ()
407407+let characters _state _text _collector = ()
408408+let end_document _state _collector = ()
409409+410410+let checker =
411411+ (module struct
412412+ type nonrec state = state
413413+ let create = create
414414+ let reset = reset
415415+ let start_element = start_element
416416+ let end_element = end_element
417417+ let characters = characters
418418+ let end_document = end_document
419419+ end : Checker.S)
+283
lib/html5_checker/specialized/dl_checker.ml
···11+(** DL element content model validation checker. *)
22+33+(** Checker state for tracking dl element context. *)
44+type dl_context = {
55+ mutable has_dt : bool;
66+ mutable has_dd : bool;
77+ mutable last_was_dt : bool;
88+ mutable contains_div : bool;
99+ mutable contains_dt_dd : bool;
1010+ mutable dd_before_dt_error_reported : bool; (* Track if we've reported dd-before-dt error *)
1111+}
1212+1313+type div_context = {
1414+ mutable has_dt : bool;
1515+ mutable has_dd : bool;
1616+}
1717+1818+type state = {
1919+ mutable dl_stack : dl_context list;
2020+ mutable div_in_dl_stack : div_context list;
2121+ mutable in_template : int; (* Template nesting depth *)
2222+ mutable in_dt_dd : int; (* Depth inside dt/dd elements *)
2323+ mutable parent_stack : string list; (* Stack of parent element names for context errors *)
2424+}
2525+2626+let create () = {
2727+ dl_stack = [];
2828+ div_in_dl_stack = [];
2929+ in_template = 0;
3030+ in_dt_dd = 0;
3131+ parent_stack = [];
3232+}
3333+3434+let reset state =
3535+ state.dl_stack <- [];
3636+ state.div_in_dl_stack <- [];
3737+ state.in_template <- 0;
3838+ state.in_dt_dd <- 0;
3939+ state.parent_stack <- []
4040+4141+let current_parent state =
4242+ (* The stack has current element on top, so parent is second *)
4343+ match state.parent_stack with
4444+ | _ :: p :: _ -> Some p
4545+ | _ -> None
4646+4747+let current_dl state =
4848+ match state.dl_stack with
4949+ | ctx :: _ -> Some ctx
5050+ | [] -> None
5151+5252+let current_div state =
5353+ match state.div_in_dl_stack with
5454+ | ctx :: _ -> Some ctx
5555+ | [] -> None
5656+5757+let start_element state ~name ~namespace ~attrs:_ collector =
5858+ let name_lower = String.lowercase_ascii name in
5959+6060+ (* Track parent stack for all HTML elements first *)
6161+ if namespace = None then
6262+ state.parent_stack <- name_lower :: state.parent_stack;
6363+6464+ if namespace <> None then ()
6565+ else begin
6666+ match name_lower with
6767+ | "template" ->
6868+ state.in_template <- state.in_template + 1
6969+7070+ | "dl" when state.in_template = 0 ->
7171+ (* Check for nested dl - only error if direct child (not inside dt/dd) *)
7272+ begin match current_dl state with
7373+ | Some _ when state.in_dt_dd = 0 && state.div_in_dl_stack = [] ->
7474+ Message_collector.add_error collector
7575+ ~message:"Element \xe2\x80\x9cdl\xe2\x80\x9d not allowed as child of element \xe2\x80\x9cdl\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)"
7676+ ~code:"disallowed-child"
7777+ ~element:"dl" ()
7878+ | _ -> ()
7979+ end;
8080+ let ctx = {
8181+ has_dt = false;
8282+ has_dd = false;
8383+ last_was_dt = false;
8484+ contains_div = false;
8585+ contains_dt_dd = false;
8686+ dd_before_dt_error_reported = false;
8787+ } in
8888+ state.dl_stack <- ctx :: state.dl_stack
8989+9090+ | "div" when state.in_template = 0 ->
9191+ begin match current_dl state with
9292+ | Some dl_ctx when state.div_in_dl_stack = [] ->
9393+ (* Direct div child of dl *)
9494+ dl_ctx.contains_div <- true;
9595+ (* Check for mixed content - if we already have dt/dd, div is not allowed *)
9696+ if dl_ctx.contains_dt_dd then
9797+ Message_collector.add_error collector
9898+ ~message:"Element \xe2\x80\x9cdiv\xe2\x80\x9d not allowed as child of element \xe2\x80\x9cdl\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)"
9999+ ~code:"disallowed-child"
100100+ ~element:"div" ();
101101+ let div_ctx = { has_dt = false; has_dd = false } in
102102+ state.div_in_dl_stack <- div_ctx :: state.div_in_dl_stack
103103+ | Some _ when state.div_in_dl_stack <> [] ->
104104+ (* Nested div inside div in dl - not allowed *)
105105+ Message_collector.add_error collector
106106+ ~message:"Element \xe2\x80\x9cdiv\xe2\x80\x9d not allowed as child of element \xe2\x80\x9cdiv\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)"
107107+ ~code:"disallowed-child"
108108+ ~element:"div" ()
109109+ | _ -> ()
110110+ end
111111+112112+ | "dt" when state.in_template = 0 ->
113113+ state.in_dt_dd <- state.in_dt_dd + 1;
114114+ begin match current_div state with
115115+ | Some div_ctx ->
116116+ div_ctx.has_dt <- true
117117+ | None ->
118118+ match current_dl state with
119119+ | Some dl_ctx ->
120120+ dl_ctx.has_dt <- true;
121121+ dl_ctx.last_was_dt <- true;
122122+ dl_ctx.contains_dt_dd <- true;
123123+ (* Check for mixed content - if we already have div, dt is not allowed *)
124124+ if dl_ctx.contains_div then
125125+ Message_collector.add_error collector
126126+ ~message:"Element \xe2\x80\x9cdt\xe2\x80\x9d not allowed as child of element \xe2\x80\x9cdl\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)"
127127+ ~code:"disallowed-child"
128128+ ~element:"dt" ()
129129+ | None ->
130130+ (* dt outside dl context - error *)
131131+ let parent = match current_parent state with
132132+ | Some p -> p
133133+ | None -> "document"
134134+ in
135135+ Message_collector.add_error collector
136136+ ~message:(Printf.sprintf "Element \xe2\x80\x9cdt\xe2\x80\x9d not allowed as child of element \xe2\x80\x9c%s\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)" parent)
137137+ ~code:"disallowed-child"
138138+ ~element:"dt" ()
139139+ end
140140+141141+ | "dd" when state.in_template = 0 ->
142142+ state.in_dt_dd <- state.in_dt_dd + 1;
143143+ begin match current_div state with
144144+ | Some div_ctx ->
145145+ div_ctx.has_dd <- true
146146+ | None ->
147147+ match current_dl state with
148148+ | Some dl_ctx ->
149149+ (* Check if dd appears before any dt - only report once per dl *)
150150+ if not dl_ctx.has_dt && not dl_ctx.dd_before_dt_error_reported then begin
151151+ dl_ctx.dd_before_dt_error_reported <- true;
152152+ Message_collector.add_error collector
153153+ ~message:"Element \xe2\x80\x9cdl\xe2\x80\x9d is missing a required child element."
154154+ ~code:"missing-required-child"
155155+ ~element:"dl" ()
156156+ end;
157157+ dl_ctx.has_dd <- true;
158158+ dl_ctx.last_was_dt <- false;
159159+ dl_ctx.contains_dt_dd <- true;
160160+ (* Check for mixed content *)
161161+ if dl_ctx.contains_div then
162162+ Message_collector.add_error collector
163163+ ~message:"Element \xe2\x80\x9cdd\xe2\x80\x9d not allowed as child of element \xe2\x80\x9cdl\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)"
164164+ ~code:"disallowed-child"
165165+ ~element:"dd" ()
166166+ | None ->
167167+ (* dd outside dl context - error *)
168168+ let parent = match current_parent state with
169169+ | Some p -> p
170170+ | None -> "document"
171171+ in
172172+ Message_collector.add_error collector
173173+ ~message:(Printf.sprintf "Element \xe2\x80\x9cdd\xe2\x80\x9d not allowed as child of element \xe2\x80\x9c%s\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)" parent)
174174+ ~code:"disallowed-child"
175175+ ~element:"dd" ()
176176+ end
177177+178178+ | _ -> ()
179179+ end
180180+181181+let end_element state ~name ~namespace collector =
182182+ if namespace <> None then ()
183183+ else begin
184184+ let name_lower = String.lowercase_ascii name in
185185+186186+ (* Pop from parent stack *)
187187+ (match state.parent_stack with
188188+ | _ :: rest -> state.parent_stack <- rest
189189+ | [] -> ());
190190+191191+ match name_lower with
192192+ | "template" ->
193193+ state.in_template <- max 0 (state.in_template - 1)
194194+195195+ | "dt" | "dd" when state.in_template = 0 ->
196196+ state.in_dt_dd <- max 0 (state.in_dt_dd - 1)
197197+198198+ | "dl" when state.in_template = 0 ->
199199+ begin match state.dl_stack with
200200+ | ctx :: rest ->
201201+ state.dl_stack <- rest;
202202+ (* Check dl content model at end *)
203203+ if ctx.contains_dt_dd then begin
204204+ (* Direct dt/dd content - must have both *)
205205+ if not ctx.has_dt && not ctx.dd_before_dt_error_reported then
206206+ (* Only report missing dt if we didn't already report it when dd appeared first *)
207207+ Message_collector.add_error collector
208208+ ~message:"Element \xe2\x80\x9cdl\xe2\x80\x9d is missing a required child element."
209209+ ~code:"missing-required-child"
210210+ ~element:"dl" ()
211211+ else if not ctx.has_dd then
212212+ Message_collector.add_error collector
213213+ ~message:"Element \xe2\x80\x9cdl\xe2\x80\x9d is missing required child element \xe2\x80\x9cdd\xe2\x80\x9d."
214214+ ~code:"missing-required-child"
215215+ ~element:"dl" ()
216216+ else if ctx.last_was_dt then
217217+ (* Ended with dt, missing dd *)
218218+ Message_collector.add_error collector
219219+ ~message:"Element \xe2\x80\x9cdl\xe2\x80\x9d is missing required child element \xe2\x80\x9cdd\xe2\x80\x9d."
220220+ ~code:"missing-required-child"
221221+ ~element:"dl" ()
222222+ end else if not ctx.contains_div && not ctx.has_dt && not ctx.has_dd then begin
223223+ (* Empty dl or only contained text/other elements - that's ok for now *)
224224+ ()
225225+ end
226226+ | [] -> ()
227227+ end
228228+229229+ | "div" when state.in_template = 0 ->
230230+ begin match state.div_in_dl_stack with
231231+ | div_ctx :: rest ->
232232+ state.div_in_dl_stack <- rest;
233233+ (* Check div in dl must have both dt and dd *)
234234+ if not div_ctx.has_dt && not div_ctx.has_dd then
235235+ Message_collector.add_error collector
236236+ ~message:"Element \xe2\x80\x9cdiv\xe2\x80\x9d is missing required child element \xe2\x80\x9cdd\xe2\x80\x9d."
237237+ ~code:"missing-required-child"
238238+ ~element:"div" ()
239239+ else if not div_ctx.has_dt then
240240+ Message_collector.add_error collector
241241+ ~message:"Element \xe2\x80\x9cdiv\xe2\x80\x9d is missing required child element \xe2\x80\x9cdt\xe2\x80\x9d."
242242+ ~code:"missing-required-child"
243243+ ~element:"div" ()
244244+ else if not div_ctx.has_dd then
245245+ Message_collector.add_error collector
246246+ ~message:"Element \xe2\x80\x9cdiv\xe2\x80\x9d is missing required child element \xe2\x80\x9cdd\xe2\x80\x9d."
247247+ ~code:"missing-required-child"
248248+ ~element:"div" ()
249249+ | [] -> ()
250250+ end
251251+252252+ | _ -> ()
253253+ end
254254+255255+let characters state text collector =
256256+ if state.in_template > 0 then ()
257257+ else if state.in_dt_dd > 0 then () (* Text in dt/dd is fine *)
258258+ else begin
259259+ let trimmed = String.trim text in
260260+ if trimmed <> "" then begin
261261+ (* Check for text directly in dl *)
262262+ match current_dl state with
263263+ | Some _ when state.div_in_dl_stack = [] ->
264264+ Message_collector.add_error collector
265265+ ~message:"Text not allowed in element \xe2\x80\x9cdl\xe2\x80\x9d in this context."
266266+ ~code:"text-not-allowed"
267267+ ~element:"dl" ()
268268+ | _ -> ()
269269+ end
270270+ end
271271+272272+let end_document _state _collector = ()
273273+274274+let checker =
275275+ (module struct
276276+ type nonrec state = state
277277+ let create = create
278278+ let reset = reset
279279+ let start_element = start_element
280280+ let end_element = end_element
281281+ let characters = characters
282282+ let end_document = end_document
283283+ end : Checker.S)
+42
lib/html5_checker/specialized/h1_checker.ml
···11+(** H1 element counter - warns about multiple h1 elements in a document. *)
22+33+type state = {
44+ mutable h1_count : int;
55+}
66+77+let create () = {
88+ h1_count = 0;
99+}
1010+1111+let reset state =
1212+ state.h1_count <- 0
1313+1414+let start_element state ~name ~namespace ~attrs collector =
1515+ ignore attrs;
1616+ if namespace <> None then ()
1717+ else begin
1818+ let name_lower = String.lowercase_ascii name in
1919+ if name_lower = "h1" then begin
2020+ state.h1_count <- state.h1_count + 1;
2121+ if state.h1_count > 1 then
2222+ Message_collector.add_info collector
2323+ ~message:"Consider using only one \xe2\x80\x9ch1\xe2\x80\x9d element per document (or, if using \xe2\x80\x9ch1\xe2\x80\x9d elements multiple times is required, consider using the \xe2\x80\x9cheadingoffset\xe2\x80\x9d attribute to indicate that these \xe2\x80\x9ch1\xe2\x80\x9d elements are not all top-level headings)."
2424+ ~code:"multiple-h1"
2525+ ~element:name ()
2626+ end
2727+ end
2828+2929+let end_element _state ~name:_ ~namespace:_ _collector = ()
3030+let characters _state _text _collector = ()
3131+let end_document _state _collector = ()
3232+3333+let checker =
3434+ (module struct
3535+ type nonrec state = state
3636+ let create = create
3737+ let reset = reset
3838+ let start_element = start_element
3939+ let end_element = end_element
4040+ let characters = characters
4141+ let end_document = end_document
4242+ end : Checker.S)
+1-1
lib/html5_checker/specialized/heading_checker.ml
···8080 state.h1_count <- state.h1_count + 1;
8181 if state.h1_count > 1 then
8282 Message_collector.add_warning collector
8383- ~message:"Multiple <h1> elements detected. While valid in HTML5 sectioning content, traditional advice suggests one <h1> per page"
8383+ ~message:"Consider using only one \xe2\x80\x9ch1\xe2\x80\x9d element per document (or, if using \xe2\x80\x9ch1\xe2\x80\x9d elements multiple times is required, consider using the \xe2\x80\x9cheadingoffset\xe2\x80\x9d attribute to indicate that these \xe2\x80\x9ch1\xe2\x80\x9d elements are not all top-level headings)."
8484 ~code:"multiple-h1"
8585 ~element:name
8686 ()
+115
lib/html5_checker/specialized/label_checker.ml
···11+(** Label element content model validation checker.
22+ Validates that label element contains at most one labelable element
33+ and that descendants with for attribute have matching ids. *)
44+55+(** Labelable elements that label can reference *)
66+let labelable_elements = ["button"; "input"; "meter"; "output"; "progress"; "select"; "textarea"]
77+88+(** Helper to get attribute value *)
99+let get_attr attrs name =
1010+ let name_lower = String.lowercase_ascii name in
1111+ List.find_map (fun (n, v) ->
1212+ if String.lowercase_ascii n = name_lower then Some v else None
1313+ ) attrs
1414+1515+type state = {
1616+ mutable in_label : bool;
1717+ mutable label_depth : int;
1818+ mutable labelable_count : int;
1919+ mutable label_for_value : string option; (* Value of for attribute on current label *)
2020+}
2121+2222+let create () = {
2323+ in_label = false;
2424+ label_depth = 0;
2525+ labelable_count = 0;
2626+ label_for_value = None;
2727+}
2828+2929+let reset state =
3030+ state.in_label <- false;
3131+ state.label_depth <- 0;
3232+ state.labelable_count <- 0;
3333+ state.label_for_value <- None
3434+3535+let start_element state ~name ~namespace ~attrs collector =
3636+ if namespace <> None then ()
3737+ else begin
3838+ let name_lower = String.lowercase_ascii name in
3939+4040+ if name_lower = "label" then begin
4141+ state.in_label <- true;
4242+ state.label_depth <- 0;
4343+ state.labelable_count <- 0;
4444+ state.label_for_value <- get_attr attrs "for"
4545+ end;
4646+4747+ if state.in_label then begin
4848+ state.label_depth <- state.label_depth + 1;
4949+5050+ (* Check for labelable elements inside label *)
5151+ if List.mem name_lower labelable_elements then begin
5252+ state.labelable_count <- state.labelable_count + 1;
5353+ if state.labelable_count > 1 then
5454+ Message_collector.add_error collector
5555+ ~message:"The \xe2\x80\x9clabel\xe2\x80\x9d element may contain at most one \xe2\x80\x9cbutton\xe2\x80\x9d, \xe2\x80\x9cinput\xe2\x80\x9d, \xe2\x80\x9cmeter\xe2\x80\x9d, \xe2\x80\x9coutput\xe2\x80\x9d, \xe2\x80\x9cprogress\xe2\x80\x9d, \xe2\x80\x9cselect\xe2\x80\x9d, or \xe2\x80\x9ctextarea\xe2\x80\x9d descendant."
5656+ ~code:"too-many-labelable-descendants"
5757+ ~element:"label" ();
5858+5959+ (* Check if label has for attribute and descendant has mismatched id *)
6060+ match state.label_for_value with
6161+ | Some for_value ->
6262+ let descendant_id = get_attr attrs "id" in
6363+ (match descendant_id with
6464+ | None ->
6565+ (* Descendant has no id, but label has for attribute *)
6666+ Message_collector.add_error collector
6767+ ~message:(Printf.sprintf "Any \xe2\x80\x9c%s\xe2\x80\x9d descendant of a \xe2\x80\x9clabel\xe2\x80\x9d element with a \xe2\x80\x9cfor\xe2\x80\x9d attribute must have an ID value that matches that \xe2\x80\x9cfor\xe2\x80\x9d attribute." name_lower)
6868+ ~code:"label-for-descendant-id-mismatch"
6969+ ~element:name_lower ()
7070+ | Some id when id <> for_value ->
7171+ (* Descendant has id, but it doesn't match the for value *)
7272+ Message_collector.add_error collector
7373+ ~message:(Printf.sprintf "Any \xe2\x80\x9c%s\xe2\x80\x9d descendant of a \xe2\x80\x9clabel\xe2\x80\x9d element with a \xe2\x80\x9cfor\xe2\x80\x9d attribute must have an ID value that matches that \xe2\x80\x9cfor\xe2\x80\x9d attribute." name_lower)
7474+ ~code:"label-for-descendant-id-mismatch"
7575+ ~element:name_lower ()
7676+ | Some _ ->
7777+ (* id matches for value - no error *)
7878+ ())
7979+ | None ->
8080+ (* No for attribute on label - no constraint on descendant id *)
8181+ ()
8282+ end
8383+ end
8484+ end
8585+8686+let end_element state ~name ~namespace _collector =
8787+ if namespace <> None then ()
8888+ else begin
8989+ let name_lower = String.lowercase_ascii name in
9090+9191+ if state.in_label then begin
9292+ state.label_depth <- state.label_depth - 1;
9393+9494+ if name_lower = "label" && state.label_depth < 0 then begin
9595+ state.in_label <- false;
9696+ state.labelable_count <- 0;
9797+ state.label_for_value <- None
9898+ end
9999+ end
100100+ end
101101+102102+let characters _state _text _collector = ()
103103+104104+let end_document _state _collector = ()
105105+106106+let checker =
107107+ (module struct
108108+ type nonrec state = state
109109+ let create = create
110110+ let reset = reset
111111+ let start_element = start_element
112112+ let end_element = end_element
113113+ let characters = characters
114114+ let end_document = end_document
115115+ end : Checker.S)
+192
lib/html5_checker/specialized/picture_checker.ml
···11+(** Picture element content model and attribute validation checker. *)
22+33+(** Elements allowed as children of picture *)
44+let allowed_picture_children = ["source"; "img"; "script"; "template"]
55+66+(** Attributes NOT allowed on picture element *)
77+let disallowed_picture_attrs = [
88+ "align"; "alt"; "border"; "crossorigin"; "height"; "hspace"; "ismap";
99+ "longdesc"; "lowsrc"; "media"; "name"; "sizes"; "src"; "srcset"; "usemap";
1010+ "vspace"; "width"; "role"
1111+]
1212+1313+(** Attributes NOT allowed on source element when in picture context *)
1414+let disallowed_source_attrs_in_picture = [
1515+ "align"; "alt"; "border"; "crossorigin"; "hspace"; "ismap"; "longdesc";
1616+ "name"; "src"; "usemap"; "vspace"; "role"
1717+]
1818+1919+(** Attributes NOT allowed on img element *)
2020+let disallowed_img_attrs = ["type"]
2121+2222+(** Checker state. *)
2323+type state = {
2424+ mutable in_picture : bool;
2525+ mutable has_img_in_picture : bool;
2626+ mutable picture_depth : int;
2727+ mutable children_in_picture : string list;
2828+ mutable last_was_img : bool;
2929+ mutable has_source_after_img : bool;
3030+}
3131+3232+let create () = {
3333+ in_picture = false;
3434+ has_img_in_picture = false;
3535+ picture_depth = 0;
3636+ children_in_picture = [];
3737+ last_was_img = false;
3838+ has_source_after_img = false;
3939+}
4040+4141+let reset state =
4242+ state.in_picture <- false;
4343+ state.has_img_in_picture <- false;
4444+ state.picture_depth <- 0;
4545+ state.children_in_picture <- [];
4646+ state.last_was_img <- false;
4747+ state.has_source_after_img <- false
4848+4949+(** Check if an attribute list contains a specific attribute. *)
5050+let has_attr name attrs =
5151+ List.exists (fun (attr_name, _) -> String.lowercase_ascii attr_name = name) attrs
5252+5353+(** Report disallowed attribute error *)
5454+let report_disallowed_attr element attr collector =
5555+ Message_collector.add_error collector
5656+ ~message:(Printf.sprintf "Attribute \xe2\x80\x9c%s\xe2\x80\x9d not allowed on element \xe2\x80\x9c%s\xe2\x80\x9d at this point."
5757+ attr element)
5858+ ~code:"disallowed-attribute"
5959+ ~element ~attribute:attr ()
6060+6161+(** Report disallowed child element error *)
6262+let report_disallowed_child parent child collector =
6363+ Message_collector.add_error collector
6464+ ~message:(Printf.sprintf "Element \xe2\x80\x9c%s\xe2\x80\x9d not allowed as child of element \xe2\x80\x9c%s\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)"
6565+ child parent)
6666+ ~code:"disallowed-child"
6767+ ~element:child ()
6868+6969+let check_picture_attrs attrs collector =
7070+ List.iter (fun disallowed ->
7171+ if has_attr disallowed attrs then
7272+ report_disallowed_attr "picture" disallowed collector
7373+ ) disallowed_picture_attrs
7474+7575+let check_source_attrs_in_picture attrs collector =
7676+ List.iter (fun disallowed ->
7777+ if has_attr disallowed attrs then
7878+ report_disallowed_attr "source" disallowed collector
7979+ ) disallowed_source_attrs_in_picture;
8080+ (* source in picture requires srcset *)
8181+ if not (has_attr "srcset" attrs) then
8282+ Message_collector.add_error collector
8383+ ~message:"Element \xe2\x80\x9csource\xe2\x80\x9d is missing required attribute \xe2\x80\x9csrcset\xe2\x80\x9d."
8484+ ~code:"missing-required-attribute"
8585+ ~element:"source" ~attribute:"srcset" ()
8686+8787+let check_img_attrs attrs collector =
8888+ List.iter (fun disallowed ->
8989+ if has_attr disallowed attrs then
9090+ report_disallowed_attr "img" disallowed collector
9191+ ) disallowed_img_attrs
9292+9393+let start_element state ~name ~namespace ~attrs collector =
9494+ let name_lower = String.lowercase_ascii name in
9595+9696+ (* Check for disallowed children of picture first - even foreign content *)
9797+ if state.in_picture && state.picture_depth = 1 then begin
9898+ if not (List.mem name_lower allowed_picture_children) then
9999+ report_disallowed_child "picture" name_lower collector
100100+ end;
101101+102102+ (* Rest of checks only apply to HTML namespace elements *)
103103+ if namespace = None then begin
104104+ match name_lower with
105105+ | "picture" ->
106106+ check_picture_attrs attrs collector;
107107+ state.in_picture <- true;
108108+ state.has_img_in_picture <- false;
109109+ state.picture_depth <- 0; (* Will be incremented to 1 at end of function *)
110110+ state.children_in_picture <- [];
111111+ state.last_was_img <- false;
112112+ state.has_source_after_img <- false
113113+114114+ | "source" when state.in_picture && state.picture_depth = 1 ->
115115+ check_source_attrs_in_picture attrs collector;
116116+ state.children_in_picture <- "source" :: state.children_in_picture;
117117+ if state.last_was_img then
118118+ state.has_source_after_img <- true
119119+120120+ | "img" when state.in_picture && state.picture_depth = 1 ->
121121+ check_img_attrs attrs collector;
122122+ state.has_img_in_picture <- true;
123123+ state.children_in_picture <- "img" :: state.children_in_picture;
124124+ state.last_was_img <- true;
125125+ (* Check for multiple img elements *)
126126+ let img_count = List.filter (fun c -> c = "img") state.children_in_picture |> List.length in
127127+ if img_count > 1 then
128128+ report_disallowed_child "picture" "img" collector
129129+130130+ | "script" when state.in_picture && state.picture_depth = 1 ->
131131+ state.children_in_picture <- "script" :: state.children_in_picture
132132+133133+ | "template" when state.in_picture && state.picture_depth = 1 ->
134134+ state.children_in_picture <- "template" :: state.children_in_picture
135135+136136+ | "img" ->
137137+ check_img_attrs attrs collector
138138+139139+ | _ -> ()
140140+ end;
141141+142142+ (* Track depth when inside picture *)
143143+ if state.in_picture then
144144+ state.picture_depth <- state.picture_depth + 1
145145+146146+let end_element state ~name ~namespace collector =
147147+ if namespace <> None then ()
148148+ else begin
149149+ let name_lower = String.lowercase_ascii name in
150150+151151+ (* Track depth *)
152152+ if state.in_picture then
153153+ state.picture_depth <- state.picture_depth - 1;
154154+155155+ if name_lower = "picture" && state.picture_depth = 0 then begin
156156+ (* Check if picture had img child *)
157157+ if not state.has_img_in_picture then
158158+ Message_collector.add_error collector
159159+ ~message:"Element \xe2\x80\x9cpicture\xe2\x80\x9d is missing required child element \xe2\x80\x9cimg\xe2\x80\x9d."
160160+ ~code:"missing-required-child"
161161+ ~element:"picture" ();
162162+ (* Check for source after img *)
163163+ if state.has_source_after_img then
164164+ report_disallowed_child "picture" "source" collector;
165165+166166+ state.in_picture <- false
167167+ end
168168+ end
169169+170170+let characters state text collector =
171171+ (* Text in picture element is not allowed *)
172172+ if state.in_picture && state.picture_depth = 1 then begin
173173+ let trimmed = String.trim text in
174174+ if trimmed <> "" then
175175+ Message_collector.add_error collector
176176+ ~message:"Text not allowed in element \xe2\x80\x9cpicture\xe2\x80\x9d in this context."
177177+ ~code:"text-not-allowed"
178178+ ~element:"picture" ()
179179+ end
180180+181181+let end_document _state _collector = ()
182182+183183+let checker =
184184+ (module struct
185185+ type nonrec state = state
186186+ let create = create
187187+ let reset = reset
188188+ let start_element = start_element
189189+ let end_element = end_element
190190+ let characters = characters
191191+ let end_document = end_document
192192+ end : Checker.S)
+141
lib/html5_checker/specialized/ruby_checker.ml
···11+(** Ruby element content model validation checker.
22+33+ Validates that:
44+ - Ruby contains at least one rt element
55+ - Ruby contains phrasing content before rt elements *)
66+77+type ruby_info = {
88+ mutable has_rt : bool;
99+ mutable has_content_before_rt : bool;
1010+ mutable saw_rt : bool; (* Whether we've seen rt yet *)
1111+ mutable depth : int; (* Track nesting level *)
1212+}
1313+1414+type state = {
1515+ mutable ruby_stack : ruby_info list; (* Stack for nested ruby elements *)
1616+ mutable in_template : int;
1717+}
1818+1919+let create () = {
2020+ ruby_stack = [];
2121+ in_template = 0;
2222+}
2323+2424+let reset state =
2525+ state.ruby_stack <- [];
2626+ state.in_template <- 0
2727+2828+(** Check if element is phrasing content that can appear before rt *)
2929+let is_phrasing_content name =
3030+ let name_lower = String.lowercase_ascii name in
3131+ (* rt and rp are special - they don't count as "content before rt" *)
3232+ name_lower <> "rt" && name_lower <> "rp"
3333+3434+let start_element state ~name ~namespace ~attrs _collector =
3535+ ignore attrs;
3636+ if namespace <> None then ()
3737+ else begin
3838+ let name_lower = String.lowercase_ascii name in
3939+4040+ if name_lower = "template" then
4141+ state.in_template <- state.in_template + 1;
4242+4343+ if state.in_template > 0 then ()
4444+ else begin
4545+ if name_lower = "ruby" then begin
4646+ (* Push new ruby context *)
4747+ let info = {
4848+ has_rt = false;
4949+ has_content_before_rt = false;
5050+ saw_rt = false;
5151+ depth = 0;
5252+ } in
5353+ state.ruby_stack <- info :: state.ruby_stack
5454+ end;
5555+5656+ match state.ruby_stack with
5757+ | info :: _ ->
5858+ (* Inside a ruby element *)
5959+ if name_lower = "ruby" then begin
6060+ (* This is the opening of ruby, set depth to 1 *)
6161+ info.depth <- 1
6262+ end else begin
6363+ if info.depth = 1 then begin
6464+ (* Direct children of ruby *)
6565+ if name_lower = "rt" then begin
6666+ info.has_rt <- true;
6767+ info.saw_rt <- true
6868+ end else if is_phrasing_content name_lower then begin
6969+ if not info.saw_rt then
7070+ info.has_content_before_rt <- true
7171+ end
7272+ end;
7373+ info.depth <- info.depth + 1
7474+ end
7575+ | [] -> ()
7676+ end
7777+ end
7878+7979+let end_element state ~name ~namespace collector =
8080+ if namespace <> None then ()
8181+ else begin
8282+ let name_lower = String.lowercase_ascii name in
8383+8484+ if name_lower = "template" && state.in_template > 0 then
8585+ state.in_template <- state.in_template - 1;
8686+8787+ if state.in_template > 0 then ()
8888+ else begin
8989+ match state.ruby_stack with
9090+ | info :: rest ->
9191+ info.depth <- info.depth - 1;
9292+ (* Check if this is the closing ruby tag (depth becomes 0 when ruby closes) *)
9393+ if name_lower = "ruby" && info.depth <= 0 then begin
9494+ (* Closing ruby element - validate *)
9595+ if not info.has_rt then
9696+ Message_collector.add_error collector
9797+ ~message:"Element \xe2\x80\x9cruby\xe2\x80\x9d is missing one or more of the following child elements: [rp, rt]."
9898+ ~code:"ruby-missing-rt"
9999+ ~element:"ruby" ()
100100+ else if not info.has_content_before_rt then
101101+ Message_collector.add_error collector
102102+ ~message:"Element \xe2\x80\x9cruby\xe2\x80\x9d is missing required child element \xe2\x80\x9crt\xe2\x80\x9d."
103103+ ~code:"ruby-missing-content"
104104+ ~element:"ruby" ();
105105+ state.ruby_stack <- rest
106106+ end
107107+ | [] -> ()
108108+ end
109109+ end
110110+111111+let characters state text _collector =
112112+ (* Text content counts as phrasing content before rt *)
113113+ if state.in_template > 0 then ()
114114+ else begin
115115+ match state.ruby_stack with
116116+ | info :: _ ->
117117+ if info.depth = 1 then begin
118118+ (* Direct text child of ruby *)
119119+ let has_non_whitespace =
120120+ String.exists (fun c ->
121121+ c <> ' ' && c <> '\t' && c <> '\n' && c <> '\r'
122122+ ) text
123123+ in
124124+ if has_non_whitespace && not info.saw_rt then
125125+ info.has_content_before_rt <- true
126126+ end
127127+ | [] -> ()
128128+ end
129129+130130+let end_document _state _collector = ()
131131+132132+let checker =
133133+ (module struct
134134+ type nonrec state = state
135135+ let create = create
136136+ let reset = reset
137137+ let start_element = start_element
138138+ let end_element = end_element
139139+ let characters = characters
140140+ let end_document = end_document
141141+ end : Checker.S)
+103
lib/html5_checker/specialized/source_checker.ml
···11+(** Source element context validation checker.
22+ Validates that source attributes are appropriate for the parent context. *)
33+44+type parent_context =
55+ | Picture
66+ | Video
77+ | Audio
88+ | Other
99+1010+type state = {
1111+ mutable context_stack : parent_context list;
1212+}
1313+1414+let create () = {
1515+ context_stack = [];
1616+}
1717+1818+let reset state =
1919+ state.context_stack <- []
2020+2121+let current_context state =
2222+ match state.context_stack with
2323+ | ctx :: _ -> ctx
2424+ | [] -> Other
2525+2626+(** Check if an attribute list contains a specific attribute. *)
2727+let has_attr name attrs =
2828+ List.exists (fun (attr_name, _) -> String.lowercase_ascii attr_name = name) attrs
2929+3030+let start_element state ~name ~namespace ~attrs collector =
3131+ if namespace <> None then ()
3232+ else begin
3333+ let name_lower = String.lowercase_ascii name in
3434+ match name_lower with
3535+ | "picture" ->
3636+ state.context_stack <- Picture :: state.context_stack
3737+ | "video" ->
3838+ state.context_stack <- Video :: state.context_stack
3939+ | "audio" ->
4040+ state.context_stack <- Audio :: state.context_stack
4141+ | "source" ->
4242+ let ctx = current_context state in
4343+ begin match ctx with
4444+ | Video | Audio ->
4545+ (* srcset is not allowed on source inside video/audio *)
4646+ if has_attr "srcset" attrs then
4747+ Message_collector.add_error collector
4848+ ~message:"Attribute \xe2\x80\x9csrcset\xe2\x80\x9d not allowed on element \xe2\x80\x9csource\xe2\x80\x9d at this point."
4949+ ~code:"disallowed-attribute"
5050+ ~element:name ~attribute:"srcset" ();
5151+ (* sizes is not allowed on source inside video/audio *)
5252+ if has_attr "sizes" attrs then
5353+ Message_collector.add_error collector
5454+ ~message:"Attribute \xe2\x80\x9csizes\xe2\x80\x9d not allowed on element \xe2\x80\x9csource\xe2\x80\x9d at this point."
5555+ ~code:"disallowed-attribute"
5656+ ~element:name ~attribute:"sizes" ();
5757+ (* Note: media IS allowed on source in video/audio for source selection *)
5858+ (* width/height not allowed on source inside video/audio *)
5959+ if has_attr "width" attrs then
6060+ Message_collector.add_error collector
6161+ ~message:"Attribute \xe2\x80\x9cwidth\xe2\x80\x9d not allowed on element \xe2\x80\x9csource\xe2\x80\x9d at this point."
6262+ ~code:"disallowed-attribute"
6363+ ~element:name ~attribute:"width" ();
6464+ if has_attr "height" attrs then
6565+ Message_collector.add_error collector
6666+ ~message:"Attribute \xe2\x80\x9cheight\xe2\x80\x9d not allowed on element \xe2\x80\x9csource\xe2\x80\x9d at this point."
6767+ ~code:"disallowed-attribute"
6868+ ~element:name ~attribute:"height" ()
6969+ | Picture | Other ->
7070+ (* In picture context or other contexts, these attributes might be valid *)
7171+ ()
7272+ end
7373+ | _ ->
7474+ (* Any other element maintains current context *)
7575+ ()
7676+ end
7777+7878+let end_element state ~name ~namespace _collector =
7979+ if namespace <> None then ()
8080+ else begin
8181+ let name_lower = String.lowercase_ascii name in
8282+ match name_lower with
8383+ | "picture" | "video" | "audio" ->
8484+ (match state.context_stack with
8585+ | _ :: rest -> state.context_stack <- rest
8686+ | [] -> ())
8787+ | _ -> ()
8888+ end
8989+9090+let characters _state _text _collector = ()
9191+9292+let end_document _state _collector = ()
9393+9494+let checker =
9595+ (module struct
9696+ type nonrec state = state
9797+ let create = create
9898+ let reset = reset
9999+ let start_element = start_element
100100+ let end_element = end_element
101101+ let characters = characters
102102+ let end_document = end_document
103103+ end : Checker.S)
+98
lib/html5_checker/specialized/title_checker.ml
···11+(** Title element validation checker. *)
22+33+type state = {
44+ mutable in_head : bool;
55+ mutable has_title : bool;
66+ mutable in_title : bool;
77+ mutable title_has_content : bool;
88+ mutable title_depth : int;
99+ mutable is_iframe_srcdoc : bool;
1010+}
1111+1212+let create () = {
1313+ in_head = false;
1414+ has_title = false;
1515+ in_title = false;
1616+ title_has_content = false;
1717+ title_depth = 0;
1818+ is_iframe_srcdoc = false;
1919+}
2020+2121+let reset state =
2222+ state.in_head <- false;
2323+ state.has_title <- false;
2424+ state.in_title <- false;
2525+ state.title_has_content <- false;
2626+ state.title_depth <- 0;
2727+ state.is_iframe_srcdoc <- false
2828+2929+let start_element state ~name ~namespace ~attrs collector =
3030+ ignore (collector, attrs);
3131+ if namespace <> None then ()
3232+ else begin
3333+ let name_lower = String.lowercase_ascii name in
3434+ match name_lower with
3535+ | "html" ->
3636+ (* Check if this is an iframe srcdoc - title is not required *)
3737+ (* We detect this by checking for srcdoc context - not directly checkable from HTML,
3838+ but we can assume normal HTML document for now *)
3939+ ()
4040+ | "head" ->
4141+ state.in_head <- true
4242+ | "title" when state.in_head ->
4343+ state.has_title <- true;
4444+ state.in_title <- true;
4545+ state.title_has_content <- false;
4646+ state.title_depth <- 0
4747+ | _ -> ()
4848+ end;
4949+ if state.in_title then
5050+ state.title_depth <- state.title_depth + 1
5151+5252+let end_element state ~name ~namespace collector =
5353+ if namespace <> None then ()
5454+ else begin
5555+ let name_lower = String.lowercase_ascii name in
5656+5757+ if state.in_title then
5858+ state.title_depth <- state.title_depth - 1;
5959+6060+ match name_lower with
6161+ | "title" when state.in_title && state.title_depth = 0 ->
6262+ (* Check if title was empty *)
6363+ if not state.title_has_content then
6464+ Message_collector.add_error collector
6565+ ~message:"Element \xe2\x80\x9ctitle\xe2\x80\x9d must not be empty."
6666+ ~code:"empty-title"
6767+ ~element:name ();
6868+ state.in_title <- false
6969+ | "head" ->
7070+ (* Check if head had a title element *)
7171+ if state.in_head && not state.has_title then
7272+ Message_collector.add_error collector
7373+ ~message:"Element \xe2\x80\x9chead\xe2\x80\x9d is missing required child element \xe2\x80\x9ctitle\xe2\x80\x9d."
7474+ ~code:"missing-required-child"
7575+ ~element:"head" ();
7676+ state.in_head <- false
7777+ | _ -> ()
7878+ end
7979+8080+let characters state text _collector =
8181+ if state.in_title then begin
8282+ let trimmed = String.trim text in
8383+ if trimmed <> "" then
8484+ state.title_has_content <- true
8585+ end
8686+8787+let end_document _state _collector = ()
8888+8989+let checker =
9090+ (module struct
9191+ type nonrec state = state
9292+ let create = create
9393+ let reset = reset
9494+ let start_element = start_element
9595+ let end_element = end_element
9696+ let characters = characters
9797+ let end_document = end_document
9898+ end : Checker.S)
+792
lib/html5_checker/specialized/url_checker.ml
···11+(** URL validation checker for href, src, action, and other URL attributes. *)
22+33+(** Attributes that contain URLs and should be validated.
44+ Note: srcset uses special microsyntax, not validated as URL here.
55+ Note: input[value] is only checked for type="url", handled specially below. *)
66+let url_attributes = [
77+ ("a", ["href"]);
88+ ("area", ["href"]);
99+ ("audio", ["src"]);
1010+ ("base", ["href"]);
1111+ ("blockquote", ["cite"]);
1212+ ("button", ["formaction"]);
1313+ ("del", ["cite"]);
1414+ ("embed", ["src"]);
1515+ ("form", ["action"]);
1616+ ("iframe", ["src"]);
1717+ ("img", ["src"]);
1818+ ("input", ["formaction"; "src"]);
1919+ ("ins", ["cite"]);
2020+ ("link", ["href"]);
2121+ ("object", ["data"]);
2222+ ("q", ["cite"]);
2323+ ("script", ["src"]);
2424+ ("source", ["src"]);
2525+ ("track", ["src"]);
2626+ ("video", ["src"; "poster"]);
2727+]
2828+2929+(** Characters not allowed in URL host. *)
3030+let invalid_host_chars = ['^'; '`'; '{'; '}'; '<'; '>']
3131+3232+(** Check if a host looks like an IPv6 address (starts with [). *)
3333+let is_ipv6_host host =
3434+ String.length host > 0 && host.[0] = '['
3535+3636+(** Check if character is valid in IPv6 address. *)
3737+let is_valid_ipv6_char c =
3838+ (c >= '0' && c <= '9') ||
3939+ (c >= 'a' && c <= 'f') ||
4040+ (c >= 'A' && c <= 'F') ||
4141+ c = ':' || c = '.' || c = '[' || c = ']'
4242+4343+(** Validate IPv6 bracketed host. *)
4444+let validate_ipv6_host host url attr_name element_name =
4545+ (* Host should be in format [xxxx:...] *)
4646+ if String.length host < 3 then
4747+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character."
4848+ url attr_name element_name)
4949+ else begin
5050+ (* Check if all characters are valid IPv6 chars *)
5151+ let invalid_char = String.exists (fun c -> not (is_valid_ipv6_char c)) host in
5252+ if invalid_char then
5353+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character."
5454+ url attr_name element_name)
5555+ else
5656+ None
5757+ end
5858+5959+(** Check if a file URL host is a valid Windows drive letter (like C|). *)
6060+let is_valid_windows_drive host =
6161+ String.length host = 2 &&
6262+ ((host.[0] >= 'A' && host.[0] <= 'Z') || (host.[0] >= 'a' && host.[0] <= 'z')) &&
6363+ host.[1] = '|'
6464+6565+(** Check if pipe is allowed in this host context. *)
6666+let is_pipe_allowed_in_host url host =
6767+ let scheme = try String.lowercase_ascii (String.sub url 0 (String.index url ':')) with _ -> "" in
6868+ scheme = "file" && is_valid_windows_drive host
6969+7070+(** Special schemes that require double slash (//).
7171+ Note: file: is special but doesn't always require //.
7272+ Note: ws and wss allow single/no slash forms per WHATWG URL Standard. *)
7373+let special_schemes_require_double_slash = ["http"; "https"; "ftp"]
7474+7575+(** Special schemes (for other checks). *)
7676+let special_schemes = ["http"; "https"; "ftp"; "ws"; "wss"; "file"]
7777+7878+(** Extract scheme from URL. *)
7979+let extract_scheme url =
8080+ (* A scheme must start with a letter, not [ or other special chars *)
8181+ if String.length url = 0 then None
8282+ else if not (url.[0] >= 'a' && url.[0] <= 'z' || url.[0] >= 'A' && url.[0] <= 'Z') then
8383+ None
8484+ else
8585+ try
8686+ let colon_pos = String.index url ':' in
8787+ (* Scheme can only contain letters, digits, +, -, . *)
8888+ let potential_scheme = String.sub url 0 colon_pos in
8989+ let is_valid_scheme = String.length potential_scheme > 0 &&
9090+ String.for_all (fun c ->
9191+ (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
9292+ (c >= '0' && c <= '9') || c = '+' || c = '-' || c = '.'
9393+ ) potential_scheme in
9494+ if is_valid_scheme then
9595+ Some (String.lowercase_ascii potential_scheme)
9696+ else
9797+ None
9898+ with Not_found -> None
9999+100100+(** Extract host and port from URL. Returns (host option, port_string option). *)
101101+let extract_host_and_port url =
102102+ try
103103+ let double_slash =
104104+ try Some (Str.search_forward (Str.regexp "://") url 0 + 3)
105105+ with Not_found -> None
106106+ in
107107+ match double_slash with
108108+ | None -> (None, None)
109109+ | Some start_pos ->
110110+ let rest = String.sub url start_pos (String.length url - start_pos) in
111111+ (* Find end of authority (/ ? # or end) *)
112112+ let auth_end =
113113+ let find_char c = try Some (String.index rest c) with Not_found -> None in
114114+ match find_char '/', find_char '?', find_char '#' with
115115+ | Some a, Some b, Some c -> min a (min b c)
116116+ | Some a, Some b, None -> min a b
117117+ | Some a, None, Some c -> min a c
118118+ | None, Some b, Some c -> min b c
119119+ | Some a, None, None -> a
120120+ | None, Some b, None -> b
121121+ | None, None, Some c -> c
122122+ | None, None, None -> String.length rest
123123+ in
124124+ let authority = String.sub rest 0 auth_end in
125125+ (* Remove userinfo if present *)
126126+ let host_port =
127127+ try
128128+ let at_pos = String.rindex authority '@' in
129129+ String.sub authority (at_pos + 1) (String.length authority - at_pos - 1)
130130+ with Not_found -> authority
131131+ in
132132+ (* Handle IPv6 addresses *)
133133+ if String.length host_port > 0 && host_port.[0] = '[' then begin
134134+ try
135135+ let bracket_end = String.index host_port ']' in
136136+ let host = String.sub host_port 0 (bracket_end + 1) in
137137+ let after_bracket = String.sub host_port (bracket_end + 1) (String.length host_port - bracket_end - 1) in
138138+ if String.length after_bracket > 0 && after_bracket.[0] = ':' then
139139+ (Some host, Some (String.sub after_bracket 1 (String.length after_bracket - 1)))
140140+ else
141141+ (Some host, None)
142142+ with Not_found -> (Some host_port, None)
143143+ end else begin
144144+ (* Regular host:port - use FIRST colon to separate host from port
145145+ (per WHATWG URL Standard for special schemes) *)
146146+ try
147147+ let colon_pos = String.index host_port ':' in
148148+ let host = String.sub host_port 0 colon_pos in
149149+ let port = String.sub host_port (colon_pos + 1) (String.length host_port - colon_pos - 1) in
150150+ (Some host, Some port)
151151+ with Not_found -> (Some host_port, None)
152152+ end
153153+ with _ -> (None, None)
154154+155155+(** Check if character is a valid hex digit (for percent-decoding). *)
156156+let is_hex_digit_for_decode c =
157157+ (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
158158+159159+(** Convert a hex character to its numeric value. *)
160160+let hex_value c =
161161+ if c >= '0' && c <= '9' then Char.code c - Char.code '0'
162162+ else if c >= 'a' && c <= 'f' then Char.code c - Char.code 'a' + 10
163163+ else if c >= 'A' && c <= 'F' then Char.code c - Char.code 'A' + 10
164164+ else 0
165165+166166+(** Percent-decode a string. Returns the decoded bytes. *)
167167+let percent_decode s =
168168+ let buf = Buffer.create (String.length s) in
169169+ let len = String.length s in
170170+ let i = ref 0 in
171171+ while !i < len do
172172+ if s.[!i] = '%' && !i + 2 < len && is_hex_digit_for_decode s.[!i + 1] && is_hex_digit_for_decode s.[!i + 2] then begin
173173+ let byte = hex_value s.[!i + 1] * 16 + hex_value s.[!i + 2] in
174174+ Buffer.add_char buf (Char.chr byte);
175175+ i := !i + 3
176176+ end else begin
177177+ Buffer.add_char buf s.[!i];
178178+ incr i
179179+ end
180180+ done;
181181+ Buffer.contents buf
182182+183183+(** Check if decoded bytes contain invalid Unicode noncharacters or surrogates.
184184+ These are forbidden in hostnames per WHATWG URL Standard.
185185+ - U+FDD0-U+FDEF: noncharacters
186186+ - U+FFFE, U+FFFF: noncharacters
187187+ - U+xFFFE, U+xFFFF for any plane (0x1FFFE, etc.)
188188+ - U+D800-U+DFFF: surrogate code points *)
189189+let contains_invalid_unicode bytes =
190190+ let len = String.length bytes in
191191+ let i = ref 0 in
192192+ while !i < len do
193193+ let c = Char.code bytes.[!i] in
194194+ if c < 128 then begin
195195+ (* ASCII - OK *)
196196+ incr i
197197+ end else if c >= 0xC0 && c < 0xE0 && !i + 1 < len then begin
198198+ (* 2-byte UTF-8 *)
199199+ let b1 = Char.code bytes.[!i + 1] in
200200+ (* let codepoint = ((c land 0x1F) lsl 6) lor (b1 land 0x3F) in *)
201201+ ignore b1;
202202+ i := !i + 2
203203+ end else if c >= 0xE0 && c < 0xF0 && !i + 2 < len then begin
204204+ (* 3-byte UTF-8 *)
205205+ let b1 = Char.code bytes.[!i + 1] in
206206+ let b2 = Char.code bytes.[!i + 2] in
207207+ let codepoint = ((c land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F) in
208208+ (* Check for surrogates (U+D800-U+DFFF) *)
209209+ if codepoint >= 0xD800 && codepoint <= 0xDFFF then
210210+ raise Exit;
211211+ (* Check for noncharacters in BMP *)
212212+ if codepoint >= 0xFDD0 && codepoint <= 0xFDEF then
213213+ raise Exit;
214214+ if codepoint = 0xFFFE || codepoint = 0xFFFF then
215215+ raise Exit;
216216+ i := !i + 3
217217+ end else if c >= 0xF0 && c < 0xF8 && !i + 3 < len then begin
218218+ (* 4-byte UTF-8 *)
219219+ let b1 = Char.code bytes.[!i + 1] in
220220+ let b2 = Char.code bytes.[!i + 2] in
221221+ let b3 = Char.code bytes.[!i + 3] in
222222+ let codepoint = ((c land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) lor
223223+ ((b2 land 0x3F) lsl 6) lor (b3 land 0x3F) in
224224+ (* Check for noncharacters at end of each plane: U+1FFFE, U+1FFFF, U+2FFFE, etc. *)
225225+ if (codepoint land 0xFFFF) = 0xFFFE || (codepoint land 0xFFFF) = 0xFFFF then
226226+ raise Exit;
227227+ i := !i + 4
228228+ end else begin
229229+ (* Invalid UTF-8 or other - skip *)
230230+ incr i
231231+ end
232232+ done;
233233+ false
234234+235235+(** Check if host contains invalid percent-encoded Unicode. *)
236236+let check_invalid_percent_encoded_unicode host url attr_name element_name =
237237+ try
238238+ let decoded = percent_decode host in
239239+ let _ = contains_invalid_unicode decoded in
240240+ None
241241+ with Exit ->
242242+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host."
243243+ url attr_name element_name)
244244+245245+(** Check if string contains a character (checking both ASCII and UTF-8 fullwidth variants). *)
246246+let contains_percent_char s =
247247+ (* Check for ASCII percent *)
248248+ String.contains s '%' ||
249249+ (* Check for fullwidth percent (U+FF05 = 0xEF 0xBC 0x85 in UTF-8) *)
250250+ try
251251+ let _ = Str.search_forward (Str.regexp "\xef\xbc\x85") s 0 in
252252+ true
253253+ with Not_found -> false
254254+255255+(** Check if decoded host contains forbidden characters.
256256+ Some URLs have percent-encoded fullwidth characters that decode to forbidden chars. *)
257257+let check_decoded_host_chars host url attr_name element_name =
258258+ let decoded = percent_decode host in
259259+ (* Check for % character in decoded host - this catches fullwidth percent signs etc. *)
260260+ if contains_percent_char decoded then
261261+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c%%\xe2\x80\x9d is not allowed."
262262+ url attr_name element_name)
263263+ else
264264+ None
265265+266266+(** Validate port string. Returns error message or None. *)
267267+let validate_port port url attr_name element_name =
268268+ if port = "" then None
269269+ else begin
270270+ (* Check for invalid characters in port *)
271271+ let invalid_char = ref None in
272272+ String.iter (fun c ->
273273+ if !invalid_char = None && not (c >= '0' && c <= '9') then
274274+ invalid_char := Some c
275275+ ) port;
276276+ match !invalid_char with
277277+ | Some c ->
278278+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in port: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed."
279279+ url attr_name element_name c)
280280+ | None ->
281281+ (* Check port range *)
282282+ try
283283+ let port_num = int_of_string port in
284284+ if port_num >= 65536 then
285285+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Port number must be less than 65536."
286286+ url attr_name element_name)
287287+ else
288288+ None
289289+ with _ -> None
290290+ end
291291+292292+(** Validate host string. Returns error message or None. *)
293293+let validate_host host url attr_name element_name scheme =
294294+ if is_ipv6_host host then
295295+ validate_ipv6_host host url attr_name element_name
296296+ else begin
297297+ (* Check for empty host *)
298298+ let requires_host = List.mem scheme special_schemes in
299299+ if host = "" && requires_host && scheme <> "file" then
300300+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: empty host."
301301+ url attr_name element_name)
302302+ else
303303+ (* Check for invalid chars *)
304304+ let invalid_char =
305305+ List.find_opt (fun c -> String.contains host c) invalid_host_chars
306306+ in
307307+ match invalid_char with
308308+ | Some c ->
309309+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed."
310310+ url attr_name element_name c)
311311+ | None ->
312312+ (* Check for | *)
313313+ if String.contains host '|' && not (is_pipe_allowed_in_host url host) then
314314+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c|\xe2\x80\x9d is not allowed."
315315+ url attr_name element_name)
316316+ (* Check for backslash in host *)
317317+ else if String.contains host '\\' then
318318+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c\\\xe2\x80\x9d is not allowed."
319319+ url attr_name element_name)
320320+ (* Check for space in host *)
321321+ else if String.contains host ' ' then
322322+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: space is not allowed."
323323+ url attr_name element_name)
324324+ (* Check for invalid percent-encoded Unicode in host *)
325325+ else begin
326326+ match check_invalid_percent_encoded_unicode host url attr_name element_name with
327327+ | Some err -> Some err
328328+ | None ->
329329+ (* Check decoded host for forbidden chars like fullwidth percent *)
330330+ check_decoded_host_chars host url attr_name element_name
331331+ end
332332+ end
333333+334334+(** Check if URL has special scheme requiring double slash. *)
335335+let check_special_scheme_double_slash url attr_name element_name =
336336+ match extract_scheme url with
337337+ | None -> None
338338+ | Some scheme ->
339339+ (* Only check for schemes that require //, not file: *)
340340+ if List.mem scheme special_schemes_require_double_slash then begin
341341+ (* Check if followed by :// *)
342342+ let colon_pos = String.index url ':' in
343343+ let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
344344+ if String.length after_colon < 2 || after_colon.[0] <> '/' || after_colon.[1] <> '/' then
345345+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Expected a slash (\"/\")."
346346+ url attr_name element_name)
347347+ else
348348+ None
349349+ end else
350350+ None
351351+352352+(** Check for data: URI with fragment - this is a warning (RFC 2397 forbids fragments). *)
353353+let check_data_uri_fragment url attr_name element_name =
354354+ match extract_scheme url with
355355+ | None -> None
356356+ | Some scheme ->
357357+ if scheme = "data" && String.contains url '#' then
358358+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Fragment is not allowed for data: URIs according to RFC 2397."
359359+ url attr_name element_name)
360360+ else
361361+ None
362362+363363+(** data: URLs cannot start with / (they have specific format: data:[mediatype][;base64],data) *)
364364+let data_scheme_no_slash = ["data"]
365365+366366+(** Check for data: URL that incorrectly has a slash (data: URLs have specific format). *)
367367+let check_data_url_no_slash url attr_name element_name =
368368+ match extract_scheme url with
369369+ | None -> None
370370+ | Some scheme ->
371371+ if List.mem scheme data_scheme_no_slash then begin
372372+ let colon_pos = String.index url ':' in
373373+ let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
374374+ (* data: URLs should NOT start with / - format is data:[mediatype][;base64],data *)
375375+ if String.length after_colon > 0 && after_colon.[0] = '/' then
376376+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid %s: URL."
377377+ url attr_name element_name scheme)
378378+ else
379379+ None
380380+ end else
381381+ None
382382+383383+(** Check for illegal characters in scheme data (for non-special schemes). *)
384384+let check_scheme_data url attr_name element_name =
385385+ match extract_scheme url with
386386+ | None -> None
387387+ | Some scheme ->
388388+ if not (List.mem scheme special_schemes) then begin
389389+ (* Get scheme data (after the colon) *)
390390+ let colon_pos = String.index url ':' in
391391+ let scheme_data = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
392392+ (* Check for space in scheme data *)
393393+ if String.contains scheme_data ' ' then
394394+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: space is not allowed."
395395+ url attr_name element_name)
396396+ else
397397+ None
398398+ end else
399399+ None
400400+401401+(** Remove query and fragment from path. *)
402402+let remove_query_fragment path =
403403+ let path = try String.sub path 0 (String.index path '?') with Not_found -> path in
404404+ try String.sub path 0 (String.index path '#') with Not_found -> path
405405+406406+(** Check for illegal characters in path segment. *)
407407+let check_path_segment url attr_name element_name =
408408+ (* Extract path: everything after authority (or after scheme: for non-authority URLs) *)
409409+ let raw_path =
410410+ try
411411+ let double_slash = Str.search_forward (Str.regexp "://") url 0 in
412412+ let after_auth_start = double_slash + 3 in
413413+ let rest = String.sub url after_auth_start (String.length url - after_auth_start) in
414414+ (* Find end of authority *)
415415+ let path_start =
416416+ try String.index rest '/'
417417+ with Not_found -> String.length rest
418418+ in
419419+ if path_start < String.length rest then
420420+ String.sub rest path_start (String.length rest - path_start)
421421+ else
422422+ ""
423423+ with Not_found ->
424424+ (* No double slash - check for single slash path *)
425425+ match extract_scheme url with
426426+ | Some _ ->
427427+ let colon_pos = String.index url ':' in
428428+ let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
429429+ after_colon
430430+ | None ->
431431+ (* Relative URL - the whole thing is the path *)
432432+ url
433433+ in
434434+ (* Remove query and fragment for path-specific checks *)
435435+ let path = remove_query_fragment raw_path in
436436+ (* Check for space in path (not allowed) *)
437437+ if String.contains path ' ' then
438438+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: space is not allowed."
439439+ url attr_name element_name)
440440+ (* Check for pipe in path (not allowed except in file:// authority) *)
441441+ else if String.contains path '|' then
442442+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: \xe2\x80\x9c|\xe2\x80\x9d is not allowed."
443443+ url attr_name element_name)
444444+ (* Check for unescaped square brackets in path *)
445445+ else if String.contains path '[' then
446446+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: \xe2\x80\x9c[\xe2\x80\x9d is not allowed."
447447+ url attr_name element_name)
448448+ else
449449+ None
450450+451451+(** Check for illegal characters in relative URL. *)
452452+let check_relative_url url attr_name element_name =
453453+ (* If URL has no scheme, it's relative *)
454454+ match extract_scheme url with
455455+ | Some _ -> None
456456+ | None ->
457457+ (* Check for square brackets at start (not IPv6 - that requires scheme) *)
458458+ if String.length url > 0 && url.[0] = '[' then
459459+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: \xe2\x80\x9c[\xe2\x80\x9d is not allowed."
460460+ url attr_name element_name)
461461+ else
462462+ None
463463+464464+(** Check if character is a valid hex digit. *)
465465+let is_hex_digit c =
466466+ (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
467467+468468+(** Check for bare percent sign not followed by hex digits. *)
469469+let check_percent_encoding url attr_name element_name =
470470+ let len = String.length url in
471471+ let rec find_bare_percent i =
472472+ if i >= len then None
473473+ else if url.[i] = '%' then begin
474474+ (* Check if followed by two hex digits *)
475475+ if i + 2 < len && is_hex_digit url.[i + 1] && is_hex_digit url.[i + 2] then
476476+ find_bare_percent (i + 3) (* Valid percent encoding, continue *)
477477+ else
478478+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Percentage (\xe2\x80\x9c%%\xe2\x80\x9d) is not followed by two hexadecimal digits."
479479+ url attr_name element_name)
480480+ end else
481481+ find_bare_percent (i + 1)
482482+ in
483483+ find_bare_percent 0
484484+485485+(** Check for illegal characters in query string. *)
486486+let check_query_string url attr_name element_name =
487487+ try
488488+ let query_start = String.index url '?' in
489489+ let fragment_start =
490490+ try Some (String.index_from url query_start '#')
491491+ with Not_found -> None
492492+ in
493493+ let query_end = match fragment_start with
494494+ | Some pos -> pos
495495+ | None -> String.length url
496496+ in
497497+ let query = String.sub url (query_start + 1) (query_end - query_start - 1) in
498498+ (* Check for unescaped space in query *)
499499+ if String.contains query ' ' then
500500+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in query: space is not allowed."
501501+ url attr_name element_name)
502502+ else
503503+ None
504504+ with Not_found -> None (* No query string *)
505505+506506+(** Check for illegal characters in fragment. *)
507507+let check_fragment url attr_name element_name =
508508+ try
509509+ let fragment_start = String.index url '#' in
510510+ let fragment = String.sub url (fragment_start + 1) (String.length url - fragment_start - 1) in
511511+ (* Check for second hash in fragment *)
512512+ if String.contains fragment '#' then
513513+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c#\xe2\x80\x9d is not allowed."
514514+ url attr_name element_name)
515515+ (* Check for space in fragment *)
516516+ else if String.contains fragment ' ' then
517517+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: space is not allowed."
518518+ url attr_name element_name)
519519+ else
520520+ None
521521+ with Not_found -> None (* No fragment *)
522522+523523+(** Characters not allowed in userinfo (user:password) part of URL. *)
524524+let invalid_userinfo_chars = [']'; '['; '^'; '|'; '`'; '<'; '>']
525525+526526+(** Check for illegal characters in userinfo (user:password). *)
527527+let check_userinfo url attr_name element_name =
528528+ try
529529+ (* Look for :// then find the LAST @ before the next / or end *)
530530+ let double_slash = Str.search_forward (Str.regexp "://") url 0 + 3 in
531531+ let rest = String.sub url double_slash (String.length url - double_slash) in
532532+ (* Find first / or ? or # to limit authority section *)
533533+ let auth_end =
534534+ let find_char c = try Some (String.index rest c) with Not_found -> None in
535535+ match find_char '/', find_char '?', find_char '#' with
536536+ | Some a, Some b, Some c -> min a (min b c)
537537+ | Some a, Some b, None -> min a b
538538+ | Some a, None, Some c -> min a c
539539+ | None, Some b, Some c -> min b c
540540+ | Some a, None, None -> a
541541+ | None, Some b, None -> b
542542+ | None, None, Some c -> c
543543+ | None, None, None -> String.length rest
544544+ in
545545+ let authority = String.sub rest 0 auth_end in
546546+ (* Find LAST @ in authority to separate userinfo from host *)
547547+ let at_pos =
548548+ try Some (String.rindex authority '@')
549549+ with Not_found -> None
550550+ in
551551+ match at_pos with
552552+ | None -> None (* No userinfo *)
553553+ | Some at ->
554554+ let userinfo = String.sub authority 0 at in
555555+ (* Check for @ in userinfo (should be percent-encoded) *)
556556+ if String.contains userinfo '@' then
557557+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: User or password contains an at symbol (\xe2\x80\x9c@\xe2\x80\x9d) not percent-encoded."
558558+ url attr_name element_name)
559559+ (* Check for space *)
560560+ else if String.contains userinfo ' ' then
561561+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: space is not allowed."
562562+ url attr_name element_name)
563563+ else
564564+ (* Check for non-ASCII characters (like emoji) *)
565565+ let has_non_ascii = String.exists (fun c -> Char.code c > 127) userinfo in
566566+ if has_non_ascii then
567567+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password."
568568+ url attr_name element_name)
569569+ else
570570+ (* Check for other invalid chars *)
571571+ let invalid = List.find_opt (fun c -> String.contains userinfo c) invalid_userinfo_chars in
572572+ match invalid with
573573+ | Some c ->
574574+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed."
575575+ url attr_name element_name c)
576576+ | None -> None
577577+ with _ -> None
578578+579579+(** Attributes where empty URL is an error.
580580+ Note: href, cite, action can be empty (refers to current document).
581581+ formaction and src must be non-empty though. *)
582582+let must_be_non_empty = ["formaction"; "src"; "poster"; "data"]
583583+584584+(** Element/attribute combinations where empty URL is an error. *)
585585+let must_be_non_empty_combinations = [
586586+ ("link", "href"); (* link href must be non-empty *)
587587+ ("form", "action"); (* form action must be non-empty *)
588588+]
589589+590590+(** Check URL for common errors. Returns error message or None. *)
591591+let validate_url url element_name attr_name =
592592+ let original_url = url in
593593+ let url = String.trim url in
594594+ (* Empty URL check for certain attributes *)
595595+ if url = "" then begin
596596+ let name_lower = String.lowercase_ascii element_name in
597597+ let attr_lower = String.lowercase_ascii attr_name in
598598+ if List.mem attr_lower must_be_non_empty ||
599599+ List.mem (name_lower, attr_lower) must_be_non_empty_combinations then
600600+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Must be non-empty."
601601+ original_url attr_name element_name)
602602+ else
603603+ None
604604+ end
605605+ else begin
606606+ (* Check for leading/trailing whitespace *)
607607+ if original_url <> url && (String.length original_url > 0) then
608608+ let has_leading = String.length original_url > 0 && (original_url.[0] = ' ' || original_url.[0] = '\t') in
609609+ let has_trailing = String.length original_url > 0 &&
610610+ let last = original_url.[String.length original_url - 1] in
611611+ last = ' ' || last = '\t' in
612612+ if has_leading || has_trailing then
613613+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character: leading/trailing ASCII whitespace."
614614+ original_url attr_name element_name)
615615+ else None
616616+ (* Check for newlines/tabs *)
617617+ else if String.contains url '\n' || String.contains url '\r' || String.contains url '\t' then
618618+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Tab, new line or carriage return found."
619619+ url attr_name element_name)
620620+ else begin
621621+ (* Check for relative URL issues first *)
622622+ match check_relative_url url attr_name element_name with
623623+ | Some err -> Some err
624624+ | None ->
625625+626626+ (* Check percent encoding *)
627627+ match check_percent_encoding url attr_name element_name with
628628+ | Some err -> Some err
629629+ | None ->
630630+631631+ (* Check query string *)
632632+ match check_query_string url attr_name element_name with
633633+ | Some err -> Some err
634634+ | None ->
635635+636636+ (* Check fragment *)
637637+ match check_fragment url attr_name element_name with
638638+ | Some err -> Some err
639639+ | None ->
640640+641641+ (* Check userinfo *)
642642+ match check_userinfo url attr_name element_name with
643643+ | Some err -> Some err
644644+ | None ->
645645+646646+ (* Check special scheme requires double slash *)
647647+ match check_special_scheme_double_slash url attr_name element_name with
648648+ | Some err -> Some err
649649+ | None ->
650650+651651+ (* Check data: URLs don't start with slash *)
652652+ match check_data_url_no_slash url attr_name element_name with
653653+ | Some err -> Some err
654654+ | None ->
655655+656656+ (* Check for backslash AFTER special scheme check *)
657657+ if String.contains url '\\' then
658658+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Backslash (\"\\\") used as path segment delimiter."
659659+ url attr_name element_name)
660660+ else
661661+662662+ (* Check scheme data for non-special schemes *)
663663+ match check_scheme_data url attr_name element_name with
664664+ | Some err -> Some err
665665+ | None ->
666666+667667+ (* Check path segment for illegal characters *)
668668+ match check_path_segment url attr_name element_name with
669669+ | Some err -> Some err
670670+ | None ->
671671+672672+ let scheme = extract_scheme url in
673673+ let (host_opt, port_opt) = extract_host_and_port url in
674674+ let scheme_str = match scheme with Some s -> s | None -> "" in
675675+676676+ (* Validate port if present *)
677677+ match port_opt with
678678+ | Some port ->
679679+ (match validate_port port url attr_name element_name with
680680+ | Some err -> Some err
681681+ | None ->
682682+ (* Also validate host *)
683683+ match host_opt with
684684+ | Some host -> validate_host host url attr_name element_name scheme_str
685685+ | None -> None)
686686+ | None ->
687687+ (* Just validate host *)
688688+ match host_opt with
689689+ | Some host -> validate_host host url attr_name element_name scheme_str
690690+ | None -> None
691691+ end
692692+ end
693693+694694+(** Checker state. *)
695695+type state = unit
696696+697697+let create () = ()
698698+let reset _state = ()
699699+700700+(** Get attribute value by name. *)
701701+let get_attr_value name attrs =
702702+ List.find_map (fun (k, v) ->
703703+ if String.lowercase_ascii k = String.lowercase_ascii name then Some v else None
704704+ ) attrs
705705+706706+let start_element _state ~name ~namespace ~attrs collector =
707707+ if namespace <> None then ()
708708+ else begin
709709+ let name_lower = String.lowercase_ascii name in
710710+ match List.assoc_opt name_lower url_attributes with
711711+ | None -> ()
712712+ | Some url_attrs ->
713713+ List.iter (fun attr_name ->
714714+ (* Try to find the attribute - case insensitive *)
715715+ let url_opt = get_attr_value attr_name attrs in
716716+ match url_opt with
717717+ | None -> ()
718718+ | Some url ->
719719+ (* Check for data: URI with fragment - emit warning *)
720720+ (match check_data_uri_fragment url attr_name name with
721721+ | Some warn_msg ->
722722+ Message_collector.add_warning collector
723723+ ~message:warn_msg
724724+ ~code:"data-uri-fragment"
725725+ ~element:name
726726+ ~attribute:attr_name
727727+ ()
728728+ | None -> ());
729729+ match validate_url url name attr_name with
730730+ | None -> ()
731731+ | Some error_msg ->
732732+ Message_collector.add_error collector
733733+ ~message:error_msg
734734+ ~code:"bad-url"
735735+ ~element:name
736736+ ~attribute:attr_name
737737+ ()
738738+ ) url_attrs;
739739+ (* Special handling for input[type=url] value attribute - must be absolute URL *)
740740+ if name_lower = "input" then begin
741741+ let type_attr = get_attr_value "type" attrs in
742742+ if type_attr = Some "url" then begin
743743+ match get_attr_value "value" attrs with
744744+ | None -> ()
745745+ | Some url ->
746746+ let url = String.trim url in
747747+ if url = "" then ()
748748+ else begin
749749+ (* First check if it's an absolute URL (has a scheme) *)
750750+ let scheme = extract_scheme url in
751751+ match scheme with
752752+ | None ->
753753+ (* Not an absolute URL *)
754754+ Message_collector.add_error collector
755755+ ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9cvalue\xe2\x80\x9d on element \xe2\x80\x9cinput\xe2\x80\x9d: Bad absolute URL: The string \xe2\x80\x9c%s\xe2\x80\x9d is not an absolute URL."
756756+ url url)
757757+ ~code:"bad-url"
758758+ ~element:name
759759+ ~attribute:"value"
760760+ ()
761761+ | Some _ ->
762762+ (* Has a scheme - do regular URL validation with "absolute URL" prefix *)
763763+ match validate_url url name "value" with
764764+ | None -> ()
765765+ | Some error_msg ->
766766+ (* Replace "Bad URL:" with "Bad absolute URL:" for input[type=url] *)
767767+ let error_msg = Str.global_replace (Str.regexp "Bad URL:") "Bad absolute URL:" error_msg in
768768+ Message_collector.add_error collector
769769+ ~message:error_msg
770770+ ~code:"bad-url"
771771+ ~element:name
772772+ ~attribute:"value"
773773+ ()
774774+ end
775775+ end
776776+ end
777777+ end
778778+779779+let end_element _state ~name:_ ~namespace:_ _collector = ()
780780+let characters _state _text _collector = ()
781781+let end_document _state _collector = ()
782782+783783+let checker =
784784+ (module struct
785785+ type nonrec state = state
786786+ let create = create
787787+ let reset = reset
788788+ let start_element = start_element
789789+ let end_element = end_element
790790+ let characters = characters
791791+ let end_document = end_document
792792+ end : Checker.S)
+14-14
lib/html5rw/parser/parser_tree_builder.ml
···854854 | Token.Tag { kind = Token.Start; name; attrs; _ }
855855 when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"] ->
856856 ignore (insert_element t name attrs)
857857- | Token.Tag { kind = Token.Start; name = "title"; _ } ->
858858- ignore (insert_element_for_token t { kind = Token.Start; name = "title"; attrs = []; self_closing = false });
857857+ | Token.Tag { kind = Token.Start; name = "title"; attrs; self_closing } ->
858858+ ignore (insert_element_for_token t { kind = Token.Start; name = "title"; attrs; self_closing });
859859 t.original_mode <- Some t.mode;
860860 t.mode <- Parser_insertion_mode.Text
861861- | Token.Tag { kind = Token.Start; name; _ }
861861+ | Token.Tag { kind = Token.Start; name; attrs; self_closing }
862862 when List.mem name ["noframes"; "style"] ->
863863- ignore (insert_element_for_token t { kind = Token.Start; name; attrs = []; self_closing = false });
863863+ ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing });
864864 t.original_mode <- Some t.mode;
865865 t.mode <- Parser_insertion_mode.Text
866866- | Token.Tag { kind = Token.Start; name = "noscript"; _ } ->
866866+ | Token.Tag { kind = Token.Start; name = "noscript"; attrs; self_closing } ->
867867 (* Scripting is disabled: parse noscript content as HTML *)
868868- ignore (insert_element_for_token t { kind = Token.Start; name = "noscript"; attrs = []; self_closing = false });
868868+ ignore (insert_element_for_token t { kind = Token.Start; name = "noscript"; attrs; self_closing });
869869 t.mode <- Parser_insertion_mode.In_head_noscript
870870 | Token.Tag { kind = Token.Start; name = "script"; attrs; self_closing } ->
871871 ignore (insert_element_for_token t { kind = Token.Start; name = "script"; attrs; self_closing });
···13401340 String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden"
13411341 ) attrs in
13421342 if not is_hidden then t.frameset_ok <- false
13431343- | Token.Tag { kind = Token.Start; name; _ }
13431343+ | Token.Tag { kind = Token.Start; name; attrs; _ }
13441344 when List.mem name ["param"; "source"; "track"] ->
13451345- ignore (insert_element_for_token t { kind = Token.Start; name; attrs = []; self_closing = false });
13451345+ ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing = false });
13461346 pop_current t
13471347 | Token.Tag { kind = Token.Start; name = "hr"; _ } ->
13481348 if has_element_in_button_scope t "p" then close_p_element t;
···13621362 t.original_mode <- Some t.mode;
13631363 t.frameset_ok <- false;
13641364 t.mode <- Parser_insertion_mode.Text
13651365- | Token.Tag { kind = Token.Start; name = "xmp"; _ } ->
13651365+ | Token.Tag { kind = Token.Start; name = "xmp"; attrs; _ } ->
13661366 if has_element_in_button_scope t "p" then close_p_element t;
13671367 reconstruct_active_formatting t;
13681368 t.frameset_ok <- false;
13691369- ignore (insert_element_for_token t { kind = Token.Start; name = "xmp"; attrs = []; self_closing = false });
13691369+ ignore (insert_element_for_token t { kind = Token.Start; name = "xmp"; attrs; self_closing = false });
13701370 t.original_mode <- Some t.mode;
13711371 t.mode <- Parser_insertion_mode.Text
13721372- | Token.Tag { kind = Token.Start; name = "iframe"; _ } ->
13721372+ | Token.Tag { kind = Token.Start; name = "iframe"; attrs; _ } ->
13731373 t.frameset_ok <- false;
13741374- ignore (insert_element_for_token t { kind = Token.Start; name = "iframe"; attrs = []; self_closing = false });
13741374+ ignore (insert_element_for_token t { kind = Token.Start; name = "iframe"; attrs; self_closing = false });
13751375 t.original_mode <- Some t.mode;
13761376 t.mode <- Parser_insertion_mode.Text
13771377- | Token.Tag { kind = Token.Start; name = "noembed"; _ } ->
13781378- ignore (insert_element_for_token t { kind = Token.Start; name = "noembed"; attrs = []; self_closing = false });
13771377+ | Token.Tag { kind = Token.Start; name = "noembed"; attrs; _ } ->
13781378+ ignore (insert_element_for_token t { kind = Token.Start; name = "noembed"; attrs; self_closing = false });
13791379 t.original_mode <- Some t.mode;
13801380 t.mode <- Parser_insertion_mode.Text
13811381 | Token.Tag { kind = Token.Start; name = "select"; attrs; _ } ->
+62
test/analyze_failures.ml
···11+(* Quick analysis: find failing test files and print their content *)
22+33+let tests_dir = "validator/tests"
44+55+type expected_outcome = Valid | Invalid | HasWarning | Unknown
66+77+let parse_outcome filename =
88+ (* Check .html *)
99+ if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-isvalid.html" then Valid
1010+ else if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-novalid.html" then Invalid
1111+ else if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-haswarn.html" then HasWarning
1212+ (* Check .xhtml *)
1313+ else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-isvalid.xhtml" then Valid
1414+ else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-novalid.xhtml" then Invalid
1515+ else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-haswarn.xhtml" then HasWarning
1616+ else Unknown
1717+1818+let rec find_files dir =
1919+ let entries = Sys.readdir dir |> Array.to_list in
2020+ List.concat_map (fun entry ->
2121+ let path = Filename.concat dir entry in
2222+ if Sys.is_directory path then find_files path
2323+ else if parse_outcome (Filename.basename path) <> Unknown then [path]
2424+ else []
2525+ ) entries
2626+2727+let () =
2828+ let mode = if Array.length Sys.argv > 1 then Sys.argv.(1) else "novalid" in
2929+ let files = find_files tests_dir in
3030+ let count = ref 0 in
3131+3232+ List.iter (fun path ->
3333+ let outcome = parse_outcome (Filename.basename path) in
3434+ let ic = open_in path in
3535+ let content = really_input_string ic (in_channel_length ic) in
3636+ close_in ic;
3737+3838+ let reader = Bytesrw.Bytes.Reader.of_string content in
3939+ let result = Html5_checker.check ~collect_parse_errors:true reader in
4040+ let errors = Html5_checker.errors result in
4141+ let warnings = Html5_checker.warnings result in
4242+4343+ let should_print = match mode with
4444+ | "isvalid" -> outcome = Valid && (errors <> [] || warnings <> []) && !count < 60
4545+ | _ -> outcome = Invalid && errors = [] && !count < 60
4646+ in
4747+ if should_print then begin
4848+ Printf.printf "\n=== %s ===\n" path;
4949+ if mode = "isvalid" then begin
5050+ if errors <> [] then begin
5151+ Printf.printf "ERRORS:\n";
5252+ List.iter (fun e -> Printf.printf " %s\n" e.Html5_checker.Message.message) errors
5353+ end;
5454+ if warnings <> [] then begin
5555+ Printf.printf "WARNINGS:\n";
5656+ List.iter (fun w -> Printf.printf " %s\n" w.Html5_checker.Message.message) warnings
5757+ end
5858+ end;
5959+ print_endline content;
6060+ incr count
6161+ end
6262+ ) files
+41
test/debug_validator.ml
···11+(** Debug utility for testing individual HTML files against the validator *)
22+33+let () =
44+ if Array.length Sys.argv < 2 then begin
55+ Printf.printf "Usage: debug_validator <html-file>\n";
66+ exit 1
77+ end;
88+99+ let path = Sys.argv.(1) in
1010+ let ic = open_in path in
1111+ let content = really_input_string ic (in_channel_length ic) in
1212+ close_in ic;
1313+1414+ Printf.printf "=== Checking: %s ===\n\n" path;
1515+ Printf.printf "Input (%d bytes):\n%s\n\n" (String.length content) content;
1616+1717+ let reader = Bytesrw.Bytes.Reader.of_string content in
1818+ let result = Html5_checker.check ~collect_parse_errors:true ~system_id:path reader in
1919+2020+ let errors = Html5_checker.errors result in
2121+ let warnings = Html5_checker.warnings result in
2222+2323+ Printf.printf "=== Results ===\n";
2424+ Printf.printf "Errors: %d\n" (List.length errors);
2525+ List.iter (fun msg ->
2626+ Printf.printf " [ERROR] %s\n" msg.Html5_checker.Message.message;
2727+ (match msg.Html5_checker.Message.location with
2828+ | Some loc -> Printf.printf " at line %d, col %d\n" loc.line loc.column
2929+ | None -> ())
3030+ ) errors;
3131+3232+ Printf.printf "Warnings: %d\n" (List.length warnings);
3333+ List.iter (fun msg ->
3434+ Printf.printf " [WARN] %s\n" msg.Html5_checker.Message.message;
3535+ (match msg.Html5_checker.Message.location with
3636+ | Some loc -> Printf.printf " at line %d, col %d\n" loc.line loc.column
3737+ | None -> ())
3838+ ) warnings;
3939+4040+ Printf.printf "\n=== Formatted Output ===\n";
4141+ Printf.printf "%s\n" (Html5_checker.format_text result)