···8899 val start_element :
1010 state ->
1111- name:string ->
1212- namespace:string option ->
1313- attrs:(string * string) list ->
1111+ element:Element.t ->
1412 Message_collector.t ->
1513 unit
16141715 val end_element :
1818- state -> name:string -> namespace:string option -> Message_collector.t -> unit
1616+ state ->
1717+ tag:Tag.element_tag ->
1818+ Message_collector.t ->
1919+ unit
19202021 val characters : state -> string -> Message_collector.t -> unit
2122 val end_document : state -> Message_collector.t -> unit
···3031 let create () = ()
3132 let reset () = ()
32333333- let start_element () ~name:_ ~namespace:_ ~attrs:_ _ = ()
3434- let end_element () ~name:_ ~namespace:_ _ = ()
3434+ let start_element () ~element:_ _ = ()
3535+ let end_element () ~tag:_ _ = ()
3536 let characters () _ _ = ()
3637 let end_document () _ = ()
3738end
+9-12
lib/htmlrw_check/checker.mli
···87878888 val start_element :
8989 state ->
9090- name:string ->
9191- namespace:string option ->
9292- attrs:(string * string) list ->
9090+ element:Element.t ->
9391 Message_collector.t ->
9492 unit
9595- (** [start_element state ~name ~namespace ~attrs collector] is called when
9393+ (** [start_element state ~element collector] is called when
9694 entering an element during DOM traversal.
97959896 @param state The checker state
9999- @param name The element tag name (e.g., "div", "p", "span")
100100- @param namespace The element namespace ([None] for HTML, [Some "svg"]
101101- for SVG, [Some "mathml"] for MathML)
102102- @param attrs The element's attributes as [(name, value)] pairs
9797+ @param element The typed element (includes tag, typed attrs, and raw attrs)
10398 @param collector The message collector for emitting validation messages
10499105100 This is where checkers can validate:
···109104 - Whether the element opens a new validation context *)
110105111106 val end_element :
112112- state -> name:string -> namespace:string option -> Message_collector.t -> unit
113113- (** [end_element state ~name ~namespace collector] is called when exiting
107107+ state ->
108108+ tag:Tag.element_tag ->
109109+ Message_collector.t ->
110110+ unit
111111+ (** [end_element state ~tag collector] is called when exiting
114112 an element during DOM traversal.
115113116114 @param state The checker state
117117- @param name The element tag name
118118- @param namespace The element namespace
115115+ @param tag The element tag
119116 @param collector The message collector for emitting validation messages
120117121118 This is where checkers can:
+46-16
lib/htmlrw_check/content_model/content_checker.ml
···22 name : string;
33 spec : Element_spec.t;
44 children_count : int;
55+ is_foreign : bool; (* SVG or MathML element *)
56}
6778type state = {
···9293 Message_collector.add_typed collector
9394 (`Element (`Not_allowed_as_child (`Child child_name, `Parent parent.name)))
94959595-let start_element state ~name ~namespace:_ ~attrs:_ collector =
9696- (* Look up element specification *)
9797- let spec_opt = Element_registry.get state.registry name in
9696+let start_element state ~element collector =
9797+ let name = Tag.tag_to_string element.Element.tag in
9898+9999+ (* Check if we're inside a foreign (SVG/MathML) context *)
100100+ let in_foreign_context = match state.ancestor_stack with
101101+ | ctx :: _ -> ctx.is_foreign
102102+ | [] -> false
103103+ in
104104+105105+ (* Determine if this element is foreign content *)
106106+ let is_foreign = match element.Element.tag with
107107+ | Tag.Svg _ | Tag.MathML _ -> true
108108+ | _ -> in_foreign_context (* Inherit from parent if inside foreign content *)
109109+ in
110110+111111+ (* If entering foreign content from HTML, SVG/MathML are valid embedded content *)
112112+ (* If already in foreign content, skip HTML content model checks *)
113113+ if is_foreign && not in_foreign_context then begin
114114+ (* Entering SVG/MathML from HTML - just track it, it's valid embedded content *)
115115+ let spec = Element_spec.make ~name ~content_model:(Content_model.Categories [Content_category.Flow]) () in
116116+ let context = { name; spec; children_count = 0; is_foreign = true } in
117117+ state.ancestor_stack <- context :: state.ancestor_stack
118118+ end else if is_foreign then begin
119119+ (* Inside SVG/MathML - just track nesting, don't validate against HTML *)
120120+ let spec = Element_spec.make ~name ~content_model:(Content_model.Categories [Content_category.Flow]) () in
121121+ let context = { name; spec; children_count = 0; is_foreign = true } in
122122+ state.ancestor_stack <- context :: state.ancestor_stack
123123+ end else begin
124124+ (* HTML element - do normal validation *)
125125+ let spec_opt = Element_registry.get state.registry name in
981269999- match spec_opt with
100100- | None ->
101101- (* Unknown element - first check if it's allowed in current context *)
102102- validate_child_element state name collector
103103- | Some spec ->
104104- (* Check prohibited ancestors *)
105105- check_prohibited_ancestors state name spec collector;
127127+ match spec_opt with
128128+ | None ->
129129+ (* Unknown element - first check if it's allowed in current context *)
130130+ validate_child_element state name collector
131131+ | Some spec ->
132132+ (* Check prohibited ancestors *)
133133+ check_prohibited_ancestors state name spec collector;
106134107107- (* Validate this element is allowed as child of parent *)
108108- validate_child_element state name collector;
135135+ (* Validate this element is allowed as child of parent *)
136136+ validate_child_element state name collector;
109137110110- (* Push element context onto stack *)
111111- let context = { name; spec; children_count = 0 } in
112112- state.ancestor_stack <- context :: state.ancestor_stack
138138+ (* Push element context onto stack *)
139139+ let context = { name; spec; children_count = 0; is_foreign = false } in
140140+ state.ancestor_stack <- context :: state.ancestor_stack
141141+ end
113142114114-let end_element state ~name ~namespace:_ collector =
143143+let end_element state ~tag collector =
144144+ let name = Tag.tag_to_string tag in
115145 match state.ancestor_stack with
116146 | [] ->
117147 (* Unmatched closing tag *)
+14-22
lib/htmlrw_check/dom_walker.ml
···15151616(** Package a checker with its state for traversal. *)
1717type checker_state = {
1818- start_element :
1919- name:string ->
2020- namespace:string option ->
2121- attrs:(string * string) list ->
2222- Message_collector.t ->
2323- unit;
2424- end_element :
2525- name:string -> namespace:string option -> Message_collector.t -> unit;
1818+ start_element : element:Element.t -> Message_collector.t -> unit;
1919+ end_element : tag:Tag.element_tag -> Message_collector.t -> unit;
2620 characters : string -> Message_collector.t -> unit;
2721 end_document : Message_collector.t -> unit;
2822}
···3125let make_checker_state (module C : Checker.S) =
3226 let state = C.create () in
3327 {
3434- start_element = (fun ~name ~namespace ~attrs collector ->
3535- C.start_element state ~name ~namespace ~attrs collector);
3636- end_element = (fun ~name ~namespace collector ->
3737- C.end_element state ~name ~namespace collector);
2828+ start_element = (fun ~element collector ->
2929+ C.start_element state ~element collector);
3030+ end_element = (fun ~tag collector ->
3131+ C.end_element state ~tag collector);
3832 characters = (fun text collector ->
3933 C.characters state text collector);
4034 end_document = (fun collector ->
···6054 (* Doctype node: skip (no validation events for doctype) *)
6155 ()
6256 | _ ->
6363- (* Element node: emit start, traverse children, emit end *)
6464- cs.start_element ~name:node.name ~namespace:node.namespace ~attrs:node.attrs collector;
5757+ (* Element node: create typed element, emit start, traverse children, emit end *)
5858+ let element = Element.create ~name:node.name ~namespace:node.namespace ~attrs:node.attrs in
5959+ cs.start_element ~element collector;
6560 List.iter (walk_node_single cs collector) node.children;
6666- cs.end_element ~name:node.name ~namespace:node.namespace collector
6161+ cs.end_element ~tag:element.tag collector
67626863let walk checker collector node =
6964 let cs = make_checker_state checker in
···8984 (* Doctype node: skip *)
9085 ()
9186 | _ ->
9292- (* Element node: emit start to all checkers, traverse children, emit end to all *)
9393- List.iter (fun cs ->
9494- cs.start_element ~name:node.name ~namespace:node.namespace ~attrs:node.attrs collector
9595- ) css;
8787+ (* Element node: create typed element, emit start to all checkers, traverse children, emit end to all *)
8888+ let element = Element.create ~name:node.name ~namespace:node.namespace ~attrs:node.attrs in
8989+ List.iter (fun cs -> cs.start_element ~element collector) css;
9690 List.iter (walk_node_all css collector) node.children;
9797- List.iter (fun cs ->
9898- cs.end_element ~name:node.name ~namespace:node.namespace collector
9999- ) css
9191+ List.iter (fun cs -> cs.end_element ~tag:element.tag collector) css
1009210193let walk_all checkers collector node =
10294 (* Create checker state packages *)
+873
lib/htmlrw_check/element/attr.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Typed HTML5 attribute representations using polymorphic variants.
77+88+ This module provides typed representations for HTML attributes with
99+ proper value types for enumerated attributes. *)
1010+1111+(** {1 Attribute Value Types} *)
1212+1313+(** Direction attribute values *)
1414+type dir_value = [ `Ltr | `Rtl | `Auto ]
1515+1616+(** Hidden attribute values *)
1717+type hidden_value = [ `Hidden | `Until_found ]
1818+1919+(** Popover attribute values *)
2020+type popover_value = [ `Auto | `Manual | `Hint ]
2121+2222+(** Link target values *)
2323+type target_value = [ `Self | `Blank | `Parent | `Top | `Named of string ]
2424+2525+(** Loading behavior values *)
2626+type loading_value = [ `Eager | `Lazy ]
2727+2828+(** Decoding hint values *)
2929+type decoding_value = [ `Sync | `Async | `Auto ]
3030+3131+(** Fetch priority values *)
3232+type fetchpriority_value = [ `High | `Low | `Auto ]
3333+3434+(** CORS settings values *)
3535+type crossorigin_value = [ `Anonymous | `Use_credentials ]
3636+3737+(** Preload hint values *)
3838+type preload_value = [ `None | `Metadata | `Auto ]
3939+4040+(** Form method values *)
4141+type method_value = [ `Get | `Post | `Dialog ]
4242+4343+(** Form enctype values *)
4444+type enctype_value = [ `Urlencoded | `Multipart | `Plain ]
4545+4646+(** Textarea wrap values *)
4747+type wrap_value = [ `Soft | `Hard ]
4848+4949+(** Table scope values *)
5050+type scope_value = [ `Row | `Col | `Rowgroup | `Colgroup ]
5151+5252+(** Input type values *)
5353+type input_type = [
5454+ | `Hidden | `Text | `Search | `Tel | `Url | `Email | `Password
5555+ | `Date | `Month | `Week | `Time | `Datetime_local | `Number
5656+ | `Range | `Color | `Checkbox | `Radio | `File | `Submit
5757+ | `Image | `Reset | `Button
5858+]
5959+6060+(** Button type values *)
6161+type button_type = [ `Submit | `Reset | `Button ]
6262+6363+(** Referrer policy values *)
6464+type referrerpolicy_value = [
6565+ | `No_referrer | `No_referrer_when_downgrade | `Origin
6666+ | `Origin_when_cross_origin | `Same_origin | `Strict_origin
6767+ | `Strict_origin_when_cross_origin | `Unsafe_url
6868+]
6969+7070+(** Sandbox flag values *)
7171+type sandbox_flag = [
7272+ | `Allow_downloads | `Allow_forms | `Allow_modals | `Allow_orientation_lock
7373+ | `Allow_pointer_lock | `Allow_popups | `Allow_popups_to_escape_sandbox
7474+ | `Allow_presentation | `Allow_same_origin | `Allow_scripts
7575+ | `Allow_top_navigation | `Allow_top_navigation_by_user_activation
7676+ | `Allow_top_navigation_to_custom_protocols
7777+]
7878+7979+(** Enter key hint values *)
8080+type enterkeyhint_value = [
8181+ | `Enter | `Done | `Go | `Next | `Previous | `Search | `Send
8282+]
8383+8484+(** Input mode values *)
8585+type inputmode_value = [
8686+ | `None | `Text | `Decimal | `Numeric | `Tel | `Search | `Email | `Url
8787+]
8888+8989+(** Content editable values *)
9090+type contenteditable_value = [ `True | `False | `Plaintext_only ]
9191+9292+(** Autocapitalize values *)
9393+type autocapitalize_value = [
9494+ | `Off | `None | `On | `Sentences | `Words | `Characters
9595+]
9696+9797+(** Image shape values *)
9898+type shape_value = [ `Rect | `Circle | `Poly | `Default ]
9999+100100+(** Capture values *)
101101+type capture_value = [ `User | `Environment ]
102102+103103+(** List type values *)
104104+type list_type_value = [
105105+ | `Decimal | `Lower_alpha | `Upper_alpha | `Lower_roman | `Upper_roman
106106+]
107107+108108+(** Track kind values *)
109109+type kind_value = [
110110+ | `Subtitles | `Captions | `Descriptions | `Chapters | `Metadata
111111+]
112112+113113+(** {1 Typed Attribute Variant} *)
114114+115115+(** Typed attribute representation *)
116116+type t = [
117117+ (* Global attributes *)
118118+ | `Id of string
119119+ | `Class of string
120120+ | `Style of string
121121+ | `Title of string
122122+ | `Lang of string
123123+ | `Dir of dir_value
124124+ | `Hidden of hidden_value option (* None = just "hidden" *)
125125+ | `Tabindex of int
126126+ | `Accesskey of string
127127+ | `Autocapitalize of autocapitalize_value
128128+ | `Autofocus
129129+ | `Contenteditable of contenteditable_value option
130130+ | `Draggable of bool
131131+ | `Enterkeyhint of enterkeyhint_value
132132+ | `Inert
133133+ | `Inputmode of inputmode_value
134134+ | `Is of string
135135+ | `Nonce of string
136136+ | `Popover of popover_value option
137137+ | `Slot of string
138138+ | `Spellcheck of bool option
139139+ | `Translate of bool
140140+ | `Exportparts of string
141141+ | `Part of string
142142+143143+ (* Microdata *)
144144+ | `Itemscope
145145+ | `Itemtype of string
146146+ | `Itemprop of string
147147+ | `Itemid of string
148148+ | `Itemref of string
149149+150150+ (* ARIA *)
151151+ | `Role of string
152152+ | `Aria of string * string (* aria-* -> (name, value) *)
153153+154154+ (* Event handlers *)
155155+ | `Event of string * string (* onclick -> ("click", handler) *)
156156+157157+ (* Link/navigation attributes *)
158158+ | `Href of string
159159+ | `Target of target_value
160160+ | `Rel of string
161161+ | `Download of string option
162162+ | `Hreflang of string
163163+ | `Ping of string
164164+ | `Referrerpolicy of referrerpolicy_value
165165+ | `Type_link of string
166166+167167+ (* Media/resource attributes *)
168168+ | `Src of string
169169+ | `Srcset of string
170170+ | `Sizes of string
171171+ | `Alt of string
172172+ | `Width of string
173173+ | `Height of string
174174+ | `Loading of loading_value
175175+ | `Decoding of decoding_value
176176+ | `Fetchpriority of fetchpriority_value
177177+ | `Crossorigin of crossorigin_value option
178178+ | `Ismap
179179+ | `Usemap of string
180180+ | `Media of string
181181+182182+ (* Audio/Video specific *)
183183+ | `Controls
184184+ | `Autoplay
185185+ | `Loop
186186+ | `Muted
187187+ | `Preload of preload_value
188188+ | `Poster of string
189189+ | `Playsinline
190190+191191+ (* Image map *)
192192+ | `Coords of string
193193+ | `Shape of shape_value
194194+195195+ (* iframe *)
196196+ | `Sandbox of sandbox_flag list option
197197+ | `Allow of string
198198+ | `Allowfullscreen
199199+ | `Srcdoc of string
200200+ | `Csp of string
201201+202202+ (* Form attributes *)
203203+ | `Action of string
204204+ | `Method of method_value
205205+ | `Enctype of enctype_value
206206+ | `Novalidate
207207+ | `Accept_charset of string
208208+ | `Autocomplete of string
209209+ | `Name of string
210210+ | `Form of string
211211+212212+ (* Form control attributes *)
213213+ | `Value of string
214214+ | `Type_input of input_type
215215+ | `Type_button of button_type
216216+ | `Disabled
217217+ | `Readonly
218218+ | `Required
219219+ | `Checked
220220+ | `Selected
221221+ | `Multiple
222222+ | `Placeholder of string
223223+ | `Min of string
224224+ | `Max of string
225225+ | `Step of string
226226+ | `Minlength of int
227227+ | `Maxlength of int
228228+ | `Pattern of string
229229+ | `Size of int
230230+ | `Cols of int
231231+ | `Rows of int
232232+ | `Wrap of wrap_value
233233+ | `Accept of string
234234+ | `Capture of capture_value
235235+ | `Dirname of string
236236+ | `For of string
237237+ | `List of string
238238+239239+ (* Form submission attributes *)
240240+ | `Formaction of string
241241+ | `Formmethod of method_value
242242+ | `Formenctype of enctype_value
243243+ | `Formnovalidate
244244+ | `Formtarget of target_value
245245+246246+ (* Table attributes *)
247247+ | `Colspan of int
248248+ | `Rowspan of int
249249+ | `Headers of string
250250+ | `Scope of scope_value
251251+ | `Span of int
252252+253253+ (* Details/Dialog *)
254254+ | `Open
255255+256256+ (* Script *)
257257+ | `Async
258258+ | `Defer
259259+ | `Integrity of string
260260+ | `Nomodule
261261+ | `Blocking of string
262262+ | `Type_script of string
263263+264264+ (* Meta *)
265265+ | `Charset of string
266266+ | `Content of string
267267+ | `Http_equiv of string
268268+269269+ (* Link element *)
270270+ | `As of string
271271+ | `Imagesizes of string
272272+ | `Imagesrcset of string
273273+274274+ (* Object/Embed *)
275275+ | `Data_object of string
276276+277277+ (* Output *)
278278+ | `For_output of string
279279+280280+ (* Meter/Progress *)
281281+ | `Low of float
282282+ | `High of float
283283+ | `Optimum of float
284284+285285+ (* Time *)
286286+ | `Datetime of string
287287+288288+ (* Ol *)
289289+ | `Start of int
290290+ | `Reversed
291291+ | `Type_list of list_type_value
292292+293293+ (* Track *)
294294+ | `Kind of kind_value
295295+ | `Srclang of string
296296+ | `Default
297297+298298+ (* Td/Th *)
299299+ | `Abbr of string
300300+301301+ (* Data attributes *)
302302+ | `Data_attr of string * string
303303+304304+ (* RDFa *)
305305+ | `Property of string
306306+ | `Typeof of string
307307+ | `Resource of string
308308+ | `Prefix of string
309309+ | `Vocab of string
310310+ | `About of string
311311+ | `Datatype of string
312312+ | `Inlist
313313+ | `Rev of string
314314+315315+ (* Escape hatch *)
316316+ | `Unknown_attr of string * string
317317+]
318318+319319+(** {1 Parsing Functions} *)
320320+321321+(** Parse dir value *)
322322+let parse_dir = function
323323+ | "ltr" -> Some `Ltr
324324+ | "rtl" -> Some `Rtl
325325+ | "auto" -> Some `Auto
326326+ | _ -> None
327327+328328+(** Parse target value *)
329329+let parse_target = function
330330+ | "_self" -> `Self
331331+ | "_blank" -> `Blank
332332+ | "_parent" -> `Parent
333333+ | "_top" -> `Top
334334+ | s -> `Named s
335335+336336+(** Parse loading value *)
337337+let parse_loading = function
338338+ | "eager" -> Some `Eager
339339+ | "lazy" -> Some `Lazy
340340+ | _ -> None
341341+342342+(** Parse decoding value *)
343343+let parse_decoding = function
344344+ | "sync" -> Some `Sync
345345+ | "async" -> Some `Async
346346+ | "auto" -> Some `Auto
347347+ | _ -> None
348348+349349+(** Parse fetchpriority value *)
350350+let parse_fetchpriority = function
351351+ | "high" -> Some `High
352352+ | "low" -> Some `Low
353353+ | "auto" -> Some `Auto
354354+ | _ -> None
355355+356356+(** Parse crossorigin value *)
357357+let parse_crossorigin = function
358358+ | "anonymous" | "" -> Some `Anonymous
359359+ | "use-credentials" -> Some `Use_credentials
360360+ | _ -> None
361361+362362+(** Parse preload value *)
363363+let parse_preload = function
364364+ | "none" -> Some `None
365365+ | "metadata" -> Some `Metadata
366366+ | "auto" | "" -> Some `Auto
367367+ | _ -> None
368368+369369+(** Parse method value *)
370370+let parse_method = function
371371+ | "get" -> Some `Get
372372+ | "post" -> Some `Post
373373+ | "dialog" -> Some `Dialog
374374+ | _ -> None
375375+376376+(** Parse enctype value *)
377377+let parse_enctype = function
378378+ | "application/x-www-form-urlencoded" -> Some `Urlencoded
379379+ | "multipart/form-data" -> Some `Multipart
380380+ | "text/plain" -> Some `Plain
381381+ | _ -> None
382382+383383+(** Parse wrap value *)
384384+let parse_wrap = function
385385+ | "soft" -> Some `Soft
386386+ | "hard" -> Some `Hard
387387+ | _ -> None
388388+389389+(** Parse scope value *)
390390+let parse_scope = function
391391+ | "row" -> Some `Row
392392+ | "col" -> Some `Col
393393+ | "rowgroup" -> Some `Rowgroup
394394+ | "colgroup" -> Some `Colgroup
395395+ | _ -> None
396396+397397+(** Parse input type value *)
398398+let parse_input_type = function
399399+ | "hidden" -> Some `Hidden
400400+ | "text" -> Some `Text
401401+ | "search" -> Some `Search
402402+ | "tel" -> Some `Tel
403403+ | "url" -> Some `Url
404404+ | "email" -> Some `Email
405405+ | "password" -> Some `Password
406406+ | "date" -> Some `Date
407407+ | "month" -> Some `Month
408408+ | "week" -> Some `Week
409409+ | "time" -> Some `Time
410410+ | "datetime-local" -> Some `Datetime_local
411411+ | "number" -> Some `Number
412412+ | "range" -> Some `Range
413413+ | "color" -> Some `Color
414414+ | "checkbox" -> Some `Checkbox
415415+ | "radio" -> Some `Radio
416416+ | "file" -> Some `File
417417+ | "submit" -> Some `Submit
418418+ | "image" -> Some `Image
419419+ | "reset" -> Some `Reset
420420+ | "button" -> Some `Button
421421+ | _ -> None
422422+423423+(** Parse button type value *)
424424+let parse_button_type = function
425425+ | "submit" -> Some `Submit
426426+ | "reset" -> Some `Reset
427427+ | "button" -> Some `Button
428428+ | _ -> None
429429+430430+(** Parse shape value *)
431431+let parse_shape = function
432432+ | "rect" -> Some `Rect
433433+ | "circle" -> Some `Circle
434434+ | "poly" -> Some `Poly
435435+ | "default" -> Some `Default
436436+ | _ -> None
437437+438438+(** Parse capture value *)
439439+let parse_capture = function
440440+ | "user" -> Some `User
441441+ | "environment" -> Some `Environment
442442+ | _ -> None
443443+444444+(** Parse list type value *)
445445+let parse_list_type = function
446446+ | "1" -> Some `Decimal
447447+ | "a" -> Some `Lower_alpha
448448+ | "A" -> Some `Upper_alpha
449449+ | "i" -> Some `Lower_roman
450450+ | "I" -> Some `Upper_roman
451451+ | _ -> None
452452+453453+(** Parse kind value *)
454454+let parse_kind = function
455455+ | "subtitles" -> Some `Subtitles
456456+ | "captions" -> Some `Captions
457457+ | "descriptions" -> Some `Descriptions
458458+ | "chapters" -> Some `Chapters
459459+ | "metadata" -> Some `Metadata
460460+ | _ -> None
461461+462462+(** Parse referrerpolicy value *)
463463+let parse_referrerpolicy = function
464464+ | "no-referrer" -> Some `No_referrer
465465+ | "no-referrer-when-downgrade" -> Some `No_referrer_when_downgrade
466466+ | "origin" -> Some `Origin
467467+ | "origin-when-cross-origin" -> Some `Origin_when_cross_origin
468468+ | "same-origin" -> Some `Same_origin
469469+ | "strict-origin" -> Some `Strict_origin
470470+ | "strict-origin-when-cross-origin" -> Some `Strict_origin_when_cross_origin
471471+ | "unsafe-url" -> Some `Unsafe_url
472472+ | _ -> None
473473+474474+(** Parse sandbox flag *)
475475+let parse_sandbox_flag = function
476476+ | "allow-downloads" -> Some `Allow_downloads
477477+ | "allow-forms" -> Some `Allow_forms
478478+ | "allow-modals" -> Some `Allow_modals
479479+ | "allow-orientation-lock" -> Some `Allow_orientation_lock
480480+ | "allow-pointer-lock" -> Some `Allow_pointer_lock
481481+ | "allow-popups" -> Some `Allow_popups
482482+ | "allow-popups-to-escape-sandbox" -> Some `Allow_popups_to_escape_sandbox
483483+ | "allow-presentation" -> Some `Allow_presentation
484484+ | "allow-same-origin" -> Some `Allow_same_origin
485485+ | "allow-scripts" -> Some `Allow_scripts
486486+ | "allow-top-navigation" -> Some `Allow_top_navigation
487487+ | "allow-top-navigation-by-user-activation" -> Some `Allow_top_navigation_by_user_activation
488488+ | "allow-top-navigation-to-custom-protocols" -> Some `Allow_top_navigation_to_custom_protocols
489489+ | _ -> None
490490+491491+(** Parse sandbox value (space-separated flags) *)
492492+let parse_sandbox value =
493493+ if String.trim value = "" then
494494+ Some []
495495+ else
496496+ let flags = String.split_on_char ' ' value |> List.filter (fun s -> s <> "") in
497497+ let parsed = List.filter_map parse_sandbox_flag flags in
498498+ if List.length parsed = List.length flags then
499499+ Some parsed
500500+ else
501501+ None
502502+503503+(** Parse enterkeyhint value *)
504504+let parse_enterkeyhint = function
505505+ | "enter" -> Some `Enter
506506+ | "done" -> Some `Done
507507+ | "go" -> Some `Go
508508+ | "next" -> Some `Next
509509+ | "previous" -> Some `Previous
510510+ | "search" -> Some `Search
511511+ | "send" -> Some `Send
512512+ | _ -> None
513513+514514+(** Parse inputmode value *)
515515+let parse_inputmode = function
516516+ | "none" -> Some `None
517517+ | "text" -> Some `Text
518518+ | "decimal" -> Some `Decimal
519519+ | "numeric" -> Some `Numeric
520520+ | "tel" -> Some `Tel
521521+ | "search" -> Some `Search
522522+ | "email" -> Some `Email
523523+ | "url" -> Some `Url
524524+ | _ -> None
525525+526526+(** Parse contenteditable value *)
527527+let parse_contenteditable = function
528528+ | "true" | "" -> Some `True
529529+ | "false" -> Some `False
530530+ | "plaintext-only" -> Some `Plaintext_only
531531+ | _ -> None
532532+533533+(** Parse autocapitalize value *)
534534+let parse_autocapitalize = function
535535+ | "off" -> Some `Off
536536+ | "none" -> Some `None
537537+ | "on" -> Some `On
538538+ | "sentences" -> Some `Sentences
539539+ | "words" -> Some `Words
540540+ | "characters" -> Some `Characters
541541+ | _ -> None
542542+543543+(** Parse hidden value *)
544544+let parse_hidden = function
545545+ | "" | "hidden" -> Some `Hidden
546546+ | "until-found" -> Some `Until_found
547547+ | _ -> None
548548+549549+(** Parse popover value *)
550550+let parse_popover = function
551551+ | "" | "auto" -> Some `Auto
552552+ | "manual" -> Some `Manual
553553+ | "hint" -> Some `Hint
554554+ | _ -> None
555555+556556+(** Try to parse an integer *)
557557+let parse_int s =
558558+ try Some (int_of_string (String.trim s))
559559+ with Failure _ -> None
560560+561561+(** Try to parse a float *)
562562+let parse_float s =
563563+ try Some (float_of_string (String.trim s))
564564+ with Failure _ -> None
565565+566566+(** Parse a boolean string *)
567567+let parse_bool = function
568568+ | "true" | "" -> Some true
569569+ | "false" -> Some false
570570+ | _ -> None
571571+572572+(** Parse a single attribute name-value pair to typed attribute *)
573573+let parse_attr name value : t =
574574+ let name_lower = String.lowercase_ascii name in
575575+ let value_lower = String.lowercase_ascii value in
576576+ match name_lower with
577577+ (* Global attributes *)
578578+ | "id" -> `Id value
579579+ | "class" -> `Class value
580580+ | "style" -> `Style value
581581+ | "title" -> `Title value
582582+ | "lang" -> `Lang value
583583+ | "dir" -> (match parse_dir value_lower with Some d -> `Dir d | None -> `Unknown_attr (name, value))
584584+ | "hidden" -> `Hidden (parse_hidden value_lower)
585585+ | "tabindex" -> (match parse_int value with Some i -> `Tabindex i | None -> `Unknown_attr (name, value))
586586+ | "accesskey" -> `Accesskey value
587587+ | "autocapitalize" -> (match parse_autocapitalize value_lower with Some a -> `Autocapitalize a | None -> `Unknown_attr (name, value))
588588+ | "autofocus" -> `Autofocus
589589+ | "contenteditable" -> `Contenteditable (parse_contenteditable value_lower)
590590+ | "draggable" -> (match parse_bool value_lower with Some b -> `Draggable b | None -> `Unknown_attr (name, value))
591591+ | "enterkeyhint" -> (match parse_enterkeyhint value_lower with Some e -> `Enterkeyhint e | None -> `Unknown_attr (name, value))
592592+ | "inert" -> `Inert
593593+ | "inputmode" -> (match parse_inputmode value_lower with Some i -> `Inputmode i | None -> `Unknown_attr (name, value))
594594+ | "is" -> `Is value
595595+ | "nonce" -> `Nonce value
596596+ | "popover" -> `Popover (parse_popover value_lower)
597597+ | "slot" -> `Slot value
598598+ | "spellcheck" -> `Spellcheck (parse_bool value_lower)
599599+ | "translate" -> (match value_lower with "yes" | "" -> `Translate true | "no" -> `Translate false | _ -> `Unknown_attr (name, value))
600600+ | "exportparts" -> `Exportparts value
601601+ | "part" -> `Part value
602602+603603+ (* Microdata *)
604604+ | "itemscope" -> `Itemscope
605605+ | "itemtype" -> `Itemtype value
606606+ | "itemprop" -> `Itemprop value
607607+ | "itemid" -> `Itemid value
608608+ | "itemref" -> `Itemref value
609609+610610+ (* ARIA - role and aria-* *)
611611+ | "role" -> `Role value
612612+ | _ when String.starts_with ~prefix:"aria-" name_lower ->
613613+ let aria_name = String.sub name_lower 5 (String.length name_lower - 5) in
614614+ `Aria (aria_name, value)
615615+616616+ (* Event handlers - on* *)
617617+ | _ when String.starts_with ~prefix:"on" name_lower && String.length name_lower > 2 ->
618618+ let event_name = String.sub name_lower 2 (String.length name_lower - 2) in
619619+ `Event (event_name, value)
620620+621621+ (* Link/navigation attributes *)
622622+ | "href" -> `Href value
623623+ | "target" -> `Target (parse_target value)
624624+ | "rel" -> `Rel value
625625+ | "download" -> `Download (if value = "" then None else Some value)
626626+ | "hreflang" -> `Hreflang value
627627+ | "ping" -> `Ping value
628628+ | "referrerpolicy" -> (match parse_referrerpolicy value_lower with Some r -> `Referrerpolicy r | None -> `Unknown_attr (name, value))
629629+630630+ (* Media/resource attributes *)
631631+ | "src" -> `Src value
632632+ | "srcset" -> `Srcset value
633633+ | "sizes" -> `Sizes value
634634+ | "alt" -> `Alt value
635635+ | "width" -> `Width value
636636+ | "height" -> `Height value
637637+ | "loading" -> (match parse_loading value_lower with Some l -> `Loading l | None -> `Unknown_attr (name, value))
638638+ | "decoding" -> (match parse_decoding value_lower with Some d -> `Decoding d | None -> `Unknown_attr (name, value))
639639+ | "fetchpriority" -> (match parse_fetchpriority value_lower with Some f -> `Fetchpriority f | None -> `Unknown_attr (name, value))
640640+ | "crossorigin" -> `Crossorigin (parse_crossorigin value_lower)
641641+ | "ismap" -> `Ismap
642642+ | "usemap" -> `Usemap value
643643+ | "media" -> `Media value
644644+645645+ (* Audio/Video specific *)
646646+ | "controls" -> `Controls
647647+ | "autoplay" -> `Autoplay
648648+ | "loop" -> `Loop
649649+ | "muted" -> `Muted
650650+ | "preload" -> (match parse_preload value_lower with Some p -> `Preload p | None -> `Unknown_attr (name, value))
651651+ | "poster" -> `Poster value
652652+ | "playsinline" -> `Playsinline
653653+654654+ (* Image map *)
655655+ | "coords" -> `Coords value
656656+ | "shape" -> (match parse_shape value_lower with Some s -> `Shape s | None -> `Unknown_attr (name, value))
657657+658658+ (* iframe *)
659659+ | "sandbox" -> `Sandbox (parse_sandbox value_lower)
660660+ | "allow" -> `Allow value
661661+ | "allowfullscreen" -> `Allowfullscreen
662662+ | "srcdoc" -> `Srcdoc value
663663+ | "csp" -> `Csp value
664664+665665+ (* Form attributes *)
666666+ | "action" -> `Action value
667667+ | "method" -> (match parse_method value_lower with Some m -> `Method m | None -> `Unknown_attr (name, value))
668668+ | "enctype" -> (match parse_enctype value_lower with Some e -> `Enctype e | None -> `Unknown_attr (name, value))
669669+ | "novalidate" -> `Novalidate
670670+ | "accept-charset" -> `Accept_charset value
671671+ | "autocomplete" -> `Autocomplete value
672672+ | "name" -> `Name value
673673+ | "form" -> `Form value
674674+675675+ (* Form control attributes *)
676676+ | "value" -> `Value value
677677+ | "type" -> `Unknown_attr (name, value) (* type is context-dependent, handle in element parsing *)
678678+ | "disabled" -> `Disabled
679679+ | "readonly" -> `Readonly
680680+ | "required" -> `Required
681681+ | "checked" -> `Checked
682682+ | "selected" -> `Selected
683683+ | "multiple" -> `Multiple
684684+ | "placeholder" -> `Placeholder value
685685+ | "min" -> `Min value
686686+ | "max" -> `Max value
687687+ | "step" -> `Step value
688688+ | "minlength" -> (match parse_int value with Some i -> `Minlength i | None -> `Unknown_attr (name, value))
689689+ | "maxlength" -> (match parse_int value with Some i -> `Maxlength i | None -> `Unknown_attr (name, value))
690690+ | "pattern" -> `Pattern value
691691+ | "size" -> (match parse_int value with Some i -> `Size i | None -> `Unknown_attr (name, value))
692692+ | "cols" -> (match parse_int value with Some i -> `Cols i | None -> `Unknown_attr (name, value))
693693+ | "rows" -> (match parse_int value with Some i -> `Rows i | None -> `Unknown_attr (name, value))
694694+ | "wrap" -> (match parse_wrap value_lower with Some w -> `Wrap w | None -> `Unknown_attr (name, value))
695695+ | "accept" -> `Accept value
696696+ | "capture" -> (match parse_capture value_lower with Some c -> `Capture c | None -> `Unknown_attr (name, value))
697697+ | "dirname" -> `Dirname value
698698+ | "for" -> `For value
699699+ | "list" -> `List value
700700+701701+ (* Form submission attributes *)
702702+ | "formaction" -> `Formaction value
703703+ | "formmethod" -> (match parse_method value_lower with Some m -> `Formmethod m | None -> `Unknown_attr (name, value))
704704+ | "formenctype" -> (match parse_enctype value_lower with Some e -> `Formenctype e | None -> `Unknown_attr (name, value))
705705+ | "formnovalidate" -> `Formnovalidate
706706+ | "formtarget" -> `Formtarget (parse_target value)
707707+708708+ (* Table attributes *)
709709+ | "colspan" -> (match parse_int value with Some i -> `Colspan i | None -> `Unknown_attr (name, value))
710710+ | "rowspan" -> (match parse_int value with Some i -> `Rowspan i | None -> `Unknown_attr (name, value))
711711+ | "headers" -> `Headers value
712712+ | "scope" -> (match parse_scope value_lower with Some s -> `Scope s | None -> `Unknown_attr (name, value))
713713+ | "span" -> (match parse_int value with Some i -> `Span i | None -> `Unknown_attr (name, value))
714714+715715+ (* Details/Dialog *)
716716+ | "open" -> `Open
717717+718718+ (* Script *)
719719+ | "async" -> `Async
720720+ | "defer" -> `Defer
721721+ | "integrity" -> `Integrity value
722722+ | "nomodule" -> `Nomodule
723723+ | "blocking" -> `Blocking value
724724+725725+ (* Meta *)
726726+ | "charset" -> `Charset value
727727+ | "content" -> `Content value
728728+ | "http-equiv" -> `Http_equiv value
729729+730730+ (* Link element *)
731731+ | "as" -> `As value
732732+ | "imagesizes" -> `Imagesizes value
733733+ | "imagesrcset" -> `Imagesrcset value
734734+735735+ (* Object *)
736736+ | "data" -> `Data_object value
737737+738738+ (* Meter/Progress *)
739739+ | "low" -> (match parse_float value with Some f -> `Low f | None -> `Unknown_attr (name, value))
740740+ | "high" -> (match parse_float value with Some f -> `High f | None -> `Unknown_attr (name, value))
741741+ | "optimum" -> (match parse_float value with Some f -> `Optimum f | None -> `Unknown_attr (name, value))
742742+743743+ (* Time *)
744744+ | "datetime" -> `Datetime value
745745+746746+ (* Ol *)
747747+ | "start" -> (match parse_int value with Some i -> `Start i | None -> `Unknown_attr (name, value))
748748+ | "reversed" -> `Reversed
749749+750750+ (* Track *)
751751+ | "kind" -> (match parse_kind value_lower with Some k -> `Kind k | None -> `Unknown_attr (name, value))
752752+ | "srclang" -> `Srclang value
753753+ | "default" -> `Default
754754+755755+ (* Td/Th *)
756756+ | "abbr" -> `Abbr value
757757+758758+ (* RDFa *)
759759+ | "property" -> `Property value
760760+ | "typeof" -> `Typeof value
761761+ | "resource" -> `Resource value
762762+ | "prefix" -> `Prefix value
763763+ | "vocab" -> `Vocab value
764764+ | "about" -> `About value
765765+ | "datatype" -> `Datatype value
766766+ | "inlist" -> `Inlist
767767+ | "rev" -> `Rev value
768768+769769+ (* Data attributes *)
770770+ | _ when String.starts_with ~prefix:"data-" name_lower ->
771771+ let data_name = String.sub name_lower 5 (String.length name_lower - 5) in
772772+ `Data_attr (data_name, value)
773773+774774+ (* Escape hatch *)
775775+ | _ -> `Unknown_attr (name, value)
776776+777777+(** Parse multiple attributes *)
778778+let parse_attrs (attrs : (string * string) list) : t list =
779779+ List.map (fun (n, v) -> parse_attr n v) attrs
780780+781781+(** {1 Accessor Functions} *)
782782+783783+(** Get id attribute *)
784784+let get_id attrs =
785785+ List.find_map (function `Id s -> Some s | _ -> None) attrs
786786+787787+(** Get class attribute *)
788788+let get_class attrs =
789789+ List.find_map (function `Class s -> Some s | _ -> None) attrs
790790+791791+(** Get href attribute *)
792792+let get_href attrs =
793793+ List.find_map (function `Href s -> Some s | _ -> None) attrs
794794+795795+(** Get src attribute *)
796796+let get_src attrs =
797797+ List.find_map (function `Src s -> Some s | _ -> None) attrs
798798+799799+(** Get alt attribute *)
800800+let get_alt attrs =
801801+ List.find_map (function `Alt s -> Some s | _ -> None) attrs
802802+803803+(** Get name attribute *)
804804+let get_name attrs =
805805+ List.find_map (function `Name s -> Some s | _ -> None) attrs
806806+807807+(** Get value attribute *)
808808+let get_value attrs =
809809+ List.find_map (function `Value s -> Some s | _ -> None) attrs
810810+811811+(** Get role attribute *)
812812+let get_role attrs =
813813+ List.find_map (function `Role s -> Some s | _ -> None) attrs
814814+815815+(** Get a specific aria-* attribute *)
816816+let get_aria name attrs =
817817+ List.find_map (function `Aria (n, v) when n = name -> Some v | _ -> None) attrs
818818+819819+(** Get a specific data-* attribute *)
820820+let get_data name attrs =
821821+ List.find_map (function `Data_attr (n, v) when n = name -> Some v | _ -> None) attrs
822822+823823+(** Check if disabled is present *)
824824+let has_disabled attrs =
825825+ List.exists (function `Disabled -> true | _ -> false) attrs
826826+827827+(** Check if required is present *)
828828+let has_required attrs =
829829+ List.exists (function `Required -> true | _ -> false) attrs
830830+831831+(** Check if readonly is present *)
832832+let has_readonly attrs =
833833+ List.exists (function `Readonly -> true | _ -> false) attrs
834834+835835+(** Check if checked is present *)
836836+let has_checked attrs =
837837+ List.exists (function `Checked -> true | _ -> false) attrs
838838+839839+(** Check if autofocus is present *)
840840+let has_autofocus attrs =
841841+ List.exists (function `Autofocus -> true | _ -> false) attrs
842842+843843+(** Check if hidden is present *)
844844+let has_hidden attrs =
845845+ List.exists (function `Hidden _ -> true | _ -> false) attrs
846846+847847+(** Check if inert is present *)
848848+let has_inert attrs =
849849+ List.exists (function `Inert -> true | _ -> false) attrs
850850+851851+(** Check if open is present *)
852852+let has_open attrs =
853853+ List.exists (function `Open -> true | _ -> false) attrs
854854+855855+(** Get all aria-* attributes *)
856856+let get_all_aria attrs =
857857+ List.filter_map (function `Aria (n, v) -> Some (n, v) | _ -> None) attrs
858858+859859+(** Get all data-* attributes *)
860860+let get_all_data attrs =
861861+ List.filter_map (function `Data_attr (n, v) -> Some (n, v) | _ -> None) attrs
862862+863863+(** Find an attribute matching a predicate *)
864864+let find f attrs =
865865+ List.find_map f attrs
866866+867867+(** Check if any attribute matches *)
868868+let exists f attrs =
869869+ List.exists f attrs
870870+871871+(** Filter attributes *)
872872+let filter f attrs =
873873+ List.filter f attrs
+289
lib/htmlrw_check/element/element.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Typed HTML5 element representation.
77+88+ This module combines tags and attributes into a complete typed element
99+ representation with conversion functions. *)
1010+1111+(** {1 Element Type} *)
1212+1313+(** A typed HTML element *)
1414+type t = {
1515+ tag : Tag.element_tag;
1616+ attrs : Attr.t list;
1717+ raw_attrs : (string * string) list; (** Original for fallback *)
1818+}
1919+2020+(** {1 Parsing Functions} *)
2121+2222+(** Parse element-specific type attribute based on tag *)
2323+let parse_type_attr (tag : Tag.html_tag) value : Attr.t =
2424+ let value_lower = String.lowercase_ascii value in
2525+ match tag with
2626+ | `Input ->
2727+ (match Attr.parse_input_type value_lower with
2828+ | Some t -> `Type_input t
2929+ | None -> `Unknown_attr ("type", value))
3030+ | `Button ->
3131+ (match Attr.parse_button_type value_lower with
3232+ | Some t -> `Type_button t
3333+ | None -> `Unknown_attr ("type", value))
3434+ | `Script -> `Type_script value
3535+ | `Link -> `Type_link value
3636+ | `Ol ->
3737+ (match Attr.parse_list_type value_lower with
3838+ | Some t -> `Type_list t
3939+ | None -> `Unknown_attr ("type", value))
4040+ | _ -> `Unknown_attr ("type", value)
4141+4242+(** Parse attributes with element context for type attribute *)
4343+let parse_attrs_for_tag (tag : Tag.element_tag) (raw_attrs : (string * string) list) : Attr.t list =
4444+ List.map (fun (name, value) ->
4545+ let name_lower = String.lowercase_ascii name in
4646+ if name_lower = "type" then
4747+ match tag with
4848+ | Tag.Html html_tag -> parse_type_attr html_tag value
4949+ | _ -> `Unknown_attr (name, value)
5050+ else
5151+ Attr.parse_attr name value
5252+ ) raw_attrs
5353+5454+(** Create an element from raw input *)
5555+let create ~name ~namespace ~attrs:raw_attrs =
5656+ let tag = Tag.tag_of_string ?namespace name in
5757+ let attrs = parse_attrs_for_tag tag raw_attrs in
5858+ { tag; attrs; raw_attrs }
5959+6060+(** {1 Accessor Functions} *)
6161+6262+(** Get the tag *)
6363+let tag elem = elem.tag
6464+6565+(** Get typed attributes *)
6666+let attrs elem = elem.attrs
6767+6868+(** Get raw attributes *)
6969+let raw_attrs elem = elem.raw_attrs
7070+7171+(** Get the tag name as string *)
7272+let tag_name elem = Tag.tag_to_string elem.tag
7373+7474+(** Check if element is a specific HTML tag *)
7575+let is_html_tag expected elem =
7676+ Tag.is_html_tag expected elem.tag
7777+7878+(** Get the HTML tag if this is an HTML element *)
7979+let as_html_tag elem =
8080+ Tag.as_html_tag elem.tag
8181+8282+(** {1 Attribute Accessors (delegated to Attr module)} *)
8383+8484+let get_id elem = Attr.get_id elem.attrs
8585+let get_class elem = Attr.get_class elem.attrs
8686+let get_href elem = Attr.get_href elem.attrs
8787+let get_src elem = Attr.get_src elem.attrs
8888+let get_alt elem = Attr.get_alt elem.attrs
8989+let get_name elem = Attr.get_name elem.attrs
9090+let get_value elem = Attr.get_value elem.attrs
9191+let get_role elem = Attr.get_role elem.attrs
9292+let get_aria name elem = Attr.get_aria name elem.attrs
9393+let get_data name elem = Attr.get_data name elem.attrs
9494+9595+let has_disabled elem = Attr.has_disabled elem.attrs
9696+let has_required elem = Attr.has_required elem.attrs
9797+let has_readonly elem = Attr.has_readonly elem.attrs
9898+let has_checked elem = Attr.has_checked elem.attrs
9999+let has_autofocus elem = Attr.has_autofocus elem.attrs
100100+let has_hidden elem = Attr.has_hidden elem.attrs
101101+let has_inert elem = Attr.has_inert elem.attrs
102102+let has_open elem = Attr.has_open elem.attrs
103103+104104+let get_all_aria elem = Attr.get_all_aria elem.attrs
105105+let get_all_data elem = Attr.get_all_data elem.attrs
106106+107107+(** {1 Category Checks} *)
108108+109109+(** Check if this is a void element *)
110110+let is_void elem =
111111+ match elem.tag with
112112+ | Tag.Html t -> Tag.is_void t
113113+ | _ -> false
114114+115115+(** Check if this is a heading element *)
116116+let is_heading elem =
117117+ match elem.tag with
118118+ | Tag.Html t -> Tag.is_heading t
119119+ | _ -> false
120120+121121+(** Get heading level (1-6) or None *)
122122+let heading_level elem =
123123+ match elem.tag with
124124+ | Tag.Html t -> Tag.heading_level t
125125+ | _ -> None
126126+127127+(** Check if this is sectioning content *)
128128+let is_sectioning elem =
129129+ match elem.tag with
130130+ | Tag.Html t -> Tag.is_sectioning t
131131+ | _ -> false
132132+133133+(** Check if this is a sectioning root *)
134134+let is_sectioning_root elem =
135135+ match elem.tag with
136136+ | Tag.Html t -> Tag.is_sectioning_root t
137137+ | _ -> false
138138+139139+(** Check if this is embedded content *)
140140+let is_embedded elem =
141141+ match elem.tag with
142142+ | Tag.Html t -> Tag.is_embedded t
143143+ | _ -> false
144144+145145+(** Check if this is interactive content *)
146146+let is_interactive elem =
147147+ match elem.tag with
148148+ | Tag.Html t -> Tag.is_interactive t
149149+ | _ -> false
150150+151151+(** Check if this is form-associated *)
152152+let is_form_associated elem =
153153+ match elem.tag with
154154+ | Tag.Html t -> Tag.is_form_associated t
155155+ | _ -> false
156156+157157+(** Check if this is labelable *)
158158+let is_labelable elem =
159159+ match elem.tag with
160160+ | Tag.Html t -> Tag.is_labelable t
161161+ | _ -> false
162162+163163+(** Check if this is submittable *)
164164+let is_submittable elem =
165165+ match elem.tag with
166166+ | Tag.Html t -> Tag.is_submittable t
167167+ | _ -> false
168168+169169+(** Check if this is a table element *)
170170+let is_table_element elem =
171171+ match elem.tag with
172172+ | Tag.Html t -> Tag.is_table_element t
173173+ | _ -> false
174174+175175+(** Check if this is a media element *)
176176+let is_media elem =
177177+ match elem.tag with
178178+ | Tag.Html t -> Tag.is_media t
179179+ | _ -> false
180180+181181+(** Check if this is a list container *)
182182+let is_list_container elem =
183183+ match elem.tag with
184184+ | Tag.Html t -> Tag.is_list_container t
185185+ | _ -> false
186186+187187+(** Check if this has transparent content model *)
188188+let is_transparent elem =
189189+ match elem.tag with
190190+ | Tag.Html t -> Tag.is_transparent t
191191+ | _ -> false
192192+193193+(** Check if this is phrasing content *)
194194+let is_phrasing elem =
195195+ match elem.tag with
196196+ | Tag.Html t -> Tag.is_phrasing t
197197+ | _ -> false
198198+199199+(** Check if this is flow content *)
200200+let is_flow elem =
201201+ match elem.tag with
202202+ | Tag.Html t -> Tag.is_flow t
203203+ | _ -> true (* Custom elements are flow content *)
204204+205205+(** Check if this is a deprecated element *)
206206+let is_obsolete elem =
207207+ match elem.tag with
208208+ | Tag.Html t -> Tag.is_obsolete t
209209+ | _ -> false
210210+211211+(** Check if this is an SVG element *)
212212+let is_svg elem =
213213+ match elem.tag with
214214+ | Tag.Svg _ -> true
215215+ | _ -> false
216216+217217+(** Check if this is a MathML element *)
218218+let is_mathml elem =
219219+ match elem.tag with
220220+ | Tag.MathML _ -> true
221221+ | _ -> false
222222+223223+(** Check if this is a custom element *)
224224+let is_custom elem =
225225+ match elem.tag with
226226+ | Tag.Custom _ -> true
227227+ | _ -> false
228228+229229+(** Check if this is an unknown element *)
230230+let is_unknown elem =
231231+ match elem.tag with
232232+ | Tag.Unknown _ -> true
233233+ | _ -> false
234234+235235+(** {1 Input Type Utilities} *)
236236+237237+(** Get input type for input elements *)
238238+let get_input_type elem =
239239+ match elem.tag with
240240+ | Tag.Html `Input ->
241241+ List.find_map (function
242242+ | `Type_input t -> Some t
243243+ | _ -> None
244244+ ) elem.attrs
245245+ | _ -> None
246246+247247+(** Get button type for button elements *)
248248+let get_button_type elem =
249249+ match elem.tag with
250250+ | Tag.Html `Button ->
251251+ List.find_map (function
252252+ | `Type_button t -> Some t
253253+ | _ -> None
254254+ ) elem.attrs
255255+ | _ -> None
256256+257257+(** Check if input is of a specific type *)
258258+let is_input_type expected elem =
259259+ match get_input_type elem with
260260+ | Some t -> t = expected
261261+ | None -> false
262262+263263+(** {1 Raw Attribute Fallback} *)
264264+265265+(** Get raw attribute value (from original attrs) *)
266266+let get_raw_attr name elem =
267267+ List.find_map (fun (n, v) ->
268268+ if String.lowercase_ascii n = String.lowercase_ascii name then Some v else None
269269+ ) elem.raw_attrs
270270+271271+(** Check if raw attribute exists *)
272272+let has_raw_attr name elem =
273273+ List.exists (fun (n, _) ->
274274+ String.lowercase_ascii n = String.lowercase_ascii name
275275+ ) elem.raw_attrs
276276+277277+(** {1 Pattern Matching Helpers} *)
278278+279279+(** Match on HTML tag or return None *)
280280+let match_html elem f =
281281+ match elem.tag with
282282+ | Tag.Html tag -> Some (f tag)
283283+ | _ -> None
284284+285285+(** Match on specific HTML tag *)
286286+let when_html_tag expected elem f =
287287+ match elem.tag with
288288+ | Tag.Html tag when tag = expected -> Some (f ())
289289+ | _ -> None
+523
lib/htmlrw_check/element/tag.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Typed HTML5 tag representations using polymorphic variants.
77+88+ This module provides compile-time type safety for HTML elements while
99+ maintaining escape hatches for unknown/custom elements. *)
1010+1111+(** {1 HTML Tag Types} *)
1212+1313+(** All standard HTML5 elements plus deprecated elements needed by the validator *)
1414+type html_tag = [
1515+ (* Document metadata *)
1616+ | `Html | `Head | `Title | `Base | `Link | `Meta | `Style
1717+1818+ (* Sectioning root *)
1919+ | `Body
2020+2121+ (* Content sectioning *)
2222+ | `Address | `Article | `Aside | `Footer | `Header | `Hgroup
2323+ | `Main | `Nav | `Search | `Section
2424+2525+ (* Heading content *)
2626+ | `H1 | `H2 | `H3 | `H4 | `H5 | `H6
2727+2828+ (* Grouping content *)
2929+ | `Blockquote | `Dd | `Div | `Dl | `Dt | `Figcaption | `Figure
3030+ | `Hr | `Li | `Menu | `Ol | `P | `Pre | `Ul
3131+3232+ (* Text-level semantics *)
3333+ | `A | `Abbr | `B | `Bdi | `Bdo | `Br | `Cite | `Code | `Data
3434+ | `Dfn | `Em | `I | `Kbd | `Mark | `Q | `Rp | `Rt | `Ruby
3535+ | `S | `Samp | `Small | `Span | `Strong | `Sub | `Sup | `Time
3636+ | `U | `Var | `Wbr
3737+3838+ (* Edits *)
3939+ | `Del | `Ins
4040+4141+ (* Embedded content *)
4242+ | `Area | `Audio | `Canvas | `Embed | `Iframe | `Img | `Map | `Object
4343+ | `Picture | `Source | `Track | `Video
4444+4545+ (* Tabular data *)
4646+ | `Caption | `Col | `Colgroup | `Table | `Tbody | `Td | `Tfoot
4747+ | `Th | `Thead | `Tr
4848+4949+ (* Forms *)
5050+ | `Button | `Datalist | `Fieldset | `Form | `Input | `Label
5151+ | `Legend | `Meter | `Optgroup | `Option | `Output | `Progress
5252+ | `Select | `Textarea
5353+5454+ (* Interactive elements *)
5555+ | `Details | `Dialog | `Summary
5656+5757+ (* Scripting *)
5858+ | `Noscript | `Script | `Slot | `Template
5959+6060+ (* Web Components / Misc *)
6161+ | `Portal | `Param
6262+6363+ (* Deprecated/obsolete elements (needed by validator) *)
6464+ | `Applet | `Acronym | `Bgsound | `Dir | `Frame | `Frameset
6565+ | `Noframes | `Isindex | `Keygen | `Listing | `Menuitem | `Nextid
6666+ | `Noembed | `Plaintext | `Rb | `Rtc | `Strike | `Xmp
6767+ | `Basefont | `Big | `Blink | `Center | `Font | `Marquee
6868+ | `Multicol | `Nobr | `Spacer | `Tt | `Image
6969+]
7070+7171+(** {1 Category Types}
7272+7373+ Categories as type aliases for subsets, enabling functions that only accept
7474+ specific categories with compile-time checking. *)
7575+7676+(** Void elements - cannot have children *)
7777+type void_tag = [
7878+ | `Area | `Base | `Br | `Col | `Embed | `Hr | `Img | `Input
7979+ | `Link | `Meta | `Source | `Track | `Wbr
8080+ (* Deprecated void elements *)
8181+ | `Basefont | `Frame | `Isindex | `Keygen | `Param
8282+]
8383+8484+(** Heading elements *)
8585+type heading_tag = [ `H1 | `H2 | `H3 | `H4 | `H5 | `H6 ]
8686+8787+(** Sectioning content *)
8888+type sectioning_tag = [ `Article | `Aside | `Nav | `Section ]
8989+9090+(** Sectioning roots (establish their own outline) *)
9191+type sectioning_root_tag = [
9292+ | `Blockquote | `Body | `Details | `Dialog | `Fieldset | `Figure | `Td
9393+]
9494+9595+(** Embedded content *)
9696+type embedded_tag = [
9797+ | `Audio | `Canvas | `Embed | `Iframe | `Img | `Object | `Picture | `Video
9898+]
9999+100100+(** Interactive content (focusable/activatable) *)
101101+type interactive_tag = [
102102+ | `A | `Audio | `Button | `Details | `Embed | `Iframe | `Img
103103+ | `Input | `Label | `Select | `Textarea | `Video
104104+]
105105+106106+(** Form-associated elements *)
107107+type form_associated_tag = [
108108+ | `Button | `Fieldset | `Input | `Label | `Object | `Output
109109+ | `Select | `Textarea | `Meter | `Progress
110110+]
111111+112112+(** Labelable elements *)
113113+type labelable_tag = [
114114+ | `Button | `Input | `Meter | `Output | `Progress | `Select | `Textarea
115115+]
116116+117117+(** Submittable elements *)
118118+type submittable_tag = [
119119+ | `Button | `Input | `Select | `Textarea
120120+]
121121+122122+(** Resettable elements *)
123123+type resettable_tag = [
124124+ | `Input | `Output | `Select | `Textarea
125125+]
126126+127127+(** Table elements *)
128128+type table_tag = [
129129+ | `Caption | `Col | `Colgroup | `Table | `Tbody | `Td | `Tfoot
130130+ | `Th | `Thead | `Tr
131131+]
132132+133133+(** Media elements *)
134134+type media_tag = [ `Audio | `Video ]
135135+136136+(** List container elements *)
137137+type list_container_tag = [ `Ul | `Ol | `Menu | `Dl ]
138138+139139+(** List item elements *)
140140+type list_item_tag = [ `Li | `Dd | `Dt ]
141141+142142+(** Script-supporting elements *)
143143+type script_supporting_tag = [ `Script | `Template ]
144144+145145+(** Metadata content *)
146146+type metadata_tag = [ `Base | `Link | `Meta | `Noscript | `Script | `Style | `Template | `Title ]
147147+148148+(** {1 Top-Level Element Type} *)
149149+150150+(** Top-level element classification *)
151151+type element_tag =
152152+ | Html of html_tag (** Known HTML5 element *)
153153+ | Svg of string (** SVG element by local name *)
154154+ | MathML of string (** MathML element by local name *)
155155+ | Custom of string (** Custom element like <my-widget> *)
156156+ | Unknown of string (** Truly unknown element *)
157157+158158+(** {1 Conversion Functions} *)
159159+160160+(** Convert a lowercase tag name string to html_tag option *)
161161+let html_tag_of_string_opt name =
162162+ match name with
163163+ (* Document metadata *)
164164+ | "html" -> Some `Html | "head" -> Some `Head | "title" -> Some `Title
165165+ | "base" -> Some `Base | "link" -> Some `Link | "meta" -> Some `Meta
166166+ | "style" -> Some `Style
167167+ (* Sectioning root *)
168168+ | "body" -> Some `Body
169169+ (* Content sectioning *)
170170+ | "address" -> Some `Address | "article" -> Some `Article | "aside" -> Some `Aside
171171+ | "footer" -> Some `Footer | "header" -> Some `Header | "hgroup" -> Some `Hgroup
172172+ | "main" -> Some `Main | "nav" -> Some `Nav | "search" -> Some `Search
173173+ | "section" -> Some `Section
174174+ (* Headings *)
175175+ | "h1" -> Some `H1 | "h2" -> Some `H2 | "h3" -> Some `H3
176176+ | "h4" -> Some `H4 | "h5" -> Some `H5 | "h6" -> Some `H6
177177+ (* Grouping content *)
178178+ | "blockquote" -> Some `Blockquote | "dd" -> Some `Dd | "div" -> Some `Div
179179+ | "dl" -> Some `Dl | "dt" -> Some `Dt | "figcaption" -> Some `Figcaption
180180+ | "figure" -> Some `Figure | "hr" -> Some `Hr | "li" -> Some `Li
181181+ | "menu" -> Some `Menu | "ol" -> Some `Ol | "p" -> Some `P
182182+ | "pre" -> Some `Pre | "ul" -> Some `Ul
183183+ (* Text-level semantics *)
184184+ | "a" -> Some `A | "abbr" -> Some `Abbr | "b" -> Some `B
185185+ | "bdi" -> Some `Bdi | "bdo" -> Some `Bdo | "br" -> Some `Br
186186+ | "cite" -> Some `Cite | "code" -> Some `Code | "data" -> Some `Data
187187+ | "dfn" -> Some `Dfn | "em" -> Some `Em | "i" -> Some `I
188188+ | "kbd" -> Some `Kbd | "mark" -> Some `Mark | "q" -> Some `Q
189189+ | "rp" -> Some `Rp | "rt" -> Some `Rt | "ruby" -> Some `Ruby
190190+ | "s" -> Some `S | "samp" -> Some `Samp | "small" -> Some `Small
191191+ | "span" -> Some `Span | "strong" -> Some `Strong | "sub" -> Some `Sub
192192+ | "sup" -> Some `Sup | "time" -> Some `Time | "u" -> Some `U
193193+ | "var" -> Some `Var | "wbr" -> Some `Wbr
194194+ (* Edits *)
195195+ | "del" -> Some `Del | "ins" -> Some `Ins
196196+ (* Embedded content *)
197197+ | "area" -> Some `Area | "audio" -> Some `Audio | "canvas" -> Some `Canvas
198198+ | "embed" -> Some `Embed | "iframe" -> Some `Iframe | "img" -> Some `Img
199199+ | "map" -> Some `Map | "object" -> Some `Object | "picture" -> Some `Picture
200200+ | "source" -> Some `Source | "track" -> Some `Track | "video" -> Some `Video
201201+ (* Tabular data *)
202202+ | "caption" -> Some `Caption | "col" -> Some `Col | "colgroup" -> Some `Colgroup
203203+ | "table" -> Some `Table | "tbody" -> Some `Tbody | "td" -> Some `Td
204204+ | "tfoot" -> Some `Tfoot | "th" -> Some `Th | "thead" -> Some `Thead
205205+ | "tr" -> Some `Tr
206206+ (* Forms *)
207207+ | "button" -> Some `Button | "datalist" -> Some `Datalist
208208+ | "fieldset" -> Some `Fieldset | "form" -> Some `Form | "input" -> Some `Input
209209+ | "label" -> Some `Label | "legend" -> Some `Legend | "meter" -> Some `Meter
210210+ | "optgroup" -> Some `Optgroup | "option" -> Some `Option
211211+ | "output" -> Some `Output | "progress" -> Some `Progress
212212+ | "select" -> Some `Select | "textarea" -> Some `Textarea
213213+ (* Interactive *)
214214+ | "details" -> Some `Details | "dialog" -> Some `Dialog | "summary" -> Some `Summary
215215+ (* Scripting *)
216216+ | "noscript" -> Some `Noscript | "script" -> Some `Script
217217+ | "slot" -> Some `Slot | "template" -> Some `Template
218218+ (* Web Components / Misc *)
219219+ | "portal" -> Some `Portal | "param" -> Some `Param
220220+ (* Deprecated/obsolete elements *)
221221+ | "applet" -> Some `Applet | "acronym" -> Some `Acronym | "bgsound" -> Some `Bgsound
222222+ | "dir" -> Some `Dir | "frame" -> Some `Frame | "frameset" -> Some `Frameset
223223+ | "noframes" -> Some `Noframes | "isindex" -> Some `Isindex | "keygen" -> Some `Keygen
224224+ | "listing" -> Some `Listing | "menuitem" -> Some `Menuitem | "nextid" -> Some `Nextid
225225+ | "noembed" -> Some `Noembed | "plaintext" -> Some `Plaintext
226226+ | "rb" -> Some `Rb | "rtc" -> Some `Rtc | "strike" -> Some `Strike | "xmp" -> Some `Xmp
227227+ | "basefont" -> Some `Basefont | "big" -> Some `Big | "blink" -> Some `Blink
228228+ | "center" -> Some `Center | "font" -> Some `Font | "marquee" -> Some `Marquee
229229+ | "multicol" -> Some `Multicol | "nobr" -> Some `Nobr | "spacer" -> Some `Spacer
230230+ | "tt" -> Some `Tt | "image" -> Some `Image
231231+ | _ -> None
232232+233233+(** Check if a name is a valid custom element name (contains hyphen, not reserved) *)
234234+let is_custom_element_name name =
235235+ String.contains name '-' &&
236236+ not (String.starts_with ~prefix:"xml" (String.lowercase_ascii name)) &&
237237+ not (String.equal (String.lowercase_ascii name) "annotation-xml")
238238+239239+(** SVG namespace URI *)
240240+let svg_namespace = "http://www.w3.org/2000/svg"
241241+242242+(** MathML namespace URI *)
243243+let mathml_namespace = "http://www.w3.org/1998/Math/MathML"
244244+245245+(** Check if namespace is SVG (accepts both short and full URI) *)
246246+let is_svg_namespace = function
247247+ | "svg" | "http://www.w3.org/2000/svg" -> true
248248+ | _ -> false
249249+250250+(** Check if namespace is MathML (accepts both short and full URI) *)
251251+let is_mathml_namespace = function
252252+ | "mathml" | "http://www.w3.org/1998/Math/MathML" -> true
253253+ | _ -> false
254254+255255+(** Convert tag name and optional namespace to element_tag *)
256256+let tag_of_string ?namespace name =
257257+ let name_lower = String.lowercase_ascii name in
258258+ match namespace with
259259+ | Some ns when is_svg_namespace ns -> Svg name_lower
260260+ | Some ns when is_mathml_namespace ns -> MathML name_lower
261261+ | Some _ -> Unknown name_lower (* Unknown namespace *)
262262+ | None ->
263263+ match html_tag_of_string_opt name_lower with
264264+ | Some tag -> Html tag
265265+ | None ->
266266+ if is_custom_element_name name_lower then
267267+ Custom name_lower
268268+ else
269269+ Unknown name_lower
270270+271271+(** Convert html_tag to string *)
272272+let html_tag_to_string (tag : html_tag) : string =
273273+ match tag with
274274+ (* Document metadata *)
275275+ | `Html -> "html" | `Head -> "head" | `Title -> "title"
276276+ | `Base -> "base" | `Link -> "link" | `Meta -> "meta" | `Style -> "style"
277277+ (* Sectioning root *)
278278+ | `Body -> "body"
279279+ (* Content sectioning *)
280280+ | `Address -> "address" | `Article -> "article" | `Aside -> "aside"
281281+ | `Footer -> "footer" | `Header -> "header" | `Hgroup -> "hgroup"
282282+ | `Main -> "main" | `Nav -> "nav" | `Search -> "search" | `Section -> "section"
283283+ (* Headings *)
284284+ | `H1 -> "h1" | `H2 -> "h2" | `H3 -> "h3"
285285+ | `H4 -> "h4" | `H5 -> "h5" | `H6 -> "h6"
286286+ (* Grouping content *)
287287+ | `Blockquote -> "blockquote" | `Dd -> "dd" | `Div -> "div"
288288+ | `Dl -> "dl" | `Dt -> "dt" | `Figcaption -> "figcaption"
289289+ | `Figure -> "figure" | `Hr -> "hr" | `Li -> "li"
290290+ | `Menu -> "menu" | `Ol -> "ol" | `P -> "p" | `Pre -> "pre" | `Ul -> "ul"
291291+ (* Text-level semantics *)
292292+ | `A -> "a" | `Abbr -> "abbr" | `B -> "b"
293293+ | `Bdi -> "bdi" | `Bdo -> "bdo" | `Br -> "br"
294294+ | `Cite -> "cite" | `Code -> "code" | `Data -> "data"
295295+ | `Dfn -> "dfn" | `Em -> "em" | `I -> "i"
296296+ | `Kbd -> "kbd" | `Mark -> "mark" | `Q -> "q"
297297+ | `Rp -> "rp" | `Rt -> "rt" | `Ruby -> "ruby"
298298+ | `S -> "s" | `Samp -> "samp" | `Small -> "small"
299299+ | `Span -> "span" | `Strong -> "strong" | `Sub -> "sub"
300300+ | `Sup -> "sup" | `Time -> "time" | `U -> "u"
301301+ | `Var -> "var" | `Wbr -> "wbr"
302302+ (* Edits *)
303303+ | `Del -> "del" | `Ins -> "ins"
304304+ (* Embedded content *)
305305+ | `Area -> "area" | `Audio -> "audio" | `Canvas -> "canvas"
306306+ | `Embed -> "embed" | `Iframe -> "iframe" | `Img -> "img"
307307+ | `Map -> "map" | `Object -> "object" | `Picture -> "picture"
308308+ | `Source -> "source" | `Track -> "track" | `Video -> "video"
309309+ (* Tabular data *)
310310+ | `Caption -> "caption" | `Col -> "col" | `Colgroup -> "colgroup"
311311+ | `Table -> "table" | `Tbody -> "tbody" | `Td -> "td"
312312+ | `Tfoot -> "tfoot" | `Th -> "th" | `Thead -> "thead" | `Tr -> "tr"
313313+ (* Forms *)
314314+ | `Button -> "button" | `Datalist -> "datalist"
315315+ | `Fieldset -> "fieldset" | `Form -> "form" | `Input -> "input"
316316+ | `Label -> "label" | `Legend -> "legend" | `Meter -> "meter"
317317+ | `Optgroup -> "optgroup" | `Option -> "option"
318318+ | `Output -> "output" | `Progress -> "progress"
319319+ | `Select -> "select" | `Textarea -> "textarea"
320320+ (* Interactive *)
321321+ | `Details -> "details" | `Dialog -> "dialog" | `Summary -> "summary"
322322+ (* Scripting *)
323323+ | `Noscript -> "noscript" | `Script -> "script"
324324+ | `Slot -> "slot" | `Template -> "template"
325325+ (* Web Components / Misc *)
326326+ | `Portal -> "portal" | `Param -> "param"
327327+ (* Deprecated/obsolete elements *)
328328+ | `Applet -> "applet" | `Acronym -> "acronym" | `Bgsound -> "bgsound"
329329+ | `Dir -> "dir" | `Frame -> "frame" | `Frameset -> "frameset"
330330+ | `Noframes -> "noframes" | `Isindex -> "isindex" | `Keygen -> "keygen"
331331+ | `Listing -> "listing" | `Menuitem -> "menuitem" | `Nextid -> "nextid"
332332+ | `Noembed -> "noembed" | `Plaintext -> "plaintext"
333333+ | `Rb -> "rb" | `Rtc -> "rtc" | `Strike -> "strike" | `Xmp -> "xmp"
334334+ | `Basefont -> "basefont" | `Big -> "big" | `Blink -> "blink"
335335+ | `Center -> "center" | `Font -> "font" | `Marquee -> "marquee"
336336+ | `Multicol -> "multicol" | `Nobr -> "nobr" | `Spacer -> "spacer"
337337+ | `Tt -> "tt" | `Image -> "image"
338338+339339+(** Convert element_tag to string *)
340340+let tag_to_string = function
341341+ | Html tag -> html_tag_to_string tag
342342+ | Svg name -> name
343343+ | MathML name -> name
344344+ | Custom name -> name
345345+ | Unknown name -> name
346346+347347+(** {1 Category Predicates} *)
348348+349349+(** Check if element is a void element *)
350350+let is_void (tag : html_tag) : bool =
351351+ match tag with
352352+ | `Area | `Base | `Br | `Col | `Embed | `Hr | `Img | `Input
353353+ | `Link | `Meta | `Source | `Track | `Wbr
354354+ | `Basefont | `Frame | `Isindex | `Keygen | `Param -> true
355355+ | _ -> false
356356+357357+(** Check if element is a heading *)
358358+let is_heading (tag : html_tag) : bool =
359359+ match tag with
360360+ | `H1 | `H2 | `H3 | `H4 | `H5 | `H6 -> true
361361+ | _ -> false
362362+363363+(** Get heading level (1-6) or None *)
364364+let heading_level (tag : html_tag) : int option =
365365+ match tag with
366366+ | `H1 -> Some 1 | `H2 -> Some 2 | `H3 -> Some 3
367367+ | `H4 -> Some 4 | `H5 -> Some 5 | `H6 -> Some 6
368368+ | _ -> None
369369+370370+(** Check if element is sectioning content *)
371371+let is_sectioning (tag : html_tag) : bool =
372372+ match tag with
373373+ | `Article | `Aside | `Nav | `Section -> true
374374+ | _ -> false
375375+376376+(** Check if element is a sectioning root *)
377377+let is_sectioning_root (tag : html_tag) : bool =
378378+ match tag with
379379+ | `Blockquote | `Body | `Details | `Dialog | `Fieldset | `Figure | `Td -> true
380380+ | _ -> false
381381+382382+(** Check if element is embedded content *)
383383+let is_embedded (tag : html_tag) : bool =
384384+ match tag with
385385+ | `Audio | `Canvas | `Embed | `Iframe | `Img | `Object | `Picture | `Video -> true
386386+ | _ -> false
387387+388388+(** Check if element is interactive content *)
389389+let is_interactive (tag : html_tag) : bool =
390390+ match tag with
391391+ | `A | `Audio | `Button | `Details | `Embed | `Iframe | `Img
392392+ | `Input | `Label | `Select | `Textarea | `Video -> true
393393+ | _ -> false
394394+395395+(** Check if element is form-associated *)
396396+let is_form_associated (tag : html_tag) : bool =
397397+ match tag with
398398+ | `Button | `Fieldset | `Input | `Label | `Object | `Output
399399+ | `Select | `Textarea | `Meter | `Progress -> true
400400+ | _ -> false
401401+402402+(** Check if element is labelable *)
403403+let is_labelable (tag : html_tag) : bool =
404404+ match tag with
405405+ | `Button | `Input | `Meter | `Output | `Progress | `Select | `Textarea -> true
406406+ | _ -> false
407407+408408+(** Check if element is submittable *)
409409+let is_submittable (tag : html_tag) : bool =
410410+ match tag with
411411+ | `Button | `Input | `Select | `Textarea -> true
412412+ | _ -> false
413413+414414+(** Check if element is resettable *)
415415+let is_resettable (tag : html_tag) : bool =
416416+ match tag with
417417+ | `Input | `Output | `Select | `Textarea -> true
418418+ | _ -> false
419419+420420+(** Check if element has transparent content model *)
421421+let is_transparent (tag : html_tag) : bool =
422422+ match tag with
423423+ | `A | `Abbr | `Audio | `Canvas | `Del | `Ins | `Map | `Noscript
424424+ | `Object | `Slot | `Video -> true
425425+ | _ -> false
426426+427427+(** Check if element is script-supporting *)
428428+let is_script_supporting (tag : html_tag) : bool =
429429+ match tag with
430430+ | `Script | `Template -> true
431431+ | _ -> false
432432+433433+(** Check if element is a table element *)
434434+let is_table_element (tag : html_tag) : bool =
435435+ match tag with
436436+ | `Caption | `Col | `Colgroup | `Table | `Tbody | `Td | `Tfoot
437437+ | `Th | `Thead | `Tr -> true
438438+ | _ -> false
439439+440440+(** Check if element is a media element *)
441441+let is_media (tag : html_tag) : bool =
442442+ match tag with
443443+ | `Audio | `Video -> true
444444+ | _ -> false
445445+446446+(** Check if element is a list container *)
447447+let is_list_container (tag : html_tag) : bool =
448448+ match tag with
449449+ | `Ul | `Ol | `Menu | `Dl -> true
450450+ | _ -> false
451451+452452+(** Check if element is a list item *)
453453+let is_list_item (tag : html_tag) : bool =
454454+ match tag with
455455+ | `Li | `Dd | `Dt -> true
456456+ | _ -> false
457457+458458+(** Check if element is metadata content *)
459459+let is_metadata (tag : html_tag) : bool =
460460+ match tag with
461461+ | `Base | `Link | `Meta | `Noscript | `Script | `Style | `Template | `Title -> true
462462+ | _ -> false
463463+464464+(** Check if element is a deprecated/obsolete element *)
465465+let is_obsolete (tag : html_tag) : bool =
466466+ match tag with
467467+ | `Applet | `Acronym | `Bgsound | `Dir | `Frame | `Frameset
468468+ | `Noframes | `Isindex | `Keygen | `Listing | `Menuitem | `Nextid
469469+ | `Noembed | `Plaintext | `Rb | `Rtc | `Strike | `Xmp
470470+ | `Basefont | `Big | `Blink | `Center | `Font | `Marquee
471471+ | `Multicol | `Nobr | `Spacer | `Tt | `Image -> true
472472+ | _ -> false
473473+474474+(** Check if element is a raw text element (script, style) *)
475475+let is_raw_text (tag : html_tag) : bool =
476476+ match tag with
477477+ | `Script | `Style -> true
478478+ | _ -> false
479479+480480+(** Check if element is an escapable raw text element (textarea, title) *)
481481+let is_escapable_raw_text (tag : html_tag) : bool =
482482+ match tag with
483483+ | `Textarea | `Title -> true
484484+ | _ -> false
485485+486486+(** Check if element is a phrasing content element *)
487487+let is_phrasing (tag : html_tag) : bool =
488488+ match tag with
489489+ | `A | `Abbr | `Audio | `B | `Bdi | `Bdo | `Br | `Button | `Canvas
490490+ | `Cite | `Code | `Data | `Datalist | `Del | `Dfn | `Em | `Embed
491491+ | `I | `Iframe | `Img | `Input | `Ins | `Kbd | `Label | `Map | `Mark
492492+ | `Meter | `Noscript | `Object | `Output | `Picture | `Progress | `Q
493493+ | `Ruby | `S | `Samp | `Script | `Select | `Slot | `Small | `Span
494494+ | `Strong | `Sub | `Sup | `Template | `Textarea | `Time | `U | `Var
495495+ | `Video | `Wbr
496496+ (* Deprecated phrasing *)
497497+ | `Acronym | `Big | `Blink | `Font | `Marquee | `Nobr | `Spacer | `Tt -> true
498498+ | _ -> false
499499+500500+(** Check if element is flow content *)
501501+let is_flow (tag : html_tag) : bool =
502502+ match tag with
503503+ (* Most elements are flow content *)
504504+ | `Html | `Head | `Title | `Base | `Link | `Meta | `Style -> false
505505+ | `Body -> false
506506+ | `Caption | `Col | `Colgroup | `Tbody | `Td | `Tfoot | `Th | `Thead | `Tr -> false
507507+ | `Dd | `Dt | `Li -> false
508508+ | `Optgroup | `Option -> false
509509+ | `Param | `Source | `Track -> false
510510+ | `Area -> false (* Only when descendant of map *)
511511+ | `Rp | `Rt | `Rb | `Rtc -> false
512512+ | `Legend | `Figcaption | `Summary -> false
513513+ | _ -> true
514514+515515+(** Pattern for matching HTML tags in element_tag *)
516516+let as_html_tag = function
517517+ | Html tag -> Some tag
518518+ | _ -> None
519519+520520+(** Pattern for matching specific HTML tag *)
521521+let is_html_tag expected = function
522522+ | Html tag -> tag = expected
523523+ | _ -> false
+36-31
lib/htmlrw_check/semantic/autofocus_checker.ml
···2525 state.context_stack <- [];
2626 state.current_depth <- 0
27272828-let start_element state ~name ~namespace ~attrs collector =
2828+let start_element state ~element collector =
2929 state.current_depth <- state.current_depth + 1;
30303131- match namespace with
3232- | Some _ -> ()
3333- | None ->
3434- let name_lower = String.lowercase_ascii name in
3535-3636- (* Check if we're entering a dialog or popover context *)
3737- let enters_context = match name_lower with
3838- | "dialog" -> Some Dialog
3939- | _ when Attr_utils.has_attr "popover" attrs -> Some Popover
4040- | _ -> None
4141- in
4242-4343- Option.iter (fun ctx_type ->
4444- let ctx = { context_type = ctx_type; autofocus_count = 0; depth = state.current_depth } in
3131+ match element.Element.tag with
3232+ | Tag.Html `Dialog ->
3333+ let ctx = { context_type = Dialog; autofocus_count = 0; depth = state.current_depth } in
3434+ state.context_stack <- ctx :: state.context_stack;
3535+ (* Check for autofocus on dialog itself *)
3636+ if Attr.has_autofocus element.attrs then
3737+ begin match state.context_stack with
3838+ | ctx :: _ ->
3939+ ctx.autofocus_count <- ctx.autofocus_count + 1;
4040+ if ctx.autofocus_count > 1 then
4141+ Message_collector.add_typed collector (`Misc `Multiple_autofocus)
4242+ | [] -> ()
4343+ end
4444+ | Tag.Html _ ->
4545+ (* Check if element has popover attribute *)
4646+ let has_popover = Attr_utils.has_attr "popover" element.raw_attrs in
4747+ if has_popover then begin
4848+ let ctx = { context_type = Popover; autofocus_count = 0; depth = state.current_depth } in
4549 state.context_stack <- ctx :: state.context_stack
4646- ) enters_context;
4747-5050+ end;
4851 (* Check for autofocus attribute *)
4949- if Attr_utils.has_attr "autofocus" attrs then
5252+ if Attr.has_autofocus element.attrs then begin
5053 match state.context_stack with
5154 | ctx :: _ ->
5255 ctx.autofocus_count <- ctx.autofocus_count + 1;
5356 if ctx.autofocus_count > 1 then
5457 Message_collector.add_typed collector (`Misc `Multiple_autofocus)
5558 | [] -> ()
5959+ end
6060+ | _ -> ()
56615757-let end_element state ~name ~namespace _collector =
5858- (match namespace with
5959- | Some _ -> ()
6060- | None ->
6161- let name_lower = String.lowercase_ascii name in
6262- match state.context_stack with
6363- | ctx :: rest when ctx.depth = state.current_depth ->
6464- let matches =
6565- (name_lower = "dialog" && ctx.context_type = Dialog) ||
6666- (ctx.context_type = Popover)
6767- in
6868- if matches then state.context_stack <- rest
6969- | _ -> ());
6262+let end_element state ~tag _collector =
6363+ (match tag with
6464+ | Tag.Html `Dialog ->
6565+ (match state.context_stack with
6666+ | ctx :: rest when ctx.depth = state.current_depth && ctx.context_type = Dialog ->
6767+ state.context_stack <- rest
6868+ | _ -> ())
6969+ | Tag.Html _ ->
7070+ (match state.context_stack with
7171+ | ctx :: rest when ctx.depth = state.current_depth && ctx.context_type = Popover ->
7272+ state.context_stack <- rest
7373+ | _ -> ())
7474+ | _ -> ());
70757176 state.current_depth <- state.current_depth - 1
7277
+6-5
lib/htmlrw_check/semantic/form_checker.ml
···3131 (`Attr (`Bad_value (`Elem element_name, `Attr "autocomplete", `Value value, `Reason reason)))
3232 end
33333434-let start_element _state ~name ~namespace:_ ~attrs collector =
3434+let start_element _state ~element collector =
3535 (* Check autocomplete attribute on form elements *)
3636- match name with
3737- | "input" | "select" | "textarea" ->
3838- (match Attr_utils.get_attr "autocomplete" attrs with
3636+ match element.Element.tag with
3737+ | Tag.Html (`Input | `Select | `Textarea as tag) ->
3838+ let name = Tag.html_tag_to_string tag in
3939+ (match Attr_utils.get_attr "autocomplete" element.raw_attrs with
3940 | Some autocomplete_value ->
4041 check_autocomplete_value autocomplete_value name collector
4142 | None -> ())
4243 | _ -> ()
43444444-let end_element _state ~name:_ ~namespace:_ _collector = ()
4545+let end_element _state ~tag:_ _collector = ()
45464647let characters _state _text _collector = ()
4748
+12-12
lib/htmlrw_check/semantic/id_checker.ml
···176176 | _ -> ()
177177 ) attrs
178178179179-let start_element state ~name ~namespace:_ ~attrs collector =
180180- (* For now, we don't have location information from the DOM walker,
181181- so we pass None. In a full implementation, this would be passed
182182- from the parser. *)
179179+let start_element state ~element collector =
180180+ let name = Tag.tag_to_string element.Element.tag in
181181+ let attrs = element.raw_attrs in
183182 let location = None in
184183 process_attrs state ~element:name ~attrs ~location collector;
185184186185 (* Special check: map element must have matching id and name if both present *)
187187- if name = "map" then begin
188188- let id_opt = List.find_map (fun (n, v) -> if n = "id" then Some v else None) attrs in
189189- let name_opt = List.find_map (fun (n, v) -> if n = "name" then Some v else None) attrs in
190190- match id_opt, name_opt with
191191- | Some id_val, Some name_val when id_val <> name_val ->
186186+ (match element.tag with
187187+ | Tag.Html `Map ->
188188+ let id_opt = Attr.get_id element.attrs in
189189+ let name_opt = Attr.get_name element.attrs in
190190+ (match id_opt, name_opt with
191191+ | Some id_val, Some name_val when id_val <> name_val ->
192192 Message_collector.add_typed collector (`Misc `Map_id_name_mismatch)
193193- | _ -> ()
194194- end
193193+ | _ -> ())
194194+ | _ -> ())
195195196196-let end_element _state ~name:_ ~namespace:_ _collector =
196196+let end_element _state ~tag:_ _collector =
197197 ()
198198199199let characters _state _text _collector =
···216216 (* If > 2% are Traditional-only characters, it's Traditional Chinese *)
217217 !total > 100 && (float_of_int !count /. float_of_int !total) > 0.02
218218219219-let start_element state ~name ~namespace ~attrs _collector =
220220- let name_lower = String.lowercase_ascii name in
221221- let ns = Option.value namespace ~default:"" in
222222-223223- if name_lower = "html" then begin
219219+let start_element state ~element _collector =
220220+ let attrs = element.Element.raw_attrs in
221221+ match element.tag with
222222+ | Tag.Html `Html ->
224223 state.html_lang <- Attr_utils.get_attr "lang" attrs;
225224 state.html_dir <- Attr_utils.get_attr "dir" attrs;
226225 (* TODO: get line/column from locator *)
227226 state.html_locator <- Some (1, 1)
228228- end
229229- else if name_lower = "body" then
227227+ | Tag.Html `Body ->
230228 state.in_body <- true
231231- else if state.in_body then begin
232232- (* Track foreign namespace depth (SVG/MathML) *)
233233- if is_foreign_namespace ns || is_foreign_element name then
229229+ | Tag.Svg _ | Tag.MathML _ ->
230230+ if state.in_body then
234231 state.foreign_depth <- state.foreign_depth + 1
235235- else if state.foreign_depth > 0 then
232232+ | Tag.Html tag when state.in_body ->
233233+ let name_lower = Tag.html_tag_to_string tag in
234234+ if state.foreign_depth > 0 then
236235 state.foreign_depth <- state.foreign_depth + 1
237236 (* Check if we should skip this element's text *)
238237 else if List.mem name_lower skip_elements then
···244243 state.skip_depth <- state.skip_depth + 1
245244 | _ -> ()
246245 end
247247- end
246246+ | _ -> ()
248247249249-let end_element state ~name ~namespace:_ _collector =
250250- let name_lower = String.lowercase_ascii name in
251251- if name_lower = "body" then
248248+let end_element state ~tag _collector =
249249+ match tag with
250250+ | Tag.Html `Body ->
252251 state.in_body <- false
253253- else if state.in_body then begin
252252+ | Tag.Svg _ | Tag.MathML _ when state.in_body ->
253253+ if state.foreign_depth > 0 then
254254+ state.foreign_depth <- state.foreign_depth - 1
255255+ | Tag.Html tag when state.in_body ->
256256+ let name_lower = Tag.html_tag_to_string tag in
254257 (* Track foreign namespace depth *)
255258 if state.foreign_depth > 0 then
256259 state.foreign_depth <- state.foreign_depth - 1
···261264 (* TODO: properly track nested elements with different lang *)
262265 state.skip_depth <- max 0 (state.skip_depth - 1)
263266 end
264264- end
267267+ | _ -> ()
265268266269let characters state text _collector =
267270 if state.in_body && state.skip_depth = 0 && state.foreign_depth = 0 && state.char_count < max_chars then begin
+10-8
lib/htmlrw_check/semantic/nesting_checker.ml
···300300 end
301301 | _ -> ()
302302303303-let start_element state ~name ~namespace ~attrs collector =
303303+let start_element state ~element collector =
304304 (* Only check HTML elements, not SVG or MathML *)
305305- match namespace with
306306- | Some _ -> ()
307307- | None ->
305305+ match element.Element.tag with
306306+ | Tag.Html _ ->
307307+ let name = Tag.tag_to_string element.tag in
308308+ let attrs = element.raw_attrs in
308309 (* Check for nesting violations *)
309310 check_nesting state name attrs collector;
310311 check_required_ancestors state name collector;
···334335 let node = { ancestor_mask = state.ancestor_mask; name; is_transparent } in
335336 state.stack <- node :: state.stack;
336337 state.ancestor_mask <- new_mask
338338+ | _ -> () (* SVG, MathML, Custom, Unknown *)
337339338338-let end_element state ~name:_ ~namespace _collector =
340340+let end_element state ~tag _collector =
339341 (* Only track HTML elements *)
340340- match namespace with
341341- | Some _ -> ()
342342- | None ->
342342+ match tag with
343343+ | Tag.Html _ ->
343344 (* Pop from stack and restore ancestor mask *)
344345 begin match state.stack with
345346 | [] -> () (* Should not happen in well-formed documents *)
···347348 state.stack <- rest;
348349 state.ancestor_mask <- node.ancestor_mask
349350 end
351351+ | _ -> ()
350352351353let characters _state _text _collector =
352354 () (* No text-specific nesting checks *)
+11-13
lib/htmlrw_check/semantic/obsolete_checker.ml
···250250251251let reset state = state.in_head <- false
252252253253-let start_element state ~name ~namespace ~attrs collector =
254254- (* Only check HTML elements (no namespace or explicit HTML namespace) *)
255255- let is_html = match namespace with
256256- | None -> true
257257- | Some ns -> String.equal (String.lowercase_ascii ns) "html"
258258- in
259259-260260- if not is_html then ()
261261- else begin
253253+let start_element state ~element collector =
254254+ (* Only check HTML elements *)
255255+ match element.Element.tag with
256256+ | Tag.Html _ ->
257257+ let name = Tag.tag_to_string element.tag in
262258 let name_lower = String.lowercase_ascii name in
259259+ let attrs = element.raw_attrs in
263260264261 (* Track head context *)
265262 if name_lower = "head" then state.in_head <- true;
···309306 (`Element (`Obsolete_global_attr (`Attr attr_name, `Suggestion suggestion))))
310307 end
311308 ) attrs
312312- end
309309+ | _ -> () (* Non-HTML elements don't have obsolete checks *)
313310314314-let end_element state ~name ~namespace:_ _collector =
315315- let name_lower = String.lowercase_ascii name in
316316- if name_lower = "head" then state.in_head <- false
311311+let end_element state ~tag _collector =
312312+ match tag with
313313+ | Tag.Html `Head -> state.in_head <- false
314314+ | _ -> ()
317315318316let characters _state _text _collector = ()
319317
+30-41
lib/htmlrw_check/semantic/option_checker.ml
···2222 state.option_stack <- [];
2323 state.in_template <- 0
24242525-let start_element state ~name ~namespace ~attrs collector =
2626- let name_lower = String.lowercase_ascii name in
2525+let start_element state ~element collector =
2626+ match element.Element.tag with
2727+ | Tag.Html `Template ->
2828+ state.in_template <- state.in_template + 1
2929+ | Tag.Html `Option when state.in_template = 0 ->
3030+ let label_opt = Attr_utils.get_attr "label" element.raw_attrs in
3131+ let has_label = label_opt <> None in
3232+ let label_empty = match label_opt with
3333+ | Some v -> String.trim v = ""
3434+ | None -> false
3535+ in
3636+ (* Report error for empty label attribute value *)
3737+ if label_empty then
3838+ Message_collector.add_typed collector
3939+ (`Attr (`Bad_value (`Elem "option", `Attr "label", `Value "", `Reason "Bad non-empty string: Must not be empty.")));
4040+ let ctx = { has_text = false; has_label; label_empty } in
4141+ state.option_stack <- ctx :: state.option_stack
4242+ | _ -> ()
27432828- if namespace <> None then ()
2929- else begin
3030- if name_lower = "template" then
3131- state.in_template <- state.in_template + 1
3232- else if state.in_template = 0 && name_lower = "option" then begin
3333- let label_opt = Attr_utils.get_attr "label" attrs in
3434- let has_label = label_opt <> None in
3535- let label_empty = match label_opt with
3636- | Some v -> String.trim v = ""
3737- | None -> false
3838- in
3939- (* Report error for empty label attribute value *)
4040- if label_empty then
4141- Message_collector.add_typed collector
4242- (`Attr (`Bad_value (`Elem "option", `Attr "label", `Value "", `Reason "Bad non-empty string: Must not be empty.")));
4343- let ctx = { has_text = false; has_label; label_empty } in
4444- state.option_stack <- ctx :: state.option_stack
4545- end
4646- end
4747-4848-let end_element state ~name ~namespace collector =
4949- let name_lower = String.lowercase_ascii name in
5050-5151- if namespace <> None then ()
5252- else begin
5353- if name_lower = "template" then
5454- state.in_template <- max 0 (state.in_template - 1)
5555- else if state.in_template = 0 && name_lower = "option" then begin
5656- match state.option_stack with
5757- | ctx :: rest ->
5858- state.option_stack <- rest;
5959- (* Validate: option must have text content or non-empty label *)
6060- (* Note: empty label error is already reported at start_element,
6161- so only report empty option without label when there's no label attribute at all *)
6262- if not ctx.has_text && not ctx.has_label then
6363- Message_collector.add_typed collector (`Misc `Option_empty_without_label)
6464- | [] -> ()
6565- end
6666- end
4444+let end_element state ~tag collector =
4545+ match tag with
4646+ | Tag.Html `Template ->
4747+ state.in_template <- max 0 (state.in_template - 1)
4848+ | Tag.Html `Option when state.in_template = 0 ->
4949+ (match state.option_stack with
5050+ | ctx :: rest ->
5151+ state.option_stack <- rest;
5252+ if not ctx.has_text && not ctx.has_label then
5353+ Message_collector.add_typed collector (`Misc `Option_empty_without_label)
5454+ | [] -> ())
5555+ | _ -> ()
67566857let characters state text _collector =
6958 if state.in_template = 0 then begin
···177177 (q "value") (q "max")))
178178 with _ -> ())
179179180180-let start_element state ~name ~namespace:_ ~attrs collector =
181181- match name with
182182- | "img" -> check_img_element state attrs collector
183183- | "area" -> check_area_element attrs collector
184184- | "input" -> check_input_element attrs collector
185185- | "script" -> check_script_element attrs collector
186186- | "meta" -> check_meta_element attrs collector
187187- | "link" -> check_link_element attrs collector
188188- | "a" ->
180180+let start_element state ~element collector =
181181+ let attrs = element.Element.raw_attrs in
182182+ match element.tag with
183183+ | Tag.Html `Img -> check_img_element state attrs collector
184184+ | Tag.Html `Area -> check_area_element attrs collector
185185+ | Tag.Html `Input -> check_input_element attrs collector
186186+ | Tag.Html `Script -> check_script_element attrs collector
187187+ | Tag.Html `Meta -> check_meta_element attrs collector
188188+ | Tag.Html `Link -> check_link_element attrs collector
189189+ | Tag.Html `A ->
189190 check_a_element attrs collector;
190191 if Attr_utils.has_attr "href" attrs then state.in_a_with_href <- true
191191- | "map" -> check_map_element attrs collector
192192- | "object" -> check_object_element attrs collector
193193- | "meter" -> check_meter_element attrs collector
194194- | "progress" -> check_progress_element attrs collector
195195- | "figure" -> state._in_figure <- true
196196- | _ ->
197197- (* Check popover attribute on any element *)
192192+ | Tag.Html `Map -> check_map_element attrs collector
193193+ | Tag.Html `Object -> check_object_element attrs collector
194194+ | Tag.Html `Meter -> check_meter_element attrs collector
195195+ | Tag.Html `Progress -> check_progress_element attrs collector
196196+ | Tag.Html `Figure -> state._in_figure <- true
197197+ | Tag.Html _ ->
198198+ (* Check popover attribute on any HTML element *)
199199+ let name = Tag.tag_to_string element.tag in
198200 if Attr_utils.has_attr "popover" attrs then check_popover_element name attrs collector
201201+ | _ -> () (* Non-HTML elements *)
199202200200-let end_element state ~name ~namespace:_ _collector =
201201- match name with
202202- | "figure" -> state._in_figure <- false
203203- | "a" -> state.in_a_with_href <- false
203203+let end_element state ~tag _collector =
204204+ match tag with
205205+ | Tag.Html `Figure -> state._in_figure <- false
206206+ | Tag.Html `A -> state.in_a_with_href <- false
204207 | _ -> ()
205208206209let characters _state _text _collector = ()
+13-11
lib/htmlrw_check/specialized/aria_checker.ml
···427427 let quoted = List.map (fun r -> "\"" ^ r ^ "\"") roles in
428428 String.concat " or " quoted
429429430430-let start_element state ~name ~namespace ~attrs collector =
430430+let start_element state ~element collector =
431431 (* Only process HTML elements *)
432432- match namespace with
433433- | Some _ -> () (* Skip non-HTML elements *)
434434- | None ->
432432+ match element.Element.tag with
433433+ | Tag.Html _ ->
434434+ let name = Tag.tag_to_string element.tag in
435435 let name_lower = String.lowercase_ascii name in
436436+ let attrs = element.raw_attrs in
436437 let role_attr = List.assoc_opt "role" attrs in
437438 let aria_label = List.assoc_opt "aria-label" attrs in
438439 let aria_labelledby = List.assoc_opt "aria-labelledby" attrs in
···723724 implicit_role;
724725 } in
725726 state.stack <- node :: state.stack
727727+ | _ -> () (* Skip non-HTML elements *)
726728727727-let end_element state ~name:_ ~namespace _collector =
729729+let end_element state ~tag _collector =
728730 (* Only process HTML elements *)
729729- match namespace with
730730- | Some _ -> () (* Skip non-HTML elements *)
731731- | None ->
731731+ match tag with
732732+ | Tag.Html _ ->
732733 (* Pop from stack *)
733733- match state.stack with
734734- | _ :: rest -> state.stack <- rest
735735- | [] -> () (* Stack underflow - shouldn't happen in well-formed docs *)
734734+ (match state.stack with
735735+ | _ :: rest -> state.stack <- rest
736736+ | [] -> ()) (* Stack underflow - shouldn't happen in well-formed docs *)
737737+ | _ -> ()
736738737739let characters _state _text _collector = ()
738740
···5252 Message_collector.add_typed collector
5353 (`Attr (`Not_allowed (`Attr attr, `Elem element)))
54545555-let start_element state ~name ~namespace ~attrs collector =
5656- let name_lower = String.lowercase_ascii name in
5555+let start_element state ~element collector =
5656+ match element.Element.tag with
5757+ | Tag.Html _ ->
5858+ let name = Tag.tag_to_string element.tag in
5959+ let name_lower = String.lowercase_ascii name in
6060+ let attrs = element.raw_attrs in
57615858- (* Detect XHTML mode from xmlns attribute on html element *)
5959- if name_lower = "html" then begin
6060- match Attr_utils.get_attr "xmlns" attrs with
6161- | Some "http://www.w3.org/1999/xhtml" -> state.is_xhtml <- true
6262- | _ -> ()
6363- end;
6262+ (* Detect XHTML mode from xmlns attribute on html element *)
6363+ if name_lower = "html" then begin
6464+ match Attr_utils.get_attr "xmlns" attrs with
6565+ | Some "http://www.w3.org/1999/xhtml" -> state.is_xhtml <- true
6666+ | _ -> ()
6767+ end;
64686565- (* Check HTML element attribute restrictions *)
6666- (match namespace with
6767- | Some _ -> ()
6868- | None ->
6969- match List.assoc_opt name_lower disallowed_attrs_html with
6969+ (* Check HTML element attribute restrictions *)
7070+ (match List.assoc_opt name_lower disallowed_attrs_html with
7071 | Some disallowed ->
7172 List.iter (fun attr ->
7273 if Attr_utils.has_attr attr attrs then
···7475 ) disallowed
7576 | None -> ());
76777777- (* Check for xml:base attribute - not allowed in HTML *)
7878- (match namespace with
7979- | Some _ -> ()
8080- | None when name_lower = "html" ->
8181- if Attr_utils.has_attr "xml:base" attrs then
8282- report_disallowed_attr name_lower "xml:base" collector
8383- | None -> ());
7878+ (* Check for xml:base attribute - not allowed in HTML *)
7979+ if name_lower = "html" then begin
8080+ if Attr_utils.has_attr "xml:base" attrs then
8181+ report_disallowed_attr name_lower "xml:base" collector
8282+ end;
84838585- (* Check for xmlns:* prefixed attributes - not allowed in HTML *)
8686- (* Standard xmlns declarations are allowed but custom prefixes are not *)
8787- (match namespace with
8888- | Some _ -> ()
8989- | None ->
8484+ (* Check for xmlns:* prefixed attributes - not allowed in HTML *)
8585+ (* Standard xmlns declarations are allowed but custom prefixes are not *)
9086 List.iter (fun (attr_name, _) ->
9187 let attr_lower = String.lowercase_ascii attr_name in
9288 if String.starts_with ~prefix:"xmlns:" attr_lower then begin
···9692 Message_collector.add_typed collector
9793 (`Attr (`Not_allowed_here (`Attr attr_name)))
9894 end
9999- ) attrs);
9595+ ) attrs;
9696+9797+ (* Check SVG element restrictions - works in both HTML-embedded and XHTML SVG *)
9898+ (* xml:id is never valid on SVG elements in HTML5 *)
9999+ if List.mem name_lower svg_no_xml_id then begin
100100+ if Attr_utils.has_attr "xml:id" attrs then
101101+ report_disallowed_attr name_lower "xml:id" collector
102102+ end;
100103101101- (* Check SVG element restrictions - works in both HTML-embedded and XHTML SVG *)
102102- (* xml:id is never valid on SVG elements in HTML5 *)
103103- if List.mem name_lower svg_no_xml_id then begin
104104- if Attr_utils.has_attr "xml:id" attrs then
105105- report_disallowed_attr name_lower "xml:id" collector
106106- end;
104104+ (* SVG feConvolveMatrix requires order attribute *)
105105+ if name_lower = "feconvolvematrix" then begin
106106+ if not (Attr_utils.has_attr "order" attrs) then
107107+ Message_collector.add_typed collector
108108+ (`Svg (`Missing_attr (`Elem "feConvolveMatrix", `Attr "order")))
109109+ end;
107110108108- (* SVG feConvolveMatrix requires order attribute *)
109109- if name_lower = "feconvolvematrix" then begin
110110- if not (Attr_utils.has_attr "order" attrs) then
111111- Message_collector.add_typed collector
112112- (`Svg (`Missing_attr (`Elem "feConvolveMatrix", `Attr "order")))
113113- end;
111111+ (* Validate style type attribute - must be "text/css" or omitted *)
112112+ if name_lower = "style" then begin
113113+ List.iter (fun (attr_name, attr_value) ->
114114+ let attr_lower = String.lowercase_ascii attr_name in
115115+ if attr_lower = "type" then begin
116116+ let value_lower = String.lowercase_ascii (String.trim attr_value) in
117117+ if value_lower <> "text/css" then
118118+ Message_collector.add_typed collector (`Misc `Style_type_invalid)
119119+ end
120120+ ) attrs
121121+ end;
114122115115- (* Validate style type attribute - must be "text/css" or omitted *)
116116- (match namespace with
117117- | Some _ -> ()
118118- | None when name_lower = "style" ->
119119- List.iter (fun (attr_name, attr_value) ->
120120- let attr_lower = String.lowercase_ascii attr_name in
121121- if attr_lower = "type" then begin
122122- let value_lower = String.lowercase_ascii (String.trim attr_value) in
123123- if value_lower <> "text/css" then
124124- Message_collector.add_typed collector (`Misc `Style_type_invalid)
125125- end
126126- ) attrs
127127- | None -> ());
123123+ (* Validate object element requires data or type attribute *)
124124+ if name_lower = "object" then begin
125125+ let has_data = Attr_utils.has_attr "data" attrs in
126126+ let has_type = Attr_utils.has_attr "type" attrs in
127127+ if not has_data && not has_type then
128128+ Message_collector.add_typed collector
129129+ (`Attr (`Missing (`Elem "object", `Attr "data")))
130130+ end;
128131129129- (* Validate object element requires data or type attribute *)
130130- (match namespace with
131131- | Some _ -> ()
132132- | None when name_lower = "object" ->
133133- let has_data = Attr_utils.has_attr "data" attrs in
134134- let has_type = Attr_utils.has_attr "type" attrs in
135135- if not has_data && not has_type then
136136- Message_collector.add_typed collector
137137- (`Attr (`Missing (`Elem "object", `Attr "data")))
138138- | None -> ());
132132+ (* Validate link imagesizes/imagesrcset attributes *)
133133+ if name_lower = "link" then begin
134134+ let has_imagesizes = Attr_utils.has_attr "imagesizes" attrs in
135135+ let has_imagesrcset = Attr_utils.has_attr "imagesrcset" attrs in
136136+ let rel_value = Attr_utils.get_attr "rel" attrs in
137137+ let as_value = Attr_utils.get_attr "as" attrs in
139138140140- (* Validate link imagesizes/imagesrcset attributes *)
141141- (match namespace with
142142- | Some _ -> ()
143143- | None when name_lower = "link" ->
144144- let has_imagesizes = Attr_utils.has_attr "imagesizes" attrs in
145145- let has_imagesrcset = Attr_utils.has_attr "imagesrcset" attrs in
146146- let rel_value = Attr_utils.get_attr "rel" attrs in
147147- let as_value = Attr_utils.get_attr "as" attrs in
139139+ (* imagesizes requires imagesrcset *)
140140+ if has_imagesizes && not has_imagesrcset then
141141+ Message_collector.add_typed collector (`Srcset `Imagesizes_without_imagesrcset);
148142149149- (* imagesizes requires imagesrcset *)
150150- if has_imagesizes && not has_imagesrcset then
151151- Message_collector.add_typed collector (`Srcset `Imagesizes_without_imagesrcset);
143143+ (* imagesrcset requires as="image" *)
144144+ if has_imagesrcset then begin
145145+ let as_is_image = match as_value with
146146+ | Some v -> String.lowercase_ascii (String.trim v) = "image"
147147+ | None -> false
148148+ in
149149+ if not as_is_image then
150150+ Message_collector.add_typed collector (`Link `Imagesrcset_requires_as_image)
151151+ end;
152152153153- (* imagesrcset requires as="image" *)
154154- if has_imagesrcset then begin
155155- let as_is_image = match as_value with
156156- | Some v -> String.lowercase_ascii (String.trim v) = "image"
157157- | None -> false
158158- in
159159- if not as_is_image then
160160- Message_collector.add_typed collector (`Link `Imagesrcset_requires_as_image)
153153+ (* as attribute requires rel="preload" or rel="modulepreload" *)
154154+ (match as_value with
155155+ | Some _ ->
156156+ let rel_is_preload = match rel_value with
157157+ | Some v ->
158158+ let rel_lower = String.lowercase_ascii (String.trim v) in
159159+ String.length rel_lower > 0 &&
160160+ (List.mem "preload" (String.split_on_char ' ' rel_lower) ||
161161+ List.mem "modulepreload" (String.split_on_char ' ' rel_lower))
162162+ | None -> false
163163+ in
164164+ if not rel_is_preload then
165165+ Message_collector.add_typed collector (`Link `As_requires_preload)
166166+ | None -> ())
161167 end;
162168163163- (* as attribute requires rel="preload" or rel="modulepreload" *)
164164- (match as_value with
165165- | Some _ ->
166166- let rel_is_preload = match rel_value with
167167- | Some v ->
168168- let rel_lower = String.lowercase_ascii (String.trim v) in
169169- String.length rel_lower > 0 &&
170170- (List.mem "preload" (String.split_on_char ' ' rel_lower) ||
171171- List.mem "modulepreload" (String.split_on_char ' ' rel_lower))
172172- | None -> false
173173- in
174174- if not rel_is_preload then
175175- Message_collector.add_typed collector (`Link `As_requires_preload)
176176- | None -> ())
177177- | None -> ());
178178-179179- (* Validate img usemap attribute - must be hash-name reference with content *)
180180- (match namespace with
181181- | Some _ -> ()
182182- | None when name_lower = "img" ->
183183- List.iter (fun (attr_name, attr_value) ->
184184- let attr_lower = String.lowercase_ascii attr_name in
185185- if attr_lower = "usemap" then begin
186186- if attr_value = "#" then
187187- Message_collector.add_typed collector
188188- (`Attr (`Bad_value_generic (`Message (Printf.sprintf
189189- "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad hash-name reference: A hash-name reference must have at least one character after \xe2\x80\x9c#\xe2\x80\x9d."
190190- attr_value attr_name name))))
191191- end
192192- ) attrs
193193- | None -> ());
169169+ (* Validate img usemap attribute - must be hash-name reference with content *)
170170+ if name_lower = "img" then begin
171171+ List.iter (fun (attr_name, attr_value) ->
172172+ let attr_lower = String.lowercase_ascii attr_name in
173173+ if attr_lower = "usemap" then begin
174174+ if attr_value = "#" then
175175+ Message_collector.add_typed collector
176176+ (`Attr (`Bad_value_generic (`Message (Printf.sprintf
177177+ "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad hash-name reference: A hash-name reference must have at least one character after \xe2\x80\x9c#\xe2\x80\x9d."
178178+ attr_value attr_name name))))
179179+ end
180180+ ) attrs
181181+ end;
194182195195- (* Validate embed type attribute - must be valid MIME type *)
196196- (match namespace with
197197- | Some _ -> ()
198198- | None when name_lower = "embed" ->
199199- List.iter (fun (attr_name, attr_value) ->
200200- let attr_lower = String.lowercase_ascii attr_name in
201201- if attr_lower = "type" then begin
202202- match Dt_mime.validate_mime_type attr_value with
203203- | Ok () -> ()
204204- | Error msg ->
205205- Message_collector.add_typed collector
206206- (`Attr (`Bad_value_generic (`Message (Printf.sprintf
207207- "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: %s"
208208- attr_value attr_name name msg))))
209209- end
210210- ) attrs
211211- | None -> ());
183183+ (* Validate embed type attribute - must be valid MIME type *)
184184+ if name_lower = "embed" then begin
185185+ List.iter (fun (attr_name, attr_value) ->
186186+ let attr_lower = String.lowercase_ascii attr_name in
187187+ if attr_lower = "type" then begin
188188+ match Dt_mime.validate_mime_type attr_value with
189189+ | Ok () -> ()
190190+ | Error msg ->
191191+ Message_collector.add_typed collector
192192+ (`Attr (`Bad_value_generic (`Message (Printf.sprintf
193193+ "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: %s"
194194+ attr_value attr_name name msg))))
195195+ end
196196+ ) attrs
197197+ end;
212198213213- (* Validate width/height on embed and img - must be non-negative integers *)
214214- let is_dimension_element = name_lower = "embed" || name_lower = "img" ||
215215- name_lower = "video" || name_lower = "canvas" ||
216216- name_lower = "iframe" || name_lower = "source" in
217217- (match namespace with
218218- | Some _ -> ()
219219- | None when is_dimension_element ->
220220- List.iter (fun (attr_name, attr_value) ->
221221- let attr_lower = String.lowercase_ascii attr_name in
222222- if attr_lower = "width" || attr_lower = "height" then begin
223223- (* Check for non-negative integer only *)
224224- let is_valid =
225225- String.length attr_value > 0 &&
226226- String.for_all (fun c -> c >= '0' && c <= '9') attr_value
227227- in
228228- if not is_valid then begin
229229- (* Determine specific error message *)
230230- let error_msg =
231231- if String.length attr_value = 0 then
232232- Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer."
233233- attr_name name
234234- else if String.contains attr_value '%' then
235235- Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead."
236236- attr_value attr_name name
237237- else if String.length attr_value > 0 && attr_value.[0] = '-' then
238238- Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c-\xe2\x80\x9d instead."
239239- attr_value attr_name name
240240- else
241241- (* Find first non-digit character *)
242242- let bad_char =
243243- try
244244- let i = ref 0 in
245245- while !i < String.length attr_value && attr_value.[!i] >= '0' && attr_value.[!i] <= '9' do
246246- incr i
247247- done;
248248- if !i < String.length attr_value then Some attr_value.[!i] else None
249249- with _ -> None
250250- in
251251- match bad_char with
252252- | Some c ->
253253- Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead."
254254- attr_value attr_name name c
255255- | None ->
256256- Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit."
199199+ (* Validate width/height on embed and img - must be non-negative integers *)
200200+ let is_dimension_element = name_lower = "embed" || name_lower = "img" ||
201201+ name_lower = "video" || name_lower = "canvas" ||
202202+ name_lower = "iframe" || name_lower = "source" in
203203+ if is_dimension_element then begin
204204+ List.iter (fun (attr_name, attr_value) ->
205205+ let attr_lower = String.lowercase_ascii attr_name in
206206+ if attr_lower = "width" || attr_lower = "height" then begin
207207+ (* Check for non-negative integer only *)
208208+ let is_valid =
209209+ String.length attr_value > 0 &&
210210+ String.for_all (fun c -> c >= '0' && c <= '9') attr_value
211211+ in
212212+ if not is_valid then begin
213213+ (* Determine specific error message *)
214214+ let error_msg =
215215+ if String.length attr_value = 0 then
216216+ Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer."
217217+ attr_name name
218218+ else if String.contains attr_value '%' then
219219+ Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead."
257220 attr_value attr_name name
258258- in
259259- Message_collector.add_typed collector
260260- (`Attr (`Bad_value_generic (`Message error_msg)))
221221+ else if String.length attr_value > 0 && attr_value.[0] = '-' then
222222+ Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c-\xe2\x80\x9d instead."
223223+ attr_value attr_name name
224224+ else
225225+ (* Find first non-digit character *)
226226+ let bad_char =
227227+ try
228228+ let i = ref 0 in
229229+ while !i < String.length attr_value && attr_value.[!i] >= '0' && attr_value.[!i] <= '9' do
230230+ incr i
231231+ done;
232232+ if !i < String.length attr_value then Some attr_value.[!i] else None
233233+ with _ -> None
234234+ in
235235+ match bad_char with
236236+ | Some c ->
237237+ Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead."
238238+ attr_value attr_name name c
239239+ | None ->
240240+ Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit."
241241+ attr_value attr_name name
242242+ in
243243+ Message_collector.add_typed collector
244244+ (`Attr (`Bad_value_generic (`Message error_msg)))
245245+ end
261246 end
262262- end
263263- ) attrs
264264- | None -> ());
247247+ ) attrs
248248+ end;
265249266266- (* Validate area[shape=default] cannot have coords *)
267267- (match namespace with
268268- | Some _ -> ()
269269- | None when name_lower = "area" ->
270270- (match Attr_utils.get_attr "shape" attrs with
271271- | Some s when String.lowercase_ascii (String.trim s) = "default" ->
272272- if Attr_utils.has_attr "coords" attrs then
273273- Message_collector.add_typed collector
274274- (`Attr (`Not_allowed (`Attr "coords", `Elem "area")))
275275- | _ -> ())
276276- | None -> ());
250250+ (* Validate area[shape=default] cannot have coords *)
251251+ if name_lower = "area" then begin
252252+ match Attr_utils.get_attr "shape" attrs with
253253+ | Some s when String.lowercase_ascii (String.trim s) = "default" ->
254254+ if Attr_utils.has_attr "coords" attrs then
255255+ Message_collector.add_typed collector
256256+ (`Attr (`Not_allowed (`Attr "coords", `Elem "area")))
257257+ | _ -> ()
258258+ end;
277259278278- (* Validate bdo element requires dir attribute, and dir cannot be "auto" *)
279279- (match namespace with
280280- | Some _ -> ()
281281- | None when name_lower = "bdo" ->
282282- (match Attr_utils.get_attr "dir" attrs with
283283- | None ->
284284- Message_collector.add_typed collector (`Misc `Bdo_missing_dir)
285285- | Some v when String.lowercase_ascii (String.trim v) = "auto" ->
286286- Message_collector.add_typed collector (`Misc `Bdo_dir_auto)
287287- | _ -> ())
288288- | None -> ());
260260+ (* Validate bdo element requires dir attribute, and dir cannot be "auto" *)
261261+ if name_lower = "bdo" then begin
262262+ match Attr_utils.get_attr "dir" attrs with
263263+ | None ->
264264+ Message_collector.add_typed collector (`Misc `Bdo_missing_dir)
265265+ | Some v when String.lowercase_ascii (String.trim v) = "auto" ->
266266+ Message_collector.add_typed collector (`Misc `Bdo_dir_auto)
267267+ | _ -> ()
268268+ end;
289269290290- (* Validate input list attribute - only allowed for certain types *)
291291- (match namespace with
292292- | Some _ -> ()
293293- | None when name_lower = "input" ->
294294- if Attr_utils.has_attr "list" attrs then begin
295295- let input_type = Attr_utils.get_attr_or "type" ~default:"text" attrs
296296- |> String.trim |> String.lowercase_ascii in
297297- if not (List.mem input_type input_types_allowing_list) then
298298- Message_collector.add_typed collector (`Input `List_not_allowed)
299299- end
300300- | None -> ());
270270+ (* Validate input list attribute - only allowed for certain types *)
271271+ if name_lower = "input" then begin
272272+ if Attr_utils.has_attr "list" attrs then begin
273273+ let input_type = Attr_utils.get_attr_or "type" ~default:"text" attrs
274274+ |> String.trim |> String.lowercase_ascii in
275275+ if not (List.mem input_type input_types_allowing_list) then
276276+ Message_collector.add_typed collector (`Input `List_not_allowed)
277277+ end
278278+ end;
301279302302- (* Validate data-* attributes *)
303303- (match namespace with
304304- | Some _ -> ()
305305- | None ->
280280+ (* Validate data-* attributes *)
306281 List.iter (fun (attr_name, _) ->
307282 let attr_lower = String.lowercase_ascii attr_name in
308283 (* Check if it starts with "data-" *)
···316291 Message_collector.add_typed collector
317292 (`Attr (`Data_invalid_name (`Reason "must be XML 1.0 4th ed. plus Namespaces NCNames")))
318293 end
319319- ) attrs);
294294+ ) attrs;
320295321321- (* Validate xml:lang must have matching lang attribute - only in HTML mode, not XHTML *)
322322- (match namespace with
323323- | Some _ -> ()
324324- | None when not state.is_xhtml ->
325325- let xmllang_value = Attr_utils.get_attr "xml:lang" attrs in
326326- let lang_value = Attr_utils.get_attr "lang" attrs in
327327- (match xmllang_value with
328328- | Some xmllang ->
329329- (match lang_value with
330330- | None ->
331331- Message_collector.add_typed collector (`I18n `Xml_lang_without_lang)
332332- | Some lang when String.lowercase_ascii lang <> String.lowercase_ascii xmllang ->
333333- Message_collector.add_typed collector (`I18n `Xml_lang_without_lang)
334334- | _ -> ())
335335- | None -> ())
336336- | None -> ());
296296+ (* Validate xml:lang must have matching lang attribute - only in HTML mode, not XHTML *)
297297+ if not state.is_xhtml then begin
298298+ let xmllang_value = Attr_utils.get_attr "xml:lang" attrs in
299299+ let lang_value = Attr_utils.get_attr "lang" attrs in
300300+ match xmllang_value with
301301+ | Some xmllang ->
302302+ (match lang_value with
303303+ | None ->
304304+ Message_collector.add_typed collector (`I18n `Xml_lang_without_lang)
305305+ | Some lang when String.lowercase_ascii lang <> String.lowercase_ascii xmllang ->
306306+ Message_collector.add_typed collector (`I18n `Xml_lang_without_lang)
307307+ | _ -> ())
308308+ | None -> ()
309309+ end;
337310338338- (* Validate spellcheck attribute - must be "true" or "false" or empty *)
339339- (match namespace with
340340- | Some _ -> ()
341341- | None ->
311311+ (* Validate spellcheck attribute - must be "true" or "false" or empty *)
342312 List.iter (fun (attr_name, attr_value) ->
343313 let attr_lower = String.lowercase_ascii attr_name in
344314 if attr_lower = "spellcheck" then begin
···347317 Message_collector.add_typed collector
348318 (`Attr (`Bad_value (`Elem name, `Attr attr_name, `Value attr_value, `Reason "")))
349319 end
350350- ) attrs);
320320+ ) attrs;
351321352352- (* Validate enterkeyhint attribute - must be one of specific values *)
353353- (match namespace with
354354- | Some _ -> ()
355355- | None ->
322322+ (* Validate enterkeyhint attribute - must be one of specific values *)
356323 let valid_enterkeyhint = ["enter"; "done"; "go"; "next"; "previous"; "search"; "send"] in
357324 List.iter (fun (attr_name, attr_value) ->
358325 let attr_lower = String.lowercase_ascii attr_name in
···362329 Message_collector.add_typed collector
363330 (`Attr (`Bad_value (`Elem name, `Attr attr_name, `Value attr_value, `Reason "")))
364331 end
365365- ) attrs);
332332+ ) attrs;
366333367367- (* Validate headingoffset attribute - must be a number between 0 and 8 *)
368368- (match namespace with
369369- | Some _ -> ()
370370- | None ->
334334+ (* Validate headingoffset attribute - must be a number between 0 and 8 *)
371335 List.iter (fun (attr_name, attr_value) ->
372336 let attr_lower = String.lowercase_ascii attr_name in
373337 if attr_lower = "headingoffset" then begin
···383347 if not is_valid then
384348 Message_collector.add_typed collector (`Misc `Headingoffset_invalid)
385349 end
386386- ) attrs);
350350+ ) attrs;
387351388388- (* Validate accesskey attribute - each key label must be a single code point *)
389389- (match namespace with
390390- | Some _ -> ()
391391- | None ->
352352+ (* Validate accesskey attribute - each key label must be a single code point *)
392353 List.iter (fun (attr_name, attr_value) ->
393354 let attr_lower = String.lowercase_ascii attr_name in
394355 if attr_lower = "accesskey" then begin
···433394 in
434395 find_duplicates [] keys
435396 end
436436- ) attrs);
397397+ ) attrs;
437398438438- (* Validate that command and popovertarget cannot have aria-expanded *)
439439- (match namespace with
440440- | Some _ -> ()
441441- | None when name_lower = "button" ->
442442- let has_command = Attr_utils.has_attr "command" attrs in
443443- let has_popovertarget = Attr_utils.has_attr "popovertarget" attrs in
444444- let has_aria_expanded = Attr_utils.has_attr "aria-expanded" attrs in
399399+ (* Validate that command and popovertarget cannot have aria-expanded *)
400400+ if name_lower = "button" then begin
401401+ let has_command = Attr_utils.has_attr "command" attrs in
402402+ let has_popovertarget = Attr_utils.has_attr "popovertarget" attrs in
403403+ let has_aria_expanded = Attr_utils.has_attr "aria-expanded" attrs in
445404446446- if has_command && has_aria_expanded then
447447- Message_collector.add_typed collector
448448- (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name,
449449- `Condition "a \xe2\x80\x9ccommand\xe2\x80\x9d attribute")));
405405+ if has_command && has_aria_expanded then
406406+ Message_collector.add_typed collector
407407+ (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name,
408408+ `Condition "a \xe2\x80\x9ccommand\xe2\x80\x9d attribute")));
450409451451- if has_popovertarget && has_aria_expanded then
452452- Message_collector.add_typed collector
453453- (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name,
454454- `Condition "a \xe2\x80\x9cpopovertarget\xe2\x80\x9d attribute")))
455455- | None -> ());
410410+ if has_popovertarget && has_aria_expanded then
411411+ Message_collector.add_typed collector
412412+ (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name,
413413+ `Condition "a \xe2\x80\x9cpopovertarget\xe2\x80\x9d attribute")))
414414+ end;
456415457457- (* Note: data-* uppercase check requires XML parsing which preserves case.
458458- The HTML5 parser normalizes attribute names to lowercase, so this check
459459- is only effective when the document is parsed as XML.
460460- Commenting out until we have XML parsing support. *)
461461- ignore state.is_xhtml;
416416+ (* Note: data-* uppercase check requires XML parsing which preserves case.
417417+ The HTML5 parser normalizes attribute names to lowercase, so this check
418418+ is only effective when the document is parsed as XML.
419419+ Commenting out until we have XML parsing support. *)
420420+ ignore state.is_xhtml;
462421463463- (* Validate media attribute on link, style, source elements *)
464464- let is_media_element = name_lower = "link" || name_lower = "style" || name_lower = "source" in
465465- (match namespace with
466466- | Some _ -> ()
467467- | None when is_media_element ->
468468- List.iter (fun (attr_name, attr_value) ->
469469- let attr_lower = String.lowercase_ascii attr_name in
470470- if attr_lower = "media" then begin
471471- let trimmed = String.trim attr_value in
472472- if trimmed <> "" then begin
473473- match Dt_media_query.validate_media_query_strict trimmed with
474474- | Ok () -> ()
475475- | Error msg ->
476476- Message_collector.add_typed collector
477477- (`Attr (`Bad_value_generic (`Message (Printf.sprintf
478478- "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad media query: %s"
479479- attr_value attr_name name msg))))
422422+ (* Validate media attribute on link, style, source elements *)
423423+ let is_media_element = name_lower = "link" || name_lower = "style" || name_lower = "source" in
424424+ if is_media_element then begin
425425+ List.iter (fun (attr_name, attr_value) ->
426426+ let attr_lower = String.lowercase_ascii attr_name in
427427+ if attr_lower = "media" then begin
428428+ let trimmed = String.trim attr_value in
429429+ if trimmed <> "" then begin
430430+ match Dt_media_query.validate_media_query_strict trimmed with
431431+ | Ok () -> ()
432432+ | Error msg ->
433433+ Message_collector.add_typed collector
434434+ (`Attr (`Bad_value_generic (`Message (Printf.sprintf
435435+ "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad media query: %s"
436436+ attr_value attr_name name msg))))
437437+ end
480438 end
481481- end
482482- ) attrs
483483- | None -> ());
439439+ ) attrs
440440+ end;
484441485485- (* Validate RDFa prefix attribute - space-separated list of prefix:iri pairs *)
486486- (match namespace with
487487- | Some _ -> ()
488488- | None ->
442442+ (* Validate RDFa prefix attribute - space-separated list of prefix:iri pairs *)
489443 List.iter (fun (attr_name, attr_value) ->
490444 let attr_lower = String.lowercase_ascii attr_name in
491445 if attr_lower = "prefix" then begin
···507461 end
508462 end
509463 end
510510- ) attrs)
464464+ ) attrs
511465512512-let end_element _state ~name:_ ~namespace:_ _collector = ()
466466+ | _ -> () (* Skip non-HTML elements *)
467467+468468+let end_element _state ~tag:_ _collector = ()
513469let characters _state _text _collector = ()
514470let end_document _state _collector = ()
515471
+14-15
lib/htmlrw_check/specialized/base_checker.ml
···1111let reset state =
1212 state.seen_link_or_script <- false
13131414-let start_element state ~name ~namespace ~attrs collector =
1515- match namespace with
1616- | Some _ -> ()
1717- | None ->
1818- match String.lowercase_ascii name with
1919- | "link" | "script" ->
2020- state.seen_link_or_script <- true
2121- | "base" ->
2222- if state.seen_link_or_script then
2323- Message_collector.add_typed collector (`Misc `Base_after_link_script);
2424- (* base element must have href or target attribute *)
2525- if not (Attr_utils.has_attr "href" attrs || Attr_utils.has_attr "target" attrs) then
2626- Message_collector.add_typed collector (`Misc `Base_missing_href_or_target)
2727- | _ -> ()
1414+let start_element state ~element collector =
1515+ match element.Element.tag with
1616+ | Tag.Html (`Link | `Script) ->
1717+ state.seen_link_or_script <- true
1818+ | Tag.Html `Base ->
1919+ if state.seen_link_or_script then
2020+ Message_collector.add_typed collector (`Misc `Base_after_link_script);
2121+ (* base element must have href or target attribute *)
2222+ let has_href = Attr.get_href element.attrs |> Option.is_some in
2323+ let has_target = Attr.exists (function `Target _ -> true | _ -> false) element.attrs in
2424+ if not (has_href || has_target) then
2525+ Message_collector.add_typed collector (`Misc `Base_missing_href_or_target)
2626+ | _ -> ()
28272929-let end_element _state ~name:_ ~namespace:_ _collector = ()
2828+let end_element _state ~tag:_ _collector = ()
3029let characters _state _text _collector = ()
3130let end_document _state _collector = ()
3231
+8-8
lib/htmlrw_check/specialized/datetime_checker.ml
···445445let create () = ()
446446let reset _state = ()
447447448448-let start_element _state ~name ~namespace ~attrs collector =
449449- if namespace <> None then ()
450450- else begin
451451- let name_lower = String.lowercase_ascii name in
452452- if List.mem name_lower datetime_elements then begin
448448+let start_element _state ~element collector =
449449+ match element.Element.tag with
450450+ | Tag.Html tag ->
451451+ let name = Tag.html_tag_to_string tag in
452452+ if List.mem name datetime_elements then begin
453453 (* Check for datetime attribute *)
454454 let datetime_attr = List.find_map (fun (k, v) ->
455455 if String.lowercase_ascii k = "datetime" then Some v else None
456456- ) attrs in
456456+ ) element.raw_attrs in
457457 match datetime_attr with
458458 | None -> ()
459459 | Some value ->
···468468 Message_collector.add_typed collector
469469 (`Generic warn_msg)
470470 end
471471- end
471471+ | _ -> () (* Non-HTML elements don't have datetime attributes *)
472472473473-let end_element _state ~name:_ ~namespace:_ _collector = ()
473473+let end_element _state ~tag:_ _collector = ()
474474let characters _state _text _collector = ()
475475let end_document _state _collector = ()
476476
+152-176
lib/htmlrw_check/specialized/dl_checker.ml
···5757 | ctx :: _ -> Some ctx
5858 | [] -> None
59596060-let start_element state ~name ~namespace ~attrs collector =
6161- let name_lower = String.lowercase_ascii name in
6060+let start_element state ~element collector =
6161+ let name_lower = Tag.tag_to_string element.Element.tag in
62626363 (* Track parent stack for all HTML elements first *)
6464- if namespace = None then
6565- state.parent_stack <- name_lower :: state.parent_stack;
6464+ (match element.tag with
6565+ | Tag.Html _ -> state.parent_stack <- name_lower :: state.parent_stack
6666+ | _ -> ());
66676767- if namespace <> None then ()
6868- else begin
6969- match name_lower with
7070- | "template" ->
7171- state.in_template <- state.in_template + 1;
7272- (* Track if template is direct child of dl *)
7373- begin match current_dl state with
7474- | Some dl_ctx when state.div_in_dl_stack = [] ->
7575- dl_ctx.has_template <- true
7676- | _ -> ()
7777- end
6868+ match element.tag with
6969+ | Tag.Html `Template ->
7070+ state.in_template <- state.in_template + 1;
7171+ (* Track if template is direct child of dl *)
7272+ (match current_dl state with
7373+ | Some dl_ctx when state.div_in_dl_stack = [] ->
7474+ dl_ctx.has_template <- true
7575+ | _ -> ())
78767979- | "dl" when state.in_template = 0 ->
8080- (* Check for nested dl - error if direct child of dl OR inside div-in-dl *)
8181- begin match current_div state with
8282- | Some _ ->
8383- Message_collector.add_typed collector
8484- (`Element (`Not_allowed_as_child (`Child "dl", `Parent "div")))
8585- | None ->
8686- match current_dl state with
8787- | Some _ when state.in_dt_dd = 0 ->
8888- Message_collector.add_typed collector
8989- (`Element (`Not_allowed_as_child (`Child "dl", `Parent "dl")))
9090- | _ -> ()
9191- end;
9292- let ctx = {
9393- has_dt = false;
9494- has_dd = false;
9595- last_was_dt = false;
9696- contains_div = false;
9797- contains_dt_dd = false;
9898- dd_before_dt_error_reported = false;
9999- has_template = false;
100100- } in
101101- state.dl_stack <- ctx :: state.dl_stack
7777+ | Tag.Html `Dl when state.in_template = 0 ->
7878+ (* Check for nested dl - error if direct child of dl OR inside div-in-dl *)
7979+ (match current_div state with
8080+ | Some _ ->
8181+ Message_collector.add_typed collector
8282+ (`Element (`Not_allowed_as_child (`Child "dl", `Parent "div")))
8383+ | None ->
8484+ match current_dl state with
8585+ | Some _ when state.in_dt_dd = 0 ->
8686+ Message_collector.add_typed collector
8787+ (`Element (`Not_allowed_as_child (`Child "dl", `Parent "dl")))
8888+ | _ -> ());
8989+ let ctx = {
9090+ has_dt = false;
9191+ has_dd = false;
9292+ last_was_dt = false;
9393+ contains_div = false;
9494+ contains_dt_dd = false;
9595+ dd_before_dt_error_reported = false;
9696+ has_template = false;
9797+ } in
9898+ state.dl_stack <- ctx :: state.dl_stack
10299103103- | "div" when state.in_template = 0 ->
104104- begin match current_dl state with
105105- | Some dl_ctx when state.div_in_dl_stack = [] ->
106106- (* Direct div child of dl *)
107107- dl_ctx.contains_div <- true;
108108- (* Check for mixed content - if we already have dt/dd, div is not allowed *)
109109- if dl_ctx.contains_dt_dd then
110110- Message_collector.add_typed collector
111111- (`Element (`Not_allowed_as_child (`Child "div", `Parent "dl")));
112112- (* Check that role is only presentation or none *)
113113- (match Attr_utils.get_attr "role" attrs with
100100+ | Tag.Html `Div when state.in_template = 0 ->
101101+ (match current_dl state with
102102+ | Some dl_ctx when state.div_in_dl_stack = [] ->
103103+ dl_ctx.contains_div <- true;
104104+ if dl_ctx.contains_dt_dd then
105105+ Message_collector.add_typed collector
106106+ (`Element (`Not_allowed_as_child (`Child "div", `Parent "dl")));
107107+ (match Attr.get_role element.attrs with
114108 | Some role_value ->
115109 let role_lower = String.lowercase_ascii (String.trim role_value) in
116110 if role_lower <> "presentation" && role_lower <> "none" then
117111 Message_collector.add_typed collector (`Li_role `Div_in_dl_bad_role)
118112 | None -> ());
119119- let div_ctx = { has_dt = false; has_dd = false; group_count = 0; in_dd_part = false } in
120120- state.div_in_dl_stack <- div_ctx :: state.div_in_dl_stack
121121- | Some _ when state.div_in_dl_stack <> [] ->
122122- Message_collector.add_typed collector
123123- (`Element (`Not_allowed_as_child (`Child "div", `Parent "div")))
124124- | _ -> ()
125125- end
126126-127127- | "dt" when state.in_template = 0 ->
128128- state.in_dt_dd <- state.in_dt_dd + 1;
129129- begin match current_div state with
130130- | Some div_ctx ->
131131- (* If we've already seen dd, this dt starts a new group - which is not allowed *)
132132- if div_ctx.in_dd_part then begin
133133- Message_collector.add_typed collector
134134- (`Element (`Not_allowed_as_child (`Child "dt", `Parent "div")));
135135- div_ctx.group_count <- div_ctx.group_count + 1;
136136- div_ctx.in_dd_part <- false
137137- end;
138138- div_ctx.has_dt <- true
139139- | None ->
140140- match current_dl state with
141141- | Some dl_ctx ->
142142- dl_ctx.has_dt <- true;
143143- dl_ctx.last_was_dt <- true;
144144- dl_ctx.contains_dt_dd <- true;
145145- (* Check for mixed content - if we already have div, dt is not allowed *)
146146- if dl_ctx.contains_div then
147147- Message_collector.add_typed collector
148148- (`Element (`Not_allowed_as_child (`Child "dt", `Parent "dl")))
149149- | None ->
150150- (* dt outside dl context - error *)
151151- let parent = match current_parent state with
152152- | Some p -> p
153153- | None -> "document"
154154- in
155155- Message_collector.add_typed collector
156156- (`Element (`Not_allowed_as_child (`Child "dt", `Parent parent)))
157157- end
113113+ let div_ctx = { has_dt = false; has_dd = false; group_count = 0; in_dd_part = false } in
114114+ state.div_in_dl_stack <- div_ctx :: state.div_in_dl_stack
115115+ | Some _ when state.div_in_dl_stack <> [] ->
116116+ Message_collector.add_typed collector
117117+ (`Element (`Not_allowed_as_child (`Child "div", `Parent "div")))
118118+ | _ -> ())
158119159159- | "dd" when state.in_template = 0 ->
160160- state.in_dt_dd <- state.in_dt_dd + 1;
161161- begin match current_div state with
162162- | Some div_ctx ->
163163- div_ctx.has_dd <- true;
164164- (* First dd after dt(s) completes the first group *)
165165- if not div_ctx.in_dd_part then begin
166166- div_ctx.in_dd_part <- true;
167167- div_ctx.group_count <- div_ctx.group_count + 1
168168- end
169169- | None ->
170170- match current_dl state with
171171- | Some dl_ctx ->
172172- (* Check if dd appears before any dt - only report once per dl *)
173173- if not dl_ctx.has_dt && not dl_ctx.dd_before_dt_error_reported then begin
174174- dl_ctx.dd_before_dt_error_reported <- true;
175175- Message_collector.add_typed collector
176176- (`Element (`Missing_child_generic (`Parent "dl")))
177177- end;
178178- dl_ctx.has_dd <- true;
179179- dl_ctx.last_was_dt <- false;
180180- dl_ctx.contains_dt_dd <- true;
181181- (* Check for mixed content *)
182182- if dl_ctx.contains_div then
183183- Message_collector.add_typed collector
184184- (`Element (`Not_allowed_as_child (`Child "dd", `Parent "dl")))
185185- | None ->
186186- (* dd outside dl context - error *)
187187- let parent = match current_parent state with
188188- | Some p -> p
189189- | None -> "document"
190190- in
191191- Message_collector.add_typed collector
192192- (`Element (`Not_allowed_as_child (`Child "dd", `Parent parent)))
193193- end
120120+ | Tag.Html `Dt when state.in_template = 0 ->
121121+ state.in_dt_dd <- state.in_dt_dd + 1;
122122+ (match current_div state with
123123+ | Some div_ctx ->
124124+ if div_ctx.in_dd_part then begin
125125+ Message_collector.add_typed collector
126126+ (`Element (`Not_allowed_as_child (`Child "dt", `Parent "div")));
127127+ div_ctx.group_count <- div_ctx.group_count + 1;
128128+ div_ctx.in_dd_part <- false
129129+ end;
130130+ div_ctx.has_dt <- true
131131+ | None ->
132132+ match current_dl state with
133133+ | Some dl_ctx ->
134134+ dl_ctx.has_dt <- true;
135135+ dl_ctx.last_was_dt <- true;
136136+ dl_ctx.contains_dt_dd <- true;
137137+ if dl_ctx.contains_div then
138138+ Message_collector.add_typed collector
139139+ (`Element (`Not_allowed_as_child (`Child "dt", `Parent "dl")))
140140+ | None ->
141141+ let parent = match current_parent state with
142142+ | Some p -> p
143143+ | None -> "document"
144144+ in
145145+ Message_collector.add_typed collector
146146+ (`Element (`Not_allowed_as_child (`Child "dt", `Parent parent))))
194147195195- | _ -> ()
196196- end
148148+ | Tag.Html `Dd when state.in_template = 0 ->
149149+ state.in_dt_dd <- state.in_dt_dd + 1;
150150+ (match current_div state with
151151+ | Some div_ctx ->
152152+ div_ctx.has_dd <- true;
153153+ if not div_ctx.in_dd_part then begin
154154+ div_ctx.in_dd_part <- true;
155155+ div_ctx.group_count <- div_ctx.group_count + 1
156156+ end
157157+ | None ->
158158+ match current_dl state with
159159+ | Some dl_ctx ->
160160+ if not dl_ctx.has_dt && not dl_ctx.dd_before_dt_error_reported then begin
161161+ dl_ctx.dd_before_dt_error_reported <- true;
162162+ Message_collector.add_typed collector
163163+ (`Element (`Missing_child_generic (`Parent "dl")))
164164+ end;
165165+ dl_ctx.has_dd <- true;
166166+ dl_ctx.last_was_dt <- false;
167167+ dl_ctx.contains_dt_dd <- true;
168168+ if dl_ctx.contains_div then
169169+ Message_collector.add_typed collector
170170+ (`Element (`Not_allowed_as_child (`Child "dd", `Parent "dl")))
171171+ | None ->
172172+ let parent = match current_parent state with
173173+ | Some p -> p
174174+ | None -> "document"
175175+ in
176176+ Message_collector.add_typed collector
177177+ (`Element (`Not_allowed_as_child (`Child "dd", `Parent parent))))
197178198198-let end_element state ~name ~namespace collector =
199199- if namespace <> None then ()
200200- else begin
201201- let name_lower = String.lowercase_ascii name in
179179+ | _ -> ()
202180181181+let end_element state ~tag collector =
182182+ match tag with
183183+ | Tag.Html _ ->
203184 (* Pop from parent stack *)
204185 (match state.parent_stack with
205205- | _ :: rest -> state.parent_stack <- rest
206206- | [] -> ());
186186+ | _ :: rest -> state.parent_stack <- rest
187187+ | [] -> ());
207188208208- match name_lower with
209209- | "template" ->
210210- state.in_template <- max 0 (state.in_template - 1)
189189+ (match tag with
190190+ | Tag.Html `Template ->
191191+ state.in_template <- max 0 (state.in_template - 1)
211192212212- | "dt" | "dd" when state.in_template = 0 ->
213213- state.in_dt_dd <- max 0 (state.in_dt_dd - 1)
193193+ | Tag.Html (`Dt | `Dd) when state.in_template = 0 ->
194194+ state.in_dt_dd <- max 0 (state.in_dt_dd - 1)
214195215215- | "dl" when state.in_template = 0 ->
216216- begin match state.dl_stack with
217217- | ctx :: rest ->
218218- state.dl_stack <- rest;
219219- (* Check dl content model at end *)
220220- if ctx.contains_dt_dd then begin
221221- (* Direct dt/dd content - must have both *)
222222- if not ctx.has_dt && not ctx.dd_before_dt_error_reported then
223223- Message_collector.add_typed collector
224224- (`Element (`Missing_child_generic (`Parent "dl")))
225225- else if not ctx.has_dd then begin
226226- if ctx.has_template then
196196+ | Tag.Html `Dl when state.in_template = 0 ->
197197+ (match state.dl_stack with
198198+ | ctx :: rest ->
199199+ state.dl_stack <- rest;
200200+ if ctx.contains_dt_dd then begin
201201+ if not ctx.has_dt && not ctx.dd_before_dt_error_reported then
227202 Message_collector.add_typed collector
228228- (`Element (`Missing_child_one_of (`Parent "dl", `Children ["dd"])))
229229- else
203203+ (`Element (`Missing_child_generic (`Parent "dl")))
204204+ else if not ctx.has_dd then begin
205205+ if ctx.has_template then
206206+ Message_collector.add_typed collector
207207+ (`Element (`Missing_child_one_of (`Parent "dl", `Children ["dd"])))
208208+ else
209209+ Message_collector.add_typed collector
210210+ (`Element (`Missing_child (`Parent "dl", `Child "dd")))
211211+ end
212212+ else if ctx.last_was_dt then
230213 Message_collector.add_typed collector
231214 (`Element (`Missing_child (`Parent "dl", `Child "dd")))
232232- end
233233- else if ctx.last_was_dt then
234234- Message_collector.add_typed collector
235235- (`Element (`Missing_child (`Parent "dl", `Child "dd")))
236236- end else if not ctx.contains_div && not ctx.has_dt && not ctx.has_dd then
237237- ()
238238- | [] -> ()
239239- end
215215+ end else if not ctx.contains_div && not ctx.has_dt && not ctx.has_dd then
216216+ ()
217217+ | [] -> ())
240218241241- | "div" when state.in_template = 0 ->
242242- begin match state.div_in_dl_stack with
243243- | div_ctx :: rest ->
244244- state.div_in_dl_stack <- rest;
245245- (* Check div in dl must have both dt and dd *)
246246- if not div_ctx.has_dt && not div_ctx.has_dd then
247247- Message_collector.add_typed collector
248248- (`Element (`Missing_child (`Parent "div", `Child "dd")))
249249- else if not div_ctx.has_dt then
250250- Message_collector.add_typed collector
251251- (`Element (`Missing_child (`Parent "div", `Child "dt")))
252252- else if not div_ctx.has_dd then
253253- Message_collector.add_typed collector
254254- (`Element (`Missing_child (`Parent "div", `Child "dd")))
255255- | [] -> ()
256256- end
219219+ | Tag.Html `Div when state.in_template = 0 ->
220220+ (match state.div_in_dl_stack with
221221+ | div_ctx :: rest ->
222222+ state.div_in_dl_stack <- rest;
223223+ if not div_ctx.has_dt && not div_ctx.has_dd then
224224+ Message_collector.add_typed collector
225225+ (`Element (`Missing_child (`Parent "div", `Child "dd")))
226226+ else if not div_ctx.has_dt then
227227+ Message_collector.add_typed collector
228228+ (`Element (`Missing_child (`Parent "div", `Child "dt")))
229229+ else if not div_ctx.has_dd then
230230+ Message_collector.add_typed collector
231231+ (`Element (`Missing_child (`Parent "div", `Child "dd")))
232232+ | [] -> ())
257233258258- | _ -> ()
259259- end
234234+ | _ -> ())
235235+ | _ -> ()
260236261237let characters state text collector =
262238 if state.in_template > 0 then ()
+12-11
lib/htmlrw_check/specialized/h1_checker.ml
···1414 state.h1_count <- 0;
1515 state.svg_depth <- 0
16161717-let start_element state ~name ~namespace ~attrs collector =
1818- ignore attrs;
1919- let name_lower = String.lowercase_ascii name in
1717+let start_element state ~element collector =
2018 (* Track SVG depth - h1 inside SVG (foreignObject, desc) shouldn't count *)
2121- if name_lower = "svg" then
1919+ match element.Element.tag with
2020+ | Tag.Svg _ ->
2221 state.svg_depth <- state.svg_depth + 1
2323- else if namespace <> None || state.svg_depth > 0 then
2424- () (* Skip non-HTML namespace or inside SVG *)
2525- else if name_lower = "h1" then begin
2222+ | Tag.Html `H1 when state.svg_depth = 0 ->
2623 state.h1_count <- state.h1_count + 1;
2724 if state.h1_count > 1 then
2825 Message_collector.add_typed collector (`Misc `Multiple_h1)
2929- end
2626+ | Tag.Html _ when state.svg_depth = 0 ->
2727+ () (* Other HTML elements outside SVG *)
2828+ | _ ->
2929+ () (* Non-HTML or inside SVG *)
30303131-let end_element state ~name ~namespace:_ _collector =
3232- let name_lower = String.lowercase_ascii name in
3333- if name_lower = "svg" && state.svg_depth > 0 then
3131+let end_element state ~tag _collector =
3232+ match tag with
3333+ | Tag.Svg _ when state.svg_depth > 0 ->
3434 state.svg_depth <- state.svg_depth - 1
3535+ | _ -> ()
35363637let characters _state _text _collector = ()
3738let end_document _state _collector = ()
+48-59
lib/htmlrw_check/specialized/heading_checker.ml
···1212 mutable h1_count : int;
1313 mutable has_any_heading : bool;
1414 mutable first_heading_checked : bool;
1515- mutable in_heading : string option;
1515+ mutable in_heading : Tag.html_tag option;
1616 mutable heading_has_text : bool;
1717}
1818···3434 state.in_heading <- None;
3535 state.heading_has_text <- false
36363737-(** Extract heading level from tag name (e.g., "h1" -> 1). *)
3838-let heading_level name =
3939- match String.lowercase_ascii name with
4040- | "h1" -> Some 1
4141- | "h2" -> Some 2
4242- | "h3" -> Some 3
4343- | "h4" -> Some 4
4444- | "h5" -> Some 5
4545- | "h6" -> Some 6
4646- | _ -> None
4747-4837(** Check if text is effectively empty (only whitespace). *)
4938let is_empty_text text =
5039 let rec check i =
···5746 in
5847 check 0
59486060-let start_element state ~name ~namespace:_ ~attrs:_ collector =
6161- match heading_level name with
6262- | Some level ->
6363- state.has_any_heading <- true;
4949+let start_element state ~element collector =
5050+ match element.Element.tag with
5151+ | Tag.Html (#Tag.heading_tag as h) ->
5252+ let level = match Tag.heading_level h with Some l -> l | None -> 0 in
5353+ let name = Tag.html_tag_to_string h in
5454+ state.has_any_heading <- true;
64556565- (* Check if this is the first heading *)
6666- if not state.first_heading_checked then begin
6767- state.first_heading_checked <- true;
6868- if level <> 1 then
5656+ (* Check if this is the first heading *)
5757+ if not state.first_heading_checked then begin
5858+ state.first_heading_checked <- true;
5959+ if level <> 1 then
6060+ Message_collector.add_typed collector
6161+ (`Generic (Printf.sprintf
6262+ "First heading in document is <%s>, should typically be <h1>" name))
6363+ end;
6464+6565+ (* Track h1 count *)
6666+ if level = 1 then begin
6767+ state.h1_count <- state.h1_count + 1;
6868+ if state.h1_count > 1 then
6969+ Message_collector.add_typed collector (`Misc `Multiple_h1)
7070+ end;
7171+7272+ (* Check for skipped levels *)
7373+ begin match state.current_level with
7474+ | None ->
7575+ state.current_level <- Some level
7676+ | Some prev_level ->
7777+ let diff = level - prev_level in
7878+ if diff > 1 then
6979 Message_collector.add_typed collector
7080 (`Generic (Printf.sprintf
7171- "First heading in document is <%s>, should typically be <h1>" name))
7272- end;
8181+ "Heading level skipped: <%s> follows <h%d>, skipping %d level%s. This can confuse screen reader users"
8282+ name prev_level (diff - 1) (if diff > 2 then "s" else "")));
8383+ state.current_level <- Some level
8484+ end;
73857474- (* Track h1 count *)
7575- if level = 1 then begin
7676- state.h1_count <- state.h1_count + 1;
7777- if state.h1_count > 1 then
7878- Message_collector.add_typed collector (`Misc `Multiple_h1)
7979- end;
8686+ (* Track that we're in a heading to check for empty content *)
8787+ state.in_heading <- Some h;
8888+ state.heading_has_text <- false
8989+ | _ -> ()
80908181- (* Check for skipped levels *)
8282- begin match state.current_level with
8383- | None ->
8484- state.current_level <- Some level
8585- | Some prev_level ->
8686- let diff = level - prev_level in
8787- if diff > 1 then
8888- Message_collector.add_typed collector
8989- (`Generic (Printf.sprintf
9090- "Heading level skipped: <%s> follows <h%d>, skipping %d level%s. This can confuse screen reader users"
9191- name prev_level (diff - 1) (if diff > 2 then "s" else "")));
9292- state.current_level <- Some level
9393- end;
9494-9595- (* Track that we're in a heading to check for empty content *)
9696- state.in_heading <- Some name;
9797- state.heading_has_text <- false
9898-9999- | None ->
100100- (* Not a heading element *)
101101- ()
102102-103103-let end_element state ~name ~namespace:_ collector =
104104- match state.in_heading with
105105- | Some heading when heading = name ->
106106- if not state.heading_has_text then
107107- Message_collector.add_typed collector
108108- (`Generic (Printf.sprintf
109109- "Heading <%s> is empty or contains only whitespace. Empty headings are problematic for screen readers" name));
110110- state.in_heading <- None;
111111- state.heading_has_text <- false
9191+let end_element state ~tag collector =
9292+ match state.in_heading, tag with
9393+ | Some h, Tag.Html h2 when h = h2 ->
9494+ if not state.heading_has_text then
9595+ Message_collector.add_typed collector
9696+ (`Generic (Printf.sprintf
9797+ "Heading <%s> is empty or contains only whitespace. Empty headings are problematic for screen readers"
9898+ (Tag.html_tag_to_string h)));
9999+ state.in_heading <- None;
100100+ state.heading_has_text <- false
112101 | _ -> ()
113102114103let characters state text _collector =
+23-29
lib/htmlrw_check/specialized/importmap_checker.ml
···265265266266 List.rev !errors
267267268268-let start_element state ~name ~namespace ~attrs _collector =
269269- if namespace <> None then ()
270270- else begin
271271- let name_lower = String.lowercase_ascii name in
272272- if name_lower = "script" then begin
273273- (* Check if type="importmap" *)
274274- let type_attr = List.find_opt (fun (n, _) ->
275275- String.lowercase_ascii n = "type"
276276- ) attrs in
277277- match type_attr with
278278- | Some (_, v) when String.lowercase_ascii v = "importmap" ->
279279- state.in_importmap <- true;
280280- Buffer.clear state.content
281281- | _ -> ()
282282- end
283283- end
268268+let start_element state ~element _collector =
269269+ match element.Element.tag with
270270+ | Tag.Html `Script ->
271271+ (* Check if type="importmap" *)
272272+ let type_attr = List.find_opt (fun (n, _) ->
273273+ String.lowercase_ascii n = "type"
274274+ ) element.raw_attrs in
275275+ (match type_attr with
276276+ | Some (_, v) when String.lowercase_ascii v = "importmap" ->
277277+ state.in_importmap <- true;
278278+ Buffer.clear state.content
279279+ | _ -> ())
280280+ | _ -> () (* Only script elements can be importmaps *)
284281285282let error_to_typed = function
286283 | InvalidJSON _ -> `Importmap `Invalid_json
···295292 | InvalidScopeValue _ -> `Importmap `Scopes_value_invalid_url
296293 | ScopeValueNotObject -> `Importmap `Scopes_values_not_object
297294298298-let end_element state ~name ~namespace collector =
299299- if namespace <> None then ()
300300- else begin
301301- let name_lower = String.lowercase_ascii name in
302302- if name_lower = "script" && state.in_importmap then begin
303303- let content = Buffer.contents state.content in
304304- let errors = validate_importmap content in
305305- List.iter (fun err ->
306306- Message_collector.add_typed collector (error_to_typed err)
307307- ) errors;
308308- state.in_importmap <- false
309309- end
310310- end
295295+let end_element state ~tag collector =
296296+ match tag with
297297+ | Tag.Html `Script when state.in_importmap ->
298298+ let content = Buffer.contents state.content in
299299+ let errors = validate_importmap content in
300300+ List.iter (fun err ->
301301+ Message_collector.add_typed collector (error_to_typed err)
302302+ ) errors;
303303+ state.in_importmap <- false
304304+ | _ -> ()
311305312306let characters state text _collector =
313307 if state.in_importmap then
+37-40
lib/htmlrw_check/specialized/label_checker.ml
···5050 state.labels_for <- [];
5151 state.labelable_ids <- []
52525353-let start_element state ~name ~namespace ~attrs collector =
5454- if namespace <> None then ()
5555- else begin
5656- let name_lower = String.lowercase_ascii name in
5353+let start_element state ~element collector =
5454+ match element.Element.tag with
5555+ | Tag.Html `Label ->
5656+ state.in_label <- true;
5757+ state.label_depth <- 1; (* Start at 1 for the label element itself *)
5858+ state.labelable_count <- 0;
5959+ let for_value = get_attr element.raw_attrs "for" in
6060+ let has_role = get_attr element.raw_attrs "role" <> None in
6161+ let has_aria_label = get_attr element.raw_attrs "aria-label" <> None in
6262+ state.label_for_value <- for_value;
6363+ state.label_has_role <- has_role;
6464+ state.label_has_aria_label <- has_aria_label;
6565+ (* Track this label if it has for= and role/aria-label *)
6666+ (match for_value with
6767+ | Some target when has_role || has_aria_label ->
6868+ state.labels_for <- { for_target = target; has_role; has_aria_label } :: state.labels_for
6969+ | _ -> ())
7070+7171+ | Tag.Html tag ->
7272+ let name_lower = String.lowercase_ascii (Tag.tag_to_string (Tag.Html tag)) in
57735858- if name_lower = "label" then begin
5959- state.in_label <- true;
6060- state.label_depth <- 1; (* Start at 1 for the label element itself *)
6161- state.labelable_count <- 0;
6262- let for_value = get_attr attrs "for" in
6363- let has_role = get_attr attrs "role" <> None in
6464- let has_aria_label = get_attr attrs "aria-label" <> None in
6565- state.label_for_value <- for_value;
6666- state.label_has_role <- has_role;
6767- state.label_has_aria_label <- has_aria_label;
6868- (* Track this label if it has for= and role/aria-label *)
6969- (match for_value with
7070- | Some target when has_role || has_aria_label ->
7171- state.labels_for <- { for_target = target; has_role; has_aria_label } :: state.labels_for
7272- | _ -> ())
7373- end;
7474 (* Track labelable element IDs *)
7575 (if List.mem name_lower labelable_elements then
7676- match get_attr attrs "id" with
7676+ match get_attr element.raw_attrs "id" with
7777 | Some id -> state.labelable_ids <- id :: state.labelable_ids
7878 | None -> ());
79798080- if state.in_label && name_lower <> "label" then begin
8080+ if state.in_label then begin
8181 state.label_depth <- state.label_depth + 1;
82828383 (* Check for labelable elements inside label *)
···8989 (* Check if label has for attribute and descendant has mismatched id *)
9090 (match state.label_for_value with
9191 | Some for_value ->
9292- let descendant_id = get_attr attrs "id" in
9292+ let descendant_id = get_attr element.raw_attrs "id" in
9393 (match descendant_id with
9494 | None ->
9595 Message_collector.add_typed collector (`Label `For_id_mismatch)
···9999 | None -> ())
100100 end
101101 end
102102- end
103102104104-let end_element state ~name ~namespace collector =
105105- if namespace <> None then ()
106106- else begin
107107- let name_lower = String.lowercase_ascii name in
103103+ | _ -> () (* Non-HTML elements (SVG, MathML, etc.) *)
108104109109- if state.in_label then begin
110110- state.label_depth <- state.label_depth - 1;
105105+let end_element state ~tag collector =
106106+ if state.in_label then begin
107107+ state.label_depth <- state.label_depth - 1;
111108112112- if name_lower = "label" && state.label_depth = 0 then begin
113113- if state.label_has_role && state.labelable_count > 0 then
114114- Message_collector.add_typed collector (`Label `Role_on_ancestor);
115115- state.in_label <- false;
116116- state.labelable_count <- 0;
117117- state.label_for_value <- None;
118118- state.label_has_role <- false;
119119- state.label_has_aria_label <- false
120120- end
121121- end
109109+ match tag with
110110+ | Tag.Html `Label when state.label_depth = 0 ->
111111+ if state.label_has_role && state.labelable_count > 0 then
112112+ Message_collector.add_typed collector (`Label `Role_on_ancestor);
113113+ state.in_label <- false;
114114+ state.labelable_count <- 0;
115115+ state.label_for_value <- None;
116116+ state.label_has_role <- false;
117117+ state.label_has_aria_label <- false
118118+ | _ -> ()
122119 end
123120124121let characters _state _text _collector = ()
···270270 let all_nodes = Hashtbl.to_seq_keys graph |> List.of_seq in
271271 check_all_nodes [] all_nodes
272272273273-let start_element state ~name ~namespace:_ ~attrs collector =
273273+let start_element state ~element collector =
274274+ let name = Tag.tag_to_string element.Element.tag in
275275+ let attrs = element.raw_attrs in
274276 let location = None in
275277 track_id state attrs;
276278 process_microdata_attrs state ~element:name ~attrs ~location collector
277279278278-let end_element state ~name ~namespace:_ _collector =
280280+let end_element state ~tag _collector =
281281+ let name = Tag.tag_to_string tag in
279282 (* Pop itemscope from stack if this element had one *)
280283 match state.scope_stack with
281284 | scope :: rest when scope.element = name ->
+9-8
lib/htmlrw_check/specialized/mime_type_checker.ml
···156156 if String.lowercase_ascii k = String.lowercase_ascii name then Some v else None
157157 ) attrs
158158159159-let start_element _state ~name ~namespace ~attrs collector =
160160- if namespace <> None then ()
161161- else begin
159159+let start_element _state ~element collector =
160160+ match element.Element.tag with
161161+ | Tag.Html tag ->
162162+ let name = Tag.html_tag_to_string tag in
162163 let name_lower = String.lowercase_ascii name in
163163- match List.assoc_opt name_lower mime_type_attrs with
164164+ (match List.assoc_opt name_lower mime_type_attrs with
164165 | None -> ()
165166 | Some type_attrs ->
166167 List.iter (fun attr_name ->
167167- match get_attr_value attr_name attrs with
168168+ match get_attr_value attr_name element.raw_attrs with
168169 | None -> ()
169170 | Some value ->
170171 (* Don't validate empty type attributes or special script types *)
···186187 | Some err ->
187188 Message_collector.add_typed collector
188189 (`Attr (`Bad_value_generic (`Message err)))
189189- ) type_attrs
190190- end
190190+ ) type_attrs)
191191+ | _ -> () (* Non-HTML elements don't have MIME type checks *)
191192192192-let end_element _state ~name:_ ~namespace:_ _collector = ()
193193+let end_element _state ~tag:_ _collector = ()
193194let characters _state _text _collector = ()
194195let end_document _state _collector = ()
195196
···4040 if end_pos = len then s
4141 else String.sub s 0 end_pos
42424343-let start_element _state ~name:_ ~namespace:_ ~attrs:_ _collector = ()
4343+let start_element _state ~element:_ _collector = ()
44444545-let end_element _state ~name:_ ~namespace:_ _collector = ()
4545+let end_element _state ~tag:_ _collector = ()
46464747let characters _state text collector =
4848 (* Skip empty text or whitespace-only text *)
+90-91
lib/htmlrw_check/specialized/picture_checker.ml
···9393let check_img_attrs attrs collector =
9494 check_disallowed_attrs "img" disallowed_img_attrs attrs collector
95959696-let start_element state ~name ~namespace ~attrs collector =
9797- let name_lower = String.lowercase_ascii name in
9696+let start_element state ~element collector =
9797+ let name_lower = Tag.tag_to_string element.Element.tag in
9898+ let attrs = element.raw_attrs in
989999100 (* Check for disallowed children of picture first - even foreign content *)
100101 if state.in_picture && state.picture_depth = 1 then begin
···103104 end;
104105105106 (* Rest of checks only apply to HTML namespace elements *)
106106- match namespace with
107107- | Some _ -> ()
108108- | None ->
109109- (match name_lower with
110110- | "picture" ->
111111- (* Check if picture is in a disallowed parent context *)
112112- (match state.parent_stack with
113113- | parent :: _ when List.mem parent disallowed_picture_parents ->
114114- Message_collector.add_typed collector
115115- (`Element (`Not_allowed_as_child (`Child "picture", `Parent parent)))
116116- | _ -> ());
117117- check_picture_attrs attrs collector;
118118- state.in_picture <- true;
119119- state.has_img_in_picture <- false;
120120- state.picture_depth <- 0;
121121- state.children_in_picture <- [];
122122- state.last_was_img <- false;
123123- state.has_source_after_img <- false;
124124- state.has_always_matching_source <- false;
125125- state.source_after_always_matching <- false
107107+ (match element.tag with
108108+ | Tag.Html `Picture ->
109109+ (* Check if picture is in a disallowed parent context *)
110110+ (match state.parent_stack with
111111+ | parent :: _ when List.mem parent disallowed_picture_parents ->
112112+ Message_collector.add_typed collector
113113+ (`Element (`Not_allowed_as_child (`Child "picture", `Parent parent)))
114114+ | _ -> ());
115115+ check_picture_attrs attrs collector;
116116+ state.in_picture <- true;
117117+ state.has_img_in_picture <- false;
118118+ state.picture_depth <- 0;
119119+ state.children_in_picture <- [];
120120+ state.last_was_img <- false;
121121+ state.has_source_after_img <- false;
122122+ state.has_always_matching_source <- false;
123123+ state.source_after_always_matching <- false
126124127127- | "source" when state.in_picture && state.picture_depth = 1 ->
128128- check_source_attrs_in_picture attrs collector;
129129- state.children_in_picture <- "source" :: state.children_in_picture;
130130- if state.last_was_img then
131131- state.has_source_after_img <- true;
132132- if state.has_always_matching_source then
133133- state.source_after_always_matching <- true;
134134- (* A source is "always matching" if it has no media/type, or media="" or media="all" *)
135135- let media_value = Attr_utils.get_attr "media" attrs in
136136- let has_type = Attr_utils.has_attr "type" attrs in
137137- let is_media_all = match media_value with
138138- | Some v -> String.lowercase_ascii (String.trim v) = "all"
139139- | None -> false in
140140- let is_media_empty = match media_value with
141141- | Some v -> String.trim v = ""
142142- | None -> false in
143143- let is_always_matching = match media_value with
144144- | None -> not has_type
145145- | Some v ->
146146- let trimmed = String.trim v in
147147- trimmed = "" || String.lowercase_ascii trimmed = "all"
148148- in
149149- if is_always_matching then begin
150150- state.has_always_matching_source <- true;
151151- (* Only set flags to true, never reset to false *)
152152- if is_media_all then state.always_matching_is_media_all <- true;
153153- if is_media_empty then state.always_matching_is_media_empty <- true
154154- end
125125+ | Tag.Html `Source when state.in_picture && state.picture_depth = 1 ->
126126+ check_source_attrs_in_picture attrs collector;
127127+ state.children_in_picture <- "source" :: state.children_in_picture;
128128+ if state.last_was_img then
129129+ state.has_source_after_img <- true;
130130+ if state.has_always_matching_source then
131131+ state.source_after_always_matching <- true;
132132+ (* A source is "always matching" if it has no media/type, or media="" or media="all" *)
133133+ let media_value = Attr_utils.get_attr "media" attrs in
134134+ let has_type = Attr_utils.has_attr "type" attrs in
135135+ let is_media_all = match media_value with
136136+ | Some v -> String.lowercase_ascii (String.trim v) = "all"
137137+ | None -> false in
138138+ let is_media_empty = match media_value with
139139+ | Some v -> String.trim v = ""
140140+ | None -> false in
141141+ let is_always_matching = match media_value with
142142+ | None -> not has_type
143143+ | Some v ->
144144+ let trimmed = String.trim v in
145145+ trimmed = "" || String.lowercase_ascii trimmed = "all"
146146+ in
147147+ if is_always_matching then begin
148148+ state.has_always_matching_source <- true;
149149+ if is_media_all then state.always_matching_is_media_all <- true;
150150+ if is_media_empty then state.always_matching_is_media_empty <- true
151151+ end
155152156156- | "img" when state.in_picture && state.picture_depth = 1 ->
157157- check_img_attrs attrs collector;
158158- state.has_img_in_picture <- true;
159159- state.children_in_picture <- "img" :: state.children_in_picture;
160160- state.last_was_img <- true;
161161- let img_count = List.length (List.filter (( = ) "img") state.children_in_picture) in
162162- if img_count > 1 then
163163- report_disallowed_child "picture" "img" collector;
164164- if state.has_always_matching_source && Attr_utils.has_attr "srcset" attrs then
165165- Message_collector.add_typed collector
166166- (if state.always_matching_is_media_all then `Misc `Media_all
167167- else if state.always_matching_is_media_empty then `Misc `Media_empty
168168- else `Srcset `Source_needs_media_or_type)
153153+ | Tag.Html `Img when state.in_picture && state.picture_depth = 1 ->
154154+ check_img_attrs attrs collector;
155155+ state.has_img_in_picture <- true;
156156+ state.children_in_picture <- "img" :: state.children_in_picture;
157157+ state.last_was_img <- true;
158158+ let img_count = List.length (List.filter (( = ) "img") state.children_in_picture) in
159159+ if img_count > 1 then
160160+ report_disallowed_child "picture" "img" collector;
161161+ if state.has_always_matching_source && Attr_utils.has_attr "srcset" attrs then
162162+ Message_collector.add_typed collector
163163+ (if state.always_matching_is_media_all then `Misc `Media_all
164164+ else if state.always_matching_is_media_empty then `Misc `Media_empty
165165+ else `Srcset `Source_needs_media_or_type)
169166170170- | "script" when state.in_picture && state.picture_depth = 1 ->
171171- state.children_in_picture <- "script" :: state.children_in_picture
167167+ | Tag.Html `Script when state.in_picture && state.picture_depth = 1 ->
168168+ state.children_in_picture <- "script" :: state.children_in_picture
172169173173- | "template" when state.in_picture && state.picture_depth = 1 ->
174174- state.children_in_picture <- "template" :: state.children_in_picture
170170+ | Tag.Html `Template when state.in_picture && state.picture_depth = 1 ->
171171+ state.children_in_picture <- "template" :: state.children_in_picture
175172176176- | "img" ->
177177- check_img_attrs attrs collector
173173+ | Tag.Html `Img ->
174174+ check_img_attrs attrs collector
178175179179- | _ -> ());
176176+ | _ -> ());
180177181178 (* Track depth when inside picture *)
182179 if state.in_picture then
183180 state.picture_depth <- state.picture_depth + 1;
184181185182 (* Push to parent stack (only HTML namespace elements) *)
186186- if namespace = None then
187187- state.parent_stack <- name_lower :: state.parent_stack
183183+ (match element.tag with
184184+ | Tag.Html _ -> state.parent_stack <- name_lower :: state.parent_stack
185185+ | _ -> ())
188186189189-let end_element state ~name ~namespace collector =
190190- match namespace with
191191- | Some _ -> ()
192192- | None ->
193193- let name_lower = String.lowercase_ascii name in
194194-187187+let end_element state ~tag collector =
188188+ match tag with
189189+ | Tag.Html _ ->
190190+ let name_lower = Tag.tag_to_string tag in
195191 if state.in_picture then
196192 state.picture_depth <- state.picture_depth - 1;
197193198198- if name_lower = "picture" && state.picture_depth = 0 then begin
199199- if not state.has_img_in_picture then
200200- Message_collector.add_typed collector (`Srcset `Picture_missing_img);
201201- if state.has_source_after_img then
202202- report_disallowed_child "picture" "source" collector;
203203- if state.source_after_always_matching then
204204- Message_collector.add_typed collector
205205- (if state.always_matching_is_media_all then `Misc `Media_all
206206- else if state.always_matching_is_media_empty then `Misc `Media_empty
207207- else `Srcset `Source_needs_media_or_type);
208208- state.in_picture <- false
209209- end;
194194+ (match tag with
195195+ | Tag.Html `Picture when state.picture_depth = 0 ->
196196+ if not state.has_img_in_picture then
197197+ Message_collector.add_typed collector (`Srcset `Picture_missing_img);
198198+ if state.has_source_after_img then
199199+ report_disallowed_child "picture" "source" collector;
200200+ if state.source_after_always_matching then
201201+ Message_collector.add_typed collector
202202+ (if state.always_matching_is_media_all then `Misc `Media_all
203203+ else if state.always_matching_is_media_empty then `Misc `Media_empty
204204+ else `Srcset `Source_needs_media_or_type);
205205+ state.in_picture <- false
206206+ | _ -> ());
210207211211- state.parent_stack <- match state.parent_stack with _ :: rest -> rest | [] -> []
208208+ ignore name_lower;
209209+ state.parent_stack <- (match state.parent_stack with _ :: rest -> rest | [] -> [])
210210+ | _ -> ()
212211213212let characters state text collector =
214213 (* Text in picture element is not allowed *)
+63-71
lib/htmlrw_check/specialized/ruby_checker.ml
···2626 state.in_template <- 0
27272828(** Check if element is phrasing content that can appear before rt *)
2929-let is_phrasing_content name =
3030- let name_lower = String.lowercase_ascii name in
3131- (* rt and rp are special - they don't count as "content before rt" *)
3232- name_lower <> "rt" && name_lower <> "rp"
2929+let is_phrasing_content tag =
3030+ match tag with
3131+ | Tag.Html `Rt | Tag.Html `Rp -> false
3232+ | _ -> true
33333434-let start_element state ~name ~namespace ~attrs _collector =
3535- ignore attrs;
3636- if namespace <> None then ()
3737- else begin
3838- let name_lower = String.lowercase_ascii name in
3434+let start_element state ~element _collector =
3535+ match element.Element.tag with
3636+ | Tag.Html `Template ->
3737+ state.in_template <- state.in_template + 1
39384040- if name_lower = "template" then
4141- state.in_template <- state.in_template + 1;
3939+ | Tag.Html `Ruby when state.in_template = 0 ->
4040+ (* Push new ruby context *)
4141+ let info = {
4242+ has_rt = false;
4343+ has_content_before_rt = false;
4444+ saw_rt = false;
4545+ depth = 1; (* Set depth to 1 for the ruby element itself *)
4646+ } in
4747+ state.ruby_stack <- info :: state.ruby_stack
42484343- if state.in_template > 0 then ()
4444- else begin
4545- if name_lower = "ruby" then begin
4646- (* Push new ruby context *)
4747- let info = {
4848- has_rt = false;
4949- has_content_before_rt = false;
5050- saw_rt = false;
5151- depth = 0;
5252- } in
5353- state.ruby_stack <- info :: state.ruby_stack
4949+ | tag when state.in_template = 0 ->
5050+ (match state.ruby_stack with
5151+ | info :: _ ->
5252+ (* Inside a ruby element *)
5353+ if info.depth = 1 then begin
5454+ (* Direct children of ruby *)
5555+ match tag with
5656+ | Tag.Html `Rt ->
5757+ info.has_rt <- true;
5858+ info.saw_rt <- true
5959+ | _ when is_phrasing_content tag ->
6060+ if not info.saw_rt then
6161+ info.has_content_before_rt <- true
6262+ | _ -> ()
5463 end;
6464+ info.depth <- info.depth + 1
6565+ | [] -> ())
55665656- match state.ruby_stack with
5757- | info :: _ ->
5858- (* Inside a ruby element *)
5959- if name_lower = "ruby" then begin
6060- (* This is the opening of ruby, set depth to 1 *)
6161- info.depth <- 1
6262- end else begin
6363- if info.depth = 1 then begin
6464- (* Direct children of ruby *)
6565- if name_lower = "rt" then begin
6666- info.has_rt <- true;
6767- info.saw_rt <- true
6868- end else if is_phrasing_content name_lower then begin
6969- if not info.saw_rt then
7070- info.has_content_before_rt <- true
7171- end
7272- end;
7373- info.depth <- info.depth + 1
7474- end
7575- | [] -> ()
7676- end
7777- end
6767+ | _ -> () (* In template or non-HTML element *)
6868+6969+let end_element state ~tag collector =
7070+ match tag with
7171+ | Tag.Html `Template when state.in_template > 0 ->
7272+ state.in_template <- state.in_template - 1
78737979-let end_element state ~name ~namespace collector =
8080- if namespace <> None then ()
8181- else begin
8282- let name_lower = String.lowercase_ascii name in
7474+ | Tag.Html `Ruby when state.in_template = 0 ->
7575+ (match state.ruby_stack with
7676+ | info :: rest ->
7777+ info.depth <- info.depth - 1;
7878+ (* Check if this is the closing ruby tag (depth becomes 0 when ruby closes) *)
7979+ if info.depth <= 0 then begin
8080+ (* Closing ruby element - validate *)
8181+ if not info.has_rt then
8282+ (* Empty ruby or ruby without any rt - needs rp or rt *)
8383+ Message_collector.add_typed collector
8484+ (`Element (`Missing_child_one_of (`Parent "ruby", `Children ["rp"; "rt"])))
8585+ else if not info.has_content_before_rt then
8686+ (* Has rt but missing content before it - needs content *)
8787+ Message_collector.add_typed collector
8888+ (`Element (`Missing_child (`Parent "ruby", `Child "rt")));
8989+ state.ruby_stack <- rest
9090+ end
9191+ | [] -> ())
83928484- if name_lower = "template" && state.in_template > 0 then
8585- state.in_template <- state.in_template - 1;
9393+ | _ when state.in_template = 0 ->
9494+ (match state.ruby_stack with
9595+ | info :: _ ->
9696+ info.depth <- info.depth - 1
9797+ | [] -> ())
86988787- if state.in_template > 0 then ()
8888- else begin
8989- match state.ruby_stack with
9090- | info :: rest ->
9191- info.depth <- info.depth - 1;
9292- (* Check if this is the closing ruby tag (depth becomes 0 when ruby closes) *)
9393- if name_lower = "ruby" && info.depth <= 0 then begin
9494- (* Closing ruby element - validate *)
9595- if not info.has_rt then
9696- (* Empty ruby or ruby without any rt - needs rp or rt *)
9797- Message_collector.add_typed collector
9898- (`Element (`Missing_child_one_of (`Parent "ruby", `Children ["rp"; "rt"])))
9999- else if not info.has_content_before_rt then
100100- (* Has rt but missing content before it - needs content *)
101101- Message_collector.add_typed collector
102102- (`Element (`Missing_child (`Parent "ruby", `Child "rt")));
103103- state.ruby_stack <- rest
104104- end
105105- | [] -> ()
106106- end
107107- end
9999+ | _ -> () (* In template or non-HTML element *)
108100109101let characters state text _collector =
110102 (* Text content counts as phrasing content before rt *)
+33-44
lib/htmlrw_check/specialized/source_checker.ml
···2323 | ctx :: _ -> ctx
2424 | [] -> Other
25252626-let start_element state ~name ~namespace ~attrs collector =
2727- if namespace <> None then ()
2828- else begin
2929- let name_lower = String.lowercase_ascii name in
3030- match name_lower with
3131- | "picture" ->
3232- state.context_stack <- Picture :: state.context_stack
3333- | "video" ->
3434- state.context_stack <- Video :: state.context_stack
3535- | "audio" ->
3636- state.context_stack <- Audio :: state.context_stack
3737- | "source" ->
3838- let ctx = current_context state in
3939- begin match ctx with
4040- | Video | Audio ->
4141- if Attr_utils.has_attr "srcset" attrs then
4242- Message_collector.add_typed collector
4343- (`Attr (`Not_allowed (`Attr "srcset", `Elem "source")));
4444- if Attr_utils.has_attr "sizes" attrs then
4545- Message_collector.add_typed collector
4646- (`Attr (`Not_allowed (`Attr "sizes", `Elem "source")));
4747- if Attr_utils.has_attr "width" attrs then
4848- Message_collector.add_typed collector
4949- (`Attr (`Not_allowed (`Attr "width", `Elem "source")));
5050- if Attr_utils.has_attr "height" attrs then
5151- Message_collector.add_typed collector
5252- (`Attr (`Not_allowed (`Attr "height", `Elem "source")))
5353- | Picture | Other -> ()
5454- end
5555- | _ ->
5656- (* Any other element maintains current context *)
5757- ()
5858- end
2626+let start_element state ~element collector =
2727+ match element.Element.tag with
2828+ | Tag.Html `Picture ->
2929+ state.context_stack <- Picture :: state.context_stack
3030+ | Tag.Html `Video ->
3131+ state.context_stack <- Video :: state.context_stack
3232+ | Tag.Html `Audio ->
3333+ state.context_stack <- Audio :: state.context_stack
3434+ | Tag.Html `Source ->
3535+ let ctx = current_context state in
3636+ (match ctx with
3737+ | Video | Audio ->
3838+ if Attr_utils.has_attr "srcset" element.raw_attrs then
3939+ Message_collector.add_typed collector
4040+ (`Attr (`Not_allowed (`Attr "srcset", `Elem "source")));
4141+ if Attr_utils.has_attr "sizes" element.raw_attrs then
4242+ Message_collector.add_typed collector
4343+ (`Attr (`Not_allowed (`Attr "sizes", `Elem "source")));
4444+ if Attr_utils.has_attr "width" element.raw_attrs then
4545+ Message_collector.add_typed collector
4646+ (`Attr (`Not_allowed (`Attr "width", `Elem "source")));
4747+ if Attr_utils.has_attr "height" element.raw_attrs then
4848+ Message_collector.add_typed collector
4949+ (`Attr (`Not_allowed (`Attr "height", `Elem "source")))
5050+ | Picture | Other -> ())
5151+ | _ -> ()
59526060-let end_element state ~name ~namespace _collector =
6161- if namespace <> None then ()
6262- else begin
6363- let name_lower = String.lowercase_ascii name in
6464- match name_lower with
6565- | "picture" | "video" | "audio" ->
6666- (match state.context_stack with
6767- | _ :: rest -> state.context_stack <- rest
6868- | [] -> ())
6969- | _ -> ()
7070- end
5353+let end_element state ~tag _collector =
5454+ match tag with
5555+ | Tag.Html (`Picture | `Video | `Audio) ->
5656+ (match state.context_stack with
5757+ | _ :: rest -> state.context_stack <- rest
5858+ | [] -> ())
5959+ | _ -> ()
71607261let characters _state _text _collector = ()
7362
···960960 Message_collector.add_typed collector
961961 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9csrcset\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Mixing width and density descriptors is not allowed." value element_name))))
962962963963-let start_element _state ~name ~namespace ~attrs collector =
964964- let name_lower = String.lowercase_ascii name in
965965-966966- (* SVG image elements should not have srcset *)
967967- if namespace <> None && name_lower = "image" then begin
968968- if Attr_utils.get_attr "srcset" attrs <> None then
963963+let start_element _state ~element collector =
964964+ match element.Element.tag with
965965+ | Tag.Svg "image" ->
966966+ (* SVG image elements should not have srcset *)
967967+ if Attr_utils.get_attr "srcset" element.Element.raw_attrs <> None then
969968 Message_collector.add_typed collector
970969 (`Attr (`Not_allowed (`Attr "srcset", `Elem "image")))
971971- end;
970970+ | Tag.Html (`Img | `Source as tag) ->
971971+ let name_lower = Tag.html_tag_to_string tag in
972972+ let attrs = element.raw_attrs in
973973+ let sizes_value = Attr_utils.get_attr "sizes" attrs in
974974+ let srcset_value = Attr_utils.get_attr "srcset" attrs in
975975+ let has_sizes = sizes_value <> None in
976976+ let has_srcset = srcset_value <> None in
972977973973- if namespace <> None then ()
974974- else begin
975975- (* Check sizes and srcset on img and source *)
976976- if name_lower = "img" || name_lower = "source" then begin
977977- let sizes_value = Attr_utils.get_attr "sizes" attrs in
978978- let srcset_value = Attr_utils.get_attr "srcset" attrs in
979979- let has_sizes = sizes_value <> None in
980980- let has_srcset = srcset_value <> None in
981981-982982- (* Validate sizes if present *)
983983- (match sizes_value with
984984- | Some v -> ignore (validate_sizes v name_lower collector)
985985- | None -> ());
978978+ (* Validate sizes if present *)
979979+ (match sizes_value with
980980+ | Some v -> ignore (validate_sizes v name_lower collector)
981981+ | None -> ());
986982987987- (* Validate srcset if present *)
988988- (match srcset_value with
989989- | Some v -> validate_srcset v name_lower has_sizes collector
990990- | None -> ());
983983+ (* Validate srcset if present *)
984984+ (match srcset_value with
985985+ | Some v -> validate_srcset v name_lower has_sizes collector
986986+ | None -> ());
991987992992- (* Error: sizes without srcset on img *)
993993- if name_lower = "img" && has_sizes && not has_srcset then
994994- Message_collector.add_typed collector
995995- (`Srcset `Sizes_without_srcset)
996996- end
997997- end
988988+ (* Error: sizes without srcset on img *)
989989+ if name_lower = "img" && has_sizes && not has_srcset then
990990+ Message_collector.add_typed collector
991991+ (`Srcset `Sizes_without_srcset)
992992+ | _ -> () (* Other elements *)
998993999999-let end_element _state ~name:_ ~namespace:_ _collector = ()
994994+let end_element _state ~tag:_ _collector = ()
1000995let characters _state _text _collector = ()
1001996let end_document _state _collector = ()
1002997
+8-5
lib/htmlrw_check/specialized/svg_checker.ml
···3030 state.fecomponenttransfer_stack <- []
31313232(* SVG namespace - the DOM stores this as "svg" shorthand *)
3333-let svg_ns = "svg"
3333+let _svg_ns = "svg"
34343535(* Full SVG namespace URL for validation *)
3636let svg_ns_url = "http://www.w3.org/2000/svg"
···348348 end
349349 with Not_found -> ()
350350351351-let start_element state ~name ~namespace ~attrs collector =
352352- let is_svg_element = namespace = Some svg_ns in
351351+let start_element state ~element collector =
352352+ let is_svg_element = match element.Element.tag with Tag.Svg _ -> true | _ -> false in
353353+ let name = Tag.tag_to_string element.tag in
354354+ let attrs = element.raw_attrs in
353355354356 (* Track if we're in SVG context *)
355357 if name = "svg" && is_svg_element then
···448450 | None -> ())
449451 end
450452451451-let end_element state ~name ~namespace collector =
452452- let is_svg_element = namespace = Some svg_ns in
453453+let end_element state ~tag collector =
454454+ let is_svg_element = match tag with Tag.Svg _ -> true | _ -> false in
455455+ let name = Tag.tag_to_string tag in
453456454457 if is_svg_element || state.in_svg then begin
455458 let name_lower = String.lowercase_ascii name in
+42-41
lib/htmlrw_check/specialized/table_checker.ml
···688688689689let reset state = state.tables := []
690690691691-let is_html_namespace = function
691691+let _is_html_namespace = function
692692 | None -> true (* HTML mode - no namespace specified *)
693693 | Some ns -> ns = html_ns (* XHTML mode - check namespace *)
694694695695-let start_element state ~name ~namespace ~attrs collector =
696696- if is_html_namespace namespace then (
697697- let name_lower = String.lowercase_ascii name in
698698- match name_lower with
699699- | "table" ->
700700- (* Push a new table onto the stack *)
701701- state.tables := make_table () :: !(state.tables)
702702- | _ -> (
703703- match !(state.tables) with
704704- | [] -> ()
705705- | table :: _ -> (
706706- match name_lower with
707707- | "td" -> start_cell table false attrs collector
708708- | "th" -> start_cell table true attrs collector
709709- | "tr" -> start_row table collector
710710- | "tbody" | "thead" | "tfoot" -> start_row_group table name collector
711711- | "col" -> start_col table attrs collector
712712- | "colgroup" -> start_colgroup table attrs collector
713713- | _ -> ())))
695695+let start_element state ~element collector =
696696+ let attrs = element.Element.raw_attrs in
697697+ match element.tag with
698698+ | Tag.Html `Table ->
699699+ (* Push a new table onto the stack *)
700700+ state.tables := make_table () :: !(state.tables)
701701+ | Tag.Html tag -> (
702702+ match !(state.tables) with
703703+ | [] -> ()
704704+ | table :: _ -> (
705705+ match tag with
706706+ | `Td -> start_cell table false attrs collector
707707+ | `Th -> start_cell table true attrs collector
708708+ | `Tr -> start_row table collector
709709+ | `Tbody | `Thead | `Tfoot ->
710710+ let name = Tag.html_tag_to_string tag in
711711+ start_row_group table name collector
712712+ | `Col -> start_col table attrs collector
713713+ | `Colgroup -> start_colgroup table attrs collector
714714+ | _ -> ()))
715715+ | _ -> () (* Non-HTML elements *)
714716715715-let end_element state ~name ~namespace collector =
716716- if is_html_namespace namespace then (
717717- let name_lower = String.lowercase_ascii name in
718718- match name_lower with
719719- | "table" -> (
720720- match !(state.tables) with
721721- | [] -> () (* End tag without start - ignore *)
722722- | table :: rest ->
723723- end_table table collector;
724724- state.tables := rest)
725725- | _ -> (
726726- match !(state.tables) with
727727- | [] -> ()
728728- | table :: _ -> (
729729- match name_lower with
730730- | "td" | "th" -> end_cell table
731731- | "tr" -> end_row table collector
732732- | "tbody" | "thead" | "tfoot" -> end_row_group_handler table collector
733733- | "col" -> end_col table
734734- | "colgroup" -> end_colgroup table
735735- | _ -> ())))
717717+let end_element state ~tag collector =
718718+ match tag with
719719+ | Tag.Html `Table -> (
720720+ match !(state.tables) with
721721+ | [] -> () (* End tag without start - ignore *)
722722+ | table :: rest ->
723723+ end_table table collector;
724724+ state.tables := rest)
725725+ | Tag.Html html_tag -> (
726726+ match !(state.tables) with
727727+ | [] -> ()
728728+ | table :: _ -> (
729729+ match html_tag with
730730+ | `Td | `Th -> end_cell table
731731+ | `Tr -> end_row table collector
732732+ | `Tbody | `Thead | `Tfoot -> end_row_group_handler table collector
733733+ | `Col -> end_col table
734734+ | `Colgroup -> end_colgroup table
735735+ | _ -> ()))
736736+ | _ -> () (* Non-HTML elements *)
736737737738let characters _state _text _collector = ()
738739
+26-43
lib/htmlrw_check/specialized/title_checker.ml
···2626 state.title_depth <- 0;
2727 state.is_iframe_srcdoc <- false
28282929-let start_element state ~name ~namespace ~attrs collector =
3030- ignore (collector, attrs);
3131- if namespace <> None then ()
3232- else begin
3333- let name_lower = String.lowercase_ascii name in
3434- match name_lower with
3535- | "html" ->
3636- (* Check if this is an iframe srcdoc - title is not required *)
3737- (* We detect this by checking for srcdoc context - not directly checkable from HTML,
3838- but we can assume normal HTML document for now *)
3939- ()
4040- | "head" ->
4141- state.in_head <- true
4242- | "title" when state.in_head ->
4343- state.has_title <- true;
4444- state.in_title <- true;
4545- state.title_has_content <- false;
4646- state.title_depth <- 0
4747- | _ -> ()
4848- end;
2929+let start_element state ~element _collector =
3030+ (match element.Element.tag with
3131+ | Tag.Html `Html -> ()
3232+ | Tag.Html `Head ->
3333+ state.in_head <- true
3434+ | Tag.Html `Title when state.in_head ->
3535+ state.has_title <- true;
3636+ state.in_title <- true;
3737+ state.title_has_content <- false;
3838+ state.title_depth <- 0
3939+ | _ -> ());
4940 if state.in_title then
5041 state.title_depth <- state.title_depth + 1
51425252-let end_element state ~name ~namespace collector =
5353- if namespace <> None then ()
5454- else begin
5555- let name_lower = String.lowercase_ascii name in
5656-5757- if state.in_title then
5858- state.title_depth <- state.title_depth - 1;
5959-6060- match name_lower with
6161- | "title" when state.in_title && state.title_depth = 0 ->
6262- (* Check if title was empty *)
6363- if not state.title_has_content then
6464- Message_collector.add_typed collector
6565- (`Element (`Must_not_be_empty (`Elem "title")));
6666- state.in_title <- false
6767- | "head" ->
6868- (* Check if head had a title element *)
6969- if state.in_head && not state.has_title then
7070- Message_collector.add_typed collector
7171- (`Element (`Missing_child (`Parent "head", `Child "title")));
7272- state.in_head <- false
7373- | _ -> ()
7474- end
4343+let end_element state ~tag collector =
4444+ if state.in_title then
4545+ state.title_depth <- state.title_depth - 1;
4646+ match tag with
4747+ | Tag.Html `Title when state.in_title && state.title_depth = 0 ->
4848+ if not state.title_has_content then
4949+ Message_collector.add_typed collector
5050+ (`Element (`Must_not_be_empty (`Elem "title")));
5151+ state.in_title <- false
5252+ | Tag.Html `Head ->
5353+ if state.in_head && not state.has_title then
5454+ Message_collector.add_typed collector
5555+ (`Element (`Missing_child (`Parent "head", `Child "title")));
5656+ state.in_head <- false
5757+ | _ -> ()
75587659let characters state text _collector =
7760 if state.in_title then begin
···6767let reset state =
6868 state.stack <- []
69697070-let start_element state ~name ~namespace ~attrs:_ collector =
7171- (* Only check HTML namespace elements *)
7272- match namespace with
7373- | Some _ -> () (* Skip SVG, MathML, etc. *)
7474- | None ->
7575- let name_lower = String.lowercase_ascii name in
7070+let start_element state ~element collector =
7171+ match element.Element.tag with
7272+ | Tag.Unknown name ->
7373+ (* Get the parent element name *)
7474+ let parent = match state.stack with
7575+ | p :: _ -> p
7676+ | [] -> "document"
7777+ in
7878+ (* Produce error: unknown element not allowed as child *)
7979+ Message_collector.add_typed collector
8080+ (`Element (`Not_allowed_as_child (`Child name, `Parent parent)));
8181+ (* Push to stack for tracking *)
8282+ state.stack <- name :: state.stack
76837777- (* Check if element is unknown *)
7878- if not (is_known_element name_lower) then begin
7979- (* Get the parent element name *)
8080- let parent = match state.stack with
8181- | p :: _ -> p
8282- | [] -> "document"
8383- in
8484- (* Produce error: unknown element not allowed as child *)
8585- Message_collector.add_typed collector
8686- (`Element (`Not_allowed_as_child (`Child name, `Parent parent)))
8787- end;
8484+ | Tag.Html tag ->
8585+ let name_lower = String.lowercase_ascii (Tag.tag_to_string (Tag.Html tag)) in
8686+ state.stack <- name_lower :: state.stack
88878989- (* Always push to stack for tracking *)
9090- state.stack <- name_lower :: state.stack
8888+ | _ -> () (* SVG, MathML, Custom elements are allowed *)
91899292-let end_element state ~name:_ ~namespace _ =
9393- match namespace with
9494- | Some _ -> ()
9595- | None ->
9696- match state.stack with
9797- | _ :: rest -> state.stack <- rest
9898- | [] -> () (* Stack underflow - shouldn't happen *)
9090+let end_element state ~tag _ =
9191+ match tag with
9292+ | Tag.Html _ | Tag.Unknown _ ->
9393+ (match state.stack with
9494+ | _ :: rest -> state.stack <- rest
9595+ | [] -> ()) (* Stack underflow - shouldn't happen *)
9696+ | _ -> () (* SVG, MathML, Custom elements *)
999710098let characters _state _text _collector = ()
10199
+7-5
lib/htmlrw_check/specialized/url_checker.ml
···741741 if String.lowercase_ascii k = String.lowercase_ascii name then Some v else None
742742 ) attrs
743743744744-let start_element _state ~name ~namespace ~attrs collector =
745745- if namespace <> None then ()
746746- else begin
744744+let start_element _state ~element collector =
745745+ match element.Element.tag with
746746+ | Tag.Html _ ->
747747+ let name = Tag.tag_to_string element.tag in
747748 let name_lower = String.lowercase_ascii name in
749749+ let attrs = element.raw_attrs in
748750 (* Check URL attributes for elements that have them *)
749751 (match List.assoc_opt name_lower url_attributes with
750752 | None -> ()
···808810 | Some warn_msg -> Message_collector.add_typed collector (`Generic warn_msg)
809811 | None -> ())
810812 | _ -> ())
811811- end
813813+ | _ -> () (* Non-HTML elements *)
812814813813-let end_element _state ~name:_ ~namespace:_ _collector = ()
815815+let end_element _state ~tag:_ _collector = ()
814816let characters _state _text _collector = ()
815817let end_document _state _collector = ()
816818
···5252 Message_collector.add_typed collector (`Attr `Data_uppercase)
5353 ) attrs
54545555-let start_element state ~name ~namespace ~attrs collector =
5656- ignore namespace;
5555+let start_element state ~element collector =
5656+ let name = Tag.tag_to_string element.Element.tag in
5757 let name_lower = String.lowercase_ascii name in
5858+ let attrs = element.raw_attrs in
58595960 (* Check data-* attributes for uppercase *)
6061 check_data_attr_case attrs collector;
···9798 (* Push onto stack *)
9899 state.element_stack <- name :: state.element_stack
99100100100-let end_element state ~name ~namespace:_ _collector =
101101- let name_lower = String.lowercase_ascii name in
101101+let end_element state ~tag _collector =
102102+ let name_lower = String.lowercase_ascii (Tag.tag_to_string tag) in
102103 (* Pop figure state if leaving a figure *)
103104 if name_lower = "figure" then begin
104105 match state.figure_stack with