OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** XHTML content model checker.
2
3 Validates specific content model rules that the Nu validator checks,
4 particularly for elements that don't allow text content or specific children. *)
5
6type figure_state = {
7 mutable has_content_before_figcaption : bool;
8 mutable has_figcaption : bool;
9 mutable figcaption_at_start : bool; (* true if figcaption came first *)
10}
11
12type state = {
13 mutable element_stack : string list;
14 mutable figure_stack : figure_state list; (* Stack to handle nested figures *)
15}
16
17let create () = { element_stack = []; figure_stack = [] }
18
19let reset state =
20 state.element_stack <- [];
21 state.figure_stack <- []
22
23(* Elements that don't allow direct text content (only specific child elements) *)
24let no_text_elements = [
25 "menu"; (* Only li elements *)
26 "iframe"; (* In XHTML mode, no content allowed *)
27 (* Note: figure handled separately due to complex content model with figcaption *)
28]
29
30
31(* Check if an element is allowed as child of parent *)
32let is_child_allowed ~parent ~child =
33 match parent with
34 | "menu" ->
35 (* menu only allows li, script, template *)
36 List.mem child ["li"; "script"; "template"]
37 | "table" ->
38 (* col must be in colgroup, not directly in table *)
39 child <> "col"
40 | _ -> true
41
42(* Check if text is allowed in element *)
43let is_text_allowed element =
44 not (List.mem element no_text_elements)
45
46(* Check if data-* attribute has uppercase characters *)
47let check_data_attr_case attrs collector =
48 List.iter (fun (attr_name, _) ->
49 if String.starts_with ~prefix:"data-" attr_name then
50 let suffix = String.sub attr_name 5 (String.length attr_name - 5) in
51 if String.exists (fun c -> c >= 'A' && c <= 'Z') suffix then
52 Message_collector.add_typed collector (`Attr `Data_uppercase)
53 ) attrs
54
55let start_element state ~element collector =
56 let name = Tag.tag_to_string element.Element.tag in
57 let name_lower = Astring.String.Ascii.lowercase name in
58 let attrs = element.raw_attrs in
59
60 (* Check data-* attributes for uppercase *)
61 check_data_attr_case attrs collector;
62
63 (* Check if this element is allowed as child of parent *)
64 (match state.element_stack with
65 | parent :: _ ->
66 let parent_lower = Astring.String.Ascii.lowercase parent in
67 if not (is_child_allowed ~parent:parent_lower ~child:name_lower) then
68 Message_collector.add_typed collector
69 (`Element (`Not_allowed_as_child (`Child name_lower, `Parent parent_lower)))
70 | [] -> ());
71
72 (* Handle figure content model *)
73 (match state.element_stack with
74 | parent :: _ when Astring.String.Ascii.lowercase parent = "figure" ->
75 (* We're inside a figure, check content model *)
76 (match state.figure_stack with
77 | fig :: _ ->
78 if name_lower = "figcaption" then begin
79 (* figcaption appearing *)
80 if not fig.has_content_before_figcaption then
81 fig.figcaption_at_start <- true;
82 fig.has_figcaption <- true
83 end else begin
84 (* Flow content appearing in figure *)
85 if fig.has_figcaption && not fig.figcaption_at_start then
86 Message_collector.add_typed collector
87 (`Element (`Not_allowed_as_child (`Child name_lower, `Parent "figure")))
88 else if not fig.has_figcaption then
89 fig.has_content_before_figcaption <- true
90 end
91 | [] -> ())
92 | _ -> ());
93
94 (* If entering a figure, push new figure state *)
95 if name_lower = "figure" then
96 state.figure_stack <- { has_content_before_figcaption = false; has_figcaption = false; figcaption_at_start = false } :: state.figure_stack;
97
98 (* Push onto stack *)
99 state.element_stack <- name :: state.element_stack
100
101let end_element state ~tag _collector =
102 let name_lower = Astring.String.Ascii.lowercase (Tag.tag_to_string tag) in
103 (* Pop figure state if leaving a figure *)
104 if name_lower = "figure" then begin
105 match state.figure_stack with
106 | _ :: rest -> state.figure_stack <- rest
107 | [] -> ()
108 end;
109 (* Pop from element stack *)
110 match state.element_stack with
111 | _ :: rest -> state.element_stack <- rest
112 | [] -> ()
113
114let characters state text collector =
115 match state.element_stack with
116 | [] -> ()
117 | parent :: _ ->
118 let parent_lower = Astring.String.Ascii.lowercase parent in
119 let trimmed = String.trim text in
120 if trimmed <> "" then begin
121 if parent_lower = "figure" then begin
122 match state.figure_stack with
123 | fig :: _ ->
124 if fig.has_figcaption && not fig.figcaption_at_start then
125 Message_collector.add_typed collector
126 (`Element (`Text_not_allowed (`Parent "figure")))
127 else if not fig.has_figcaption then
128 fig.has_content_before_figcaption <- true
129 | [] -> ()
130 end
131 else if not (is_text_allowed parent_lower) then
132 Message_collector.add_typed collector
133 (`Element (`Text_not_allowed (`Parent parent_lower)))
134 end
135
136let checker = Checker.make ~create ~reset ~start_element ~end_element
137 ~characters ()