OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 137 lines 5.1 kB view raw
1(** XHTML content model checker. 2 3 Validates specific content model rules that the Nu validator checks, 4 particularly for elements that don't allow text content or specific children. *) 5 6type figure_state = { 7 mutable has_content_before_figcaption : bool; 8 mutable has_figcaption : bool; 9 mutable figcaption_at_start : bool; (* true if figcaption came first *) 10} 11 12type state = { 13 mutable element_stack : string list; 14 mutable figure_stack : figure_state list; (* Stack to handle nested figures *) 15} 16 17let create () = { element_stack = []; figure_stack = [] } 18 19let reset state = 20 state.element_stack <- []; 21 state.figure_stack <- [] 22 23(* Elements that don't allow direct text content (only specific child elements) *) 24let no_text_elements = [ 25 "menu"; (* Only li elements *) 26 "iframe"; (* In XHTML mode, no content allowed *) 27 (* Note: figure handled separately due to complex content model with figcaption *) 28] 29 30 31(* Check if an element is allowed as child of parent *) 32let is_child_allowed ~parent ~child = 33 match parent with 34 | "menu" -> 35 (* menu only allows li, script, template *) 36 List.mem child ["li"; "script"; "template"] 37 | "table" -> 38 (* col must be in colgroup, not directly in table *) 39 child <> "col" 40 | _ -> true 41 42(* Check if text is allowed in element *) 43let is_text_allowed element = 44 not (List.mem element no_text_elements) 45 46(* Check if data-* attribute has uppercase characters *) 47let check_data_attr_case attrs collector = 48 List.iter (fun (attr_name, _) -> 49 if String.starts_with ~prefix:"data-" attr_name then 50 let suffix = String.sub attr_name 5 (String.length attr_name - 5) in 51 if String.exists (fun c -> c >= 'A' && c <= 'Z') suffix then 52 Message_collector.add_typed collector (`Attr `Data_uppercase) 53 ) attrs 54 55let start_element state ~element collector = 56 let name = Tag.tag_to_string element.Element.tag in 57 let name_lower = Astring.String.Ascii.lowercase name in 58 let attrs = element.raw_attrs in 59 60 (* Check data-* attributes for uppercase *) 61 check_data_attr_case attrs collector; 62 63 (* Check if this element is allowed as child of parent *) 64 (match state.element_stack with 65 | parent :: _ -> 66 let parent_lower = Astring.String.Ascii.lowercase parent in 67 if not (is_child_allowed ~parent:parent_lower ~child:name_lower) then 68 Message_collector.add_typed collector 69 (`Element (`Not_allowed_as_child (`Child name_lower, `Parent parent_lower))) 70 | [] -> ()); 71 72 (* Handle figure content model *) 73 (match state.element_stack with 74 | parent :: _ when Astring.String.Ascii.lowercase parent = "figure" -> 75 (* We're inside a figure, check content model *) 76 (match state.figure_stack with 77 | fig :: _ -> 78 if name_lower = "figcaption" then begin 79 (* figcaption appearing *) 80 if not fig.has_content_before_figcaption then 81 fig.figcaption_at_start <- true; 82 fig.has_figcaption <- true 83 end else begin 84 (* Flow content appearing in figure *) 85 if fig.has_figcaption && not fig.figcaption_at_start then 86 Message_collector.add_typed collector 87 (`Element (`Not_allowed_as_child (`Child name_lower, `Parent "figure"))) 88 else if not fig.has_figcaption then 89 fig.has_content_before_figcaption <- true 90 end 91 | [] -> ()) 92 | _ -> ()); 93 94 (* If entering a figure, push new figure state *) 95 if name_lower = "figure" then 96 state.figure_stack <- { has_content_before_figcaption = false; has_figcaption = false; figcaption_at_start = false } :: state.figure_stack; 97 98 (* Push onto stack *) 99 state.element_stack <- name :: state.element_stack 100 101let end_element state ~tag _collector = 102 let name_lower = Astring.String.Ascii.lowercase (Tag.tag_to_string tag) in 103 (* Pop figure state if leaving a figure *) 104 if name_lower = "figure" then begin 105 match state.figure_stack with 106 | _ :: rest -> state.figure_stack <- rest 107 | [] -> () 108 end; 109 (* Pop from element stack *) 110 match state.element_stack with 111 | _ :: rest -> state.element_stack <- rest 112 | [] -> () 113 114let characters state text collector = 115 match state.element_stack with 116 | [] -> () 117 | parent :: _ -> 118 let parent_lower = Astring.String.Ascii.lowercase parent in 119 let trimmed = String.trim text in 120 if trimmed <> "" then begin 121 if parent_lower = "figure" then begin 122 match state.figure_stack with 123 | fig :: _ -> 124 if fig.has_figcaption && not fig.figcaption_at_start then 125 Message_collector.add_typed collector 126 (`Element (`Text_not_allowed (`Parent "figure"))) 127 else if not fig.has_figcaption then 128 fig.has_content_before_figcaption <- true 129 | [] -> () 130 end 131 else if not (is_text_allowed parent_lower) then 132 Message_collector.add_typed collector 133 (`Element (`Text_not_allowed (`Parent parent_lower))) 134 end 135 136let checker = Checker.make ~create ~reset ~start_element ~end_element 137 ~characters ()