OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* CSS selector matching *)
2
3module Dom = Dom
4open Selector_ast
5
6(* Use Astring for string operations *)
7let lowercase = Astring.String.Ascii.lowercase
8let trim = Astring.String.trim
9let find_sub = Astring.String.find_sub
10let fields = Astring.String.fields
11
12(* Check if haystack contains needle as a substring *)
13let string_contains ~haystack ~needle =
14 Option.is_some (find_sub ~sub:needle haystack)
15
16let is_element node =
17 let name = node.Dom.name in
18 name <> "#text" && name <> "#comment" && name <> "#document" &&
19 name <> "#document-fragment" && name <> "!doctype"
20
21let get_element_children node =
22 List.filter is_element node.Dom.children
23
24let get_previous_sibling node =
25 match node.Dom.parent with
26 | None -> None
27 | Some parent ->
28 let rec find_prev prev = function
29 | [] -> None
30 | n :: rest ->
31 if n == node then prev
32 else if is_element n then find_prev (Some n) rest
33 else find_prev prev rest
34 in
35 find_prev None parent.Dom.children
36
37let is_first_child node =
38 match node.Dom.parent with
39 | None -> false
40 | Some parent ->
41 match get_element_children parent with
42 | first :: _ -> first == node
43 | [] -> false
44
45let is_last_child node =
46 match node.Dom.parent with
47 | None -> false
48 | Some parent ->
49 match List.rev (get_element_children parent) with
50 | last :: _ -> last == node
51 | [] -> false
52
53let is_first_of_type node =
54 match node.Dom.parent with
55 | None -> false
56 | Some parent ->
57 let name = lowercase node.Dom.name in
58 let rec find = function
59 | [] -> false
60 | n :: _ when lowercase n.Dom.name = name -> n == node
61 | _ :: rest -> find rest
62 in
63 find (get_element_children parent)
64
65let is_last_of_type node =
66 match node.Dom.parent with
67 | None -> false
68 | Some parent ->
69 let name = lowercase node.Dom.name in
70 let rec find last = function
71 | [] -> (match last with Some l -> l == node | None -> false)
72 | n :: rest when lowercase n.Dom.name = name -> find (Some n) rest
73 | _ :: rest -> find last rest
74 in
75 find None (get_element_children parent)
76
77let get_index node =
78 match node.Dom.parent with
79 | None -> 0
80 | Some parent ->
81 let children = get_element_children parent in
82 let rec find idx = function
83 | [] -> 0
84 | n :: _ when n == node -> idx
85 | _ :: rest -> find (idx + 1) rest
86 in
87 find 1 children
88
89let get_type_index node =
90 match node.Dom.parent with
91 | None -> 0
92 | Some parent ->
93 let name = lowercase node.Dom.name in
94 let children = get_element_children parent in
95 let rec find idx = function
96 | [] -> 0
97 | n :: _ when n == node -> idx
98 | n :: rest when lowercase n.Dom.name = name -> find (idx + 1) rest
99 | _ :: rest -> find idx rest
100 in
101 find 1 children
102
103(* Parse nth expression: "odd", "even", "3", "2n+1", etc *)
104let parse_nth expr =
105 let expr = lowercase (trim expr) in
106 if expr = "odd" then Some (2, 1)
107 else if expr = "even" then Some (2, 0)
108 else
109 let expr = String.concat "" (fields ~is_sep:(fun c -> c = ' ') expr) in
110 if String.contains expr 'n' then
111 let parts = String.split_on_char 'n' expr in
112 match parts with
113 | [a_part; b_part] ->
114 let a =
115 if a_part = "" || a_part = "+" then 1
116 else if a_part = "-" then -1
117 else int_of_string_opt a_part |> Option.value ~default:0
118 in
119 let b =
120 if b_part = "" then 0
121 else int_of_string_opt b_part |> Option.value ~default:0
122 in
123 Some (a, b)
124 | _ -> None
125 else
126 match int_of_string_opt expr with
127 | Some n -> Some (0, n)
128 | None -> None
129
130let matches_nth index a b =
131 if a = 0 then index = b
132 else
133 let diff = index - b in
134 if a > 0 then diff >= 0 && diff mod a = 0
135 else diff <= 0 && diff mod a = 0
136
137let rec matches_simple node selector =
138 if not (is_element node) then false
139 else
140 match selector.selector_type with
141 | Type_universal -> true
142 | Type_tag ->
143 (match selector.name with
144 | Some name -> lowercase node.Dom.name = lowercase name
145 | None -> false)
146 | Type_id ->
147 (match selector.name with
148 | Some id ->
149 (match Dom.get_attr node "id" with
150 | Some node_id -> node_id = id
151 | None -> false)
152 | None -> false)
153 | Type_class ->
154 (match selector.name with
155 | Some cls ->
156 (match Dom.get_attr node "class" with
157 | Some class_attr ->
158 let classes = fields ~is_sep:(fun c -> c = ' ') class_attr in
159 List.mem cls classes
160 | None -> false)
161 | None -> false)
162 | Type_attr ->
163 (match selector.name with
164 | Some attr_name ->
165 let attr_name_lower = lowercase attr_name in
166 let node_value =
167 List.find_map (fun (k, v) ->
168 if lowercase k = attr_name_lower then Some v
169 else None
170 ) node.Dom.attrs
171 in
172 (match node_value with
173 | None -> false
174 | Some _ when selector.operator = None -> true
175 | Some attr_value ->
176 let value = Option.value selector.value ~default:"" in
177 (match selector.operator with
178 | Some "=" -> attr_value = value
179 | Some "~=" ->
180 let words = fields ~is_sep:(fun c -> c = ' ') attr_value in
181 List.mem value words
182 | Some "|=" ->
183 attr_value = value || String.length attr_value > String.length value &&
184 String.sub attr_value 0 (String.length value) = value &&
185 attr_value.[String.length value] = '-'
186 | Some "^=" -> value <> "" && String.length attr_value >= String.length value &&
187 String.sub attr_value 0 (String.length value) = value
188 | Some "$=" -> value <> "" && String.length attr_value >= String.length value &&
189 String.sub attr_value (String.length attr_value - String.length value) (String.length value) = value
190 | Some "*=" -> value <> "" && string_contains ~haystack:attr_value ~needle:value
191 | Some _ | None -> false))
192 | None -> false)
193 | Type_pseudo ->
194 (match selector.name with
195 | Some "first-child" -> is_first_child node
196 | Some "last-child" -> is_last_child node
197 | Some "first-of-type" -> is_first_of_type node
198 | Some "last-of-type" -> is_last_of_type node
199 | Some "only-child" -> is_first_child node && is_last_child node
200 | Some "only-of-type" -> is_first_of_type node && is_last_of_type node
201 | Some "empty" ->
202 not (List.exists (fun c ->
203 is_element c || (c.Dom.name = "#text" && trim c.Dom.data <> "")
204 ) node.Dom.children)
205 | Some "root" ->
206 (match node.Dom.parent with
207 | Some p -> p.Dom.name = "#document" || p.Dom.name = "#document-fragment"
208 | None -> false)
209 | Some "nth-child" ->
210 (match selector.arg with
211 | Some arg ->
212 (match parse_nth arg with
213 | Some (a, b) -> matches_nth (get_index node) a b
214 | None -> false)
215 | None -> false)
216 | Some "nth-of-type" ->
217 (match selector.arg with
218 | Some arg ->
219 (match parse_nth arg with
220 | Some (a, b) -> matches_nth (get_type_index node) a b
221 | None -> false)
222 | None -> false)
223 | Some "not" ->
224 (match selector.arg with
225 | Some arg ->
226 (try
227 let inner = Selector_parser.parse_selector arg in
228 not (matches_selector node inner)
229 with _ -> true)
230 | None -> true)
231 | _ -> false)
232
233and matches_compound node (compound : Selector_ast.compound_selector) =
234 List.for_all (matches_simple node) compound.selectors
235
236and matches_complex node complex =
237 (* Match from right to left *)
238 let parts = List.rev complex.parts in
239 match parts with
240 | [] -> false
241 | (_, rightmost) :: rest ->
242 if not (matches_compound node rightmost) then false
243 else
244 let rec check current remaining =
245 match remaining with
246 | [] -> true
247 | (Some " ", compound) :: rest ->
248 (* Descendant combinator *)
249 let rec find_ancestor n =
250 match n.Dom.parent with
251 | None -> false
252 | Some p ->
253 if matches_compound p compound then check p rest
254 else find_ancestor p
255 in
256 find_ancestor current
257 | (Some ">", compound) :: rest ->
258 (* Child combinator *)
259 (match current.Dom.parent with
260 | None -> false
261 | Some p ->
262 if matches_compound p compound then check p rest
263 else false)
264 | (Some "+", compound) :: rest ->
265 (* Adjacent sibling *)
266 (match get_previous_sibling current with
267 | None -> false
268 | Some sib ->
269 if matches_compound sib compound then check sib rest
270 else false)
271 | (Some "~", compound) :: rest ->
272 (* General sibling *)
273 let rec find_sibling n =
274 match get_previous_sibling n with
275 | None -> false
276 | Some sib ->
277 if matches_compound sib compound then check sib rest
278 else find_sibling sib
279 in
280 find_sibling current
281 | (None, compound) :: rest ->
282 if matches_compound current compound then check current rest
283 else false
284 | _ -> false
285 in
286 check node rest
287
288and matches_selector node selector =
289 match selector with
290 | Simple s -> matches_simple node s
291 | Compound c -> matches_compound node c
292 | Complex c -> matches_complex node c
293 | List l -> List.exists (fun c -> matches_complex node c) l.selectors
294
295let matches node selector_string =
296 try
297 let selector = Selector_parser.parse_selector selector_string in
298 matches_selector node selector
299 with _ -> false
300
301let rec query_descendants node selector results =
302 List.iter (fun child ->
303 if is_element child && matches_selector child selector then
304 results := child :: !results;
305 query_descendants child selector results;
306 (* Also search template content *)
307 (match child.Dom.template_content with
308 | Some tc -> query_descendants tc selector results
309 | None -> ())
310 ) node.Dom.children
311
312let query root selector_string =
313 try
314 let selector = Selector_parser.parse_selector selector_string in
315 let results = ref [] in
316 query_descendants root selector results;
317 List.rev !results
318 with _ -> []