OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 318 lines 11 kB view raw
1(* CSS selector matching *) 2 3module Dom = Dom 4open Selector_ast 5 6(* Use Astring for string operations *) 7let lowercase = Astring.String.Ascii.lowercase 8let trim = Astring.String.trim 9let find_sub = Astring.String.find_sub 10let fields = Astring.String.fields 11 12(* Check if haystack contains needle as a substring *) 13let string_contains ~haystack ~needle = 14 Option.is_some (find_sub ~sub:needle haystack) 15 16let is_element node = 17 let name = node.Dom.name in 18 name <> "#text" && name <> "#comment" && name <> "#document" && 19 name <> "#document-fragment" && name <> "!doctype" 20 21let get_element_children node = 22 List.filter is_element node.Dom.children 23 24let get_previous_sibling node = 25 match node.Dom.parent with 26 | None -> None 27 | Some parent -> 28 let rec find_prev prev = function 29 | [] -> None 30 | n :: rest -> 31 if n == node then prev 32 else if is_element n then find_prev (Some n) rest 33 else find_prev prev rest 34 in 35 find_prev None parent.Dom.children 36 37let is_first_child node = 38 match node.Dom.parent with 39 | None -> false 40 | Some parent -> 41 match get_element_children parent with 42 | first :: _ -> first == node 43 | [] -> false 44 45let is_last_child node = 46 match node.Dom.parent with 47 | None -> false 48 | Some parent -> 49 match List.rev (get_element_children parent) with 50 | last :: _ -> last == node 51 | [] -> false 52 53let is_first_of_type node = 54 match node.Dom.parent with 55 | None -> false 56 | Some parent -> 57 let name = lowercase node.Dom.name in 58 let rec find = function 59 | [] -> false 60 | n :: _ when lowercase n.Dom.name = name -> n == node 61 | _ :: rest -> find rest 62 in 63 find (get_element_children parent) 64 65let is_last_of_type node = 66 match node.Dom.parent with 67 | None -> false 68 | Some parent -> 69 let name = lowercase node.Dom.name in 70 let rec find last = function 71 | [] -> (match last with Some l -> l == node | None -> false) 72 | n :: rest when lowercase n.Dom.name = name -> find (Some n) rest 73 | _ :: rest -> find last rest 74 in 75 find None (get_element_children parent) 76 77let get_index node = 78 match node.Dom.parent with 79 | None -> 0 80 | Some parent -> 81 let children = get_element_children parent in 82 let rec find idx = function 83 | [] -> 0 84 | n :: _ when n == node -> idx 85 | _ :: rest -> find (idx + 1) rest 86 in 87 find 1 children 88 89let get_type_index node = 90 match node.Dom.parent with 91 | None -> 0 92 | Some parent -> 93 let name = lowercase node.Dom.name in 94 let children = get_element_children parent in 95 let rec find idx = function 96 | [] -> 0 97 | n :: _ when n == node -> idx 98 | n :: rest when lowercase n.Dom.name = name -> find (idx + 1) rest 99 | _ :: rest -> find idx rest 100 in 101 find 1 children 102 103(* Parse nth expression: "odd", "even", "3", "2n+1", etc *) 104let parse_nth expr = 105 let expr = lowercase (trim expr) in 106 if expr = "odd" then Some (2, 1) 107 else if expr = "even" then Some (2, 0) 108 else 109 let expr = String.concat "" (fields ~is_sep:(fun c -> c = ' ') expr) in 110 if String.contains expr 'n' then 111 let parts = String.split_on_char 'n' expr in 112 match parts with 113 | [a_part; b_part] -> 114 let a = 115 if a_part = "" || a_part = "+" then 1 116 else if a_part = "-" then -1 117 else int_of_string_opt a_part |> Option.value ~default:0 118 in 119 let b = 120 if b_part = "" then 0 121 else int_of_string_opt b_part |> Option.value ~default:0 122 in 123 Some (a, b) 124 | _ -> None 125 else 126 match int_of_string_opt expr with 127 | Some n -> Some (0, n) 128 | None -> None 129 130let matches_nth index a b = 131 if a = 0 then index = b 132 else 133 let diff = index - b in 134 if a > 0 then diff >= 0 && diff mod a = 0 135 else diff <= 0 && diff mod a = 0 136 137let rec matches_simple node selector = 138 if not (is_element node) then false 139 else 140 match selector.selector_type with 141 | Type_universal -> true 142 | Type_tag -> 143 (match selector.name with 144 | Some name -> lowercase node.Dom.name = lowercase name 145 | None -> false) 146 | Type_id -> 147 (match selector.name with 148 | Some id -> 149 (match Dom.get_attr node "id" with 150 | Some node_id -> node_id = id 151 | None -> false) 152 | None -> false) 153 | Type_class -> 154 (match selector.name with 155 | Some cls -> 156 (match Dom.get_attr node "class" with 157 | Some class_attr -> 158 let classes = fields ~is_sep:(fun c -> c = ' ') class_attr in 159 List.mem cls classes 160 | None -> false) 161 | None -> false) 162 | Type_attr -> 163 (match selector.name with 164 | Some attr_name -> 165 let attr_name_lower = lowercase attr_name in 166 let node_value = 167 List.find_map (fun (k, v) -> 168 if lowercase k = attr_name_lower then Some v 169 else None 170 ) node.Dom.attrs 171 in 172 (match node_value with 173 | None -> false 174 | Some _ when selector.operator = None -> true 175 | Some attr_value -> 176 let value = Option.value selector.value ~default:"" in 177 (match selector.operator with 178 | Some "=" -> attr_value = value 179 | Some "~=" -> 180 let words = fields ~is_sep:(fun c -> c = ' ') attr_value in 181 List.mem value words 182 | Some "|=" -> 183 attr_value = value || String.length attr_value > String.length value && 184 String.sub attr_value 0 (String.length value) = value && 185 attr_value.[String.length value] = '-' 186 | Some "^=" -> value <> "" && String.length attr_value >= String.length value && 187 String.sub attr_value 0 (String.length value) = value 188 | Some "$=" -> value <> "" && String.length attr_value >= String.length value && 189 String.sub attr_value (String.length attr_value - String.length value) (String.length value) = value 190 | Some "*=" -> value <> "" && string_contains ~haystack:attr_value ~needle:value 191 | Some _ | None -> false)) 192 | None -> false) 193 | Type_pseudo -> 194 (match selector.name with 195 | Some "first-child" -> is_first_child node 196 | Some "last-child" -> is_last_child node 197 | Some "first-of-type" -> is_first_of_type node 198 | Some "last-of-type" -> is_last_of_type node 199 | Some "only-child" -> is_first_child node && is_last_child node 200 | Some "only-of-type" -> is_first_of_type node && is_last_of_type node 201 | Some "empty" -> 202 not (List.exists (fun c -> 203 is_element c || (c.Dom.name = "#text" && trim c.Dom.data <> "") 204 ) node.Dom.children) 205 | Some "root" -> 206 (match node.Dom.parent with 207 | Some p -> p.Dom.name = "#document" || p.Dom.name = "#document-fragment" 208 | None -> false) 209 | Some "nth-child" -> 210 (match selector.arg with 211 | Some arg -> 212 (match parse_nth arg with 213 | Some (a, b) -> matches_nth (get_index node) a b 214 | None -> false) 215 | None -> false) 216 | Some "nth-of-type" -> 217 (match selector.arg with 218 | Some arg -> 219 (match parse_nth arg with 220 | Some (a, b) -> matches_nth (get_type_index node) a b 221 | None -> false) 222 | None -> false) 223 | Some "not" -> 224 (match selector.arg with 225 | Some arg -> 226 (try 227 let inner = Selector_parser.parse_selector arg in 228 not (matches_selector node inner) 229 with _ -> true) 230 | None -> true) 231 | _ -> false) 232 233and matches_compound node (compound : Selector_ast.compound_selector) = 234 List.for_all (matches_simple node) compound.selectors 235 236and matches_complex node complex = 237 (* Match from right to left *) 238 let parts = List.rev complex.parts in 239 match parts with 240 | [] -> false 241 | (_, rightmost) :: rest -> 242 if not (matches_compound node rightmost) then false 243 else 244 let rec check current remaining = 245 match remaining with 246 | [] -> true 247 | (Some " ", compound) :: rest -> 248 (* Descendant combinator *) 249 let rec find_ancestor n = 250 match n.Dom.parent with 251 | None -> false 252 | Some p -> 253 if matches_compound p compound then check p rest 254 else find_ancestor p 255 in 256 find_ancestor current 257 | (Some ">", compound) :: rest -> 258 (* Child combinator *) 259 (match current.Dom.parent with 260 | None -> false 261 | Some p -> 262 if matches_compound p compound then check p rest 263 else false) 264 | (Some "+", compound) :: rest -> 265 (* Adjacent sibling *) 266 (match get_previous_sibling current with 267 | None -> false 268 | Some sib -> 269 if matches_compound sib compound then check sib rest 270 else false) 271 | (Some "~", compound) :: rest -> 272 (* General sibling *) 273 let rec find_sibling n = 274 match get_previous_sibling n with 275 | None -> false 276 | Some sib -> 277 if matches_compound sib compound then check sib rest 278 else find_sibling sib 279 in 280 find_sibling current 281 | (None, compound) :: rest -> 282 if matches_compound current compound then check current rest 283 else false 284 | _ -> false 285 in 286 check node rest 287 288and matches_selector node selector = 289 match selector with 290 | Simple s -> matches_simple node s 291 | Compound c -> matches_compound node c 292 | Complex c -> matches_complex node c 293 | List l -> List.exists (fun c -> matches_complex node c) l.selectors 294 295let matches node selector_string = 296 try 297 let selector = Selector_parser.parse_selector selector_string in 298 matches_selector node selector 299 with _ -> false 300 301let rec query_descendants node selector results = 302 List.iter (fun child -> 303 if is_element child && matches_selector child selector then 304 results := child :: !results; 305 query_descendants child selector results; 306 (* Also search template content *) 307 (match child.Dom.template_content with 308 | Some tc -> query_descendants tc selector results 309 | None -> ()) 310 ) node.Dom.children 311 312let query root selector_string = 313 try 314 let selector = Selector_parser.parse_selector selector_string in 315 let results = ref [] in 316 query_descendants root selector results; 317 List.rev !results 318 with _ -> []