(* CSS selector lexer *) exception Selector_error of Selector_error_code.t type t = { input : string; len : int; mutable pos : int; } let create input = { input; len = String.length input; pos = 0 } let peek t = if t.pos < t.len then Some t.input.[t.pos] else None let advance t = if t.pos < t.len then t.pos <- t.pos + 1 let consume t = let c = peek t in advance t; c let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\r' || c = '\x0C' let is_name_start c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c = '_' || c = '-' || Char.code c > 127 let is_name_char c = is_name_start c || (c >= '0' && c <= '9') let skip_whitespace t = while t.pos < t.len && is_whitespace t.input.[t.pos] do advance t done let read_name t = let start = t.pos in while t.pos < t.len && is_name_char t.input.[t.pos] do advance t done; String.sub t.input start (t.pos - start) let read_string t quote = advance t; (* Skip opening quote *) let buf = Buffer.create 32 in let rec loop () = match peek t with | None -> raise (Selector_error Selector_error_code.Unterminated_string) | Some c when c = quote -> advance t | Some '\\' -> advance t; (match peek t with | Some c -> Buffer.add_char buf c; advance t; loop () | None -> raise (Selector_error Selector_error_code.Unterminated_escape)) | Some c -> Buffer.add_char buf c; advance t; loop () in loop (); Buffer.contents buf let read_unquoted_attr_value t = let start = t.pos in while t.pos < t.len && let c = t.input.[t.pos] in not (is_whitespace c) && c <> ']' do advance t done; String.sub t.input start (t.pos - start) let tokenize input = let t = create input in let tokens = ref [] in let pending_ws = ref false in while t.pos < t.len do let c = t.input.[t.pos] in if is_whitespace c then begin pending_ws := true; skip_whitespace t end else if c = '>' || c = '+' || c = '~' then begin pending_ws := false; advance t; skip_whitespace t; tokens := Selector_token.Combinator (String.make 1 c) :: !tokens end else begin if !pending_ws && !tokens <> [] && c <> ',' then tokens := Selector_token.Combinator " " :: !tokens; pending_ws := false; match c with | '*' -> advance t; tokens := Selector_token.Universal :: !tokens | '#' -> advance t; let name = read_name t in if name = "" then raise (Selector_error Selector_error_code.Expected_identifier_after_hash); tokens := Selector_token.Id name :: !tokens | '.' -> advance t; let name = read_name t in if name = "" then raise (Selector_error Selector_error_code.Expected_identifier_after_dot); tokens := Selector_token.Class name :: !tokens | '[' -> advance t; tokens := Selector_token.Attr_start :: !tokens; skip_whitespace t; let attr_name = read_name t in if attr_name = "" then raise (Selector_error Selector_error_code.Expected_attribute_name); tokens := Selector_token.Tag attr_name :: !tokens; skip_whitespace t; (match peek t with | Some ']' -> advance t; tokens := Selector_token.Attr_end :: !tokens | Some '=' -> advance t; tokens := Selector_token.Attr_op "=" :: !tokens; skip_whitespace t; let value = match peek t with | Some '"' -> read_string t '"' | Some '\'' -> read_string t '\'' | _ -> read_unquoted_attr_value t in tokens := Selector_token.String value :: !tokens; skip_whitespace t; if peek t <> Some ']' then raise (Selector_error Selector_error_code.Expected_closing_bracket); advance t; tokens := Selector_token.Attr_end :: !tokens | Some ('~' | '|' | '^' | '$' | '*') as op_char -> let op_c = Option.get op_char in advance t; if peek t <> Some '=' then raise (Selector_error (Selector_error_code.Expected_equals_after_operator op_c)); advance t; tokens := Selector_token.Attr_op (String.make 1 op_c ^ "=") :: !tokens; skip_whitespace t; let value = match peek t with | Some '"' -> read_string t '"' | Some '\'' -> read_string t '\'' | _ -> read_unquoted_attr_value t in tokens := Selector_token.String value :: !tokens; skip_whitespace t; if peek t <> Some ']' then raise (Selector_error Selector_error_code.Expected_closing_bracket); advance t; tokens := Selector_token.Attr_end :: !tokens | _ -> raise (Selector_error Selector_error_code.Unexpected_character_in_attribute_selector)) | ',' -> advance t; skip_whitespace t; tokens := Selector_token.Comma :: !tokens | ':' -> advance t; tokens := Selector_token.Colon :: !tokens; let name = read_name t in if name = "" then raise (Selector_error Selector_error_code.Expected_pseudo_class_name); tokens := Selector_token.Tag name :: !tokens; if peek t = Some '(' then begin advance t; tokens := Selector_token.Paren_open :: !tokens; skip_whitespace t; (* Read argument until closing paren *) let depth = ref 1 in let start = t.pos in while !depth > 0 && t.pos < t.len do match t.input.[t.pos] with | '(' -> incr depth; advance t | ')' -> decr depth; if !depth > 0 then advance t | _ -> advance t done; let arg = String.trim (String.sub t.input start (t.pos - start)) in if arg <> "" then tokens := Selector_token.String arg :: !tokens; if peek t <> Some ')' then raise (Selector_error Selector_error_code.Expected_closing_paren); advance t; tokens := Selector_token.Paren_close :: !tokens end | _ when is_name_start c -> let name = read_name t in tokens := Selector_token.Tag (String.lowercase_ascii name) :: !tokens | _ -> raise (Selector_error (Selector_error_code.Unexpected_character c)) end done; tokens := Selector_token.EOF :: !tokens; List.rev !tokens