(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy . All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** CSS Selector Engine

    This module provides CSS selector parsing and matching for querying the
    HTML5 DOM. It supports a subset of CSS3 selectors suitable for common web
    scraping and DOM manipulation tasks.

    {2 Supported Selectors}

    {3 Simple Selectors}
    - Tag: [div], [p], [span]
    - ID: [#myid]
    - Class: [.myclass]
    - Universal: [*]

    {3 Attribute Selectors}
    - Presence: [[attr]]
    - Exact match: [[attr="value"]]
    - Contains word: [[attr~="value"]]
    - Starts with: [[attr^="value"]]
    - Ends with: [[attr$="value"]]
    - Contains: [[attr*="value"]]
    - Hyphen-separated: [[attr|="value"]]

    {3 Pseudo-classes}
    - [:first-child], [:last-child]
    - [:nth-child(n)], [:nth-last-child(n)]
    - [:only-child]
    - [:empty]
    - [:not(selector)]

    {3 Combinators}
    - Descendant: [div p] (p anywhere inside div)
    - Child: [div > p] (p direct child of div)
    - Adjacent sibling: [div + p] (p immediately after div)
    - General sibling: [div ~ p] (p after div, same parent)

    {2 Usage}

    {[
      let doc = Html5rw.parse reader in

      (* Find all paragraphs *)
      let paragraphs = Html5rw.query doc "p" in

      (* Find links with specific class *)
      let links = Html5rw.query doc "a.external" in

      (* Find table cells in rows *)
      let cells = Html5rw.query doc "tr > td" in

      (* Check if a node matches *)
      let is_active = Html5rw.matches node ".active"
    ]}
*)

(** {1 Error Types} *)

(** CSS selector error codes.

    This module provides the {!Error_code.t} variant type that represents
    all possible errors when parsing CSS selectors. Values of this type are
    carried by the {!Selector_error} exception. *)
module Error_code : sig
  type t =
    | Empty_selector
        (** The selector string was empty or contained only whitespace. *)
    | Unterminated_string
        (** A quoted string was not closed before end of input. *)
    | Unterminated_escape
        (** An escape sequence was not completed before end of input. *)
    | Expected_identifier_after_hash
        (** Expected an identifier after [#] for ID selector. *)
    | Expected_identifier_after_dot
        (** Expected an identifier after [.] for class selector. *)
    | Expected_attribute_name
        (** Expected an attribute name inside an attribute selector. *)
    | Expected_closing_bracket
        (** Expected [\]] to close an attribute selector. *)
    | Expected_equals_after_operator of char
        (** Expected [=] after an attribute operator like [~], [|], [^], [$],
            or [*]. *)
    (* NOTE(review): the [char] payload is presumably the operator character
       that was read before the missing [=] - confirm against the lexer. *)
    | Unexpected_character_in_attribute_selector
        (** Found an unexpected character inside an attribute selector. *)
    | Expected_pseudo_class_name
        (** Expected a pseudo-class name after [:]. *)
    | Expected_closing_paren
        (** Expected [)] to close a pseudo-class argument. *)
    | Unexpected_character of char
        (** Found an unexpected character in the selector. *)
    (* NOTE(review): the [char] payload is presumably the offending
       character - confirm against the lexer. *)
    | Expected_attribute_value
        (** Expected a value after the attribute operator. *)
    | Expected_closing_bracket_or_operator
        (** Expected [\]] or an attribute operator like [=]. *)
    | Expected_selector_after_combinator
        (** Expected a selector after a combinator ([>], [+], [~], or
            space). *)
    | Unexpected_token
        (** Found an unexpected token in the selector. *)
    | Expected_end_of_selector
        (** Expected end of selector but found more tokens. *)

  val to_string : t -> string
  (** Convert to a kebab-case string identifier suitable for programmatic
      use. *)

  val to_human_string : t -> string
  (** Convert to a human-readable error message. *)

  val pp : Format.formatter -> t -> unit
  (** Pretty-print a selector error code. *)
end

(** {1 Exceptions} *)

exception Selector_error of Error_code.t
(** Raised when a selector string is malformed.

    The exception contains a typed error code describing the parse error.
    Use {!Error_code.to_string} or {!Error_code.to_human_string} to get a
    string representation. *)

(** {1 Sub-modules} *)

(** Abstract syntax tree for parsed selectors.

    Every type below is declared equal to the type of the same name in
    [Selector_ast], so values are interchangeable between the two modules. *)
module Ast : sig
  (** The kind of a simple selector; the constructors mirror the categories
      listed in the module overview. *)
  type simple_selector_type = Selector_ast.simple_selector_type =
    | Type_tag  (** Tag selector. *)
    | Type_id  (** ID selector. *)
    | Type_class  (** Class selector. *)
    | Type_universal  (** Universal selector. *)
    | Type_attr  (** Attribute selector. *)
    | Type_pseudo  (** Pseudo-class selector. *)

  (** A single simple selector: its kind plus four optional string
      components. *)
  (* NOTE(review): which components are populated for which
     [selector_type] is decided by the parser and is not visible in this
     interface. Presumably [name] is the tag / id / class / attribute /
     pseudo-class name, [operator] and [value] belong to attribute
     selectors, and [arg] is a pseudo-class argument - confirm against
     Selector_ast and the parser. *)
  type simple_selector = Selector_ast.simple_selector = {
    selector_type : simple_selector_type;
        (** Which kind of simple selector this is. *)
    name : string option;  (** Optional name component. *)
    operator : string option;  (** Optional operator component. *)
    value : string option;  (** Optional value component. *)
    arg : string option;  (** Optional argument component. *)
  }

  (** A compound selector: a list of simple selectors. *)
  (* NOTE(review): presumably all listed simple selectors must match the
     same element (e.g. [a.external]) - confirm against the matcher. *)
  type compound_selector = Selector_ast.compound_selector = {
    selectors : simple_selector list;
        (** The simple selectors making up this compound selector. *)
  }

  (** A complex selector: a list of compound selectors, each paired with an
      optional string. *)
  (* NOTE(review): the [string option] is presumably the combinator that
     relates the compound selector to its neighbour, with [None] for the
     part that has none - confirm against the parser. *)
  type complex_selector = Selector_ast.complex_selector = {
    parts : (string option * compound_selector) list;
        (** The parts of the selector, each an optional string paired with
            a compound selector. *)
  }

  (** A list of complex selectors. *)
  (* NOTE(review): presumably the comma-separated alternatives of a
     selector group - confirm against the parser. Note that the field name
     [selectors] is shared with {!compound_selector}. *)
  type selector_list = Selector_ast.selector_list = {
    selectors : complex_selector list;
        (** The complex selectors in this list. *)
  }

  (** A parsed selector, at any of the four levels of structure defined
      above. This is the result type of parsing. *)
  type selector = Selector_ast.selector =
    | Simple of simple_selector  (** A lone simple selector. *)
    | Compound of compound_selector  (** A compound selector. *)
    | Complex of complex_selector  (** A complex selector. *)
    | List of selector_list  (** A list of complex selectors. *)

  val make_simple :
    simple_selector_type ->
    ?name:string ->
    ?operator:string ->
    ?value:string ->
    ?arg:string ->
    unit ->
    simple_selector
  (** Build a {!simple_selector} of the given kind from the optional
      components supplied. The trailing [unit] argument marks the end of
      the optional arguments. *)
  (* NOTE(review): omitted optional arguments presumably leave the
     corresponding field as [None] - confirm against Selector_ast. *)

  val make_compound : simple_selector list -> compound_selector
  (** Build a {!compound_selector} from a list of simple selectors. *)

  val make_complex :
    (string option * compound_selector) list -> complex_selector
  (** Build a {!complex_selector} from a list of parts. *)

  val make_list : complex_selector list -> selector_list
  (** Build a {!selector_list} from a list of complex selectors. *)

  val pp_simple_selector_type :
    Format.formatter -> simple_selector_type -> unit
  (** Pretty-print a simple selector type. *)

  val pp_simple_selector : Format.formatter -> simple_selector -> unit
  (** Pretty-print a simple selector. *)

  val pp_compound_selector : Format.formatter -> compound_selector -> unit
  (** Pretty-print a compound selector. *)

  val pp_complex_selector : Format.formatter -> complex_selector -> unit
  (** Pretty-print a complex selector. *)

  val pp_selector_list : Format.formatter -> selector_list -> unit
  (** Pretty-print a selector list. *)

  val pp : Format.formatter -> selector -> unit
  (** Pretty-print a selector. *)
end

(** Token types for the selector lexer. *)
module Token : sig
  type t = Selector_token.t
  (** A lexer token; an alias for [Selector_token.t]. No constructors or
      operations are re-exported through this module. *)
end

(** {1 Functions} *)

val parse : string -> Ast.selector
(** Parse a CSS selector string.

    @raise Selector_error if the selector is malformed. *)

val query : Dom.node -> string -> Dom.node list
(** Query the DOM tree with a CSS selector.

    Returns all nodes matching the selector in document order.

    {[
      let divs = query root_node "div.content > p"
    ]}

    @raise Selector_error if the selector is malformed. *)

val matches : Dom.node -> string -> bool
(** Check if a node matches a CSS selector.

    {[
      if matches node ".active" then
        (* node has class "active" *)
    ]}

    @raise Selector_error if the selector is malformed. *)