(*---------------------------------------------------------------------------
  Copyright (c) 2025 Anil Madhavapeddy . All rights reserved.
  SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** HTML5 Conformance Checker

    Validates HTML5 documents against the
    {{:https://html.spec.whatwg.org/} WHATWG HTML Living Standard}.

    {2 Quick Start}

    {[
      let result =
        Htmlrw_check.check_string
          "<!DOCTYPE html><html><body><img></body></html>"
      in
      if Htmlrw_check.has_errors result then
        print_endline (Htmlrw_check.to_text result)
      else
        print_endline "Valid HTML5!"
    ]}

    {2 Handling Specific Errors}

    Use pattern matching on [error_code] for fine-grained control:

    {[
      List.iter (fun msg ->
        match msg.Htmlrw_check.error_code with
        | Parse code ->
            Printf.printf "Syntax error: %s\n"
              (Html5rw.Parse_error_code.to_string code)
        | Conformance code ->
            (match code with
             | `Img `Missing_alt ->
                 Printf.printf "Accessibility: %s needs alt text\n"
                   (Option.value ~default:"image" msg.element)
             | `Attr (`Duplicate_id _) ->
                 Printf.printf "Duplicate ID found\n"
             | _ ->
                 Printf.printf "Error: %s\n" msg.text)
      ) (Htmlrw_check.errors result)
    ]}

    {2 CI Integration}

    The channel is closed with [Fun.protect] so that it is released even if
    validation raises:

    {[
      let validate_file path =
        let ic = open_in path in
        let result =
          Fun.protect ~finally:(fun () -> close_in ic) (fun () ->
            let reader = Bytesrw.Bytes.Reader.of_in_channel ic in
            Htmlrw_check.check ~system_id:path reader)
        in
        if Htmlrw_check.has_errors result then begin
          print_string (Htmlrw_check.to_gnu result);
          exit 1
        end
    ]}

    {2 What Gets Checked}

    - {b Parse errors}: Malformed syntax per WHATWG parsing specification
    - {b Content model}: Invalid element nesting (e.g., [<div>] inside
      [<span>])
    - {b Attributes}: Missing required, disallowed, or invalid attributes
    - {b Accessibility}: ARIA misuse, missing alt text, form labeling
    - {b Structure}: Missing DOCTYPE, duplicate IDs, heading hierarchy
    - {b Internationalization}: Missing or mismatched lang attributes

    @see <https://html.spec.whatwg.org/> WHATWG HTML Living Standard
    @see <https://validator.w3.org/nu/> Nu HTML Checker *)

(** {1:types Types} *)

(** Message severity level. *)
type severity =
  | Error    (** Conformance violation - document is invalid *)
  | Warning  (** Likely problem - may be intentional *)
  | Info     (** Suggestion for improvement *)

(** Source location in the document.

    Line and column are 1-indexed. The [end_line] and [end_column] fields are
    optional and are [None] when no end position is recorded. *)
type location = {
  line : int;
  column : int;
  end_line : int option;
  end_column : int option;
  system_id : string option;  (** File path or URL if provided *)
}

(** Typed error code. Pattern match to handle specific errors.

    {[
      match msg.error_code with
      | Parse Html5rw.Parse_error_code.Eof_in_tag ->
          (* Unclosed tag at end of file *)
      | Conformance (`Img `Missing_alt) ->
          (* Image without alt attribute *)
      | _ -> ()
    ]} *)
type error_code =
  | Parse of Html5rw.Parse_error_code.t
      (** Syntax error from the HTML5 parser.
          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
            WHATWG parse errors *)
  | Conformance of Error_code.t
      (** Semantic error from conformance checking. *)

(** A validation message. *)
type message = {
  severity : severity;
  text : string;                (** Human-readable description *)
  error_code : error_code;      (** Typed code for pattern matching *)
  location : location option;   (** Source location if available *)
  element : string option;      (** Relevant element (lowercase) *)
  attribute : string option;    (** Relevant attribute (lowercase) *)
  extract : string option;      (** Source excerpt for context *)
}

(** Validation result.

    The type is abstract: use the accessors in the Results section below to
    inspect it, or the Output Formatting functions to render it. *)
type t

(** {1:validation Validation} *)

(* NOTE(review): unlike [check] and [check_parsed], [check_string] takes no
   [?collect_parse_errors] argument. Presumably it uses the same default
   ([true]) - confirm against the implementation. *)

(** Validate HTML from a string.

    {[
      let result = Htmlrw_check.check_string html in
      if Htmlrw_check.has_errors result then
        prerr_endline (Htmlrw_check.to_text result)
    ]}

    @param system_id File path or URL for error messages. *)
val check_string : ?system_id:string -> string -> t

(** Validate HTML from a reader.

    @param collect_parse_errors Include syntax errors (default: [true]).
    @param system_id File path or URL for error messages. *)
val check :
  ?collect_parse_errors:bool ->
  ?system_id:string ->
  Bytesrw.Bytes.Reader.t ->
  t

(** Validate an already-parsed document.

    Useful when the HTML has been parsed separately and conformance checks
    should run without re-parsing.

    @param collect_parse_errors Include syntax errors. The default is not
      stated in this interface; presumably [true] as for {!check} - confirm.
    @param system_id File path or URL for error messages. *)
val check_parsed :
  ?collect_parse_errors:bool ->
  ?system_id:string ->
  Html5rw.t ->
  t

(** {1:results Results} *)

(** All messages in document order. *)
val messages : t -> message list

(** Only error-severity messages. *)
val errors : t -> message list

(** Only warning-severity messages. *)
val warnings : t -> message list

(** Only info-severity messages. *)
val infos : t -> message list

(** Only syntax errors from the parser. *)
val parse_errors : t -> message list

(** Only semantic errors from conformance checking. *)
val conformance_errors : t -> message list

(** [true] if any errors were found. *)
val has_errors : t -> bool

(** [true] if any warnings were found. *)
val has_warnings : t -> bool

(** The parsed document. *)
val document : t -> Html5rw.t

(** The system identifier (file path or URL) if provided. *)
val system_id : t -> string option

(** {1:formatting Output Formatting} *)

(** Human-readable text format.

    {v
file.html:5.3: error [missing-alt]: Element "img" is missing required attribute "alt".
    v} *)
val to_text : t -> string

(** JSON format compatible with Nu HTML Validator.

    {v
{"messages":[{"type":"error","message":"...","firstLine":5,"firstColumn":3}]}
    v} *)
val to_json : t -> string

(** GNU error format for IDE integration.

    {v
file.html:5:3: error: Element "img" is missing required attribute "alt".
    v} *)
val to_gnu : t -> string

(** {1:utilities Utilities} *)

(** ["error"], ["warning"], or ["info"]. *)
val severity_to_string : severity -> string

(** String representation of an error code. *)
val error_code_to_string : error_code -> string

(** Pretty-printer for severity. *)
val pp_severity : Format.formatter -> severity -> unit

(** Pretty-printer for location. *)
val pp_location : Format.formatter -> location -> unit

(** Pretty-printer for message. *)
val pp_message : Format.formatter -> message -> unit

(** {1:error_codes Error Code Types}

    For pattern matching on conformance errors. Parse errors use
    {!Html5rw.Parse_error_code}.

    {[
      match code with
      | `Attr (`Missing_required_attr _) -> ...
      | `Img `Missing_alt -> ...
      | `Aria _ -> ... (* Any ARIA error *)
      | _ -> ...
    ]} *)
module Error_code = Error_code