···5566module Error_code = Error_code
7788-(* Public types - defined here to avoid re-exporting internal modules *)
88+(* Public types *)
991010type severity = Error | Warning | Info
1111···1717 system_id : string option;
1818}
19192020+type error_code =
2121+ | Parse of Html5rw.Parse_error_code.t
2222+ | Conformance of Error_code.t
2323+2024type message = {
2125 severity : severity;
2226 text : string;
2323- code : string;
2424- error_code : Error_code.t option;
2727+ error_code : error_code;
2528 location : location option;
2629 element : string option;
2730 attribute : string option;
···4952 system_id = loc.system_id;
5053}
51545555+let convert_error_code = function
5656+ | Message.Parse_error code -> Parse code
5757+ | Message.Conformance_error code -> Conformance code
5858+5259let convert_message (m : Message.t) : message = {
5360 severity = convert_severity m.severity;
5461 text = m.message;
5555- code = m.code;
5656- error_code = m.error_code;
6262+ error_code = convert_error_code m.error_code;
5763 location = Option.map convert_location m.location;
5864 element = m.element;
5965 attribute = m.attribute;
···77837884 match Xhtml_parser.parse_xhtml content with
7985 | Ok root ->
8080- (* Run all registered checkers via DOM traversal *)
8186 let registry = Checker_registry.default () in
8287 Dom_walker.walk_registry registry collector root;
8388 let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in
···111116 { doc; msgs; sys_id = system_id }
112117 end
113118119119+let check_string ?system_id html =
120120+ let reader = Bytesrw.Bytes.Reader.of_string html in
121121+ check ?system_id reader
122122+114123let check_parsed ?(collect_parse_errors = true) ?system_id doc =
115124 let collector = Message_collector.create () in
116125···138147let infos t =
139148 List.filter (fun msg -> msg.severity = Info) t.msgs
140149150150+let parse_errors t =
151151+ List.filter (fun msg ->
152152+ match msg.error_code with Parse _ -> true | Conformance _ -> false
153153+ ) t.msgs
154154+155155+let conformance_errors t =
156156+ List.filter (fun msg ->
157157+ match msg.error_code with Parse _ -> false | Conformance _ -> true
158158+ ) t.msgs
159159+141160let has_errors t =
142161 List.exists (fun msg -> msg.severity = Error) t.msgs
143162···163182 system_id = loc.system_id;
164183}
165184185185+let unconvert_error_code = function
186186+ | Parse code -> Message.Parse_error code
187187+ | Conformance code -> Message.Conformance_error code
188188+166189let unconvert_message (m : message) : Message.t = {
167190 severity = unconvert_severity m.severity;
168191 message = m.text;
169169- code = m.code;
170170- error_code = m.error_code;
192192+ error_code = unconvert_error_code m.error_code;
171193 location = Option.map unconvert_location m.location;
172194 element = m.element;
173195 attribute = m.attribute;
···193215 | Warning -> "warning"
194216 | Info -> "info"
195217218218+let error_code_to_string = function
219219+ | Parse code -> Html5rw.Parse_error_code.to_string code
220220+ | Conformance code -> Error_code.code_string code
221221+196222let pp_severity fmt sev =
197223 Format.pp_print_string fmt (severity_to_string sev)
198224···207233 match msg.location with
208234 | Some loc -> Format.fprintf fmt " (at %a)" pp_location loc
209235 | None -> ()
210210-211211-let message_to_string msg =
212212- Format.asprintf "%a" pp_message msg
+135-247
lib/htmlrw_check/htmlrw_check.mli
···5566(** HTML5 Conformance Checker
7788- This module validates HTML5 documents against the
99- {{:https://html.spec.whatwg.org/} WHATWG HTML Living Standard},
1010- reporting conformance errors, warnings, and suggestions.
88+ Validates HTML5 documents against the
99+ {{:https://html.spec.whatwg.org/} WHATWG HTML Living Standard}.
11101211 {2 Quick Start}
13121413 {[
1515- (* Validate HTML from a string *)
1616- let html = "<html><body><img></body></html>" in
1717- let reader = Bytesrw.Bytes.Reader.of_string html in
1818- let result = Htmlrw_check.check reader in
1414+ let result = Htmlrw_check.check_string "<html><body><img></body></html>" in
19152020- if Htmlrw_check.has_errors result then begin
2121- List.iter (fun msg ->
2222- Printf.printf "%s: %s\n"
2323- (Htmlrw_check.severity_to_string msg.Htmlrw_check.severity)
2424- msg.Htmlrw_check.text
2525- ) (Htmlrw_check.errors result)
2626- end
1616+ if Htmlrw_check.has_errors result then
1717+ print_endline (Htmlrw_check.to_text result)
1818+ else
1919+ print_endline "Valid HTML5!"
2720 ]}
28212929- {2 What Gets Checked}
2222+ {2 Handling Specific Errors}
30233131- The checker validates:
2424+ Use pattern matching on {!field-message.error_code} for fine-grained control:
32253333- - {b Parse errors}: Malformed HTML syntax (missing end tags, invalid
3434- nesting, etc.) per the WHATWG parsing specification
3535- - {b Content model}: Elements appearing in contexts where they're not
3636- allowed (e.g., [<div>] inside [<p>])
3737- - {b Attributes}: Missing required attributes, disallowed attributes,
3838- and invalid attribute values
3939- - {b Accessibility}: ARIA role/attribute misuse, missing alt text on
4040- images, form labeling issues
4141- - {b Document structure}: Missing DOCTYPE, duplicate IDs, heading
4242- hierarchy issues
4343- - {b Internationalization}: Missing or mismatched lang attributes
2626+ {[
2727+ List.iter (fun msg ->
2828+ match msg.Htmlrw_check.error_code with
2929+ | Parse code ->
3030+ Printf.printf "Syntax error: %s\n"
3131+ (Html5rw.Parse_error_code.to_string code)
3232+ | Conformance code ->
3333+ match code with
3434+ | `Img `Missing_alt ->
3535+ Printf.printf "Accessibility: %s needs alt text\n"
3636+ (Option.value ~default:"image" msg.element)
3737+ | `Attr (`Duplicate_id _) ->
3838+ Printf.printf "Duplicate ID found\n"
3939+ | _ ->
4040+ Printf.printf "Error: %s\n" msg.text
4141+ ) (Htmlrw_check.errors result)
4242+ ]}
44434545- {2 Output Formats}
4444+ {2 CI Integration}
46454747- Results can be formatted as:
4848- - {b Text}: Human-readable messages for terminal output
4949- - {b JSON}: Machine-readable format compatible with Nu HTML Validator
5050- - {b GNU}: Error format for IDE integration
4646+ {[
4747+ let validate_file path =
4848+ let ic = open_in path in
4949+ let reader = Bytesrw.Bytes.Reader.of_in_channel ic in
5050+ let result = Htmlrw_check.check ~system_id:path reader in
5151+ close_in ic;
5252+ if Htmlrw_check.has_errors result then begin
5353+ print_string (Htmlrw_check.to_gnu result);
5454+ exit 1
5555+ end
5656+ ]}
51575252- @see <https://html.spec.whatwg.org/>
5353- WHATWG HTML Living Standard
5454- @see <https://validator.w3.org/nu/>
5555- Nu HTML Checker (reference validator) *)
5858+ {2 What Gets Checked}
56595757-(** {1:types Types} *)
6060+ - {b Parse errors}: Malformed syntax per WHATWG parsing specification
6161+ - {b Content model}: Invalid element nesting (e.g., [<div>] inside [<p>])
6262+ - {b Attributes}: Missing required, disallowed, or invalid attributes
6363+ - {b Accessibility}: ARIA misuse, missing alt text, form labeling
6464+ - {b Structure}: Missing DOCTYPE, duplicate IDs, heading hierarchy
6565+ - {b Internationalization}: Missing or mismatched lang attributes
58665959-(** Message severity level.
6767+ @see <https://html.spec.whatwg.org/> WHATWG HTML Living Standard
6868+ @see <https://validator.w3.org/nu/> Nu HTML Checker *)
60696161- - [Error]: Conformance error - the document violates the HTML5 spec
6262- - [Warning]: Likely problem - should be reviewed but may be intentional
6363- - [Info]: Suggestion - best practice recommendation *)
6464-type severity = Error | Warning | Info
65706666-(** Source location of a validation issue.
7171+(** {1:types Types} *)
7272+7373+(** Message severity level. *)
7474+type severity =
7575+ | Error (** Conformance violation - document is invalid *)
7676+ | Warning (** Likely problem - may be intentional *)
7777+ | Info (** Suggestion for improvement *)
67786868- Locations use 1-based line and column numbers matching typical editor
6969- conventions. The [system_id] field contains the file path or URL if one
7070- was provided to the checker. *)
7979+(** Source location in the document. Line and column are 1-indexed. *)
7180type location = {
7281 line : int;
7373- (** Line number (1-indexed) where the issue was found. *)
7474-7582 column : int;
7676- (** Column number (1-indexed) within the line. *)
7777-7883 end_line : int option;
7979- (** End line for issues spanning multiple lines. *)
8080-8184 end_column : int option;
8282- (** End column for range-based issues. *)
8383-8484- system_id : string option;
8585- (** File path or URL, if provided to the checker. *)
8585+ system_id : string option; (** File path or URL if provided *)
8686}
87878888-(** A validation message describing a conformance issue.
8888+(** Typed error code. Pattern match to handle specific errors.
89899090- Each message contains:
9191- - The {!field-severity} indicating how serious the issue is
9292- - Human-readable {!field-text} explaining the problem
9393- - Machine-readable {!field-code} for programmatic handling
9494- - Optional {!field-error_code} for fine-grained pattern matching
9595- - Source {!field-location} when available
9696- - Context ({!field-element}, {!field-attribute}) when relevant *)
9090+ {[
9191+ match msg.error_code with
9292+ | Parse Html5rw.Parse_error_code.Eof_in_tag ->
9393+ (* Unclosed tag at end of file *)
9494+ | Conformance (`Img `Missing_alt) ->
9595+ (* Image without alt attribute *)
9696+ | _ -> ()
9797+ ]} *)
9898+type error_code =
9999+ | Parse of Html5rw.Parse_error_code.t
100100+ (** Syntax error from the HTML5 parser.
101101+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors> *)
102102+ | Conformance of Error_code.t
103103+ (** Semantic error from conformance checking. *)
104104+105105+(** A validation message. *)
97106type message = {
98107 severity : severity;
9999- (** Severity level of this message. *)
100100-101101- text : string;
102102- (** Human-readable description of the issue.
103103-104104- The text follows Nu HTML Validator message conventions, using
105105- Unicode quotes around element/attribute names:
106106- ["Element \xe2\x80\x9cdiv\xe2\x80\x9d not allowed as child..."] *)
107107-108108- code : string;
109109- (** Machine-readable error code in kebab-case.
110110-111111- Examples: ["missing-alt"], ["duplicate-id"], ["unexpected-end-tag"].
112112- Useful for filtering or categorizing errors programmatically. *)
113113-114114- error_code : Error_code.t option;
115115- (** Typed error code for pattern matching.
116116-117117- When present, allows fine-grained handling of specific errors:
118118- {[
119119- match msg.error_code with
120120- | Some (`Img `Missing_alt) -> suggest_alt_text ()
121121- | Some (`Attr (`Duplicate_id (`Id id))) -> highlight_duplicate id
122122- | _ -> show_generic_error msg
123123- ]} *)
124124-125125- location : location option;
126126- (** Source location where the issue was detected.
127127-128128- [None] for document-level issues or when location tracking is
129129- unavailable (e.g., for some content model errors). *)
130130-131131- element : string option;
132132- (** Element name relevant to this message (e.g., ["img"], ["div"]).
133133-134134- Lowercase, without angle brackets. *)
135135-136136- attribute : string option;
137137- (** Attribute name relevant to this message (e.g., ["alt"], ["href"]).
138138-139139- Lowercase. Only present for attribute-related errors. *)
140140-141141- extract : string option;
142142- (** Source excerpt showing context around the error.
143143-144144- Typically a few characters before and after the problematic location.
145145- Useful for displaying the error in context. *)
108108+ text : string; (** Human-readable description *)
109109+ error_code : error_code; (** Typed code for pattern matching *)
110110+ location : location option; (** Source location if available *)
111111+ element : string option; (** Relevant element (lowercase) *)
112112+ attribute : string option; (** Relevant attribute (lowercase) *)
113113+ extract : string option; (** Source excerpt for context *)
146114}
147115148148-(** Validation result containing all messages and the parsed document.
149149-150150- Use {!messages}, {!errors}, {!warnings}, and {!infos} to access
151151- the validation messages. Use {!document} to access the parsed DOM. *)
116116+(** Validation result. Use accessors below to inspect. *)
152117type t
153118154154-(** {1:validation Validation Functions} *)
155119156156-(** Validate HTML from a reader.
120120+(** {1:validation Validation} *)
157121158158- Parses the HTML input and runs all conformance checks, returning
159159- a result containing any validation messages.
122122+(** Validate HTML from a string.
160123161161- {b Example:}
162124 {[
163163- let ic = open_in "page.html" in
164164- let reader = Bytesrw.Bytes.Reader.of_in_channel ic in
165165- let result = Htmlrw_check.check ~system_id:"page.html" reader in
166166- close_in ic;
167167-125125+ let result = Htmlrw_check.check_string html in
168126 if Htmlrw_check.has_errors result then
169169- print_endline (Htmlrw_check.to_text result)
127127+ prerr_endline (Htmlrw_check.to_text result)
170128 ]}
171129172172- @param collect_parse_errors If [true] (default), include HTML parse
173173- errors in the results. Set to [false] to only get conformance
174174- checker errors (content model, attributes, etc.).
175175- @param system_id File path or URL for the document. Used in error
176176- messages and the {!location} field. Does not affect validation. *)
130130+ @param system_id File path or URL for error messages. *)
131131+val check_string : ?system_id:string -> string -> t
132132+133133+(** Validate HTML from a reader.
134134+135135+ @param collect_parse_errors Include syntax errors (default: [true]).
136136+ @param system_id File path or URL for error messages. *)
177137val check :
178138 ?collect_parse_errors:bool ->
179139 ?system_id:string ->
180140 Bytesrw.Bytes.Reader.t ->
181141 t
182142183183-(** Validate an already-parsed HTML document.
184184-185185- Runs conformance checks on an existing {!Html5rw.t} parse result.
186186- Useful when you've already parsed the document and want to validate
187187- it without re-parsing.
188188-189189- {b Example:}
190190- {[
191191- let doc = Html5rw.parse reader in
192192- (* ... manipulate the DOM ... *)
193193- let result = Htmlrw_check.check_parsed doc in
194194- ]}
143143+(** Validate an already-parsed document.
195144196196- @param collect_parse_errors If [true] (default), include any parse
197197- errors that were collected during the original parse.
198198- @param system_id File path or URL for error reporting. *)
145145+ Useful when you've parsed the HTML separately and want to run
146146+ conformance checks without re-parsing. *)
199147val check_parsed :
200148 ?collect_parse_errors:bool ->
201149 ?system_id:string ->
202150 Html5rw.t ->
203151 t
204152205205-(** {1:results Result Accessors} *)
206153207207-(** Get all validation messages.
154154+(** {1:results Results} *)
208155209209- Returns messages in the order they were generated, which roughly
210210- corresponds to document order for element-related errors. *)
156156+(** All messages in document order. *)
211157val messages : t -> message list
212158213213-(** Get only error messages.
214214-215215- Errors indicate conformance violations - the document does not
216216- comply with the HTML5 specification. *)
159159+(** Only error-severity messages. *)
217160val errors : t -> message list
218161219219-(** Get only warning messages.
220220-221221- Warnings indicate likely problems that may be intentional in
222222- some cases (e.g., deprecated features still in use). *)
162162+(** Only warning-severity messages. *)
223163val warnings : t -> message list
224164225225-(** Get only informational messages.
226226-227227- Info messages are suggestions for best practices that don't
228228- affect conformance. *)
165165+(** Only info-severity messages. *)
229166val infos : t -> message list
230167231231-(** Test if any errors were found.
168168+(** Only syntax errors from the parser. *)
169169+val parse_errors : t -> message list
232170233233- Equivalent to [errors result <> []] but more efficient. *)
234234-val has_errors : t -> bool
171171+(** Only semantic errors from conformance checking. *)
172172+val conformance_errors : t -> message list
235173236236-(** Test if any warnings were found.
174174+(** [true] if any errors were found. *)
175175+val has_errors : t -> bool
237176238238- Equivalent to [warnings result <> []] but more efficient. *)
177177+(** [true] if any warnings were found. *)
239178val has_warnings : t -> bool
240179241241-(** Get the parsed document.
242242-243243- Returns the DOM tree that was validated. For {!check}, this is the
244244- newly parsed document. For {!check_parsed}, this is the document
245245- that was passed in. *)
180180+(** The parsed document. *)
246181val document : t -> Html5rw.t
247182248248-(** Get the system identifier.
249249-250250- Returns the file path or URL that was passed to {!check} or
251251- {!check_parsed}, or [None] if not provided. *)
183183+(** The system identifier (file path or URL) if provided. *)
252184val system_id : t -> string option
253185186186+254187(** {1:formatting Output Formatting} *)
255188256256-(** Format messages as human-readable text.
189189+(** Human-readable text format.
257190258258- Produces multi-line output suitable for terminal display:
259191 {v
260260- Error: Element "img" is missing required attribute "alt".
261261- At line 5, column 3
262262- <img src="photo.jpg">
263263- v}
264264-265265- Messages are formatted with severity, description, location,
266266- and source excerpt when available. *)
192192+file.html:5.3: error [missing-alt]: Element "img" is missing required attribute "alt".
193193+ v} *)
267194val to_text : t -> string
268195269269-(** Format messages as JSON.
196196+(** JSON format compatible with Nu HTML Validator.
270197271271- Produces JSON output compatible with the Nu HTML Validator format:
272198 {v
273273- {
274274- "messages": [
275275- {
276276- "type": "error",
277277- "message": "Element \"img\" is missing required attribute \"alt\".",
278278- "lastLine": 5,
279279- "lastColumn": 3
280280- }
281281- ]
282282- }
283283- v}
284284-285285- Useful for machine processing and integration with other tools. *)
199199+{"messages":[{"type":"error","message":"...","firstLine":5,"firstColumn":3}]}
200200+ v} *)
286201val to_json : t -> string
287202288288-(** Format messages in GNU error format.
203203+(** GNU error format for IDE integration.
289204290290- Produces one-line-per-error output for IDE integration:
291205 {v
292292- page.html:5:3: error: Element "img" is missing required attribute "alt".
293293- v}
294294-295295- This format is recognized by many editors and build tools. *)
206206+file.html:5:3: error: Element "img" is missing required attribute "alt".
207207+ v} *)
296208val to_gnu : t -> string
297209298298-(** {1:utilities Utility Functions} *)
299210300300-(** Convert severity to lowercase string.
211211+(** {1:utilities Utilities} *)
301212302302- Returns ["error"], ["warning"], or ["info"]. *)
213213+(** ["error"], ["warning"], or ["info"]. *)
303214val severity_to_string : severity -> string
304215305305-(** Pretty-print a severity value. *)
216216+(** String representation of an error code. *)
217217+val error_code_to_string : error_code -> string
218218+219219+(** Pretty-printer for severity. *)
306220val pp_severity : Format.formatter -> severity -> unit
307221308308-(** Pretty-print a location. *)
222222+(** Pretty-printer for location. *)
309223val pp_location : Format.formatter -> location -> unit
310224311311-(** Pretty-print a message.
312312-313313- Includes severity, text, and location if available. *)
225225+(** Pretty-printer for message. *)
314226val pp_message : Format.formatter -> message -> unit
315227316316-(** Convert a message to a single-line string.
317228318318- Includes severity and message text. *)
319319-val message_to_string : message -> string
320320-321321-(** {1:error_codes Error Codes}
229229+(** {1:error_codes Error Code Types}
322230323323- The {!Error_code} module provides typed error codes for programmatic
324324- handling of validation issues. Use pattern matching to handle specific
325325- errors:
231231+ For pattern matching on conformance errors. Parse errors use
232232+ {!Html5rw.Parse_error_code}.
326233327234 {[
328328- let handle_message msg =
329329- match msg.Htmlrw_check.error_code with
330330- | Some (`Img `Missing_alt) ->
331331- (* Image accessibility issue *)
332332- suggest_alt_text msg
333333- | Some (`Attr (`Duplicate_id (`Id id))) ->
334334- (* Duplicate ID found *)
335335- highlight_all_with_id id
336336- | Some (`Aria _) ->
337337- (* Any ARIA-related error *)
338338- show_aria_help ()
339339- | _ ->
340340- (* Generic handling *)
341341- display_error msg
342342- ]}
343343-344344- The error codes are organized into categories:
345345- - [`Attr _]: Attribute errors (missing, invalid, duplicate)
346346- - [`Element _]: Element/content model errors
347347- - [`Aria _]: ARIA accessibility errors
348348- - [`Img _]: Image-related errors
349349- - [`Table _]: Table structure errors
350350- - And more...
351351-352352- See {!Error_code} for the complete type definition. *)
235235+ match code with
236236+ | `Attr (`Missing_required_attr _) -> ...
237237+ | `Img `Missing_alt -> ...
238238+ | `Aria _ -> ... (* Any ARIA error *)
239239+ | _ -> ...
240240+ ]} *)
353241module Error_code = Error_code
···1818 system_id : string option; (** File path or URL *)
1919}
20202121+(** Unified error code type covering both parse errors and conformance errors. *)
2222+type error_code =
2323+ | Parse_error of Html5rw.Parse_error_code.t
2424+ (** Parse error from the HTML5 tokenizer/parser *)
2525+ | Conformance_error of Error_code.t
2626+ (** Conformance error from semantic validation *)
2727+2128(** A validation message. *)
2229type t = {
2330 severity : severity;
2431 message : string; (** Human-readable description *)
2525- code : string; (** Machine-readable error code *)
2626- error_code : Error_code.t option; (** Typed error code if available *)
3232+ error_code : error_code; (** Typed error code *)
2733 location : location option;
2834 element : string option; (** Element name if relevant *)
2935 attribute : string option; (** Attribute name if relevant *)
···32383339(** {1 Constructors} *)
34403535-(** Create a message from a typed error code (preferred method). *)
3636-val of_error_code :
4141+(** Create a message from a conformance error code. *)
4242+val of_conformance_error :
3743 ?location:location ->
3844 ?element:string ->
3945 ?attribute:string ->
···4147 Error_code.t ->
4248 t
43494444-(** Create a validation message with specified severity (legacy). *)
4545-val make :
4646- severity:severity ->
4747- message:string ->
4848- ?code:string ->
4949- ?location:location ->
5050- ?element:string ->
5151- ?attribute:string ->
5252- ?extract:string ->
5353- unit ->
5454- t
5555-5656-(** Create an error message (legacy). *)
5757-val error :
5858- message:string ->
5959- ?code:string ->
5050+(** Create a message from a parse error code. *)
5151+val of_parse_error :
6052 ?location:location ->
6153 ?element:string ->
6254 ?attribute:string ->
6355 ?extract:string ->
6464- unit ->
6565- t
6666-6767-(** Create a warning message (legacy). *)
6868-val warning :
6956 message:string ->
7070- ?code:string ->
7171- ?location:location ->
7272- ?element:string ->
7373- ?attribute:string ->
7474- ?extract:string ->
7575- unit ->
7676- t
7777-7878-(** Create an informational message (legacy). *)
7979-val info :
8080- message:string ->
8181- ?code:string ->
8282- ?location:location ->
8383- ?element:string ->
8484- ?attribute:string ->
8585- ?extract:string ->
8686- unit ->
5757+ Html5rw.Parse_error_code.t ->
8758 t
88598960(** Create a location record. *)
···9566 ?system_id:string ->
9667 unit ->
9768 location
6969+7070+(** {1 Utilities} *)
7171+7272+(** Get the string representation of an error code. *)
7373+val error_code_to_string : error_code -> string
98749975(** {1 Formatting} *)
10076
+2-27
lib/htmlrw_check/message_collector.ml
···13131414let add t msg = t.messages <- msg :: t.messages
15151616-(** Add a message from a typed error code *)
1616+(** Add a message from a typed conformance error code *)
1717let add_typed t ?location ?element ?attribute ?extract error_code =
1818 (* Use provided location, or fall back to current_location *)
1919 let loc = match location with
2020 | Some _ -> location
2121 | None -> t.current_location
2222 in
2323- let msg = Message.of_error_code ?location:loc ?element ?attribute ?extract error_code in
2424- add t msg
2525-2626-(** Add an error from a typed error code *)
2727-let add_error_code t ?location ?element ?attribute ?extract error_code =
2828- add_typed t ?location ?element ?attribute ?extract error_code
2929-3030-(** Legacy: Add an error with manual message text *)
3131-let add_error t ~message ?code ?location ?element ?attribute ?extract () =
3232- let msg =
3333- Message.error ~message ?code ?location ?element ?attribute ?extract ()
3434- in
3535- add t msg
3636-3737-(** Legacy: Add a warning with manual message text *)
3838-let add_warning t ~message ?code ?location ?element ?attribute ?extract () =
3939- let msg =
4040- Message.warning ~message ?code ?location ?element ?attribute ?extract ()
4141- in
4242- add t msg
4343-4444-(** Legacy: Add an info message with manual message text *)
4545-let add_info t ~message ?code ?location ?element ?attribute ?extract () =
4646- let msg =
4747- Message.info ~message ?code ?location ?element ?attribute ?extract ()
4848- in
2323+ let msg = Message.of_conformance_error ?location:loc ?element ?attribute ?extract error_code in
4924 add t msg
50255126let messages t = List.rev t.messages
+5-52
lib/htmlrw_check/message_collector.mli
···2020(** Get the current location. *)
2121val get_current_location : t -> Message.location option
22222323-(** {1 Adding Messages - Typed Error Codes (Preferred)} *)
2424-2525-(** Add a message from a typed error code. *)
2626-val add_typed :
2727- t ->
2828- ?location:Message.location ->
2929- ?element:string ->
3030- ?attribute:string ->
3131- ?extract:string ->
3232- Error_code.t ->
3333- unit
3434-3535-(** Add an error from a typed error code. Alias for add_typed. *)
3636-val add_error_code :
3737- t ->
3838- ?location:Message.location ->
3939- ?element:string ->
4040- ?attribute:string ->
4141- ?extract:string ->
4242- Error_code.t ->
4343- unit
4444-4545-(** {1 Adding Messages - Legacy (for migration)} *)
2323+(** {1 Adding Messages} *)
46244725(** Add a message to the collector. *)
4826val add : t -> Message.t -> unit
49275050-(** Add an error message to the collector (legacy). *)
5151-val add_error :
2828+(** Add a message from a typed conformance error code.
2929+ Uses the current location if no explicit location is provided. *)
3030+val add_typed :
5231 t ->
5353- message:string ->
5454- ?code:string ->
5555- ?location:Message.location ->
5656- ?element:string ->
5757- ?attribute:string ->
5858- ?extract:string ->
5959- unit ->
6060- unit
6161-6262-(** Add a warning message to the collector (legacy). *)
6363-val add_warning :
6464- t ->
6565- message:string ->
6666- ?code:string ->
6732 ?location:Message.location ->
6833 ?element:string ->
6934 ?attribute:string ->
7035 ?extract:string ->
7171- unit ->
7272- unit
7373-7474-(** Add an info message to the collector (legacy). *)
7575-val add_info :
7676- t ->
7777- message:string ->
7878- ?code:string ->
7979- ?location:Message.location ->
8080- ?element:string ->
8181- ?attribute:string ->
8282- ?extract:string ->
8383- unit ->
3636+ Error_code.t ->
8437 unit
85388639(** {1 Retrieving Messages} *)
+3-3
lib/htmlrw_check/message_format.ml
···2424 match system_id with Some s -> s | None -> "input")
2525 in
2626 let severity_str = Message.severity_to_string msg.Message.severity in
2727- let code_str = " [" ^ msg.Message.code ^ "]" in
2727+ let code_str = " [" ^ Message.error_code_to_string msg.Message.error_code ^ "]" in
2828 let elem_str =
2929 match msg.Message.element with
3030 | Some e -> " (element: " ^ e ^ ")"
···5959 match system_id with Some s -> s ^ ":0:0" | None -> "input:0:0")
6060 in
6161 let severity_str = Message.severity_to_string msg.Message.severity in
6262- let code_str = " [" ^ msg.Message.code ^ "]" in
6262+ let code_str = " [" ^ Message.error_code_to_string msg.Message.error_code ^ "]" in
6363 Buffer.add_string buf
6464 (Printf.sprintf "%s: %s%s: %s\n" loc_str severity_str code_str
6565 msg.Message.message))
···7272 let message_text = String (msg.Message.message, Meta.none) in
7373 let base = [ (("type", Meta.none), severity); (("message", Meta.none), message_text) ] in
7474 let with_code =
7575- (("subType", Meta.none), String (msg.Message.code, Meta.none)) :: base
7575+ (("subType", Meta.none), String (Message.error_code_to_string msg.Message.error_code, Meta.none)) :: base
7676 in
7777 let with_location =
7878 match msg.Message.location with
+79-84
lib/htmlrw_check/parse_error_bridge.ml
···33 SPDX-License-Identifier: MIT
44 ---------------------------------------------------------------------------*)
5566+(** Generate human-readable message for a parse error code *)
77+let message_of_parse_error code =
88+ let code_str = Html5rw.Parse_error_code.to_string code in
99+ match code with
1010+ | Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus ->
1111+ "Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag."
1212+ | Html5rw.Parse_error_code.Null_character_reference ->
1313+ "Character reference expands to zero."
1414+ | Html5rw.Parse_error_code.Tree_construction_error s ->
1515+ (* Check for control-character/noncharacter/surrogate with codepoint info *)
1616+ (try
1717+ if String.length s > 28 && String.sub s 0 28 = "control-character-in-input-s" then
1818+ let colon_pos = String.index s ':' in
1919+ let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
2020+ let cp = int_of_string ("0x" ^ cp_str) in
2121+ Printf.sprintf "Forbidden code point U+%04x." cp
2222+ else if String.length s > 25 && String.sub s 0 25 = "noncharacter-in-input-str" then
2323+ let colon_pos = String.index s ':' in
2424+ let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
2525+ let cp = int_of_string ("0x" ^ cp_str) in
2626+ Printf.sprintf "Forbidden code point U+%04x." cp
2727+ else if String.length s > 22 && String.sub s 0 22 = "surrogate-in-input-str" then
2828+ let colon_pos = String.index s ':' in
2929+ let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
3030+ let cp = int_of_string ("0x" ^ cp_str) in
3131+ Printf.sprintf "Forbidden code point U+%04x." cp
3232+ (* Character reference errors *)
3333+ else if String.length s > 28 && String.sub s 0 28 = "control-character-reference:" then
3434+ let cp_str = String.sub s 28 (String.length s - 28) in
3535+ let cp = int_of_string ("0x" ^ cp_str) in
3636+ if cp = 0x0D then
3737+ "A numeric character reference expanded to carriage return."
3838+ else
3939+ Printf.sprintf "Character reference expands to a control character (U+%04x)." cp
4040+ else if String.length s > 31 && String.sub s 0 31 = "noncharacter-character-referenc" then
4141+ let colon_pos = String.index s ':' in
4242+ let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
4343+ let cp = int_of_string ("0x" ^ cp_str) in
4444+ (* U+FDD0-U+FDEF are "permanently unassigned" *)
4545+ if cp >= 0xFDD0 && cp <= 0xFDEF then
4646+ "Character reference expands to a permanently unassigned code point."
4747+ (* Astral noncharacters (planes 1-16) *)
4848+ else if cp >= 0x10000 then
4949+ Printf.sprintf "Character reference expands to an astral non-character (U+%05x)." cp
5050+ else
5151+ Printf.sprintf "Character reference expands to a non-character (U+%04x)." cp
5252+ else if String.length s > 36 && String.sub s 0 36 = "character-reference-outside-unicode-" then
5353+ "Character reference outside the permissible Unicode range."
5454+ else if String.length s > 27 && String.sub s 0 27 = "surrogate-character-referen" then
5555+ let colon_pos = String.index s ':' in
5656+ let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
5757+ let cp = int_of_string ("0x" ^ cp_str) in
5858+ Printf.sprintf "Character reference expands to a surrogate (U+%04x)." cp
5959+ else if s = "no-p-element-in-scope" then
6060+ "No \xe2\x80\x9cp\xe2\x80\x9d element in scope but a \xe2\x80\x9cp\xe2\x80\x9d end tag seen."
6161+ else if s = "end-tag-p-implied-but-open-elements" then
6262+ "End tag \xe2\x80\x9cp\xe2\x80\x9d implied, but there were open elements."
6363+ else if s = "end-tag-br" then
6464+ "End tag \xe2\x80\x9cbr\xe2\x80\x9d."
6565+ else if s = "expected-closing-tag-but-got-eof" then
6666+ "End of file seen and there were open elements."
6767+ else if String.length s > 28 && String.sub s 0 28 = "bad-start-tag-in-head-noscri" then
6868+ let colon_pos = String.index s ':' in
6969+ let element = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
7070+ Printf.sprintf "Bad start tag in \xe2\x80\x9c%s\xe2\x80\x9d in \xe2\x80\x9cnoscript\xe2\x80\x9d in \xe2\x80\x9chead\xe2\x80\x9d." element
7171+ else if String.length s > 19 && String.sub s 0 19 = "unexpected-end-tag:" then
7272+ let element = String.sub s 19 (String.length s - 19) in
7373+ Printf.sprintf "Stray end tag \xe2\x80\x9c%s\xe2\x80\x9d." element
7474+ else if String.length s > 19 && String.sub s 0 19 = "start-tag-in-table:" then
7575+ let tag = String.sub s 19 (String.length s - 19) in
7676+ Printf.sprintf "Start tag \xe2\x80\x9c%s\xe2\x80\x9d seen in \xe2\x80\x9ctable\xe2\x80\x9d." tag
7777+ else
7878+ Printf.sprintf "Parse error: %s" s
7979+ with _ -> Printf.sprintf "Parse error: %s" s)
8080+ | _ -> Printf.sprintf "Parse error: %s" code_str
8181+682let of_parse_error ?system_id err =
783 let code = Html5rw.error_code err in
884 let line = Html5rw.error_line err in
985 let column = Html5rw.error_column err in
1010- let location =
1111- Message.make_location ~line ~column ?system_id ()
1212- in
1313- let code_str = Html5rw.Parse_error_code.to_string code in
1414- let (message, final_code) = match code with
1515- | Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus ->
1616- ("Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag.", code_str)
1717- | Html5rw.Parse_error_code.Null_character_reference ->
1818- ("Character reference expands to zero.", "null-character-reference")
1919- | Html5rw.Parse_error_code.Tree_construction_error s ->
2020- (* Check for control-character/noncharacter/surrogate with codepoint info *)
2121- (try
2222- if String.length s > 28 && String.sub s 0 28 = "control-character-in-input-s" then
2323- let colon_pos = String.index s ':' in
2424- let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
2525- let cp = int_of_string ("0x" ^ cp_str) in
2626- (Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint")
2727- else if String.length s > 25 && String.sub s 0 25 = "noncharacter-in-input-str" then
2828- let colon_pos = String.index s ':' in
2929- let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
3030- let cp = int_of_string ("0x" ^ cp_str) in
3131- (Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint")
3232- else if String.length s > 22 && String.sub s 0 22 = "surrogate-in-input-str" then
3333- let colon_pos = String.index s ':' in
3434- let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
3535- let cp = int_of_string ("0x" ^ cp_str) in
3636- (Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint")
3737- (* Character reference errors *)
3838- else if String.length s > 28 && String.sub s 0 28 = "control-character-reference:" then
3939- let cp_str = String.sub s 28 (String.length s - 28) in
4040- let cp = int_of_string ("0x" ^ cp_str) in
4141- if cp = 0x0D then
4242- ("A numeric character reference expanded to carriage return.", "control-character-reference")
4343- else
4444- (Printf.sprintf "Character reference expands to a control character (U+%04x)." cp, "control-character-reference")
4545- else if String.length s > 31 && String.sub s 0 31 = "noncharacter-character-referenc" then
4646- let colon_pos = String.index s ':' in
4747- let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
4848- let cp = int_of_string ("0x" ^ cp_str) in
4949- (* U+FDD0-U+FDEF are "permanently unassigned" *)
5050- if cp >= 0xFDD0 && cp <= 0xFDEF then
5151- ("Character reference expands to a permanently unassigned code point.", "noncharacter-character-reference")
5252- (* Astral noncharacters (planes 1-16) *)
5353- else if cp >= 0x10000 then
5454- (Printf.sprintf "Character reference expands to an astral non-character (U+%05x)." cp, "noncharacter-character-reference")
5555- else
5656- (Printf.sprintf "Character reference expands to a non-character (U+%04x)." cp, "noncharacter-character-reference")
5757- else if String.length s > 36 && String.sub s 0 36 = "character-reference-outside-unicode-" then
5858- let colon_pos = String.index s ':' in
5959- let _ = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
6060- ("Character reference outside the permissible Unicode range.", "character-reference-outside-unicode-range")
6161- else if String.length s > 27 && String.sub s 0 27 = "surrogate-character-referen" then
6262- let colon_pos = String.index s ':' in
6363- let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
6464- let cp = int_of_string ("0x" ^ cp_str) in
6565- (Printf.sprintf "Character reference expands to a surrogate (U+%04x)." cp, "surrogate-character-reference")
6666- else if s = "no-p-element-in-scope" then
6767- ("No \xe2\x80\x9cp\xe2\x80\x9d element in scope but a \xe2\x80\x9cp\xe2\x80\x9d end tag seen.", "no-p-element-in-scope")
6868- else if s = "end-tag-p-implied-but-open-elements" then
6969- ("End tag \xe2\x80\x9cp\xe2\x80\x9d implied, but there were open elements.", "end-tag-p-implied")
7070- else if s = "end-tag-br" then
7171- ("End tag \xe2\x80\x9cbr\xe2\x80\x9d.", "end-tag-br")
7272- else if s = "expected-closing-tag-but-got-eof" then
7373- ("End of file seen and there were open elements.", "eof-in-open-element")
7474- else if String.length s > 28 && String.sub s 0 28 = "bad-start-tag-in-head-noscri" then
7575- let colon_pos = String.index s ':' in
7676- let element = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
7777- (Printf.sprintf "Bad start tag in \xe2\x80\x9c%s\xe2\x80\x9d in \xe2\x80\x9cnoscript\xe2\x80\x9d in \xe2\x80\x9chead\xe2\x80\x9d." element, "bad-start-tag-in-head-noscript")
7878- else if String.length s > 19 && String.sub s 0 19 = "unexpected-end-tag:" then
7979- let element = String.sub s 19 (String.length s - 19) in
8080- (Printf.sprintf "Stray end tag \xe2\x80\x9c%s\xe2\x80\x9d." element, "stray-end-tag")
8181- else if String.length s > 19 && String.sub s 0 19 = "start-tag-in-table:" then
8282- let tag = String.sub s 19 (String.length s - 19) in
8383- (Printf.sprintf "Start tag \xe2\x80\x9c%s\xe2\x80\x9d seen in \xe2\x80\x9ctable\xe2\x80\x9d." tag, "start-tag-in-table")
8484- else
8585- (Printf.sprintf "Parse error: %s" s, s)
8686- with _ -> (Printf.sprintf "Parse error: %s" s, s))
8787- | _ -> (Printf.sprintf "Parse error: %s" code_str, code_str)
8888- in
8989- Message.error
9090- ~message
9191- ~code:final_code
9292- ~location
9393- ()
8686+ let location = Message.make_location ~line ~column ?system_id () in
8787+ let message = message_of_parse_error code in
8888+ Message.of_parse_error ~location ~message code
94899590let collect_parse_errors ?system_id result =
9691 let errors = Html5rw.errors result in
+2-2
test/expected_message.ml
···360360 let code_matches =
361361 match (expected.error_code, actual.Htmlrw_check.error_code) with
362362 | (None, _) -> true (* No expected code to match *)
363363- | (Some ec, Some ac) -> error_codes_match ec ac
364364- | (Some _, None) -> false (* Expected typed but got untyped *)
363363+ | (Some ec, Htmlrw_check.Conformance ac) -> error_codes_match ec ac
364364+ | (Some _, Htmlrw_check.Parse _) -> false (* Expected conformance but got parse error *)
365365 in
366366367367 (* Check message text *)