OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(** HTML5 Parser - Low-Level API
7
8 This module provides the core HTML5 parsing functionality implementing
9 the {{:https://html.spec.whatwg.org/multipage/parsing.html} WHATWG
10 HTML5 parsing specification}. It handles tokenization, tree construction,
11 error recovery, and produces a DOM tree.
12
13 For most uses, prefer the top-level {!Html5rw} module which provides
14 a simpler interface. This module is for advanced use cases that need
15 access to parser internals.
16
17 {2 How HTML5 Parsing Works}
18
19 The HTML5 parsing algorithm is unusual compared to most parsers. It was
20 reverse-engineered from browser behavior rather than designed from a
21 formal grammar. This ensures the parser handles malformed HTML exactly
22 like web browsers do.
23
24 The algorithm has three main phases:
25
26 {3 1. Encoding Detection}
27
28 Before parsing begins, the character encoding must be determined. The
29 WHATWG specification defines a "sniffing" algorithm:
30
31 1. Check for a BOM (Byte Order Mark) at the start
32 2. Look for [<meta charset="...">] in the first 1024 bytes
33 3. Use HTTP Content-Type header hint if available
34 4. Fall back to UTF-8
35
36 @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
37 WHATWG: Determining the character encoding
38
39 {3 2. Tokenization}
40
41 The tokenizer converts the input stream into a sequence of tokens.
42 It implements a state machine with over 80 states to handle:
43
44 - Data (text content)
45 - Tags (start tags, end tags, self-closing tags)
46 - Comments
47 - DOCTYPEs
48 - Character references ([&amp;], [&lt;], [&#60;])
49 - CDATA sections (in SVG/MathML)
50
51 The tokenizer has special handling for:
52 - {b Raw text elements}: [<script>], [<style>] - no markup parsing inside
53 - {b Escapable raw text elements}: [<textarea>], [<title>] - tokenized as RCDATA
54 - {b RCDATA}: The content mode of escapable raw text elements, where only character references are parsed
55
56 @see <https://html.spec.whatwg.org/multipage/parsing.html#tokenization>
57 WHATWG: Tokenization
58
59 {3 3. Tree Construction}
60
61 The tree builder receives tokens from the tokenizer and builds the DOM
62 tree. It uses {i insertion modes} - a state machine that determines how
63 each token should be processed based on the current document context.
64
65 {b Insertion modes} include:
66 - [initial]: Before the DOCTYPE
67 - [before_html]: Before the [<html>] element
68 - [before_head]: Before the [<head>] element
69 - [in_head]: Inside [<head>]
70 - [in_body]: Inside [<body>] (the most complex mode)
71 - [in_table]: Inside [<table>] (special handling)
72 - [in_template]: Inside [<template>]
73 - And many more...
74
75 The tree builder maintains:
76 - {b Stack of open elements}: Elements that have been opened but not closed
77 - {b List of active formatting elements}: For handling nested formatting
78 - {b The template insertion mode stack}: For [<template>] elements
79
80 @see <https://html.spec.whatwg.org/multipage/parsing.html#tree-construction>
81 WHATWG: Tree construction
82
83 {2 Error Recovery}
84
85 A key feature of HTML5 parsing is that it {b never fails}. The specification
86 defines error recovery for every possible malformed input. For example:
87
88 - Missing end tags are implicitly closed
89 - Misnested tags are handled via the "adoption agency algorithm"
90 - Invalid characters are replaced with U+FFFD
91 - Unexpected elements are either ignored or moved to valid positions
92
93 This ensures every HTML document produces a valid DOM tree.
94
95 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
96 WHATWG: Parse errors
97
98 {2 The Adoption Agency Algorithm}
99
100 One of the most complex parts of HTML5 parsing is handling misnested
101 formatting elements. For example:
102
103 {v <p>Hello <b>world</p> <p>more</b> text</p> v}
104
105 Browsers don't just error out - they use the "adoption agency algorithm"
106 to produce sensible results. This algorithm:
107 1. Identifies formatting elements that span across other elements
108 2. Reconstructs the tree to properly nest elements
109 3. Moves nodes between parents as needed
110
111 @see <https://html.spec.whatwg.org/multipage/parsing.html#adoption-agency-algorithm>
112 WHATWG: The adoption agency algorithm
113*)
114
115(** {1 Sub-modules} *)
116
117(** DOM types and manipulation. {!root} and {!query} return values of type {!Dom.node}. *)
118module Dom = Dom
119
120(** Parse error code types.
121
122 This module provides the {!Parse_error_code.t} variant type that represents
123 all WHATWG-defined parse errors plus tree construction errors. Use {!error_code} to obtain the code of a collected {!parse_error}.
124
125 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
126 WHATWG: Parse errors *)
127module Parse_error_code = Parse_error_code
128
129(** HTML5 tokenizer.
130
131 The tokenizer implements the tokenization stage of HTML5 parsing (after
132 encoding detection), converting an input byte stream into a sequence of
133 tokens (start tags, end tags, text, comments, DOCTYPEs).
134
135 @see <https://html.spec.whatwg.org/multipage/parsing.html#tokenization>
136 WHATWG: Tokenization *)
137module Tokenizer = Tokenizer
138
139(** Character encoding detection and conversion. {!parse_bytes} performs detection and {!encoding} reports the result.
140
141 @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
142 WHATWG: Determining the character encoding *)
143module Encoding = Encoding
144
145(** HTML element constants and categories.
146
147 This module provides lists of element names that have special handling
148 in the HTML5 parser:
149
150 - {b Void elements}: Elements that cannot have children and have no end
151 tag ([area], [base], [br], [col], [embed], [hr], [img], [input],
152 [link], [meta], [source], [track], [wbr])
153
154 - {b Formatting elements}: Elements tracked in the list of active
155 formatting elements for the adoption agency algorithm ([a], [b], [big],
156 [code], [em], [font], [i], [nobr], [s], [small], [strike], [strong],
157 [tt], [u])
158
159 - {b Special elements}: Elements with special parsing rules that affect
160 scope and formatting reconstruction
161
162 @see <https://html.spec.whatwg.org/multipage/syntax.html#void-elements>
163 WHATWG: Void elements
164 @see <https://html.spec.whatwg.org/multipage/parsing.html#formatting>
165 WHATWG: Formatting elements *)
166module Constants : sig
167 val void_elements : string list
168 (** Elements that cannot have children and have no end tag: [area], [base], [br], [col],
169 [embed], [hr], [img], [input], [link], [meta], [source], [track], [wbr].
170
171 @see <https://html.spec.whatwg.org/multipage/syntax.html#void-elements>
172 WHATWG: Void elements *)
173
174 val formatting_elements : string list
175 (** Elements tracked in the list of active formatting elements for the adoption agency algorithm: [a], [b], [big],
176 [code], [em], [font], [i], [nobr], [s], [small], [strike], [strong],
177 [tt], [u].
178
179 @see <https://html.spec.whatwg.org/multipage/parsing.html#formatting>
180 WHATWG: Formatting elements *)
181
182 val special_elements : string list
183 (** Elements with special parsing rules that affect scope checking and formatting reconstruction.
184
185 @see <https://html.spec.whatwg.org/multipage/parsing.html#special>
186 WHATWG: Special elements *)
187end
188
189(** Parser insertion modes.
190
191 Insertion modes are the states of the tree construction state machine.
192 They determine how each token from the tokenizer should be processed
193 based on the current document context.
194
195 For example, a [<td>] tag is handled differently depending on whether
196 the parser is currently in a table context or in the body.
197
198 @see <https://html.spec.whatwg.org/multipage/parsing.html#insertion-mode>
199 WHATWG: Insertion mode *)
200module Insertion_mode : sig
201 type t
202 (** The insertion mode type, abstract in this interface. The modes it represents include [initial],
203 [before_html], [in_head], [in_body], [in_table], etc. *)
204end
205
206(** Tree builder state.
207
208 The tree builder maintains the state needed for tree construction:
209 - Stack of open elements
210 - List of active formatting elements
211 - Template insertion mode stack
212 - Current insertion mode
213 - Foster parenting flag
214
215 @see <https://html.spec.whatwg.org/multipage/parsing.html#tree-construction>
216 WHATWG: Tree construction *)
217module Tree_builder : sig
218 type t
219 (** The tree builder state. Abstract: this interface exposes no operations on it. *)
220end
221
222(** {1 Types} *)
223
224(** A parse error encountered during parsing.
225
226 HTML5 parsing {b never fails} - it always produces a DOM tree. However,
227 the WHATWG specification defines 92 specific error conditions that
228 conformance checkers should report. These errors indicate malformed
229 HTML that browsers will still render (with error recovery).
230
231 {b Error categories:}
232
233 {i Tokenizer errors} (detected during tokenization):
234 - [abrupt-closing-of-empty-comment]: Empty comment closed abruptly by [>], as in [<!-->] or [<!--->]
235 - [abrupt-doctype-public-identifier]: DOCTYPE public ID ended unexpectedly
236 - [eof-before-tag-name]: End of file while reading a tag name
237 - [eof-in-tag]: End of file inside a tag
238 - [missing-attribute-value]: Attribute has [=] but no value
239 - [unexpected-null-character]: Null byte in the input
240 - [unexpected-question-mark-instead-of-tag-name]: [<?] found where a tag name was expected (e.g. an XML processing instruction)
241
242 {i Tree construction errors} (detected during tree building):
243 - [missing-doctype]: No DOCTYPE before first element
244 - [unexpected-token-*]: Token appeared in wrong context
245 - [foster-parenting]: Content moved outside table due to invalid position
246
247 Enable error collection with [~collect_errors:true]. Error collection
248 has some performance overhead, so it's disabled by default. The type is abstract: inspect errors with {!error_code}, {!error_line} and {!error_column}.
249
250 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
251 WHATWG: Complete list of parse errors *)
252type parse_error
253
254(** Get the error code of a parse error.
255
256 Returns the {!Parse_error_code.t} variant representing this error.
257 This allows pattern matching on specific error types:
258
259 {[
260 match Parser.error_code err with
261 | Parse_error_code.Unexpected_null_character -> (* handle *)
262 | Parse_error_code.Eof_in_tag -> (* handle *)
263 | Parse_error_code.Tree_construction_error msg -> (* handle tree error *)
264 | _ -> (* other *)
265 ]}
266
267 Use {!Parse_error_code.to_string} to convert to a string representation (for example before printing with [%s]).
268
269 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
270 WHATWG: Parse error codes *)
271val error_code : parse_error -> Parse_error_code.t
272
273(** Get the line number where the error occurred.
274
275 Line numbers are 1-indexed (first line is 1). Line breaks are
276 detected at LF (U+000A), CR (U+000D), and CR+LF sequences. *)
277val error_line : parse_error -> int
278
279(** Get the column number where the error occurred.
280
281 Column numbers are 1-indexed (first column is 1). Columns reset
282 to 1 after each line break. Column counting uses code points,
283 not bytes or grapheme clusters. *)
284val error_column : parse_error -> int
285
286val pp_parse_error : Format.formatter -> parse_error -> unit
287(** Pretty-print a parse error with location information. Has the printer shape expected by [Format]'s [%a] directive. *)
288
289(** Context element for HTML fragment parsing, created with {!make_fragment_context}.
290
291 When parsing HTML fragments (the content that would be assigned to
292 an element's [innerHTML]), the parser needs to know what element
293 would contain the fragment. This affects parsing in several ways:
294
295 {b Parser state initialization:}
296 - For [<title>] or [<textarea>]: Tokenizer starts in RCDATA state
297 - For [<style>], [<xmp>], [<iframe>], [<noembed>], [<noframes>]:
298 Tokenizer starts in RAWTEXT state
299 - For [<script>]: Tokenizer starts in script data state
300 - For [<noscript>]: Tokenizer starts in RAWTEXT state (if scripting enabled)
301 - For [<plaintext>]: Tokenizer starts in PLAINTEXT state
302 - Otherwise: Tokenizer starts in data state
303
304 {b Insertion mode:}
305 The initial insertion mode depends on the context element:
306 - [<template>]: "in template" mode
307 - [<html>]: "before head" mode
308 - [<head>]: "in head" mode
309 - [<body>], [<div>], etc.: "in body" mode
310 - [<table>]: "in table" mode
311 - And so on...
312
313 @see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments>
314 WHATWG: The fragment parsing algorithm *)
315type fragment_context
316
317(** Create a fragment parsing context.
318
319 @param tag_name Tag name of the context element. This should be the
320 tag name of the element that would contain the fragment.
321 Common choices:
322 - ["div"]: General-purpose (most common)
323 - ["body"]: For full body content
324 - ["tr"]: For table row content ([<td>] elements)
325 - ["ul"], ["ol"]: For list content ([<li>] elements)
326 - ["select"]: For [<option>] elements
327
328 @param namespace Element namespace (the optional argument has type [string option], so pass e.g. [~namespace:(Some "svg")]):
329 - [None]: HTML namespace (default)
330 - [Some "svg"]: SVG namespace
331 - [Some "mathml"]: MathML namespace
332
333 {b Examples:}
334 {[
335 (* Parse innerHTML of a table row - <td> works correctly *)
336 let ctx = make_fragment_context ~tag_name:"tr" ()
337
338 (* Parse innerHTML of an SVG group element *)
339 let ctx = make_fragment_context ~tag_name:"g" ~namespace:(Some "svg") ()
340
341 (* Parse innerHTML of a select element - <option> works correctly *)
342 let ctx = make_fragment_context ~tag_name:"select" ()
343 ]}
344
345 @see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments>
346 WHATWG: Fragment parsing algorithm *)
347val make_fragment_context : tag_name:string -> ?namespace:string option ->
348 unit -> fragment_context
349
350(** Get the tag name of a fragment context (the [tag_name] given to {!make_fragment_context}). *)
351val fragment_context_tag : fragment_context -> string
352
353(** Get the namespace of a fragment context ([None] for HTML). *)
354val fragment_context_namespace : fragment_context -> string option
355
356val pp_fragment_context : Format.formatter -> fragment_context -> unit
357(** Pretty-print a fragment context. *)
358
359(** Result of parsing an HTML document or fragment, as returned by {!parse} and {!parse_bytes}.
360
361 This opaque type contains:
362 - The DOM tree (access via {!val:root})
363 - Parse errors if collection was enabled (access via {!val:errors})
364 - Detected encoding for byte input (access via {!val:encoding})
365*)
366type t
367
368(** {1 Parsing Functions} *)
369
370(** Parse HTML from a byte stream reader.
371
372 This function implements the complete HTML5 parsing algorithm:
373
374 1. Reads bytes from the provided reader
375 2. Tokenizes the input into HTML tokens
376 3. Constructs a DOM tree using the tree construction algorithm
377 4. Returns the parsed result
378
379 The input should be valid UTF-8. For automatic encoding detection
380 from raw bytes, use {!parse_bytes} instead. {!encoding} returns [None] for results of this function.
381
382 {b Parser behavior:}
383
384 For {b full document parsing} (no fragment context), the parser:
385 - Creates a Document node as the root
386 - Processes any DOCTYPE declaration
387 - Creates [<html>], [<head>], and [<body>] elements as needed
388 - Builds the full document tree
389
390 For {b fragment parsing} (with fragment context), the parser:
391 - Creates a Document Fragment as the root
392 - Initializes tokenizer state based on context element
393 - Initializes insertion mode based on context element
394 - Does not create implicit [<html>], [<head>], [<body>]
395
396 @param collect_errors If [true], collect parse errors in the result (retrieve them with {!errors}).
397 Default: [false]. Enabling error collection adds overhead.
398 @param fragment_context Context for fragment parsing. If provided,
399 the input is parsed as fragment content (like innerHTML).
400
401 @see <https://html.spec.whatwg.org/multipage/parsing.html>
402 WHATWG: HTML parsing *)
403val parse : ?collect_errors:bool -> ?fragment_context:fragment_context ->
404 Bytesrw.Bytes.Reader.t -> t
405
406(** Parse HTML bytes with automatic encoding detection.
407
408 This function wraps {!parse} with encoding detection, implementing the
409 WHATWG encoding sniffing algorithm:
410
411 {b Detection order:}
412 1. {b BOM}: Check first 2-3 bytes for UTF-8, UTF-16LE, or UTF-16BE BOM
413 2. {b Prescan}: Look for [<meta charset="...">] or
414 [<meta http-equiv="Content-Type" content="...charset=...">]
415 in the first 1024 bytes
416 3. {b Transport hint}: Use [transport_encoding] if provided
417 4. {b Fallback}: Use UTF-8
418
419 The detected encoding is stored in the result (access via {!val:encoding}).
420
421 {b Prescan details:}
422
423 The prescan algorithm parses just enough of the document to find a
424 charset declaration. It handles:
425 - [<meta charset="utf-8">]
426 - [<meta http-equiv="Content-Type" content="text/html; charset=utf-8">]
427 - Comments and other markup are skipped
428 - Prescanning stops after 1024 bytes
429
430 @param collect_errors If [true], collect parse errors. Default: [false].
431 @param transport_encoding Encoding hint from HTTP Content-Type header.
432 For example: ["utf-8"], ["iso-8859-1"], ["windows-1252"].
433 @param fragment_context Context for fragment parsing (see {!parse}).
434
435 @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
436 WHATWG: Determining the character encoding
437 @see <https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding>
438 WHATWG: Prescan algorithm *)
439val parse_bytes : ?collect_errors:bool -> ?transport_encoding:string ->
440 ?fragment_context:fragment_context -> bytes -> t
441
442(** {1 Result Accessors} *)
443
444(** Get the root node of the parsed document.
445
446 For full document parsing, returns a Document node with structure:
447 {v
448 #document
449 ├── !doctype (if DOCTYPE was present)
450 └── html
451     ├── head
452     │   └── ... (title, meta, link, script, style)
453     └── body
454         └── ... (page content)
455 v}
456
457 For fragment parsing, returns a Document Fragment node containing
458 the parsed elements directly (no implicit html/head/body).
459
460 @see <https://html.spec.whatwg.org/multipage/dom.html#document>
461 WHATWG: The Document object *)
462val root : t -> Dom.node
463
464(** Get parse errors collected during parsing.
465
466 Returns an empty list if error collection was not enabled
467 ([collect_errors:false] or omitted) or if the document was well-formed.
468
469 Errors are returned in the order they were encountered.
470
471 {b Example:}
472 {[
473 let result = parse ~collect_errors:true reader in
474 List.iter (fun e ->
475 Printf.printf "Line %d, col %d: %s\n"
476 (error_line e) (error_column e) (Parse_error_code.to_string (error_code e))
477 ) (errors result)
478 ]}
479
480 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
481 WHATWG: Parse errors *)
482val errors : t -> parse_error list
483
484(** Get the detected encoding.
485
486 Returns [Some encoding] when {!parse_bytes} was used, indicating which
487 encoding was detected or specified (via BOM, prescan, transport hint, or the UTF-8 fallback).
488
489 Returns [None] when {!parse} was used (it expects pre-decoded UTF-8).
490
491 @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
492 WHATWG: Determining the character encoding *)
493val encoding : t -> Encoding.encoding option
494
495val pp : Format.formatter -> t -> unit
496(** Pretty-print a parse result summary. Has the printer shape expected by [Format]'s [%a] directive. *)
497
498(** {1 Querying} *)
499
500(** Query the DOM with a CSS selector.
501
502 [query t selector] returns all elements matching [selector] in document order.
503
504 {b Supported selectors:}
505
506 See {!Selector} for the complete list. Key selectors include:
507 - Type: [div], [p], [a]
508 - ID: [#myid]
509 - Class: [.myclass]
510 - Attribute: [[href]], [[type="text"]]
511 - Pseudo-class: [:first-child], [:nth-child(2)]
512 - Combinators: [div p] (descendant), [div > p] (child)
513
514 @raise Selector.Selector_error if the selector is invalid
515
516 @see <https://www.w3.org/TR/selectors-4/>
517 W3C: Selectors Level 4 *)
518val query : t -> string -> Dom.node list
519
520(** {1 Serialization} *)
521
522(** Serialize the DOM tree to a byte writer.
523
524 Outputs valid HTML5 that can be parsed to produce an equivalent DOM tree.
525 The output follows the WHATWG serialization algorithm.
526
527 {b Serialization rules:}
528 - Void elements are written without end tags
529 - Attributes are quoted with double quotes
530 - Special characters in text/attributes are escaped
531 - Comments preserve their content
532 - DOCTYPE is serialized as [<!DOCTYPE html>]
533
534 @param pretty If [true] (default), add indentation for readability.
535 @param indent_size Spaces per indent level (default: 2).
536
537 @see <https://html.spec.whatwg.org/multipage/parsing.html#serialising-html-fragments>
538 WHATWG: Serialising HTML fragments *)
539val to_writer : ?pretty:bool -> ?indent_size:int -> t ->
540 Bytesrw.Bytes.Writer.t -> unit
541
542(** Serialize the DOM tree to a string.
543
544 Convenience wrapper around {!to_writer} that returns a string; the same serialization rules apply.
545
546 @param pretty If [true] (default), add indentation for readability.
547 @param indent_size Spaces per indent level (default: 2). *)
548val to_string : ?pretty:bool -> ?indent_size:int -> t -> string
549
550(** Extract text content from the DOM tree.
551
552 Returns the concatenation of all text node content in document order,
553 with no HTML markup.
554
555 @param separator String to insert between text nodes (default: [" "])
556 @param strip If [true] (default), trim leading/trailing whitespace *)
557val to_text : ?separator:string -> ?strip:bool -> t -> string
558
559(** Serialize to html5lib test format.
560
561 This produces the tree representation format used by the
562 {{:https://github.com/html5lib/html5lib-tests} html5lib-tests} suite.
563
564 The format shows the tree structure with:
565 - Indentation indicating depth (2 spaces per level)
566 - Prefixes indicating node type:
567 - [<!DOCTYPE ...>] for DOCTYPE
568 - [<tagname>] for elements (with attributes on same line)
569 - ["text"] for text nodes
570 - [<!-- comment -->] for comments
571
572 Mainly useful for testing the parser against the reference test suite; unlike {!to_string}, the output is not HTML. *)
573val to_test_format : t -> string