OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(** HTML5 DOM Types and Operations
7
8 This module provides the DOM (Document Object Model) node representation
9 used by the HTML5 parser. The DOM is a programming interface that
10 represents an HTML document as a tree of nodes, where each node represents
11 part of the document (an element, text content, comment, etc.).
12
13 {2 What is the DOM?}
14
15 When an HTML parser processes markup like [<p>Hello <b>world</b></p>], it
16 doesn't store the text directly. Instead, it builds a tree structure in
17 memory:
18
19 {v
20 Document
21 └── html
22 └── body
23 └── p
24 ├── #text "Hello "
25 └── b
26 └── #text "world"
27 v}
28
29 This tree is the DOM. Each box in the tree is a {i node}. Programs can
30 traverse and modify this tree to read or change the document.
31
32 @see <https://html.spec.whatwg.org/multipage/dom.html>
33 WHATWG: The elements of HTML (DOM chapter)
34
35 {2 Node Types}
36
37 The HTML5 DOM includes several node types, all represented by the same
38 record type with different field usage:
39
40 - {b Element nodes}: HTML elements like [<div>], [<p>], [<a href="...">].
41 Elements are the building blocks of HTML documents. They can have
42 attributes and contain other nodes.
43
44 - {b Text nodes}: The actual text content within elements. For example,
45 in [<p>Hello</p>], "Hello" is a text node that is a child of the [<p>]
46 element.
47
48 - {b Comment nodes}: HTML comments written as [<!-- comment text -->].
49 Comments are preserved in the DOM but not rendered.
50
51 - {b Document nodes}: The root of the entire document tree. Every HTML
52 document has exactly one Document node at the top.
53
54 - {b Document fragment nodes}: Lightweight containers that hold a
55 collection of nodes without a parent. Used for efficient batch DOM
56 operations and [<template>] element contents.
57
58 - {b Doctype nodes}: The [<!DOCTYPE html>] declaration at the start of
59 HTML5 documents. This declaration tells browsers to render the page
60 in standards mode.
61
62 @see <https://html.spec.whatwg.org/multipage/dom.html#kinds-of-content>
63 WHATWG: Kinds of content
64
65 {2 Namespaces}
66
67 HTML5 can embed content from other XML vocabularies. Elements belong to
68 one of three {i namespaces}:
69
70 - {b HTML namespace} ([None] or implicit): Standard HTML elements like
71 [<div>], [<p>], [<table>]. This is the default for all elements.
72
73 - {b SVG namespace} ([Some "svg"]): Scalable Vector Graphics for drawing.
74 When the parser encounters an [<svg>] tag, all elements inside it
75 (like [<rect>], [<circle>], [<path>]) are placed in the SVG namespace.
76
77 - {b MathML namespace} ([Some "mathml"]): Mathematical Markup Language
78 for equations. When the parser encounters a [<math>] tag, elements
79 inside it are placed in the MathML namespace.
80
81 The parser automatically switches namespaces when entering and leaving
82 these foreign content islands.
83
84 @see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign>
85 WHATWG: Parsing foreign content
86
87 {2 Tree Structure}
88
89 Nodes form a bidirectional tree: each node has a list of children and
90 an optional parent reference. Modification functions in this module
91 maintain these references automatically.
92
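93 When modified only through the functions in this module, the tree stays
94 well-formed: a node has at most one parent and circular references do not arise. Direct assignment to the mutable record fields bypasses this bookkeeping.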
95*)
96
97(** {1 Types} *)
98
99(** Information associated with a DOCTYPE node.
100
101 The {i document type declaration} (DOCTYPE) tells browsers what version
102 of HTML the document uses. In HTML5, the standard declaration is simply:
103
104 {v <!DOCTYPE html> v}
105
106 This minimal DOCTYPE triggers {i standards mode} (no quirks). The DOCTYPE
107 can optionally include a public identifier and system identifier for
108 legacy compatibility with SGML-based tools, but these are rarely used
109 in modern HTML5 documents.
110
111 {b Historical context:} In HTML4 and XHTML, DOCTYPEs were verbose and
112 referenced DTD files. For example:
113 {v <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
114 "http://www.w3.org/TR/html4/strict.dtd"> v}
115
116 HTML5 simplified this to just [<!DOCTYPE html>] because:
117 - Browsers never actually fetched or validated against DTDs
118 - The DOCTYPE's only real purpose is triggering standards mode
119 - A minimal DOCTYPE achieves this goal
120
121 {b Field meanings:}
122 - [name]: The document type name, almost always ["html"] for HTML documents
123 - [public_id]: A public identifier (legacy); [None] for HTML5
124 - [system_id]: A system identifier/URL (legacy); [None] for HTML5
125
126 @see <https://html.spec.whatwg.org/multipage/syntax.html#the-doctype>
127 WHATWG: The DOCTYPE
128 @see <https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode>
129 WHATWG: DOCTYPE handling during parsing
130*)
131type doctype_data = Dom_node.doctype_data = {
132 name : string option; (** The DOCTYPE name, e.g. [Some "html"] for [<!DOCTYPE html>] *)
133 public_id : string option; (** Public identifier (legacy, rarely used); [None] for HTML5 *)
134 system_id : string option; (** System identifier/URL (legacy, rarely used); [None] for HTML5 *)
135}
136
137val pp_doctype_data : Format.formatter -> doctype_data -> unit
138(** [pp_doctype_data ppf d] pretty-prints the DOCTYPE data [d] on formatter [ppf]. *)
139
140(** Quirks mode setting for the document.
141
142 {i Quirks mode} is a browser rendering mode that emulates bugs and
143 non-standard behaviors from older browsers (primarily Internet Explorer 5).
144 Modern HTML5 documents should always render in {i standards mode}
145 (no quirks) for consistent, predictable behavior.
146
147 The HTML5 parser determines quirks mode based on the DOCTYPE declaration:
148
149 - {b No_quirks} (Standards mode): The document renders according to modern
150 HTML5 and CSS specifications. This is triggered by [<!DOCTYPE html>].
151 CSS box model, table layout, and other features work as specified.
152
153 - {b Quirks} (Full quirks mode): The document renders with legacy browser
154 bugs emulated. This happens when:
155 {ul
156 {- DOCTYPE is missing entirely}
157 {- DOCTYPE has certain legacy public identifiers}
158 {- DOCTYPE has the wrong format}}
159
160 In quirks mode, many CSS properties behave differently:
161 {ul
162 {- Tables don't inherit font properties}
163 {- Box model uses non-standard width calculations}
164 {- Certain CSS selectors don't work correctly}}
165
166 - {b Limited_quirks} (Almost standards mode): A middle ground that applies
167 only a few specific quirks, primarily affecting table cell vertical
168 sizing. Triggered by the XHTML 1.0 Transitional/Frameset DOCTYPEs, and by
 the HTML 4.01 Transitional/Frameset DOCTYPEs when they carry a system
 identifier (without one, those HTML 4.01 DOCTYPEs select full quirks mode).
169
170 {b Recommendation:} Always use [<!DOCTYPE html>] at the start of HTML5
171 documents to ensure {b No_quirks} mode.
172
173 @see <https://quirks.spec.whatwg.org/>
174 Quirks Mode Standard - detailed specification
175 @see <https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode>
176 WHATWG: How the parser determines quirks mode
177*)
178type quirks_mode = Dom_node.quirks_mode = No_quirks | Quirks | Limited_quirks
179
180val pp_quirks_mode : Format.formatter -> quirks_mode -> unit
181(** [pp_quirks_mode ppf m] pretty-prints the quirks mode [m] on formatter [ppf]. *)
182
183(** Source location where a node was parsed.

 [line] and [column] presumably mark where the node starts, and the optional
 [end_line] and [end_column] where it ends. NOTE(review): the indexing base
 (0 or 1) is not stated in this interface - confirm against the parser. *)
184type location = Dom_node.location = {
185 line : int;
186 column : int;
187 end_line : int option;
188 end_column : int option;
189}
190
191(** A DOM node in the parsed document tree.
192
193 All node types use the same record structure. The [name] field determines
194 the node type:
195 - Element: the tag name (e.g., "div", "p", "span")
196 - Text: "#text"
197 - Comment: "#comment"
198 - Document: "#document"
199 - Document fragment: "#document-fragment"
200 - Doctype: "!doctype"
201
202 {3 Understanding Node Fields}
203
204 Different node types use different combinations of fields:
205
206 {v
207 Node Type | name | namespace | attrs | data | template_content | doctype
208 ------------------|------------------|-----------|-------|------|------------------|--------
209 Element | tag name | Yes | Yes | No | If <template> | No
210 Text | "#text" | No | No | Yes | No | No
211 Comment | "#comment" | No | No | Yes | No | No
212 Document | "#document" | No | No | No | No | No
213 Document Fragment | "#document-fragment" | No | No | No | No | No
214 Doctype | "!doctype" | No | No | No | No | Yes
215 v}
216
217 {3 Element Tag Names}
218
219 For element nodes, the [name] field contains the lowercase tag name.
220 HTML5 defines many elements with specific meanings:
221
222 {b Structural elements:} [html], [head], [body], [header], [footer],
223 [main], [nav], [article], [section], [aside]
224
225 {b Text content:} [p], [div], [span], [h1]-[h6], [pre], [blockquote]
226
227 {b Lists:} [ul], [ol], [li], [dl], [dt], [dd]
228
229 {b Tables:} [table], [tr], [td], [th], [thead], [tbody], [tfoot]
230
231 {b Forms:} [form], [input], [button], [select], [textarea], [label]
232
233 {b Media:} [img], [audio], [video], [canvas], [svg]
234
235 @see <https://html.spec.whatwg.org/multipage/indices.html#elements-3>
236 WHATWG: Index of HTML elements
237
238 {3 Void Elements}
239
240 Some elements are {i void elements} - they cannot have children and have
241 no end tag. These include: [area], [base], [br], [col], [embed], [hr],
242 [img], [input], [link], [meta], [source], [track], [wbr].
243
244 @see <https://html.spec.whatwg.org/multipage/syntax.html#void-elements>
245 WHATWG: Void elements
246
247 {3 The Template Element}
248
249 The [<template>] element is special: its children are not rendered
250 directly but stored in a separate document fragment accessible via
251 the [template_content] field. Templates are used for client-side
252 templating where content is cloned and inserted via JavaScript.
253
254 @see <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>
255 WHATWG: The template element
256*)
257type node = Dom_node.node = {
258 mutable name : string;
259 (** Tag name for elements, or special name for other node types.
260
261 For elements, this is the lowercase tag name (e.g., "div", "span").
262 For other node types, use the constants {!document_name},
263 {!text_name}, {!comment_name}, etc. *)
264
265 mutable namespace : string option;
266 (** Element namespace: [None] for HTML, [Some "svg"], [Some "mathml"].
267
268 Most elements are in the HTML namespace ([None]). The SVG and MathML
269 namespaces are only used when content appears inside [<svg>] or
270 [<math>] elements respectively.
271
272 @see <https://html.spec.whatwg.org/multipage/dom.html#elements-in-the-dom>
273 WHATWG: Elements in the DOM *)
274
275 mutable attrs : (string * string) list;
276 (** Element attributes as (name, value) pairs.
277
278 Attributes provide additional information about elements. Common
279 global attributes include:
280 - [id]: Unique identifier for the element
281 - [class]: Space-separated list of CSS class names
282 - [style]: Inline CSS styles
283 - [title]: Advisory text (shown as tooltip)
284 - [lang]: Language of the element's content
285 - [hidden]: Whether the element should be hidden
286
287 Element-specific attributes include:
288 - [href] on [<a>]: The link destination URL
289 - [src] on [<img>]: The image source URL
290 - [type] on [<input>]: The input control type
291 - [disabled] on form controls: Whether the control is disabled
292
293 In HTML5, attribute names are case-insensitive and are normalized
294 to lowercase by the parser.
295
296 @see <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>
297 WHATWG: Global attributes
298 @see <https://html.spec.whatwg.org/multipage/indices.html#attributes-3>
299 WHATWG: Index of attributes *)
300
301 mutable children : node list;
302 (** Child nodes in document order.
303
304 For most elements, this list contains the nested elements and text.
305 For void elements (like [<br>], [<img>]), this is always empty.
306 For [<template>] elements, the actual content is in
307 [template_content], not here. *)
308
309 mutable parent : node option;
310 (** Parent node, or [None] for a node that is not attached to a parent.
311
312 Tree roots (Document and document fragment nodes) and freshly created or
313 removed nodes have [None]. This back-reference enables walking up the tree. *)
314
315 mutable data : string;
316 (** Text content for text and comment nodes.
317
318 For text nodes, this contains the actual text. For comment nodes,
319 this contains the comment text (without the [<!--] and [-->]
320 delimiters). For other node types, this field is empty. *)
321
322 mutable template_content : node option;
323 (** Document fragment for [<template>] element contents.
324
325 The [<template>] element holds "inert" content that is not
326 rendered but can be cloned and inserted elsewhere. This field
327 contains a document fragment with the template's content.
328
329 For non-template elements, this is [None].
330
331 @see <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>
332 WHATWG: The template element *)
333
334 mutable doctype : doctype_data option;
335 (** DOCTYPE information for doctype nodes.
336
337 Only doctype nodes use this field; for all other nodes it is [None]. *)
338
339 mutable location : location option;
340 (** Source location where this node was parsed, or [None] if none was recorded. *)
341}
342
343val pp : Format.formatter -> node -> unit
344(** [pp ppf node] pretty-prints a summary of [node] on formatter [ppf], showing
345 the node type and key attributes. Children are not printed recursively. *)
346
347(** {1 Node Name Constants}
348
349 These constants identify special node types. Compare with [node.name]
350 to determine the node type.
351*)
352
353val document_name : string
354(** ["#document"] - the [name] of document nodes.
355
356 The Document node is the root of every HTML document tree. It represents
357 the entire document and is the parent of the DOCTYPE (if any) and of [<html>].
358
359 @see <https://html.spec.whatwg.org/multipage/dom.html#document>
360 WHATWG: The Document object *)
361
362val document_fragment_name : string
363(** ["#document-fragment"] - the [name] of document fragment nodes.
364
365 Document fragments are lightweight container nodes used to hold a
366 collection of nodes without a parent document. They are used:
367 - To hold [<template>] element contents
368 - As results of fragment parsing (e.g. [innerHTML])
369 - For efficient batch DOM operations
370
371 @see <https://dom.spec.whatwg.org/#documentfragment>
372 DOM Standard: DocumentFragment *)
373
374val text_name : string
375(** ["#text"] - the [name] of text nodes.
376
377 Text nodes contain the character data within elements. When the
378 parser encounters text between tags like [<p>Hello world</p>],
379 it creates a text node with data ["Hello world"] as a child of
380 the [<p>] element.
381
382 Adjacent text nodes are merged by the parser (see {!insert_text_at}). *)
383
384val comment_name : string
385(** ["#comment"] - the [name] of comment nodes.
386
387 Comment nodes represent HTML comments: [<!-- comment text -->].
388 Comments are preserved in the DOM but not rendered to users; the
389 comment text itself is stored in the node's [data] field. *)
390
391val doctype_name : string
392(** ["!doctype"] - the [name] of doctype nodes.
393
394 The DOCTYPE node represents the [<!DOCTYPE html>] declaration. When
395 present it precedes the root [<html>] element; only comments may come before it.
396
397 @see <https://html.spec.whatwg.org/multipage/syntax.html#the-doctype>
398 WHATWG: The DOCTYPE *)
399
400(** {1 Constructors}
401
402 Functions to create new DOM nodes. All nodes start with no parent and
403 no children. Use {!append_child} or {!insert_before} to build a tree.
404*)
405
406val create_element :
407 string ->
408 ?namespace:string option ->
409 ?attrs:(string * string) list ->
410 ?location:location ->
411 unit ->
412 node
413(** Create an element node.
414
415 Elements are the primary building blocks of HTML documents. Each
416 element represents a component of the document with semantic meaning.
417
418 @param name The tag name (e.g., "div", "p", "span"). Tag names are
419 case-insensitive in HTML; by convention, use lowercase.
420 @param namespace Element namespace:
421 - [None] (default): HTML namespace for standard elements
422 - [Some "svg"]: SVG namespace for graphics elements
423 - [Some "mathml"]: MathML namespace for mathematical notation
424 @param attrs Initial attributes as [(name, value)] pairs
 @param location Source location to record on the new node, if known
425
426 {b Examples:}
427 {[
428 (* Simple HTML element *)
429 let div = create_element "div" ()
430
431 (* Element with attributes *)
432 let link = create_element "a"
433 ~attrs:[("href", "https://example.com"); ("class", "external")]
434 ()
435
436 (* SVG element *)
437 let rect = create_element "rect"
438 ~namespace:(Some "svg")
439 ~attrs:[("width", "100"); ("height", "50"); ("fill", "blue")]
440 ()
441 ]}
442
443 @see <https://html.spec.whatwg.org/multipage/dom.html#elements-in-the-dom>
444 WHATWG: Elements in the DOM
445*)
446
446val create_text : ?location:location -> string -> node
447(** Create a text node with the given content.
448
449 Text nodes contain the readable content of HTML documents. They
450 appear as children of elements and represent the characters that
451 users see.
452
453 {b Note:} Text content is stored as-is. Character references like
454 [&amp;] should already be decoded to their character values.
455
456 {b Example:}
457 {[
458 let text = create_text "Hello, world!" in
459 (* To put text in a paragraph: *)
460 let p = create_element "p" () in
461 append_child p text
462 ]}
463*)
465
465val create_comment : ?location:location -> string -> node
466(** [create_comment data] creates a comment node whose content is [data].
467
468 Comments are human-readable notes in HTML that don't appear in
469 the rendered output. They're written as [<!-- comment -->] in HTML.
470
471 @param data The comment text (without the [<!--] and [-->] delimiters)
472
473 {b Example:}
474 {[
475 let comment = create_comment " TODO: Add navigation "
476 (* Represents: <!-- TODO: Add navigation --> *)
477 ]}
478
479 @see <https://html.spec.whatwg.org/multipage/syntax.html#comments>
480 WHATWG: HTML comments
481*)
483
483val create_document : unit -> node
484(** [create_document ()] creates an empty document node.
485
486 The Document node is the root of an HTML document tree. It represents
487 the entire document and serves as the parent for the DOCTYPE (if any)
488 and the root [<html>] element.
489
490 In a complete HTML document, the structure is:
491 {v
492 #document
493 ├── !doctype
494 └── html
495 ├── head
496 └── body
497 v}
498
499 @see <https://html.spec.whatwg.org/multipage/dom.html#document>
500 WHATWG: The Document object
501*)
503
503val create_document_fragment : unit -> node
504(** [create_document_fragment ()] creates an empty document fragment.
505
506 Document fragments are lightweight containers that can hold multiple
507 nodes without being part of the main document tree. They're useful for:
508
509 - {b Template contents:} The [<template>] element stores its children
510 in a document fragment, keeping them inert until cloned
511
512 - {b Fragment parsing:} When parsing HTML fragments (like innerHTML),
513 the result is placed in a document fragment
514
515 - {b Batch operations:} Build a subtree in a fragment, then insert it
516 into the document in one operation
517
518 @see <https://dom.spec.whatwg.org/#documentfragment>
519 DOM Standard: DocumentFragment
520*)
522
522val create_doctype :
523 ?name:string -> ?public_id:string -> ?system_id:string -> ?location:location -> unit -> node
524(** Create a DOCTYPE node.
525
526 The DOCTYPE declaration tells browsers to use standards mode for
527 rendering. For HTML5 documents, use:
528
529 {[
530 let doctype = create_doctype ~name:"html" ()
531 (* Represents: <!DOCTYPE html> *)
532 ]}
533
534 @param name DOCTYPE name (usually ["html"] for HTML documents)
535 @param public_id Public identifier (legacy, rarely needed)
536 @param system_id System identifier (legacy, rarely needed)
 @param location Source location to record on the new node, if known
537
538 {b Legacy example:}
539 {[
540 (* HTML 4.01 Strict DOCTYPE - not recommended for new documents *)
541 let legacy = create_doctype
542 ~name:"HTML"
543 ~public_id:"-//W3C//DTD HTML 4.01//EN"
544 ~system_id:"http://www.w3.org/TR/html4/strict.dtd"
545 ()
546 ]}
547
548 @see <https://html.spec.whatwg.org/multipage/syntax.html#the-doctype>
549 WHATWG: The DOCTYPE
550*)
552
552val create_template :
553 ?namespace:string option -> ?attrs:(string * string) list -> ?location:location -> unit -> node
554(** Create a [<template>] element with its content document fragment.
555
556 The [<template>] element holds inert HTML content that is not
557 rendered directly. The content is stored in a separate document
558 fragment and can be:
559 - Cloned and inserted into the document via JavaScript
560 - Used as a stamping template for repeated content
561 - Pre-parsed without affecting the page
562
 The optional [namespace], [attrs] and [location] arguments presumably have
 the same meaning as for {!create_element} (TODO confirm).

563 {b How templates work:}
564
565 Unlike normal elements, a [<template>]'s children are not rendered.
566 Instead, they're stored in the [template_content] field. This means:
567 - Images inside won't load
568 - Scripts inside won't execute
569 - The content is "inert" until explicitly activated
570
571 {b Example:}
572 {[
573 let template = create_template () in
574 let div = create_element "div" () in
575 let text = create_text "Template content" in
576 append_child div text;
577 (* Add to template's content fragment, not children *)
578 match template.template_content with
579 | Some fragment -> append_child fragment div
580 | None -> ()
581 ]}
582
583 @see <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>
584 WHATWG: The template element
585*)
587
588(** {1 Node Type Predicates}
589
590 Functions to test what type of node you have. Since all nodes use the
591 same record type, these predicates check the [name] field to determine
592 the actual node type.
593*)
594
594val is_element : node -> bool
595(** [is_element node] returns [true] if the node is an element node.
596
597 Elements are HTML tags like [<div>], [<p>], [<a>]. They are
598 identified by having a tag name that doesn't match any of the
599 special node name constants (such as {!text_name} or {!document_name}).
600*)
602
602val is_text : node -> bool
603(** [is_text node] returns [true] if the node is a text node.
604
605 Text nodes contain the character content within elements.
606 They have [name = "#text"] (see {!text_name}). *)
608
608val is_comment : node -> bool
609(** [is_comment node] returns [true] if the node is a comment node.
610
611 Comment nodes represent HTML comments [<!-- ... -->].
612 They have [name = "#comment"] (see {!comment_name}). *)
614
614val is_document : node -> bool
615(** [is_document node] returns [true] if the node is a document node.
616
617 The document node is the root of the DOM tree.
618 It has [name = "#document"] (see {!document_name}). *)
620
620val is_document_fragment : node -> bool
621(** [is_document_fragment node] returns [true] if the node is a document fragment.
622
623 Document fragments are lightweight containers.
624 They have [name = "#document-fragment"] (see {!document_fragment_name}). *)
626
626val is_doctype : node -> bool
627(** [is_doctype node] returns [true] if the node is a DOCTYPE node.
628
629 DOCTYPE nodes represent the [<!DOCTYPE>] declaration.
630 They have [name = "!doctype"] (see {!doctype_name}). *)
632
632val has_children : node -> bool
633(** [has_children node] returns [true] if the node has any children.
634
635 Note: For [<template>] elements, this checks the direct [children] list,
636 not the fragment held in [template_content]. *)
638
639(** {1 Tree Manipulation}
640
641 Functions to modify the DOM tree structure. These functions automatically
642 maintain parent/child references, ensuring the tree remains consistent.
643*)
644
644val append_child : node -> node -> unit
645(** [append_child parent child] adds [child] as the last child of [parent].
646
647 The child's [parent] field is updated to point to [parent].
648 If the child already has a parent, it is first removed from that parent.
649
650 {b Example:}
651 {[
652 let body = create_element "body" () in
653 let p = create_element "p" () in
654 let text = create_text "Hello!" in
655 append_child p text;
656 append_child body p
657 (* Result:
658 body
659 └── p
660 └── #text "Hello!"
661 *)
662 ]}
663*)
665
665val insert_before : node -> node -> node -> unit
666(** [insert_before parent new_child ref_child] inserts [new_child] before
667 [ref_child] in [parent]'s children.
668
669 @param parent The parent node
670 @param new_child The node to insert
671 @param ref_child The existing child to insert before
672
673 @raise Not_found if [ref_child] is not a child of [parent].
674
675 {b Example:}
676 {[
677 let ul = create_element "ul" () in
678 let li1 = create_element "li" () in
679 let li3 = create_element "li" () in
680 append_child ul li1;
681 append_child ul li3;
682 let li2 = create_element "li" () in
683 insert_before ul li2 li3
684 (* Result: ul contains li1, li2, li3 in that order *)
685 ]}
686*)
688
688val remove_child : node -> node -> unit
689(** [remove_child parent child] removes [child] from [parent]'s children.
690
691 The child's [parent] field is set to [None].
692
693 @raise Not_found if [child] is not a child of [parent].
694*)
696
696val insert_text_at : node -> string -> node option -> unit
697(** [insert_text_at parent text before_node] inserts text content into [parent].
698
699 If [before_node] is [None], appends at the end. If the previous sibling
700 is a text node, the text is merged into it (text nodes are coalesced).
701 Otherwise, a new text node is created.
702
703 This implements the HTML5 parser's text insertion algorithm which
704 ensures adjacent text nodes are always merged, matching browser behavior.
705
706 @see <https://html.spec.whatwg.org/multipage/parsing.html#appropriate-place-for-inserting-a-node>
707 WHATWG: Inserting text in the DOM
708*)
710
711(** {1 Attribute Operations}
712
713 Functions to read and modify element attributes. Attributes are
714 name-value pairs that provide additional information about elements.
715
716 In HTML5, attribute names are case-insensitive and normalized to
717 lowercase by the parser.
718
719 @see <https://html.spec.whatwg.org/multipage/dom.html#attributes>
720 WHATWG: Attributes
721*)
722
722val get_attr : node -> string -> string option
723(** [get_attr node name] returns [Some value] for attribute [name], or [None]
724 if the attribute doesn't exist.
725
726 Lookup is case-sensitive against the stored names, which the parser lowercases.
727*)
729
729val set_attr : node -> string -> string -> unit
730(** [set_attr node name value] sets attribute [name] to [value] on [node].
731
732 If the attribute already exists, its value is replaced.
733 If it doesn't exist, it is added.
734*)
736
736val has_attr : node -> string -> bool
737(** [has_attr node name] returns [true] if [node] has an attribute named [name]. *)
739
740(** {1 Space-Separated Attribute Values}
741
742 Many HTML attributes contain space-separated lists of values. For example,
743 the [class] attribute contains CSS class names: [class="header main active"].
744 These functions parse such attributes into OCaml lists.
745
746 Per the HTML5 spec, "ASCII whitespace" (space, tab, newline, carriage return,
747 form feed) is used as the separator.
748*)
749
749val split_on_whitespace : string -> string list
750(** [split_on_whitespace s] splits [s] into tokens separated by ASCII whitespace.
751
752 This implements the HTML5 "split on ASCII whitespace" algorithm.
753
754 {b Example:}
755 {[
756 split_on_whitespace "foo bar\tbaz"
757 (* Returns: ["foo"; "bar"; "baz"] *)
758 ]}
759*)
761
761val get_attr_list : node -> string -> string list
762(** [get_attr_list node name] returns the space-separated attribute [name] as a list.
763
764 Returns an empty list if the attribute doesn't exist.
765*)
767
767val get_class_list : node -> string list
768(** [get_class_list node] returns the [class] attribute of [node] as a list of class names. *)
770
770val get_rel_list : node -> string list
771(** [get_rel_list node] returns the [rel] attribute of [node] as a list of link
772 types, lowercased because link types are case-insensitive. *)
774
774val get_headers_list : node -> string list
775(** [get_headers_list node] returns the [headers] attribute of [node] as a list of IDs. *)
777
777val get_itemref_list : node -> string list
778(** [get_itemref_list node] returns the [itemref] attribute of [node] as a list of IDs. *)
780
780val get_itemprop_list : node -> string list
781(** [get_itemprop_list node] returns the [itemprop] attribute of [node] as a list of names. *)
783
783val get_itemtype_list : node -> string list
784(** [get_itemtype_list node] returns the [itemtype] attribute of [node] as a list of URLs. *)
786
787(** {1 Location Helpers} *)
788
788val make_location : line:int -> column:int -> ?end_line:int -> ?end_column:int ->
789 unit -> location
790(** [make_location ~line ~column ?end_line ?end_column ()] creates a source [location] record. *)
792
792val set_location : node -> line:int -> column:int -> ?end_line:int ->
793 ?end_column:int -> unit -> unit
794(** [set_location node ~line ~column ?end_line ?end_column ()] sets the source location of [node]. *)
796
796val get_location : node -> location option
797(** [get_location node] returns [Some loc] if a source location is set on [node], or [None]. *)
799
800(** {1 Tree Traversal}
801
802 Functions to navigate the DOM tree.
803*)
804
804val descendants : node -> node list
805(** [descendants node] returns all descendant nodes in document order.
806
807 This performs a depth-first traversal: each child is followed by its own
808 descendants before the next sibling. The node itself is not included.
809
810 {b Document order} is the order nodes appear in the HTML source:
811 parent before children, earlier siblings before later ones.
812
813 {b Example:}
814 {[
815 (* For tree: div > (p > "hello", span > "world") *)
816 descendants div
817 (* Returns: [p; text("hello"); span; text("world")] *)
818 ]}
819*)
821
821val ancestors : node -> node list
822(** [ancestors node] returns all ancestor nodes, ordered from parent to root.
823
824 The first element is the immediate parent, the last is the root
825 (usually the Document node). The node itself is not included.
826
827 {b Example:}
828 {[
829 (* For a text node inside: html > body > p > text *)
830 ancestors text_node
831 (* Returns: [p; body; html; #document] *)
832 ]}
833*)
835
835val get_text_content : node -> string
836(** [get_text_content node] returns the concatenated text content of [node].
837
838 For text nodes, returns the text data directly.
839 For elements, recursively concatenates all descendant text content.
840 For other node types, returns an empty string.
841
842 {b Example:}
843 {[
844 (* For: <p>Hello <b>world</b>!</p> *)
845 get_text_content p_element
846 (* Returns: "Hello world!" *)
847 ]}
848*)
850
851(** {1 Cloning} *)
852
852val clone : ?deep:bool -> node -> node
853(** [clone ?deep node] creates a copy of the node.
854
855 @param deep If [true], recursively clone all descendants (default: [false])
856
857 The cloned node has no parent. With [deep:false], only the node itself
858 is copied (with its attributes, but not its children).
859
860 {b Example:}
861 {[
862 let original = create_element "div" ~attrs:[("class", "box")] ()
863 let shallow = clone original
864 let deep = clone ~deep:true original
865 ]}
866*)
868
869(** {1 Serialization} *)
870
870val to_html : ?pretty:bool -> ?indent_size:int -> ?indent:int -> node -> string
871(** [to_html ?pretty ?indent_size ?indent node] serializes [node] to an
872 HTML string. See {!to_writer} for a streaming variant.
873
874 @param pretty If [true] (default), format with indentation and newlines
875 @param indent_size Number of spaces per indentation level (default: 2)
876 @param indent Starting indentation level (default: 0)
877 @return The HTML string representation of the node
878*)
880
880val to_writer :
881 ?pretty:bool ->
882 ?indent_size:int ->
883 ?indent:int ->
884 Bytesrw.Bytes.Writer.t ->
885 node ->
886 unit
887(** [to_writer ?pretty ?indent_size ?indent writer node] streams [node]
888 as HTML to the bytes writer [writer].
889
890 This is more memory-efficient than {!to_html} for large documents as it
891 doesn't build intermediate strings.
892
893 @param pretty If [true] (default), format with indentation and newlines
894 @param indent_size Number of spaces per indentation level (default: 2)
895 @param indent Starting indentation level (default: 0)
896 @param writer The bytes writer to output to
897*)
899
899val to_test_format : ?indent:int -> node -> string
900(** [to_test_format ?indent node] renders [node] in the html5lib test
901 format.
902
903 This format is used by the html5lib test suite for comparing parser
904 output. It represents the DOM tree in a human-readable, line-based format.
905
906 @param indent Starting indentation level (default: 0)
907 @return The test format string representation
908*)
910
910val to_text : ?separator:string -> ?strip:bool -> node -> string
911(** [to_text ?separator ?strip node] extracts all text content from [node].
912
913 Recursively collects text from all descendant text nodes and joins it.
914
915 @param separator String to insert between text nodes (default: [" "])
916 @param strip If [true] (default), trim whitespace from result
917 @return The concatenated text content
918*)