(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
(* HTML5 tokenizer states *)

(** Tokenizer state.

    One constant constructor per state of the HTML5 tokenizer. The
    constructor names appear to follow the state names used in the
    tokenization section of the HTML Standard, with spaces and hyphens
    replaced by underscores (to be confirmed against the tokenizer that
    consumes this type).

    All constructors are constant (they carry no payload), so values of this
    type can be compared with structural equality. *)
type t =
  (* Content states *)
  | Data
  | Rcdata
  | Rawtext
  | Script_data
  | Plaintext
  (* Tags *)
  | Tag_open
  | End_tag_open
  | Tag_name
  (* RCDATA end-tag detection *)
  | Rcdata_less_than_sign
  | Rcdata_end_tag_open
  | Rcdata_end_tag_name
  (* RAWTEXT end-tag detection *)
  | Rawtext_less_than_sign
  | Rawtext_end_tag_open
  | Rawtext_end_tag_name
  (* Script data, including escaped and double-escaped variants *)
  | Script_data_less_than_sign
  | Script_data_end_tag_open
  | Script_data_end_tag_name
  | Script_data_escape_start
  | Script_data_escape_start_dash
  | Script_data_escaped
  | Script_data_escaped_dash
  | Script_data_escaped_dash_dash
  | Script_data_escaped_less_than_sign
  | Script_data_escaped_end_tag_open
  | Script_data_escaped_end_tag_name
  | Script_data_double_escape_start
  | Script_data_double_escaped
  | Script_data_double_escaped_dash
  | Script_data_double_escaped_dash_dash
  | Script_data_double_escaped_less_than_sign
  | Script_data_double_escape_end
  (* Attributes *)
  | Before_attribute_name
  | Attribute_name
  | After_attribute_name
  | Before_attribute_value
  | Attribute_value_double_quoted
  | Attribute_value_single_quoted
  | Attribute_value_unquoted
  | After_attribute_value_quoted
  | Self_closing_start_tag
  (* Comments and markup declarations *)
  | Bogus_comment
  | Markup_declaration_open
  | Comment_start
  | Comment_start_dash
  | Comment
  | Comment_less_than_sign
  | Comment_less_than_sign_bang
  | Comment_less_than_sign_bang_dash
  | Comment_less_than_sign_bang_dash_dash
  | Comment_end_dash
  | Comment_end
  | Comment_end_bang
  (* DOCTYPE *)
  | Doctype
  | Before_doctype_name
  | Doctype_name
  | After_doctype_name
  | After_doctype_public_keyword
  | Before_doctype_public_identifier
  | Doctype_public_identifier_double_quoted
  | Doctype_public_identifier_single_quoted
  | After_doctype_public_identifier
  | Between_doctype_public_and_system_identifiers
  | After_doctype_system_keyword
  | Before_doctype_system_identifier
  | Doctype_system_identifier_double_quoted
  | Doctype_system_identifier_single_quoted
  | After_doctype_system_identifier
  | Bogus_doctype
  (* CDATA sections *)
  | Cdata_section
  | Cdata_section_bracket
  | Cdata_section_end
  (* Character references *)
  | Character_reference
  | Named_character_reference
  | Ambiguous_ampersand
  | Numeric_character_reference
  | Hexadecimal_character_reference_start
  | Decimal_character_reference_start
  | Hexadecimal_character_reference
  | Decimal_character_reference
  | Numeric_character_reference_end

(** [pp fmt t] prints the name of the tokenizer state [t] on the formatter
    [fmt]. The printed text is the constructor name exactly as it is spelled
    in the type definition, e.g. [Tag_open] prints as [Tag_open].

    The match lists every constructor and has no wildcard arm, so the
    compiler reports a non-exhaustive match here whenever a new state is
    added to the type. *)
let pp fmt t =
  let s = match t with
    | Data -> "Data"
    | Rcdata -> "Rcdata"
    | Rawtext -> "Rawtext"
    | Script_data -> "Script_data"
    | Plaintext -> "Plaintext"
    | Tag_open -> "Tag_open"
    | End_tag_open -> "End_tag_open"
    | Tag_name -> "Tag_name"
    | Rcdata_less_than_sign -> "Rcdata_less_than_sign"
    | Rcdata_end_tag_open -> "Rcdata_end_tag_open"
    | Rcdata_end_tag_name -> "Rcdata_end_tag_name"
    | Rawtext_less_than_sign -> "Rawtext_less_than_sign"
    | Rawtext_end_tag_open -> "Rawtext_end_tag_open"
    | Rawtext_end_tag_name -> "Rawtext_end_tag_name"
    | Script_data_less_than_sign -> "Script_data_less_than_sign"
    | Script_data_end_tag_open -> "Script_data_end_tag_open"
    | Script_data_end_tag_name -> "Script_data_end_tag_name"
    | Script_data_escape_start -> "Script_data_escape_start"
    | Script_data_escape_start_dash -> "Script_data_escape_start_dash"
    | Script_data_escaped -> "Script_data_escaped"
    | Script_data_escaped_dash -> "Script_data_escaped_dash"
    | Script_data_escaped_dash_dash -> "Script_data_escaped_dash_dash"
    | Script_data_escaped_less_than_sign -> "Script_data_escaped_less_than_sign"
    | Script_data_escaped_end_tag_open -> "Script_data_escaped_end_tag_open"
    | Script_data_escaped_end_tag_name -> "Script_data_escaped_end_tag_name"
    | Script_data_double_escape_start -> "Script_data_double_escape_start"
    | Script_data_double_escaped -> "Script_data_double_escaped"
    | Script_data_double_escaped_dash -> "Script_data_double_escaped_dash"
    | Script_data_double_escaped_dash_dash -> "Script_data_double_escaped_dash_dash"
    | Script_data_double_escaped_less_than_sign -> "Script_data_double_escaped_less_than_sign"
    | Script_data_double_escape_end -> "Script_data_double_escape_end"
    | Before_attribute_name -> "Before_attribute_name"
    | Attribute_name -> "Attribute_name"
    | After_attribute_name -> "After_attribute_name"
    | Before_attribute_value -> "Before_attribute_value"
    | Attribute_value_double_quoted -> "Attribute_value_double_quoted"
    | Attribute_value_single_quoted -> "Attribute_value_single_quoted"
    | Attribute_value_unquoted -> "Attribute_value_unquoted"
    | After_attribute_value_quoted -> "After_attribute_value_quoted"
    | Self_closing_start_tag -> "Self_closing_start_tag"
    | Bogus_comment -> "Bogus_comment"
    | Markup_declaration_open -> "Markup_declaration_open"
    | Comment_start -> "Comment_start"
    | Comment_start_dash -> "Comment_start_dash"
    | Comment -> "Comment"
    | Comment_less_than_sign -> "Comment_less_than_sign"
    | Comment_less_than_sign_bang -> "Comment_less_than_sign_bang"
    | Comment_less_than_sign_bang_dash -> "Comment_less_than_sign_bang_dash"
    | Comment_less_than_sign_bang_dash_dash -> "Comment_less_than_sign_bang_dash_dash"
    | Comment_end_dash -> "Comment_end_dash"
    | Comment_end -> "Comment_end"
    | Comment_end_bang -> "Comment_end_bang"
    | Doctype -> "Doctype"
    | Before_doctype_name -> "Before_doctype_name"
    | Doctype_name -> "Doctype_name"
    | After_doctype_name -> "After_doctype_name"
    | After_doctype_public_keyword -> "After_doctype_public_keyword"
    | Before_doctype_public_identifier -> "Before_doctype_public_identifier"
    | Doctype_public_identifier_double_quoted -> "Doctype_public_identifier_double_quoted"
    | Doctype_public_identifier_single_quoted -> "Doctype_public_identifier_single_quoted"
    | After_doctype_public_identifier -> "After_doctype_public_identifier"
    | Between_doctype_public_and_system_identifiers -> "Between_doctype_public_and_system_identifiers"
    | After_doctype_system_keyword -> "After_doctype_system_keyword"
    | Before_doctype_system_identifier -> "Before_doctype_system_identifier"
    | Doctype_system_identifier_double_quoted -> "Doctype_system_identifier_double_quoted"
    | Doctype_system_identifier_single_quoted -> "Doctype_system_identifier_single_quoted"
    | After_doctype_system_identifier -> "After_doctype_system_identifier"
    | Bogus_doctype -> "Bogus_doctype"
    | Cdata_section -> "Cdata_section"
    | Cdata_section_bracket -> "Cdata_section_bracket"
    | Cdata_section_end -> "Cdata_section_end"
    | Character_reference -> "Character_reference"
    | Named_character_reference -> "Named_character_reference"
    | Ambiguous_ampersand -> "Ambiguous_ampersand"
    | Numeric_character_reference -> "Numeric_character_reference"
    | Hexadecimal_character_reference_start -> "Hexadecimal_character_reference_start"
    | Decimal_character_reference_start -> "Decimal_character_reference_start"
    | Hexadecimal_character_reference -> "Hexadecimal_character_reference"
    | Decimal_character_reference -> "Decimal_character_reference"
    | Numeric_character_reference_end -> "Numeric_character_reference_end"
  in
  Format.pp_print_string fmt s