OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at f7c69be4eae5476a0985d55de71f2cc34c8d5361 168 lines 7.2 kB view raw
(* HTML5 tokenizer states *)

(** The set of HTML5 tokenizer states, one constructor per state.

    Constructors are plain (argument-less) variants; their declaration
    order is kept stable so that structural comparison and hashing of
    states give the same results as before. *)
type t =
  | Data
  | Rcdata
  | Rawtext
  | Script_data
  | Plaintext
  | Tag_open
  | End_tag_open
  | Tag_name
  | Rcdata_less_than_sign
  | Rcdata_end_tag_open
  | Rcdata_end_tag_name
  | Rawtext_less_than_sign
  | Rawtext_end_tag_open
  | Rawtext_end_tag_name
  | Script_data_less_than_sign
  | Script_data_end_tag_open
  | Script_data_end_tag_name
  | Script_data_escape_start
  | Script_data_escape_start_dash
  | Script_data_escaped
  | Script_data_escaped_dash
  | Script_data_escaped_dash_dash
  | Script_data_escaped_less_than_sign
  | Script_data_escaped_end_tag_open
  | Script_data_escaped_end_tag_name
  | Script_data_double_escape_start
  | Script_data_double_escaped
  | Script_data_double_escaped_dash
  | Script_data_double_escaped_dash_dash
  | Script_data_double_escaped_less_than_sign
  | Script_data_double_escape_end
  | Before_attribute_name
  | Attribute_name
  | After_attribute_name
  | Before_attribute_value
  | Attribute_value_double_quoted
  | Attribute_value_single_quoted
  | Attribute_value_unquoted
  | After_attribute_value_quoted
  | Self_closing_start_tag
  | Bogus_comment
  | Markup_declaration_open
  | Comment_start
  | Comment_start_dash
  | Comment
  | Comment_less_than_sign
  | Comment_less_than_sign_bang
  | Comment_less_than_sign_bang_dash
  | Comment_less_than_sign_bang_dash_dash
  | Comment_end_dash
  | Comment_end
  | Comment_end_bang
  | Doctype
  | Before_doctype_name
  | Doctype_name
  | After_doctype_name
  | After_doctype_public_keyword
  | Before_doctype_public_identifier
  | Doctype_public_identifier_double_quoted
  | Doctype_public_identifier_single_quoted
  | After_doctype_public_identifier
  | Between_doctype_public_and_system_identifiers
  | After_doctype_system_keyword
  | Before_doctype_system_identifier
  | Doctype_system_identifier_double_quoted
  | Doctype_system_identifier_single_quoted
  | After_doctype_system_identifier
  | Bogus_doctype
  | Cdata_section
  | Cdata_section_bracket
  | Cdata_section_end
  | Character_reference
  | Named_character_reference
  | Ambiguous_ampersand
  | Numeric_character_reference
  | Hexadecimal_character_reference_start
  | Decimal_character_reference_start
  | Hexadecimal_character_reference
  | Decimal_character_reference
  | Numeric_character_reference_end

(** [to_string state] is the name of [state], spelled exactly like its
    constructor (for example [to_string Tag_open] is ["Tag_open"]).

    The match is deliberately exhaustive with no wildcard case, so adding
    a constructor to {!t} without naming it here is reported by the
    compiler instead of silently printing a placeholder. *)
let to_string = function
  | Data -> "Data"
  | Rcdata -> "Rcdata"
  | Rawtext -> "Rawtext"
  | Script_data -> "Script_data"
  | Plaintext -> "Plaintext"
  | Tag_open -> "Tag_open"
  | End_tag_open -> "End_tag_open"
  | Tag_name -> "Tag_name"
  | Rcdata_less_than_sign -> "Rcdata_less_than_sign"
  | Rcdata_end_tag_open -> "Rcdata_end_tag_open"
  | Rcdata_end_tag_name -> "Rcdata_end_tag_name"
  | Rawtext_less_than_sign -> "Rawtext_less_than_sign"
  | Rawtext_end_tag_open -> "Rawtext_end_tag_open"
  | Rawtext_end_tag_name -> "Rawtext_end_tag_name"
  | Script_data_less_than_sign -> "Script_data_less_than_sign"
  | Script_data_end_tag_open -> "Script_data_end_tag_open"
  | Script_data_end_tag_name -> "Script_data_end_tag_name"
  | Script_data_escape_start -> "Script_data_escape_start"
  | Script_data_escape_start_dash -> "Script_data_escape_start_dash"
  | Script_data_escaped -> "Script_data_escaped"
  | Script_data_escaped_dash -> "Script_data_escaped_dash"
  | Script_data_escaped_dash_dash -> "Script_data_escaped_dash_dash"
  | Script_data_escaped_less_than_sign -> "Script_data_escaped_less_than_sign"
  | Script_data_escaped_end_tag_open -> "Script_data_escaped_end_tag_open"
  | Script_data_escaped_end_tag_name -> "Script_data_escaped_end_tag_name"
  | Script_data_double_escape_start -> "Script_data_double_escape_start"
  | Script_data_double_escaped -> "Script_data_double_escaped"
  | Script_data_double_escaped_dash -> "Script_data_double_escaped_dash"
  | Script_data_double_escaped_dash_dash ->
      "Script_data_double_escaped_dash_dash"
  | Script_data_double_escaped_less_than_sign ->
      "Script_data_double_escaped_less_than_sign"
  | Script_data_double_escape_end -> "Script_data_double_escape_end"
  | Before_attribute_name -> "Before_attribute_name"
  | Attribute_name -> "Attribute_name"
  | After_attribute_name -> "After_attribute_name"
  | Before_attribute_value -> "Before_attribute_value"
  | Attribute_value_double_quoted -> "Attribute_value_double_quoted"
  | Attribute_value_single_quoted -> "Attribute_value_single_quoted"
  | Attribute_value_unquoted -> "Attribute_value_unquoted"
  | After_attribute_value_quoted -> "After_attribute_value_quoted"
  | Self_closing_start_tag -> "Self_closing_start_tag"
  | Bogus_comment -> "Bogus_comment"
  | Markup_declaration_open -> "Markup_declaration_open"
  | Comment_start -> "Comment_start"
  | Comment_start_dash -> "Comment_start_dash"
  | Comment -> "Comment"
  | Comment_less_than_sign -> "Comment_less_than_sign"
  | Comment_less_than_sign_bang -> "Comment_less_than_sign_bang"
  | Comment_less_than_sign_bang_dash -> "Comment_less_than_sign_bang_dash"
  | Comment_less_than_sign_bang_dash_dash ->
      "Comment_less_than_sign_bang_dash_dash"
  | Comment_end_dash -> "Comment_end_dash"
  | Comment_end -> "Comment_end"
  | Comment_end_bang -> "Comment_end_bang"
  | Doctype -> "Doctype"
  | Before_doctype_name -> "Before_doctype_name"
  | Doctype_name -> "Doctype_name"
  | After_doctype_name -> "After_doctype_name"
  | After_doctype_public_keyword -> "After_doctype_public_keyword"
  | Before_doctype_public_identifier -> "Before_doctype_public_identifier"
  | Doctype_public_identifier_double_quoted ->
      "Doctype_public_identifier_double_quoted"
  | Doctype_public_identifier_single_quoted ->
      "Doctype_public_identifier_single_quoted"
  | After_doctype_public_identifier -> "After_doctype_public_identifier"
  | Between_doctype_public_and_system_identifiers ->
      "Between_doctype_public_and_system_identifiers"
  | After_doctype_system_keyword -> "After_doctype_system_keyword"
  | Before_doctype_system_identifier -> "Before_doctype_system_identifier"
  | Doctype_system_identifier_double_quoted ->
      "Doctype_system_identifier_double_quoted"
  | Doctype_system_identifier_single_quoted ->
      "Doctype_system_identifier_single_quoted"
  | After_doctype_system_identifier -> "After_doctype_system_identifier"
  | Bogus_doctype -> "Bogus_doctype"
  | Cdata_section -> "Cdata_section"
  | Cdata_section_bracket -> "Cdata_section_bracket"
  | Cdata_section_end -> "Cdata_section_end"
  | Character_reference -> "Character_reference"
  | Named_character_reference -> "Named_character_reference"
  | Ambiguous_ampersand -> "Ambiguous_ampersand"
  | Numeric_character_reference -> "Numeric_character_reference"
  | Hexadecimal_character_reference_start ->
      "Hexadecimal_character_reference_start"
  | Decimal_character_reference_start -> "Decimal_character_reference_start"
  | Hexadecimal_character_reference -> "Hexadecimal_character_reference"
  | Decimal_character_reference -> "Decimal_character_reference"
  | Numeric_character_reference_end -> "Numeric_character_reference_end"

(** [pp fmt t] prints the constructor name of [t] on the formatter [fmt].

    The text written is exactly [to_string t], emitted with
    [Format.pp_print_string]; no surrounding boxes, spaces or newlines are
    added. *)
let pp fmt t = Format.pp_print_string fmt (to_string t)