OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 264 lines 9.2 kB view raw
1(* HTML meta charset prescan per WHATWG spec *) 2 3(* Character classification using Astring *) 4let is_ascii_whitespace c = c = '\x09' || c = '\x0A' || c = '\x0C' || c = '\x0D' || c = '\x20' 5let is_ascii_alpha = Astring.Char.Ascii.is_letter 6 7let skip_whitespace data i len = 8 let j = ref i in 9 while !j < len && is_ascii_whitespace (Bytes.get data !j) do 10 incr j 11 done; 12 !j 13 14let strip_whitespace data start len = 15 let s = ref start in 16 let e = ref (start + len) in 17 while !s < !e && is_ascii_whitespace (Bytes.get data !s) do incr s done; 18 while !e > !s && is_ascii_whitespace (Bytes.get data (!e - 1)) do decr e done; 19 Bytes.sub_string data !s (!e - !s) 20 21let extract_charset_from_content content = 22 let len = String.length content in 23 (* Find "charset" *) 24 let rec find_charset i = 25 if i + 7 > len then None 26 else 27 let sub = String.lowercase_ascii (String.sub content i 7) in 28 if sub = "charset" then 29 let j = ref (i + 7) in 30 (* Skip whitespace *) 31 while !j < len && is_ascii_whitespace content.[!j] do incr j done; 32 if !j >= len || content.[!j] <> '=' then find_charset (i + 1) 33 else begin 34 incr j; 35 (* Skip whitespace after = *) 36 while !j < len && is_ascii_whitespace content.[!j] do incr j done; 37 if !j >= len then None 38 else 39 let quote = 40 if content.[!j] = '"' || content.[!j] = '\'' then begin 41 let q = content.[!j] in 42 incr j; 43 Some q 44 end else None 45 in 46 let start = !j in 47 (match quote with 48 | Some q -> 49 while !j < len && content.[!j] <> q do incr j done; 50 if !j >= len then None 51 else Some (String.sub content start (!j - start)) 52 | None -> 53 while !j < len && 54 not (is_ascii_whitespace content.[!j]) && 55 content.[!j] <> ';' do 56 incr j 57 done; 58 Some (String.sub content start (!j - start))) 59 end 60 else find_charset (i + 1) 61 in 62 find_charset 0 63 64let prescan_for_meta_charset data = 65 let len = Bytes.length data in 66 let max_non_comment = 1024 in 67 let max_total = 65536 in 68 let i = ref 0 in 69 let non_comment = ref 0 in 70 71 let result = ref None in 72 73 while !result = None && !i < len && !i < max_total && !non_comment < max_non_comment do 74 if Bytes.get data !i <> '<' then begin 75 incr i; 76 incr non_comment 77 end else begin 78 (* Check for comment *) 79 if !i + 3 < len && 80 Bytes.get data (!i + 1) = '!' && 81 Bytes.get data (!i + 2) = '-' && 82 Bytes.get data (!i + 3) = '-' then begin 83 (* Skip comment *) 84 let j = ref (!i + 4) in 85 while !j + 2 < len && not ( 86 Bytes.get data !j = '-' && 87 Bytes.get data (!j + 1) = '-' && 88 Bytes.get data (!j + 2) = '>' 89 ) do incr j done; 90 if !j + 2 < len then 91 i := !j + 3 92 else 93 i := len (* Unclosed comment - stop scanning *) 94 end 95 (* Check for end tag - skip it *) 96 else if !i + 1 < len && Bytes.get data (!i + 1) = '/' then begin 97 let j = ref (!i + 2) in 98 let in_quote = ref None in 99 let done_tag = ref false in 100 while not !done_tag && !j < len && !j < max_total && !non_comment < max_non_comment do 101 let c = Bytes.get data !j in 102 match !in_quote with 103 | None -> 104 if c = '"' || c = '\'' then begin 105 in_quote := Some c; 106 incr j; 107 incr non_comment 108 end else if c = '>' then begin 109 incr j; 110 incr non_comment; 111 done_tag := true 112 end else begin 113 incr j; 114 incr non_comment 115 end 116 | Some q -> 117 if c = q then in_quote := None; 118 incr j; 119 incr non_comment 120 done; 121 i := !j 122 end 123 (* Check for tag *) 124 else if !i + 1 < len && is_ascii_alpha (Bytes.get data (!i + 1)) then begin 125 let j = ref (!i + 1) in 126 while !j < len && is_ascii_alpha (Bytes.get data !j) do incr j done; 127 let tag_name = 128 let name_bytes = Bytes.sub data (!i + 1) (!j - !i - 1) in 129 String.lowercase_ascii (Bytes.to_string name_bytes) 130 in 131 132 if tag_name <> "meta" then begin 133 (* Skip non-meta tag *) 134 let in_quote = ref None in 135 let done_tag = ref false in 136 while not !done_tag && !j < len && !j < max_total && !non_comment < max_non_comment do 137 let c = Bytes.get data !j in 138 match !in_quote with 139 | None -> 140 if c = '"' || c = '\'' then begin 141 in_quote := Some c; 142 incr j; 143 incr non_comment 144 end else if c = '>' then begin 145 incr j; 146 incr non_comment; 147 done_tag := true 148 end else begin 149 incr j; 150 incr non_comment 151 end 152 | Some q -> 153 if c = q then in_quote := None; 154 incr j; 155 incr non_comment 156 done; 157 i := !j 158 end else begin 159 (* Parse meta tag attributes *) 160 let charset = ref None in 161 let http_equiv = ref None in 162 let content = ref None in 163 let k = ref !j in 164 let saw_gt = ref false in 165 166 while not !saw_gt && !k < len && !k < max_total do 167 let c = Bytes.get data !k in 168 if c = '>' then begin 169 saw_gt := true; 170 incr k 171 end else if c = '<' then begin 172 (* Restart scanning from here *) 173 k := len 174 end else if is_ascii_whitespace c || c = '/' then begin 175 incr k 176 end else begin 177 (* Attribute name *) 178 let attr_start = !k in 179 while !k < len && 180 not (is_ascii_whitespace (Bytes.get data !k)) && 181 Bytes.get data !k <> '=' && 182 Bytes.get data !k <> '>' && 183 Bytes.get data !k <> '/' && 184 Bytes.get data !k <> '<' do 185 incr k 186 done; 187 let attr_name = 188 String.lowercase_ascii (Bytes.sub_string data attr_start (!k - attr_start)) 189 in 190 k := skip_whitespace data !k len; 191 192 let value = ref None in 193 if !k < len && Bytes.get data !k = '=' then begin 194 incr k; 195 k := skip_whitespace data !k len; 196 if !k < len then begin 197 let qc = Bytes.get data !k in 198 if qc = '"' || qc = '\'' then begin 199 incr k; 200 let val_start = !k in 201 while !k < len && Bytes.get data !k <> qc do incr k done; 202 if !k < len then begin 203 value := Some (Bytes.sub_string data val_start (!k - val_start)); 204 incr k 205 end 206 end else begin 207 let val_start = !k in 208 while !k < len && 209 not (is_ascii_whitespace (Bytes.get data !k)) && 210 Bytes.get data !k <> '>' && 211 Bytes.get data !k <> '<' do 212 incr k 213 done; 214 value := Some (Bytes.sub_string data val_start (!k - val_start)) 215 end 216 end 217 end; 218 219 if attr_name = "charset" then 220 charset := !value 221 else if attr_name = "http-equiv" then 222 http_equiv := !value 223 else if attr_name = "content" then 224 content := !value 225 end 226 done; 227 228 if !saw_gt then begin 229 (* Check for charset *) 230 (match !charset with 231 | Some cs -> 232 (match Encoding_labels.normalize_meta_declared cs with 233 | Some enc -> result := Some enc 234 | None -> ()) 235 | None -> ()); 236 237 (* Check for http-equiv="content-type" with content *) 238 (* Note: http-equiv value must be exactly "content-type" (case-insensitive) *) 239 if !result = None then 240 (match !http_equiv, !content with 241 | Some he, Some ct when String.lowercase_ascii he = "content-type" -> 242 (match extract_charset_from_content ct with 243 | Some extracted -> 244 (match Encoding_labels.normalize_meta_declared extracted with 245 | Some enc -> result := Some enc 246 | None -> ()) 247 | None -> ()) 248 | _ -> ()); 249 250 i := !k; 251 non_comment := !non_comment + (!k - !j) 252 end else begin 253 incr i; 254 incr non_comment 255 end 256 end 257 end else begin 258 incr i; 259 incr non_comment 260 end 261 end 262 done; 263 264 !result