(* HTML meta charset prescan per WHATWG spec *) (* Character classification using Astring *) let is_ascii_whitespace c = c = '\x09' || c = '\x0A' || c = '\x0C' || c = '\x0D' || c = '\x20' let is_ascii_alpha = Astring.Char.Ascii.is_letter let skip_whitespace data i len = let j = ref i in while !j < len && is_ascii_whitespace (Bytes.get data !j) do incr j done; !j let strip_whitespace data start len = let s = ref start in let e = ref (start + len) in while !s < !e && is_ascii_whitespace (Bytes.get data !s) do incr s done; while !e > !s && is_ascii_whitespace (Bytes.get data (!e - 1)) do decr e done; Bytes.sub_string data !s (!e - !s) let extract_charset_from_content content = let len = String.length content in (* Find "charset" *) let rec find_charset i = if i + 7 > len then None else let sub = String.lowercase_ascii (String.sub content i 7) in if sub = "charset" then let j = ref (i + 7) in (* Skip whitespace *) while !j < len && is_ascii_whitespace content.[!j] do incr j done; if !j >= len || content.[!j] <> '=' then find_charset (i + 1) else begin incr j; (* Skip whitespace after = *) while !j < len && is_ascii_whitespace content.[!j] do incr j done; if !j >= len then None else let quote = if content.[!j] = '"' || content.[!j] = '\'' then begin let q = content.[!j] in incr j; Some q end else None in let start = !j in (match quote with | Some q -> while !j < len && content.[!j] <> q do incr j done; if !j >= len then None else Some (String.sub content start (!j - start)) | None -> while !j < len && not (is_ascii_whitespace content.[!j]) && content.[!j] <> ';' do incr j done; Some (String.sub content start (!j - start))) end else find_charset (i + 1) in find_charset 0 let prescan_for_meta_charset data = let len = Bytes.length data in let max_non_comment = 1024 in let max_total = 65536 in let i = ref 0 in let non_comment = ref 0 in let result = ref None in while !result = None && !i < len && !i < max_total && !non_comment < max_non_comment do if Bytes.get data !i <> '<' then begin incr i; incr non_comment end else begin (* Check for comment *) if !i + 3 < len && Bytes.get data (!i + 1) = '!' && Bytes.get data (!i + 2) = '-' && Bytes.get data (!i + 3) = '-' then begin (* Skip comment *) let j = ref (!i + 4) in while !j + 2 < len && not ( Bytes.get data !j = '-' && Bytes.get data (!j + 1) = '-' && Bytes.get data (!j + 2) = '>' ) do incr j done; if !j + 2 < len then i := !j + 3 else i := len (* Unclosed comment - stop scanning *) end (* Check for end tag - skip it *) else if !i + 1 < len && Bytes.get data (!i + 1) = '/' then begin let j = ref (!i + 2) in let in_quote = ref None in let done_tag = ref false in while not !done_tag && !j < len && !j < max_total && !non_comment < max_non_comment do let c = Bytes.get data !j in match !in_quote with | None -> if c = '"' || c = '\'' then begin in_quote := Some c; incr j; incr non_comment end else if c = '>' then begin incr j; incr non_comment; done_tag := true end else begin incr j; incr non_comment end | Some q -> if c = q then in_quote := None; incr j; incr non_comment done; i := !j end (* Check for tag *) else if !i + 1 < len && is_ascii_alpha (Bytes.get data (!i + 1)) then begin let j = ref (!i + 1) in while !j < len && is_ascii_alpha (Bytes.get data !j) do incr j done; let tag_name = let name_bytes = Bytes.sub data (!i + 1) (!j - !i - 1) in String.lowercase_ascii (Bytes.to_string name_bytes) in if tag_name <> "meta" then begin (* Skip non-meta tag *) let in_quote = ref None in let done_tag = ref false in while not !done_tag && !j < len && !j < max_total && !non_comment < max_non_comment do let c = Bytes.get data !j in match !in_quote with | None -> if c = '"' || c = '\'' then begin in_quote := Some c; incr j; incr non_comment end else if c = '>' then begin incr j; incr non_comment; done_tag := true end else begin incr j; incr non_comment end | Some q -> if c = q then in_quote := None; incr j; incr non_comment done; i := !j end else begin (* Parse meta tag attributes *) let charset = ref None in let http_equiv = ref None in let content = ref None in let k = ref !j in let saw_gt = ref false in while not !saw_gt && !k < len && !k < max_total do let c = Bytes.get data !k in if c = '>' then begin saw_gt := true; incr k end else if c = '<' then begin (* Restart scanning from here *) k := len end else if is_ascii_whitespace c || c = '/' then begin incr k end else begin (* Attribute name *) let attr_start = !k in while !k < len && not (is_ascii_whitespace (Bytes.get data !k)) && Bytes.get data !k <> '=' && Bytes.get data !k <> '>' && Bytes.get data !k <> '/' && Bytes.get data !k <> '<' do incr k done; let attr_name = String.lowercase_ascii (Bytes.sub_string data attr_start (!k - attr_start)) in k := skip_whitespace data !k len; let value = ref None in if !k < len && Bytes.get data !k = '=' then begin incr k; k := skip_whitespace data !k len; if !k < len then begin let qc = Bytes.get data !k in if qc = '"' || qc = '\'' then begin incr k; let val_start = !k in while !k < len && Bytes.get data !k <> qc do incr k done; if !k < len then begin value := Some (Bytes.sub_string data val_start (!k - val_start)); incr k end end else begin let val_start = !k in while !k < len && not (is_ascii_whitespace (Bytes.get data !k)) && Bytes.get data !k <> '>' && Bytes.get data !k <> '<' do incr k done; value := Some (Bytes.sub_string data val_start (!k - val_start)) end end end; if attr_name = "charset" then charset := !value else if attr_name = "http-equiv" then http_equiv := !value else if attr_name = "content" then content := !value end done; if !saw_gt then begin (* Check for charset *) (match !charset with | Some cs -> (match Encoding_labels.normalize_meta_declared cs with | Some enc -> result := Some enc | None -> ()) | None -> ()); (* Check for http-equiv="content-type" with content *) (* Note: http-equiv value must be exactly "content-type" (case-insensitive) *) if !result = None then (match !http_equiv, !content with | Some he, Some ct when String.lowercase_ascii he = "content-type" -> (match extract_charset_from_content ct with | Some extracted -> (match Encoding_labels.normalize_meta_declared extracted with | Some enc -> result := Some enc | None -> ()) | None -> ()) | _ -> ()); i := !k; non_comment := !non_comment + (!k - !j) end else begin incr i; incr non_comment end end end else begin incr i; incr non_comment end end done; !result