(* HTML5 encoding detection and decoding *)

(* U+FFFD REPLACEMENT CHARACTER, emitted wherever input cannot be decoded. *)
let replacement_char = Uchar.of_int 0xFFFD

(** [decode_utf16 data ~is_le ~bom_len] decodes [data] as UTF-16 and returns
    the text re-encoded as UTF-8.

    @param is_le [true] for little-endian code units, [false] for big-endian.
    @param bom_len number of leading bytes to skip before decoding.

    A high surrogate followed by a low surrogate is combined into one
    supplementary code point. Any other surrogate, and a single trailing byte
    that cannot form a code unit, each produce one U+FFFD. *)
let decode_utf16 data ~is_le ~bom_len =
  let len = Bytes.length data in
  let buf = Buffer.create len in
  (* 16-bit code unit stored at byte offset [pos], in the requested order. *)
  let code_unit_at pos =
    let b0 = Char.code (Bytes.get data pos) in
    let b1 = Char.code (Bytes.get data (pos + 1)) in
    if is_le then b0 lor (b1 lsl 8) else (b0 lsl 8) lor b1
  in
  let is_high_surrogate u = u >= 0xD800 && u <= 0xDBFF in
  let is_low_surrogate u = u >= 0xDC00 && u <= 0xDFFF in
  let emit cp = Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp) in
  let emit_replacement () = Uutf.Buffer.add_utf_8 buf replacement_char in
  let i = ref bom_len in
  while !i + 1 < len do
    let unit1 = code_unit_at !i in
    i := !i + 2;
    if is_high_surrogate unit1 && !i + 1 < len then begin
      (* Peek at the next unit; it is consumed only if it completes the pair,
         otherwise it is decoded on its own by the next iteration. *)
      let unit2 = code_unit_at !i in
      if is_low_surrogate unit2 then begin
        i := !i + 2;
        emit (0x10000 + (((unit1 - 0xD800) lsl 10) lor (unit2 - 0xDC00)))
      end else
        emit_replacement ()
    end else if is_high_surrogate unit1 || is_low_surrogate unit1 then
      (* Lone surrogate: a stray low one, or a high one at end of input. *)
      emit_replacement ()
    else
      emit unit1
  done;
  (* Odd trailing byte *)
  if !i < len then emit_replacement ();
  Buffer.contents buf

(** [decode_with_encoding data enc ~bom_len] decodes [data] according to
    [enc] and returns the text as a UTF-8 string.

    @param bom_len number of leading {e bytes} of [data] to skip (the size of
    a byte order mark, or [0] when there is none).

    Malformed input never fails the decode; it is replaced with U+FFFD. *)
let decode_with_encoding data enc ~bom_len =
  match enc with
  | Encoding_types.Utf8 ->
    (* Validate the input and substitute U+FFFD for malformed sequences.
       [bom_len] counts bytes, so the BOM is skipped by byte offset: pulling
       [bom_len] results out of a character decoder would instead discard
       that many decoded characters of real content. *)
    let buf = Buffer.create (Bytes.length data) in
    Uutf.String.fold_utf_8 ~pos:bom_len
      (fun () _pos -> function
         | `Uchar u -> Uutf.Buffer.add_utf_8 buf u
         | `Malformed _ -> Uutf.Buffer.add_utf_8 buf replacement_char)
      () (Bytes.to_string data);
    Buffer.contents buf
  | Encoding_types.Utf16le -> decode_utf16 data ~is_le:true ~bom_len
  | Encoding_types.Utf16be -> decode_utf16 data ~is_le:false ~bom_len
  | Encoding_types.Windows_1252 ->
    let len = Bytes.length data in
    let buf = Buffer.create len in
    (* Code points for bytes 0x80-0x9F; every other byte maps to the code
       point equal to its own value. *)
    let table = [|
      0x20AC; 0x0081; 0x201A; 0x0192; 0x201E; 0x2026; 0x2020; 0x2021;
      0x02C6; 0x2030; 0x0160; 0x2039; 0x0152; 0x008D; 0x017D; 0x008F;
      0x0090; 0x2018; 0x2019; 0x201C; 0x201D; 0x2022; 0x2013; 0x2014;
      0x02DC; 0x2122; 0x0161; 0x203A; 0x0153; 0x009D; 0x017E; 0x0178;
    |] in
    for i = bom_len to len - 1 do
      let b = Char.code (Bytes.get data i) in
      let cp = if b >= 0x80 && b <= 0x9F then table.(b - 0x80) else b in
      Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp)
    done;
    Buffer.contents buf
  | Encoding_types.Iso_8859_2 ->
    (* Delegate ISO-8859-2 decoding to uuuu. *)
    let len = Bytes.length data in
    let buf = Buffer.create len in
    let s = Bytes.sub_string data bom_len (len - bom_len) in
    Uuuu.String.fold `ISO_8859_2
      (fun () _pos -> function
         | `Uchar u -> Uutf.Buffer.add_utf_8 buf u
         | `Malformed _ -> Uutf.Buffer.add_utf_8 buf replacement_char)
      () s;
    Buffer.contents buf
  | Encoding_types.Euc_jp ->
    (* Simplification: no real EUC-JP decoder is used here. ASCII bytes are
       copied through and every byte above 0x7F becomes one U+FFFD (written
       as its UTF-8 bytes). Full EUC-JP would need a separate decoder. *)
    let len = Bytes.length data in
    let buf = Buffer.create len in
    let s = Bytes.sub_string data bom_len (len - bom_len) in
    String.iter
      (fun c ->
         if Char.code c <= 0x7F then Buffer.add_char buf c
         else Buffer.add_string buf "\xEF\xBF\xBD")
      s;
    Buffer.contents buf

(** [decode data ?transport_encoding ()] picks an encoding for [data] and
    decodes it, returning [(utf8_text, encoding_used)].

    The encoding is chosen from the first source that yields one:
    + a byte order mark found by [Encoding_bom.sniff] (the BOM bytes are then
      skipped when decoding);
    + [transport_encoding] (e.g. from an HTTP Content-Type), when
      [Encoding_labels.normalize_label] recognises it;
    + the result of [Encoding_prescan.prescan_for_meta_charset];
    + Windows-1252 as the default. *)
let decode data ?transport_encoding () =
  match Encoding_bom.sniff data with
  | Some (enc, bom_len) -> (decode_with_encoding data enc ~bom_len, enc)
  | None ->
    let from_transport =
      match transport_encoding with
      | Some te -> Encoding_labels.normalize_label te
      | None -> None
    in
    let enc =
      match from_transport with
      | Some enc -> enc
      | None ->
        (* The prescan runs only when the transport layer gave no usable
           label. *)
        (match Encoding_prescan.prescan_for_meta_charset data with
         | Some enc -> enc
         | None -> Encoding_types.Windows_1252)
    in
    (decode_with_encoding data enc ~bom_len:0, enc)