(* HTML5 encoding detection and decoding *)
(* UTF-8 replacement character *)
let replacement_char = Uchar.of_int 0xFFFD
let decode_utf16 data ~is_le ~bom_len =
let len = Bytes.length data in
let buf = Buffer.create len in
let i = ref bom_len in
while !i + 1 < len do
let b0 = Char.code (Bytes.get data !i) in
let b1 = Char.code (Bytes.get data (!i + 1)) in
let code_unit =
if is_le then b0 lor (b1 lsl 8)
else (b0 lsl 8) lor b1
in
i := !i + 2;
(* Handle surrogate pairs *)
if code_unit >= 0xD800 && code_unit <= 0xDBFF && !i + 1 < len then begin
(* High surrogate, look for low surrogate *)
let b2 = Char.code (Bytes.get data !i) in
let b3 = Char.code (Bytes.get data (!i + 1)) in
let code_unit2 =
if is_le then b2 lor (b3 lsl 8)
else (b2 lsl 8) lor b3
in
if code_unit2 >= 0xDC00 && code_unit2 <= 0xDFFF then begin
i := !i + 2;
let high = code_unit - 0xD800 in
let low = code_unit2 - 0xDC00 in
let cp = 0x10000 + (high lsl 10) lor low in
Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp)
end else begin
(* Invalid surrogate, output replacement *)
Uutf.Buffer.add_utf_8 buf replacement_char
end
end else if code_unit >= 0xD800 && code_unit <= 0xDFFF then begin
(* Lone surrogate *)
Uutf.Buffer.add_utf_8 buf replacement_char
end else begin
Uutf.Buffer.add_utf_8 buf (Uchar.of_int code_unit)
end
done;
(* Odd trailing byte *)
if !i < len then Uutf.Buffer.add_utf_8 buf replacement_char;
Buffer.contents buf
let decode_with_encoding data enc ~bom_len =
match enc with
| Encoding_types.Utf8 ->
(* UTF-8: Just validate and replace errors with replacement character *)
let len = Bytes.length data in
let buf = Buffer.create len in
let decoder = Uutf.decoder ~encoding:`UTF_8 (`String (Bytes.to_string data)) in
(* Skip BOM if present *)
let _ =
if bom_len > 0 then begin
for _ = 1 to bom_len do
ignore (Uutf.decode decoder)
done
end
in
let rec loop () =
match Uutf.decode decoder with
| `Uchar u -> Uutf.Buffer.add_utf_8 buf u; loop ()
| `Malformed _ -> Buffer.add_string buf "\xEF\xBF\xBD"; loop ()
| `End -> ()
| `Await -> assert false
in
loop ();
Buffer.contents buf
| Encoding_types.Utf16le -> decode_utf16 data ~is_le:true ~bom_len
| Encoding_types.Utf16be -> decode_utf16 data ~is_le:false ~bom_len
| Encoding_types.Windows_1252 ->
(* Windows-1252 mapping table for 0x80-0x9F range *)
let len = Bytes.length data in
let buf = Buffer.create len in
let table = [|
(* 0x80-0x9F *)
0x20AC; 0x0081; 0x201A; 0x0192; 0x201E; 0x2026; 0x2020; 0x2021;
0x02C6; 0x2030; 0x0160; 0x2039; 0x0152; 0x008D; 0x017D; 0x008F;
0x0090; 0x2018; 0x2019; 0x201C; 0x201D; 0x2022; 0x2013; 0x2014;
0x02DC; 0x2122; 0x0161; 0x203A; 0x0153; 0x009D; 0x017E; 0x0178;
|] in
for i = bom_len to len - 1 do
let b = Char.code (Bytes.get data i) in
let cp =
if b >= 0x80 && b <= 0x9F then table.(b - 0x80)
else b
in
Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp)
done;
Buffer.contents buf
| Encoding_types.Iso_8859_2 ->
(* Use uuuu for ISO-8859-2 decoding *)
let len = Bytes.length data in
let buf = Buffer.create len in
let s = Bytes.sub_string data bom_len (len - bom_len) in
Uuuu.String.fold `ISO_8859_2 (fun () _pos -> function
| `Uchar u -> Uutf.Buffer.add_utf_8 buf u
| `Malformed _ -> Uutf.Buffer.add_utf_8 buf replacement_char
) () s;
Buffer.contents buf
| Encoding_types.Euc_jp ->
(* For EUC-JP, use uutf with best effort *)
let len = Bytes.length data in
let buf = Buffer.create len in
let s = Bytes.sub_string data bom_len (len - bom_len) in
(* EUC-JP not directly supported by uutf, fall back to treating high bytes as replacement *)
(* This is a simplification - full EUC-JP would need a separate decoder *)
String.iter (fun c ->
if Char.code c <= 0x7F then
Buffer.add_char buf c
else
Buffer.add_string buf "\xEF\xBF\xBD"
) s;
Buffer.contents buf
let decode data ?transport_encoding () =
(* Step 1: Check for BOM *)
let bom_result = Encoding_bom.sniff data in
match bom_result with
| Some (enc, bom_len) ->
(decode_with_encoding data enc ~bom_len, enc)
| None ->
(* Step 2: Check transport encoding (e.g., HTTP Content-Type) *)
let enc_from_transport =
match transport_encoding with
| Some te -> Encoding_labels.normalize_label te
| None -> None
in
match enc_from_transport with
| Some enc -> (decode_with_encoding data enc ~bom_len:0, enc)
| None ->
(* Step 3: Prescan for meta charset *)
match Encoding_prescan.prescan_for_meta_charset data with
| Some enc -> (decode_with_encoding data enc ~bom_len:0, enc)
| None ->
(* Default to Windows-1252 per HTML5 spec when no encoding detected *)
(decode_with_encoding data Encoding_types.Windows_1252 ~bom_len:0, Encoding_types.Windows_1252)