(* HTML5 encoding detection and decoding *)

(* U+FFFD REPLACEMENT CHARACTER: the Unicode scalar value substituted for
   input that cannot be decoded. *)
let replacement_char = Uchar.rep

(** [decode_utf16 data ~is_le ~bom_len] decodes the UTF-16 code units stored
    in [data], starting at byte offset [bom_len], and returns the text
    encoded as UTF-8.

    @param data the raw input bytes.
    @param is_le [true] to read code units as little-endian, [false] for
      big-endian.
    @param bom_len number of leading bytes to skip before decoding.
    @return a UTF-8 string. A valid surrogate pair becomes one supplementary
      code point. Each of the following yields exactly one U+FFFD: a high
      surrogate not followed by a low surrogate (the following unit is then
      decoded on its own), a lone low surrogate, a high surrogate cut off by
      the end of input (together with any single byte after it), and an odd
      trailing byte. *)
let decode_utf16 data ~is_le ~bom_len =
  let len = Bytes.length data in
  let buf = Buffer.create len in
  (* 16-bit code unit stored at byte offset [pos]; callers guarantee that
     [pos + 1 < len]. *)
  let code_unit_at pos =
    let b0 = Char.code (Bytes.get data pos) in
    let b1 = Char.code (Bytes.get data (pos + 1)) in
    if is_le then b0 lor (b1 lsl 8) else (b0 lsl 8) lor b1
  in
  let is_high_surrogate u = u >= 0xD800 && u <= 0xDBFF in
  let is_low_surrogate u = u >= 0xDC00 && u <= 0xDFFF in
  let i = ref bom_len in

  while !i + 1 < len do
    let code_unit = code_unit_at !i in
    i := !i + 2;

    if is_high_surrogate code_unit then begin
      if !i + 1 < len then begin
        (* A full code unit follows: it must be a low surrogate. *)
        let code_unit2 = code_unit_at !i in
        if is_low_surrogate code_unit2 then begin
          i := !i + 2;
          let high = code_unit - 0xD800 in
          let low = code_unit2 - 0xDC00 in
          let cp = 0x10000 + ((high lsl 10) lor low) in
          Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp)
        end else begin
          (* Unpaired high surrogate: emit a replacement and leave the
             second unit unconsumed so the next iteration decodes it. *)
          Uutf.Buffer.add_utf_8 buf replacement_char
        end
      end else begin
        (* High surrogate cut off by the end of input, possibly followed by
           one stray byte. The WHATWG UTF-16 decoder reports a single error
           for this whole truncated tail, so consume the rest of the input
           to keep the odd-trailing-byte check below from adding a second
           replacement. *)
        Uutf.Buffer.add_utf_8 buf replacement_char;
        i := len
      end
    end else if is_low_surrogate code_unit then begin
      (* Low surrogate with no preceding high surrogate. *)
      Uutf.Buffer.add_utf_8 buf replacement_char
    end else begin
      (* Ordinary BMP code unit: it is its own code point. *)
      Uutf.Buffer.add_utf_8 buf (Uchar.of_int code_unit)
    end
  done;

  (* Odd trailing byte: half of a code unit is left over. *)
  if !i < len then Uutf.Buffer.add_utf_8 buf replacement_char;

  Buffer.contents buf

(** [decode_with_encoding data enc ~bom_len] decodes the bytes of [data] from
    the encoding [enc] and returns the text re-encoded as UTF-8.

    @param data the raw input bytes.
    @param enc the source encoding to decode from.
    @param bom_len number of leading {e bytes} of [data] to skip (the length
      of a byte order mark, or [0] when there is none). It is expected to lie
      in [0 .. Bytes.length data].
    @return the decoded text as a UTF-8 string; input that cannot be decoded
      is replaced by U+FFFD.
    @raise Invalid_argument from [Bytes.sub_string] in the UTF-8, ISO-8859-2
      and EUC-JP branches when [bom_len] is outside the bounds of [data]. *)
let decode_with_encoding data enc ~bom_len =
  match enc with
  | Encoding_types.Utf8 ->
    (* UTF-8: validate and replace malformed sequences with U+FFFD. *)
    let len = Bytes.length data in
    let buf = Buffer.create len in
    (* [bom_len] counts bytes, not characters, so the BOM is dropped by
       slicing. Calling [Uutf.decode] [bom_len] times would instead discard
       [bom_len] decoded characters: a 3-byte UTF-8 BOM is a single
       character, so real content after it would be lost. *)
    let s = Bytes.sub_string data bom_len (len - bom_len) in
    let decoder = Uutf.decoder ~encoding:`UTF_8 (`String s) in
    let rec loop () =
      match Uutf.decode decoder with
      | `Uchar u -> Uutf.Buffer.add_utf_8 buf u; loop ()
      | `Malformed _ -> Uutf.Buffer.add_utf_8 buf replacement_char; loop ()
      | `End -> ()
      | `Await -> assert false (* only `Manual sources await more input *)
    in
    loop ();
    Buffer.contents buf

  | Encoding_types.Utf16le -> decode_utf16 data ~is_le:true ~bom_len
  | Encoding_types.Utf16be -> decode_utf16 data ~is_le:false ~bom_len

  | Encoding_types.Windows_1252 ->
    let len = Bytes.length data in
    let buf = Buffer.create len in
    (* Code points for bytes 0x80-0x9F; every other byte maps to the code
       point with the same numeric value. *)
    let table = [|
      (* 0x80-0x9F *)
      0x20AC; 0x0081; 0x201A; 0x0192; 0x201E; 0x2026; 0x2020; 0x2021;
      0x02C6; 0x2030; 0x0160; 0x2039; 0x0152; 0x008D; 0x017D; 0x008F;
      0x0090; 0x2018; 0x2019; 0x201C; 0x201D; 0x2022; 0x2013; 0x2014;
      0x02DC; 0x2122; 0x0161; 0x203A; 0x0153; 0x009D; 0x017E; 0x0178;
    |] in
    for i = bom_len to len - 1 do
      let b = Char.code (Bytes.get data i) in
      let cp =
        if b >= 0x80 && b <= 0x9F then table.(b - 0x80)
        else b
      in
      Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp)
    done;
    Buffer.contents buf

  | Encoding_types.Iso_8859_2 ->
    (* Use uuuu for ISO-8859-2 decoding *)
    let len = Bytes.length data in
    let buf = Buffer.create len in
    let s = Bytes.sub_string data bom_len (len - bom_len) in
    Uuuu.String.fold `ISO_8859_2 (fun () _pos -> function
      | `Uchar u -> Uutf.Buffer.add_utf_8 buf u
      | `Malformed _ -> Uutf.Buffer.add_utf_8 buf replacement_char
    ) () s;
    Buffer.contents buf

  | Encoding_types.Euc_jp ->
    (* Best-effort EUC-JP: there is no real EUC-JP decoder here. ASCII bytes
       (<= 0x7F) are copied through unchanged and every other byte becomes
       one U+FFFD. This is a simplification - full EUC-JP would need a
       separate decoder. *)
    let len = Bytes.length data in
    let buf = Buffer.create len in
    let s = Bytes.sub_string data bom_len (len - bom_len) in
    String.iter (fun c ->
      if Char.code c <= 0x7F then
        Buffer.add_char buf c
      else
        Uutf.Buffer.add_utf_8 buf replacement_char
    ) s;
    Buffer.contents buf

(** [decode data ?transport_encoding ()] picks an encoding for [data] and
    decodes it with {!decode_with_encoding}.

    The encoding is chosen from the first source that yields one:
    + a byte order mark found by [Encoding_bom.sniff] (its bytes are skipped);
    + [transport_encoding] (e.g. from an HTTP Content-Type header), when
      [Encoding_labels.normalize_label] recognises the label;
    + the result of [Encoding_prescan.prescan_for_meta_charset];
    + Windows-1252 as the final default.

    @return the pair [(decoded_text, encoding_used)]. *)
let decode data ?transport_encoding () =
  (* Decode with the selected encoding and report which one was used. *)
  let finish enc ~bom_len = (decode_with_encoding data enc ~bom_len, enc) in
  match Encoding_bom.sniff data with
  | Some (bom_enc, bom_len) ->
    (* A BOM takes precedence over every other hint. *)
    finish bom_enc ~bom_len
  | None ->
    (* Transport-level label, if one was supplied and is recognised. *)
    let declared =
      match transport_encoding with
      | None -> None
      | Some label -> Encoding_labels.normalize_label label
    in
    (* Only prescan the document when the transport gave no usable answer. *)
    let chosen =
      match declared with
      | Some _ -> declared
      | None -> Encoding_prescan.prescan_for_meta_charset data
    in
    let enc =
      match chosen with
      | Some found -> found
      | None ->
        (* Nothing detected: fall back to Windows-1252. *)
        Encoding_types.Windows_1252
    in
    finish enc ~bom_len:0