(* Input stream for tokenizer with position tracking using bytesrw

   This implementation is designed to be as streaming as possible:
   - Reads slices on-demand from the Bytes.Reader.t
   - Only buffers what's needed for lookahead (typically 1-2 chars)
   - Avoids string allocations in hot paths like matches_ci
*)

open Bytesrw

type t = {
  reader : Bytes.Reader.t;
  (* Current slice and position within it *)
  mutable current_slice : Bytes.Slice.t;
  mutable slice_pos : int;
  (* Lookahead buffer for reconsume and peek_n - small, typically 0-7 chars *)
  mutable lookahead : char list;
  (* Position tracking *)
  mutable line : int;
  mutable column : int;
  (* Track if we just saw CR (for CR/LF normalization) *)
  mutable last_was_cr : bool;
  (* Track if we need to skip the next LF from raw stream (set after peek of CR) *)
  mutable skip_next_lf : bool;
  (* Error callback for surrogate/noncharacter detection *)
  mutable error_callback : (string -> unit) option;
}

(* Create a stream from a Bytes.Reader.t *)
let create_from_reader reader =
  let slice = Bytes.Reader.read reader in
  {
    reader;
    current_slice = slice;
    slice_pos = 0;
    lookahead = [];
    line = 1;
    column = 0;
    last_was_cr = false;
    skip_next_lf = false;
    error_callback = None;
  }

let set_error_callback t cb =
  t.error_callback <- Some cb

(* Check if a Unicode codepoint is a surrogate *)
let is_surrogate cp = cp >= 0xD800 && cp <= 0xDFFF

(* Check if a Unicode codepoint is a noncharacter *)
let is_noncharacter cp =
  (* U+FDD0 to U+FDEF *)
  (cp >= 0xFDD0 && cp <= 0xFDEF) ||
  (* U+FFFE and U+FFFF in each plane (0-16) *)
  ((cp land 0xFFFF) = 0xFFFE || (cp land 0xFFFF) = 0xFFFF)

(* Create a stream from a string - discouraged, prefer create_from_reader *)
let create input =
  create_from_reader (Bytes.Reader.of_string input)

let position t = (t.line, t.column)

(* Read next raw byte from the stream (before CR/LF normalization) *)
let read_raw_char t =
  (* First check lookahead *)
  match t.lookahead with
  | c :: rest ->
    t.lookahead <- rest;
    Some c
  | [] ->
    (* Check if current slice is exhausted *)
    if Bytes.Slice.is_eod t.current_slice then
      None
    else if t.slice_pos >= Bytes.Slice.length t.current_slice then begin
      (* Get next slice *)
      t.current_slice <- Bytes.Reader.read t.reader;
      t.slice_pos <- 0;
      if Bytes.Slice.is_eod t.current_slice then
        None
      else begin
        let c = Bytes.get (Bytes.Slice.bytes t.current_slice)
                         (Bytes.Slice.first t.current_slice + t.slice_pos) in
        t.slice_pos <- t.slice_pos + 1;
        Some c
      end
    end else begin
      let c = Bytes.get (Bytes.Slice.bytes t.current_slice)
                       (Bytes.Slice.first t.current_slice + t.slice_pos) in
      t.slice_pos <- t.slice_pos + 1;
      Some c
    end

(* Push a char back to lookahead *)
let push_back_char t c =
  t.lookahead <- c :: t.lookahead

(* Check for surrogates and noncharacters in UTF-8 sequences.
   Called after reading a lead byte, peeks continuation bytes to decode codepoint. *)
let check_utf8_codepoint t lead_byte =
  let b0 = Char.code lead_byte in
  if b0 < 0x80 then
    (* ASCII - control characters are handled in tokenizer_impl.ml *)
    ()
  else if b0 >= 0xC2 && b0 <= 0xDF then begin
    (* 2-byte sequence: 110xxxxx 10xxxxxx -> U+0080 to U+07FF *)
    (* Check for C1 control characters U+0080-U+009F *)
    match read_raw_char t with
    | Some c1 when (Char.code c1 land 0xC0) = 0x80 ->
      let b1 = Char.code c1 in
      let cp = ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F) in
      push_back_char t c1;
      (* C1 controls: U+0080 to U+009F *)
      if cp >= 0x80 && cp <= 0x9F then
        (match t.error_callback with
        | Some cb -> cb (Printf.sprintf "control-character-in-input-stream:%04x" cp)
        | None -> ())
    | Some c1 ->
      push_back_char t c1
    | None -> ()
  end else if b0 >= 0xE0 && b0 <= 0xEF then begin
    (* 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx -> U+0800 to U+FFFF *)
    (* Need to peek 2 continuation bytes *)
    match read_raw_char t with
    | Some c1 when (Char.code c1 land 0xC0) = 0x80 ->
      let b1 = Char.code c1 in
      (match read_raw_char t with
      | Some c2 when (Char.code c2 land 0xC0) = 0x80 ->
        let b2 = Char.code c2 in
        let cp = ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F) in
        push_back_char t c2;
        push_back_char t c1;
        (* Check for surrogates and noncharacters *)
        (match t.error_callback with
        | Some cb ->
          if is_surrogate cp then cb (Printf.sprintf "surrogate-in-input-stream:%04x" cp)
          else if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%04x" cp)
        | None -> ())
      | Some c2 ->
        push_back_char t c2;
        push_back_char t c1
      | None ->
        push_back_char t c1)
    | Some c1 ->
      push_back_char t c1
    | None -> ()
  end else if b0 >= 0xF0 && b0 <= 0xF4 then begin
    (* 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -> U+10000 to U+10FFFF *)
    match read_raw_char t with
    | Some c1 when (Char.code c1 land 0xC0) = 0x80 ->
      let b1 = Char.code c1 in
      (match read_raw_char t with
      | Some c2 when (Char.code c2 land 0xC0) = 0x80 ->
        let b2 = Char.code c2 in
        (match read_raw_char t with
        | Some c3 when (Char.code c3 land 0xC0) = 0x80 ->
          let b3 = Char.code c3 in
          let cp = ((b0 land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) lor
                   ((b2 land 0x3F) lsl 6) lor (b3 land 0x3F) in
          push_back_char t c3;
          push_back_char t c2;
          push_back_char t c1;
          (* Check for noncharacters (no surrogates in 4-byte range) *)
          (match t.error_callback with
          | Some cb ->
            if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%05x" cp)
          | None -> ())
        | Some c3 ->
          push_back_char t c3;
          push_back_char t c2;
          push_back_char t c1
        | None ->
          push_back_char t c2;
          push_back_char t c1)
      | Some c2 ->
        push_back_char t c2;
        push_back_char t c1
      | None ->
        push_back_char t c1)
    | Some c1 ->
      push_back_char t c1
    | None -> ()
  end

(* Read next char with CR/LF normalization *)
let rec read_normalized_char t =
  (* Track if we're reading from lookahead - if so, we've already checked this byte *)
  let from_lookahead = t.lookahead <> [] in
  match read_raw_char t with
  | None ->
    t.last_was_cr <- false;
    None
  | Some '\r' ->
    t.last_was_cr <- true;
    (* Immediately consume following LF if present (CRLF -> single LF) *)
    (match read_raw_char t with
     | Some '\n' -> ()  (* Consume the LF that follows CR *)
     | Some c -> push_back_char t c  (* Put non-LF char back *)
     | None -> ());
    Some '\n'  (* CR (or CRLF) becomes single LF *)
  | Some '\n' when t.last_was_cr ->
    (* Skip LF after CR - it was already converted *)
    t.last_was_cr <- false;
    read_normalized_char t
  | Some c ->
    t.last_was_cr <- false;
    (* Only check for surrogates/noncharacters when reading fresh from stream,
       not when re-reading from lookahead (to avoid duplicate errors) *)
    if not from_lookahead then check_utf8_codepoint t c;
    Some c

let is_eof t =
  t.lookahead = [] &&
  (Bytes.Slice.is_eod t.current_slice ||
   (t.slice_pos >= Bytes.Slice.length t.current_slice &&
    (let next = Bytes.Reader.read t.reader in
     t.current_slice <- next;
     t.slice_pos <- 0;
     Bytes.Slice.is_eod next)))

let peek t =
  (* Save last_was_cr state before reading *)
  let saved_last_was_cr = t.last_was_cr in
  match read_normalized_char t with
  | None -> None
  | Some c ->
    push_back_char t c;
    (* Restore the last_was_cr state so advance handles CR/LF correctly *)
    t.last_was_cr <- saved_last_was_cr;
    Some c

(* Read n characters into a list, returns (chars_read, all_read_successfully) *)
let peek_chars t n =
  let rec collect acc remaining =
    if remaining <= 0 then (List.rev acc, true)
    else match read_normalized_char t with
      | None -> (List.rev acc, false)  (* Not enough chars available *)
      | Some c -> collect (c :: acc) (remaining - 1)
  in
  let (chars, success) = collect [] n in
  (* Always push back characters we read, in reverse order *)
  List.iter (push_back_char t) (List.rev chars);
  t.last_was_cr <- false;
  (chars, success)

(* peek_n returns Some string only when exactly n chars are available
   Avoid using this in hot paths - prefer peek_chars + direct comparison *)
let peek_n t n =
  let (chars, success) = peek_chars t n in
  if success then
    Some (String.init n (fun i -> List.nth chars i))
  else
    None

let advance t =
  match read_normalized_char t with
  | None -> ()
  | Some c ->
    (* Update position tracking *)
    if c = '\n' then begin
      t.line <- t.line + 1;
      t.column <- 0
    end else
      t.column <- t.column + 1

let consume t =
  let c = peek t in
  advance t;
  c

let consume_if t pred =
  match peek t with
  | Some c when pred c -> advance t; Some c
  | _ -> None

let consume_while t pred =
  let buf = Buffer.create 16 in
  let rec loop () =
    match peek t with
    | Some c when pred c ->
      Buffer.add_char buf c;
      advance t;
      loop ()
    | _ -> ()
  in
  loop ();
  Buffer.contents buf

(* Case-insensitive match without allocating a string
   Compares directly with the char list from peek_chars *)
let matches_ci t s =
  let slen = String.length s in
  let (chars, success) = peek_chars t slen in
  if not success then false
  else begin
    let rec check chars_remaining i =
      match chars_remaining with
      | [] -> i >= slen  (* Matched all *)
      | c :: rest ->
        if i >= slen then true
        else
          let c1 = Char.lowercase_ascii c in
          let c2 = Char.lowercase_ascii (String.unsafe_get s i) in
          if c1 = c2 then check rest (i + 1)
          else false
    in
    check chars 0
  end

let consume_exact_ci t s =
  if matches_ci t s then begin
    for _ = 1 to String.length s do advance t done;
    true
  end else false

let reconsume t =
  (* Move back one position - simplified, doesn't handle CR/LF properly for reconsume *)
  (* This is called after advance, so we just need to push back a placeholder *)
  (* The tokenizer will call peek again which will get the right character *)
  (* Actually, for reconsume we need to track what we last consumed *)
  (* For now, just adjust column *)
  if t.column > 0 then t.column <- t.column - 1