(* Input stream for tokenizer with position tracking using bytesrw This implementation is designed to be as streaming as possible: - Reads slices on-demand from the Bytes.Reader.t - Only buffers what's needed for lookahead (typically 1-2 chars) - Avoids string allocations in hot paths like matches_ci *) open Bytesrw type t = { reader : Bytes.Reader.t; (* Current slice and position within it *) mutable current_slice : Bytes.Slice.t; mutable slice_pos : int; (* Lookahead buffer for reconsume and peek_n - small, typically 0-7 chars *) mutable lookahead : char list; (* Position tracking *) mutable line : int; mutable column : int; (* Track if we just saw CR (for CR/LF normalization) *) mutable last_was_cr : bool; (* Track if we need to skip the next LF from raw stream (set after peek of CR) *) mutable skip_next_lf : bool; (* Error callback for surrogate/noncharacter detection *) mutable error_callback : (string -> unit) option; } (* Create a stream from a Bytes.Reader.t *) let create_from_reader reader = let slice = Bytes.Reader.read reader in { reader; current_slice = slice; slice_pos = 0; lookahead = []; line = 1; column = 0; last_was_cr = false; skip_next_lf = false; error_callback = None; } let set_error_callback t cb = t.error_callback <- Some cb (* Check if a Unicode codepoint is a surrogate *) let is_surrogate cp = cp >= 0xD800 && cp <= 0xDFFF (* Check if a Unicode codepoint is a noncharacter *) let is_noncharacter cp = (* U+FDD0 to U+FDEF *) (cp >= 0xFDD0 && cp <= 0xFDEF) || (* U+FFFE and U+FFFF in each plane (0-16) *) ((cp land 0xFFFF) = 0xFFFE || (cp land 0xFFFF) = 0xFFFF) (* Create a stream from a string - discouraged, prefer create_from_reader *) let create input = create_from_reader (Bytes.Reader.of_string input) let position t = (t.line, t.column) (* Read next raw byte from the stream (before CR/LF normalization) *) let read_raw_char t = (* First check lookahead *) match t.lookahead with | c :: rest -> t.lookahead <- rest; Some c | [] -> (* Check if current slice is exhausted *) if Bytes.Slice.is_eod t.current_slice then None else if t.slice_pos >= Bytes.Slice.length t.current_slice then begin (* Get next slice *) t.current_slice <- Bytes.Reader.read t.reader; t.slice_pos <- 0; if Bytes.Slice.is_eod t.current_slice then None else begin let c = Bytes.get (Bytes.Slice.bytes t.current_slice) (Bytes.Slice.first t.current_slice + t.slice_pos) in t.slice_pos <- t.slice_pos + 1; Some c end end else begin let c = Bytes.get (Bytes.Slice.bytes t.current_slice) (Bytes.Slice.first t.current_slice + t.slice_pos) in t.slice_pos <- t.slice_pos + 1; Some c end (* Push a char back to lookahead *) let push_back_char t c = t.lookahead <- c :: t.lookahead (* Check for surrogates and noncharacters in UTF-8 sequences. Called after reading a lead byte, peeks continuation bytes to decode codepoint. *) let check_utf8_codepoint t lead_byte = let b0 = Char.code lead_byte in if b0 < 0x80 then (* ASCII - control characters are handled in tokenizer_impl.ml *) () else if b0 >= 0xC2 && b0 <= 0xDF then begin (* 2-byte sequence: 110xxxxx 10xxxxxx -> U+0080 to U+07FF *) (* Check for C1 control characters U+0080-U+009F *) match read_raw_char t with | Some c1 when (Char.code c1 land 0xC0) = 0x80 -> let b1 = Char.code c1 in let cp = ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F) in push_back_char t c1; (* C1 controls: U+0080 to U+009F *) if cp >= 0x80 && cp <= 0x9F then (match t.error_callback with | Some cb -> cb (Printf.sprintf "control-character-in-input-stream:%04x" cp) | None -> ()) | Some c1 -> push_back_char t c1 | None -> () end else if b0 >= 0xE0 && b0 <= 0xEF then begin (* 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx -> U+0800 to U+FFFF *) (* Need to peek 2 continuation bytes *) match read_raw_char t with | Some c1 when (Char.code c1 land 0xC0) = 0x80 -> let b1 = Char.code c1 in (match read_raw_char t with | Some c2 when (Char.code c2 land 0xC0) = 0x80 -> let b2 = Char.code c2 in let cp = ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F) in push_back_char t c2; push_back_char t c1; (* Check for surrogates and noncharacters *) (match t.error_callback with | Some cb -> if is_surrogate cp then cb (Printf.sprintf "surrogate-in-input-stream:%04x" cp) else if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%04x" cp) | None -> ()) | Some c2 -> push_back_char t c2; push_back_char t c1 | None -> push_back_char t c1) | Some c1 -> push_back_char t c1 | None -> () end else if b0 >= 0xF0 && b0 <= 0xF4 then begin (* 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -> U+10000 to U+10FFFF *) match read_raw_char t with | Some c1 when (Char.code c1 land 0xC0) = 0x80 -> let b1 = Char.code c1 in (match read_raw_char t with | Some c2 when (Char.code c2 land 0xC0) = 0x80 -> let b2 = Char.code c2 in (match read_raw_char t with | Some c3 when (Char.code c3 land 0xC0) = 0x80 -> let b3 = Char.code c3 in let cp = ((b0 land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) lor ((b2 land 0x3F) lsl 6) lor (b3 land 0x3F) in push_back_char t c3; push_back_char t c2; push_back_char t c1; (* Check for noncharacters (no surrogates in 4-byte range) *) (match t.error_callback with | Some cb -> if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%05x" cp) | None -> ()) | Some c3 -> push_back_char t c3; push_back_char t c2; push_back_char t c1 | None -> push_back_char t c2; push_back_char t c1) | Some c2 -> push_back_char t c2; push_back_char t c1 | None -> push_back_char t c1) | Some c1 -> push_back_char t c1 | None -> () end (* Read next char with CR/LF normalization *) let rec read_normalized_char t = (* Track if we're reading from lookahead - if so, we've already checked this byte *) let from_lookahead = t.lookahead <> [] in match read_raw_char t with | None -> t.last_was_cr <- false; None | Some '\r' -> t.last_was_cr <- true; (* Immediately consume following LF if present (CRLF -> single LF) *) (match read_raw_char t with | Some '\n' -> () (* Consume the LF that follows CR *) | Some c -> push_back_char t c (* Put non-LF char back *) | None -> ()); Some '\n' (* CR (or CRLF) becomes single LF *) | Some '\n' when t.last_was_cr -> (* Skip LF after CR - it was already converted *) t.last_was_cr <- false; read_normalized_char t | Some c -> t.last_was_cr <- false; (* Only check for surrogates/noncharacters when reading fresh from stream, not when re-reading from lookahead (to avoid duplicate errors) *) if not from_lookahead then check_utf8_codepoint t c; Some c let is_eof t = t.lookahead = [] && (Bytes.Slice.is_eod t.current_slice || (t.slice_pos >= Bytes.Slice.length t.current_slice && (let next = Bytes.Reader.read t.reader in t.current_slice <- next; t.slice_pos <- 0; Bytes.Slice.is_eod next))) let peek t = (* Save last_was_cr state before reading *) let saved_last_was_cr = t.last_was_cr in match read_normalized_char t with | None -> None | Some c -> push_back_char t c; (* Restore the last_was_cr state so advance handles CR/LF correctly *) t.last_was_cr <- saved_last_was_cr; Some c (* Read n characters into a list, returns (chars_read, all_read_successfully) *) let peek_chars t n = let rec collect acc remaining = if remaining <= 0 then (List.rev acc, true) else match read_normalized_char t with | None -> (List.rev acc, false) (* Not enough chars available *) | Some c -> collect (c :: acc) (remaining - 1) in let (chars, success) = collect [] n in (* Always push back characters we read, in reverse order *) List.iter (push_back_char t) (List.rev chars); t.last_was_cr <- false; (chars, success) (* peek_n returns Some string only when exactly n chars are available Avoid using this in hot paths - prefer peek_chars + direct comparison *) let peek_n t n = let (chars, success) = peek_chars t n in if success then Some (String.init n (fun i -> List.nth chars i)) else None let advance t = match read_normalized_char t with | None -> () | Some c -> (* Update position tracking *) if c = '\n' then begin t.line <- t.line + 1; t.column <- 0 end else t.column <- t.column + 1 let consume t = let c = peek t in advance t; c let consume_if t pred = match peek t with | Some c when pred c -> advance t; Some c | _ -> None let consume_while t pred = let buf = Buffer.create 16 in let rec loop () = match peek t with | Some c when pred c -> Buffer.add_char buf c; advance t; loop () | _ -> () in loop (); Buffer.contents buf (* Case-insensitive match without allocating a string Compares directly with the char list from peek_chars *) let matches_ci t s = let slen = String.length s in let (chars, success) = peek_chars t slen in if not success then false else begin let rec check chars_remaining i = match chars_remaining with | [] -> i >= slen (* Matched all *) | c :: rest -> if i >= slen then true else let c1 = Char.lowercase_ascii c in let c2 = Char.lowercase_ascii (String.unsafe_get s i) in if c1 = c2 then check rest (i + 1) else false in check chars 0 end let consume_exact_ci t s = if matches_ci t s then begin for _ = 1 to String.length s do advance t done; true end else false let reconsume t = (* Move back one position - simplified, doesn't handle CR/LF properly for reconsume *) (* This is called after advance, so we just need to push back a placeholder *) (* The tokenizer will call peek again which will get the right character *) (* Actually, for reconsume we need to track what we last consumed *) (* For now, just adjust column *) if t.column > 0 then t.column <- t.column - 1