lib/html5rw/tokenizer/tokenizer_stream.ml at main · anil.recoil.org/ocaml-html5rw

anil.recoil.org / ocaml-html5rw
fork
OCaml HTML5 parser/serialiser based on Python's JustHTML
fork
ocaml-html5rw / lib / html5rw / tokenizer / tokenizer_stream.ml
at main 321 lines 11 kB view raw
wrap content
Anil Madhavapeddy more 4mo ago
aa572f37
  1(* Input stream for tokenizer with position tracking using bytesrw
  2
  3   This implementation is designed to be as streaming as possible:
  4   - Reads slices on-demand from the Bytes.Reader.t
  5   - Only buffers what's needed for lookahead (typically 1-2 chars)
  6   - Avoids string allocations in hot paths like matches_ci
  7*)
  8
  9open Bytesrw
 10
 11type t = {
 12  reader : Bytes.Reader.t;
 13  (* Current slice and position within it *)
 14  mutable current_slice : Bytes.Slice.t;
 15  mutable slice_pos : int;
 16  (* Lookahead buffer for reconsume and peek_n - small, typically 0-7 chars *)
 17  mutable lookahead : char list;
 18  (* Position tracking *)
 19  mutable line : int;
 20  mutable column : int;
 21  (* Track if we just saw CR (for CR/LF normalization) *)
 22  mutable last_was_cr : bool;
 23  (* Track if we need to skip the next LF from raw stream (set after peek of CR) *)
 24  mutable skip_next_lf : bool;
 25  (* Error callback for surrogate/noncharacter detection *)
 26  mutable error_callback : (string -> unit) option;
 27}
 28
 29(* Create a stream from a Bytes.Reader.t *)
 30let create_from_reader reader =
 31  let slice = Bytes.Reader.read reader in
 32  {
 33    reader;
 34    current_slice = slice;
 35    slice_pos = 0;
 36    lookahead = [];
 37    line = 1;
 38    column = 0;
 39    last_was_cr = false;
 40    skip_next_lf = false;
 41    error_callback = None;
 42  }
 43
 44let set_error_callback t cb =
 45  t.error_callback <- Some cb
 46
 47(* Check if a Unicode codepoint is a surrogate *)
 48let is_surrogate cp = cp >= 0xD800 && cp <= 0xDFFF
 49
 50(* Check if a Unicode codepoint is a noncharacter *)
 51let is_noncharacter cp =
 52  (* U+FDD0 to U+FDEF *)
 53  (cp >= 0xFDD0 && cp <= 0xFDEF) ||
 54  (* U+FFFE and U+FFFF in each plane (0-16) *)
 55  ((cp land 0xFFFF) = 0xFFFE || (cp land 0xFFFF) = 0xFFFF)
 56
 57(* Create a stream from a string - discouraged, prefer create_from_reader *)
 58let create input =
 59  create_from_reader (Bytes.Reader.of_string input)
 60
 61let position t = (t.line, t.column)
 62
 63(* Read next raw byte from the stream (before CR/LF normalization) *)
 64let read_raw_char t =
 65  (* First check lookahead *)
 66  match t.lookahead with
 67  | c :: rest ->
 68    t.lookahead <- rest;
 69    Some c
 70  | [] ->
 71    (* Check if current slice is exhausted *)
 72    if Bytes.Slice.is_eod t.current_slice then
 73      None
 74    else if t.slice_pos >= Bytes.Slice.length t.current_slice then begin
 75      (* Get next slice *)
 76      t.current_slice <- Bytes.Reader.read t.reader;
 77      t.slice_pos <- 0;
 78      if Bytes.Slice.is_eod t.current_slice then
 79        None
 80      else begin
 81        let c = Bytes.get (Bytes.Slice.bytes t.current_slice)
 82                         (Bytes.Slice.first t.current_slice + t.slice_pos) in
 83        t.slice_pos <- t.slice_pos + 1;
 84        Some c
 85      end
 86    end else begin
 87      let c = Bytes.get (Bytes.Slice.bytes t.current_slice)
 88                       (Bytes.Slice.first t.current_slice + t.slice_pos) in
 89      t.slice_pos <- t.slice_pos + 1;
 90      Some c
 91    end
 92
 93(* Push a char back to lookahead *)
 94let push_back_char t c =
 95  t.lookahead <- c :: t.lookahead
 96
 97(* Check for surrogates and noncharacters in UTF-8 sequences.
 98   Called after reading a lead byte, peeks continuation bytes to decode codepoint. *)
 99let check_utf8_codepoint t lead_byte =
100  let b0 = Char.code lead_byte in
101  if b0 < 0x80 then
102    (* ASCII - control characters are handled in tokenizer_impl.ml *)
103    ()
104  else if b0 >= 0xC2 && b0 <= 0xDF then begin
105    (* 2-byte sequence: 110xxxxx 10xxxxxx -> U+0080 to U+07FF *)
106    (* Check for C1 control characters U+0080-U+009F *)
107    match read_raw_char t with
108    | Some c1 when (Char.code c1 land 0xC0) = 0x80 ->
109      let b1 = Char.code c1 in
110      let cp = ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F) in
111      push_back_char t c1;
112      (* C1 controls: U+0080 to U+009F *)
113      if cp >= 0x80 && cp <= 0x9F then
114        (match t.error_callback with
115        | Some cb -> cb (Printf.sprintf "control-character-in-input-stream:%04x" cp)
116        | None -> ())
117    | Some c1 ->
118      push_back_char t c1
119    | None -> ()
120  end else if b0 >= 0xE0 && b0 <= 0xEF then begin
121    (* 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx -> U+0800 to U+FFFF *)
122    (* Need to peek 2 continuation bytes *)
123    match read_raw_char t with
124    | Some c1 when (Char.code c1 land 0xC0) = 0x80 ->
125      let b1 = Char.code c1 in
126      (match read_raw_char t with
127      | Some c2 when (Char.code c2 land 0xC0) = 0x80 ->
128        let b2 = Char.code c2 in
129        let cp = ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F) in
130        push_back_char t c2;
131        push_back_char t c1;
132        (* Check for surrogates and noncharacters *)
133        (match t.error_callback with
134        | Some cb ->
135          if is_surrogate cp then cb (Printf.sprintf "surrogate-in-input-stream:%04x" cp)
136          else if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%04x" cp)
137        | None -> ())
138      | Some c2 ->
139        push_back_char t c2;
140        push_back_char t c1
141      | None ->
142        push_back_char t c1)
143    | Some c1 ->
144      push_back_char t c1
145    | None -> ()
146  end else if b0 >= 0xF0 && b0 <= 0xF4 then begin
147    (* 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -> U+10000 to U+10FFFF *)
148    match read_raw_char t with
149    | Some c1 when (Char.code c1 land 0xC0) = 0x80 ->
150      let b1 = Char.code c1 in
151      (match read_raw_char t with
152      | Some c2 when (Char.code c2 land 0xC0) = 0x80 ->
153        let b2 = Char.code c2 in
154        (match read_raw_char t with
155        | Some c3 when (Char.code c3 land 0xC0) = 0x80 ->
156          let b3 = Char.code c3 in
157          let cp = ((b0 land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) lor
158                   ((b2 land 0x3F) lsl 6) lor (b3 land 0x3F) in
159          push_back_char t c3;
160          push_back_char t c2;
161          push_back_char t c1;
162          (* Check for noncharacters (no surrogates in 4-byte range) *)
163          (match t.error_callback with
164          | Some cb ->
165            if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%05x" cp)
166          | None -> ())
167        | Some c3 ->
168          push_back_char t c3;
169          push_back_char t c2;
170          push_back_char t c1
171        | None ->
172          push_back_char t c2;
173          push_back_char t c1)
174      | Some c2 ->
175        push_back_char t c2;
176        push_back_char t c1
177      | None ->
178        push_back_char t c1)
179    | Some c1 ->
180      push_back_char t c1
181    | None -> ()
182  end
183
184(* Read next char with CR/LF normalization *)
185let rec read_normalized_char t =
186  (* Track if we're reading from lookahead - if so, we've already checked this byte *)
187  let from_lookahead = t.lookahead <> [] in
188  match read_raw_char t with
189  | None ->
190    t.last_was_cr <- false;
191    None
192  | Some '\r' ->
193    t.last_was_cr <- true;
194    (* Immediately consume following LF if present (CRLF -> single LF) *)
195    (match read_raw_char t with
196     | Some '\n' -> ()  (* Consume the LF that follows CR *)
197     | Some c -> push_back_char t c  (* Put non-LF char back *)
198     | None -> ());
199    Some '\n'  (* CR (or CRLF) becomes single LF *)
200  | Some '\n' when t.last_was_cr ->
201    (* Skip LF after CR - it was already converted *)
202    t.last_was_cr <- false;
203    read_normalized_char t
204  | Some c ->
205    t.last_was_cr <- false;
206    (* Only check for surrogates/noncharacters when reading fresh from stream,
207       not when re-reading from lookahead (to avoid duplicate errors) *)
208    if not from_lookahead then check_utf8_codepoint t c;
209    Some c
210
211let is_eof t =
212  t.lookahead = [] &&
213  (Bytes.Slice.is_eod t.current_slice ||
214   (t.slice_pos >= Bytes.Slice.length t.current_slice &&
215    (let next = Bytes.Reader.read t.reader in
216     t.current_slice <- next;
217     t.slice_pos <- 0;
218     Bytes.Slice.is_eod next)))
219
220let peek t =
221  (* Save last_was_cr state before reading *)
222  let saved_last_was_cr = t.last_was_cr in
223  match read_normalized_char t with
224  | None -> None
225  | Some c ->
226    push_back_char t c;
227    (* Restore the last_was_cr state so advance handles CR/LF correctly *)
228    t.last_was_cr <- saved_last_was_cr;
229    Some c
230
231(* Read n characters into a list, returns (chars_read, all_read_successfully) *)
232let peek_chars t n =
233  let rec collect acc remaining =
234    if remaining <= 0 then (List.rev acc, true)
235    else match read_normalized_char t with
236      | None -> (List.rev acc, false)  (* Not enough chars available *)
237      | Some c -> collect (c :: acc) (remaining - 1)
238  in
239  let (chars, success) = collect [] n in
240  (* Always push back characters we read, in reverse order *)
241  List.iter (push_back_char t) (List.rev chars);
242  t.last_was_cr <- false;
243  (chars, success)
244
245(* peek_n returns Some string only when exactly n chars are available
246   Avoid using this in hot paths - prefer peek_chars + direct comparison *)
247let peek_n t n =
248  let (chars, success) = peek_chars t n in
249  if success then
250    Some (String.init n (fun i -> List.nth chars i))
251  else
252    None
253
254let advance t =
255  match read_normalized_char t with
256  | None -> ()
257  | Some c ->
258    (* Update position tracking *)
259    if c = '\n' then begin
260      t.line <- t.line + 1;
261      t.column <- 0
262    end else
263      t.column <- t.column + 1
264
265let consume t =
266  let c = peek t in
267  advance t;
268  c
269
270let consume_if t pred =
271  match peek t with
272  | Some c when pred c -> advance t; Some c
273  | _ -> None
274
275let consume_while t pred =
276  let buf = Buffer.create 16 in
277  let rec loop () =
278    match peek t with
279    | Some c when pred c ->
280      Buffer.add_char buf c;
281      advance t;
282      loop ()
283    | _ -> ()
284  in
285  loop ();
286  Buffer.contents buf
287
288(* Case-insensitive match without allocating a string
289   Compares directly with the char list from peek_chars *)
290let matches_ci t s =
291  let slen = String.length s in
292  let (chars, success) = peek_chars t slen in
293  if not success then false
294  else begin
295    let rec check chars_remaining i =
296      match chars_remaining with
297      | [] -> i >= slen  (* Matched all *)
298      | c :: rest ->
299        if i >= slen then true
300        else
301          let c1 = Char.lowercase_ascii c in
302          let c2 = Char.lowercase_ascii (String.unsafe_get s i) in
303          if c1 = c2 then check rest (i + 1)
304          else false
305    in
306    check chars 0
307  end
308
309let consume_exact_ci t s =
310  if matches_ci t s then begin
311    for _ = 1 to String.length s do advance t done;
312    true
313  end else false
314
315let reconsume t =
316  (* Move back one position - simplified, doesn't handle CR/LF properly for reconsume *)
317  (* This is called after advance, so we just need to push back a placeholder *)
318  (* The tokenizer will call peek again which will get the right character *)
319  (* Actually, for reconsume we need to track what we last consumed *)
320  (* For now, just adjust column *)
321  if t.column > 0 then t.column <- t.column - 1
Configure Feed

Configure Feed