OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* Input stream for tokenizer with position tracking using bytesrw
2
3 This implementation is designed to be as streaming as possible:
4 - Reads slices on-demand from the Bytes.Reader.t
5 - Only buffers what's needed for lookahead (typically 1-2 chars)
6 - Avoids string allocations in hot paths like matches_ci
7*)
8
9open Bytesrw
10
11type t = {
12 reader : Bytes.Reader.t;
13 (* Current slice and position within it *)
14 mutable current_slice : Bytes.Slice.t;
15 mutable slice_pos : int;
16 (* Lookahead buffer for reconsume and peek_n - small, typically 0-7 chars *)
17 mutable lookahead : char list;
18 (* Position tracking *)
19 mutable line : int;
20 mutable column : int;
21 (* Track if we just saw CR (for CR/LF normalization) *)
22 mutable last_was_cr : bool;
23 (* Track if we need to skip the next LF from raw stream (set after peek of CR) *)
24 mutable skip_next_lf : bool;
25 (* Error callback for surrogate/noncharacter detection *)
26 mutable error_callback : (string -> unit) option;
27}
28
29(* Create a stream from a Bytes.Reader.t *)
30let create_from_reader reader =
31 let slice = Bytes.Reader.read reader in
32 {
33 reader;
34 current_slice = slice;
35 slice_pos = 0;
36 lookahead = [];
37 line = 1;
38 column = 0;
39 last_was_cr = false;
40 skip_next_lf = false;
41 error_callback = None;
42 }
43
44let set_error_callback t cb =
45 t.error_callback <- Some cb
46
47(* Check if a Unicode codepoint is a surrogate *)
48let is_surrogate cp = cp >= 0xD800 && cp <= 0xDFFF
49
50(* Check if a Unicode codepoint is a noncharacter *)
51let is_noncharacter cp =
52 (* U+FDD0 to U+FDEF *)
53 (cp >= 0xFDD0 && cp <= 0xFDEF) ||
54 (* U+FFFE and U+FFFF in each plane (0-16) *)
55 ((cp land 0xFFFF) = 0xFFFE || (cp land 0xFFFF) = 0xFFFF)
56
57(* Create a stream from a string - discouraged, prefer create_from_reader *)
58let create input =
59 create_from_reader (Bytes.Reader.of_string input)
60
61let position t = (t.line, t.column)
62
63(* Read next raw byte from the stream (before CR/LF normalization) *)
64let read_raw_char t =
65 (* First check lookahead *)
66 match t.lookahead with
67 | c :: rest ->
68 t.lookahead <- rest;
69 Some c
70 | [] ->
71 (* Check if current slice is exhausted *)
72 if Bytes.Slice.is_eod t.current_slice then
73 None
74 else if t.slice_pos >= Bytes.Slice.length t.current_slice then begin
75 (* Get next slice *)
76 t.current_slice <- Bytes.Reader.read t.reader;
77 t.slice_pos <- 0;
78 if Bytes.Slice.is_eod t.current_slice then
79 None
80 else begin
81 let c = Bytes.get (Bytes.Slice.bytes t.current_slice)
82 (Bytes.Slice.first t.current_slice + t.slice_pos) in
83 t.slice_pos <- t.slice_pos + 1;
84 Some c
85 end
86 end else begin
87 let c = Bytes.get (Bytes.Slice.bytes t.current_slice)
88 (Bytes.Slice.first t.current_slice + t.slice_pos) in
89 t.slice_pos <- t.slice_pos + 1;
90 Some c
91 end
92
93(* Push a char back to lookahead *)
94let push_back_char t c =
95 t.lookahead <- c :: t.lookahead
96
97(* Check for surrogates and noncharacters in UTF-8 sequences.
98 Called after reading a lead byte, peeks continuation bytes to decode codepoint. *)
99let check_utf8_codepoint t lead_byte =
100 let b0 = Char.code lead_byte in
101 if b0 < 0x80 then
102 (* ASCII - control characters are handled in tokenizer_impl.ml *)
103 ()
104 else if b0 >= 0xC2 && b0 <= 0xDF then begin
105 (* 2-byte sequence: 110xxxxx 10xxxxxx -> U+0080 to U+07FF *)
106 (* Check for C1 control characters U+0080-U+009F *)
107 match read_raw_char t with
108 | Some c1 when (Char.code c1 land 0xC0) = 0x80 ->
109 let b1 = Char.code c1 in
110 let cp = ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F) in
111 push_back_char t c1;
112 (* C1 controls: U+0080 to U+009F *)
113 if cp >= 0x80 && cp <= 0x9F then
114 (match t.error_callback with
115 | Some cb -> cb (Printf.sprintf "control-character-in-input-stream:%04x" cp)
116 | None -> ())
117 | Some c1 ->
118 push_back_char t c1
119 | None -> ()
120 end else if b0 >= 0xE0 && b0 <= 0xEF then begin
121 (* 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx -> U+0800 to U+FFFF *)
122 (* Need to peek 2 continuation bytes *)
123 match read_raw_char t with
124 | Some c1 when (Char.code c1 land 0xC0) = 0x80 ->
125 let b1 = Char.code c1 in
126 (match read_raw_char t with
127 | Some c2 when (Char.code c2 land 0xC0) = 0x80 ->
128 let b2 = Char.code c2 in
129 let cp = ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F) in
130 push_back_char t c2;
131 push_back_char t c1;
132 (* Check for surrogates and noncharacters *)
133 (match t.error_callback with
134 | Some cb ->
135 if is_surrogate cp then cb (Printf.sprintf "surrogate-in-input-stream:%04x" cp)
136 else if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%04x" cp)
137 | None -> ())
138 | Some c2 ->
139 push_back_char t c2;
140 push_back_char t c1
141 | None ->
142 push_back_char t c1)
143 | Some c1 ->
144 push_back_char t c1
145 | None -> ()
146 end else if b0 >= 0xF0 && b0 <= 0xF4 then begin
147 (* 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -> U+10000 to U+10FFFF *)
148 match read_raw_char t with
149 | Some c1 when (Char.code c1 land 0xC0) = 0x80 ->
150 let b1 = Char.code c1 in
151 (match read_raw_char t with
152 | Some c2 when (Char.code c2 land 0xC0) = 0x80 ->
153 let b2 = Char.code c2 in
154 (match read_raw_char t with
155 | Some c3 when (Char.code c3 land 0xC0) = 0x80 ->
156 let b3 = Char.code c3 in
157 let cp = ((b0 land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) lor
158 ((b2 land 0x3F) lsl 6) lor (b3 land 0x3F) in
159 push_back_char t c3;
160 push_back_char t c2;
161 push_back_char t c1;
162 (* Check for noncharacters (no surrogates in 4-byte range) *)
163 (match t.error_callback with
164 | Some cb ->
165 if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%05x" cp)
166 | None -> ())
167 | Some c3 ->
168 push_back_char t c3;
169 push_back_char t c2;
170 push_back_char t c1
171 | None ->
172 push_back_char t c2;
173 push_back_char t c1)
174 | Some c2 ->
175 push_back_char t c2;
176 push_back_char t c1
177 | None ->
178 push_back_char t c1)
179 | Some c1 ->
180 push_back_char t c1
181 | None -> ()
182 end
183
184(* Read next char with CR/LF normalization *)
185let rec read_normalized_char t =
186 (* Track if we're reading from lookahead - if so, we've already checked this byte *)
187 let from_lookahead = t.lookahead <> [] in
188 match read_raw_char t with
189 | None ->
190 t.last_was_cr <- false;
191 None
192 | Some '\r' ->
193 t.last_was_cr <- true;
194 (* Immediately consume following LF if present (CRLF -> single LF) *)
195 (match read_raw_char t with
196 | Some '\n' -> () (* Consume the LF that follows CR *)
197 | Some c -> push_back_char t c (* Put non-LF char back *)
198 | None -> ());
199 Some '\n' (* CR (or CRLF) becomes single LF *)
200 | Some '\n' when t.last_was_cr ->
201 (* Skip LF after CR - it was already converted *)
202 t.last_was_cr <- false;
203 read_normalized_char t
204 | Some c ->
205 t.last_was_cr <- false;
206 (* Only check for surrogates/noncharacters when reading fresh from stream,
207 not when re-reading from lookahead (to avoid duplicate errors) *)
208 if not from_lookahead then check_utf8_codepoint t c;
209 Some c
210
211let is_eof t =
212 t.lookahead = [] &&
213 (Bytes.Slice.is_eod t.current_slice ||
214 (t.slice_pos >= Bytes.Slice.length t.current_slice &&
215 (let next = Bytes.Reader.read t.reader in
216 t.current_slice <- next;
217 t.slice_pos <- 0;
218 Bytes.Slice.is_eod next)))
219
220let peek t =
221 (* Save last_was_cr state before reading *)
222 let saved_last_was_cr = t.last_was_cr in
223 match read_normalized_char t with
224 | None -> None
225 | Some c ->
226 push_back_char t c;
227 (* Restore the last_was_cr state so advance handles CR/LF correctly *)
228 t.last_was_cr <- saved_last_was_cr;
229 Some c
230
231(* Read n characters into a list, returns (chars_read, all_read_successfully) *)
232let peek_chars t n =
233 let rec collect acc remaining =
234 if remaining <= 0 then (List.rev acc, true)
235 else match read_normalized_char t with
236 | None -> (List.rev acc, false) (* Not enough chars available *)
237 | Some c -> collect (c :: acc) (remaining - 1)
238 in
239 let (chars, success) = collect [] n in
240 (* Always push back characters we read, in reverse order *)
241 List.iter (push_back_char t) (List.rev chars);
242 t.last_was_cr <- false;
243 (chars, success)
244
245(* peek_n returns Some string only when exactly n chars are available
246 Avoid using this in hot paths - prefer peek_chars + direct comparison *)
247let peek_n t n =
248 let (chars, success) = peek_chars t n in
249 if success then
250 Some (String.init n (fun i -> List.nth chars i))
251 else
252 None
253
254let advance t =
255 match read_normalized_char t with
256 | None -> ()
257 | Some c ->
258 (* Update position tracking *)
259 if c = '\n' then begin
260 t.line <- t.line + 1;
261 t.column <- 0
262 end else
263 t.column <- t.column + 1
264
265let consume t =
266 let c = peek t in
267 advance t;
268 c
269
270let consume_if t pred =
271 match peek t with
272 | Some c when pred c -> advance t; Some c
273 | _ -> None
274
275let consume_while t pred =
276 let buf = Buffer.create 16 in
277 let rec loop () =
278 match peek t with
279 | Some c when pred c ->
280 Buffer.add_char buf c;
281 advance t;
282 loop ()
283 | _ -> ()
284 in
285 loop ();
286 Buffer.contents buf
287
288(* Case-insensitive match without allocating a string
289 Compares directly with the char list from peek_chars *)
290let matches_ci t s =
291 let slen = String.length s in
292 let (chars, success) = peek_chars t slen in
293 if not success then false
294 else begin
295 let rec check chars_remaining i =
296 match chars_remaining with
297 | [] -> i >= slen (* Matched all *)
298 | c :: rest ->
299 if i >= slen then true
300 else
301 let c1 = Char.lowercase_ascii c in
302 let c2 = Char.lowercase_ascii (String.unsafe_get s i) in
303 if c1 = c2 then check rest (i + 1)
304 else false
305 in
306 check chars 0
307 end
308
309let consume_exact_ci t s =
310 if matches_ci t s then begin
311 for _ = 1 to String.length s do advance t done;
312 true
313 end else false
314
315let reconsume t =
316 (* Move back one position - simplified, doesn't handle CR/LF properly for reconsume *)
317 (* This is called after advance, so we just need to push back a placeholder *)
318 (* The tokenizer will call peek again which will get the right character *)
319 (* Actually, for reconsume we need to track what we last consumed *)
320 (* For now, just adjust column *)
321 if t.column > 0 then t.column <- t.column - 1