OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 321 lines 11 kB view raw
1(* Input stream for tokenizer with position tracking using bytesrw 2 3 This implementation is designed to be as streaming as possible: 4 - Reads slices on-demand from the Bytes.Reader.t 5 - Only buffers what's needed for lookahead (typically 1-2 chars) 6 - Avoids string allocations in hot paths like matches_ci 7*) 8 9open Bytesrw 10 11type t = { 12 reader : Bytes.Reader.t; 13 (* Current slice and position within it *) 14 mutable current_slice : Bytes.Slice.t; 15 mutable slice_pos : int; 16 (* Lookahead buffer for reconsume and peek_n - small, typically 0-7 chars *) 17 mutable lookahead : char list; 18 (* Position tracking *) 19 mutable line : int; 20 mutable column : int; 21 (* Track if we just saw CR (for CR/LF normalization) *) 22 mutable last_was_cr : bool; 23 (* Track if we need to skip the next LF from raw stream (set after peek of CR) *) 24 mutable skip_next_lf : bool; 25 (* Error callback for surrogate/noncharacter detection *) 26 mutable error_callback : (string -> unit) option; 27} 28 29(* Create a stream from a Bytes.Reader.t *) 30let create_from_reader reader = 31 let slice = Bytes.Reader.read reader in 32 { 33 reader; 34 current_slice = slice; 35 slice_pos = 0; 36 lookahead = []; 37 line = 1; 38 column = 0; 39 last_was_cr = false; 40 skip_next_lf = false; 41 error_callback = None; 42 } 43 44let set_error_callback t cb = 45 t.error_callback <- Some cb 46 47(* Check if a Unicode codepoint is a surrogate *) 48let is_surrogate cp = cp >= 0xD800 && cp <= 0xDFFF 49 50(* Check if a Unicode codepoint is a noncharacter *) 51let is_noncharacter cp = 52 (* U+FDD0 to U+FDEF *) 53 (cp >= 0xFDD0 && cp <= 0xFDEF) || 54 (* U+FFFE and U+FFFF in each plane (0-16) *) 55 ((cp land 0xFFFF) = 0xFFFE || (cp land 0xFFFF) = 0xFFFF) 56 57(* Create a stream from a string - discouraged, prefer create_from_reader *) 58let create input = 59 create_from_reader (Bytes.Reader.of_string input) 60 61let position t = (t.line, t.column) 62 63(* Read next raw byte from the stream (before CR/LF normalization) *) 64let read_raw_char t = 65 (* First check lookahead *) 66 match t.lookahead with 67 | c :: rest -> 68 t.lookahead <- rest; 69 Some c 70 | [] -> 71 (* Check if current slice is exhausted *) 72 if Bytes.Slice.is_eod t.current_slice then 73 None 74 else if t.slice_pos >= Bytes.Slice.length t.current_slice then begin 75 (* Get next slice *) 76 t.current_slice <- Bytes.Reader.read t.reader; 77 t.slice_pos <- 0; 78 if Bytes.Slice.is_eod t.current_slice then 79 None 80 else begin 81 let c = Bytes.get (Bytes.Slice.bytes t.current_slice) 82 (Bytes.Slice.first t.current_slice + t.slice_pos) in 83 t.slice_pos <- t.slice_pos + 1; 84 Some c 85 end 86 end else begin 87 let c = Bytes.get (Bytes.Slice.bytes t.current_slice) 88 (Bytes.Slice.first t.current_slice + t.slice_pos) in 89 t.slice_pos <- t.slice_pos + 1; 90 Some c 91 end 92 93(* Push a char back to lookahead *) 94let push_back_char t c = 95 t.lookahead <- c :: t.lookahead 96 97(* Check for surrogates and noncharacters in UTF-8 sequences. 98 Called after reading a lead byte, peeks continuation bytes to decode codepoint. *) 99let check_utf8_codepoint t lead_byte = 100 let b0 = Char.code lead_byte in 101 if b0 < 0x80 then 102 (* ASCII - control characters are handled in tokenizer_impl.ml *) 103 () 104 else if b0 >= 0xC2 && b0 <= 0xDF then begin 105 (* 2-byte sequence: 110xxxxx 10xxxxxx -> U+0080 to U+07FF *) 106 (* Check for C1 control characters U+0080-U+009F *) 107 match read_raw_char t with 108 | Some c1 when (Char.code c1 land 0xC0) = 0x80 -> 109 let b1 = Char.code c1 in 110 let cp = ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F) in 111 push_back_char t c1; 112 (* C1 controls: U+0080 to U+009F *) 113 if cp >= 0x80 && cp <= 0x9F then 114 (match t.error_callback with 115 | Some cb -> cb (Printf.sprintf "control-character-in-input-stream:%04x" cp) 116 | None -> ()) 117 | Some c1 -> 118 push_back_char t c1 119 | None -> () 120 end else if b0 >= 0xE0 && b0 <= 0xEF then begin 121 (* 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx -> U+0800 to U+FFFF *) 122 (* Need to peek 2 continuation bytes *) 123 match read_raw_char t with 124 | Some c1 when (Char.code c1 land 0xC0) = 0x80 -> 125 let b1 = Char.code c1 in 126 (match read_raw_char t with 127 | Some c2 when (Char.code c2 land 0xC0) = 0x80 -> 128 let b2 = Char.code c2 in 129 let cp = ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F) in 130 push_back_char t c2; 131 push_back_char t c1; 132 (* Check for surrogates and noncharacters *) 133 (match t.error_callback with 134 | Some cb -> 135 if is_surrogate cp then cb (Printf.sprintf "surrogate-in-input-stream:%04x" cp) 136 else if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%04x" cp) 137 | None -> ()) 138 | Some c2 -> 139 push_back_char t c2; 140 push_back_char t c1 141 | None -> 142 push_back_char t c1) 143 | Some c1 -> 144 push_back_char t c1 145 | None -> () 146 end else if b0 >= 0xF0 && b0 <= 0xF4 then begin 147 (* 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -> U+10000 to U+10FFFF *) 148 match read_raw_char t with 149 | Some c1 when (Char.code c1 land 0xC0) = 0x80 -> 150 let b1 = Char.code c1 in 151 (match read_raw_char t with 152 | Some c2 when (Char.code c2 land 0xC0) = 0x80 -> 153 let b2 = Char.code c2 in 154 (match read_raw_char t with 155 | Some c3 when (Char.code c3 land 0xC0) = 0x80 -> 156 let b3 = Char.code c3 in 157 let cp = ((b0 land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) lor 158 ((b2 land 0x3F) lsl 6) lor (b3 land 0x3F) in 159 push_back_char t c3; 160 push_back_char t c2; 161 push_back_char t c1; 162 (* Check for noncharacters (no surrogates in 4-byte range) *) 163 (match t.error_callback with 164 | Some cb -> 165 if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%05x" cp) 166 | None -> ()) 167 | Some c3 -> 168 push_back_char t c3; 169 push_back_char t c2; 170 push_back_char t c1 171 | None -> 172 push_back_char t c2; 173 push_back_char t c1) 174 | Some c2 -> 175 push_back_char t c2; 176 push_back_char t c1 177 | None -> 178 push_back_char t c1) 179 | Some c1 -> 180 push_back_char t c1 181 | None -> () 182 end 183 184(* Read next char with CR/LF normalization *) 185let rec read_normalized_char t = 186 (* Track if we're reading from lookahead - if so, we've already checked this byte *) 187 let from_lookahead = t.lookahead <> [] in 188 match read_raw_char t with 189 | None -> 190 t.last_was_cr <- false; 191 None 192 | Some '\r' -> 193 t.last_was_cr <- true; 194 (* Immediately consume following LF if present (CRLF -> single LF) *) 195 (match read_raw_char t with 196 | Some '\n' -> () (* Consume the LF that follows CR *) 197 | Some c -> push_back_char t c (* Put non-LF char back *) 198 | None -> ()); 199 Some '\n' (* CR (or CRLF) becomes single LF *) 200 | Some '\n' when t.last_was_cr -> 201 (* Skip LF after CR - it was already converted *) 202 t.last_was_cr <- false; 203 read_normalized_char t 204 | Some c -> 205 t.last_was_cr <- false; 206 (* Only check for surrogates/noncharacters when reading fresh from stream, 207 not when re-reading from lookahead (to avoid duplicate errors) *) 208 if not from_lookahead then check_utf8_codepoint t c; 209 Some c 210 211let is_eof t = 212 t.lookahead = [] && 213 (Bytes.Slice.is_eod t.current_slice || 214 (t.slice_pos >= Bytes.Slice.length t.current_slice && 215 (let next = Bytes.Reader.read t.reader in 216 t.current_slice <- next; 217 t.slice_pos <- 0; 218 Bytes.Slice.is_eod next))) 219 220let peek t = 221 (* Save last_was_cr state before reading *) 222 let saved_last_was_cr = t.last_was_cr in 223 match read_normalized_char t with 224 | None -> None 225 | Some c -> 226 push_back_char t c; 227 (* Restore the last_was_cr state so advance handles CR/LF correctly *) 228 t.last_was_cr <- saved_last_was_cr; 229 Some c 230 231(* Read n characters into a list, returns (chars_read, all_read_successfully) *) 232let peek_chars t n = 233 let rec collect acc remaining = 234 if remaining <= 0 then (List.rev acc, true) 235 else match read_normalized_char t with 236 | None -> (List.rev acc, false) (* Not enough chars available *) 237 | Some c -> collect (c :: acc) (remaining - 1) 238 in 239 let (chars, success) = collect [] n in 240 (* Always push back characters we read, in reverse order *) 241 List.iter (push_back_char t) (List.rev chars); 242 t.last_was_cr <- false; 243 (chars, success) 244 245(* peek_n returns Some string only when exactly n chars are available 246 Avoid using this in hot paths - prefer peek_chars + direct comparison *) 247let peek_n t n = 248 let (chars, success) = peek_chars t n in 249 if success then 250 Some (String.init n (fun i -> List.nth chars i)) 251 else 252 None 253 254let advance t = 255 match read_normalized_char t with 256 | None -> () 257 | Some c -> 258 (* Update position tracking *) 259 if c = '\n' then begin 260 t.line <- t.line + 1; 261 t.column <- 0 262 end else 263 t.column <- t.column + 1 264 265let consume t = 266 let c = peek t in 267 advance t; 268 c 269 270let consume_if t pred = 271 match peek t with 272 | Some c when pred c -> advance t; Some c 273 | _ -> None 274 275let consume_while t pred = 276 let buf = Buffer.create 16 in 277 let rec loop () = 278 match peek t with 279 | Some c when pred c -> 280 Buffer.add_char buf c; 281 advance t; 282 loop () 283 | _ -> () 284 in 285 loop (); 286 Buffer.contents buf 287 288(* Case-insensitive match without allocating a string 289 Compares directly with the char list from peek_chars *) 290let matches_ci t s = 291 let slen = String.length s in 292 let (chars, success) = peek_chars t slen in 293 if not success then false 294 else begin 295 let rec check chars_remaining i = 296 match chars_remaining with 297 | [] -> i >= slen (* Matched all *) 298 | c :: rest -> 299 if i >= slen then true 300 else 301 let c1 = Char.lowercase_ascii c in 302 let c2 = Char.lowercase_ascii (String.unsafe_get s i) in 303 if c1 = c2 then check rest (i + 1) 304 else false 305 in 306 check chars 0 307 end 308 309let consume_exact_ci t s = 310 if matches_ci t s then begin 311 for _ = 1 to String.length s do advance t done; 312 true 313 end else false 314 315let reconsume t = 316 (* Move back one position - simplified, doesn't handle CR/LF properly for reconsume *) 317 (* This is called after advance, so we just need to push back a placeholder *) 318 (* The tokenizer will call peek again which will get the right character *) 319 (* Actually, for reconsume we need to track what we last consumed *) 320 (* For now, just adjust column *) 321 if t.column > 0 then t.column <- t.column - 1