OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* HTML meta charset prescan per WHATWG spec *)
2
3(* Character classification using Astring *)
4let is_ascii_whitespace c = c = '\x09' || c = '\x0A' || c = '\x0C' || c = '\x0D' || c = '\x20'
5let is_ascii_alpha = Astring.Char.Ascii.is_letter
6
7let skip_whitespace data i len =
8 let j = ref i in
9 while !j < len && is_ascii_whitespace (Bytes.get data !j) do
10 incr j
11 done;
12 !j
13
14let strip_whitespace data start len =
15 let s = ref start in
16 let e = ref (start + len) in
17 while !s < !e && is_ascii_whitespace (Bytes.get data !s) do incr s done;
18 while !e > !s && is_ascii_whitespace (Bytes.get data (!e - 1)) do decr e done;
19 Bytes.sub_string data !s (!e - !s)
20
21let extract_charset_from_content content =
22 let len = String.length content in
23 (* Find "charset" *)
24 let rec find_charset i =
25 if i + 7 > len then None
26 else
27 let sub = String.lowercase_ascii (String.sub content i 7) in
28 if sub = "charset" then
29 let j = ref (i + 7) in
30 (* Skip whitespace *)
31 while !j < len && is_ascii_whitespace content.[!j] do incr j done;
32 if !j >= len || content.[!j] <> '=' then find_charset (i + 1)
33 else begin
34 incr j;
35 (* Skip whitespace after = *)
36 while !j < len && is_ascii_whitespace content.[!j] do incr j done;
37 if !j >= len then None
38 else
39 let quote =
40 if content.[!j] = '"' || content.[!j] = '\'' then begin
41 let q = content.[!j] in
42 incr j;
43 Some q
44 end else None
45 in
46 let start = !j in
47 (match quote with
48 | Some q ->
49 while !j < len && content.[!j] <> q do incr j done;
50 if !j >= len then None
51 else Some (String.sub content start (!j - start))
52 | None ->
53 while !j < len &&
54 not (is_ascii_whitespace content.[!j]) &&
55 content.[!j] <> ';' do
56 incr j
57 done;
58 Some (String.sub content start (!j - start)))
59 end
60 else find_charset (i + 1)
61 in
62 find_charset 0
63
64let prescan_for_meta_charset data =
65 let len = Bytes.length data in
66 let max_non_comment = 1024 in
67 let max_total = 65536 in
68 let i = ref 0 in
69 let non_comment = ref 0 in
70
71 let result = ref None in
72
73 while !result = None && !i < len && !i < max_total && !non_comment < max_non_comment do
74 if Bytes.get data !i <> '<' then begin
75 incr i;
76 incr non_comment
77 end else begin
78 (* Check for comment *)
79 if !i + 3 < len &&
80 Bytes.get data (!i + 1) = '!' &&
81 Bytes.get data (!i + 2) = '-' &&
82 Bytes.get data (!i + 3) = '-' then begin
83 (* Skip comment *)
84 let j = ref (!i + 4) in
85 while !j + 2 < len && not (
86 Bytes.get data !j = '-' &&
87 Bytes.get data (!j + 1) = '-' &&
88 Bytes.get data (!j + 2) = '>'
89 ) do incr j done;
90 if !j + 2 < len then
91 i := !j + 3
92 else
93 i := len (* Unclosed comment - stop scanning *)
94 end
95 (* Check for end tag - skip it *)
96 else if !i + 1 < len && Bytes.get data (!i + 1) = '/' then begin
97 let j = ref (!i + 2) in
98 let in_quote = ref None in
99 let done_tag = ref false in
100 while not !done_tag && !j < len && !j < max_total && !non_comment < max_non_comment do
101 let c = Bytes.get data !j in
102 match !in_quote with
103 | None ->
104 if c = '"' || c = '\'' then begin
105 in_quote := Some c;
106 incr j;
107 incr non_comment
108 end else if c = '>' then begin
109 incr j;
110 incr non_comment;
111 done_tag := true
112 end else begin
113 incr j;
114 incr non_comment
115 end
116 | Some q ->
117 if c = q then in_quote := None;
118 incr j;
119 incr non_comment
120 done;
121 i := !j
122 end
123 (* Check for tag *)
124 else if !i + 1 < len && is_ascii_alpha (Bytes.get data (!i + 1)) then begin
125 let j = ref (!i + 1) in
126 while !j < len && is_ascii_alpha (Bytes.get data !j) do incr j done;
127 let tag_name =
128 let name_bytes = Bytes.sub data (!i + 1) (!j - !i - 1) in
129 String.lowercase_ascii (Bytes.to_string name_bytes)
130 in
131
132 if tag_name <> "meta" then begin
133 (* Skip non-meta tag *)
134 let in_quote = ref None in
135 let done_tag = ref false in
136 while not !done_tag && !j < len && !j < max_total && !non_comment < max_non_comment do
137 let c = Bytes.get data !j in
138 match !in_quote with
139 | None ->
140 if c = '"' || c = '\'' then begin
141 in_quote := Some c;
142 incr j;
143 incr non_comment
144 end else if c = '>' then begin
145 incr j;
146 incr non_comment;
147 done_tag := true
148 end else begin
149 incr j;
150 incr non_comment
151 end
152 | Some q ->
153 if c = q then in_quote := None;
154 incr j;
155 incr non_comment
156 done;
157 i := !j
158 end else begin
159 (* Parse meta tag attributes *)
160 let charset = ref None in
161 let http_equiv = ref None in
162 let content = ref None in
163 let k = ref !j in
164 let saw_gt = ref false in
165
166 while not !saw_gt && !k < len && !k < max_total do
167 let c = Bytes.get data !k in
168 if c = '>' then begin
169 saw_gt := true;
170 incr k
171 end else if c = '<' then begin
172 (* Restart scanning from here *)
173 k := len
174 end else if is_ascii_whitespace c || c = '/' then begin
175 incr k
176 end else begin
177 (* Attribute name *)
178 let attr_start = !k in
179 while !k < len &&
180 not (is_ascii_whitespace (Bytes.get data !k)) &&
181 Bytes.get data !k <> '=' &&
182 Bytes.get data !k <> '>' &&
183 Bytes.get data !k <> '/' &&
184 Bytes.get data !k <> '<' do
185 incr k
186 done;
187 let attr_name =
188 String.lowercase_ascii (Bytes.sub_string data attr_start (!k - attr_start))
189 in
190 k := skip_whitespace data !k len;
191
192 let value = ref None in
193 if !k < len && Bytes.get data !k = '=' then begin
194 incr k;
195 k := skip_whitespace data !k len;
196 if !k < len then begin
197 let qc = Bytes.get data !k in
198 if qc = '"' || qc = '\'' then begin
199 incr k;
200 let val_start = !k in
201 while !k < len && Bytes.get data !k <> qc do incr k done;
202 if !k < len then begin
203 value := Some (Bytes.sub_string data val_start (!k - val_start));
204 incr k
205 end
206 end else begin
207 let val_start = !k in
208 while !k < len &&
209 not (is_ascii_whitespace (Bytes.get data !k)) &&
210 Bytes.get data !k <> '>' &&
211 Bytes.get data !k <> '<' do
212 incr k
213 done;
214 value := Some (Bytes.sub_string data val_start (!k - val_start))
215 end
216 end
217 end;
218
219 if attr_name = "charset" then
220 charset := !value
221 else if attr_name = "http-equiv" then
222 http_equiv := !value
223 else if attr_name = "content" then
224 content := !value
225 end
226 done;
227
228 if !saw_gt then begin
229 (* Check for charset *)
230 (match !charset with
231 | Some cs ->
232 (match Encoding_labels.normalize_meta_declared cs with
233 | Some enc -> result := Some enc
234 | None -> ())
235 | None -> ());
236
237 (* Check for http-equiv="content-type" with content *)
238 (* Note: http-equiv value must be exactly "content-type" (case-insensitive) *)
239 if !result = None then
240 (match !http_equiv, !content with
241 | Some he, Some ct when String.lowercase_ascii he = "content-type" ->
242 (match extract_charset_from_content ct with
243 | Some extracted ->
244 (match Encoding_labels.normalize_meta_declared extracted with
245 | Some enc -> result := Some enc
246 | None -> ())
247 | None -> ())
248 | _ -> ());
249
250 i := !k;
251 non_comment := !non_comment + (!k - !j)
252 end else begin
253 incr i;
254 incr non_comment
255 end
256 end
257 end else begin
258 incr i;
259 incr non_comment
260 end
261 end
262 done;
263
264 !result