OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** Srcset and sizes attribute validation checker. *)
2
3(** Quote helper for consistent message formatting. *)
4let q = Error_code.q
5
6(** Valid CSS length units for sizes attribute *)
7let valid_length_units = [
8 "em"; "ex"; "ch"; "rem"; "cap"; "ic";
9 "vw"; "svw"; "lvw"; "dvw"; "vh"; "svh"; "lvh"; "dvh";
10 "vi"; "svi"; "lvi"; "dvi"; "vb"; "svb"; "lvb"; "dvb";
11 "vmin"; "svmin"; "lvmin"; "dvmin"; "vmax"; "svmax"; "lvmax"; "dvmax";
12 "cm"; "mm"; "q"; "in"; "pc"; "pt"; "px"
13]
14
15type state = unit [@@warning "-34"]
16
17let create () = ()
18let reset _state = ()
19
20(** Split string on a character while respecting parentheses *)
21let split_respecting_parens ~sep s =
22 let len = String.length s in
23 let result = ref [] in
24 let current = Buffer.create 64 in
25 let depth = ref 0 in
26 for i = 0 to len - 1 do
27 let c = s.[i] in
28 if c = '(' then begin
29 incr depth;
30 Buffer.add_char current c
31 end else if c = ')' then begin
32 decr depth;
33 Buffer.add_char current c
34 end else if c = sep && !depth = 0 then begin
35 result := Buffer.contents current :: !result;
36 Buffer.clear current
37 end else
38 Buffer.add_char current c
39 done;
40 (* Add the last segment *)
41 result := Buffer.contents current :: !result;
42 List.rev !result
43
44(** Split string on commas while respecting parentheses *)
45let split_on_comma_respecting_parens s = split_respecting_parens ~sep:',' s
46
47(** Split on commas respecting BALANCED parentheses only (for srcset).
48 If parens are unbalanced overall, just split on all commas. *)
49let split_on_comma_balanced_parens s =
50 (* First, check if parens are balanced overall *)
51 let opens = ref 0 and closes = ref 0 in
52 String.iter (fun c -> if c = '(' then incr opens else if c = ')' then incr closes) s;
53 if !opens <> !closes then
54 (* Unbalanced parens - just split on all commas *)
55 String.split_on_char ',' s
56 else
57 (* Balanced parens - respect them during split *)
58 split_on_comma_respecting_parens s
59
60(** Split string on spaces while respecting parentheses, filtering empty segments *)
61let split_on_space_respecting_parens s =
62 split_respecting_parens ~sep:' ' s |> List.filter (fun s -> s <> "")
63
64(** Invalid units that are not CSS lengths but might be confused for them *)
65let invalid_size_units = [
66 "deg"; "grad"; "rad"; "turn"; (* angle units *)
67 "s"; "ms"; (* time units *)
68 "hz"; "khz"; (* frequency units *)
69 "dpi"; "dpcm"; "dppx"; (* resolution units *)
70 "%" (* percentage - not valid in sizes *)
71]
72
73(** Strip CSS comments from a value *)
74let strip_css_comments s =
75 let buf = Buffer.create (String.length s) in
76 let len = String.length s in
77 let i = ref 0 in
78 while !i < len do
79 if !i + 1 < len && s.[!i] = '/' && s.[!i + 1] = '*' then begin
80 (* Start of comment, find end *)
81 i := !i + 2;
82 while !i + 1 < len && not (s.[!i] = '*' && s.[!i + 1] = '/') do
83 incr i
84 done;
85 if !i + 1 < len then i := !i + 2
86 end else begin
87 Buffer.add_char buf s.[!i];
88 incr i
89 end
90 done;
91 Buffer.contents buf
92
93(** Check if a size value has a valid CSS length unit and non-negative value *)
94type size_check_result =
95 | Valid
96 | InvalidUnit of string * string (* (found_unit, context) *)
97 | NegativeValue
98 | CssCommentAfterSign of string * string (* what was found, context *)
99 | CssCommentBeforeUnit of string * string (* what was found, context *)
100 | BadScientificNotation
101 | BadCssNumber of char * string (* (first_char, context) - not starting with digit or minus *)
102
103(** CSS comment error types *)
104type css_comment_error =
105 | NoCommentError
106 | CommentAfterSign of string * string (* what was found, context *)
107 | CommentBetweenNumberAndUnit of string * string (* what was found at comment position, context *)
108
109(** Check if CSS comment appears in an invalid position:
110 - Between sign and number (+/**/50vw)
111 - Between number and unit (50/**/vw)
112 Trailing comments (50vw/**/) are valid. *)
113let check_css_comment_position s =
114 let len = String.length s in
115 (* Find comment position *)
116 let rec find_comment i =
117 if i + 1 >= len then None
118 else if s.[i] = '/' && s.[i + 1] = '*' then Some i
119 else find_comment (i + 1)
120 in
121 match find_comment 0 with
122 | None -> NoCommentError
123 | Some comment_pos ->
124 let before = String.sub s 0 comment_pos in
125 let trimmed_before = String.trim before in
126 if String.length trimmed_before = 0 then NoCommentError (* Leading comment is OK *)
127 else begin
128 (* Find end of comment *)
129 let rec find_end i =
130 if i + 1 >= len then len
131 else if s.[i] = '*' && s.[i + 1] = '/' then i + 2
132 else find_end (i + 1)
133 in
134 let end_pos = find_end (comment_pos + 2) in
135 let after = if end_pos < len then String.sub s end_pos (len - end_pos) else "" in
136 let trimmed_after = String.trim (strip_css_comments after) in
137 if trimmed_after = "" then NoCommentError (* Trailing comment is OK *)
138 else begin
139 (* Comment is in the middle - check if it breaks a number/unit combo *)
140 let last = trimmed_before.[String.length trimmed_before - 1] in
141 (* What's at the comment position? Just show "/" *)
142 let slash = "/" in
143 (* Invalid if comment appears after +/- *)
144 if last = '+' || last = '-' then
145 CommentAfterSign (trimmed_before ^ slash, s)
146 (* Invalid if comment appears after digit (before more content) *)
147 else if (last >= '0' && last <= '9') || last = '.' then
148 CommentBetweenNumberAndUnit (slash ^ trimmed_after, s)
149 else
150 NoCommentError
151 end
152 end
153
154(** Check if scientific notation has invalid exponent (like 1e+1.5 - decimal in exponent) *)
155let has_invalid_scientific_notation s =
156 let lower = Astring.String.Ascii.lowercase s in
157 (* Find 'e' for scientific notation *)
158 match String.index_opt lower 'e' with
159 | None -> false
160 | Some e_pos ->
161 (* Check if there's a decimal after the exponent sign *)
162 let after_e = String.sub lower (e_pos + 1) (String.length lower - e_pos - 1) in
163 let after_sign =
164 if String.length after_e > 0 && (after_e.[0] = '+' || after_e.[0] = '-') then
165 String.sub after_e 1 (String.length after_e - 1)
166 else after_e
167 in
168 String.contains after_sign '.'
169
170(** Extract unit from a size value like "10px" -> "px", "100vw" -> "vw", "50%" -> "%"
171 Returns the unit with original case preserved *)
172let extract_unit s =
173 let trimmed = String.trim s in
174 let len = String.length trimmed in
175 if len = 0 then ""
176 (* Check for % at the end *)
177 else if trimmed.[len - 1] = '%' then "%"
178 else begin
179 let lower = Astring.String.Ascii.lowercase trimmed in
180 (* Try to find a unit at the end (letters only) *)
181 let rec find_unit_length i =
182 if i < 0 then 0
183 else if lower.[i] >= 'a' && lower.[i] <= 'z' then find_unit_length (i - 1)
184 else i + 1
185 in
186 let start = find_unit_length (len - 1) in
187 if start < len then
188 (* Return the unit from the original string (preserving case) *)
189 String.sub trimmed start (len - start)
190 else ""
191 end
192
193let check_size_value size_value =
194 let trimmed = String.trim size_value in
195 if trimmed = "" then InvalidUnit ("", trimmed)
196 else begin
197 (* Check for CSS comments inside numbers - this is invalid *)
198 match check_css_comment_position trimmed with
199 | CommentAfterSign (found, ctx) -> CssCommentAfterSign (found, ctx)
200 | CommentBetweenNumberAndUnit (found, ctx) -> CssCommentBeforeUnit (found, ctx)
201 | NoCommentError ->
202 (* Strip valid leading/trailing CSS comments for further checks *)
203 let value_no_comments = String.trim (strip_css_comments trimmed) in
204 (* Check for invalid scientific notation like 1e+1.5px *)
205 if has_invalid_scientific_notation value_no_comments then BadScientificNotation
206 (* "auto" is only valid with lazy loading, which requires checking the element context.
207 For general validation, treat "auto" alone as invalid in sizes. *)
208 else if Astring.String.Ascii.lowercase value_no_comments = "auto" then
209 BadCssNumber (value_no_comments.[0], trimmed)
210 else if value_no_comments = "" then InvalidUnit ("", trimmed)
211 else begin
212 let lower = Astring.String.Ascii.lowercase value_no_comments in
213 (* Check for calc() or other CSS functions first - these are always valid *)
214 if String.contains value_no_comments '(' then Valid
215 else begin
216 (* Check if the value starts with a digit, minus, or plus sign *)
217 let first_char = value_no_comments.[0] in
218 let starts_with_number =
219 (first_char >= '0' && first_char <= '9') ||
220 first_char = '-' ||
221 first_char = '+' ||
222 first_char = '.' (* decimal point like .5px *)
223 in
224 if not starts_with_number then
225 (* Not a valid CSS number token - doesn't start with digit or sign *)
226 BadCssNumber (first_char, trimmed)
227 else begin
228 (* Check for invalid units first *)
229 let found_invalid = List.find_opt (fun unit ->
230 let len = String.length unit in
231 String.length lower > len &&
232 String.sub lower (String.length lower - len) len = unit
233 ) invalid_size_units in
234 match found_invalid with
235 | Some _unit -> InvalidUnit (extract_unit value_no_comments, trimmed)
236 | None ->
237 (* Check for valid CSS length units *)
238 let has_valid_unit = List.exists (fun unit ->
239 let len = String.length unit in
240 String.length lower > len &&
241 String.sub lower (String.length lower - len) len = unit
242 ) valid_length_units in
243 if has_valid_unit then begin
244 (* Check if it's negative (starts with - but not -0) *)
245 if String.length value_no_comments > 0 && value_no_comments.[0] = '-' then begin
246 (* Check if it's -0 which is valid *)
247 let after_minus = String.sub value_no_comments 1 (String.length value_no_comments - 1) in
248 try
249 let num_str = Str.global_replace (Str.regexp "[a-zA-Z]+$") "" after_minus in
250 let f = float_of_string num_str in
251 if f = 0.0 then Valid else NegativeValue
252 with _ -> NegativeValue
253 end else
254 Valid
255 end
256 else begin
257 (* Check if it's a zero value (0, -0, +0) - these are valid without units *)
258 let stripped =
259 let s = value_no_comments in
260 let s = if String.length s > 0 && (s.[0] = '+' || s.[0] = '-') then String.sub s 1 (String.length s - 1) else s in
261 s
262 in
263 (* Check if it's zero or a numeric value starting with 0 *)
264 try
265 let f = float_of_string stripped in
266 if f = 0.0 then Valid else InvalidUnit (extract_unit value_no_comments, trimmed)
267 with _ -> InvalidUnit (extract_unit value_no_comments, trimmed)
268 end
269 end
270 end
271 end
272 end
273
274(** Check if a sizes entry has a media condition (starts with '(') *)
275let has_media_condition entry =
276 let trimmed = String.trim entry in
277 String.length trimmed > 0 && trimmed.[0] = '('
278
279(** Check if entry looks like it's trying to be a media condition but isn't properly formatted *)
280let has_invalid_media_condition entry =
281 let trimmed = String.trim entry in
282 if String.length trimmed = 0 then None
283 else begin
284 let first_char = trimmed.[0] in
285 if first_char = '(' then begin
286 (* Check for bad content inside the media condition *)
287 let len = String.length trimmed in
288 let rec find_close_paren i depth =
289 if i >= len then None
290 else match trimmed.[i] with
291 | '(' -> find_close_paren (i + 1) (depth + 1)
292 | ')' -> if depth = 1 then Some i else find_close_paren (i + 1) (depth - 1)
293 | _ -> find_close_paren (i + 1) depth
294 in
295 match find_close_paren 0 0 with
296 | None -> Some "Unclosed media condition"
297 | Some close_pos ->
298 let inner = String.sub trimmed 1 (close_pos - 1) in
299 let inner_trimmed = String.trim inner in
300 (* Check for obviously invalid content like just numbers or curly braces *)
301 if String.length inner_trimmed > 0 then begin
302 let first_inner = inner_trimmed.[0] in
303 if first_inner >= '0' && first_inner <= '9' then
304 Some "Bad media condition: Parse Error"
305 else if String.contains inner_trimmed '}' || String.contains inner_trimmed '{' then
306 Some "Bad media condition: Parse Error"
307 else
308 None
309 end else
310 Some "Bad media condition: Parse Error"
311 end else begin
312 (* Check for bare "all" which is invalid *)
313 let lower = Astring.String.Ascii.lowercase trimmed in
314 let parts = String.split_on_char ' ' lower |> List.filter (fun s -> s <> "") in
315 match parts with
316 | keyword :: _ when keyword = "all" ->
317 Some "Bad media condition: Parse Error"
318 | keyword :: _ when String.length keyword > 0 && not (keyword.[0] >= '0' && keyword.[0] <= '9') ->
319 (* Looks like a keyword without parens like "min-width:500px" *)
320 if String.contains keyword ':' then
321 Some "Bad media condition: Parse Error"
322 else
323 None
324 | _ -> None
325 end
326 end
327
328(** Extract the size value from a sizes entry (after media condition if any) *)
329let extract_size_value entry =
330 let trimmed = String.trim entry in
331 if not (has_media_condition trimmed) then
332 trimmed
333 else begin
334 (* Media conditions can have "and", "or", "not" operators connecting
335 multiple parenthesized groups, e.g., "(not (width:500px)) and (width:500px) 500px"
336 We need to skip all media condition parts to find the size value *)
337 let len = String.length trimmed in
338 let rec skip_media_condition i =
339 if i >= len then len
340 else begin
341 let remaining = String.trim (String.sub trimmed i (len - i)) in
342 let remaining_len = String.length remaining in
343 if remaining_len = 0 then len
344 else begin
345 let first_char = remaining.[0] in
346 if first_char = '(' then begin
347 (* Skip this parenthesized group *)
348 let rec find_close_paren j depth =
349 if j >= remaining_len then remaining_len
350 else match remaining.[j] with
351 | '(' -> find_close_paren (j + 1) (depth + 1)
352 | ')' -> if depth = 1 then j + 1 else find_close_paren (j + 1) (depth - 1)
353 | _ -> find_close_paren (j + 1) depth
354 in
355 let after_paren = find_close_paren 0 0 in
356 let new_pos = i + (len - i) - remaining_len + after_paren in
357 skip_media_condition new_pos
358 end
359 else begin
360 (* Check if remaining starts with "and", "or", "not" followed by space or paren *)
361 let lower_remaining = Astring.String.Ascii.lowercase remaining in
362 if remaining_len >= 4 && String.sub lower_remaining 0 4 = "and " then
363 skip_media_condition (i + (len - i) - remaining_len + 4)
364 else if remaining_len >= 3 && String.sub lower_remaining 0 3 = "or " then
365 skip_media_condition (i + (len - i) - remaining_len + 3)
366 else if remaining_len >= 4 && String.sub lower_remaining 0 4 = "not " then
367 skip_media_condition (i + (len - i) - remaining_len + 4)
368 else if remaining_len >= 4 && String.sub lower_remaining 0 4 = "and(" then
369 skip_media_condition (i + (len - i) - remaining_len + 3)
370 else if remaining_len >= 3 && String.sub lower_remaining 0 3 = "or(" then
371 skip_media_condition (i + (len - i) - remaining_len + 2)
372 else if remaining_len >= 4 && String.sub lower_remaining 0 4 = "not(" then
373 skip_media_condition (i + (len - i) - remaining_len + 3)
374 else
375 (* Found something that's not a media condition part - this is the size value *)
376 i + (len - i) - remaining_len
377 end
378 end
379 end
380 in
381 let size_start = skip_media_condition 0 in
382 if size_start >= len then ""
383 else String.trim (String.sub trimmed size_start (len - size_start))
384 end
385
386(** Validate sizes attribute value *)
387let validate_sizes value element_name collector =
388 (* Empty sizes is invalid *)
389 if String.trim value = "" then begin
390 Message_collector.add_typed collector
391 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Must not be empty." (q "") (q "sizes") (q element_name)))));
392 false
393 end else begin
394 (* Split on comma and check each entry *)
395 let entries = String.split_on_char ',' value in
396 let first_entry = String.trim (List.hd entries) in
397
398 (* Check if starts with comma (empty first entry) *)
399 if first_entry = "" then begin
400 Message_collector.add_typed collector
401 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Starts with empty source size." (q value) (q "sizes") (q element_name)))));
402 false
403 end else begin
404 (* Check for trailing comma *)
405 let last_entry = String.trim (List.nth entries (List.length entries - 1)) in
406 if List.length entries > 1 && last_entry = "" then begin
407 Message_collector.add_typed collector
408 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Expected media condition before %s at %s." (q value) (q "sizes") (q element_name) (q "") (q value)))));
409 false
410 end else begin
411 let valid = ref true in
412
413 (* Check for default-first pattern: unconditional value before conditional ones *)
414 let non_empty_entries = List.filter (fun e -> String.trim e <> "") entries in
415 (* Filter out entries that have invalid media conditions - they'll be reported separately *)
416 let valid_entries = List.filter (fun e ->
417 has_invalid_media_condition (String.trim e) = None
418 ) non_empty_entries in
419 if List.length valid_entries > 1 then begin
420 let first = List.hd valid_entries in
421 let rest = List.tl valid_entries in
422 (* If first entry has no media condition but later ones do, that's invalid *)
423 if not (has_media_condition first) && List.exists has_media_condition rest then begin
424 (* Context is the first entry with a comma *)
425 let context = (String.trim first) ^ "," in
426 Message_collector.add_typed collector
427 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Expected media condition before %s at %s." (q value) (q "sizes") (q element_name) (q "") (q context)))));
428 valid := false
429 end;
430 (* Check for multiple entries without media conditions.
431 When the first entry has no media condition, report "Expected media condition"
432 regardless of whether later entries have media conditions or not *)
433 if not (has_media_condition first) && !valid then begin
434 (* Only report if we haven't already reported the default-first error *)
435 if not (List.exists has_media_condition rest) then begin
436 (* Multiple defaults - report as "Expected media condition" *)
437 let context = (String.trim first) ^ "," in
438 Message_collector.add_typed collector
439 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Expected media condition before %s at %s." (q value) (q "sizes") (q element_name) (q "") (q context)))));
440 valid := false
441 end
442 end
443 end;
444
445 (* Validate each entry's media condition and size value *)
446 let num_entries = List.length entries in
447 List.iteri (fun idx entry ->
448 let trimmed = String.trim entry in
449 if trimmed <> "" then begin
450 (* Check for invalid media condition *)
451 (match has_invalid_media_condition trimmed with
452 | Some err_msg ->
453 let context = trimmed ^ "," in
454 Message_collector.add_typed collector
455 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: %s at %s." (q value) (q "sizes") (q element_name) err_msg (q context)))));
456 valid := false
457 | None -> ());
458
459 let size_val = extract_size_value trimmed in
460 if size_val <> "" then begin
461 (* Check if there are multiple space-separated words in the size value.
462 Only the first word should be the size, rest is junk. *)
463 let size_parts = String.split_on_char ' ' size_val |> List.filter (fun s -> s <> "") in
464 let first_size = match size_parts with [] -> size_val | hd :: _ -> hd in
465 let extra_parts = match size_parts with [] -> [] | _ :: tl -> tl in
466
467 (* Check if first word looks like it should have been a media condition
468 (doesn't start with digit, sign, decimal, '/', or look like a CSS function) *)
469 let first_char = if String.length first_size > 0 then first_size.[0] else 'x' in
470 let has_paren = String.contains size_val '(' in (* calc(), etc. *)
471 let looks_like_junk_entry =
472 not (has_media_condition trimmed) &&
473 not has_paren && (* Allow CSS functions like calc() *)
474 not (first_char = '/') && (* Allow leading CSS comments *)
475 not ((first_char >= '0' && first_char <= '9') ||
476 first_char = '+' || first_char = '-' || first_char = '.')
477 in
478
479 (* If this entry looks like junk and there are multiple entries,
480 report "Expected media condition" instead of "Bad CSS number".
481 For single entries with invalid values, fall through to BadCssNumber. *)
482 if looks_like_junk_entry && num_entries > 1 then begin
483 (* Find the context ending with the previous entry *)
484 let prev_entries = List.filter (fun e -> String.trim e <> "" && e <> entry) entries in
485 let context =
486 if List.length prev_entries > 0 then
487 String.concat ", " (List.map String.trim prev_entries) ^ ","
488 else value
489 in
490 Message_collector.add_typed collector
491 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Expected media condition before %s at %s." (q value) (q "sizes") (q element_name) (q "") (q context)))));
492 valid := false
493 end
494 (* If there's extra junk after the size, report BadCssNumber error for it *)
495 else if extra_parts <> [] then begin
496 let last_junk = List.nth extra_parts (List.length extra_parts - 1) in
497 let first_char = if String.length last_junk > 0 then last_junk.[0] else 'x' in
498 let is_last_entry = idx = num_entries - 1 in
499 let context =
500 if is_last_entry then value
501 else trimmed ^ ","
502 in
503 Message_collector.add_typed collector
504 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Bad CSS number token: Expected a minus sign or a digit but saw %s instead at %s." (q value) (q "sizes") (q element_name) (q (String.make 1 first_char)) (q context)))));
505 valid := false
506 end
507 else
508 match check_size_value first_size with
509 | Valid -> ()
510 | NegativeValue ->
511 let full_context =
512 if List.length entries > 1 then size_val ^ ","
513 else size_val
514 in
515 let _ = full_context in
516 Message_collector.add_typed collector
517 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Expected positive size value but found %s at %s." (q value) (q "sizes") (q element_name) (q size_val) (q size_val)))));
518 valid := false
519 | CssCommentAfterSign (found, context) ->
520 (* e.g., +/**/50vw - expected number after sign *)
521 Message_collector.add_typed collector
522 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Expected number but found %s at %s." (q value) (q "sizes") (q element_name) (q found) (q context)))));
523 valid := false
524 | CssCommentBeforeUnit (found, context) ->
525 (* e.g., 50/**/vw - expected units after number *)
526 let units_list = List.map q valid_length_units in
527 let units_str = String.concat ", " units_list in
528 Message_collector.add_typed collector
529 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Expected units (one of %s) but found %s at %s." (q value) (q "sizes") (q element_name) units_str (q found) (q context)))));
530 valid := false
531 | BadScientificNotation ->
532 (* For scientific notation with bad exponent, show what char was expected vs found *)
533 let context =
534 if List.length entries > 1 then trimmed ^ ","
535 else trimmed
536 in
537 (* Find the period in the exponent *)
538 let _ = context in
539 Message_collector.add_typed collector
540 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Bad CSS number token: Expected a digit but saw %s instead at %s." (q value) (q "sizes") (q element_name) (q ".") (q size_val)))));
541 valid := false
542 | BadCssNumber (first_char, context) ->
543 (* Value doesn't start with a digit or minus sign *)
544 let full_context =
545 if List.length entries > 1 then context ^ ","
546 else context
547 in
548 let _ = full_context in
549 Message_collector.add_typed collector
550 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Bad CSS number token: Expected a minus sign or a digit but saw %s instead at %s." (q value) (q "sizes") (q element_name) (q (String.make 1 first_char)) (q context)))));
551 valid := false
552 | InvalidUnit (found_unit, _context) ->
553 (* Generate the full list of expected units *)
554 let units_list = List.map q valid_length_units in
555 let units_str = String.concat ", " units_list in
556 (* Context should be the full entry, with comma only if there are multiple entries *)
557 let full_context =
558 if List.length entries > 1 then trimmed ^ ","
559 else trimmed
560 in
561 (* When found_unit is empty, say "no units" instead of quoting empty string *)
562 let found_str =
563 if found_unit = "" then "no units"
564 else q found_unit
565 in
566 Message_collector.add_typed collector
567 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Expected units (one of %s) but found %s at %s." (q value) (q "sizes") (q element_name) units_str found_str (q full_context)))));
568 valid := false
569 end
570 end
571 ) entries;
572
573 !valid
574 end
575 end
576 end
577
578(** Validate srcset descriptor *)
579let validate_srcset_descriptor desc element_name srcset_value has_sizes collector =
580 let desc_lower = Astring.String.Ascii.lowercase (String.trim desc) in
581 if String.length desc_lower = 0 then true
582 else begin
583 let last_char = desc_lower.[String.length desc_lower - 1] in
584 let num_part = String.sub desc_lower 0 (String.length desc_lower - 1) in
585
586 match last_char with
587 | 'w' ->
588 (* Width descriptor - must be positive integer, no leading + *)
589 let trimmed_desc = String.trim desc in
590 if String.length trimmed_desc > 0 && trimmed_desc.[0] = '+' then begin
591 (* Show just the number part (without the 'w') *)
592 let num_part_for_msg = String.sub trimmed_desc 0 (String.length trimmed_desc - 1) in
593 Message_collector.add_typed collector
594 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected number without leading plus sign but found %s at %s." (q srcset_value) (q "srcset") (q element_name) (q num_part_for_msg) (q srcset_value)))));
595 false
596 end else
597 (try
598 let n = int_of_string num_part in
599 if n <= 0 then begin
600 Message_collector.add_typed collector
601 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected number greater than zero but found %s at %s." (q srcset_value) (q "srcset") (q element_name) (q num_part) (q srcset_value)))));
602 false
603 end else begin
604 (* Check for uppercase W - compare original desc with lowercase version *)
605 let original_last = desc.[String.length desc - 1] in
606 if original_last = 'W' then begin
607 Message_collector.add_typed collector
608 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected width descriptor but found %s at %s. (When the %s attribute is present, all image candidate strings must specify a width.)" (q srcset_value) (q "srcset") (q element_name) (q desc) (q srcset_value) (q "sizes")))));
609 false
610 end else true
611 end
612 with _ ->
613 (* Check for scientific notation, decimal, or other non-integer values *)
614 if String.contains num_part 'e' || String.contains num_part 'E' || String.contains num_part '.' then begin
615 Message_collector.add_typed collector
616 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected integer but found %s at %s." (q srcset_value) (q "srcset") (q element_name) (q num_part) (q srcset_value)))));
617 false
618 end else begin
619 Message_collector.add_typed collector
620 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad srcset descriptor: Invalid width descriptor." (q srcset_value) (q "srcset") (q element_name)))));
621 false
622 end)
623 | 'x' ->
624 (* Pixel density descriptor - must be positive number, no leading + *)
625 let trimmed_desc = String.trim desc in
626 if String.length trimmed_desc > 0 && trimmed_desc.[0] = '+' then begin
627 (* Extract the number part including the plus sign *)
628 let num_with_plus = String.sub trimmed_desc 0 (String.length trimmed_desc - 1) in
629 Message_collector.add_typed collector
630 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected number without leading plus sign but found %s at %s." (q srcset_value) (q "srcset") (q element_name) (q num_with_plus) (q srcset_value)))));
631 false
632 end else begin
633 (try
634 let n = float_of_string num_part in
635 if Float.is_nan n then begin
636 (* NaN is not a valid float - report as parse error with first char from ORIGINAL desc *)
637 let trimmed_desc = String.trim desc in
638 let orig_num_part = String.sub trimmed_desc 0 (String.length trimmed_desc - 1) in
639 let first_char = if String.length orig_num_part > 0 then String.make 1 orig_num_part.[0] else "" in
640 Message_collector.add_typed collector
641 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad positive floating point number: Expected a digit but saw %s instead at %s." (q srcset_value) (q "srcset") (q element_name) (q first_char) (q srcset_value)))));
642 false
643 end else if n = 0.0 then begin
644 (* Check if it's -0 (starts with minus) - report as "greater than zero" error *)
645 let trimmed_desc = String.trim desc in
646 let orig_num_part = String.sub trimmed_desc 0 (String.length trimmed_desc - 1) in
647 if String.length orig_num_part > 0 && orig_num_part.[0] = '-' then begin
648 Message_collector.add_typed collector
649 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected number greater than zero but found %s at %s." (q srcset_value) (q "srcset") (q element_name) (q orig_num_part) (q srcset_value)))))
650 end else begin
651 Message_collector.add_typed collector
652 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad positive floating point number: Zero is not a valid positive floating point number at %s." (q srcset_value) (q "srcset") (q element_name) (q srcset_value)))))
653 end;
654 false
655 end else if n < 0.0 then begin
656 Message_collector.add_typed collector
657 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected number greater than zero but found %s at %s." (q srcset_value) (q "srcset") (q element_name) (q num_part) (q srcset_value)))));
658 false
659 end else if n = neg_infinity || n = infinity then begin
660 (* Infinity is not a valid float - report as parse error with first char from ORIGINAL desc *)
661 let trimmed_desc = String.trim desc in
662 let orig_num_part = String.sub trimmed_desc 0 (String.length trimmed_desc - 1) in
663 let first_char = if String.length orig_num_part > 0 then String.make 1 orig_num_part.[0] else "" in
664 Message_collector.add_typed collector
665 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad positive floating point number: Expected a digit but saw %s instead at %s." (q srcset_value) (q "srcset") (q element_name) (q first_char) (q srcset_value)))));
666 false
667 end else true
668 with _ ->
669 Message_collector.add_typed collector
670 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad srcset descriptor: Invalid density descriptor." (q srcset_value) (q "srcset") (q element_name)))));
671 false)
672 end
673 | 'h' ->
674 (* Height descriptor - not allowed *)
675 let trimmed_desc = String.trim desc in
676 (* Generate context: find where this entry appears *)
677 let context =
678 try
679 let pos = Str.search_forward (Str.regexp_string trimmed_desc) srcset_value 0 in
680 (* Get the entry context ending with comma *)
681 let search_from = max 0 (pos - 3) in
682 let comma_pos = try Str.search_forward (Str.regexp_string ",") srcset_value pos with Not_found -> String.length srcset_value - 1 in
683 let end_pos = min (comma_pos + 1) (String.length srcset_value) in
684 let len = end_pos - search_from in
685 if len > 0 then String.trim (String.sub srcset_value search_from len) else srcset_value
686 with Not_found | Invalid_argument _ -> srcset_value
687 in
688 if has_sizes then
689 Message_collector.add_typed collector
690 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected width descriptor but found %s at %s. (When the %s attribute is present, all image candidate strings must specify a width.)" (q srcset_value) (q "srcset") (q element_name) (q trimmed_desc) (q context) (q "sizes")))))
691 else
692 Message_collector.add_typed collector
693 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad srcset descriptor: Height descriptor %s is not allowed." (q srcset_value) (q "srcset") (q element_name) (q "h")))));
694 false
695 | _ ->
696 (* Unknown descriptor - find context in srcset_value *)
697 let trimmed_desc = String.trim desc in
698 (* Nu validator adds extra ')' after the last ')' if descriptor contains any '(' *)
699 let found_desc =
700 if String.contains trimmed_desc '(' then
701 (* Find position of last ')' and insert extra ')' after it *)
702 try
703 let last_close = String.rindex trimmed_desc ')' in
704 let before = String.sub trimmed_desc 0 (last_close + 1) in
705 let after = String.sub trimmed_desc (last_close + 1) (String.length trimmed_desc - last_close - 1) in
706 before ^ ")" ^ after
707 with Not_found -> trimmed_desc ^ ")"
708 else trimmed_desc
709 in
710 (* Find context: the entry containing the error with trailing comma *)
711 let context =
712 try
713 let pos = Str.search_forward (Str.regexp_string trimmed_desc) srcset_value 0 in
714 (* Get the context ending with the descriptor and the comma after *)
715 let end_pos = min (pos + String.length trimmed_desc + 1) (String.length srcset_value) in
716 String.trim (String.sub srcset_value 0 end_pos)
717 with Not_found -> srcset_value
718 in
719 Message_collector.add_typed collector
720 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected number followed by %s or %s but found %s at %s." (q srcset_value) (q "srcset") (q element_name) (q "w") (q "x") (q found_desc) (q context)))));
721 false
722 end
723
724(** Normalize descriptor for duplicate detection (e.g., 1x = 1.0x) *)
725let normalize_descriptor desc =
726 let desc_lower = Astring.String.Ascii.lowercase (String.trim desc) in
727 if String.length desc_lower = 0 then desc_lower
728 else
729 let last_char = desc_lower.[String.length desc_lower - 1] in
730 let num_part = String.sub desc_lower 0 (String.length desc_lower - 1) in
731 match last_char with
732 | 'x' ->
733 (* Normalize density to a float string for comparison *)
734 (try
735 let f = float_of_string num_part in
736 Printf.sprintf "%gx" f (* %g removes trailing zeros *)
737 with _ -> desc_lower)
738 | 'w' ->
739 (* Width should be integer, just return as-is *)
740 desc_lower
741 | _ -> desc_lower
742
743(** Parse and validate srcset attribute value *)
744let validate_srcset value element_name has_sizes collector =
745 (* Srcset entries are split on commas - only balanced parentheses prevent split *)
746 let entries = split_on_comma_balanced_parens value in
747 let has_w_descriptor = ref false in
748 let has_x_descriptor = ref false in
749 let no_descriptor_url = ref None in (* Track URL of first entry without width descriptor *)
750 let x_with_sizes_error_reported = ref false in (* Track if we already reported x-with-sizes error *)
751 let seen_descriptors = Hashtbl.create 8 in (* Track seen descriptor values -> first URL *)
752
753 (* Check for empty srcset *)
754 if String.trim value = "" then begin
755 Message_collector.add_typed collector
756 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Must contain one or more image candidate strings." (q value) (q "srcset") (q element_name)))))
757 end;
758
759 (* Check for leading comma *)
760 if String.length value > 0 && value.[0] = ',' then begin
761 Message_collector.add_typed collector
762 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Starts with empty image-candidate string." (q value) (q "srcset") (q element_name)))))
763 end;
764
765 (* Check for trailing comma(s) / empty entries *)
766 let trimmed_value = String.trim value in
767 if String.length trimmed_value > 0 && trimmed_value.[String.length trimmed_value - 1] = ',' then begin
768 (* Count consecutive trailing commas *)
769 let rec count_trailing_commas s idx count =
770 if idx < 0 then count
771 else if s.[idx] = ',' then count_trailing_commas s (idx - 1) (count + 1)
772 else if s.[idx] = ' ' || s.[idx] = '\t' then count_trailing_commas s (idx - 1) count
773 else count
774 in
775 let trailing_commas = count_trailing_commas trimmed_value (String.length trimmed_value - 1) 0 in
776 if trailing_commas > 1 then
777 (* Multiple trailing commas: "Empty image-candidate string at" *)
778 Message_collector.add_typed collector
779 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Empty image-candidate string at %s." (q value) (q "srcset") (q element_name) (q value)))))
780 else
781 (* Single trailing comma: "Ends with empty image-candidate string." *)
782 Message_collector.add_typed collector
783 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Ends with empty image-candidate string." (q value) (q "srcset") (q element_name)))))
784 end;
785
786 List.iter (fun entry ->
787 let entry = String.trim entry in
788 if entry <> "" then begin
789 (* Split entry into URL and optional descriptor - respect parentheses *)
790 let parts = split_on_space_respecting_parens entry in
791 (* Check if URL is valid *)
792 let check_srcset_url url =
793 (* Special schemes that require host/content after :// *)
794 let special_schemes = ["http"; "https"; "ftp"; "ws"; "wss"] in
795 (* Check for scheme-only URL like "http:" *)
796 let url_lower = Astring.String.Ascii.lowercase url in
797 List.iter (fun scheme ->
798 let scheme_colon = scheme ^ ":" in
799 if url_lower = scheme_colon then
800 Message_collector.add_typed collector
801 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad image-candidate URL: %s: Expected a slash (\"/\")." (q value) (q "srcset") (q element_name) (q url)))))
802 ) special_schemes
803 in
804 match parts with
805 | [] -> ()
806 | [url] ->
807 check_srcset_url url;
808 (* URL only = implicit 1x descriptor - only flag if explicit 1x also seen *)
809 if !no_descriptor_url = None then no_descriptor_url := Some url;
810 begin match Hashtbl.find_opt seen_descriptors "explicit-1x" with
811 | Some first_url ->
812 Message_collector.add_typed collector
813 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Density for image %s is identical to density for image %s." (q value) (q "srcset") (q element_name) (q url) (q first_url)))))
814 | None ->
815 Hashtbl.add seen_descriptors "implicit-1x" url
816 end
817 | url :: desc :: rest ->
818 (* Check URL for broken schemes *)
819 check_srcset_url url;
820 (* Check for extra junk - multiple descriptors are not allowed *)
821 if rest <> [] then begin
822 let extra_desc = List.hd rest in
823 Message_collector.add_typed collector
824 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected single descriptor but found extraneous descriptor %s at %s." (q value) (q "srcset") (q element_name) (q extra_desc) (q value)))))
825 end;
826
827 let desc_lower = Astring.String.Ascii.lowercase (String.trim desc) in
828 if String.length desc_lower > 0 then begin
829 let last_char = desc_lower.[String.length desc_lower - 1] in
830 if last_char = 'w' then has_w_descriptor := true
831 else if last_char = 'x' then begin
832 has_x_descriptor := true;
833 (* If sizes is present and we have an x descriptor, generate detailed error *)
834 if has_sizes && not !x_with_sizes_error_reported then begin
835 x_with_sizes_error_reported := true;
836 (* Build context:
837 - If entry has extra parts (multiple descriptors): show "url descriptor "
838 - Else if entry has trailing comma: show "url descriptor,"
839 - Else (last entry, no extra parts): show full srcset value *)
840 let trimmed_url = String.trim url in
841 let trimmed_desc = String.trim desc in
842 let entry_context =
843 if rest <> [] then
844 (* Entry has multiple descriptors - show URL + first descriptor + space *)
845 trimmed_url ^ " " ^ trimmed_desc ^ " "
846 else
847 (* Check if entry ends with comma in original value *)
848 let trimmed_entry = String.trim entry in
849 try
850 let entry_start = Str.search_forward (Str.regexp_string trimmed_url) value 0 in
851 let entry_end = entry_start + String.length trimmed_entry in
852 let has_trailing_comma = entry_end < String.length value && value.[entry_end] = ',' in
853 if has_trailing_comma then
854 (* Entry followed by comma - show "url descriptor," *)
855 trimmed_url ^ " " ^ trimmed_desc ^ ","
856 else
857 (* Last entry - show full srcset value *)
858 value
859 with Not_found ->
860 value
861 in
862 Message_collector.add_typed collector
863 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected width descriptor but found %s at %s. (When the %s attribute is present, all image candidate strings must specify a width.)" (q value) (q "srcset") (q element_name) (q trimmed_desc) (q entry_context) (q "sizes")))))
864 end
865 end;
866
867 (* Check for duplicate descriptors - use normalized form *)
868 let normalized = normalize_descriptor desc in
869 let is_1x = (normalized = "1x") in
870 let is_width = (last_char = 'w') in
871 let dup_type = if is_width then "Width" else "Density" in
872 begin match Hashtbl.find_opt seen_descriptors normalized with
873 | Some first_url ->
874 Message_collector.add_typed collector
875 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: %s for image %s is identical to %s for image %s." (q value) (q "srcset") (q element_name) dup_type (q url) (Astring.String.Ascii.lowercase dup_type) (q first_url)))))
876 | None ->
877 begin match (if is_1x then Hashtbl.find_opt seen_descriptors "implicit-1x" else None) with
878 | Some first_url ->
879 (* Explicit 1x conflicts with implicit 1x *)
880 Message_collector.add_typed collector
881 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: %s for image %s is identical to %s for image %s." (q value) (q "srcset") (q element_name) dup_type (q url) (Astring.String.Ascii.lowercase dup_type) (q first_url)))))
882 | None ->
883 Hashtbl.add seen_descriptors normalized url;
884 if is_1x then Hashtbl.add seen_descriptors "explicit-1x" url
885 end
886 end
887 end;
888
889 ignore (validate_srcset_descriptor desc element_name value has_sizes collector)
890 end
891 ) entries;
892
893 (* Check: if w descriptor used and no sizes, that's an error for img and source *)
894 if !has_w_descriptor && not has_sizes then
895 Message_collector.add_typed collector
896 (`Srcset `W_without_sizes);
897
898 (* Check: if sizes is present, all entries must have width descriptors *)
899 (match !no_descriptor_url with
900 | Some url when has_sizes ->
901 Message_collector.add_typed collector
902 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: No width specified for image %s. (When the %s attribute is present, all image candidate strings must specify a width.)" (q value) (q "srcset") (q element_name) (q url) (q "sizes")))))
903 | _ -> ());
904
905 (* Check: if sizes is present and srcset uses x descriptors, that's an error.
906 Only report if we haven't already reported the detailed error. *)
907 if has_sizes && !has_x_descriptor && not !x_with_sizes_error_reported then
908 Message_collector.add_typed collector
909 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: When the %s attribute is present, all image candidate strings must specify a width." (q value) (q "srcset") (q element_name) (q "sizes")))));
910
911 (* Check for mixing w and x descriptors *)
912 if !has_w_descriptor && !has_x_descriptor then
913 Message_collector.add_typed collector
914 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Mixing width and density descriptors is not allowed." (q value) (q "srcset") (q element_name)))))
915
916let start_element _state ~element collector =
917 match element.Element.tag with
918 | Tag.Svg "image" ->
919 (* SVG image elements should not have srcset *)
920 if Attr_utils.get_attr "srcset" element.Element.raw_attrs <> None then
921 Message_collector.add_typed collector
922 (`Attr (`Not_allowed (`Attr "srcset", `Elem "image")))
923 | Tag.Html (`Img | `Source as tag) ->
924 let name_lower = Tag.html_tag_to_string tag in
925 let attrs = element.raw_attrs in
926 let sizes_value = Attr_utils.get_attr "sizes" attrs in
927 let srcset_value = Attr_utils.get_attr "srcset" attrs in
928 let has_sizes = sizes_value <> None in
929 let has_srcset = srcset_value <> None in
930
931 (* Validate sizes if present *)
932 (match sizes_value with
933 | Some v -> ignore (validate_sizes v name_lower collector)
934 | None -> ());
935
936 (* Validate srcset if present *)
937 (match srcset_value with
938 | Some v -> validate_srcset v name_lower has_sizes collector
939 | None -> ());
940
941 (* Error: sizes without srcset on img *)
942 if name_lower = "img" && has_sizes && not has_srcset then
943 Message_collector.add_typed collector
944 (`Srcset `Sizes_without_srcset)
945 | _ -> () (* Other elements *)
946
947let end_element _state ~tag:_ _collector = ()
948
949let checker = Checker.make ~create ~reset ~start_element ~end_element ()