OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 949 lines 51 kB view raw
1(** Srcset and sizes attribute validation checker. *) 2 3(** Quote helper for consistent message formatting. *) 4let q = Error_code.q 5 6(** Valid CSS length units for sizes attribute *) 7let valid_length_units = [ 8 "em"; "ex"; "ch"; "rem"; "cap"; "ic"; 9 "vw"; "svw"; "lvw"; "dvw"; "vh"; "svh"; "lvh"; "dvh"; 10 "vi"; "svi"; "lvi"; "dvi"; "vb"; "svb"; "lvb"; "dvb"; 11 "vmin"; "svmin"; "lvmin"; "dvmin"; "vmax"; "svmax"; "lvmax"; "dvmax"; 12 "cm"; "mm"; "q"; "in"; "pc"; "pt"; "px" 13] 14 15type state = unit [@@warning "-34"] 16 17let create () = () 18let reset _state = () 19 20(** Split string on a character while respecting parentheses *) 21let split_respecting_parens ~sep s = 22 let len = String.length s in 23 let result = ref [] in 24 let current = Buffer.create 64 in 25 let depth = ref 0 in 26 for i = 0 to len - 1 do 27 let c = s.[i] in 28 if c = '(' then begin 29 incr depth; 30 Buffer.add_char current c 31 end else if c = ')' then begin 32 decr depth; 33 Buffer.add_char current c 34 end else if c = sep && !depth = 0 then begin 35 result := Buffer.contents current :: !result; 36 Buffer.clear current 37 end else 38 Buffer.add_char current c 39 done; 40 (* Add the last segment *) 41 result := Buffer.contents current :: !result; 42 List.rev !result 43 44(** Split string on commas while respecting parentheses *) 45let split_on_comma_respecting_parens s = split_respecting_parens ~sep:',' s 46 47(** Split on commas respecting BALANCED parentheses only (for srcset). 48 If parens are unbalanced overall, just split on all commas. *) 49let split_on_comma_balanced_parens s = 50 (* First, check if parens are balanced overall *) 51 let opens = ref 0 and closes = ref 0 in 52 String.iter (fun c -> if c = '(' then incr opens else if c = ')' then incr closes) s; 53 if !opens <> !closes then 54 (* Unbalanced parens - just split on all commas *) 55 String.split_on_char ',' s 56 else 57 (* Balanced parens - respect them during split *) 58 split_on_comma_respecting_parens s 59 60(** Split string on spaces while respecting parentheses, filtering empty segments *) 61let split_on_space_respecting_parens s = 62 split_respecting_parens ~sep:' ' s |> List.filter (fun s -> s <> "") 63 64(** Invalid units that are not CSS lengths but might be confused for them *) 65let invalid_size_units = [ 66 "deg"; "grad"; "rad"; "turn"; (* angle units *) 67 "s"; "ms"; (* time units *) 68 "hz"; "khz"; (* frequency units *) 69 "dpi"; "dpcm"; "dppx"; (* resolution units *) 70 "%" (* percentage - not valid in sizes *) 71] 72 73(** Strip CSS comments from a value *) 74let strip_css_comments s = 75 let buf = Buffer.create (String.length s) in 76 let len = String.length s in 77 let i = ref 0 in 78 while !i < len do 79 if !i + 1 < len && s.[!i] = '/' && s.[!i + 1] = '*' then begin 80 (* Start of comment, find end *) 81 i := !i + 2; 82 while !i + 1 < len && not (s.[!i] = '*' && s.[!i + 1] = '/') do 83 incr i 84 done; 85 if !i + 1 < len then i := !i + 2 86 end else begin 87 Buffer.add_char buf s.[!i]; 88 incr i 89 end 90 done; 91 Buffer.contents buf 92 93(** Check if a size value has a valid CSS length unit and non-negative value *) 94type size_check_result = 95 | Valid 96 | InvalidUnit of string * string (* (found_unit, context) *) 97 | NegativeValue 98 | CssCommentAfterSign of string * string (* what was found, context *) 99 | CssCommentBeforeUnit of string * string (* what was found, context *) 100 | BadScientificNotation 101 | BadCssNumber of char * string (* (first_char, context) - not starting with digit or minus *) 102 103(** CSS comment error types *) 104type css_comment_error = 105 | NoCommentError 106 | CommentAfterSign of string * string (* what was found, context *) 107 | CommentBetweenNumberAndUnit of string * string (* what was found at comment position, context *) 108 109(** Check if CSS comment appears in an invalid position: 110 - Between sign and number (+/**/50vw) 111 - Between number and unit (50/**/vw) 112 Trailing comments (50vw/**/) are valid. *) 113let check_css_comment_position s = 114 let len = String.length s in 115 (* Find comment position *) 116 let rec find_comment i = 117 if i + 1 >= len then None 118 else if s.[i] = '/' && s.[i + 1] = '*' then Some i 119 else find_comment (i + 1) 120 in 121 match find_comment 0 with 122 | None -> NoCommentError 123 | Some comment_pos -> 124 let before = String.sub s 0 comment_pos in 125 let trimmed_before = String.trim before in 126 if String.length trimmed_before = 0 then NoCommentError (* Leading comment is OK *) 127 else begin 128 (* Find end of comment *) 129 let rec find_end i = 130 if i + 1 >= len then len 131 else if s.[i] = '*' && s.[i + 1] = '/' then i + 2 132 else find_end (i + 1) 133 in 134 let end_pos = find_end (comment_pos + 2) in 135 let after = if end_pos < len then String.sub s end_pos (len - end_pos) else "" in 136 let trimmed_after = String.trim (strip_css_comments after) in 137 if trimmed_after = "" then NoCommentError (* Trailing comment is OK *) 138 else begin 139 (* Comment is in the middle - check if it breaks a number/unit combo *) 140 let last = trimmed_before.[String.length trimmed_before - 1] in 141 (* What's at the comment position? Just show "/" *) 142 let slash = "/" in 143 (* Invalid if comment appears after +/- *) 144 if last = '+' || last = '-' then 145 CommentAfterSign (trimmed_before ^ slash, s) 146 (* Invalid if comment appears after digit (before more content) *) 147 else if (last >= '0' && last <= '9') || last = '.' then 148 CommentBetweenNumberAndUnit (slash ^ trimmed_after, s) 149 else 150 NoCommentError 151 end 152 end 153 154(** Check if scientific notation has invalid exponent (like 1e+1.5 - decimal in exponent) *) 155let has_invalid_scientific_notation s = 156 let lower = Astring.String.Ascii.lowercase s in 157 (* Find 'e' for scientific notation *) 158 match String.index_opt lower 'e' with 159 | None -> false 160 | Some e_pos -> 161 (* Check if there's a decimal after the exponent sign *) 162 let after_e = String.sub lower (e_pos + 1) (String.length lower - e_pos - 1) in 163 let after_sign = 164 if String.length after_e > 0 && (after_e.[0] = '+' || after_e.[0] = '-') then 165 String.sub after_e 1 (String.length after_e - 1) 166 else after_e 167 in 168 String.contains after_sign '.' 169 170(** Extract unit from a size value like "10px" -> "px", "100vw" -> "vw", "50%" -> "%" 171 Returns the unit with original case preserved *) 172let extract_unit s = 173 let trimmed = String.trim s in 174 let len = String.length trimmed in 175 if len = 0 then "" 176 (* Check for % at the end *) 177 else if trimmed.[len - 1] = '%' then "%" 178 else begin 179 let lower = Astring.String.Ascii.lowercase trimmed in 180 (* Try to find a unit at the end (letters only) *) 181 let rec find_unit_length i = 182 if i < 0 then 0 183 else if lower.[i] >= 'a' && lower.[i] <= 'z' then find_unit_length (i - 1) 184 else i + 1 185 in 186 let start = find_unit_length (len - 1) in 187 if start < len then 188 (* Return the unit from the original string (preserving case) *) 189 String.sub trimmed start (len - start) 190 else "" 191 end 192 193let check_size_value size_value = 194 let trimmed = String.trim size_value in 195 if trimmed = "" then InvalidUnit ("", trimmed) 196 else begin 197 (* Check for CSS comments inside numbers - this is invalid *) 198 match check_css_comment_position trimmed with 199 | CommentAfterSign (found, ctx) -> CssCommentAfterSign (found, ctx) 200 | CommentBetweenNumberAndUnit (found, ctx) -> CssCommentBeforeUnit (found, ctx) 201 | NoCommentError -> 202 (* Strip valid leading/trailing CSS comments for further checks *) 203 let value_no_comments = String.trim (strip_css_comments trimmed) in 204 (* Check for invalid scientific notation like 1e+1.5px *) 205 if has_invalid_scientific_notation value_no_comments then BadScientificNotation 206 (* "auto" is only valid with lazy loading, which requires checking the element context. 207 For general validation, treat "auto" alone as invalid in sizes. *) 208 else if Astring.String.Ascii.lowercase value_no_comments = "auto" then 209 BadCssNumber (value_no_comments.[0], trimmed) 210 else if value_no_comments = "" then InvalidUnit ("", trimmed) 211 else begin 212 let lower = Astring.String.Ascii.lowercase value_no_comments in 213 (* Check for calc() or other CSS functions first - these are always valid *) 214 if String.contains value_no_comments '(' then Valid 215 else begin 216 (* Check if the value starts with a digit, minus, or plus sign *) 217 let first_char = value_no_comments.[0] in 218 let starts_with_number = 219 (first_char >= '0' && first_char <= '9') || 220 first_char = '-' || 221 first_char = '+' || 222 first_char = '.' (* decimal point like .5px *) 223 in 224 if not starts_with_number then 225 (* Not a valid CSS number token - doesn't start with digit or sign *) 226 BadCssNumber (first_char, trimmed) 227 else begin 228 (* Check for invalid units first *) 229 let found_invalid = List.find_opt (fun unit -> 230 let len = String.length unit in 231 String.length lower > len && 232 String.sub lower (String.length lower - len) len = unit 233 ) invalid_size_units in 234 match found_invalid with 235 | Some _unit -> InvalidUnit (extract_unit value_no_comments, trimmed) 236 | None -> 237 (* Check for valid CSS length units *) 238 let has_valid_unit = List.exists (fun unit -> 239 let len = String.length unit in 240 String.length lower > len && 241 String.sub lower (String.length lower - len) len = unit 242 ) valid_length_units in 243 if has_valid_unit then begin 244 (* Check if it's negative (starts with - but not -0) *) 245 if String.length value_no_comments > 0 && value_no_comments.[0] = '-' then begin 246 (* Check if it's -0 which is valid *) 247 let after_minus = String.sub value_no_comments 1 (String.length value_no_comments - 1) in 248 try 249 let num_str = Str.global_replace (Str.regexp "[a-zA-Z]+$") "" after_minus in 250 let f = float_of_string num_str in 251 if f = 0.0 then Valid else NegativeValue 252 with _ -> NegativeValue 253 end else 254 Valid 255 end 256 else begin 257 (* Check if it's a zero value (0, -0, +0) - these are valid without units *) 258 let stripped = 259 let s = value_no_comments in 260 let s = if String.length s > 0 && (s.[0] = '+' || s.[0] = '-') then String.sub s 1 (String.length s - 1) else s in 261 s 262 in 263 (* Check if it's zero or a numeric value starting with 0 *) 264 try 265 let f = float_of_string stripped in 266 if f = 0.0 then Valid else InvalidUnit (extract_unit value_no_comments, trimmed) 267 with _ -> InvalidUnit (extract_unit value_no_comments, trimmed) 268 end 269 end 270 end 271 end 272 end 273 274(** Check if a sizes entry has a media condition (starts with '(') *) 275let has_media_condition entry = 276 let trimmed = String.trim entry in 277 String.length trimmed > 0 && trimmed.[0] = '(' 278 279(** Check if entry looks like it's trying to be a media condition but isn't properly formatted *) 280let has_invalid_media_condition entry = 281 let trimmed = String.trim entry in 282 if String.length trimmed = 0 then None 283 else begin 284 let first_char = trimmed.[0] in 285 if first_char = '(' then begin 286 (* Check for bad content inside the media condition *) 287 let len = String.length trimmed in 288 let rec find_close_paren i depth = 289 if i >= len then None 290 else match trimmed.[i] with 291 | '(' -> find_close_paren (i + 1) (depth + 1) 292 | ')' -> if depth = 1 then Some i else find_close_paren (i + 1) (depth - 1) 293 | _ -> find_close_paren (i + 1) depth 294 in 295 match find_close_paren 0 0 with 296 | None -> Some "Unclosed media condition" 297 | Some close_pos -> 298 let inner = String.sub trimmed 1 (close_pos - 1) in 299 let inner_trimmed = String.trim inner in 300 (* Check for obviously invalid content like just numbers or curly braces *) 301 if String.length inner_trimmed > 0 then begin 302 let first_inner = inner_trimmed.[0] in 303 if first_inner >= '0' && first_inner <= '9' then 304 Some "Bad media condition: Parse Error" 305 else if String.contains inner_trimmed '}' || String.contains inner_trimmed '{' then 306 Some "Bad media condition: Parse Error" 307 else 308 None 309 end else 310 Some "Bad media condition: Parse Error" 311 end else begin 312 (* Check for bare "all" which is invalid *) 313 let lower = Astring.String.Ascii.lowercase trimmed in 314 let parts = String.split_on_char ' ' lower |> List.filter (fun s -> s <> "") in 315 match parts with 316 | keyword :: _ when keyword = "all" -> 317 Some "Bad media condition: Parse Error" 318 | keyword :: _ when String.length keyword > 0 && not (keyword.[0] >= '0' && keyword.[0] <= '9') -> 319 (* Looks like a keyword without parens like "min-width:500px" *) 320 if String.contains keyword ':' then 321 Some "Bad media condition: Parse Error" 322 else 323 None 324 | _ -> None 325 end 326 end 327 328(** Extract the size value from a sizes entry (after media condition if any) *) 329let extract_size_value entry = 330 let trimmed = String.trim entry in 331 if not (has_media_condition trimmed) then 332 trimmed 333 else begin 334 (* Media conditions can have "and", "or", "not" operators connecting 335 multiple parenthesized groups, e.g., "(not (width:500px)) and (width:500px) 500px" 336 We need to skip all media condition parts to find the size value *) 337 let len = String.length trimmed in 338 let rec skip_media_condition i = 339 if i >= len then len 340 else begin 341 let remaining = String.trim (String.sub trimmed i (len - i)) in 342 let remaining_len = String.length remaining in 343 if remaining_len = 0 then len 344 else begin 345 let first_char = remaining.[0] in 346 if first_char = '(' then begin 347 (* Skip this parenthesized group *) 348 let rec find_close_paren j depth = 349 if j >= remaining_len then remaining_len 350 else match remaining.[j] with 351 | '(' -> find_close_paren (j + 1) (depth + 1) 352 | ')' -> if depth = 1 then j + 1 else find_close_paren (j + 1) (depth - 1) 353 | _ -> find_close_paren (j + 1) depth 354 in 355 let after_paren = find_close_paren 0 0 in 356 let new_pos = i + (len - i) - remaining_len + after_paren in 357 skip_media_condition new_pos 358 end 359 else begin 360 (* Check if remaining starts with "and", "or", "not" followed by space or paren *) 361 let lower_remaining = Astring.String.Ascii.lowercase remaining in 362 if remaining_len >= 4 && String.sub lower_remaining 0 4 = "and " then 363 skip_media_condition (i + (len - i) - remaining_len + 4) 364 else if remaining_len >= 3 && String.sub lower_remaining 0 3 = "or " then 365 skip_media_condition (i + (len - i) - remaining_len + 3) 366 else if remaining_len >= 4 && String.sub lower_remaining 0 4 = "not " then 367 skip_media_condition (i + (len - i) - remaining_len + 4) 368 else if remaining_len >= 4 && String.sub lower_remaining 0 4 = "and(" then 369 skip_media_condition (i + (len - i) - remaining_len + 3) 370 else if remaining_len >= 3 && String.sub lower_remaining 0 3 = "or(" then 371 skip_media_condition (i + (len - i) - remaining_len + 2) 372 else if remaining_len >= 4 && String.sub lower_remaining 0 4 = "not(" then 373 skip_media_condition (i + (len - i) - remaining_len + 3) 374 else 375 (* Found something that's not a media condition part - this is the size value *) 376 i + (len - i) - remaining_len 377 end 378 end 379 end 380 in 381 let size_start = skip_media_condition 0 in 382 if size_start >= len then "" 383 else String.trim (String.sub trimmed size_start (len - size_start)) 384 end 385 386(** Validate sizes attribute value *) 387let validate_sizes value element_name collector = 388 (* Empty sizes is invalid *) 389 if String.trim value = "" then begin 390 Message_collector.add_typed collector 391 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Must not be empty." (q "") (q "sizes") (q element_name))))); 392 false 393 end else begin 394 (* Split on comma and check each entry *) 395 let entries = String.split_on_char ',' value in 396 let first_entry = String.trim (List.hd entries) in 397 398 (* Check if starts with comma (empty first entry) *) 399 if first_entry = "" then begin 400 Message_collector.add_typed collector 401 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Starts with empty source size." (q value) (q "sizes") (q element_name))))); 402 false 403 end else begin 404 (* Check for trailing comma *) 405 let last_entry = String.trim (List.nth entries (List.length entries - 1)) in 406 if List.length entries > 1 && last_entry = "" then begin 407 Message_collector.add_typed collector 408 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Expected media condition before %s at %s." (q value) (q "sizes") (q element_name) (q "") (q value))))); 409 false 410 end else begin 411 let valid = ref true in 412 413 (* Check for default-first pattern: unconditional value before conditional ones *) 414 let non_empty_entries = List.filter (fun e -> String.trim e <> "") entries in 415 (* Filter out entries that have invalid media conditions - they'll be reported separately *) 416 let valid_entries = List.filter (fun e -> 417 has_invalid_media_condition (String.trim e) = None 418 ) non_empty_entries in 419 if List.length valid_entries > 1 then begin 420 let first = List.hd valid_entries in 421 let rest = List.tl valid_entries in 422 (* If first entry has no media condition but later ones do, that's invalid *) 423 if not (has_media_condition first) && List.exists has_media_condition rest then begin 424 (* Context is the first entry with a comma *) 425 let context = (String.trim first) ^ "," in 426 Message_collector.add_typed collector 427 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Expected media condition before %s at %s." (q value) (q "sizes") (q element_name) (q "") (q context))))); 428 valid := false 429 end; 430 (* Check for multiple entries without media conditions. 431 When the first entry has no media condition, report "Expected media condition" 432 regardless of whether later entries have media conditions or not *) 433 if not (has_media_condition first) && !valid then begin 434 (* Only report if we haven't already reported the default-first error *) 435 if not (List.exists has_media_condition rest) then begin 436 (* Multiple defaults - report as "Expected media condition" *) 437 let context = (String.trim first) ^ "," in 438 Message_collector.add_typed collector 439 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Expected media condition before %s at %s." (q value) (q "sizes") (q element_name) (q "") (q context))))); 440 valid := false 441 end 442 end 443 end; 444 445 (* Validate each entry's media condition and size value *) 446 let num_entries = List.length entries in 447 List.iteri (fun idx entry -> 448 let trimmed = String.trim entry in 449 if trimmed <> "" then begin 450 (* Check for invalid media condition *) 451 (match has_invalid_media_condition trimmed with 452 | Some err_msg -> 453 let context = trimmed ^ "," in 454 Message_collector.add_typed collector 455 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: %s at %s." (q value) (q "sizes") (q element_name) err_msg (q context))))); 456 valid := false 457 | None -> ()); 458 459 let size_val = extract_size_value trimmed in 460 if size_val <> "" then begin 461 (* Check if there are multiple space-separated words in the size value. 462 Only the first word should be the size, rest is junk. *) 463 let size_parts = String.split_on_char ' ' size_val |> List.filter (fun s -> s <> "") in 464 let first_size = match size_parts with [] -> size_val | hd :: _ -> hd in 465 let extra_parts = match size_parts with [] -> [] | _ :: tl -> tl in 466 467 (* Check if first word looks like it should have been a media condition 468 (doesn't start with digit, sign, decimal, '/', or look like a CSS function) *) 469 let first_char = if String.length first_size > 0 then first_size.[0] else 'x' in 470 let has_paren = String.contains size_val '(' in (* calc(), etc. *) 471 let looks_like_junk_entry = 472 not (has_media_condition trimmed) && 473 not has_paren && (* Allow CSS functions like calc() *) 474 not (first_char = '/') && (* Allow leading CSS comments *) 475 not ((first_char >= '0' && first_char <= '9') || 476 first_char = '+' || first_char = '-' || first_char = '.') 477 in 478 479 (* If this entry looks like junk and there are multiple entries, 480 report "Expected media condition" instead of "Bad CSS number". 481 For single entries with invalid values, fall through to BadCssNumber. *) 482 if looks_like_junk_entry && num_entries > 1 then begin 483 (* Find the context ending with the previous entry *) 484 let prev_entries = List.filter (fun e -> String.trim e <> "" && e <> entry) entries in 485 let context = 486 if List.length prev_entries > 0 then 487 String.concat ", " (List.map String.trim prev_entries) ^ "," 488 else value 489 in 490 Message_collector.add_typed collector 491 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Expected media condition before %s at %s." (q value) (q "sizes") (q element_name) (q "") (q context))))); 492 valid := false 493 end 494 (* If there's extra junk after the size, report BadCssNumber error for it *) 495 else if extra_parts <> [] then begin 496 let last_junk = List.nth extra_parts (List.length extra_parts - 1) in 497 let first_char = if String.length last_junk > 0 then last_junk.[0] else 'x' in 498 let is_last_entry = idx = num_entries - 1 in 499 let context = 500 if is_last_entry then value 501 else trimmed ^ "," 502 in 503 Message_collector.add_typed collector 504 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Bad CSS number token: Expected a minus sign or a digit but saw %s instead at %s." (q value) (q "sizes") (q element_name) (q (String.make 1 first_char)) (q context))))); 505 valid := false 506 end 507 else 508 match check_size_value first_size with 509 | Valid -> () 510 | NegativeValue -> 511 let full_context = 512 if List.length entries > 1 then size_val ^ "," 513 else size_val 514 in 515 let _ = full_context in 516 Message_collector.add_typed collector 517 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Expected positive size value but found %s at %s." (q value) (q "sizes") (q element_name) (q size_val) (q size_val))))); 518 valid := false 519 | CssCommentAfterSign (found, context) -> 520 (* e.g., +/**/50vw - expected number after sign *) 521 Message_collector.add_typed collector 522 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Expected number but found %s at %s." (q value) (q "sizes") (q element_name) (q found) (q context))))); 523 valid := false 524 | CssCommentBeforeUnit (found, context) -> 525 (* e.g., 50/**/vw - expected units after number *) 526 let units_list = List.map q valid_length_units in 527 let units_str = String.concat ", " units_list in 528 Message_collector.add_typed collector 529 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Expected units (one of %s) but found %s at %s." (q value) (q "sizes") (q element_name) units_str (q found) (q context))))); 530 valid := false 531 | BadScientificNotation -> 532 (* For scientific notation with bad exponent, show what char was expected vs found *) 533 let context = 534 if List.length entries > 1 then trimmed ^ "," 535 else trimmed 536 in 537 (* Find the period in the exponent *) 538 let _ = context in 539 Message_collector.add_typed collector 540 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Bad CSS number token: Expected a digit but saw %s instead at %s." (q value) (q "sizes") (q element_name) (q ".") (q size_val))))); 541 valid := false 542 | BadCssNumber (first_char, context) -> 543 (* Value doesn't start with a digit or minus sign *) 544 let full_context = 545 if List.length entries > 1 then context ^ "," 546 else context 547 in 548 let _ = full_context in 549 Message_collector.add_typed collector 550 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Bad CSS number token: Expected a minus sign or a digit but saw %s instead at %s." (q value) (q "sizes") (q element_name) (q (String.make 1 first_char)) (q context))))); 551 valid := false 552 | InvalidUnit (found_unit, _context) -> 553 (* Generate the full list of expected units *) 554 let units_list = List.map q valid_length_units in 555 let units_str = String.concat ", " units_list in 556 (* Context should be the full entry, with comma only if there are multiple entries *) 557 let full_context = 558 if List.length entries > 1 then trimmed ^ "," 559 else trimmed 560 in 561 (* When found_unit is empty, say "no units" instead of quoting empty string *) 562 let found_str = 563 if found_unit = "" then "no units" 564 else q found_unit 565 in 566 Message_collector.add_typed collector 567 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad source size list: Expected units (one of %s) but found %s at %s." (q value) (q "sizes") (q element_name) units_str found_str (q full_context))))); 568 valid := false 569 end 570 end 571 ) entries; 572 573 !valid 574 end 575 end 576 end 577 578(** Validate srcset descriptor *) 579let validate_srcset_descriptor desc element_name srcset_value has_sizes collector = 580 let desc_lower = Astring.String.Ascii.lowercase (String.trim desc) in 581 if String.length desc_lower = 0 then true 582 else begin 583 let last_char = desc_lower.[String.length desc_lower - 1] in 584 let num_part = String.sub desc_lower 0 (String.length desc_lower - 1) in 585 586 match last_char with 587 | 'w' -> 588 (* Width descriptor - must be positive integer, no leading + *) 589 let trimmed_desc = String.trim desc in 590 if String.length trimmed_desc > 0 && trimmed_desc.[0] = '+' then begin 591 (* Show just the number part (without the 'w') *) 592 let num_part_for_msg = String.sub trimmed_desc 0 (String.length trimmed_desc - 1) in 593 Message_collector.add_typed collector 594 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected number without leading plus sign but found %s at %s." (q srcset_value) (q "srcset") (q element_name) (q num_part_for_msg) (q srcset_value))))); 595 false 596 end else 597 (try 598 let n = int_of_string num_part in 599 if n <= 0 then begin 600 Message_collector.add_typed collector 601 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected number greater than zero but found %s at %s." (q srcset_value) (q "srcset") (q element_name) (q num_part) (q srcset_value))))); 602 false 603 end else begin 604 (* Check for uppercase W - compare original desc with lowercase version *) 605 let original_last = desc.[String.length desc - 1] in 606 if original_last = 'W' then begin 607 Message_collector.add_typed collector 608 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected width descriptor but found %s at %s. (When the %s attribute is present, all image candidate strings must specify a width.)" (q srcset_value) (q "srcset") (q element_name) (q desc) (q srcset_value) (q "sizes"))))); 609 false 610 end else true 611 end 612 with _ -> 613 (* Check for scientific notation, decimal, or other non-integer values *) 614 if String.contains num_part 'e' || String.contains num_part 'E' || String.contains num_part '.' then begin 615 Message_collector.add_typed collector 616 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected integer but found %s at %s." (q srcset_value) (q "srcset") (q element_name) (q num_part) (q srcset_value))))); 617 false 618 end else begin 619 Message_collector.add_typed collector 620 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad srcset descriptor: Invalid width descriptor." (q srcset_value) (q "srcset") (q element_name))))); 621 false 622 end) 623 | 'x' -> 624 (* Pixel density descriptor - must be positive number, no leading + *) 625 let trimmed_desc = String.trim desc in 626 if String.length trimmed_desc > 0 && trimmed_desc.[0] = '+' then begin 627 (* Extract the number part including the plus sign *) 628 let num_with_plus = String.sub trimmed_desc 0 (String.length trimmed_desc - 1) in 629 Message_collector.add_typed collector 630 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected number without leading plus sign but found %s at %s." (q srcset_value) (q "srcset") (q element_name) (q num_with_plus) (q srcset_value))))); 631 false 632 end else begin 633 (try 634 let n = float_of_string num_part in 635 if Float.is_nan n then begin 636 (* NaN is not a valid float - report as parse error with first char from ORIGINAL desc *) 637 let trimmed_desc = String.trim desc in 638 let orig_num_part = String.sub trimmed_desc 0 (String.length trimmed_desc - 1) in 639 let first_char = if String.length orig_num_part > 0 then String.make 1 orig_num_part.[0] else "" in 640 Message_collector.add_typed collector 641 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad positive floating point number: Expected a digit but saw %s instead at %s." (q srcset_value) (q "srcset") (q element_name) (q first_char) (q srcset_value))))); 642 false 643 end else if n = 0.0 then begin 644 (* Check if it's -0 (starts with minus) - report as "greater than zero" error *) 645 let trimmed_desc = String.trim desc in 646 let orig_num_part = String.sub trimmed_desc 0 (String.length trimmed_desc - 1) in 647 if String.length orig_num_part > 0 && orig_num_part.[0] = '-' then begin 648 Message_collector.add_typed collector 649 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected number greater than zero but found %s at %s." (q srcset_value) (q "srcset") (q element_name) (q orig_num_part) (q srcset_value))))) 650 end else begin 651 Message_collector.add_typed collector 652 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad positive floating point number: Zero is not a valid positive floating point number at %s." (q srcset_value) (q "srcset") (q element_name) (q srcset_value))))) 653 end; 654 false 655 end else if n < 0.0 then begin 656 Message_collector.add_typed collector 657 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected number greater than zero but found %s at %s." (q srcset_value) (q "srcset") (q element_name) (q num_part) (q srcset_value))))); 658 false 659 end else if n = neg_infinity || n = infinity then begin 660 (* Infinity is not a valid float - report as parse error with first char from ORIGINAL desc *) 661 let trimmed_desc = String.trim desc in 662 let orig_num_part = String.sub trimmed_desc 0 (String.length trimmed_desc - 1) in 663 let first_char = if String.length orig_num_part > 0 then String.make 1 orig_num_part.[0] else "" in 664 Message_collector.add_typed collector 665 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad positive floating point number: Expected a digit but saw %s instead at %s." (q srcset_value) (q "srcset") (q element_name) (q first_char) (q srcset_value))))); 666 false 667 end else true 668 with _ -> 669 Message_collector.add_typed collector 670 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad srcset descriptor: Invalid density descriptor." (q srcset_value) (q "srcset") (q element_name))))); 671 false) 672 end 673 | 'h' -> 674 (* Height descriptor - not allowed *) 675 let trimmed_desc = String.trim desc in 676 (* Generate context: find where this entry appears *) 677 let context = 678 try 679 let pos = Str.search_forward (Str.regexp_string trimmed_desc) srcset_value 0 in 680 (* Get the entry context ending with comma *) 681 let search_from = max 0 (pos - 3) in 682 let comma_pos = try Str.search_forward (Str.regexp_string ",") srcset_value pos with Not_found -> String.length srcset_value - 1 in 683 let end_pos = min (comma_pos + 1) (String.length srcset_value) in 684 let len = end_pos - search_from in 685 if len > 0 then String.trim (String.sub srcset_value search_from len) else srcset_value 686 with Not_found | Invalid_argument _ -> srcset_value 687 in 688 if has_sizes then 689 Message_collector.add_typed collector 690 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected width descriptor but found %s at %s. (When the %s attribute is present, all image candidate strings must specify a width.)" (q srcset_value) (q "srcset") (q element_name) (q trimmed_desc) (q context) (q "sizes"))))) 691 else 692 Message_collector.add_typed collector 693 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad srcset descriptor: Height descriptor %s is not allowed." (q srcset_value) (q "srcset") (q element_name) (q "h"))))); 694 false 695 | _ -> 696 (* Unknown descriptor - find context in srcset_value *) 697 let trimmed_desc = String.trim desc in 698 (* Nu validator adds extra ')' after the last ')' if descriptor contains any '(' *) 699 let found_desc = 700 if String.contains trimmed_desc '(' then 701 (* Find position of last ')' and insert extra ')' after it *) 702 try 703 let last_close = String.rindex trimmed_desc ')' in 704 let before = String.sub trimmed_desc 0 (last_close + 1) in 705 let after = String.sub trimmed_desc (last_close + 1) (String.length trimmed_desc - last_close - 1) in 706 before ^ ")" ^ after 707 with Not_found -> trimmed_desc ^ ")" 708 else trimmed_desc 709 in 710 (* Find context: the entry containing the error with trailing comma *) 711 let context = 712 try 713 let pos = Str.search_forward (Str.regexp_string trimmed_desc) srcset_value 0 in 714 (* Get the context ending with the descriptor and the comma after *) 715 let end_pos = min (pos + String.length trimmed_desc + 1) (String.length srcset_value) in 716 String.trim (String.sub srcset_value 0 end_pos) 717 with Not_found -> srcset_value 718 in 719 Message_collector.add_typed collector 720 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected number followed by %s or %s but found %s at %s." (q srcset_value) (q "srcset") (q element_name) (q "w") (q "x") (q found_desc) (q context))))); 721 false 722 end 723 724(** Normalize descriptor for duplicate detection (e.g., 1x = 1.0x) *) 725let normalize_descriptor desc = 726 let desc_lower = Astring.String.Ascii.lowercase (String.trim desc) in 727 if String.length desc_lower = 0 then desc_lower 728 else 729 let last_char = desc_lower.[String.length desc_lower - 1] in 730 let num_part = String.sub desc_lower 0 (String.length desc_lower - 1) in 731 match last_char with 732 | 'x' -> 733 (* Normalize density to a float string for comparison *) 734 (try 735 let f = float_of_string num_part in 736 Printf.sprintf "%gx" f (* %g removes trailing zeros *) 737 with _ -> desc_lower) 738 | 'w' -> 739 (* Width should be integer, just return as-is *) 740 desc_lower 741 | _ -> desc_lower 742 743(** Parse and validate srcset attribute value *) 744let validate_srcset value element_name has_sizes collector = 745 (* Srcset entries are split on commas - only balanced parentheses prevent split *) 746 let entries = split_on_comma_balanced_parens value in 747 let has_w_descriptor = ref false in 748 let has_x_descriptor = ref false in 749 let no_descriptor_url = ref None in (* Track URL of first entry without width descriptor *) 750 let x_with_sizes_error_reported = ref false in (* Track if we already reported x-with-sizes error *) 751 let seen_descriptors = Hashtbl.create 8 in (* Track seen descriptor values -> first URL *) 752 753 (* Check for empty srcset *) 754 if String.trim value = "" then begin 755 Message_collector.add_typed collector 756 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Must contain one or more image candidate strings." (q value) (q "srcset") (q element_name))))) 757 end; 758 759 (* Check for leading comma *) 760 if String.length value > 0 && value.[0] = ',' then begin 761 Message_collector.add_typed collector 762 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Starts with empty image-candidate string." (q value) (q "srcset") (q element_name))))) 763 end; 764 765 (* Check for trailing comma(s) / empty entries *) 766 let trimmed_value = String.trim value in 767 if String.length trimmed_value > 0 && trimmed_value.[String.length trimmed_value - 1] = ',' then begin 768 (* Count consecutive trailing commas *) 769 let rec count_trailing_commas s idx count = 770 if idx < 0 then count 771 else if s.[idx] = ',' then count_trailing_commas s (idx - 1) (count + 1) 772 else if s.[idx] = ' ' || s.[idx] = '\t' then count_trailing_commas s (idx - 1) count 773 else count 774 in 775 let trailing_commas = count_trailing_commas trimmed_value (String.length trimmed_value - 1) 0 in 776 if trailing_commas > 1 then 777 (* Multiple trailing commas: "Empty image-candidate string at" *) 778 Message_collector.add_typed collector 779 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Empty image-candidate string at %s." (q value) (q "srcset") (q element_name) (q value))))) 780 else 781 (* Single trailing comma: "Ends with empty image-candidate string." *) 782 Message_collector.add_typed collector 783 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Ends with empty image-candidate string." (q value) (q "srcset") (q element_name))))) 784 end; 785 786 List.iter (fun entry -> 787 let entry = String.trim entry in 788 if entry <> "" then begin 789 (* Split entry into URL and optional descriptor - respect parentheses *) 790 let parts = split_on_space_respecting_parens entry in 791 (* Check if URL is valid *) 792 let check_srcset_url url = 793 (* Special schemes that require host/content after :// *) 794 let special_schemes = ["http"; "https"; "ftp"; "ws"; "wss"] in 795 (* Check for scheme-only URL like "http:" *) 796 let url_lower = Astring.String.Ascii.lowercase url in 797 List.iter (fun scheme -> 798 let scheme_colon = scheme ^ ":" in 799 if url_lower = scheme_colon then 800 Message_collector.add_typed collector 801 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad image-candidate URL: %s: Expected a slash (\"/\")." (q value) (q "srcset") (q element_name) (q url))))) 802 ) special_schemes 803 in 804 match parts with 805 | [] -> () 806 | [url] -> 807 check_srcset_url url; 808 (* URL only = implicit 1x descriptor - only flag if explicit 1x also seen *) 809 if !no_descriptor_url = None then no_descriptor_url := Some url; 810 begin match Hashtbl.find_opt seen_descriptors "explicit-1x" with 811 | Some first_url -> 812 Message_collector.add_typed collector 813 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Density for image %s is identical to density for image %s." (q value) (q "srcset") (q element_name) (q url) (q first_url))))) 814 | None -> 815 Hashtbl.add seen_descriptors "implicit-1x" url 816 end 817 | url :: desc :: rest -> 818 (* Check URL for broken schemes *) 819 check_srcset_url url; 820 (* Check for extra junk - multiple descriptors are not allowed *) 821 if rest <> [] then begin 822 let extra_desc = List.hd rest in 823 Message_collector.add_typed collector 824 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected single descriptor but found extraneous descriptor %s at %s." (q value) (q "srcset") (q element_name) (q extra_desc) (q value))))) 825 end; 826 827 let desc_lower = Astring.String.Ascii.lowercase (String.trim desc) in 828 if String.length desc_lower > 0 then begin 829 let last_char = desc_lower.[String.length desc_lower - 1] in 830 if last_char = 'w' then has_w_descriptor := true 831 else if last_char = 'x' then begin 832 has_x_descriptor := true; 833 (* If sizes is present and we have an x descriptor, generate detailed error *) 834 if has_sizes && not !x_with_sizes_error_reported then begin 835 x_with_sizes_error_reported := true; 836 (* Build context: 837 - If entry has extra parts (multiple descriptors): show "url descriptor " 838 - Else if entry has trailing comma: show "url descriptor," 839 - Else (last entry, no extra parts): show full srcset value *) 840 let trimmed_url = String.trim url in 841 let trimmed_desc = String.trim desc in 842 let entry_context = 843 if rest <> [] then 844 (* Entry has multiple descriptors - show URL + first descriptor + space *) 845 trimmed_url ^ " " ^ trimmed_desc ^ " " 846 else 847 (* Check if entry ends with comma in original value *) 848 let trimmed_entry = String.trim entry in 849 try 850 let entry_start = Str.search_forward (Str.regexp_string trimmed_url) value 0 in 851 let entry_end = entry_start + String.length trimmed_entry in 852 let has_trailing_comma = entry_end < String.length value && value.[entry_end] = ',' in 853 if has_trailing_comma then 854 (* Entry followed by comma - show "url descriptor," *) 855 trimmed_url ^ " " ^ trimmed_desc ^ "," 856 else 857 (* Last entry - show full srcset value *) 858 value 859 with Not_found -> 860 value 861 in 862 Message_collector.add_typed collector 863 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Expected width descriptor but found %s at %s. (When the %s attribute is present, all image candidate strings must specify a width.)" (q value) (q "srcset") (q element_name) (q trimmed_desc) (q entry_context) (q "sizes"))))) 864 end 865 end; 866 867 (* Check for duplicate descriptors - use normalized form *) 868 let normalized = normalize_descriptor desc in 869 let is_1x = (normalized = "1x") in 870 let is_width = (last_char = 'w') in 871 let dup_type = if is_width then "Width" else "Density" in 872 begin match Hashtbl.find_opt seen_descriptors normalized with 873 | Some first_url -> 874 Message_collector.add_typed collector 875 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: %s for image %s is identical to %s for image %s." (q value) (q "srcset") (q element_name) dup_type (q url) (Astring.String.Ascii.lowercase dup_type) (q first_url))))) 876 | None -> 877 begin match (if is_1x then Hashtbl.find_opt seen_descriptors "implicit-1x" else None) with 878 | Some first_url -> 879 (* Explicit 1x conflicts with implicit 1x *) 880 Message_collector.add_typed collector 881 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: %s for image %s is identical to %s for image %s." (q value) (q "srcset") (q element_name) dup_type (q url) (Astring.String.Ascii.lowercase dup_type) (q first_url))))) 882 | None -> 883 Hashtbl.add seen_descriptors normalized url; 884 if is_1x then Hashtbl.add seen_descriptors "explicit-1x" url 885 end 886 end 887 end; 888 889 ignore (validate_srcset_descriptor desc element_name value has_sizes collector) 890 end 891 ) entries; 892 893 (* Check: if w descriptor used and no sizes, that's an error for img and source *) 894 if !has_w_descriptor && not has_sizes then 895 Message_collector.add_typed collector 896 (`Srcset `W_without_sizes); 897 898 (* Check: if sizes is present, all entries must have width descriptors *) 899 (match !no_descriptor_url with 900 | Some url when has_sizes -> 901 Message_collector.add_typed collector 902 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: No width specified for image %s. (When the %s attribute is present, all image candidate strings must specify a width.)" (q value) (q "srcset") (q element_name) (q url) (q "sizes"))))) 903 | _ -> ()); 904 905 (* Check: if sizes is present and srcset uses x descriptors, that's an error. 906 Only report if we haven't already reported the detailed error. *) 907 if has_sizes && !has_x_descriptor && not !x_with_sizes_error_reported then 908 Message_collector.add_typed collector 909 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: When the %s attribute is present, all image candidate strings must specify a width." (q value) (q "srcset") (q element_name) (q "sizes"))))); 910 911 (* Check for mixing w and x descriptors *) 912 if !has_w_descriptor && !has_x_descriptor then 913 Message_collector.add_typed collector 914 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s: Mixing width and density descriptors is not allowed." (q value) (q "srcset") (q element_name))))) 915 916let start_element _state ~element collector = 917 match element.Element.tag with 918 | Tag.Svg "image" -> 919 (* SVG image elements should not have srcset *) 920 if Attr_utils.get_attr "srcset" element.Element.raw_attrs <> None then 921 Message_collector.add_typed collector 922 (`Attr (`Not_allowed (`Attr "srcset", `Elem "image"))) 923 | Tag.Html (`Img | `Source as tag) -> 924 let name_lower = Tag.html_tag_to_string tag in 925 let attrs = element.raw_attrs in 926 let sizes_value = Attr_utils.get_attr "sizes" attrs in 927 let srcset_value = Attr_utils.get_attr "srcset" attrs in 928 let has_sizes = sizes_value <> None in 929 let has_srcset = srcset_value <> None in 930 931 (* Validate sizes if present *) 932 (match sizes_value with 933 | Some v -> ignore (validate_sizes v name_lower collector) 934 | None -> ()); 935 936 (* Validate srcset if present *) 937 (match srcset_value with 938 | Some v -> validate_srcset v name_lower has_sizes collector 939 | None -> ()); 940 941 (* Error: sizes without srcset on img *) 942 if name_lower = "img" && has_sizes && not has_srcset then 943 Message_collector.add_typed collector 944 (`Srcset `Sizes_without_srcset) 945 | _ -> () (* Other elements *) 946 947let end_element _state ~tag:_ _collector = () 948 949let checker = Checker.make ~create ~reset ~start_element ~end_element ()