OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 474 lines 21 kB view raw
1(** Datetime attribute validation checker *) 2 3let q = Error_code.q 4 5(** Elements that have datetime attribute *) 6let datetime_elements = ["del"; "ins"; "time"] 7 8(** Parse int safely *) 9let parse_int s = 10 try Some (int_of_string s) with _ -> None 11 12(** Days in each month (non-leap year) *) 13let days_in_month = [| 31; 28; 31; 30; 31; 30; 31; 31; 30; 31; 30; 31 |] 14 15(** Check if a year is a leap year *) 16let is_leap_year year = 17 (year mod 400 = 0) || (year mod 4 = 0 && year mod 100 <> 0) 18 19(** Get max day for a given month/year *) 20let max_day_for_month year month = 21 if month = 2 && is_leap_year year then 29 22 else if month >= 1 && month <= 12 then days_in_month.(month - 1) 23 else 31 24 25(** Validate date string YYYY-MM-DD. Returns (valid, error_reason option) *) 26let validate_date s = 27 let pattern = Str.regexp "^\\([0-9]+\\)-\\([0-9][0-9]\\)-\\([0-9][0-9]\\)$" in 28 if not (Str.string_match pattern s 0) then 29 (false, Some "The literal did not satisfy the date format") 30 else 31 let year_s = Str.matched_group 1 s in 32 let month_s = Str.matched_group 2 s in 33 let day_s = Str.matched_group 3 s in 34 if String.length year_s < 4 then 35 (false, Some "The literal did not satisfy the date format") 36 else 37 match (parse_int year_s, parse_int month_s, parse_int day_s) with 38 | None, _, _ | _, None, _ | _, _, None -> 39 (false, Some "Invalid year, month or day") 40 | Some year, Some month, Some day -> 41 if year < 1 then (false, Some "Year cannot be less than 1") 42 else if month = 0 then (false, Some "Month cannot be less than 1") 43 else if month > 12 then (false, Some "Month cannot be greater than 12") 44 else if day < 1 then (false, Some "Day cannot be less than 1") 45 else 46 let max_day = max_day_for_month year month in 47 if day > max_day then (false, Some "Day out of range") 48 else (true, None) 49 50(** Check if a date-like value has a 5+ digit year (might be mistyped) *) 51let has_suspicious_year s = 52 let pattern = Str.regexp "^\\([0-9]+\\)-" in 53 if Str.string_match pattern s 0 then 54 let year_s = Str.matched_group 1 s in 55 String.length year_s > 4 56 else 57 false 58 59(** Check if a date has year before 1000 (might be mistyped or unusual) *) 60let has_old_year s = 61 let pattern = Str.regexp "^\\([0-9]+\\)-" in 62 if Str.string_match pattern s 0 then 63 let year_s = Str.matched_group 1 s in 64 match parse_int year_s with 65 | Some year -> year < 1000 66 | None -> false 67 else 68 false 69 70(** Validate time string HH:MM[:SS[.sss]] *) 71let validate_time s = 72 let pattern = Str.regexp "^\\([0-9][0-9]\\):\\([0-9][0-9]\\)\\(:\\([0-9][0-9]\\)\\(\\.\\([0-9]+\\)\\)?\\)?$" in 73 if not (Str.string_match pattern s 0) then 74 (false, None) (* Format error - return None so caller uses generic message *) 75 else 76 let hour_s = Str.matched_group 1 s in 77 let minute_s = Str.matched_group 2 s in 78 match (parse_int hour_s, parse_int minute_s) with 79 | None, _ | _, None -> (false, Some "Invalid hour or minute") 80 | Some hour, Some minute -> 81 if hour > 23 then (false, Some "Hour cannot be greater than 23") 82 else if minute > 59 then (false, Some "Minute cannot be greater than 59") 83 else 84 let second_s = try Some (Str.matched_group 4 s) with Not_found -> None in 85 match second_s with 86 | None -> (true, None) 87 | Some sec_s -> 88 match parse_int sec_s with 89 | None -> (false, Some "Invalid seconds") 90 | Some sec -> 91 if sec > 59 then (false, Some "Second cannot be greater than 59") 92 else 93 (* Check milliseconds if present *) 94 let millis_s = try Some (Str.matched_group 6 s) with Not_found -> None in 95 match millis_s with 96 | None -> (true, None) 97 | Some ms -> 98 if String.length ms < 1 || String.length ms > 3 then 99 (false, Some "A fraction of a second must be one, two, or three digits") 100 else 101 (true, None) 102 103(** Validate year-only format YYYY (at least 4 digits, > 0) *) 104let validate_year_only s = 105 let pattern = Str.regexp "^\\([0-9]+\\)$" in 106 if not (Str.string_match pattern s 0) then 107 (false, Some "Year must be digits only") 108 else 109 let year_s = Str.matched_group 1 s in 110 if String.length year_s < 4 then 111 (false, Some "The literal did not satisfy the date format") 112 else 113 match parse_int year_s with 114 | None -> (false, Some "Invalid year") 115 | Some year -> 116 if year < 1 then (false, Some "Year cannot be less than 1") 117 else (true, None) 118 119(** Validate month format YYYY-MM *) 120let validate_year_month s = 121 let pattern = Str.regexp "^\\([0-9]+\\)-\\([0-9][0-9]\\)$" in 122 if not (Str.string_match pattern s 0) then 123 (false, Some "Month must be in YYYY-MM format") 124 else 125 let year_s = Str.matched_group 1 s in 126 let month_s = Str.matched_group 2 s in 127 if String.length year_s < 4 then 128 (false, Some "The literal did not satisfy the date format") 129 else 130 match (parse_int year_s, parse_int month_s) with 131 | None, _ | _, None -> (false, Some "Invalid year or month") 132 | Some year, Some month -> 133 if year < 1 then (false, Some "Year cannot be less than 1") 134 else if month < 1 || month > 12 then (false, Some "Month out of range") 135 else (true, None) 136 137(** Validate week format YYYY-Www *) 138let validate_week s = 139 let pattern = Str.regexp "^\\([0-9]+\\)-W\\([0-9][0-9]\\)$" in 140 if not (Str.string_match pattern s 0) then 141 (false, Some "Week must be in YYYY-Www format") 142 else 143 let year_s = Str.matched_group 1 s in 144 let week_s = Str.matched_group 2 s in 145 if String.length year_s < 4 then 146 (false, Some "The literal did not satisfy the date format") 147 else 148 match (parse_int year_s, parse_int week_s) with 149 | None, _ | _, None -> (false, Some "Invalid year or week") 150 | Some year, Some week -> 151 if year < 1 then (false, Some "Year cannot be less than 1") 152 else if week < 1 || week > 53 then (false, Some "Week out of range") 153 else (true, None) 154 155(** Validate yearless date format --MM-DD *) 156let validate_yearless_date s = 157 let pattern = Str.regexp "^--\\([0-9][0-9]\\)-\\([0-9][0-9]\\)$" in 158 if not (Str.string_match pattern s 0) then 159 (false, Some "Yearless date must be in --MM-DD format") 160 else 161 let month_s = Str.matched_group 1 s in 162 let day_s = Str.matched_group 2 s in 163 match (parse_int month_s, parse_int day_s) with 164 | None, _ | _, None -> (false, Some "Invalid month or day") 165 | Some month, Some day -> 166 if month < 1 || month > 12 then (false, Some "Month out of range") 167 else if day < 1 then (false, Some "Day cannot be less than 1") 168 else 169 (* Use non-leap year for yearless date validation *) 170 let max_day = if month = 2 then 29 else days_in_month.(month - 1) in 171 if day > max_day then (false, Some "Day out of range") 172 else (true, None) 173 174(** Validate duration format - HTML5 only accepts: 175 1. Duration time component: PT#H#M#S (or PT#H, PT#M, PT#S, etc.) 176 2. Duration weeks: P#W 177 3. Duration days: P#D or P#DT#H#M#S *) 178let validate_duration s = 179 if String.length s < 2 then 180 (false, Some "Duration too short") 181 else if s.[0] <> 'P' then 182 (false, Some "Duration must start with P") 183 else 184 let rest = String.sub s 1 (String.length s - 1) in 185 (* Valid HTML5 duration patterns: 186 - PT#H#M#S (or any combination of H, M, S after T) 187 - P#W (weeks only) 188 - P#D or P#DT#H#M#S (days with optional time) *) 189 let pattern_time_only = Str.regexp "^T\\([0-9]+H\\)?\\([0-9]+M\\)?\\([0-9]+\\(\\.[0-9]+\\)?S\\)?$" in 190 let pattern_weeks = Str.regexp "^[0-9]+W$" in 191 let pattern_days = Str.regexp "^[0-9]+D\\(T\\([0-9]+H\\)?\\([0-9]+M\\)?\\([0-9]+\\(\\.[0-9]+\\)?S\\)?\\)?$" in 192 if Str.string_match pattern_time_only rest 0 then 193 (* Check that at least one component exists after T *) 194 if String.length rest > 1 then (true, None) 195 else (false, Some "Invalid duration format") 196 else if Str.string_match pattern_weeks rest 0 then 197 (true, None) 198 else if Str.string_match pattern_days rest 0 then 199 (true, None) 200 else 201 (false, Some "Invalid duration format") 202 203(** Result type for timezone validation *) 204type tz_result = TzOk | TzWarning of string | TzError of string 205 206(** Validate timezone offset +HH:MM or -HH:MM or +HHMM or -HHMM 207 Returns warning for unusual but valid offsets: 208 - Negative offsets > 12:00 (e.g., -13:00) 209 - Positive offsets > 14:00 (e.g., +15:00) 210 - Offsets with unusual minutes (not 00, 30, 45) *) 211let validate_timezone_offset s = 212 (* Try +HH:MM format *) 213 let pattern_colon = Str.regexp "^\\([+-]\\)\\([0-9][0-9]\\):\\([0-9][0-9]\\)$" in 214 (* Try +HHMM format (no colon) *) 215 let pattern_no_colon = Str.regexp "^\\([+-]\\)\\([0-9][0-9]\\)\\([0-9][0-9]\\)$" in 216 let matched, sign = 217 if Str.string_match pattern_colon s 0 then 218 (true, Str.matched_group 1 s) 219 else if Str.string_match pattern_no_colon s 0 then 220 (true, Str.matched_group 1 s) 221 else 222 (false, "+") 223 in 224 if not matched then 225 TzError "The literal did not satisfy the datetime with timezone format" 226 else 227 let hour_s = Str.matched_group 2 s in 228 let minute_s = Str.matched_group 3 s in 229 match (parse_int hour_s, parse_int minute_s) with 230 | None, _ | _, None -> TzError "Invalid timezone" 231 | Some hour, Some minute -> 232 if hour > 23 then TzError "Hours out of range in time zone designator" 233 else if minute > 59 then TzError "Minutes out of range in time zone designator" 234 else begin 235 (* Check for unusual but valid offsets *) 236 let unusual_range = 237 if sign = "-" && hour >= 13 then true 238 else if sign = "+" && hour >= 15 then true 239 else false 240 in 241 let unusual_minutes = 242 minute <> 0 && minute <> 30 && minute <> 45 243 in 244 if unusual_range then 245 TzWarning "Hours in time zone designator should be from \"-12:00\" to \"+14:00\"" 246 else if unusual_minutes then 247 TzWarning "Minutes in time zone designator should be either \"00\", \"30\", or \"45\"." 248 else 249 TzOk 250 end 251 252(** Result type for datetime with timezone validation *) 253type dt_tz_result = DtOk | DtWarning of string | DtError of string 254 255(** Validate datetime with timezone: YYYY-MM-DDTHH:MM:SS[.sss]Z or YYYY-MM-DDTHH:MM:SS[.sss]+HH:MM *) 256let validate_datetime_with_timezone s = 257 (* Try to split on T or space *) 258 let sep_pos = 259 try Some (String.index s 'T') 260 with Not_found -> 261 try Some (String.index s ' ') 262 with Not_found -> None 263 in 264 match sep_pos with 265 | None -> DtError "The literal did not satisfy the datetime with timezone format" 266 | Some pos -> 267 let date_part = String.sub s 0 pos in 268 let time_and_tz = String.sub s (pos + 1) (String.length s - pos - 1) in 269 (* Validate date *) 270 match validate_date date_part with 271 | (false, _) -> 272 DtError "The literal did not satisfy the datetime with timezone format" 273 | (true, _) -> 274 let date_old = has_old_year date_part in 275 (* Check if ends with Z *) 276 if String.length time_and_tz > 0 && time_and_tz.[String.length time_and_tz - 1] = 'Z' then begin 277 let time_part = String.sub time_and_tz 0 (String.length time_and_tz - 1) in 278 match validate_time time_part with 279 | (false, Some reason) -> DtError reason 280 | (false, None) -> DtError "The literal did not satisfy the datetime with timezone format" 281 | (true, _) -> 282 if date_old then DtWarning "Year may be mistyped" 283 else DtOk 284 end 285 else begin 286 (* Check for +/- timezone offset *) 287 let plus_pos = try Some (String.rindex time_and_tz '+') with Not_found -> None in 288 let minus_pos = try Some (String.rindex time_and_tz '-') with Not_found -> None in 289 let tz_pos = match plus_pos, minus_pos with 290 | Some p, Some m -> Some (max p m) 291 | Some p, None -> Some p 292 | None, Some m -> Some m 293 | None, None -> None 294 in 295 match tz_pos with 296 | None -> DtError "The literal did not satisfy the datetime with timezone format" 297 | Some tp -> 298 let time_part = String.sub time_and_tz 0 tp in 299 let tz_part = String.sub time_and_tz tp (String.length time_and_tz - tp) in 300 match validate_time time_part with 301 | (false, Some reason) -> DtError reason 302 | (false, None) -> DtError "The literal did not satisfy the datetime with timezone format" 303 | (true, _) -> 304 match validate_timezone_offset tz_part with 305 | TzError e -> DtError e 306 | TzWarning w -> 307 DtWarning w 308 | TzOk -> 309 if date_old then DtWarning "Year may be mistyped" 310 else DtOk 311 end 312 313(** Validate datetime-local: YYYY-MM-DDTHH:MM[:SS[.sss]] or YYYY-MM-DD HH:MM *) 314let validate_datetime_local s = 315 let sep_pos = 316 try Some (String.index s 'T') 317 with Not_found -> 318 try Some (String.index s ' ') 319 with Not_found -> None 320 in 321 match sep_pos with 322 | None -> (false, Some "Invalid datetime-local format") 323 | Some pos -> 324 let date_part = String.sub s 0 pos in 325 let time_part = String.sub s (pos + 1) (String.length s - pos - 1) in 326 match validate_date date_part with 327 | (false, reason) -> (false, reason) 328 | (true, _) -> 329 match validate_time time_part with 330 | (false, reason) -> (false, reason) 331 | (true, _) -> (true, None) 332 333(** Result type for datetime validation - can be Ok, Error, or Warning *) 334type datetime_result = 335 | Ok 336 | Error of string 337 | Warning of string 338 339(** Validate datetime attribute - valid formats depend on element: 340 - del/ins: only date or datetime-with-timezone 341 - time: date, time, datetime-local, datetime-with-timezone, year, month, week, yearless, duration *) 342let validate_datetime_attr value element_name attr_name = 343 let is_time_element = element_name = "time" in 344 (* Check for leading/trailing whitespace - not allowed *) 345 if value <> String.trim value then begin 346 let tz_msg = "Bad datetime with timezone: The literal did not satisfy the datetime with timezone format." in 347 let date_msg = "Bad date: The literal did not satisfy the date format." in 348 Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s" 349 (q value) (q attr_name) (q element_name) tz_msg date_msg) 350 end 351 else 352 (* Try datetime with timezone first *) 353 match validate_datetime_with_timezone value with 354 | DtOk -> Ok (* Valid datetime with timezone *) 355 | DtWarning w -> 356 (* Valid but with warning - format matches Nu validator *) 357 Warning (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad datetime with timezone: %s Bad date: The literal did not satisfy the date format." 358 (q value) (q attr_name) (q element_name) w) 359 | DtError tz_error -> 360 (* Try just date - valid for all elements *) 361 match validate_date value with 362 | (true, _) -> 363 (* Date is valid, but check for suspicious year (5+ digits or old year) *) 364 if has_suspicious_year value || has_old_year value then begin 365 let date_msg = "Bad date: Year may be mistyped." in 366 let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in 367 Warning (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s" 368 (q value) (q attr_name) (q element_name) date_msg tz_msg) 369 end else 370 Ok (* Valid date with normal year *) 371 | (false, date_error) -> 372 (* For time element only, try additional formats *) 373 if is_time_element then begin 374 match validate_datetime_local value with 375 | (true, _) -> Ok (* Valid datetime-local *) 376 | (false, _) -> 377 match validate_time value with 378 | (true, _) -> Ok (* Valid time *) 379 | (false, _) -> 380 match validate_year_month value with 381 | (true, _) -> Ok (* Valid month YYYY-MM *) 382 | (false, _) -> 383 match validate_year_only value with 384 | (true, _) -> Ok (* Valid year YYYY *) 385 | (false, _) -> 386 match validate_week value with 387 | (true, _) -> Ok (* Valid week YYYY-Www *) 388 | (false, _) -> 389 match validate_yearless_date value with 390 | (true, _) -> Ok (* Valid yearless date --MM-DD *) 391 | (false, _) -> 392 match validate_duration value with 393 | (true, _) -> Ok (* Valid duration P... *) 394 | (false, _) -> 395 (* Use simplified message for time element matching Nu validator format *) 396 Error (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad time-datetime: The literal did not satisfy the time-datetime format." 397 (q value) (q attr_name) (q element_name)) 398 end 399 else begin 400 (* del/ins only allow date or datetime-with-timezone *) 401 let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in 402 let date_msg = match date_error with 403 | Some e -> Printf.sprintf "Bad date: %s." e 404 | None -> "Bad date: The literal did not satisfy the date format." 405 in 406 (* Order depends on error type. The Nu validator has specific patterns: 407 - Time hour/minute errors (not timezone) -> datetime first 408 - Timezone hours error -> datetime first 409 - Timezone minutes error -> date first 410 - Time fraction error -> date first 411 - Date "less than" error -> date first 412 - Date "greater than" error -> datetime first 413 - Generic errors both sides -> datetime first *) 414 let is_generic_tz = tz_error = "The literal did not satisfy the datetime with timezone format" in 415 let is_tz_hours_error = String.length tz_error >= 5 && String.sub tz_error 0 5 = "Hours" in 416 let is_tz_minutes_error = String.length tz_error >= 7 && String.sub tz_error 0 7 = "Minutes" in 417 let is_time_minute_or_hour_error = 418 (try ignore (Str.search_forward (Str.regexp "Minute cannot\\|Hour cannot") tz_error 0); true with Not_found -> false) 419 in 420 let is_fraction_error = try ignore (Str.search_forward (Str.regexp "fraction") tz_error 0); true with Not_found -> false in 421 let is_month_less_than_error = match date_error with 422 | Some e -> (try ignore (Str.search_forward (Str.regexp "Month cannot be less than") e 0); true with Not_found -> false) 423 | None -> false 424 in 425 (* Datetime first for: generic tz, tz hours error, time minute/hour errors, year errors 426 Date first for: "Month cannot be less than" date error, tz minutes error, fraction error *) 427 if is_month_less_than_error then 428 Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s" 429 (q value) (q attr_name) (q element_name) date_msg tz_msg) 430 else if is_tz_minutes_error || is_fraction_error then 431 Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s" 432 (q value) (q attr_name) (q element_name) date_msg tz_msg) 433 else if is_tz_hours_error || is_time_minute_or_hour_error || is_generic_tz then 434 Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s" 435 (q value) (q attr_name) (q element_name) tz_msg date_msg) 436 else 437 Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s" 438 (q value) (q attr_name) (q element_name) tz_msg date_msg) 439 end 440 441(** Checker state *) 442type state = unit [@@warning "-34"] 443 444let create () = () 445let reset _state = () 446 447let start_element _state ~element collector = 448 match element.Element.tag with 449 | Tag.Html tag -> 450 let name = Tag.html_tag_to_string tag in 451 if List.mem name datetime_elements then begin 452 (* Check for datetime attribute *) 453 let datetime_attr = List.find_map (fun (k, v) -> 454 if Astring.String.Ascii.lowercase k = "datetime" then Some v else None 455 ) element.raw_attrs in 456 match datetime_attr with 457 | None -> () 458 | Some value -> 459 if String.trim value = "" then () 460 else 461 match validate_datetime_attr value name "datetime" with 462 | Ok -> () 463 | Error error_msg -> 464 Message_collector.add_typed collector 465 (`Attr (`Bad_value_generic (`Message error_msg))) 466 | Warning warn_msg -> 467 Message_collector.add_typed collector 468 (`Generic warn_msg) 469 end 470 | _ -> () (* Non-HTML elements don't have datetime attributes *) 471 472let end_element _state ~tag:_ _collector = () 473 474let checker = Checker.make ~create ~reset ~start_element ~end_element ()