OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** Datetime attribute validation checker *)
2
3let q = Error_code.q
4
5(** Elements that have datetime attribute *)
6let datetime_elements = ["del"; "ins"; "time"]
7
8(** Parse int safely *)
9let parse_int s =
10 try Some (int_of_string s) with _ -> None
11
12(** Days in each month (non-leap year) *)
13let days_in_month = [| 31; 28; 31; 30; 31; 30; 31; 31; 30; 31; 30; 31 |]
14
15(** Check if a year is a leap year *)
16let is_leap_year year =
17 (year mod 400 = 0) || (year mod 4 = 0 && year mod 100 <> 0)
18
19(** Get max day for a given month/year *)
20let max_day_for_month year month =
21 if month = 2 && is_leap_year year then 29
22 else if month >= 1 && month <= 12 then days_in_month.(month - 1)
23 else 31
24
25(** Validate date string YYYY-MM-DD. Returns (valid, error_reason option) *)
26let validate_date s =
27 let pattern = Str.regexp "^\\([0-9]+\\)-\\([0-9][0-9]\\)-\\([0-9][0-9]\\)$" in
28 if not (Str.string_match pattern s 0) then
29 (false, Some "The literal did not satisfy the date format")
30 else
31 let year_s = Str.matched_group 1 s in
32 let month_s = Str.matched_group 2 s in
33 let day_s = Str.matched_group 3 s in
34 if String.length year_s < 4 then
35 (false, Some "The literal did not satisfy the date format")
36 else
37 match (parse_int year_s, parse_int month_s, parse_int day_s) with
38 | None, _, _ | _, None, _ | _, _, None ->
39 (false, Some "Invalid year, month or day")
40 | Some year, Some month, Some day ->
41 if year < 1 then (false, Some "Year cannot be less than 1")
42 else if month = 0 then (false, Some "Month cannot be less than 1")
43 else if month > 12 then (false, Some "Month cannot be greater than 12")
44 else if day < 1 then (false, Some "Day cannot be less than 1")
45 else
46 let max_day = max_day_for_month year month in
47 if day > max_day then (false, Some "Day out of range")
48 else (true, None)
49
50(** Check if a date-like value has a 5+ digit year (might be mistyped) *)
51let has_suspicious_year s =
52 let pattern = Str.regexp "^\\([0-9]+\\)-" in
53 if Str.string_match pattern s 0 then
54 let year_s = Str.matched_group 1 s in
55 String.length year_s > 4
56 else
57 false
58
59(** Check if a date has year before 1000 (might be mistyped or unusual) *)
60let has_old_year s =
61 let pattern = Str.regexp "^\\([0-9]+\\)-" in
62 if Str.string_match pattern s 0 then
63 let year_s = Str.matched_group 1 s in
64 match parse_int year_s with
65 | Some year -> year < 1000
66 | None -> false
67 else
68 false
69
70(** Validate time string HH:MM[:SS[.sss]] *)
71let validate_time s =
72 let pattern = Str.regexp "^\\([0-9][0-9]\\):\\([0-9][0-9]\\)\\(:\\([0-9][0-9]\\)\\(\\.\\([0-9]+\\)\\)?\\)?$" in
73 if not (Str.string_match pattern s 0) then
74 (false, None) (* Format error - return None so caller uses generic message *)
75 else
76 let hour_s = Str.matched_group 1 s in
77 let minute_s = Str.matched_group 2 s in
78 match (parse_int hour_s, parse_int minute_s) with
79 | None, _ | _, None -> (false, Some "Invalid hour or minute")
80 | Some hour, Some minute ->
81 if hour > 23 then (false, Some "Hour cannot be greater than 23")
82 else if minute > 59 then (false, Some "Minute cannot be greater than 59")
83 else
84 let second_s = try Some (Str.matched_group 4 s) with Not_found -> None in
85 match second_s with
86 | None -> (true, None)
87 | Some sec_s ->
88 match parse_int sec_s with
89 | None -> (false, Some "Invalid seconds")
90 | Some sec ->
91 if sec > 59 then (false, Some "Second cannot be greater than 59")
92 else
93 (* Check milliseconds if present *)
94 let millis_s = try Some (Str.matched_group 6 s) with Not_found -> None in
95 match millis_s with
96 | None -> (true, None)
97 | Some ms ->
98 if String.length ms < 1 || String.length ms > 3 then
99 (false, Some "A fraction of a second must be one, two, or three digits")
100 else
101 (true, None)
102
103(** Validate year-only format YYYY (at least 4 digits, > 0) *)
104let validate_year_only s =
105 let pattern = Str.regexp "^\\([0-9]+\\)$" in
106 if not (Str.string_match pattern s 0) then
107 (false, Some "Year must be digits only")
108 else
109 let year_s = Str.matched_group 1 s in
110 if String.length year_s < 4 then
111 (false, Some "The literal did not satisfy the date format")
112 else
113 match parse_int year_s with
114 | None -> (false, Some "Invalid year")
115 | Some year ->
116 if year < 1 then (false, Some "Year cannot be less than 1")
117 else (true, None)
118
119(** Validate month format YYYY-MM *)
120let validate_year_month s =
121 let pattern = Str.regexp "^\\([0-9]+\\)-\\([0-9][0-9]\\)$" in
122 if not (Str.string_match pattern s 0) then
123 (false, Some "Month must be in YYYY-MM format")
124 else
125 let year_s = Str.matched_group 1 s in
126 let month_s = Str.matched_group 2 s in
127 if String.length year_s < 4 then
128 (false, Some "The literal did not satisfy the date format")
129 else
130 match (parse_int year_s, parse_int month_s) with
131 | None, _ | _, None -> (false, Some "Invalid year or month")
132 | Some year, Some month ->
133 if year < 1 then (false, Some "Year cannot be less than 1")
134 else if month < 1 || month > 12 then (false, Some "Month out of range")
135 else (true, None)
136
137(** Validate week format YYYY-Www *)
138let validate_week s =
139 let pattern = Str.regexp "^\\([0-9]+\\)-W\\([0-9][0-9]\\)$" in
140 if not (Str.string_match pattern s 0) then
141 (false, Some "Week must be in YYYY-Www format")
142 else
143 let year_s = Str.matched_group 1 s in
144 let week_s = Str.matched_group 2 s in
145 if String.length year_s < 4 then
146 (false, Some "The literal did not satisfy the date format")
147 else
148 match (parse_int year_s, parse_int week_s) with
149 | None, _ | _, None -> (false, Some "Invalid year or week")
150 | Some year, Some week ->
151 if year < 1 then (false, Some "Year cannot be less than 1")
152 else if week < 1 || week > 53 then (false, Some "Week out of range")
153 else (true, None)
154
155(** Validate yearless date format --MM-DD *)
156let validate_yearless_date s =
157 let pattern = Str.regexp "^--\\([0-9][0-9]\\)-\\([0-9][0-9]\\)$" in
158 if not (Str.string_match pattern s 0) then
159 (false, Some "Yearless date must be in --MM-DD format")
160 else
161 let month_s = Str.matched_group 1 s in
162 let day_s = Str.matched_group 2 s in
163 match (parse_int month_s, parse_int day_s) with
164 | None, _ | _, None -> (false, Some "Invalid month or day")
165 | Some month, Some day ->
166 if month < 1 || month > 12 then (false, Some "Month out of range")
167 else if day < 1 then (false, Some "Day cannot be less than 1")
168 else
169 (* Use non-leap year for yearless date validation *)
170 let max_day = if month = 2 then 29 else days_in_month.(month - 1) in
171 if day > max_day then (false, Some "Day out of range")
172 else (true, None)
173
174(** Validate duration format - HTML5 only accepts:
175 1. Duration time component: PT#H#M#S (or PT#H, PT#M, PT#S, etc.)
176 2. Duration weeks: P#W
177 3. Duration days: P#D or P#DT#H#M#S *)
178let validate_duration s =
179 if String.length s < 2 then
180 (false, Some "Duration too short")
181 else if s.[0] <> 'P' then
182 (false, Some "Duration must start with P")
183 else
184 let rest = String.sub s 1 (String.length s - 1) in
185 (* Valid HTML5 duration patterns:
186 - PT#H#M#S (or any combination of H, M, S after T)
187 - P#W (weeks only)
188 - P#D or P#DT#H#M#S (days with optional time) *)
189 let pattern_time_only = Str.regexp "^T\\([0-9]+H\\)?\\([0-9]+M\\)?\\([0-9]+\\(\\.[0-9]+\\)?S\\)?$" in
190 let pattern_weeks = Str.regexp "^[0-9]+W$" in
191 let pattern_days = Str.regexp "^[0-9]+D\\(T\\([0-9]+H\\)?\\([0-9]+M\\)?\\([0-9]+\\(\\.[0-9]+\\)?S\\)?\\)?$" in
192 if Str.string_match pattern_time_only rest 0 then
193 (* Check that at least one component exists after T *)
194 if String.length rest > 1 then (true, None)
195 else (false, Some "Invalid duration format")
196 else if Str.string_match pattern_weeks rest 0 then
197 (true, None)
198 else if Str.string_match pattern_days rest 0 then
199 (true, None)
200 else
201 (false, Some "Invalid duration format")
202
203(** Result type for timezone validation *)
204type tz_result = TzOk | TzWarning of string | TzError of string
205
206(** Validate timezone offset +HH:MM or -HH:MM or +HHMM or -HHMM
207 Returns warning for unusual but valid offsets:
208 - Negative offsets > 12:00 (e.g., -13:00)
209 - Positive offsets > 14:00 (e.g., +15:00)
210 - Offsets with unusual minutes (not 00, 30, 45) *)
211let validate_timezone_offset s =
212 (* Try +HH:MM format *)
213 let pattern_colon = Str.regexp "^\\([+-]\\)\\([0-9][0-9]\\):\\([0-9][0-9]\\)$" in
214 (* Try +HHMM format (no colon) *)
215 let pattern_no_colon = Str.regexp "^\\([+-]\\)\\([0-9][0-9]\\)\\([0-9][0-9]\\)$" in
216 let matched, sign =
217 if Str.string_match pattern_colon s 0 then
218 (true, Str.matched_group 1 s)
219 else if Str.string_match pattern_no_colon s 0 then
220 (true, Str.matched_group 1 s)
221 else
222 (false, "+")
223 in
224 if not matched then
225 TzError "The literal did not satisfy the datetime with timezone format"
226 else
227 let hour_s = Str.matched_group 2 s in
228 let minute_s = Str.matched_group 3 s in
229 match (parse_int hour_s, parse_int minute_s) with
230 | None, _ | _, None -> TzError "Invalid timezone"
231 | Some hour, Some minute ->
232 if hour > 23 then TzError "Hours out of range in time zone designator"
233 else if minute > 59 then TzError "Minutes out of range in time zone designator"
234 else begin
235 (* Check for unusual but valid offsets *)
236 let unusual_range =
237 if sign = "-" && hour >= 13 then true
238 else if sign = "+" && hour >= 15 then true
239 else false
240 in
241 let unusual_minutes =
242 minute <> 0 && minute <> 30 && minute <> 45
243 in
244 if unusual_range then
245 TzWarning "Hours in time zone designator should be from \"-12:00\" to \"+14:00\""
246 else if unusual_minutes then
247 TzWarning "Minutes in time zone designator should be either \"00\", \"30\", or \"45\"."
248 else
249 TzOk
250 end
251
252(** Result type for datetime with timezone validation *)
253type dt_tz_result = DtOk | DtWarning of string | DtError of string
254
255(** Validate datetime with timezone: YYYY-MM-DDTHH:MM:SS[.sss]Z or YYYY-MM-DDTHH:MM:SS[.sss]+HH:MM *)
256let validate_datetime_with_timezone s =
257 (* Try to split on T or space *)
258 let sep_pos =
259 try Some (String.index s 'T')
260 with Not_found ->
261 try Some (String.index s ' ')
262 with Not_found -> None
263 in
264 match sep_pos with
265 | None -> DtError "The literal did not satisfy the datetime with timezone format"
266 | Some pos ->
267 let date_part = String.sub s 0 pos in
268 let time_and_tz = String.sub s (pos + 1) (String.length s - pos - 1) in
269 (* Validate date *)
270 match validate_date date_part with
271 | (false, _) ->
272 DtError "The literal did not satisfy the datetime with timezone format"
273 | (true, _) ->
274 let date_old = has_old_year date_part in
275 (* Check if ends with Z *)
276 if String.length time_and_tz > 0 && time_and_tz.[String.length time_and_tz - 1] = 'Z' then begin
277 let time_part = String.sub time_and_tz 0 (String.length time_and_tz - 1) in
278 match validate_time time_part with
279 | (false, Some reason) -> DtError reason
280 | (false, None) -> DtError "The literal did not satisfy the datetime with timezone format"
281 | (true, _) ->
282 if date_old then DtWarning "Year may be mistyped"
283 else DtOk
284 end
285 else begin
286 (* Check for +/- timezone offset *)
287 let plus_pos = try Some (String.rindex time_and_tz '+') with Not_found -> None in
288 let minus_pos = try Some (String.rindex time_and_tz '-') with Not_found -> None in
289 let tz_pos = match plus_pos, minus_pos with
290 | Some p, Some m -> Some (max p m)
291 | Some p, None -> Some p
292 | None, Some m -> Some m
293 | None, None -> None
294 in
295 match tz_pos with
296 | None -> DtError "The literal did not satisfy the datetime with timezone format"
297 | Some tp ->
298 let time_part = String.sub time_and_tz 0 tp in
299 let tz_part = String.sub time_and_tz tp (String.length time_and_tz - tp) in
300 match validate_time time_part with
301 | (false, Some reason) -> DtError reason
302 | (false, None) -> DtError "The literal did not satisfy the datetime with timezone format"
303 | (true, _) ->
304 match validate_timezone_offset tz_part with
305 | TzError e -> DtError e
306 | TzWarning w ->
307 DtWarning w
308 | TzOk ->
309 if date_old then DtWarning "Year may be mistyped"
310 else DtOk
311 end
312
313(** Validate datetime-local: YYYY-MM-DDTHH:MM[:SS[.sss]] or YYYY-MM-DD HH:MM *)
314let validate_datetime_local s =
315 let sep_pos =
316 try Some (String.index s 'T')
317 with Not_found ->
318 try Some (String.index s ' ')
319 with Not_found -> None
320 in
321 match sep_pos with
322 | None -> (false, Some "Invalid datetime-local format")
323 | Some pos ->
324 let date_part = String.sub s 0 pos in
325 let time_part = String.sub s (pos + 1) (String.length s - pos - 1) in
326 match validate_date date_part with
327 | (false, reason) -> (false, reason)
328 | (true, _) ->
329 match validate_time time_part with
330 | (false, reason) -> (false, reason)
331 | (true, _) -> (true, None)
332
333(** Result type for datetime validation - can be Ok, Error, or Warning *)
334type datetime_result =
335 | Ok
336 | Error of string
337 | Warning of string
338
339(** Validate datetime attribute - valid formats depend on element:
340 - del/ins: only date or datetime-with-timezone
341 - time: date, time, datetime-local, datetime-with-timezone, year, month, week, yearless, duration *)
342let validate_datetime_attr value element_name attr_name =
343 let is_time_element = element_name = "time" in
344 (* Check for leading/trailing whitespace - not allowed *)
345 if value <> String.trim value then begin
346 let tz_msg = "Bad datetime with timezone: The literal did not satisfy the datetime with timezone format." in
347 let date_msg = "Bad date: The literal did not satisfy the date format." in
348 Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s"
349 (q value) (q attr_name) (q element_name) tz_msg date_msg)
350 end
351 else
352 (* Try datetime with timezone first *)
353 match validate_datetime_with_timezone value with
354 | DtOk -> Ok (* Valid datetime with timezone *)
355 | DtWarning w ->
356 (* Valid but with warning - format matches Nu validator *)
357 Warning (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad datetime with timezone: %s Bad date: The literal did not satisfy the date format."
358 (q value) (q attr_name) (q element_name) w)
359 | DtError tz_error ->
360 (* Try just date - valid for all elements *)
361 match validate_date value with
362 | (true, _) ->
363 (* Date is valid, but check for suspicious year (5+ digits or old year) *)
364 if has_suspicious_year value || has_old_year value then begin
365 let date_msg = "Bad date: Year may be mistyped." in
366 let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in
367 Warning (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s"
368 (q value) (q attr_name) (q element_name) date_msg tz_msg)
369 end else
370 Ok (* Valid date with normal year *)
371 | (false, date_error) ->
372 (* For time element only, try additional formats *)
373 if is_time_element then begin
374 match validate_datetime_local value with
375 | (true, _) -> Ok (* Valid datetime-local *)
376 | (false, _) ->
377 match validate_time value with
378 | (true, _) -> Ok (* Valid time *)
379 | (false, _) ->
380 match validate_year_month value with
381 | (true, _) -> Ok (* Valid month YYYY-MM *)
382 | (false, _) ->
383 match validate_year_only value with
384 | (true, _) -> Ok (* Valid year YYYY *)
385 | (false, _) ->
386 match validate_week value with
387 | (true, _) -> Ok (* Valid week YYYY-Www *)
388 | (false, _) ->
389 match validate_yearless_date value with
390 | (true, _) -> Ok (* Valid yearless date --MM-DD *)
391 | (false, _) ->
392 match validate_duration value with
393 | (true, _) -> Ok (* Valid duration P... *)
394 | (false, _) ->
395 (* Use simplified message for time element matching Nu validator format *)
396 Error (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad time-datetime: The literal did not satisfy the time-datetime format."
397 (q value) (q attr_name) (q element_name))
398 end
399 else begin
400 (* del/ins only allow date or datetime-with-timezone *)
401 let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in
402 let date_msg = match date_error with
403 | Some e -> Printf.sprintf "Bad date: %s." e
404 | None -> "Bad date: The literal did not satisfy the date format."
405 in
406 (* Order depends on error type. The Nu validator has specific patterns:
407 - Time hour/minute errors (not timezone) -> datetime first
408 - Timezone hours error -> datetime first
409 - Timezone minutes error -> date first
410 - Time fraction error -> date first
411 - Date "less than" error -> date first
412 - Date "greater than" error -> datetime first
413 - Generic errors both sides -> datetime first *)
414 let is_generic_tz = tz_error = "The literal did not satisfy the datetime with timezone format" in
415 let is_tz_hours_error = String.length tz_error >= 5 && String.sub tz_error 0 5 = "Hours" in
416 let is_tz_minutes_error = String.length tz_error >= 7 && String.sub tz_error 0 7 = "Minutes" in
417 let is_time_minute_or_hour_error =
418 (try ignore (Str.search_forward (Str.regexp "Minute cannot\\|Hour cannot") tz_error 0); true with Not_found -> false)
419 in
420 let is_fraction_error = try ignore (Str.search_forward (Str.regexp "fraction") tz_error 0); true with Not_found -> false in
421 let is_month_less_than_error = match date_error with
422 | Some e -> (try ignore (Str.search_forward (Str.regexp "Month cannot be less than") e 0); true with Not_found -> false)
423 | None -> false
424 in
425 (* Datetime first for: generic tz, tz hours error, time minute/hour errors, year errors
426 Date first for: "Month cannot be less than" date error, tz minutes error, fraction error *)
427 if is_month_less_than_error then
428 Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s"
429 (q value) (q attr_name) (q element_name) date_msg tz_msg)
430 else if is_tz_minutes_error || is_fraction_error then
431 Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s"
432 (q value) (q attr_name) (q element_name) date_msg tz_msg)
433 else if is_tz_hours_error || is_time_minute_or_hour_error || is_generic_tz then
434 Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s"
435 (q value) (q attr_name) (q element_name) tz_msg date_msg)
436 else
437 Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s"
438 (q value) (q attr_name) (q element_name) tz_msg date_msg)
439 end
440
441(** Checker state *)
442type state = unit [@@warning "-34"]
443
444let create () = ()
445let reset _state = ()
446
447let start_element _state ~element collector =
448 match element.Element.tag with
449 | Tag.Html tag ->
450 let name = Tag.html_tag_to_string tag in
451 if List.mem name datetime_elements then begin
452 (* Check for datetime attribute *)
453 let datetime_attr = List.find_map (fun (k, v) ->
454 if Astring.String.Ascii.lowercase k = "datetime" then Some v else None
455 ) element.raw_attrs in
456 match datetime_attr with
457 | None -> ()
458 | Some value ->
459 if String.trim value = "" then ()
460 else
461 match validate_datetime_attr value name "datetime" with
462 | Ok -> ()
463 | Error error_msg ->
464 Message_collector.add_typed collector
465 (`Attr (`Bad_value_generic (`Message error_msg)))
466 | Warning warn_msg ->
467 Message_collector.add_typed collector
468 (`Generic warn_msg)
469 end
470 | _ -> () (* Non-HTML elements don't have datetime attributes *)
471
472let end_element _state ~tag:_ _collector = ()
473
474let checker = Checker.make ~create ~reset ~start_element ~end_element ()