OCaml HTML5 parser/serialiser based on Python's JustHTML
(** Unicode normalization checker.

    Validates that text content is in Unicode Normalization Form C (NFC). *)
4
(** Mutable per-document state of the checker. *)
type state = {
  mutable in_raw_text : int;
      (** Nesting depth of currently open raw text elements (see
          [is_raw_text_element]); text is ignored while it is positive. *)
}

(** [create ()] returns a fresh state positioned outside any raw text
    element. *)
let create () : state = { in_raw_text = 0 }

(** [reset st] puts [st] back to its initial depth of zero. *)
let reset (st : state) = st.in_raw_text <- 0
11
(** [is_raw_text_element name] is [true] exactly for the element names whose
    text content this checker skips. The comparison is case-sensitive. *)
let is_raw_text_element = function
  | "style" | "script" | "xmp" | "textarea" -> true
  | _ -> false
15
(** [normalize_nfc text] is the NFC normal form of the UTF-8 encoded string
    [text], as computed by uunf. *)
let normalize_nfc text =
  text |> Uunf_string.normalize_utf_8 `NFC
19
(** [is_nfc text] is [true] when [text] is already in NFC, i.e. when
    normalizing it yields a byte-for-byte identical string. *)
let is_nfc text = String.equal text (normalize_nfc text)
25
(** [is_ascii_punct c] is [true] when [c] is one of the 32 ASCII punctuation
    characters; letters, digits, space, control bytes and bytes >= 0x80 all
    yield [false]. *)
let is_ascii_punct = function
  | '!' .. '/' (* 0x21 - 0x2F *)
  | ':' .. '@' (* 0x3A - 0x40, includes '?' *)
  | '[' .. '`' (* 0x5B - 0x60 *)
  | '{' .. '~' (* 0x7B - 0x7E *) -> true
  | _ -> false
33
(** [strip_trailing_punct s] drops the run of ASCII punctuation bytes at the
    end of [s]. Scanning stops at the first non-punctuation byte from the
    right, so a space sitting before the punctuation is kept. [s] itself is
    returned when there is nothing to strip. *)
let strip_trailing_punct s =
  let total = String.length s in
  (* [keep] is the length of the prefix to retain; shrink it while the byte
     just before the cut is ASCII punctuation. *)
  let keep = ref total in
  while !keep > 0 && is_ascii_punct s.[!keep - 1] do
    decr keep
  done;
  if !keep = total then s else String.sub s 0 !keep
48
(** Start-tag callback: opening a raw text element increases the raw text
    depth in [state]. The collector argument is not used. *)
let start_element state ~element _collector =
  match is_raw_text_element (Tag.tag_to_string element.Element.tag) with
  | true -> state.in_raw_text <- succ state.in_raw_text
  | false -> ()
53
(** End-tag callback: closing a raw text element decreases the raw text
    depth in [state], never letting it drop below zero. The collector
    argument is not used. *)
let end_element state ~tag _collector =
  let closes_raw_text = is_raw_text_element (Tag.tag_to_string tag) in
  if closes_raw_text && state.in_raw_text > 0 then
    state.in_raw_text <- pred state.in_raw_text
58
(** Text callback: reports a [`Not_nfc] message on [collector] when [text]
    is not in NFC.

    Nothing is reported while inside a raw text element
    ([state.in_raw_text > 0]) or when [text] is empty or whitespace-only.
    The check is made on the trimmed text, and the suggested replacement is
    its NFC form with trailing ASCII punctuation removed. *)
let characters state text collector =
  (* Skip text inside raw text elements like style/script. *)
  if state.in_raw_text > 0 then ()
  else
    let trimmed = String.trim text in
    if String.length trimmed > 0 then begin
      (* Normalize once and compare, instead of normalizing in a predicate
         and then a second time to build the replacement. *)
      let normalized = normalize_nfc trimmed in
      if not (String.equal trimmed normalized) then begin
        (* Strip trailing ASCII punctuation from the replacement to match
           the Nu validator. *)
        let replacement = strip_trailing_punct normalized in
        Message_collector.add_typed collector
          (`I18n (`Not_nfc (`Replacement replacement)))
      end
    end
72
(** Checker value built by [Checker.make] from this module's callbacks. *)
let checker =
  Checker.make
    ~create
    ~reset
    ~start_element
    ~end_element
    ~characters
    ()