OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at fuzz 74 lines 2.6 kB view raw
(** Unicode normalization checker.

    Validates that text content is in Unicode Normalization Form C (NFC). *)

type state = {
  mutable in_raw_text : int;
      (** Nesting depth of currently open raw-text elements, i.e. those
          accepted by [is_raw_text_element]. Text is only checked while
          this is [0]. *)
}

(** [create ()] returns a fresh state positioned outside any raw-text
    element. *)
let create () = { in_raw_text = 0 }

(** [reset state] puts [state] back to its initial depth of [0]. *)
let reset state = state.in_raw_text <- 0

(** Elements whose text content is raw text and should be skipped. *)
let is_raw_text_element = function
  | "style" | "script" | "xmp" | "textarea" -> true
  | _ -> false

(** Normalize a string to NFC form using uunf. *)
let normalize_nfc text = Uunf_string.normalize_utf_8 `NFC text

(** [is_nfc text] is [true] when normalizing [text] to NFC yields the very
    same string. *)
let is_nfc text = String.equal text (normalize_nfc text)

(** Check if a character is ASCII punctuation: one of the four ranges
    0x21-0x2F, 0x3A-0x40, 0x5B-0x60 and 0x7B-0x7E. *)
let is_ascii_punct = function
  | '!' .. '/' (* 0x21 - 0x2F *)
  | ':' .. '@' (* 0x3A - 0x40, includes the question mark *)
  | '[' .. '`' (* 0x5B - 0x60 *)
  | '{' .. '~' (* 0x7B - 0x7E *) ->
      true
  | _ -> false

(** [strip_trailing_punct s] removes the run of ASCII punctuation at the end
    of [s]. A space is not punctuation, so a space sitting in front of the
    removed run is kept. [s] itself is returned when nothing is stripped. *)
let strip_trailing_punct s =
  let len = String.length s in
  (* [keep i] is the length of the longest prefix of [s], no longer than
     [i], that does not end in ASCII punctuation. *)
  let rec keep i =
    if i > 0 && is_ascii_punct s.[i - 1] then keep (i - 1) else i
  in
  let end_pos = keep len in
  if end_pos = len then s else String.sub s 0 end_pos

(** Start-tag handler: entering a raw-text element increases the depth
    counter. The collector argument is unused. *)
let start_element state ~element _collector =
  let name = Tag.tag_to_string element.Element.tag in
  if is_raw_text_element name then
    state.in_raw_text <- state.in_raw_text + 1

(** End-tag handler: leaving a raw-text element decreases the depth
    counter, which is never allowed to drop below [0]. The collector
    argument is unused. *)
let end_element state ~tag _collector =
  let name = Tag.tag_to_string tag in
  if is_raw_text_element name && state.in_raw_text > 0 then
    state.in_raw_text <- state.in_raw_text - 1

(** Text handler. Outside raw-text elements, trims [text] and, when the
    trimmed text is non-empty and not in NFC, reports a [`Not_nfc] message
    on [collector] carrying the suggested replacement. *)
let characters state text collector =
  (* Skip text inside raw text elements like style/script. *)
  if state.in_raw_text > 0 then ()
  else
    let trimmed = String.trim text in
    (* Skip empty text or whitespace-only text. *)
    if String.length trimmed > 0 then begin
      (* Normalize once and reuse the result for both the NFC check and
         the replacement text. *)
      let normalized = normalize_nfc trimmed in
      if not (String.equal trimmed normalized) then
        (* Strip trailing ASCII punctuation from replacement to match Nu
           validator. *)
        let replacement = strip_trailing_punct normalized in
        Message_collector.add_typed collector
          (`I18n (`Not_nfc (`Replacement replacement)))
    end

(** The checker value assembled from the handlers above. *)
let checker =
  Checker.make ~create ~reset ~start_element ~end_element ~characters ()