OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** Language attribute validation checker.
2
3 Validates language attributes. *)
4
5(** Checker state - currently minimal since we only check attributes. *)
6type state = unit [@@warning "-34"]
7
8let create () = ()
9
10let reset _state = ()
11
12(** Get attribute value from attribute list. *)
13let get_attr attrs name =
14 try Some (List.assoc name attrs)
15 with Not_found -> None
16
17(** Deprecated language subtags from IANA registry.
18 See: https://www.iana.org/assignments/language-subtag-registry/ *)
19let deprecated_subtags = [
20 ("mo", "ro"); (* Moldavian -> Romanian *)
21 ("iw", "he"); (* Hebrew (old) -> Hebrew *)
22 ("in", "id"); (* Indonesian (old) -> Indonesian *)
23 ("ji", "yi"); (* Yiddish (old) -> Yiddish *)
24 ("jw", "jv"); (* Javanese (old) -> Javanese *)
25 ("sh", "sr"); (* Serbo-Croatian -> Serbian *)
26]
27
28(** Check if a language tag contains deprecated subtags. *)
29let check_deprecated_tag value =
30 let lower = Astring.String.Ascii.lowercase value in
31 let subtags = String.split_on_char '-' lower in
32 match subtags with
33 | [] -> None
34 | primary :: _ ->
35 (* Check primary language subtag for deprecation *)
36 match List.assoc_opt primary deprecated_subtags with
37 | Some replacement -> Some (primary, replacement)
38 | None -> None
39
40(** Validate language attribute. *)
41let validate_lang_attr value ~location:_ ~element ~attribute collector =
42 (* First check structural validity *)
43 match Dt_language.Language_or_empty.validate value with
44 | Error msg ->
45 let reason = Printf.sprintf "Bad language tag: %s." msg in
46 Message_collector.add_typed collector
47 (`Attr (`Bad_value (`Elem element, `Attr attribute, `Value value, `Reason reason)))
48 | Ok () ->
49 (* Then check for deprecated subtags *)
50 match check_deprecated_tag value with
51 | Some (deprecated, replacement) ->
52 let reason = Printf.sprintf "Bad language tag: The language subtag %s is deprecated. Use %s instead."
53 (Error_code.q deprecated) (Error_code.q replacement) in
54 Message_collector.add_typed collector
55 (`Generic (Printf.sprintf "Bad value %s for attribute %s on element %s: %s"
56 (Error_code.q value) (Error_code.q attribute) (Error_code.q element) reason))
57 | None -> ()
58
59(** Check if lang and xml:lang match. *)
60let check_lang_xmllang_match ~lang:_ ~xmllang:_ ~location:_ ~element:_ collector =
61 (* Note: This check is disabled as the Error_code.Xml_lang_lang_mismatch format
62 differs from what the tests expect. We use add_typed when enabled. *)
63 ignore collector
64
65(** Process language attributes. *)
66let process_language_attrs ~element ~namespace ~attrs ~location collector =
67 ignore namespace;
68 let lang_opt = get_attr attrs "lang" in
69 let xmllang_opt = get_attr attrs "xml:lang" in
70
71 (* Validate lang attribute *)
72 begin match lang_opt with
73 | Some lang ->
74 validate_lang_attr lang ~location ~element ~attribute:"lang" collector
75 | None -> ()
76 end;
77
78 (* Validate xml:lang attribute *)
79 begin match xmllang_opt with
80 | Some xmllang ->
81 validate_lang_attr xmllang ~location ~element ~attribute:"xml:lang" collector
82 | None -> ()
83 end;
84
85 (* Check that lang and xml:lang match if both present *)
86 begin match lang_opt, xmllang_opt with
87 | Some lang, Some xmllang ->
88 check_lang_xmllang_match ~lang ~xmllang ~location ~element collector
89 | _ -> ()
90 end
91
92let start_element _state ~element collector =
93 let location = None in
94 let name = Tag.tag_to_string element.Element.tag in
95 process_language_attrs ~element:name ~namespace:None ~attrs:element.raw_attrs ~location collector
96
97let end_element _state ~tag:_ _collector = ()
98
99let checker = Checker.make ~create ~reset ~start_element ~end_element ()