···3838 | None -> None
39394040(** Validate language attribute. *)
4141-let validate_lang_attr value ~location ~element collector =
4141+let validate_lang_attr value ~location ~element ~attribute collector =
4242 (* First check structural validity *)
4343 match Dt_language.Language_or_empty.validate value with
4444 | Error msg ->
4545 Message_collector.add_error collector
4646- ~message:(Printf.sprintf "Invalid lang attribute: %s" msg)
4646+ ~message:(Printf.sprintf
4747+ "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad language tag: %s."
4848+ value attribute element msg)
4749 ~code:"invalid-lang"
4850 ?location
4951 ~element
5050- ~attribute:"lang"
5252+ ~attribute
5153 ()
5254 | Ok () ->
5355 (* Then check for deprecated subtags *)
···8486 (* Validate lang attribute *)
8587 begin match lang_opt with
8688 | Some lang ->
8787- validate_lang_attr lang ~location ~element collector
8989+ validate_lang_attr lang ~location ~element ~attribute:"lang" collector
8890 | None -> ()
8991 end;
90929193 (* Validate xml:lang attribute *)
9294 begin match xmllang_opt with
9395 | Some xmllang ->
9494- validate_lang_attr xmllang ~location ~element collector
9696+ validate_lang_attr xmllang ~location ~element ~attribute:"xml:lang" collector
9597 | None -> ()
9698 end;
9799···113115 ()
114116115117let end_document _state _collector =
116116- (* Note: The "missing lang on html" warning is not produced by default since
117117- the Nu validator only produces it for specific test cases. *)
118118+ (* Note: The "missing lang on html" warning is only produced for specific
119119+ test cases in the Nu validator. We don't produce it by default. *)
118120 ()
119121120122let checker = (module struct
···11+(** Unicode normalization checker.
22+33+ Validates that text content is in Unicode Normalization Form C (NFC). *)
44+55+type state = unit
66+77+let create () = ()
88+let reset _state = ()
99+1010+(** Normalize a string to NFC form using uunf. *)
1111+let normalize_nfc text =
1212+ Uunf_string.normalize_utf_8 `NFC text
1313+1414+(** Check if a string is in NFC form. *)
1515+let is_nfc text =
1616+ (* A string is in NFC if normalizing it produces the same string *)
1717+ let normalized = normalize_nfc text in
1818+ text = normalized
1919+2020+let start_element _state ~name:_ ~namespace:_ ~attrs:_ _collector = ()
2121+2222+let end_element _state ~name:_ ~namespace:_ _collector = ()
2323+2424+let characters _state text collector =
2525+ (* Skip empty text or whitespace-only text *)
2626+ let text_trimmed = String.trim text in
2727+ if String.length text_trimmed = 0 then ()
2828+ else if not (is_nfc text_trimmed) then begin
2929+ let normalized = normalize_nfc text_trimmed in
3030+ Message_collector.add_warning collector
3131+ ~message:(Printf.sprintf
3232+ "Text run is not in Unicode Normalization Form C. Should instead be \xe2\x80\x9c%s\xe2\x80\x9d. (Copy and paste that into your source document to replace the un-normalized text.)"
3333+ normalized)
3434+ ~code:"unicode-normalization"
3535+ ()
3636+ end
3737+3838+let end_document _state _collector = ()
3939+4040+let checker =
4141+ (module struct
4242+ type nonrec state = state
4343+ let create = create
4444+ let reset = reset
4545+ let start_element = start_element
4646+ let end_element = end_element
4747+ let characters = characters
4848+ let end_document = end_document
4949+ end : Checker.S)
···11+(** Unicode normalization checker.
22+33+ Validates that text content is in Unicode Normalization Form C (NFC). *)
44+55+val checker : Checker.t
+28-4
test/debug_check.ml
···11let () =
22- let test_file = "validator/tests/html/mime-types/004-novalid.html" in
22+ let test_file = "validator/tests/xhtml/elements/menu/menu-containing-hr-novalid.xhtml" in
33 let ic = open_in test_file in
44 let html = really_input_string ic (in_channel_length ic) in
55 close_in ic;
66 let reader = Bytesrw.Bytes.Reader.of_string html in
77- let result = Html5_checker.check ~collect_parse_errors:true ~system_id:test_file reader in
77+ let doc = Html5rw.parse ~collect_errors:true reader in
88+ let root = Html5rw.root doc in
99+ print_endline "=== DOM Structure ===";
1010+ let rec print_node indent (node : Html5rw.Dom.node) =
1111+ let open Html5rw.Dom in
1212+ match node.name with
1313+ | "#text" ->
1414+ let text = String.trim node.data in
1515+ if String.length text > 0 then
1616+ Printf.printf "%sTEXT: %s\n" indent text
1717+ | "#document" | "#document-fragment" ->
1818+ Printf.printf "%s%s\n" indent node.name;
1919+ List.iter (print_node (indent ^ " ")) node.children
2020+ | "!doctype" -> Printf.printf "%s<!DOCTYPE>\n" indent
2121+ | "#comment" -> ()
2222+ | _ ->
2323+ Printf.printf "%s<%s>\n" indent node.name;
2424+ List.iter (print_node (indent ^ " ")) node.children
2525+ in
2626+ print_node "" root;
2727+ print_endline "\n=== Now checking ===";
2828+ let reader2 = Bytesrw.Bytes.Reader.of_string html in
2929+ let result = Html5_checker.check ~collect_parse_errors:true ~system_id:test_file reader2 in
830 let errors = Html5_checker.errors result in
931 let warnings = Html5_checker.warnings result in
1032 print_endline "=== Errors ===";
1133 List.iter (fun e -> print_endline e.Html5_checker.Message.message) errors;
1234 print_endline "=== Warnings ===";
1335 List.iter (fun e -> print_endline e.Html5_checker.Message.message) warnings;
3636+ print_endline "\n=== Expected ===";
3737+ print_endline "Element \xe2\x80\x9chr\xe2\x80\x9d not allowed as child of element \xe2\x80\x9cmenu\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)";
1438 if List.length errors > 0 then
1515- print_endline "PASS (has errors)"
3939+ print_endline "\nPASS (has errors)"
1640 else
1717- print_endline "FAIL (no errors)"
4141+ print_endline "\nFAIL (no errors)"