(** XHTML parser using xmlm for proper XML parsing.
This module provides XML parsing for XHTML files, which the HTML5 parser
cannot handle correctly (especially self-closing tags on non-void elements). *)
(** Parse XHTML content using xmlm and return a DOM tree. *)
let parse_xhtml content =
let input = Xmlm.make_input (`String (0, content)) in
(* Stack of nodes during parsing *)
let stack = ref [] in
let root = Html5rw.Dom.create_document () in
stack := [root];
(* Helper to get namespace shorthand *)
let ns_shorthand ns =
if ns = "http://www.w3.org/2000/svg" then Some "svg"
else if ns = "http://www.w3.org/1998/Math/MathML" then Some "mathml"
else if ns = "http://www.w3.org/1999/xhtml" then None (* HTML namespace *)
else if ns = "" then None (* No namespace = HTML *)
else Some ns (* Keep other namespaces as-is *)
in
(* Process xmlm signals *)
let rec process () =
if Xmlm.eoi input then ()
else begin
match Xmlm.input input with
| `Dtd _ ->
(* Skip DTD for now *)
process ()
| `El_start ((ns, local), attrs) ->
(* Create element node *)
let namespace = ns_shorthand ns in
let attr_list = List.map (fun ((_, aname), aval) -> (aname, aval)) attrs in
let node = Html5rw.Dom.create_element local ~namespace ~attrs:attr_list () in
(* Append to current parent *)
(match !stack with
| parent :: _ -> Html5rw.Dom.append_child parent node
| [] -> ());
(* Push onto stack *)
stack := node :: !stack;
process ()
| `El_end ->
(* Pop from stack *)
(match !stack with
| _ :: rest -> stack := rest
| [] -> ());
process ()
| `Data text ->
(* Create text node and append to current parent *)
let trimmed = String.trim text in
if trimmed <> "" || String.length text > 0 then begin
let text_node = Html5rw.Dom.create_text text in
(match !stack with
| parent :: _ -> Html5rw.Dom.append_child parent text_node
| [] -> ())
end;
process ()
end
in
try
process ();
Ok root
with
| Xmlm.Error ((line, col), err) ->
Error (Printf.sprintf "XML parse error at %d:%d: %s" line col (Xmlm.error_message err))
(** Check if a system_id indicates an XHTML file. *)
let is_xhtml_file system_id =
match system_id with
| Some path ->
String.length path > 6 &&
String.sub path (String.length path - 6) 6 = ".xhtml"
| None -> false
(** Wrap DOM in an Html5rw.t-compatible structure for the checker. *)
type xhtml_doc = {
root : Html5rw.Dom.node;
errors : Html5rw.Error.t list;
}
let xhtml_root doc = doc.root
let xhtml_errors _doc = [] (* XML parser handles errors differently *)